存档在 2007年10月

用C#写的垂直搜索之爬虫

2007年10月27日

搞垂直搜索搞了这么些天,现在已经把我们学院的网页爬完了,非常快,时间不超过30秒。




  1using System;
  2using System.Collections.Generic;
  3using System.ComponentModel;
  4using System.Data;
  5using System.Drawing;
  6using System.Text;
  7using System.Windows.Forms;
  8using System.Web.Security;
  9using System.IO;
 10using System.Net;
 11using System.Text.RegularExpressions;
 12
namespace ie
{
    /// <summary>
    /// Simple vertical-search crawler: iterates article ids 100..1750 of the
    /// department site, fetches each article page (addressed by the MD5 hash of
    /// its id), extracts title/author/date/views/body with regexes and appends
    /// each result to a text file under d:\ie\.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Crawls all article ids and reports completion in a message box.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            for (Int32 i = 100; i <= 1750; i++)
            {
                String ii = i.ToString();

                // The site names each article page after the MD5 of its numeric id.
                // NOTE(review): HashPasswordForStoringInConfigFile is obsolete in
                // later frameworks; kept because this is .NET 2.0-era code.
                string strmd5 = FormsAuthentication.HashPasswordForStoringInConfigFile(ii, "MD5");
                string x = "http://ie.wh.sdu.edu.cn/show." + strmd5 + ".ie";
                readText read = new readText();
                read.setUrl(x);
                // equalS() == 0 means the page was missing or a field failed to
                // extract; skip it rather than writing an incomplete record.
                if (read.equalS() == 0)
                    continue;
                read.writeText(ii);
            }

            MessageBox.Show("完成!"); // "Done!"
        }

        /// <summary>
        /// Downloads one article page and extracts its fields.
        /// text[0]=title, text[1]=author, text[2]=date, text[3]=time,
        /// text[4]=view count, text[5]=body.
        /// </summary>
        public class readText
        {
            private string[] text;
            private string url;

            public readText()
            {
                text = new string[6];
                for (int i = 0; i <= 5; i++)
                    text[i] = "";
                url = "";
            }

            public void setUrl(string urr)
            {
                url = urr;
            }

            /// <summary>
            /// Returns 1 when the page was fetched and the first five fields
            /// (title..view count) are all non-blank; otherwise 0.
            /// </summary>
            public int equalS()
            {
                int x = read();
                if (x == 1) return 0; // no response stream -> treat as failure
                for (int i = 0; i < 5; i++)
                {
                    string s = text[i];
                    if (s.Trim().Equals(""))
                        return 0;
                }

                return 1;
            }

            /// <summary>Appends the url plus all six fields to d:\ie\&lt;str&gt;.txt.</summary>
            public void writeText(String str)
            {
                string s = url + "\r\n";
                for (int i = 0; i <= 5; i++)
                {
                    s = s + text[i] + "\r\n";
                }

                string ss = @"d:\ie\" + str + ".txt";
                if (!Directory.Exists(@"d:\ie"))
                    Directory.CreateDirectory(@"d:\ie");
                File.AppendAllText(ss, s);
            }

            /// <summary>
            /// Downloads the page at <c>url</c> and feeds its HTML to pReadIe.
            /// Returns 0 on success, 1 when no response stream was available.
            /// </summary>
            public int read()
            {
                WebRequest wreq = WebRequest.Create(url); // url names the page to fetch
                WebResponse wres = wreq.GetResponse();
                Stream stream = wres.GetResponseStream(); // the page content as a stream
                if (stream == null) return 1;
                StringBuilder sb = new StringBuilder();
                // using(...) so the reader (and underlying stream) is always closed.
                using (StreamReader sr = new StreamReader(stream, Encoding.Default))
                {
                    string rl;
                    while ((rl = sr.ReadLine()) != null)
                    {
                        sb.Append(rl); // concatenate the page onto one line
                    }
                }

                wres.Close();
                pReadIe(sb.ToString());
                return 0;
            }

            /// <summary>
            /// Pulls the six fields out of the raw HTML.
            /// NOTE(review): the patterns were mangled by the blog platform that
            /// published this code (quotes and backslashes stripped); they are
            /// reconstructed here — the attribute quoting inside the HTML
            /// fragments is a best guess, verify against the live pages.
            /// </summary>
            public void pReadIe(String sInput)
            {
                MatchCollection matches;
                Regex[] extractHTML = new Regex[5];
                extractHTML[0] = new Regex(@"<td class=title align=center>(?<name>[^/]+)</td>");
                extractHTML[1] = new Regex(@"作者:[\s]+(?<name>[\S]+)");
                extractHTML[2] = new Regex(@"<td align=center class=adddatetime>(?<data>[^ ]+)[\s](?<time>[^ ]+)</td>");
                extractHTML[3] = new Regex(@"&nbsp;&nbsp;浏览:[\s]+(?<num>[\d]+)[\s]+次</td>");
                extractHTML[4] = new Regex(@"<td align=left class=content>(?<count>[\w\W]+)</td>[\s]+</tr>[\s]+<tr>[\s]+<td align=");
                for (int i = 0; i <= 4; i++)
                {
                    matches = extractHTML[i].Matches(sInput);

                    foreach (Match matchMade in matches)
                    {
                        switch (i)
                        {
                            case 0: text[0] = matchMade.Groups[1].Value; break;
                            case 1: text[1] = matchMade.Groups[1].Value; break;
                            case 2: text[2] = matchMade.Groups[1].Value;  // date
                                text[3] = matchMade.Groups[2].Value;      // time
                                break;
                            case 3: text[4] = matchMade.Groups[1].Value; break;
                            case 4: text[5] = matchMade.Groups[1].Value; break;
                        }
                    }
                }
            }
        }
    }
}

141
142