成人午夜激情影院,小视频免费在线观看,国产精品夜夜嗨,欧美日韩精品一区二区在线播放

ASP.NET采集系統萬能正則表達式

2010-08-28 10:52:45 來源:西部e網 作者:

由于經常要寫一些采集程序,下面整理了采集中最常用的三個函數:獲取頁面源碼、截取單個字符串、截取全部匹配項。姑且叫采集系統萬能正則表達式吧。

第一個:獲取頁面的 HTML 源碼
 public  string GetHtmlSource(string Url, string charset)
        
{
            
if (charset == "" || charset == null) charset = "gb2312";
            
string text1 = "";
            
try
            
{
                HttpWebRequest request1 
= (HttpWebRequest)WebRequest.Create(Url);
                HttpWebResponse response1 
= (HttpWebResponse)request1.GetResponse();
                Stream stream1 
= response1.GetResponseStream();
                StreamReader reader1 
= new StreamReader(stream1, Encoding.GetEncoding(charset));
                text1 
= reader1.ReadToEnd();
                stream1.Close();
                response1.Close();
            }

            
catch (Exception exception1)
            
{
            }

            
return text1;
        }

第二個:截取字符串

/// <summary>
/// Extracts the text that sits between two delimiter patterns inside <paramref name="code"/>.
/// </summary>
/// <param name="code">Text to search.</param>
/// <param name="wordsBegin">Regex fragment that must precede the captured text.</param>
/// <param name="wordsEnd">Regex fragment that must follow the captured text.</param>
/// <returns>The capture of the last match, or an empty string when nothing matches.</returns>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    // The delimiters are concatenated unescaped, so they are interpreted as regex syntax.
    string pattern = wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd;
    Regex extractor = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);

    string lastCapture = "";
    foreach (Match hit in extractor.Matches(code))
    {
        // Each later match overwrites the previous one; only the final capture is returned.
        lastCapture = hit.Groups["title"].Value;
    }

    return lastCapture;
}

第三個:截取網址

/// <summary>
/// Collects every piece of text that sits between two delimiter patterns inside <paramref name="code"/>.
/// </summary>
/// <param name="code">Text to search.</param>
/// <param name="wordsBegin">Regex fragment that must precede each captured text.</param>
/// <param name="wordsEnd">Regex fragment that must follow each captured text.</param>
/// <returns>An ArrayList of captured strings in match order; empty when nothing matches.</returns>
public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
{
    // The delimiters are concatenated unescaped, so they are interpreted as regex syntax.
    string pattern = wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd;
    Regex extractor = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);

    ArrayList captures = new ArrayList();
    foreach (Match hit in extractor.Matches(code))
    {
        captures.Add(hit.Groups["title"].Value);
    }

    return captures;
}

全部代碼如下:

using System;
using System.Collections.Generic;
using System.Text;
using System.Data;
using System.Data.OleDb;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using System.Collections;
using System.Net;
namespace getWeb
{
    /// <summary>
    /// Page-scraping helpers: three ways to download a page as text and two regex-based
    /// extractors that pull out the text found between a pair of delimiter patterns.
    /// </summary>
    public class DBconn
    {
        // Charset applied by GetSource/GetHtmlSource when the caller passes null or "".
        private const string DefaultCharset = "gb2312";

        /// <summary>
        /// Jet OLE DB connection string for the Getweb.mdb database.
        /// It is not read by any method of this class.
        /// </summary>
        public string dbConnString = @"provider=microsoft.jet.oledb.4.0;data source=Getweb.mdb";

        /// <summary>
        /// Downloads <paramref name="Url"/> through WebClient and decodes the body with the given charset.
        /// </summary>
        /// <param name="Url">Address of the resource to read.</param>
        /// <param name="charset">Encoding name used to decode the body; "gb2312" when null or empty.</param>
        /// <returns>The decoded content, or an empty string when the download or decoding fails.</returns>
        public static string GetSource(string Url, string charset)
        {
            try
            {
                // using-blocks release the client and the connection even when decoding throws.
                using (WebClient client = new WebClient())
                using (Stream body = client.OpenRead(Url))
                using (StreamReader reader = new StreamReader(body, ResolveEncoding(charset)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive "" on any failure, but the cause is traced.
                ReportFailure("GetSource", Url, ex);
                return "";
            }
        }

        /// <summary>
        /// Downloads <paramref name="Url"/> through HttpWebRequest and decodes the body with the given charset.
        /// </summary>
        /// <param name="Url">Address of the page; the request is cast to HttpWebRequest, so it must be an HTTP(S) URL.</param>
        /// <param name="charset">Encoding name used to decode the body; "gb2312" when null or empty.</param>
        /// <returns>The page source, or an empty string when the request or decoding fails.</returns>
        public string GetHtmlSource(string Url, string charset)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, ResolveEncoding(charset)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive "" on any failure, but the cause is traced.
                ReportFailure("GetHtmlSource", Url, ex);
                return "";
            }
        }

        /// <summary>
        /// Downloads <paramref name="a_strUrl"/> with a request timeout and returns the body line by line,
        /// decoded with Encoding.Default and with every line terminated by CRLF.
        /// </summary>
        /// <param name="a_strUrl">Address of the page; must be an HTTP(S) URL.</param>
        /// <param name="timeout">Value assigned to HttpWebRequest.Timeout (milliseconds).</param>
        /// <returns>The page text, or "錯誤:" followed by the exception message when the request fails.</returns>
        public string Get_Http(string a_strUrl, int timeout)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(a_strUrl);
                request.Timeout = timeout;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, Encoding.Default))
                {
                    StringBuilder page = new StringBuilder();

                    // Loop on ReadLine() rather than Peek(): Peek() can report -1 on a network
                    // stream before the end of the response, which silently truncates the page.
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        page.Append(line).Append("\r\n");
                    }

                    return page.ToString();
                }
            }
            catch (Exception exp)
            {
                return "錯誤:" + exp.Message;
            }
        }

        /// <summary>
        /// Extracts the text that sits between two delimiter patterns inside <paramref name="code"/>.
        /// </summary>
        /// <param name="code">Text to search.</param>
        /// <param name="wordsBegin">Regex fragment that must precede the captured text.</param>
        /// <param name="wordsEnd">Regex fragment that must follow the captured text.</param>
        /// <returns>The capture of the last match, or an empty string when nothing matches.</returns>
        public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
        {
            string lastCapture = "";
            foreach (Match hit in BuildCaptureRegex(wordsBegin, wordsEnd).Matches(code))
            {
                // Each later match overwrites the previous one; only the final capture is returned.
                lastCapture = hit.Groups["title"].Value;
            }

            return lastCapture;
        }

        /// <summary>
        /// Collects every piece of text that sits between two delimiter patterns inside <paramref name="code"/>.
        /// </summary>
        /// <param name="code">Text to search.</param>
        /// <param name="wordsBegin">Regex fragment that must precede each captured text.</param>
        /// <param name="wordsEnd">Regex fragment that must follow each captured text.</param>
        /// <returns>An ArrayList of captured strings in match order; empty when nothing matches.</returns>
        public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
        {
            ArrayList captures = new ArrayList();
            foreach (Match hit in BuildCaptureRegex(wordsBegin, wordsEnd).Matches(code))
            {
                captures.Add(hit.Groups["title"].Value);
            }

            return captures;
        }

        // Maps a caller-supplied charset name to an Encoding, falling back to DefaultCharset for null or "".
        private static Encoding ResolveEncoding(string charset)
        {
            return Encoding.GetEncoding(string.IsNullOrEmpty(charset) ? DefaultCharset : charset);
        }

        // Builds the shared lazy-capture pattern; the delimiters are inserted unescaped, so they act as regex syntax.
        private static Regex BuildCaptureRegex(string wordsBegin, string wordsEnd)
        {
            return new Regex(
                wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd,
                RegexOptions.Compiled | RegexOptions.IgnoreCase);
        }

        // Writes a trace warning for a download that failed and was turned into an empty result.
        private static void ReportFailure(string operation, string url, Exception error)
        {
            System.Diagnostics.Trace.TraceWarning("{0} failed for '{1}': {2}", operation, url, error.Message);
        }
    }
}

關鍵詞:ASP.NET
主站蜘蛛池模板: 宿松县| 凤凰县| 武汉市| 库伦旗| 临夏市| 安多县| 昌江| 札达县| 焦作市| 观塘区| 乡城县| 江源县| 平利县| 兴国县| 海淀区| 沙洋县| 共和县| 垣曲县| 和顺县| 汝南县| 中江县| 新密市| 枝江市| 昌邑市| 万源市| 仁怀市| 张家港市| 登封市| 内丘县| 瓦房店市| 宁陕县| 东平县| 榆社县| 东海县| 克什克腾旗| 新宾| 大城县| 尤溪县| 黄浦区| 湄潭县| 大英县|