Marc Boizeau's blog: A web crawler (regular expressions and HttpWebRequest)

Friday, June 17, 2005

A web crawler (regular expressions and HttpWebRequest)

Hello everybody, I'm back on the blog!

Today I'd like to demonstrate two powerful features of the .NET Framework: the HttpWebRequest/HttpWebResponse classes and the Regex class. I will then propose an implementation of a mini web crawler.

HttpWebRequest and HttpWebResponse are both classes of the System.Net namespace.
With these classes, retrieving a text stream from a URI is as easy as this:

// Creates an HttpWebRequest for the specified URL.
HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);

// Sends the HttpWebRequest and waits for the response.
// The using blocks release the response, its stream and the reader even when
// reading throws, so a failed download cannot leak the underlying connection.
using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
using (Stream receiveStream = myHttpWebResponse.GetResponseStream())
using (StreamReader readStream = new StreamReader(receiveStream, Encoding.GetEncoding("utf-8")))
{
    // The reader decodes the bytes as UTF-8; ReadToEnd drains the whole stream
    // in one call instead of growing the string 256 characters at a time
    // (repeated string concatenation copies the whole text on every pass).
    strRes += readStream.ReadToEnd();
}

The Regex class is the .NET implementation of regular expressions. If you don't know what they are, just remember that they are one of the best ways to perform text analysis.
For instance, here is a function which retrieves the "href" values of a web page into an array.

// Built once and shared by every call: a Regex created with
// RegexOptions.Compiled is expensive to construct, so it must not be
// rebuilt for each page.
// NOTE(review): the trailing ".*>" is greedy, so when several links sit on
// the same line only the first one is reported.
private static readonly Regex hrefPattern = new Regex(
    "href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
    RegexOptions.IgnoreCase|RegexOptions.Compiled);

// Upper bound on the number of links returned for one page.
private const int MaxLinksPerPage = 100;

/// <summary>
/// Extracts the values of the "href" attributes found in an HTML text.
/// </summary>
/// <param name="strHTML">HTML source to scan; null is treated as empty.</param>
/// <returns>
/// The href values in document order, at most 100 of them. The array holds
/// exactly the links found (no null entries); it is empty when there are none.
/// </returns>
private string [] extractURIS(string strHTML)
{
    ArrayList links = new ArrayList();

    if (strHTML != null)
    {
        // Walk the matches one by one and stop as soon as the cap is reached.
        for (Match m = hrefPattern.Match(strHTML);
             m.Success && links.Count < MaxLinksPerPage;
             m = m.NextMatch())
        {
            // Group 1 is the attribute value, without the surrounding quotes.
            links.Add(m.Groups[1].Value);
        }
    }

    return (string[])links.ToArray(typeof(string));
}

OK, take a deep breath and look at this:
"href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>"
Let's decode this regular expression.
"href" just means that the pattern starts with the literal word "href".
\\s stands for "any whitespace character" (the backslash is doubled because the pattern is written as a C# string literal).
\\s* stands for "zero or more whitespace characters".
So the first part of the regex, "href\\s*=\\s*", means that the pattern we are looking for begins with "href", then optional whitespace, then the "=" character and then optional whitespace again.

[^\"] means "any character except a double quote" (the \ is only there to escape the quote inside the C# string literal; the regex engine itself just sees [^"]).
So [^\"]* is a sequence of zero or more characters that contains no double quote.

(?'1'......) means that we give the name '1' to the group that captures the ...... subexpression.
Here we use it twice in the subexpression:
\"(?'1'[^\"]*)\"|(?'1'\\S+)
Knowing that "|" is a logical OR, you can guess that '1' is either
a sequence of characters without double quotes enclosed between two double quotes, or a sequence of at least one non-whitespace character (\\S+).

(?:....) is a non-capturing group: it groups the two alternatives together so that "|" applies only to them, without creating an extra captured group.

A dot (.) means "any character except a newline",
so:
.*> means that our regex ends with a sequence of any characters followed by a ">" (because .* is greedy, it runs up to the last ">" on the line).

wow...
For further information, have a look at: http://msdn.microsoft.com/library/default.asp?url=/library/en-us/cpgenref/html/cpconregularexpressionslanguageelements.asp

So let's use it in our little web crawler.

To compile the following, save it with a ".cs" extension and use the csc.exe of your framework installation; for instance, run the following:
C:\WINDOWS\Microsoft.NET\Framework\v1.1.4322\csc.exe /target:winexe /recurse:*.*
in the directory that contains the .cs file.

using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Net;
using System.IO;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Specialized;

namespace webrequest
{
/// <summary>
/// Main window of the mini web crawler.
/// </summary>
public class WebCrawler : System.Windows.Forms.Form
{
private System.Windows.Forms.Button btLoad;
private System.Windows.Forms.TextBox tbURI;
private System.Windows.Forms.RichTextBox rtbHTML;
private System.Windows.Forms.RichTextBox rtbLOg;

private string MainString;
private StringDictionary dicURL;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.TextBox tbiProf;
private System.Windows.Forms.TextBox tbMax;
private System.Windows.Forms.Label label3;
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;

/// <summary>
/// Creates the crawler window: prepares the empty URL dictionary, then
/// builds the controls through the designer-generated initialisation.
/// </summary>
public WebCrawler()
{
    this.dicURL = new StringDictionary();

    // Required for Windows Forms Designer support.
    this.InitializeComponent();

    // TODO: add any constructor code after the InitializeComponent call.
}

/// <summary>
/// Cleans up the resources in use.
/// </summary>
/// <param name="disposing">
/// True when called from an explicit dispose; only then is the designer
/// component container disposed.
/// </param>
protected override void Dispose( bool disposing )
{
    // The container stays null unless something assigned it, so both
    // conditions must hold before it can be disposed.
    bool releaseComponents = disposing && components != null;
    if (releaseComponents)
    {
        components.Dispose();
    }

    base.Dispose( disposing );
}

#region Code généré par le Concepteur Windows Form
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// Creates every control of the form, sets its position, size, anchoring and
/// initial text, wires the Click and LinkClicked handlers, and adds the
/// controls to the form.
/// </summary>
private void InitializeComponent()
{
this.btLoad = new System.Windows.Forms.Button();
this.tbURI = new System.Windows.Forms.TextBox();
this.rtbHTML = new System.Windows.Forms.RichTextBox();
this.rtbLOg = new System.Windows.Forms.RichTextBox();
this.tbiProf = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.tbMax = new System.Windows.Forms.TextBox();
this.label3 = new System.Windows.Forms.Label();
this.SuspendLayout();
//
// btLoad
//
this.btLoad.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.btLoad.Location = new System.Drawing.Point(16, 112);
this.btLoad.Name = "btLoad";
this.btLoad.Size = new System.Drawing.Size(272, 120);
this.btLoad.TabIndex = 0;
this.btLoad.Text = "Crawl!";
this.btLoad.Click += new System.EventHandler(this.btLoad_Click);
//
// tbURI
//
this.tbURI.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.tbURI.Location = new System.Drawing.Point(120, 16);
this.tbURI.Name = "tbURI";
this.tbURI.Size = new System.Drawing.Size(712, 20);
this.tbURI.TabIndex = 1;
this.tbURI.Text = "http://oraclevsmicrosoft.blogspot.com";
//
// rtbHTML
//
this.rtbHTML.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.rtbHTML.Location = new System.Drawing.Point(8, 248);
this.rtbHTML.Name = "rtbHTML";
this.rtbHTML.Size = new System.Drawing.Size(832, 136);
this.rtbHTML.TabIndex = 2;
this.rtbHTML.Text = "";
//
// rtbLOg
//
this.rtbLOg.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Right)));
this.rtbLOg.Location = new System.Drawing.Point(297, 40);
this.rtbLOg.Name = "rtbLOg";
this.rtbLOg.Size = new System.Drawing.Size(536, 200);
this.rtbLOg.TabIndex = 3;
this.rtbLOg.Text = "";
this.rtbLOg.LinkClicked += new System.Windows.Forms.LinkClickedEventHandler(this.rtbLOg_LinkClicked);
//
// tbiProf
//
this.tbiProf.Location = new System.Drawing.Point(120, 48);
this.tbiProf.Name = "tbiProf";
this.tbiProf.Size = new System.Drawing.Size(48, 20);
this.tbiProf.TabIndex = 4;
this.tbiProf.Text = "5";
//
// label1
//
this.label1.Location = new System.Drawing.Point(8, 16);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(88, 16);
this.label1.TabIndex = 5;
this.label1.Text = "Starting URI";
//
// label2
//
this.label2.Location = new System.Drawing.Point(8, 48);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(88, 16);
this.label2.TabIndex = 6;
this.label2.Text = "Max depht";
//
// tbMax
//
this.tbMax.Location = new System.Drawing.Point(120, 80);
this.tbMax.Name = "tbMax";
this.tbMax.Size = new System.Drawing.Size(104, 20);
// NOTE(review): same TabIndex (4) as tbiProf - looks unintended; confirm the wanted tab order.
this.tbMax.TabIndex = 4;
this.tbMax.Text = "1000";
//
// label3
//
this.label3.Location = new System.Drawing.Point(8, 88);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(112, 16);
// NOTE(review): same TabIndex (6) as label2 - looks unintended; confirm.
this.label3.TabIndex = 6;
this.label3.Text = "Max number of links";
//
// WebCrawler
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.ClientSize = new System.Drawing.Size(848, 398);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.tbiProf);
this.Controls.Add(this.rtbLOg);
this.Controls.Add(this.rtbHTML);
this.Controls.Add(this.tbURI);
this.Controls.Add(this.btLoad);
this.Controls.Add(this.tbMax);
this.Controls.Add(this.label3);
this.MinimumSize = new System.Drawing.Size(856, 432);
this.Name = "WebCrawler";
this.Text = "Form1";
this.ResumeLayout(false);

}
#endregion

/// <summary>
/// The main entry point for the application: creates the crawler window
/// and runs the message loop on it.
/// </summary>
[STAThread]
static void Main()
{
    WebCrawler mainForm = new WebCrawler();
    Application.Run(mainForm);
}

/// <summary>
/// Reduces an HTML document to plain text: keeps only the body element,
/// replaces every tag with a single space and finally HTML-decodes the
/// result (entities such as &amp;nbsp; and others).
/// </summary>
/// <param name="strHTML">HTML source; null is treated as empty.</param>
/// <returns>
/// The text of the body, or an empty string when the input is null or
/// contains no body element.
/// </returns>
private string deleteTagg(string strHTML)
{
    if (strHTML == null)
    {
        return "";
    }

    // Singleline lets '.' match newlines, which replaces the former
    // "(.|\n)*" alternation loop: that form captures and backtracks one
    // character at a time and becomes very slow on large pages.
    // "[^>]*" also accepts a body tag whose attributes span several lines.
    Match body = Regex.Match(strHTML, "<body[^>]*>.*</body>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (!body.Success)
    {
        return "";
    }

    // A tag is everything from '<' up to the first following '>'.
    string text = Regex.Replace(body.Value, "<[^>]*>", " ");

    return HttpUtility.HtmlDecode(text);
}
// Built once and shared by every call: a Regex created with
// RegexOptions.Compiled is expensive to construct, so it must not be
// rebuilt for each crawled page.
// NOTE(review): the trailing ".*>" is greedy, so when several links sit on
// the same line only the first one is reported.
private static readonly Regex hrefPattern = new Regex(
    "href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
    RegexOptions.IgnoreCase|RegexOptions.Compiled);

// Upper bound on the number of links returned for one page.
private const int MaxLinksPerPage = 100;

/// <summary>
/// Extracts the values of the "href" attributes found in an HTML text.
/// </summary>
/// <param name="strHTML">HTML source to scan; null is treated as empty.</param>
/// <returns>
/// The href values in document order, at most 100 of them. The array holds
/// exactly the links found (no null entries); it is empty when there are none.
/// </returns>
private string [] extractURIS(string strHTML)
{
    ArrayList links = new ArrayList();

    if (strHTML != null)
    {
        // Walk the matches one by one and stop as soon as the cap is reached.
        for (Match m = hrefPattern.Match(strHTML);
             m.Success && links.Count < MaxLinksPerPage;
             m = m.NextMatch())
        {
            // Group 1 is the attribute value, without the surrounding quotes.
            links.Add(m.Groups[1].Value);
        }
    }

    return (string[])links.ToArray(typeof(string));
}
/// <summary>
/// Downloads the resource at the given URL and returns its content as text,
/// decoded as UTF-8.
/// </summary>
/// <param name="strURL">Address to download.</param>
/// <returns>
/// The response body, or an empty string when anything goes wrong (invalid
/// address, network error, non-HTTP scheme...). Failures are reported in the
/// log box rather than thrown.
/// </returns>
private string loadUrl(string strURL)
{
    try
    {
        // Creates an HttpWebRequest with the specified URL.
        HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);

        // Sends the request and waits for the response. The using blocks
        // release the response, its stream and the reader even when reading
        // throws, so a failed download cannot leak the underlying connection.
        using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
        using (Stream receiveStream = myHttpWebResponse.GetResponseStream())
        using (StreamReader readStream = new StreamReader(receiveStream, Encoding.GetEncoding("utf-8")))
        {
            // ReadToEnd drains the stream in one call; the former loop grew a
            // string 256 characters at a time, copying it on every pass.
            return readStream.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: one unreachable page must not stop the crawl.
        rtbLOg.AppendText("error at URI : "+ strURL +"\n");
        return "";
    }
}

/// <summary>
/// Handles the "Crawl!" button: starts the crawl at the URI typed in the
/// address box, using the depth box as maximum depth, then appends the
/// number of distinct URLs recorded so far to the log.
/// </summary>
private void btLoad_Click(object sender, System.EventArgs e)
{
    int depth;
    try
    {
        depth = Convert.ToInt16(tbiProf.Text);
        // Parsed only to validate it up front: the crawl reads this box
        // itself and would otherwise fail half-way through.
        Convert.ToInt32(tbMax.Text);
    }
    catch (FormatException)
    {
        rtbLOg.AppendText("error : max depth and max number of links must be whole numbers\n");
        return;
    }
    catch (OverflowException)
    {
        rtbLOg.AppendText("error : max depth or max number of links is out of range\n");
        return;
    }

    recure(tbURI.Text, depth);
    rtbLOg.AppendText(dicURL.Count.ToString ());
}
/// <summary>
/// Crawls one page and, recursively, the pages it links to.
/// Every absolute http/https link found on the page is recorded in dicURL
/// together with the number of times it has been seen; a link seen for the
/// first time is crawled in turn with one level less, then written to the log.
/// </summary>
/// <param name="strURL">Address of the page to crawl.</param>
/// <param name="ilevel">Remaining depth; nothing is loaded when it is 0 or less.</param>
private void recure(string strURL,int ilevel)
{
    MainString="";
    if (ilevel <= 0)
    {
        return;
    }

    MainString = loadUrl(strURL);

    // Read the limit once per page instead of re-parsing the text box on
    // every loop iteration.
    int maxLinks = Convert.ToInt32(tbMax.Text);
    if (dicURL.Count >= maxLinks)
    {
        return;
    }

    // Collect the links before recursing: the recursive calls overwrite MainString.
    string []strArUris= extractURIS(MainString);
    for (int i=0; i<strArUris.Length && dicURL.Count<maxLinks; i++)
    {
        string uri = strArUris[i];

        // Relative paths, mailto:, javascript: and the like are neither
        // counted nor followed. (Previously they fell into the "already
        // seen" branch and were inserted in the dictionary with a count of 1,
        // eating into the link limit.)
        if (uri == null || !isHttpUri(uri))
        {
            continue;
        }

        if (dicURL.ContainsKey(uri))
        {
            // Already visited: only bump its occurrence counter.
            dicURL[uri]=Convert.ToString(Convert.ToInt32(dicURL[uri])+1);
        }
        else
        {
            dicURL.Add(uri,"1");
            recure(uri,ilevel-1);
            rtbLOg.AppendText(uri+" " +ilevel+"\n");
        }
    }
    //here could be a text flow extraction
}

// Matches addresses that START with http:// or https://; built once instead
// of once per link. The anchor matters: without it any text that merely
// contains "http://" somewhere would be accepted.
private static readonly Regex httpPrefix = new Regex("^https?://.+",
    RegexOptions.IgnoreCase);

// Tells whether a link is an absolute http or https address.
private static bool isHttpUri(string uri)
{
    return httpPrefix.IsMatch(uri);
}

/// <summary>
/// Handles a click on a link of the log box: opens the address with the
/// application registered for it (normally the browser) and shows the plain
/// text of the page in the lower box.
/// </summary>
private void rtbLOg_LinkClicked(object sender, System.Windows.Forms.LinkClickedEventArgs e)
{
    // The log contains text taken from crawled pages, so the link is
    // untrusted: only real http/https addresses may be handed to the shell.
    Uri target;
    try
    {
        target = new Uri(e.LinkText);
    }
    catch (UriFormatException)
    {
        rtbLOg.AppendText("error at URI : "+ e.LinkText +"\n");
        return;
    }
    if (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps)
    {
        rtbLOg.AppendText("error at URI : "+ e.LinkText +"\n");
        return;
    }

    try
    {
        // Launch in the registered browser.
        System.Diagnostics.Process.Start(e.LinkText);
    }
    catch (Win32Exception)
    {
        // No application could be started for the link; the text preview
        // below is still worth showing.
        rtbLOg.AppendText("error at URI : "+ e.LinkText +"\n");
    }

    rtbHTML.Text= deleteTagg(loadUrl(e.LinkText));
}

}
}

That's all folks! Hope this helps; comments are welcome.

¶ 5:21 AM

Comments:

very cool post

# posted by

website design New York City : June 12, 2009 2:40 AM