A web crawler (regular expressions and HttpWebRequest)
Hello everybody, I'm back on the blog!
Today I'd like to demonstrate two powerful features of the .NET Framework: the HttpWebRequest/HttpWebResponse classes and the Regex class. I will then propose an implementation of a mini web crawler.
HttpWebRequest and HttpWebResponse are both classes of the System.Net namespace.
With these objects, retrieving a text stream from a URI is as easy as this:
// Creates an HttpWebRequest with the specified URL.
HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);

// Sends the HttpWebRequest and waits for the response.
// The using blocks release the response, its stream and the reader even when
// reading fails part-way through, so no connection is left open on error.
using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
// Gets the stream associated with the response.
using (Stream receiveStream = myHttpWebResponse.GetResponseStream())
// Pipes the stream to a higher level stream reader with the required encoding format.
using (StreamReader readStream = new StreamReader(receiveStream, System.Text.Encoding.GetEncoding("utf-8")))
{
 // Reads the whole body in one call instead of concatenating 256-character
 // chunks, which copied the growing string on every iteration.
 strRes += readStream.ReadToEnd();
}
The Regex class is the .NET implementation of regular expressions. If you don't know what they are, just remember that they are the best way to perform text analysis.
For instance, here is a function which retrieves the "href" values of a web page into an array.
  // Upper bound on the number of links returned for one page.
  private const int MaxLinksPerPage = 100;

  // The href regular expression, built once: RegexOptions.Compiled makes
  // construction expensive, so it should not be repeated on every call.
  // NOTE(review): the trailing ".*>" is greedy and "." does not match a line
  // break, so only the first href on each line of HTML is reported.
  private static readonly Regex hrefPattern = new Regex(
   "href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
   RegexOptions.IgnoreCase|RegexOptions.Compiled);

  /// <summary>
  /// Extracts the values of the href attributes found in an HTML page.
  /// </summary>
  /// <param name="strHTML">HTML text to scan; null is treated as an empty page.</param>
  /// <returns>
  /// The captured href values in document order, at most
  /// <c>MaxLinksPerPage</c> of them. The array holds no null entries and is
  /// empty when nothing matches.
  /// </returns>
  private  string [] extractURIS(string strHTML)
  {
   // Regex.Match throws on a null input; report "no links" instead.
   if (strHTML == null)
   {
    return new String[0];
   }

   System.Collections.ArrayList links = new System.Collections.ArrayList();

   // Walk the successive matches; group '1' holds the href value.
   for (Match m = hrefPattern.Match(strHTML);
    m.Success && links.Count < MaxLinksPerPage;
    m = m.NextMatch())
   {
    links.Add(m.Groups[1].ToString());
   }

   return (String[])links.ToArray(typeof(String));
  }
OK, take a deep breath and look at this:
"href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>"
Let's decrypt this regular expression.
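"href" just means that the pattern starts with the word "href".
\\s stands for "any whitespace character" (the backslash is doubled because the pattern is written inside a C# string literal; the regular expression itself contains \s).
\\s* stands for "zero or more whitespace characters".
So the first part of the regex, "href\\s*=\\s*", means that the pattern we are looking for begins with "href", then optional whitespace, then the "=" character, and then optional whitespace again.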
[^\"] means "any character except a double quote" (the \ is there to escape the double quote inside the C# string literal; the regular expression itself just contains [^"]).
So [^\"]* is a succession of zero or more characters that are not double quotes.
(?'1'......) means that we give the name '1' to the group matching the ...... expression.
Here we use it twice, in the subexpression:
\"(?'1'[^\"]*)\"|(?'1'\\S+)
Knowing that "|" is a logical OR, you can guess that '1' is either
a succession of non-double-quote characters between two double quotes, or "a succession of at least one non-whitespace character" (\\S+).
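(?:....) is a non-capturing group: it groups the two alternatives together without creating a capture of its own.
A dot (.) means "any character except a line break".
So:
.*> means that our regex ends with a succession of any characters and then a ">". Because .* is greedy and never crosses a line break, the match runs up to the last ">" on the current line, so when several links share one line only the first one is captured.
Wow...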
For further info, have a look at: http://msdn.microsoft.com/library/default.asp?url=/library/en-us/cpgenref/html/cpconregularexpressionslanguageelements.asp
So let's use it in our little web crawler.
To compile the following code, save it with a ".cs" extension and run the csc.exe of your framework installation. For instance, run the following:
C:\WINDOWS\Microsoft.NET\Framework\v1.1.4322\csc.exe /target:winexe /recurse:*.*
in the directory containing the .cs file.
using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Net;
using System.IO;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Specialized;
namespace webrequest
{
/// <summary>
/// Main window of the mini web crawler.
/// </summary>
public class WebCrawler : System.Windows.Forms.Form
{
// Button labelled "Crawl!"; its Click event is wired to btLoad_Click.
private System.Windows.Forms.Button btLoad;
// Text box holding the URI passed to recure() as the starting point.
private System.Windows.Forms.TextBox tbURI;
// Receives the tag-stripped text of a page when a link in the log is clicked.
private System.Windows.Forms.RichTextBox rtbHTML;
// Log box: crawled URIs, load errors and the final dictionary count are appended here.
private System.Windows.Forms.RichTextBox rtbLOg;
// HTML returned by loadUrl() for the page recure() is currently processing.
private string MainString;
// URIs recorded during the crawl, each mapped to a counter stored as a string.
private StringDictionary dicURL;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
// Text box whose value is converted to the recursion depth given to recure().
private System.Windows.Forms.TextBox tbiProf;
// Text box whose value caps the number of entries in dicURL.
private System.Windows.Forms.TextBox tbMax;
private System.Windows.Forms.Label label3; 
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.Container components = null;
/// <summary>
/// Creates the form: allocates the URI dictionary, then builds the controls.
/// </summary>
public WebCrawler()
{
 // Table of the URIs recorded by the crawl.
 this.dicURL = new StringDictionary();

 // Required for Windows Forms Designer support.
 this.InitializeComponent();
}
/// <summary>
/// Cleans up the resources used by the form.
/// </summary>
/// <param name="disposing">
/// When true, the designer component container is disposed as well (if one
/// was created); the value is always forwarded to the base class.
/// </param>
protected override void Dispose( bool disposing )
{
 bool releaseContainer = disposing && components != null;
 if (releaseContainer)
 {
  components.Dispose();
 }

 base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support: creates every control of the form,
/// sets its layout properties, wires the Click and LinkClicked handlers and
/// adds the controls to the form.
/// </summary>
private void InitializeComponent()
{
 this.btLoad = new System.Windows.Forms.Button();
 this.tbURI = new System.Windows.Forms.TextBox();
 this.rtbHTML = new System.Windows.Forms.RichTextBox();
 this.rtbLOg = new System.Windows.Forms.RichTextBox();
 this.tbiProf = new System.Windows.Forms.TextBox();
 this.label1 = new System.Windows.Forms.Label();
 this.label2 = new System.Windows.Forms.Label();
 this.tbMax = new System.Windows.Forms.TextBox();
 this.label3 = new System.Windows.Forms.Label();
 this.SuspendLayout();
 // 
 // btLoad
 // 
 this.btLoad.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom) 
  | System.Windows.Forms.AnchorStyles.Left) 
  | System.Windows.Forms.AnchorStyles.Right)));
 this.btLoad.Location = new System.Drawing.Point(16, 112);
 this.btLoad.Name = "btLoad";
 this.btLoad.Size = new System.Drawing.Size(272, 120);
 this.btLoad.TabIndex = 0;
 this.btLoad.Text = "Crawl!";
 this.btLoad.Click += new System.EventHandler(this.btLoad_Click);
 // 
 // tbURI
 // 
 this.tbURI.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 
  | System.Windows.Forms.AnchorStyles.Right)));
 this.tbURI.Location = new System.Drawing.Point(120, 16);
 this.tbURI.Name = "tbURI";
 this.tbURI.Size = new System.Drawing.Size(712, 20);
 this.tbURI.TabIndex = 1;
 this.tbURI.Text = "http://oraclevsmicrosoft.blogspot.com";
 // 
 // rtbHTML
 // 
 this.rtbHTML.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left) 
  | System.Windows.Forms.AnchorStyles.Right)));
 this.rtbHTML.Location = new System.Drawing.Point(8, 248);
 this.rtbHTML.Name = "rtbHTML";
 this.rtbHTML.Size = new System.Drawing.Size(832, 136);
 this.rtbHTML.TabIndex = 2;
 this.rtbHTML.Text = "";
 // 
 // rtbLOg
 // 
 this.rtbLOg.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom) 
  | System.Windows.Forms.AnchorStyles.Right)));
 this.rtbLOg.Location = new System.Drawing.Point(297, 40);
 this.rtbLOg.Name = "rtbLOg";
 this.rtbLOg.Size = new System.Drawing.Size(536, 200);
 this.rtbLOg.TabIndex = 3;
 this.rtbLOg.Text = "";
 this.rtbLOg.LinkClicked += new System.Windows.Forms.LinkClickedEventHandler(this.rtbLOg_LinkClicked);
 // 
 // tbiProf
 // 
 this.tbiProf.Location = new System.Drawing.Point(120, 48);
 this.tbiProf.Name = "tbiProf";
 this.tbiProf.Size = new System.Drawing.Size(48, 20);
 this.tbiProf.TabIndex = 4;
 this.tbiProf.Text = "5";
 // 
 // label1
 // 
 this.label1.Location = new System.Drawing.Point(8, 16);
 this.label1.Name = "label1";
 this.label1.Size = new System.Drawing.Size(88, 16);
 this.label1.TabIndex = 5;
 this.label1.Text = "Starting URI";
 // 
 // label2
 // 
 this.label2.Location = new System.Drawing.Point(8, 48);
 this.label2.Name = "label2";
 this.label2.Size = new System.Drawing.Size(88, 16);
 this.label2.TabIndex = 6;
 // Spelling of the caption corrected (was "Max depht").
 this.label2.Text = "Max depth";
 // 
 // tbMax
 // 
 this.tbMax.Location = new System.Drawing.Point(120, 80);
 this.tbMax.Name = "tbMax";
 this.tbMax.Size = new System.Drawing.Size(104, 20);
 // Was 4, the same value as tbiProf; sibling controls need distinct values
 // for the tab order to be well defined.
 this.tbMax.TabIndex = 7;
 this.tbMax.Text = "1000";
 // 
 // label3
 // 
 this.label3.Location = new System.Drawing.Point(8, 88);
 this.label3.Name = "label3";
 this.label3.Size = new System.Drawing.Size(112, 16);
 // Was 6, the same value as label2.
 this.label3.TabIndex = 8;
 this.label3.Text = "Max number of links";
 // 
 // WebCrawler
 // 
 this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
 this.ClientSize = new System.Drawing.Size(848, 398);
 this.Controls.Add(this.label2);
 this.Controls.Add(this.label1);
 this.Controls.Add(this.tbiProf);
 this.Controls.Add(this.rtbLOg);
 this.Controls.Add(this.rtbHTML);
 this.Controls.Add(this.tbURI);
 this.Controls.Add(this.btLoad);
 this.Controls.Add(this.tbMax);
 this.Controls.Add(this.label3);
 this.MinimumSize = new System.Drawing.Size(856, 432);
 this.Name = "WebCrawler";
 this.Text = "Form1";
 this.ResumeLayout(false);
}
#endregion
/// <summary>
/// Main entry point of the application: shows the crawler form and runs the
/// message loop until the form is closed.
/// </summary>
[STAThread]
static void Main() 
{
 WebCrawler mainForm = new WebCrawler();
 Application.Run(mainForm);
}
// Matches from the opening <body ...> tag through the last </body>.
// [\s\S] matches any character including a line break; it replaces the
// original per-character alternation (.|\n) and matches the same text.
private static readonly Regex bodyPattern = new Regex("<body.*>[\\s\\S]*</body>",
 RegexOptions.IgnoreCase);

// Matches one tag, from "<" to the nearest ">", even across line breaks.
private static readonly Regex tagPattern = new Regex("<[\\s\\S]*?>",
 RegexOptions.IgnoreCase);

/// <summary>
/// Keeps only the body of an HTML document, replaces every tag with a single
/// space, then HTML-decodes the result (entities such as &amp;nbsp;).
/// </summary>
/// <param name="strHTML">HTML text of the page; null is treated as empty.</param>
/// <returns>
/// The decoded text of the body, or an empty string when the input is null
/// or contains no body element.
/// </returns>
private  string deleteTagg(string strHTML)
{
 // Regex.Match throws on a null input; report "no text" instead.
 if (strHTML == null)
 {
  return "";
 }

 Match body = bodyPattern.Match(strHTML);
 if (!body.Success)
 {
  return "";
 }

 string withoutTags = tagPattern.Replace(body.Value, " ");
 return HttpUtility.HtmlDecode(withoutTags);
}
// Upper bound on the number of links returned for one page.
private const int MaxLinksPerPage = 100;

// The href regular expression, built once: RegexOptions.Compiled makes
// construction expensive, so it should not be repeated on every call.
// NOTE(review): the trailing ".*>" is greedy and "." does not match a line
// break, so only the first href on each line of HTML is reported.
private static readonly Regex hrefPattern = new Regex(
 "href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
 RegexOptions.IgnoreCase|RegexOptions.Compiled);

/// <summary>
/// Extracts the values of the href attributes found in an HTML page.
/// </summary>
/// <param name="strHTML">HTML text to scan; null is treated as an empty page.</param>
/// <returns>
/// The captured href values in document order, at most
/// <c>MaxLinksPerPage</c> of them. The array holds no null entries and is
/// empty when nothing matches.
/// </returns>
private  string [] extractURIS(string strHTML)
{
 // Regex.Match throws on a null input; report "no links" instead.
 if (strHTML == null)
 {
  return new String[0];
 }

 ArrayList links = new ArrayList();

 // Walk the successive matches; group '1' holds the href value.
 for (Match m = hrefPattern.Match(strHTML);
  m.Success && links.Count < MaxLinksPerPage;
  m = m.NextMatch())
 {
  links.Add(m.Groups[1].ToString());
 }

 return (String[])links.ToArray(typeof(String));
}
/// <summary>
/// Downloads the document at the given URL and returns its text, decoded as UTF-8.
/// </summary>
/// <param name="strURL">Address of the document to fetch.</param>
/// <returns>
/// The text of the response, or an empty string when the request fails for
/// any reason; the failure is written to the log box instead of being thrown.
/// </returns>
private string  loadUrl(string strURL)
{
 try 
 {
  // Creates an HttpWebRequest with the specified URL.
  HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);

  // The using blocks release the response, its stream and the reader even
  // when reading fails part-way through, so no connection is left open.
  using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
  using (Stream receiveStream = myHttpWebResponse.GetResponseStream())
  using (StreamReader readStream = new StreamReader(receiveStream, Encoding.GetEncoding("utf-8")))
  {
   // Reads the whole body in one call instead of concatenating
   // 256-character chunks, which copied the growing string each time.
   return readStream.ReadToEnd();
  }
 }
 catch(Exception exp)
 {
  // Best effort: a page that cannot be loaded must not stop the crawl.
  // The reason is logged so the failure can be diagnosed.
  rtbLOg.AppendText("error at URI : "+ strURL +" (" + exp.Message + ")\n");
  return "";
 }
}
/// <summary>
/// Handles the "Crawl!" button: starts the crawl at the URI in tbURI with the
/// depth in tbiProf, then logs the number of URIs recorded in dicURL.
/// A depth or link limit that is not a valid number is reported in the log
/// and the crawl is not started.
/// </summary>
private void btLoad_Click(object sender, System.EventArgs e)
{
 short maxDepth;
 try
 {
  maxDepth = Convert.ToInt16(tbiProf.Text);
  // recure() converts tbMax itself; converting it here as well rejects a
  // bad limit before the crawl starts instead of failing in the middle.
  Convert.ToInt32(tbMax.Text);
 }
 catch (FormatException)
 {
  rtbLOg.AppendText("invalid max depth or max number of links\n");
  return;
 }
 catch (OverflowException)
 {
  rtbLOg.AppendText("invalid max depth or max number of links\n");
  return;
 }

 recure(tbURI.Text, maxDepth);

 // End the line so the next crawl's first entry is not glued to the count.
 rtbLOg.AppendText(dicURL.Count.ToString() + "\n");
}
// Accepts only URIs that start with "http://" (case-insensitive). The
// pattern is anchored so that an address merely containing "http://" further
// along is rejected. https links are not matched.
private static readonly Regex httpUriPattern = new Regex("^http://.+",
 RegexOptions.IgnoreCase);

/// <summary>
/// Crawls recursively from a page: loads it, extracts its links and follows
/// every http link not yet recorded in dicURL.
/// </summary>
/// <param name="strURL">Address of the page to load.</param>
/// <param name="ilevel">
/// Remaining depth; nothing is loaded when it is zero or negative, and each
/// followed link is crawled with this value minus one.
/// </param>
/// <remarks>
/// A link seen for the first time is added to dicURL with the counter "1",
/// crawled, then written to the log with the current level. A link already
/// recorded only has its counter incremented. Links that do not start with
/// http:// are ignored and never enter dicURL. Following stops once dicURL
/// holds as many entries as the number in tbMax.
/// </remarks>
private void recure(string strURL,int  ilevel)
{
 MainString="";

 if (ilevel <= 0)
 {
  return;
 }

 MainString = loadUrl(strURL);

 // Parsed once per page rather than on every loop iteration.
 int maxLinks = Convert.ToInt32(tbMax.Text);
 if (dicURL.Count >= maxLinks)
 {
  return;
 }

 // Collect the links now: the nested calls below overwrite MainString.
 string []strArUris= extractURIS(MainString);
 for (int i=0; i<strArUris.Length && dicURL.Count<maxLinks; i++)
 {
  string uri = strArUris[i];

  // Skip empty slots and anything that is not an http:// address, so
  // such links are neither counted nor crawled.
  if (uri == null || !httpUriPattern.IsMatch(uri))
  {
   continue;
  }

  if (dicURL.ContainsKey(uri))
  {
   // Already recorded: just count one more reference.
   dicURL[uri]=Convert.ToString(Convert.ToInt32(dicURL[uri])+1);
  }
  else
  {
   // Record before recursing so that pages linking back here are not
   // crawled again.
   dicURL.Add(uri,"1");
   recure(uri,ilevel-1);
   rtbLOg.AppendText(uri+"  " +ilevel+"\n");
  }
 }
}
/// <summary>
/// Handles a click on a link in the log box: opens the link through
/// Process.Start, then downloads the same address and shows its tag-stripped
/// text in rtbHTML.
/// </summary>
private void rtbLOg_LinkClicked(object sender, System.Windows.Forms.LinkClickedEventArgs e)
{
 string clickedUri = e.LinkText;

 // Hand the address to the operating system so it opens in the browser.
 System.Diagnostics.Process.Start(clickedUri);

 string pageHtml = loadUrl(clickedUri);
 rtbHTML.Text = deleteTagg(pageHtml);
}
}
}
That's all, folks! Hope this helps; comments are welcome.