.comment-link {margin-left:.6em;}
Marc Boizeau's blog
Friday, June 24, 2005
  self description
Somebody asked for the code of the self-describing C# program I wrote a while ago. Here it is:


using System;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.CodeDom;
using System.CodeDom.Compiler;
using Microsoft.CSharp;
using System.Collections.Specialized;
namespace WindowsApplication2
{
/// Main form

public class Form1 : System.Windows.Forms.Form
{
private System.Windows.Forms.Button button2;
private System.Windows.Forms.Button button1;
private System.Windows.Forms.Button bt_itself;
private System.Windows.Forms.RichTextBox rt1;
private System.Windows.Forms.ListBox listBox1;
private System.ComponentModel.Container components = null;

// Creates the form; all controls are built by the designer-generated InitializeComponent().
public Form1()
{
//
// Required for Windows Forms Designer support
//
InitializeComponent();
}

/// <summary>
/// Cleans up the resources used by the form.
/// </summary>
/// <param name="disposing">
/// When true, the designer component container (if one was created) is
/// disposed before the base class performs its own cleanup.
/// </param>
protected override void Dispose( bool disposing )
{
    bool hasComponents = components != null;
    if (disposing && hasComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}

#region Code généré par le Concepteur Windows Form
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// Creates the list box, the rich text box and the three buttons, wires the
/// button Click handlers and the form Load handler, and adds the controls to the form.
/// </summary>
// NOTE(review): bt_itself and button1 are both assigned TabIndex 0 below.
private void InitializeComponent() {
this.listBox1 = new System.Windows.Forms.ListBox();
this.rt1 = new System.Windows.Forms.RichTextBox();
this.bt_itself = new System.Windows.Forms.Button();
this.button1 = new System.Windows.Forms.Button();
this.button2 = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// listBox1 (receives the compiler output in button2_Click)
//
this.listBox1.Location = new System.Drawing.Point(24, 360);
this.listBox1.Name = "listBox1";
this.listBox1.Size = new System.Drawing.Size(1016, 108);
this.listBox1.TabIndex = 3;
//
// rt1 (shows the generated source; also the input of the compile button)
//
this.rt1.Location = new System.Drawing.Point(24, 16);
this.rt1.Name = "rt1";
this.rt1.Size = new System.Drawing.Size(632, 328);
this.rt1.TabIndex = 1;
this.rt1.Text = "richTextBox1";
//
// bt_itself ("it self": writes this program's own source into rt1)
//
this.bt_itself.Location = new System.Drawing.Point(696, 40);
this.bt_itself.Name = "bt_itself";
this.bt_itself.Size = new System.Drawing.Size(72, 24);
this.bt_itself.TabIndex = 0;
this.bt_itself.Text = "it self";
this.bt_itself.Click += new System.EventHandler(this.bt_itself_Click);
//
// button1 ("genere": generates the echo program with CodeDom)
//
this.button1.Location = new System.Drawing.Point(696, 8);
this.button1.Name = "button1";
this.button1.Size = new System.Drawing.Size(72, 24);
this.button1.TabIndex = 0;
this.button1.Text = "genere";
this.button1.Click += new System.EventHandler(this.button1_Click);
//
// button2 ("compile": compiles the text of rt1)
//
this.button2.Location = new System.Drawing.Point(696, 72);
this.button2.Name = "button2";
this.button2.Size = new System.Drawing.Size(72, 24);
this.button2.TabIndex = 4;
this.button2.Text = "compile";
this.button2.Click += new System.EventHandler(this.button2_Click);

//
// Form1
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.ClientSize = new System.Drawing.Size(1032, 526);
this.Controls.Add(this.button2);
this.Controls.Add(this.listBox1);
this.Controls.Add(this.rt1);
this.Controls.Add(this.button1);
this.Controls.Add(this.bt_itself);
this.Name = "Form1";
this.Text = "Form1";
this.Load += new System.EventHandler(this.Form1_Load);
this.ResumeLayout(false);
}
#endregion

/// <summary>
/// Main entry point of the application: starts it with a new Form1 as the main form.
/// </summary>
[STAThread]
static void Main()
{
Application.Run(new Form1());
}
// Basic string helpers.

/// <summary>
/// Concatenates the elements of <paramref name="arr"/>, placing
/// <paramref name="sep"/> between consecutive elements only.
/// </summary>
/// <param name="arr">The strings to concatenate; null elements count as empty strings.</param>
/// <param name="sep">The separator inserted between elements.</param>
/// <returns>The joined text, or an empty string when <paramref name="arr"/> has no elements.</returns>
/// <remarks>
/// The earlier implementation called Substring() to drop the trailing separator
/// but discarded its result, so the separator was still returned at the end
/// (and an empty array made Substring throw). String.Join gives the intended result.
/// NOTE: bt_itself_Click embeds a textual copy of this file; mirror any change
/// there if the program must keep reproducing its exact source.
/// </remarks>
string join(string[] arr,string sep)
{
    return String.Join(sep, arr);
}

/// <summary>
/// Overwrites, in place, every element of <paramref name="arr"/> that equals
/// <paramref name="strSrc"/> with <paramref name="strReplace"/>.
/// </summary>
/// <param name="arr">The array to scan and modify.</param>
/// <param name="strSrc">The exact element value to look for.</param>
/// <param name="strReplace">The value stored in place of each match.</param>
void seekandreplace(string[] arr,string strSrc,string strReplace)
{
    int index = 0;
    while (index < arr.Length)
    {
        if (arr[index] == strSrc)
        {
            arr[index] = strReplace;
        }
        index++;
    }
}
/// <summary>
/// Escapes <paramref name="str"/> so it can be written between the double
/// quotes of a C# string literal: backslashes are doubled and double quotes
/// get a leading backslash.
/// </summary>
/// <param name="str">The raw text.</param>
/// <returns>The escaped text.</returns>
string format (string str)
{
    // Backslashes go first, so the ones added in front of the quotes are not doubled again.
    string backslashesEscaped = str.Replace("\\", "\\\\");
    return backslashesEscaped.Replace("\"", "\\\"");
}
/// <summary>
/// Escapes every element of <paramref name="arr"/> with format() and joins
/// the results, placing <paramref name="sep"/> between consecutive elements.
/// </summary>
/// <param name="arr">The raw lines to escape and join.</param>
/// <param name="sep">The separator inserted between escaped elements.</param>
/// <returns>The joined text, or an empty string when <paramref name="arr"/> has no elements.</returns>
/// <remarks>
/// The earlier version appended the separator after every element and trimmed
/// it with Substring(), which threw on an empty array and rebuilt the whole
/// string on each iteration.
/// </remarks>
string joinformat(string[] arr,string sep)
{
    string[] escaped = new string[arr.Length];
    for (int i = 0; i < arr.Length; i++)
    {
        escaped[i] = format(arr[i]);
    }

    return String.Join(sep, escaped);
}
// Load handler wired in InitializeComponent(); intentionally does nothing.
private void Form1_Load(object sender, System.EventArgs e)
{
//nothing special here
}

/// <summary>
/// "genere" button: builds a small program with CodeDom and shows its C# source in rt1.
/// The generated program is a namespace "Samples" containing a type "echo" whose
/// entry point reads one line from the console and writes it back.
/// </summary>
private void button1_Click(object sender, System.EventArgs e)
{
    // 1. The CodeCompileUnit is the abstract, language-neutral program graph.
    CodeCompileUnit unit = new CodeCompileUnit();

    // namespace Samples { using System; ... }
    CodeNamespace samplesNamespace = new CodeNamespace("Samples");
    samplesNamespace.Imports.Add(new CodeNamespaceImport("System"));
    unit.Namespaces.Add(samplesNamespace);

    // The type that hosts the entry point; it is named "echo".
    CodeTypeDeclaration echoType = new CodeTypeDeclaration("echo");
    samplesNamespace.Types.Add(echoType);

    // System.String str1;
    CodeVariableDeclarationStatement declareInput =
        new CodeVariableDeclarationStatement("System.String", "str1");

    // str1 = System.Console.ReadLine();
    CodeAssignStatement readInput = new CodeAssignStatement(
        new CodeVariableReferenceExpression("str1"),
        new CodeMethodInvokeExpression(
            new CodeTypeReferenceExpression("System.Console"), "ReadLine"));

    // System.Console.WriteLine(str1);
    CodeMethodInvokeExpression writeOutput = new CodeMethodInvokeExpression(
        new CodeTypeReferenceExpression("System.Console"), "WriteLine");
    writeOutput.Parameters.Add(new CodeVariableReferenceExpression(declareInput.Name));

    // The entry point method: declaration, read, then write.
    CodeEntryPointMethod entryPoint = new CodeEntryPointMethod();
    entryPoint.Statements.Add(declareInput);
    entryPoint.Statements.Add(readInput);
    entryPoint.Statements.Add(new CodeExpressionStatement(writeOutput));
    echoType.Members.Add(entryPoint);

    // 2. Render the graph as C# source text.
    System.IO.StringWriter output = new System.IO.StringWriter();
    CSharpCodeProvider provider = new CSharpCodeProvider();
    ICodeGenerator generator = provider.CreateGenerator(output);

    CodeGeneratorOptions options = new CodeGeneratorOptions();
    options.BlankLinesBetweenMembers = true; // blank line between generated members

    try
    {
        generator.GenerateCodeFromCompileUnit(unit, output, options);
    }
    catch (Exception failure)
    {
        System.Windows.Forms.MessageBox.Show(failure.Message);
    }

    // Whatever was written (possibly nothing after a failure) goes to the text box.
    rt1.Text = output.ToString();
}

/// <summary>
/// "compile" button: compiles the source text currently in rt1 into the
/// executable c:\echo.exe, lists the compiler output in listBox1 and reports
/// the outcome in a message box.
/// </summary>
/// <remarks>
/// Changes from the earlier version: the empty entry was removed from the
/// referenced assemblies (an empty name designates no assembly), and a failed
/// compilation is now reported explicitly instead of showing
/// CompilerResults.PathToAssembly, which is not set when no assembly was produced.
/// </remarks>
private void button2_Click(object sender, System.EventArgs e)
{
    // Instantiate the C# compiler.
    CSharpCodeProvider provider = new CSharpCodeProvider();
    ICodeCompiler compiler = provider.CreateCompiler();

    CompilerParameters cp = new CompilerParameters();
    cp.GenerateExecutable = true;
    cp.CompilerOptions = " /target:exe";
    cp.ReferencedAssemblies.AddRange(
        new string[] { "System.Windows.Forms.dll", "System.dll", "System.drawing.dll" });
    // Where the executable will be saved.
    cp.OutputAssembly = "c:\\echo.exe";

    // Invoke the compilation on the text box content.
    CompilerResults cr = compiler.CompileAssemblyFromSource(cp, rt1.Text);

    // Always show the raw compiler output; it is what explains a failure.
    listBox1.DataSource = cr.Output;

    if (cr.Errors.HasErrors)
    {
        MessageBox.Show("Compilation failed; see the output list for details.");
        return;
    }

    // Success: tell the user where the executable is.
    MessageBox.Show(cr.PathToAssembly);
}

// "it self" button: rebuilds this program's own source text and shows it in rt1.
// arr holds the source one line per element, except for the single spot where the
// array's own contents belong: that element is a placeholder sentence.
// NOTE(review): arr is a hand-maintained copy of the source; changes made elsewhere
// in the file are not reflected in the output unless they are mirrored here.
private void bt_itself_Click(object sender, System.EventArgs e)
{
string[] arr=new string[]{
"using System;",
"using System.Collections;",
"using System.ComponentModel;",
"using System.Windows.Forms;",
"using System.CodeDom;",
"using System.CodeDom.Compiler;",
"using Microsoft.CSharp;",
"using System.Collections.Specialized;",
"namespace WindowsApplication2",
"{",
" /// Main form",
"",
" public class Form1 : System.Windows.Forms.Form",
" {",
" private System.Windows.Forms.Button button2;",
" private System.Windows.Forms.Button button1;",
" private System.Windows.Forms.Button bt_itself;",
" private System.Windows.Forms.RichTextBox rt1;",
" private System.Windows.Forms.ListBox listBox1;",
" private System.ComponentModel.Container components = null;",
"",
" public Form1()",
" {",
" //",
" // Requis pour la prise en charge du Concepteur Windows Forms",
" //",
" InitializeComponent();",
" }",
"",
" /// Nettoyage des ressources utilisées.",
" protected override void Dispose( bool disposing )",
" {",
" if( disposing )",
" {",
" if (components != null)",
" {",
" components.Dispose();",
" }",
" }",
" base.Dispose( disposing );",
" }",
"",
" #region Code généré par le Concepteur Windows Form",
" /// <summary>",
" /// Méthode requise pour la prise en charge du concepteur - ne modifiez pas",
" /// le contenu de cette méthode avec l'éditeur de code.",
" /// </summary>",
" private void InitializeComponent() {",
" this.listBox1 = new System.Windows.Forms.ListBox();",
" this.rt1 = new System.Windows.Forms.RichTextBox();",
" this.bt_itself = new System.Windows.Forms.Button();",
" this.button1 = new System.Windows.Forms.Button();",
" this.button2 = new System.Windows.Forms.Button();",
" this.SuspendLayout();",
" // ",
" // listBox1",
" // ",
" this.listBox1.Location = new System.Drawing.Point(24, 360);",
" this.listBox1.Name = \"listBox1\";",
" this.listBox1.Size = new System.Drawing.Size(1016, 108);",
" this.listBox1.TabIndex = 3;",
" // ",
" // rt1",
" // ",
" this.rt1.Location = new System.Drawing.Point(24, 16);",
" this.rt1.Name = \"rt1\";",
" this.rt1.Size = new System.Drawing.Size(632, 328);",
" this.rt1.TabIndex = 1;",
" this.rt1.Text = \"richTextBox1\";",
" // ",
" // bt_itself",
" // ",
" this.bt_itself.Location = new System.Drawing.Point(696, 40);",
" this.bt_itself.Name = \"bt_itself\";",
" this.bt_itself.Size = new System.Drawing.Size(72, 24);",
" this.bt_itself.TabIndex = 0;",
" this.bt_itself.Text = \"it self\";",
" this.bt_itself.Click += new System.EventHandler(this.bt_itself_Click);",
" // ",
" // button1",
" // ",
" this.button1.Location = new System.Drawing.Point(696, 8);",
" this.button1.Name = \"button1\";",
" this.button1.Size = new System.Drawing.Size(72, 24);",
" this.button1.TabIndex = 0;",
" this.button1.Text = \"genere\";",
" this.button1.Click += new System.EventHandler(this.button1_Click);",
" // ",
" // button2",
" // ",
" this.button2.Location = new System.Drawing.Point(696, 72);",
" this.button2.Name = \"button2\";",
" this.button2.Size = new System.Drawing.Size(72, 24);",
" this.button2.TabIndex = 4;",
" this.button2.Text = \"compile\";",
" this.button2.Click += new System.EventHandler(this.button2_Click);",
" ",
" // ",
" // Form1",
" // ",
" this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);",
" this.ClientSize = new System.Drawing.Size(1032, 526);",
" this.Controls.Add(this.button2);",
" this.Controls.Add(this.listBox1);",
" this.Controls.Add(this.rt1);",
" this.Controls.Add(this.button1);",
" this.Controls.Add(this.bt_itself);",
" this.Name = \"Form1\";",
" this.Text = \"Form1\";",
" this.Load += new System.EventHandler(this.Form1_Load);",
" this.ResumeLayout(false);",
" }",
" #endregion",
"",
" /// Point d'entrée principal de l'application.",
" [STAThread]",
" static void Main()",
" {",
" Application.Run(new Form1());",
" }",
"// methode de base ",
" string join(string[] arr,string sep)",
" {string strReturn=\"\" ;",
" foreach (string elt in arr)",
" strReturn += elt+sep;",
" strReturn.Substring(0,strReturn.Length-sep.Length);",
" ",
" return strReturn;",
" }",
"",
" void seekandreplace(string[] arr,string strSrc,string strReplace)",
" {",
" int i=0;",
" for(i=0;i<=arr.Length-1;i++)",
" { if(arr[i]==strSrc)",
" arr[i]=strReplace;",
" }",
" }",
" string format (string str)",
" {",
" str = str.Replace(\"\\\\\",\"\\\\\\\\\");",
" str = str.Replace(\"\\\"\",\"\\\\\\\"\");",
" return str; ",
" }",
" string joinformat(string[] arr,string sep)",
" {",
" string strReturn= \"\" ;",
" foreach (string elt in arr)",
" { strReturn +=format( elt)+sep;}",
" ",
" strReturn = strReturn.Substring( 0,strReturn.Length-sep.Length);",
" ",
" return strReturn;",
" }",
" private void Form1_Load(object sender, System.EventArgs e)",
" {",
" //nothing special here ",
" }",
"",
" private void button1_Click(object sender, System.EventArgs e)",
" {// here is starting the main part of the code:",
" //",
" //the CodeCompileUnit is the abstract code structure:",
"",
" // Create a new CodeCompileUnit to contain the program graph",
" CodeCompileUnit CompileUnit = new CodeCompileUnit();",
" // Declare a new namespace called Samples.",
" CodeNamespace Samples = new CodeNamespace(\"Samples\");",
" // Add the new namespace to the compile unit.",
" CompileUnit.Namespaces.Add( Samples );",
"",
" // Add the new namespace import for the System namespace.",
" Samples.Imports.Add( new CodeNamespaceImport(\"System\") );",
"",
" // Declare a new type called Class1.",
" CodeTypeDeclaration Class1 = new CodeTypeDeclaration(\"echo\");",
" // Add the new type to the namespace's type collection.",
" Samples.Types.Add(Class1);",
"",
" //a variable declaration",
" CodeVariableDeclarationStatement Var = new",
" CodeVariableDeclarationStatement(\"System.String\",\"str1\");",
" // Declare a new code entry point method",
" CodeEntryPointMethod Start = new CodeEntryPointMethod();",
" // Create a new method invocation expression.",
" CodeMethodInvokeExpression cs1 =",
" new CodeMethodInvokeExpression(",
" // Call the System.Console.WriteLine method.",
" ",
" new CodeTypeReferenceExpression(\"System.Console\"), \"ReadLine\");",
" CodeAssignStatement as1 = new CodeAssignStatement(",
" new CodeVariableReferenceExpression(\"str1\"),cs1);",
"",
" // Create a new method invocation expression.",
" CodeMethodInvokeExpression cs2 =",
" new CodeMethodInvokeExpression(",
" // Call the System.Console.WriteLine method.",
" new CodeTypeReferenceExpression(\"System.Console\"), \"WriteLine\");",
" cs2.Parameters.Add(new CodeVariableReferenceExpression(Var.Name));",
"",
" Start.Statements.Add(Var);",
" // Add the new method code statement.",
" Start.Statements.Add( as1);",
" // Add the new method code statement.",
" Start.Statements.Add(new CodeExpressionStatement(cs2));",
"",
" // Add the code entry point method to the type's members collection",
" Class1.Members.Add( Start );",
"",
"",
" System.IO.StringWriter Sw = new System.IO.StringWriter();",
"",
" ",
" Microsoft.CSharp.CSharpCodeProvider provider =",
" new CSharpCodeProvider();",
" System.CodeDom.Compiler.ICodeGenerator generator = provider.CreateGenerator(Sw);",
" CodeGeneratorOptions genOptions = new CodeGeneratorOptions();",
"",
" // The code generator should insert blank lines ",
" genOptions.BlankLinesBetweenMembers = true;",
"",
" try",
" {",
" generator.GenerateCodeFromCompileUnit(CompileUnit,Sw,genOptions);",
" }",
" catch (Exception Exc)",
" {",
" System.Windows.Forms.MessageBox.Show (Exc.Message);",
" }",
" rt1.Text = Sw.ToString();",
"",
" }",
" ",
" private void button2_Click(object sender, System.EventArgs e)",
" {",
" Microsoft.CSharp.CSharpCodeProvider provider =",
" new CSharpCodeProvider();",
" ///",
"// Compilation ",
" //instanciate csharp compiler",
" System.CodeDom.Compiler.ICodeCompiler MyCompiler = provider.CreateCompiler();",
" System.CodeDom.Compiler.CompilerParameters cp = new CompilerParameters();",
" cp.GenerateExecutable = true;",
" cp.CompilerOptions= \" /target:exe\";",
" cp.ReferencedAssemblies.AddRange( new string[]{\"System.Windows.Forms.dll\",\"System.dll\",\"System.drawing.dll\",\"\"});",
" //where your exe will be saved",
" cp.OutputAssembly = \"c:\\\\echo.exe\";",
" // Invoke compilation.",
" CompilerResults cr = MyCompiler.CompileAssemblyFromSource ( cp,rt1.Text);",
" //",
" // Return the results of compilation.",
"",
" //eventually load compilation output to the listbox (usefull to debug)",
" listBox1.DataSource=cr.Output;",
"",
" //where is it? ",
" MessageBox.Show ( cr.PathToAssembly );",
" }",
"",
" private void bt_itself_Click(object sender, System.EventArgs e)",
" {",
" string[] arr=new string[]{",
"Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone",
" };",
" seekandreplace(arr,\"Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone\",\"\\\"\"+joinformat(arr,\"\\\",\\n\\\"\")+\"\\\"\");",
" this.rt1.Text = join(arr,\"\\n\");",
" }",
" }",
"}"
};
// Replace the placeholder element with the array itself, rendered as escaped string
// literals: joinformat() escapes each line and separates them with  ",<newline>"
// and the surrounding "\"" adds the opening and closing quotes. The joinformat()
// argument is evaluated before seekandreplace() modifies arr, so the rendered copy
// still contains the (quoted) placeholder.
seekandreplace(arr,"Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone","\""+joinformat(arr,"\",\n\"")+"\"");
// Glue the lines back together and display the reconstructed source.
this.rt1.Text = join(arr,"\n");
}
}
}


 
Friday, June 17, 2005
  A web crawler(regular expressions and httprequest)
Hello everybody, i'm back to the blog!

Today I'd like to demonstrate two powerful features of the .NET Framework: HttpWebRequest/HttpWebResponse and the Regex class. I will then propose an implementation of a mini web crawler.

HttpWebRequest and HttpWebResponse are both classes of the System.Net namespace.
With these objects, retrieving a text stream from a URI is as easy as this:


// Downloads the text behind strURL and appends it to strRes.
// (strURL and strRes are declared by the surrounding code.)

// Creates an HttpWebRequest with the specified URL.
HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(strURL);

// Sends the HttpWebRequest and waits for the response.
HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
try
{
    // Gets the stream associated with the response and pipes it to a
    // higher level stream reader with the required encoding format.
    Stream receiveStream = myHttpWebResponse.GetResponseStream();
    Encoding encode = System.Text.Encoding.GetEncoding("utf-8");
    StreamReader readStream = new StreamReader( receiveStream, encode );
    try
    {
        // Reads the whole body in one call instead of concatenating 256-character chunks.
        strRes += readStream.ReadToEnd();
    }
    finally
    {
        // Releases the resources of the Stream, even if reading failed.
        readStream.Close();
    }
}
finally
{
    // Releases the resources of the response, even if reading failed.
    myHttpWebResponse.Close();
}


The Regex class is the .NET implementation of regular expressions. If you don't know what they are, just remember that they are the best way to perform text analysis.
For instance, here is a function which retrieves the "href"s of a web page into an array.


/// <summary>
/// Extracts the values of the href attributes found in <paramref name="strHTML"/>.
/// </summary>
/// <param name="strHTML">The HTML text to scan.</param>
/// <returns>
/// The href values in order of appearance, at most 100 of them. The array is
/// exactly as long as the number of links found (the earlier version returned
/// a fixed 101-element array whose first slot and unused slots were null).
/// </returns>
private string [] extractURIS(string strHTML)
{
    const int maxLinks = 100;

    // href = "value"  or  href = value ; the value is captured in group 1.
    // NOTE(review): the trailing ".*>" is greedy and "." does not cross line
    // breaks, so at most one href is reported per line of HTML.
    Regex r = new Regex("href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
        RegexOptions.IgnoreCase|RegexOptions.Compiled);

    // Collect the matches from slot 0 upwards.
    string[] buffer = new string[maxLinks];
    int count = 0;
    for (Match m = r.Match(strHTML); m.Success && count < maxLinks; m = m.NextMatch())
    {
        buffer[count] = m.Groups[1].ToString();
        count++;
    }

    // Return only the filled part.
    string[] results = new string[count];
    Array.Copy(buffer, results, count);
    return results;
}



OK, take a deep breath and look at this:
"href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>"
Let's decode this regular expression.
"href" simply means that the pattern starts with the literal word "href".
\\s stands for "any whitespace character".
\\s* stands for "zero or more whitespace characters".
So the first part of the regex, "href\\s*=\\s*", means that the pattern we are looking for begins with "href", then optional whitespace, then the "=" character, and then optional whitespace again.

[^\"] means "any character except a double quote" (the \ is there to escape the double quote inside the C# string literal),
so [^\"]* is a run of zero or more characters containing no double quote.

(?'1'......) means that we give the name '1' to whatever the ...... expression captures.
Here we use it twice in the subexpression:
\"(?'1'[^\"]*)\"|(?'1'\\S+)
Knowing that "|" is a logical OR, you can guess that '1' is either
a run of characters without any double quote, enclosed between two double quotes, or a run of at least one non-whitespace character (\\S+).

(?:....) is a non-capturing group: it only groups the two alternatives together and does not create a capture of its own.

A dot (.) means "any character" (except a line break),
so:
.*> means that our regex ends with a run of any characters followed by a ">".

wow...
for further info have a look to : http://msdn.microsoft.com/library/default.asp?url=/library/en-us/cpgenref/html/cpconregularexpressionslanguageelements.asp


So lets use it in our little web crawler.

To compile the following, save it with a ".cs" extension and use the csc.exe of your framework installation; for instance, run the following:
C:\WINDOWS\Microsoft.NET\Framework\v1.1.4322\csc.exe /target:winexe /recurse:*.*
in the directory that contains the .cs file.


using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Net;
using System.IO;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Specialized;

namespace webrequest
{
/// <summary>
/// Description résumée de Form1.
/// </summary>
public class WebCrawler : System.Windows.Forms.Form
{
private System.Windows.Forms.Button btLoad;
private System.Windows.Forms.TextBox tbURI;
private System.Windows.Forms.RichTextBox rtbHTML;
private System.Windows.Forms.RichTextBox rtbLOg;

private string MainString;
private StringDictionary dicURL;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.TextBox tbiProf;
private System.Windows.Forms.TextBox tbMax;
private System.Windows.Forms.Label label3;
/// <summary>
/// Variable nécessaire au concepteur.
/// </summary>
private System.ComponentModel.Container components = null;

// Creates the crawler form: allocates the URL dictionary, then builds the controls.
public WebCrawler()
{
// dicURL is filled by recure() with the URLs found while crawling.
dicURL = new StringDictionary();
//
// Required for Windows Forms Designer support
//
InitializeComponent();

//
// TODO: add constructor code after the call to InitializeComponent
//
}

/// <summary>
/// Cleans up the resources used by the form.
/// </summary>
/// <param name="disposing">
/// When true, the designer component container (if one was created) is
/// disposed before the base class performs its own cleanup.
/// </param>
protected override void Dispose( bool disposing )
{
    bool hasComponents = components != null;
    if (disposing && hasComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}

#region Code généré par le Concepteur Windows Form
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// Creates the crawl button, the URI / depth / max-links text boxes with their
/// labels and the two rich text boxes, wires the event handlers and adds the
/// controls to the form.
/// </summary>
// NOTE(review): tbiProf and tbMax share TabIndex 4, label2 and label3 share
// TabIndex 6, label2's caption reads "Max depht" (presumably "Max depth") and
// the form caption is still "Form1".
private void InitializeComponent()
{
this.btLoad = new System.Windows.Forms.Button();
this.tbURI = new System.Windows.Forms.TextBox();
this.rtbHTML = new System.Windows.Forms.RichTextBox();
this.rtbLOg = new System.Windows.Forms.RichTextBox();
this.tbiProf = new System.Windows.Forms.TextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.tbMax = new System.Windows.Forms.TextBox();
this.label3 = new System.Windows.Forms.Label();
this.SuspendLayout();
//
// btLoad (starts the crawl)
//
this.btLoad.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.btLoad.Location = new System.Drawing.Point(16, 112);
this.btLoad.Name = "btLoad";
this.btLoad.Size = new System.Drawing.Size(272, 120);
this.btLoad.TabIndex = 0;
this.btLoad.Text = "Crawl!";
this.btLoad.Click += new System.EventHandler(this.btLoad_Click);
//
// tbURI (starting URI of the crawl)
//
this.tbURI.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.tbURI.Location = new System.Drawing.Point(120, 16);
this.tbURI.Name = "tbURI";
this.tbURI.Size = new System.Drawing.Size(712, 20);
this.tbURI.TabIndex = 1;
this.tbURI.Text = "http://oraclevsmicrosoft.blogspot.com";
//
// rtbHTML (shows the tag-stripped text of a clicked link)
//
this.rtbHTML.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.rtbHTML.Location = new System.Drawing.Point(8, 248);
this.rtbHTML.Name = "rtbHTML";
this.rtbHTML.Size = new System.Drawing.Size(832, 136);
this.rtbHTML.TabIndex = 2;
this.rtbHTML.Text = "";
//
// rtbLOg (crawl log; its links are handled by rtbLOg_LinkClicked)
//
this.rtbLOg.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Right)));
this.rtbLOg.Location = new System.Drawing.Point(297, 40);
this.rtbLOg.Name = "rtbLOg";
this.rtbLOg.Size = new System.Drawing.Size(536, 200);
this.rtbLOg.TabIndex = 3;
this.rtbLOg.Text = "";
this.rtbLOg.LinkClicked += new System.Windows.Forms.LinkClickedEventHandler(this.rtbLOg_LinkClicked);
//
// tbiProf (maximum crawl depth, read by btLoad_Click)
//
this.tbiProf.Location = new System.Drawing.Point(120, 48);
this.tbiProf.Name = "tbiProf";
this.tbiProf.Size = new System.Drawing.Size(48, 20);
this.tbiProf.TabIndex = 4;
this.tbiProf.Text = "5";
//
// label1
//
this.label1.Location = new System.Drawing.Point(8, 16);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(88, 16);
this.label1.TabIndex = 5;
this.label1.Text = "Starting URI";
//
// label2
//
this.label2.Location = new System.Drawing.Point(8, 48);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(88, 16);
this.label2.TabIndex = 6;
this.label2.Text = "Max depht";
//
// tbMax (maximum number of links, read by recure)
//
this.tbMax.Location = new System.Drawing.Point(120, 80);
this.tbMax.Name = "tbMax";
this.tbMax.Size = new System.Drawing.Size(104, 20);
this.tbMax.TabIndex = 4;
this.tbMax.Text = "1000";
//
// label3
//
this.label3.Location = new System.Drawing.Point(8, 88);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(112, 16);
this.label3.TabIndex = 6;
this.label3.Text = "Max number of links";
//
// WebCrawler
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.ClientSize = new System.Drawing.Size(848, 398);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.tbiProf);
this.Controls.Add(this.rtbLOg);
this.Controls.Add(this.rtbHTML);
this.Controls.Add(this.tbURI);
this.Controls.Add(this.btLoad);
this.Controls.Add(this.tbMax);
this.Controls.Add(this.label3);
this.MinimumSize = new System.Drawing.Size(856, 432);
this.Name = "WebCrawler";
this.Text = "Form1";
this.ResumeLayout(false);

}
#endregion

/// <summary>
/// Main entry point of the application: starts it with a new WebCrawler as the main form.
/// </summary>
[STAThread]
static void Main()
{
Application.Run(new WebCrawler());
}

/// <summary>
/// Keeps only the body of the document, replaces every tag with a single
/// space and finally HTML-decodes the result (&amp;nbsp; and other entities).
/// </summary>
/// <param name="strHTML">The HTML source of a page.</param>
/// <returns>
/// The text of the body element, or an empty string when the input contains
/// no &lt;body ...&gt; ... &lt;/body&gt; section.
/// </returns>
/// <remarks>
/// "[\s\S]" matches any character, line breaks included, exactly like the
/// "(.|\n)" it replaces, but without a capturing alternation evaluated once
/// per character, which backtracks heavily on large pages.
/// </remarks>
private string deleteTagg(string strHTML)
{
    string strRes = "";

    // Step 1: isolate <body ...> ... </body>.
    Regex bodyPattern = new Regex("<body.*>[\\s\\S]*</body>", RegexOptions.IgnoreCase);
    Match m = bodyPattern.Match(strHTML);
    if (m.Success)
    {
        strRes = m.Value;
    }

    // Step 2: every tag (shortest "<...>" run) becomes one space.
    Regex tagPattern = new Regex("<[\\s\\S]*?>", RegexOptions.IgnoreCase);
    strRes = tagPattern.Replace(strRes, " ");

    // Step 3: turn HTML entities back into characters.
    return HttpUtility.HtmlDecode(strRes);
}
/// <summary>
/// Extracts the values of the href attributes found in <paramref name="strHTML"/>.
/// </summary>
/// <param name="strHTML">The HTML text to scan.</param>
/// <returns>
/// The href values in order of appearance, at most 100 of them. The array is
/// exactly as long as the number of links found (the earlier version returned
/// a fixed 101-element array whose first slot and unused slots were null).
/// </returns>
private string [] extractURIS(string strHTML)
{
    const int maxLinks = 100;

    // href = "value"  or  href = value ; the value is captured in group 1.
    // NOTE(review): the trailing ".*>" is greedy and "." does not cross line
    // breaks, so at most one href is reported per line of HTML.
    Regex r = new Regex ("href\\s*=\\s*(?:\"(?'1'[^\"]*)\"|(?'1'\\S+)).*>",
        RegexOptions.IgnoreCase|RegexOptions.Compiled);

    // Collect the matches from slot 0 upwards.
    string[] buffer = new string[maxLinks];
    int count = 0;
    for (Match m = r.Match(strHTML); m.Success && count < maxLinks; m = m.NextMatch())
    {
        buffer[count] = m.Groups[1].ToString();
        count++;
    }

    // Return only the filled part.
    string[] results = new string[count];
    Array.Copy(buffer, results, count);
    return results;
}
/// <summary>
/// Downloads the resource at <paramref name="strURL"/> and returns its content
/// as text, decoded as UTF-8.
/// </summary>
/// <param name="strURL">The address to request.</param>
/// <returns>
/// The response body, or an empty string when the request or the read fails;
/// failures are written to the log box rather than thrown.
/// </returns>
private string loadUrl(string strURL)
{
    string strRes = "";
    HttpWebResponse response = null;
    StreamReader reader = null;
    try
    {
        // Creates an HttpWebRequest with the specified URL, sends it and waits for the response.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strURL);
        response = (HttpWebResponse)request.GetResponse();

        // Pipes the response stream to a reader with the required encoding
        // and reads the whole body in one call.
        Encoding encode = System.Text.Encoding.GetEncoding("utf-8");
        reader = new StreamReader(response.GetResponseStream(), encode);
        strRes = reader.ReadToEnd();
    }
    catch (Exception exp)
    {
        // Best effort: a page that cannot be fetched is logged and treated as empty.
        rtbLOg.AppendText("error at URI : " + strURL + " (" + exp.Message + ")\n");
        strRes = "";
    }
    finally
    {
        // Release the reader and the response even when the download failed half-way.
        if (reader != null)
        {
            reader.Close();
        }
        if (response != null)
        {
            response.Close();
        }
    }
    return strRes;
}

/// <summary>
/// "Crawl!" button: crawls from the starting URI down to the depth typed in
/// tbiProf, then appends the number of entries in dicURL to the log.
/// </summary>
/// <remarks>
/// Both numeric boxes are validated first. recure() parses tbMax.Text itself,
/// so checking it here keeps a typo from surfacing as an unhandled exception
/// in the middle of the recursion.
/// </remarks>
private void btLoad_Click(object sender, System.EventArgs e)
{
    short depth;
    try
    {
        depth = Convert.ToInt16(tbiProf.Text);
        Convert.ToInt32(tbMax.Text); // validation only; recure() reads the value itself
    }
    catch (FormatException)
    {
        MessageBox.Show("Max depth and max number of links must be whole numbers.");
        return;
    }
    catch (OverflowException)
    {
        MessageBox.Show("Max depth or max number of links is too large.");
        return;
    }

    recure(tbURI.Text, depth);
    rtbLOg.AppendText(dicURL.Count.ToString ());
}
/// <summary>
/// Recursive crawl step: loads <paramref name="strURL"/>, extracts its links
/// and follows every new absolute http link one level deeper.
/// </summary>
/// <param name="strURL">The page to load.</param>
/// <param name="ilevel">Remaining depth; nothing is loaded once it reaches 0.</param>
/// <remarks>
/// dicURL maps each accepted URL to the number of times it was encountered
/// (stored as text). Crawling stops adding links once dicURL holds the number
/// of entries typed in tbMax.
/// Fixes over the earlier version:
/// - links that are not http URLs are skipped; they used to fall into the
///   "already seen" branch, which inserted them into dicURL and inflated the count;
/// - the pattern is anchored so the link really has to begin with "http://";
/// - the pattern and the tbMax value are computed once per call, not once per link.
/// </remarks>
private void recure(string strURL,int ilevel)
{
    MainString = "";
    if (ilevel <= 0)
    {
        return;
    }

    MainString = loadUrl(strURL);

    int maxLinks = Convert.ToInt32(tbMax.Text);
    if (dicURL.Count >= maxLinks)
    {
        return;
    }

    // Only absolute links beginning with http:// are followed.
    Regex httpPattern = new Regex("^http://.+", RegexOptions.IgnoreCase);

    // Retrieve the URLs of the page.
    string[] strArUris = extractURIS(MainString);
    for (int i = 0; i < strArUris.Length && dicURL.Count < maxLinks; i++)
    {
        string uri = strArUris[i];
        if (uri == null || !httpPattern.IsMatch(uri))
        {
            continue;
        }

        if (dicURL.ContainsKey(uri))
        {
            // Already known: just count one more occurrence.
            dicURL[uri] = Convert.ToString(Convert.ToInt32(dicURL[uri]) + 1);
        }
        else
        {
            // New link: register it before descending so it is not visited twice.
            dicURL.Add(uri, "1");
            recure(uri, ilevel - 1);
            rtbLOg.AppendText(uri + " " + ilevel + "\n");
        }
    }
    // A text extraction of the page (see deleteTagg) could be plugged in here.
}

/// <summary>
/// Handles a click on a link in the log box: hands the link to
/// Process.Start (the original comment says "Launch in ie"), then downloads the
/// same page and shows its tag-stripped text in rtbHTML.
/// </summary>
private void rtbLOg_LinkClicked(object sender, System.Windows.Forms.LinkClickedEventArgs e)
{
    string clickedUri = e.LinkText;

    // Open the link outside the application.
    System.Diagnostics.Process.Start(clickedUri);

    // Fetch the page ourselves and display it as plain text.
    string pageText = deleteTagg(loadUrl(clickedUri));
    rtbHTML.Text = pageText;
}

}
}

That's all, folks! I hope this helps; comments are welcome.

 
You are a developer and work with Oracle and Microsoft technologies? Have a look!
ATOM
How to:
Use updatable views in Access
Get data in Excel from Oracle 1
Get data in Excel from Oracle 2
Draw the Mandelbrot set using C#
Use the "Grouping Sets" SQl Syntax
Use the "Rollup" SQl Syntax
Use the "Rank over" SQl Syntax

Go to wordpress
When does the next bus pass?
Thanks
googled2cd966929769ab9
two lines in the datagrid header
Context saving with persistent datasets
.net webservice session
Winform, Web Services & credential
back to work
self description
ARCHIVES
October 2004 / November 2004 / December 2004 / January 2005 / February 2005 / March 2005 / April 2005 / June 2005 / August 2005 / September 2005 / December 2005 / February 2006 / December 2006 / March 2009 /


Powered by Blogger

mboizeau.free.fr