Blog of a Filipino Developer about C#, VB.NET, ASP.NET, Java, PHP, SQL Server, MySql and Oracle RSS 2.0
 Tuesday, November 06, 2007

A friend of mine IM'ed me today asking for help with a specific task that was assigned to him by his project manager. He is currently working on a project whose client is getting furious a lot, because the client discovered that most of the links on their site were either broken (the vicious 404 errors) or not pointing to the right pages (misplaced links). His PM wasn't happy at all, so he asked me to help him create a program that would parse a website, get all the URLs accessible inside a page and dump the result into a text file.

I had a little bit of free time, so I decided to help him by building this small application to show him how he can accomplish the task in C#.

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;

namespace KeithRull.GiveMeUrls
{
   /// <summary>
   /// Console utility that downloads a single web page, extracts the target of every
   /// href attribute found in its HTML and writes the resulting list to a text file.
   /// </summary>
   class Program
   {
      /// <summary>The page that is scraped when no url is given on the command line.</summary>
      private const string DefaultUrlToScrape = "http://www.devpinoy.org";

      /// <summary>How long to wait for the web server to answer, in milliseconds.</summary>
      private const int RequestTimeoutInMilliseconds = 100000;

      /// <summary>
      /// Matches an href attribute whose value is double quoted, single quoted or unquoted.
      /// The attribute value (without the quotes) is captured in the group named "url".
      /// Built once and reused, since the pattern never changes.
      /// </summary>
      private static readonly Regex HrefRegex = new Regex(
         @"href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
         RegexOptions.IgnoreCase);

      /// <summary>
      /// Scrapes one page, saves the urls found on it to a new text file and prints the
      /// name of that file.
      /// </summary>
      /// <param name="args">
      /// Optional: the first argument is used as the url to scrape. When no argument is
      /// given the default url is scraped.
      /// </param>
      static void Main(string[] args)
      {
         //the url to scrape: first command line argument when present, otherwise the default
         string urlText = (args != null && args.Length > 0) ? args[0] : DefaultUrlToScrape;
         Uri urlToScrape = new Uri(urlText);

         //the list that would contain the urls recovered from the specified uri
         List<string> listOfUrls = GetAllUrlsFromUri(urlToScrape);

         string fileName = SaveToFile(listOfUrls);

         Console.WriteLine("Parsing completed! Urls saved to file: {0}", fileName);

         //wait for the user to press enter before exiting
         Console.ReadLine();
      }

      /// <summary>
      /// Downloads the given page and returns every distinct link found in it.
      /// </summary>
      /// <param name="urlToScrape">the page to download and parse</param>
      /// <returns>
      /// One entry per distinct link, in the order the links appear in the page. Every entry
      /// has the form href="URL". Links that carry their own scheme (http:, https:,
      /// javascript:, mailto:, ...) are reported as written; all other links are resolved
      /// against <paramref name="urlToScrape"/>.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="urlToScrape"/> is null</exception>
      /// <exception cref="WebException">the page could not be downloaded</exception>
      public static List<string> GetAllUrlsFromUri(Uri urlToScrape)
      {
         if (urlToScrape == null)
         {
            throw new ArgumentNullException("urlToScrape");
         }

         //get the contents of the page and put it to a string
         string pageContents = GetPageContents(urlToScrape);

         return ExtractUrls(pageContents, urlToScrape);
      }

      /// <summary>
      /// Finds every href attribute in a piece of html and turns its value into a list entry.
      /// Does no I/O, so it can be exercised without a network connection.
      /// </summary>
      /// <param name="pageContents">the html to search; null or empty yields an empty list</param>
      /// <param name="baseUri">the address of the page, used to resolve relative links</param>
      /// <returns>the distinct entries, each formatted as href="URL", in order of appearance</returns>
      internal static List<string> ExtractUrls(string pageContents, Uri baseUri)
      {
         //the list that would hold the urls
         List<string> listOfUrls = new List<string>();

         if (String.IsNullOrEmpty(pageContents))
         {
            return listOfUrls;
         }

         //remembers what was already added so that the duplicate check does not have to
         //walk the whole list for every match
         HashSet<string> seenEntries = new HashSet<string>();

         //loop thru all the matching strings
         for (Match match = HrefRegex.Match(pageContents); match.Success; match = match.NextMatch())
         {
            //work on the captured attribute value rather than on the raw match, so that
            //casing (HREF), spacing (href = "...") and quoting style do not matter
            string urlFound = match.Groups["url"].Value.Trim();
            string urlToAdd = urlFound;

            //a link without a scheme (e.g. /forums or default.aspx) is relative to the page
            if (!HasScheme(urlFound))
            {
               Uri resolvedUrl;
               if (Uri.TryCreate(baseUri, urlFound, out resolvedUrl))
               {
                  urlToAdd = resolvedUrl.AbsoluteUri;
               }
               //when the link cannot be resolved it is reported exactly as it was written
            }

            string entry = "href=\"" + urlToAdd + "\"";

            //the duplicate check is done on the final entry, so a relative link that
            //appears several times on the page is only reported once
            if (seenEntries.Add(entry))
            {
               listOfUrls.Add(entry);
            }
         }

         //return the list of urls that we have recovered from the page
         return listOfUrls;
      }

      /// <summary>
      /// Tells whether a link starts with a scheme such as http:, https:, mailto: or javascript:.
      /// </summary>
      /// <param name="url">the attribute value to inspect; must not be null</param>
      /// <returns>true when the text before the first ':' is a syntactically valid scheme name</returns>
      private static bool HasScheme(string url)
      {
         //a scheme starts with a letter...
         if (url.Length == 0 || !IsAsciiLetter(url[0]))
         {
            return false;
         }

         //...followed by letters, digits, '+', '-' or '.', and is terminated by ':'
         for (int i = 1; i < url.Length; i++)
         {
            char c = url[i];
            if (c == ':')
            {
               return true;
            }

            bool isSchemeChar = IsAsciiLetter(c) || (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
            if (!isSchemeChar)
            {
               //reached '/', '?', '#', ... before any ':' so this is a relative link
               return false;
            }
         }

         return false;
      }

      /// <summary>Returns true for the characters a-z and A-Z.</summary>
      private static bool IsAsciiLetter(char c)
      {
         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
      }

      /// <summary>
      /// Reads a webpage and captures its html representation into a string
      /// </summary>
      /// <param name="urlToScrape">the website you want to read; the request is cast to an HttpWebRequest</param>
      /// <returns>the html representation of the site</returns>
      /// <exception cref="WebException">the request failed or timed out</exception>
      private static string GetPageContents(Uri urlToScrape)
      {
         //create a webrequest object for the url and treat it as an http request
         HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(urlToScrape);
         //assign a timeout value for the process
         httpWebRequest.Timeout = RequestTimeoutInMilliseconds;

         //the using blocks release the response, its stream and the reader even when
         //reading fails, and never touch an object that was not created. Errors are
         //not caught here so they reach the caller with their original stack trace.
         using (WebResponse webResponse = httpWebRequest.GetResponse())
         using (Stream responseStream = webResponse.GetResponseStream())
         using (StreamReader streamReader = new StreamReader(responseStream))
         {
            //read the contents of the stream
            return streamReader.ReadToEnd();
         }
      }

      /// <summary>
      /// Saves our list of urls to a newly created text file, one url per line.
      /// The file is named after a fresh guid and is created via a relative path.
      /// </summary>
      /// <param name="listOfUrls">the list containing the urls</param>
      /// <returns>the filename created for the file</returns>
      /// <exception cref="ArgumentNullException"><paramref name="listOfUrls"/> is null</exception>
      public static string SaveToFile(List<string> listOfUrls)
      {
         if (listOfUrls == null)
         {
            throw new ArgumentNullException("listOfUrls");
         }

         //the file name
         string fileName = String.Format("{0}.{1}", Guid.NewGuid(), "txt");

         //the using block flushes and closes the file even when a write fails
         using (StreamWriter sw = File.CreateText(fileName))
         {
            //loop thru each string in our collection
            foreach (string url in listOfUrls)
            {
               //write the string to our file
               sw.WriteLine(url);
            }
         }

         //return our filename
         return fileName;
      }
   }
}

Basically, what the code does is accept a URL and then parse that page using a regular expression to find all the strings that match our search pattern. Once it finishes processing the page, it dumps all of those URLs into a text file.

I sent the code to him and he was very happy with the result. Sweet!

Tuesday, November 06, 2007 8:21:58 PM (GMT Standard Time, UTC+00:00)  #    Comments [0] -
.NET | Fun Stuff
Archive
<November 2007>
SunMonTueWedThuFriSat
28293031123
45678910
11121314151617
18192021222324
2526272829301
2345678
About the author/Disclaimer

Disclaimer
The opinions expressed herein are my own personal opinions and do not represent my employer's view in any way.

© Copyright 2008
Keith Rull
Sign In
Statistics
Total Posts: 260
This Year: 57
This Month: 0
This Week: 0
Comments: 116
Themes
Pick a theme:
Ads
All Content © 2008, Keith Rull
DasBlog theme 'Business' created by Christoph De Baene (delarou)