A friend of mine IM'ed me today asking for help with a specific task that was assigned to him by his project manager. He is currently working on a project whose client has been getting furious a lot, because the client discovered that most of the links on their site were broken (the vicious 404 errors) or were not pointing to the right pages (misplaced links). His PM wasn't happy at all, so he asked me to help him create a program that would parse a website, get all the URLs accessible inside a page, and dump the result into a text file.
I had a little bit of free time, so I decided to help him by building this small application to show him how he can accomplish the task in C#.
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
namespace KeithRull.GiveMeUrls
{
/// <summary>
/// Console utility that downloads a single web page, collects every <c>href</c>
/// attribute found in its markup, and writes the resulting list to a text file.
/// </summary>
class Program
{
    /// <summary>Page that is scraped when no url is supplied on the command line.</summary>
    private const string DefaultUrlToScrape = "http://www.devpinoy.org";

    /// <summary>Timeout, in milliseconds, applied to the page download.</summary>
    private const int RequestTimeoutMilliseconds = 100000;

    /// <summary>
    /// Matches an <c>href</c> attribute whose value is double-quoted, single-quoted
    /// or unquoted; the value itself is captured in the named group <c>url</c>.
    /// The look-behind keeps attributes such as <c>data-href</c> from matching.
    /// </summary>
    private static readonly Regex HrefPattern = new Regex(
        @"(?<![\w-])href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
        RegexOptions.IgnoreCase);

    /// <summary>Matches a leading URI scheme such as <c>https:</c>, <c>mailto:</c> or <c>javascript:</c>.</summary>
    private static readonly Regex SchemePattern = new Regex(
        @"^[a-z][a-z0-9+.\-]*:",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Program entry point. Scrapes the url given as the first command-line
    /// argument, or <see cref="DefaultUrlToScrape"/> when none is given, and
    /// reports the name of the file the links were saved to.
    /// </summary>
    /// <param name="args">optional: the absolute http/https url to scrape</param>
    static void Main(string[] args)
    {
        string address = (args != null && args.Length > 0) ? args[0] : DefaultUrlToScrape;

        //only http and https pages can be downloaded by GetPageContents
        Uri urlToScrape;
        if (!Uri.TryCreate(address, UriKind.Absolute, out urlToScrape)
            || (urlToScrape.Scheme != Uri.UriSchemeHttp && urlToScrape.Scheme != Uri.UriSchemeHttps))
        {
            Console.Error.WriteLine("Not a valid http/https url: {0}", address);
            return;
        }

        try
        {
            //the list that would contain the urls recovered from the specified uri
            List<string> listOfUrls = GetAllUrlsFromUri(urlToScrape);
            string fileName = SaveToFile(listOfUrls);
            Console.WriteLine("Parsing completed! Urls saved to file: {0}", fileName);
        }
        catch (WebException ex)
        {
            //a failed download is an expected outcome for a link checker; report it instead of crashing
            Console.Error.WriteLine("Could not download {0}: {1}", urlToScrape, ex.Message);
        }

        Console.ReadLine();
    }

    /// <summary>
    /// Downloads a page and returns every distinct link found in it.
    /// </summary>
    /// <param name="urlToScrape">the absolute url of the page to read</param>
    /// <returns>
    /// one entry per distinct link, in document order, each in the form
    /// <c>href="url"</c> (see <see cref="GetAllUrlsFromHtml"/>)
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="urlToScrape"/> is null</exception>
    /// <exception cref="WebException">the page could not be downloaded</exception>
    public static List<string> GetAllUrlsFromUri(Uri urlToScrape)
    {
        if (urlToScrape == null)
        {
            throw new ArgumentNullException("urlToScrape");
        }

        return GetAllUrlsFromHtml(GetPageContents(urlToScrape), urlToScrape);
    }

    /// <summary>
    /// Extracts every distinct link from a block of html without touching the network.
    /// </summary>
    /// <param name="pageContents">the html to search; null or empty yields an empty list</param>
    /// <param name="baseUri">the url the html was loaded from, used to resolve relative links</param>
    /// <returns>
    /// one entry per distinct link, in document order. Every entry has the form
    /// <c>href="url"</c>: links that already carry a scheme (http, https, mailto,
    /// javascript, ...) are kept as written, all others are resolved against
    /// <paramref name="baseUri"/>.
    /// </returns>
    public static List<string> GetAllUrlsFromHtml(string pageContents, Uri baseUri)
    {
        List<string> listOfUrls = new List<string>();
        if (String.IsNullOrEmpty(pageContents))
        {
            return listOfUrls;
        }

        //entries already added; keeps duplicate detection cheap on link-heavy pages
        Dictionary<string, bool> seen = new Dictionary<string, bool>();

        for (Match match = HrefPattern.Match(pageContents); match.Success; match = match.NextMatch())
        {
            //use the captured attribute value rather than the whole match, so the
            //casing of "href", spacing around '=' and the quote style do not matter
            string resolvedUrl = ResolveUrl(match.Groups["url"].Value, baseUri);

            //keep the href="..." form so the output file format stays the same
            string entry = "href=\"" + resolvedUrl + "\"";

            if (!seen.ContainsKey(entry))
            {
                seen.Add(entry, true);
                listOfUrls.Add(entry);
            }
        }

        return listOfUrls;
    }

    /// <summary>
    /// Turns the raw value of an href attribute into an absolute url where possible.
    /// </summary>
    /// <param name="rawUrl">the attribute value as it appears in the markup</param>
    /// <param name="baseUri">the url of the page the attribute was found on</param>
    /// <returns>
    /// the trimmed value unchanged when it already starts with a scheme or cannot
    /// be resolved; otherwise the value resolved against <paramref name="baseUri"/>
    /// </returns>
    private static string ResolveUrl(string rawUrl, Uri baseUri)
    {
        string trimmedUrl = rawUrl.Trim();

        //absolute links and pseudo-links (javascript:, mailto:) are shown as is
        if (SchemePattern.IsMatch(trimmedUrl))
        {
            return trimmedUrl;
        }

        //let Uri do the resolution so "default.aspx", "/default.aspx",
        //"../x" and "//host/x" all get the correct separators and host
        Uri resolved;
        if (baseUri != null && baseUri.IsAbsoluteUri && Uri.TryCreate(baseUri, trimmedUrl, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        return trimmedUrl;
    }

    /// <summary>
    /// Reads a webpage and captures its html representation into a string
    /// </summary>
    /// <param name="urlToScrape">the website you want to read (http or https)</param>
    /// <returns>the html representation of the site</returns>
    /// <exception cref="WebException">the request failed or timed out</exception>
    private static string GetPageContents(Uri urlToScrape)
    {
        //create an http request for the url and give it a timeout
        HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(urlToScrape);
        httpWebRequest.Timeout = RequestTimeoutMilliseconds;

        //the using blocks release the response, stream and reader even when the
        //download fails, and let the original exception propagate untouched
        using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
        using (Stream responseStream = httpWebResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(responseStream))
        {
            return streamReader.ReadToEnd();
        }
    }

    /// <summary>
    /// Saves our list of urls to a text file, one entry per line. The file is
    /// created in the current directory under a freshly generated GUID name.
    /// </summary>
    /// <param name="listOfUrls">the list containing the urls</param>
    /// <returns>the filename created for the file</returns>
    /// <exception cref="ArgumentNullException"><paramref name="listOfUrls"/> is null</exception>
    public static string SaveToFile(List<string> listOfUrls)
    {
        //fail before creating the file so a bad call does not leave an empty file behind
        if (listOfUrls == null)
        {
            throw new ArgumentNullException("listOfUrls");
        }

        //the file name
        string fileName = String.Format("{0}.{1}", Guid.NewGuid(), "txt");

        //the using block flushes and closes the file even if a write fails
        using (StreamWriter sw = File.CreateText(fileName))
        {
            foreach (string url in listOfUrls)
            {
                sw.WriteLine(url);
            }
        }

        return fileName;
    }
}
}
Basically, what the code does is accept a URL and then parse that page using a regular expression to find all the strings that match our search pattern. Once it finishes processing the page, it dumps all those URLs into a text file.
I sent the code to him and he was very happy with the result. Sweet!