Web Scraping in ASP.NET

I know that there are other samples of web scraping out there, but here's mine.  One of my customers asked me how to scrape our ASP.NET Web application, so I thought that I might post the example code.  I like the viewstate regex - it's my first time using lookarounds in a regular expression.

using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;

namespace Dennany.WebScrape {

    /// <summary>
    /// Console sample that logs in to an ASP.NET Web Forms application and
    /// then downloads a page that is only reachable after the login.
    /// </summary>
    class MainClass {

        // Captures the contents of the value attribute of the hidden
        // __VIEWSTATE input. [^"]* stops at the closing quote, so other hidden
        // fields rendered on the same line are never swallowed into the match.
        private static readonly Regex viewStateRegex =
            new Regex("(?<=__viewstate\"\\s+value=\")[^\"]*(?=\")",
                      RegexOptions.IgnoreCase);

        /// <summary>
        /// Runs the three-step scrape: GET the login dialog, POST the
        /// credentials together with the page's viewstate, then GET the
        /// target page and write it to standard output.
        /// Any exception is written to the console instead of propagating.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args) {
            try {
                // Modify as appropriate:
                const string baseUri = "http://remotewebhost/webpagedirectory/";
                const string loginDlgUri = baseUri + "LoginDlg.aspx";
                const string mainConsoleUri = baseUri + "Mainpage.aspx";
                const string username = "myuser";
                const string password = "p@ssw0rd";

                // This cookie container will persist the ASP.NET session ID
                // cookie across all three requests.
                CookieContainer cookies = new CookieContainer();

                // First request: fetch the login dialog. The container has to
                // be attached BEFORE the request is sent; otherwise the
                // cookies set by the response are not stored in it.
                HttpWebRequest loginPageRequest =
                    (HttpWebRequest)WebRequest.Create(loginDlgUri);
                loginPageRequest.CookieContainer = cookies;
                string loginDlgPage = ReadResponseBody(loginPageRequest);

                // Extract the viewstate value from the login dialog page.
                // We need to post this back, along with the username and
                // password.
                string viewState = GetViewState(loginDlgPage);
                string postback =
                    BuildLoginPostback(viewState, username, password);

                // Second request: POST the username / password data.
                HttpWebRequest loginPostRequest =
                    (HttpWebRequest)WebRequest.Create(loginDlgUri);
                loginPostRequest.Method = "POST";
                loginPostRequest.ContentType =
                    "application/x-www-form-urlencoded";
                loginPostRequest.CookieContainer = cookies;

                // Write our postback data into the request stream.
                using (StreamWriter writer =
                           new StreamWriter(loginPostRequest.GetRequestStream())) {
                    writer.Write(postback);
                }

                // The body of the login response is not needed; the request
                // only has to complete so the session becomes authenticated.
                using (WebResponse loginPostResponse =
                           loginPostRequest.GetResponse()) {
                }

                // Third request: the actual webpage after the login.
                HttpWebRequest pageRequest =
                    (HttpWebRequest)WebRequest.Create(mainConsoleUri);
                pageRequest.CookieContainer = cookies;
                string page = ReadResponseBody(pageRequest);

                // Our webpage data is in the 'page' string.
                Console.WriteLine(page);
            }
            catch (Exception ex) {
                Console.WriteLine(ex);
            }
        }

        /// <summary>
        /// Sends <paramref name="request"/> and returns the whole response
        /// body as a string, closing the response and its stream afterwards
        /// even when reading fails.
        /// </summary>
        /// <param name="request">A fully configured request.</param>
        /// <returns>The response body text.</returns>
        private static string ReadResponseBody(HttpWebRequest request) {
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader =
                       new StreamReader(response.GetResponseStream())) {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Builds the form-urlencoded body for the login postback.
        /// This string will vary depending on the page. The best way to find
        /// out what your postback should look like is to monitor a normal
        /// login using a utility like TCPTrace.
        /// </summary>
        /// <param name="viewState">
        /// The viewstate value, already URL-encoded (as returned by
        /// <see cref="GetViewState"/>).
        /// </param>
        /// <param name="username">Plain-text user name; encoded here.</param>
        /// <param name="password">Plain-text password; encoded here.</param>
        /// <returns>The request body to POST to the login dialog.</returns>
        private static string BuildLoginPostback(
                string viewState, string username, string password) {
            // Credentials may contain characters such as '&', '=', '+' or '@'
            // that would corrupt the form data if they were sent unencoded.
            return String.Format(
                "__VIEWSTATE={0}&txtUserName={1}" +
                "&txtPassword={2}&txtMessage=&btnOK=OK",
                viewState,
                System.Web.HttpUtility.UrlEncode(username),
                System.Web.HttpUtility.UrlEncode(password));
        }

        /// <summary>
        /// Extracts the viewstate data from a page.
        /// </summary>
        /// <param name="aspxPage">The HTML of the page.</param>
        /// <returns>
        /// The URL-encoded value of the hidden __VIEWSTATE field, or an empty
        /// string when the page contains no such field.
        /// </returns>
        private static string GetViewState(string aspxPage) {
            Match match = viewStateRegex.Match(aspxPage);

            // A failed match has an empty Value, which encodes to "".
            return System.Web.HttpUtility.UrlEncode(match.Value);
        }
    }
}
// EOF

7 Comments

Comments have been disabled for this content.