.

html parser – HtmlAgilityPack

 

/// <summary>
/// Parses an HTML page and builds one <c>book</c> for every <c>div</c> whose class
/// attribute is exactly <c>elementList</c>.
/// </summary>
/// <param name="xxx">The raw HTML markup to parse.</param>
/// <returns>
/// The books that could be read completely. Entries whose markup does not match the
/// expected layout (missing nodes, attributes or rating text) are skipped. Never null.
/// </returns>
public List<book> RetrieveBooks(string xxx)
{
    List<book> list = new List<book>();
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    // There are various options, set as needed
    htmlDoc.OptionFixNestedTags = true;
    // xxx contains the html
    htmlDoc.LoadHtml(xxx);

    if (htmlDoc.DocumentNode == null)
    {
        return list;
    }

    // The original code treats a null result as "nothing matched", so that is kept.
    HtmlNodeCollection items = htmlDoc.DocumentNode.SelectNodes("//div[@class='elementList']");
    if (items == null)
    {
        return list;
    }

    foreach (HtmlNode itemX in items)
    {
        // Need at least two children: [0] is inspected for a header cell, [1] holds the data.
        if (itemX.ChildNodes.Count < 2 || itemX.ChildNodes[0].Name == "th")
        {
            continue;
        }

        // Fixed positions inside the data container: [1] title link, [9] author, [15] rating text.
        HtmlNodeCollection cells = itemX.ChildNodes[1].ChildNodes;
        if (cells.Count <= 15)
        {
            continue;
        }

        // The attribute indexer yields null for a missing attribute; check instead of catching.
        HtmlAttribute hrefAttribute = cells[1].Attributes["href"];
        HtmlAttribute titleAttribute = cells[1].Attributes["title"];
        if (hrefAttribute == null || titleAttribute == null)
        {
            continue;
        }

        string href = hrefAttribute.Value;
        string text = titleAttribute.Value;
        string author = cells[9].InnerText;
        string txt = cells[15].InnerText;

        // Capture the number directly instead of slicing the match by a fixed offset.
        // The decimal point is escaped so it no longer matches an arbitrary character.
        Match ratingMatch = Regex.Match(txt, @"avg rating (\d+(?:\.\d+)?)");
        Match countMatch = Regex.Match(txt, @"([\d,]+) rating");
        if (!ratingMatch.Success || !countMatch.Success)
        {
            continue;
        }

        // Page text uses '.' as decimal point and ',' as thousands separator regardless of
        // the machine's locale, so parse with the invariant culture.
        double avedbl;
        double totdbl;
        bool ratingParsed = double.TryParse(
            ratingMatch.Groups[1].Value,
            System.Globalization.NumberStyles.AllowDecimalPoint,
            System.Globalization.CultureInfo.InvariantCulture,
            out avedbl);
        bool countParsed = double.TryParse(
            countMatch.Groups[1].Value,
            System.Globalization.NumberStyles.AllowThousands,
            System.Globalization.CultureInfo.InvariantCulture,
            out totdbl);
        if (!ratingParsed || !countParsed)
        {
            continue;
        }

        list.Add(new book() { Name = text, Author = author, AveRating = avedbl, BookUrl = href, RateCount = totdbl });
    }

    return list;
}

 

HtmlAgilityPack.dll (132.00 kb) [old]

 

http://htmlagilitypack.codeplex.com/

You might also want to look at WebKit.NET (http://webkitdotnet.sourceforge.net/), which also renders JavaScript and other dynamic content.

One Response to this post.

  1. Posted by pietman on 28.05.12 at 2:10 pm

    Another simple example:

    // Parses the given HTML and reads the href and title attributes of each selected node.
    internal void Process(string Text)
    {
        // NOTE(review): the type argument was missing in the posted snippet; `result` is
        // assumed because it is the type declared alongside this code — confirm.
        var results = new List<result>();

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
        htmlDoc.OptionFixNestedTags = true;
        htmlDoc.LoadHtml(Text);

        if (htmlDoc.DocumentNode != null)
        {
            // NOTE(review): the node query was missing from the posted snippet (only an empty
            // "//" comment remained), which left `items` undeclared. This selects every element
            // carrying both attributes read below — replace with the intended XPath.
            HtmlNodeCollection items = htmlDoc.DocumentNode.SelectNodes("//*[@href and @title]");

            if (items != null)
            {
                foreach (HtmlNode itemX in items)
                {
                    var href = itemX.Attributes["href"].Value;
                    var text = itemX.Attributes["title"].Value;
                    // NOTE(review): href and text are read but never stored; `result` declares
                    // no members in this snippet, so nothing is added to `results` — confirm intent.
                }
            }
        }
    }

    // Public list of results; it is not assigned anywhere in this snippet.
    // NOTE(review): the generic type argument was missing in the posted snippet; `result`
    // is assumed because it is the type declared alongside this code — confirm.
    public List<result> results;

    // Returns a new, empty list on every access, exactly as the posted snippet did.
    // NOTE(review): this does not expose the `results` field — confirm whether it should.
    public IEnumerable<result> Results
    {
        get { return new List<result>(); }
    }
    }

    // Result item type; it declares no members in this snippet.
    public class result
    {
    }

What are your thoughts on this?

*

Protected by WP Anti Spam