Index ¦ Archives ¦ Atom ¦ RSS

Extracting relevant text from HTML pages

Some time back I had done some work on extracting topics from an arbitrary piece of text using Wikipedia data. Recently I thought of a concept to put that algorithm to work. As a part of this project, I need to extract relevant text from an arbitrary HTML page. By relevant I mean the “meat” of the page devoid of navigation links and side-content.

This algorithm has the following steps:

  • make doc from html data (clean html)
  • identify content nodes (nodes having substantial content)
  • prune xml tree to remove irrelevant nodes
  • get the most linked node from pruned tree (subtree contains relevant text)
  • make the dot graph

I’ve pasted the relevant python module below for easy reading. However, if you want to download the code and hack it, you can get all the files from here.

Code files

  • content_extract.py - actual work gets done here (file pasted below)
  • cextract.py - CGI front-end which fetches the URL content and feeds it to the above script.
  • cextract_config.py - cgi script configuration file. You have to adjust this to your environment.

Try it right here and right now

url:

Some samples

Please let me know if you find cases for which the algorithm does not work. Even better would be to download the code and hack it up and post back. I am eager to see what you can come up with.

#!/usr/bin/env python

import sys
from cStringIO import StringIO

from lxml import etree #http://codespeak.net/lxml/

# Tags whose direct text never counts as page content
# (scripts are code, anchors are mostly navigation).
IGNORABLE_TAGS = set(['script', 'a'])
# Minimum number of characters of direct text a node must
# hold to be treated as a content node.
MIN_TEXT_LEN = 50

def get_text(node):
    '''
    Collect the text directly contained in *node*: its leading
    text plus the tail text of every immediate child.
    (does not recurse into children)
    '''
    pieces = [node.text or '']
    pieces.extend(child.tail for child in node.getchildren()
                  if child.tail is not None)
    return '\n'.join(pieces).strip()

def get_xml(node):
    '''
    Serialize the subtree rooted at *node* into its string
    XML representation.
    '''
    xml_string = etree.tostring(node)
    return xml_string

def create_doc(data):
    '''
    Parse an HTML string into an lxml document tree.
    (lxml's HTML parser also cleans up malformed markup.)
    '''
    html_parser = etree.HTMLParser()
    return etree.parse(StringIO(data), html_parser)

def get_content_nodes(doc):
    '''
    Return the elements of *doc* that carry a substantial amount
    of direct text (non-empty and at least MIN_TEXT_LEN chars),
    skipping any tag listed in IGNORABLE_TAGS.
    '''
    content = []

    for element in doc.xpath('//*'):
        if element.tag.lower() in IGNORABLE_TAGS:
            continue

        text = get_text(element)
        if text and len(text) >= MIN_TEXT_LEN:
            content.append(element)

    return content

def make_pruned_tree(content_nodes):
    '''
    Prune the whole XML tree by removing nodes other than the
    content nodes and their ancestors.

    Returns (nodes, links): nodes maps id(node) -> node and
    links maps id(child) -> id(parent).
    '''
    nodes = {}
    links = {}

    for cnode in content_nodes:
        nodes[id(cnode)] = cnode

        cparent = cnode.getparent()
        if cparent is not None:
            links[id(cnode)] = id(cparent)

        # walk up to the root, registering every ancestor and
        # its upward link
        for ancestor in cnode.iterancestors():
            aid = id(ancestor)
            aparent = ancestor.getparent()
            if aparent is not None:
                links[aid] = id(aparent)
            nodes.setdefault(aid, ancestor)

    return nodes, links

def get_inlink_counts(links):
    '''
    Given the inter-node links (id(child) -> id(parent)), count
    how many links come into each node.

    Returns a dict mapping id(parent) -> number of inlinks.
    '''
    counts = {}

    # .items() (not the Python2-only .iteritems()) keeps this module
    # importable on Python 3; get() replaces the setdefault-then-store
    # two-step with a single read-modify-write.
    for _from_id, to_id in links.items():
        counts[to_id] = counts.get(to_id, 0) + 1

    return counts

def get_most_linked_node(nodes, links):
    '''
    Identify the node which is most linked (i.e. has the most
    inlinks — the most children in the pruned tree).

    Ties are broken in favor of the larger node id, matching the
    tuple-comparison behavior of max().  Raises ValueError when
    links is empty.
    '''
    inlink_counts = get_inlink_counts(links)

    # .items() instead of the Python2-only .iteritems(); a generator
    # expression avoids materializing a throwaway list inside max().
    _mcount, mid = max((count, _id) for _id, count in inlink_counts.items())
    return nodes[mid]

def make_dot_graph(nodes, links, chosen_node, stream):
    '''
    Write the pruned tree to *stream* in graphviz dot format so
    the tree can be rendered for visualization.

    chosen_node is the id() of the node to highlight (the most
    linked node).
    '''
    o = stream

    # stream.write() replaces the Python2-only "print >> stream"
    # statement (a SyntaxError on Python 3); output is byte-identical.
    o.write("digraph G {\n")

    for _id, node in nodes.items():

        tlen = len(get_text(node))
        tag = node.tag

        # label shows the tag plus the direct-text length when present
        if tlen:
            text = '%s (%d)' % (tag, tlen)
        else:
            text = tag

        # highlight the chosen (most linked) node
        if _id == chosen_node:
            attrs = 'style=filled color=lightblue'
        else:
            attrs = ''

        o.write("%s [label=\"%s\" %s];\n" % (_id, text, attrs))

    for fid, tid in links.items():
        o.write("%d -> %d;\n" % (fid, tid))

    o.write("}\n")

def main():
    '''
    Read HTML on stdin and write a graphviz dot rendering of the
    pruned content tree to stdout.
    '''
    # build a cleaned-up document tree from the raw HTML
    document = create_doc(sys.stdin.read())

    # pick out the nodes carrying substantial text
    content = get_content_nodes(document)

    # keep only the content nodes and their ancestors
    nodes, links = make_pruned_tree(content)

    # the most linked node roots the relevant-text subtree
    best = get_most_linked_node(nodes, links)

    # emit the dot graph with the chosen node highlighted
    make_dot_graph(nodes, links, id(best), sys.stdout)

if __name__ == '__main__':
    # Example usage:
    #   wget "http://blog.prashanthellina.com" -O - | python thisscript.py \
    #       | dot -Tpng -o /tmp/test.png ; eog /tmp/test.png
    main()

© Prashanth Ellina. Built using Pelican. Theme by Giulio Fidente on github.