Some time back I did some work on extracting topics from an arbitrary piece of text using Wikipedia data. Recently I thought of a concept that puts that algorithm to work. As a part of this project, I need to extract the relevant text from an arbitrary HTML page. By relevant I mean the “meat” of the page, devoid of navigation links and side-content.
This algorithm has the following steps:
- make doc from html data (clean html)
- identify content nodes (nodes having substantial content)
- prune xml tree to remove irrelevant nodes
- get the most linked node from pruned tree (subtree contains relevant text)
- make the dot graph
I’ve pasted the relevant python module below for easy reading. However, if you want to download the code and hack it, you can get all the files from here.
Code files
- content_extract.py – actual work gets done here (file pasted below)
- cextract.py – CGI front-end which fetches the URL content and feeds it to the above script.
- cextract_config.py – cgi script configuration file. You have to adjust this to your environment.
Try it right here and right now
Some samples
- http://news.bbc.co.uk/sport2/hi/motorsport/formula_one/8169436.stm
- http://www.prashanthellina.com/cextract_data/640ca0b7c6e818b2b1bf952a206a6388.html
- http://www.telegraph.co.uk/news/worldnews/europe/france/5913494/Nicolas-Sarkozy-to-slow-down-after-collapsing-while-jogging.html
Please let me know if you find cases for which the algorithm does not work. Even better would be to download the code and hack it up and post back. I am eager to see what you can come up with.
#!/usr/bin/env python
import sys
from cStringIO import StringIO
from lxml import etree #http://codespeak.net/lxml/
# Tags whose own text is never treated as page content. Scripts and inline
# stylesheets hold code rather than prose; anchors are excluded so that
# link text does not count as content.
IGNORABLE_TAGS = set(['script', 'style', 'a'])

# Minimum number of characters of direct text a node must hold to be
# considered a content node.
MIN_TEXT_LEN = 50
def get_text(node):
    '''
    Given a XML node, extract the text it directly contains.
    (does not recurse into children)

    The result is the node's own leading text followed by the tail text
    of each direct child (the text that follows the child's closing tag),
    joined with newlines and stripped of surrounding whitespace.
    Text inside the children themselves is not included.
    '''
    pieces = [node.text or '']

    # Iterate the element directly: getchildren() is deprecated in lxml
    # and is no longer available in the standard library's ElementTree.
    for child in node:
        if child.tail is not None:
            pieces.append(child.tail)

    return '\n'.join(pieces).strip()
def get_xml(node):
    '''
    Serialize node, together with everything beneath it,
    to its XML string form.
    '''
    serialized = etree.tostring(node)
    return serialized
def create_doc(data):
    '''
    Build an element tree from a string of HTML markup.

    The markup is wrapped in a file-like object and parsed
    with lxml's HTMLParser.
    '''
    html_source = StringIO(data)
    return etree.parse(html_source, etree.HTMLParser())
def get_content_nodes(doc):
    '''
    Collect the elements of the document whose own text is substantial.

    An element qualifies when its tag is not listed in IGNORABLE_TAGS
    and get_text() finds at least MIN_TEXT_LEN characters in it.
    Returns the qualifying elements as a list, in the order the
    XPath query yields them.
    '''
    def has_content(element):
        if element.tag.lower() in IGNORABLE_TAGS:
            return False
        own_text = get_text(element)
        return bool(own_text) and len(own_text) >= MIN_TEXT_LEN

    return [element for element in doc.xpath('//*') if has_content(element)]
def make_pruned_tree(content_nodes):
    '''
    Prune the whole XML tree by removing nodes
    other than content nodes and their ancestors.

    The tree itself is not modified; two dicts describe the pruned tree:
        nodes: maps id(node) -> node for every content node and
               every ancestor of a content node.
        links: maps id(node) -> id(parent) for every node in `nodes`
               that has a parent (a root node has no entry).

    Returns the tuple (nodes, links).
    '''
    nodes = {}
    links = {}

    for node in content_nodes:
        current = node

        # Climb towards the root, but stop at the first node that is
        # already recorded: its own link and all of its ancestors were
        # added when it was first reached, so walking further would
        # only repeat work.
        while current is not None and id(current) not in nodes:
            current_id = id(current)

            # Storing the node keeps it alive, so the id() used as its
            # key cannot be reused by a different object.
            nodes[current_id] = current

            parent = current.getparent()
            if parent is not None:
                links[current_id] = id(parent)

            current = parent

    return nodes, links
def get_inlink_counts(links):
    '''
    Count, for every link target, how many links point at it.

    links maps a source node id to its target node id. The result maps
    each target id to the number of sources linking to it; ids that
    are never a target do not appear in the result.
    '''
    counts = {}

    # Only the targets matter, so iterate the values alone. values()
    # is available on every Python version, unlike iteritems().
    for to_id in links.values():
        counts[to_id] = counts.get(to_id, 0) + 1

    return counts
def get_most_linked_node(nodes, links):
    '''
    Identify the node which is most linked,
    i.e. the one with the highest number of inlinks.

    nodes maps node id -> node and links maps node id -> target id.
    When several nodes share the highest count, the one with the
    largest id is chosen.

    Raises ValueError when there are no links at all, because then no
    node has any inlink to rank by.
    '''
    inlink_counts = get_inlink_counts(links)
    if not inlink_counts:
        raise ValueError('cannot pick the most linked node: there are no links')

    # Comparing (count, id) pairs ranks by count first and breaks ties
    # by id; only the id of the winner is needed afterwards.
    best = max((count, _id) for _id, count in inlink_counts.items())
    return nodes[best[1]]
def make_dot_graph(nodes, links, chosen_node, stream):
    '''
    Construct the dot format graph representation
    so that graphviz can render the tree for visualization.

    nodes: dict of node id -> node. Each becomes a vertex labelled with
           the node's tag and, when non-zero, the length of its own text.
    links: dict of node id -> target id. Each becomes an edge.
    chosen_node: id of the node to highlight (filled, light blue).
    stream: file-like object whose write() receives the output.
    '''
    def emit(line):
        # write() instead of the "print >> stream" statement, which
        # only exists in Python 2; the bytes written are the same.
        stream.write(line + '\n')

    emit("digraph G {")

    for _id, node in nodes.items():
        tlen = len(get_text(node))
        tag = node.tag

        if tlen:
            text = '%s (%d)' % (tag, tlen)
        else:
            text = tag

        if _id == chosen_node:
            attrs = 'style=filled color=lightblue'
        else:
            attrs = ''

        emit("%s [label=\"%s\" %s];" % (_id, text, attrs))

    for fid, tid in links.items():
        emit("%d -> %d;" % (fid, tid))

    emit("}")
def main():
    '''
    Read HTML from stdin and write the pruned content tree to stdout as
    a graphviz dot graph, with the most linked node highlighted.
    '''
    # make doc from html data (cleans html)
    doc = create_doc(sys.stdin.read())

    # identify content nodes
    content_nodes = get_content_nodes(doc)

    # prune xml tree to remove irrelevant nodes
    nodes, links = make_pruned_tree(content_nodes)

    # get the most linked node from pruned tree
    if links:
        mnode = get_most_linked_node(nodes, links)
        chosen_id = id(mnode)
    else:
        # Without links (no content nodes were found, or the only one
        # has no parent) there is nothing to rank, and
        # get_most_linked_node() would raise. Still emit the graph,
        # just without a highlighted node.
        sys.stderr.write('no linked content nodes found; nothing highlighted\n')
        chosen_id = None

    # make the dot graph
    make_dot_graph(nodes, links, chosen_id, sys.stdout)


if __name__ == '__main__':
    # Eg: wget "http://blog.prashanthellina.com" -O - | python thisscript.py | dot -Tpng -o /tmp/test.png ; eog /tmp/test.png
    main()














