As part of a project I am working on, I had to cluster the URLs on a page. After some light googling, I found python-cluster. Below is a simple Python script that illustrates how to use the python-cluster library.
Code
import pprint
from difflib import SequenceMatcher
# http://python-cluster.sourceforge.net/
from cluster import HierarchicalClustering
# Input URLs to be clustered.
# Every entry ends with a comma (including the last one): adjacent string
# literals without a separating comma are silently concatenated by Python,
# which would merge two URLs into a single bogus entry.
urls = [
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
    '#articles',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
    'http://yro.slashdot.org/~drDugan/',
    'http://web.sourceforge.com/privacy.php',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
    'http://slashdot.org//slashdot.org/~Darkness404',
    'http://slashdot.org//radio.slashdot.org',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457',
    'http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
    'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',
    'http://web.sourceforge.com/terms.php',
    'http://slashdot.org//it.slashdot.org/search',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581',
    'http://xkcd.com/612/',
    'http://web.sourceforge.com/advertising',
    'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
]
def distance(url1, url2):
    """Return the dissimilarity of two URLs as a float in [0.0, 1.0].

    The value is ``1.0 - ratio`` where ``ratio`` is the similarity computed
    by ``difflib.SequenceMatcher`` from the standard library: 0.0 means the
    strings are identical, 1.0 means they share no matching characters.

    Args:
        url1: First URL string.
        url2: Second URL string.

    Returns:
        The distance between the two strings.
    """
    # autojunk=False: the default heuristic treats frequently occurring
    # characters as junk once the second sequence is 200+ items long, which
    # can make long, nearly identical URLs look completely different.
    ratio = SequenceMatcher(None, url1, url2, autojunk=False).ratio()
    return 1.0 - ratio
# Build the cluster hierarchy over the URLs using the custom distance
# function, then pretty-print the clusters extracted at level 0.2.
level = 0.2
clusters = HierarchicalClustering(urls, distance).getlevel(level)
pprint.pprint(clusters)
Output
[['#articles'],
['http://xkcd.com/612/'],
['http://web.sourceforge.com/privacy.php'],
['http://web.sourceforge.com/advertising'],
['http://web.sourceforge.com/terms.phphttp://slashdot.org//it.slashdot.org/search'],
['http://yro.slashdot.org/~drDugan/'],
['http://slashdot.org//slashdot.org/~Darkness404'],
['http://slashdot.org//radio.slashdot.org'],
['http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814785',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814429',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&op=Reply&threshold=1&commentsort=0&mode=thread&pid=28814457'],
['http://slashdot.org//slashdot.org/article.pl?sid=09/07/24/1545238',
'http://slashdot.org//slashdot.org/comments.pl?sid=09/07/24/1545238&cid=28810581',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815123',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28815269',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814385',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814335',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814657',
'http://slashdot.org//it.slashdot.org/comments.pl?sid=1314601&cid=28814581']]