Sample html page parser with libxml2dom in Python
This is a sample showing how to parse an HTML page with libxml2dom.
Before running the script, install libxml2dom (pip install libxml2dom). Enjoy :)
For more information about XPath, I recommend visiting http://www.w3schools.com/xpath/xpath_syntax.asp and http://www.boddie.org.uk/python/libxml2dom.html
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 17.02.2011
@author: Razzhivin Alexander
'''
import urllib2
import libxml2dom
def load_page_content(url):
    """Fetch *url* and return the raw response body.

    url -- address of the page to download.

    Returns the bytes read from the response, or None if the request or
    the read failed (a "Can't load ..." message is printed in that case).
    """
    try:
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() raises.
            response.close()
    except Exception:
        # Best-effort download: report the failure and carry on, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        print("Can't load %s" % url)
        return None
def collect_urls(url, xpath):
    """Collect link texts and targets from the page at *url*.

    url   -- address of the HTML page to download and parse.
    xpath -- XPath expression selecting the nodes to extract.

    Returns a list of (text, href) tuples, one per matched node, where
    text is the node's textContent with surrounding whitespace stripped
    and href is the value of its 'href' attribute.  Returns an empty list
    when the page could not be loaded.
    """
    page = load_page_content(url)
    if page is None:
        # load_page_content already reported the failure; there is
        # nothing to parse, so do not hand None to the parser.
        return []

    doc = libxml2dom.parseString(page, html=1, htmlencoding='utf-8', validate=1)
    return [
        (node.textContent.strip(), node.getAttribute('href'))
        for node in doc.xpath(xpath)
    ]
def main():
    """Print 'text: href' for every link found in the sample page's content area."""
    urls = collect_urls('http://httpbots.com/en/', '//*[@id="content"]/*//a')
    for text, href in urls:
        print('%s: %s' % (text, href))
# Run the sample only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Login
Sign up

