2011年6月6日

twisted lxml

# coding:utf-8
import urlparse
import cStringIO
from twisted.internet import reactor
from twisted.web.client import getPage
from twisted.python.util import println
from lxml import etree
# Disabled helper: the bare string literal below is never executed.
# Its text defines saveImg(data, name), which would write `data` to the file
# `name` in binary mode, decrement `imgCount`, and stop the reactor once the
# counter reaches zero.
# NOTE(review): the text has lost its indentation, and `imgCount` is neither
# defined anywhere visible nor declared `global`, so it cannot be re-enabled
# as-is.
"""
def saveImg(data, name):
print 'Save', name
open(name, 'wb').write(data)
imgCount -= 1
if not imgCount:
reactor.stop()
"""
def parseHtml(html, base):
    """Append every <table> found in an HTML page to ``table.html``.

    Parses ``html`` with lxml's HTML parser, prints the number of
    ``<table>`` elements found, and appends each one, pretty-printed, to
    the file ``table.html`` in the current working directory.

    :param html: raw HTML text of the page to scan.
    :param base: URL the page was fetched from. Accepted so the function
        can be attached as a callback with ``base=url``; the active code
        does not use it.
    """
    parser = etree.HTMLParser()
    tree = etree.parse(cStringIO.StringIO(html), parser)

    tables = tree.xpath("//table")
    # Single-argument form prints the same text under Python 2's print
    # statement and under the print function.
    print(len(tables))

    # The context manager closes the file even if serialising or writing
    # one of the tables raises.
    with open('table.html', 'a') as f:
        for table in tables:
            f.write(etree.tostring(table, pretty_print=True))
# Leftover exploration code; nothing from here down to the `url = ...`
# assignment executes.
# NOTE(review): `tbody` is not defined anywhere in the visible code.
#trs = tbody[0].getchildren()
#for tr in trs:
# print tr.tag
#table = tbody[0].getparent()
#print table.text
#div = table.getparent()
#print div.text
# Disabled snippet (bare string literal): selects every <tr class='basic2'>,
# prints the row's own text when present, then walks its children and prints
# the text of each <td>, retrying with a UTF-8 encoded copy when printing
# raises UnicodeEncodeError. The trailing `else` prints the text of the
# cell's first child.
# NOTE(review): indentation was lost, so which statement that `else` pairs
# with is a guess (presumably `if td.text:`) - confirm before re-enabling.
"""
trs = tree.xpath("//tr[@class='basic2']", smart_strings=False)
for tr in trs:
if tr.text:
print tr.text
for td in tr:
if td.tag=='td':
if td.text:
try:
print td.text
except UnicodeEncodeError:
print td.text.encode('utf-8')
else:
print td[0].text
"""
# Disabled snippet (bare string literal): collects every <img>, adds their
# count to `imgCount`, resolves each `src` against `base`, takes the last
# path segment of the URL as the file name, prints the URL, and fetches it
# with getPage, passing the result to saveImg.
# NOTE(review): both `imgCount` and `saveImg` exist only inside disabled
# code, so this cannot be re-enabled on its own.
"""
imgs = tree.xpath("//img")
imgCount += len(imgs)
for img in imgs:
url = urlparse.urljoin(base, img.get('src'))
name = url.split('/')[-1]
print url
d = getPage(url)
d.addCallback(saveImg, name=name)
"""
# Script entry: download the TWSE MI_QFIIS page and pass its body to parseHtml.
# NOTE(review): input_date=100%2F06%2F03 decodes to "100/06/03"; presumably
# an ROC-calendar date (year 100 = 2011) - confirm before changing it.
url = 'http://www.twse.com.tw/ch/trading/fund/MI_QFIIS/MI_QFIIS.php?select2=all&input_date=100%2F06%2F03'
# getPage returns a Deferred that fires with the page body once the reactor runs.
agent = getPage(url)
agent.addCallback(parseHtml, base=url)
# Print any failure from the download or from parseHtml.
agent.addErrback(println)
# Stop the reactor on either outcome. The active code has no other
# reactor.stop() call, so without this reactor.run() would never return.
agent.addBoth(lambda _: reactor.stop())
reactor.run()