2011年6月7日

twisted beautifulsoup

# coding:utf-8
import urlparse
import cStringIO
from twisted.internet import reactor
from twisted.web.client import getPage
from twisted.python.util import println
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
import re
import csv
import codecs

def getContent(td):
    """Return the first child of a table cell as a string.

    When the cell contains a ``<div>``, the first child of that ``<div>``
    is used; otherwise the first child of the cell itself is used.

    :param td: parsed cell element exposing ``find()`` and ``contents``.
    :returns: ``str()`` of the first child, or ``""`` when the selected
        element has no children (for example an empty ``<td></td>``).
    """
    tddiv = td.find("div")
    # Prefer the nested <div> when there is one, else fall back to the cell.
    holder = td if tddiv is None else tddiv
    children = holder.contents
    if not children:
        # Indexing [0] on an empty cell would raise IndexError and abort the
        # whole export; an empty field is the natural CSV value instead.
        return ""
    return str(children[0])

def parseHtml(html, base=None):
    """Export the leading rows of the ``tbl-container`` table to stock.csv.

    Parses *html*, locates ``<div id="tbl-container">``, takes up to the
    first 8 cells of each of its first 5 ``<tr>`` rows, and writes them to
    ``stock.csv`` using the ``excel`` CSV dialect. The reactor is stopped
    afterwards, whether or not the export succeeded.

    :param html: markup of the page to parse.
    :param base: accepted so the function can be registered as a callback
        with ``base=url``; not used by the body.
    :raises ValueError: if the page has no ``<div id="tbl-container">``.
    """
    try:
        print('parsed')
        soup = BeautifulSoup(html)
        print(soup.originalEncoding)
        div = soup.find('div', id='tbl-container')
        if div is None:
            # Fail with a clear message rather than an AttributeError on None.
            raise ValueError("no <div id='tbl-container'> found in the page")
        dat = [[getContent(cell) for cell in row.findAll("td")[:8]]
               for row in div.findAll("tr")[:5]]
        # Python 2's csv module needs the file opened in binary mode,
        # otherwise extra blank rows appear on platforms that translate
        # line endings. ``with`` closes the file even if writing fails.
        with open('stock.csv', mode='wb') as out:
            csv.writer(out, dialect='excel').writerows(dat)
    finally:
        # Always shut the reactor down; without this any parsing error would
        # leave reactor.run() blocking forever.
        reactor.stop()
# Page to download and parse.
url = 'http://www.twse.com.tw'
#file = open('table.html', 'r')
#parseHtml(file)
#file.close()


def _reportAndStop(failure):
    """Print a failure from the fetch/parse chain, then stop the reactor.

    Without the stop, a failed download would leave reactor.run() blocking
    forever after the error has been printed.
    """
    from twisted.internet import error
    println(failure)
    try:
        reactor.stop()
    except error.ReactorNotRunning:
        # A shutdown has already been requested; nothing left to do.
        pass


# Fetch the page asynchronously; parseHtml handles the body on success and
# _reportAndStop handles any error raised by the fetch or by parseHtml.
agent = getPage(url)
agent.addCallback(parseHtml, base=url)
agent.addErrback(_reportAndStop)
reactor.run()

沒有留言:

張貼留言