# SushiWen's Blog: twisted beautifulsoup

# coding:utf-8

import urlparse

import cStringIO

from twisted.internet import reactor

from twisted.web.client import getPage

from twisted.python.util import println

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

import re

import csv

import codecs

def getContent(td):
    """Return the first child of a table cell, converted with ``str()``.

    When the cell contains a ``<div>``, the first child of that ``<div>``
    is used; otherwise the first child of the cell itself is used.

    Args:
        td: A parsed cell element exposing ``find()`` and ``contents``.

    Returns:
        ``str()`` of the first child node, or ``''`` when the selected
        element has no children (an empty cell).
    """
    inner = td.find("div")
    node = inner if inner else td
    children = node.contents
    if not children:
        # An empty cell used to raise IndexError on contents[0]; report it
        # as an empty field so the rest of the row can still be exported.
        return ''
    return str(children[0])

def parseHtml(html, base=None):
    """Parse the fetched page and write the top of its table to stock.csv.

    Looks up the ``<div id="tbl-container">`` element, takes the first 5
    ``<tr>`` rows and the first 8 ``<td>`` cells of each, converts every
    cell with ``getContent`` and writes the rows to ``stock.csv`` using the
    csv 'excel' dialect. The reactor is stopped afterwards, whether or not
    the parsing succeeded.

    Args:
        html: Markup of the page, passed straight to ``BeautifulSoup``.
        base: Accepted so the function can be registered with a ``base``
            keyword; it is not used by the body.

    Raises:
        Any exception raised while parsing or writing is propagated after
        the reactor has been asked to stop (for example AttributeError when
        the 'tbl-container' div is missing and ``find`` returns None).
    """
    try:
        # Single-argument parenthesised print behaves the same as the
        # statement form under Python 2.
        print('parsed')
        soup = BeautifulSoup(html)
        print(soup.originalEncoding)

        container = soup.find('div', id='tbl-container')
        rows = [
            [getContent(cell) for cell in row.findAll("td")[:8]]
            for row in container.findAll("tr")[:5]
        ]

        # NOTE(review): the Python 2 csv documentation recommends opening
        # the file in binary mode ('wb') on platforms where it matters;
        # the original text mode is kept here -- confirm before changing.
        with open('stock.csv', mode='w') as out:
            csv.writer(out, dialect='excel').writerows(rows)
    finally:
        # Stop the reactor on failure as well; otherwise an exception above
        # would leave reactor.run() blocking forever.
        reactor.stop()

url = 'http://www.twse.com.tw'

# Offline variant kept from the original: parse a saved table.html instead
# of fetching the page.
#file = open('table.html', 'r')
#parseHtml(file)
#file.close()


def _reportAndStop(failure):
    """Print a failure from the fetch/parse chain, then stop the reactor."""
    println(failure)
    try:
        reactor.stop()
    except RuntimeError:
        # stop() raises when a shutdown was already requested; there is
        # nothing left to do in that case.
        pass


# Fetch the page, hand the body to parseHtml, and make sure a failure is
# both reported and ends the program instead of leaving the reactor running.
agent = getPage(url)
agent.addCallback(parseHtml, base=url)
agent.addErrback(_reportAndStop)

reactor.run()

# SushiWen's Blog