无聊写点小例子玩玩,非常简单。
抓取的网站地址为:http://activeproxy.net/en/
主要流程:先用urllib2的urlopen获取页面内容,再用SGMLParser筛选出代理数据,最后把结果存储到文件active_proxy中。
最后的存储文件片断:
checktime country proxyip anonymous speed port connect
02:57 Germany 94.249.168.231 no 1181 3128 0,006
03:14 Germany 93.180.156.21 no 807 3128 0,009
18:05 Germany 188.138.115.15 no 744 3128 0,009
06:31 Germany 91.250.83.172 no 578 3128 0,011
00:30 United States 5.153.4.90 no 606 3128 0,013
14:25 France 91.121.58.235 no 490 3128 0,014
抓取用的python脚本:
#!/usr/bin/python
import os
import urllib2
from sgmllib import SGMLParser
# URL template for the listing pages; the %d placeholder is filled with the
# page number by the crawl loop.
url = 'http://activeproxy.net/en/%d/'
class ProxyParser(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.proxies = []
self.update()
def update(self):
self.tr = False
self.td = False
self.proxy = {}
self.cur_attr = ''
self.count = 0
def start_tr(self, attr):
self.tr = True
def end_tr(self):
if (self.proxy):
self.proxies.append(self.proxy)
self.count += 1
self.tr = False
self.proxy = {}
def start_td(self, attr):
self.td = True
if self.tr:
self.cur_attr = attr[0][1][4:]
def end_td(self):
self.td = False
self.cur_attr = ''
def handle_data(self, text):
if self.tr and self.td and not text.startswith('\n'):
self.proxy[self.cur_attr] = text
i = 1
proxyParser = ProxyParser()
while True:
proxyParser.update()
print 'proceeding...', url % (i)
content = urllib2.urlopen(url % (i)).read()
content = content.decode('gb2312').encode('utf-8')
proxyParser.feed(content)
print proxyParser.count, 'fetched.'
i += 1
if i > 100 or proxyParser.count != 20:
break;
f = open('active_proxy', 'w')
str_format = '%10s%25s%20s%15s%10s%10s%10s' + os.linesep
if proxyParser.proxies:
f.write(str_format % tuple(proxyParser.proxies[0].keys()))
for item in proxyParser.proxies:
if len(item.values()) == 7:
f.write(str_format % tuple(item.values()))
f.close()