使用 Python 获取 IP 代理列表

无聊写点小例子玩玩，非常简单。
抓取的网站地址为：http://activeproxy.net/en/

主要流程：用 urlopen 获取页面内容，用 SGMLParser 筛选出代理数据，最后存储到文件。

最后存储的文件（active_proxy）片段：

 checktime                  country             proxyip      anonymous     speed      port   connect
57                  Germany      94.249.168.231             no      1181      3128     0,006
14                  Germany       93.180.156.21             no       807      3128     0,009
05                  Germany      188.138.115.15             no       744      3128     0,009
31                  Germany       91.250.83.172             no       578      3128     0,011
30            United States          5.153.4.90             no       606      3128     0,013
25                   France       91.121.58.235             no       490      3128     0,014

抓取用的python脚本:

#!/usr/bin/python

import os
import urllib2
from sgmllib import SGMLParser

# Listing-page URL template; the %d placeholder is filled with the page
# number, which the fetch loop in this script starts at 1.
url = 'http://activeproxy.net/en/%d/'

class ProxyParser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.proxies = []
        self.update()

    def update(self):
        self.tr = False
        self.td = False
        self.proxy = {}
        self.cur_attr = ''
        self.count = 0

    def start_tr(self, attr):
        self.tr = True 

    def end_tr(self):
        if (self.proxy):
            self.proxies.append(self.proxy)
            self.count += 1
        self.tr = False
        self.proxy = {}

    def start_td(self, attr):
        self.td = True 
        if self.tr:
            self.cur_attr = attr[0][1][4:]

    def end_td(self):
        self.td = False
        self.cur_attr = ''

    def handle_data(self, text):
        if self.tr and self.td and not text.startswith('\n'):
            self.proxy[self.cur_attr] = text


i = 1
proxyParser = ProxyParser()
while True:
    proxyParser.update()
    print 'proceeding...', url % (i)
    content = urllib2.urlopen(url % (i)).read()
    content = content.decode('gb2312').encode('utf-8')
    proxyParser.feed(content)
    print proxyParser.count, 'fetched.'
    i += 1
    if i > 100 or proxyParser.count != 20:
        break;


# Write every complete (7-field) record to ./active_proxy as fixed-width
# columns, preceded by a header line naming the columns.
#
# The line ends with '\n' rather than os.linesep: the file is opened in text
# mode, which already translates '\n' to the platform line ending, so writing
# os.linesep would produce '\r\r\n' on Windows.
str_format = '%10s%25s%20s%15s%10s%10s%10s\n'

# Only records with exactly 7 fields fit the 7-slot format string. Choosing
# the header from these avoids a TypeError when the first parsed row is
# incomplete.
complete = [item for item in proxyParser.proxies if len(item) == 7]

# The file is created (possibly empty) even when nothing was collected, and
# the with-block closes it even if a write raises.
with open('active_proxy', 'w') as f:
    if complete:
        columns = complete[0].keys()
        f.write(str_format % tuple(columns))

    for item in complete:
        if all(col in item for col in columns):
            # Look values up by header key so each value lands under its own
            # column regardless of dict iteration order.
            row = [item[col] for col in columns]
        else:
            # Different key set: keep the record, in its own order.
            row = item.values()
        f.write(str_format % tuple(row))

PREVIOUS: two word break problems

NEXT: LeetCode 上 2Sum、3Sum、4Sum 以及 kSum 问题的分析