How to Build a Proxy Pool for Python Web Scraping
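The module below defines one crawler per free-proxy site. Every crawler subclasses ProxyBaseCrawler and implements _start_crawl(): it walks a list of URL descriptors, pages through a listing whenever the URL template contains a {} placeholder, pulls the IP, port, and schema out of each row with XPath, and pushes every well-formed IPProxy onto a shared queue for later validation.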
# -*- coding: utf-8 -*-
from lxml import etree
from ipproxy import IPProxy
from proxy_util import strip, request_page, logger
class ProxyBaseCrawler(object):
    def __init__(self, queue=None, website=None, urls=None):
        self.queue = queue
        self.website = website
        self.urls = urls or []  # avoid a shared mutable default argument

    def _start_crawl(self):
        raise NotImplementedError
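
# Each site-specific subclass below overrides _start_crawl() with the same
# skeleton: iterate over url_dict entries, page through the listing while
# rows keep coming back, parse each row with XPath, and push well-formed
# proxies onto self.queue. Only the XPath selectors differ per site.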
class KuaiDailiCrawler(ProxyBaseCrawler):  # KuaiDaili (快代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                # Paginate only when the URL template has a {} placeholder and a page counter.
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")
                for tr in tr_list:
                    # data-title='类型' ("type") matches the attribute used in the site's markup.
                    ip = tr.xpath("./td[@data-title='IP']/text()")[0] if len(
                        tr.xpath("./td[@data-title='IP']/text()")) else None
                    port = tr.xpath("./td[@data-title='PORT']/text()")[0] if len(
                        tr.xpath("./td[@data-title='PORT']/text()")) else None
                    schema = tr.xpath("./td[@data-title='类型']/text()")[0] if len(
                        tr.xpath("./td[@data-title='类型']/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # xpath() returns a list, never None; stop when a page comes back empty
                    has_more = False
class FeiyiDailiCrawler(ProxyBaseCrawler):  # FeiYi proxy (飞蚁代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                # position()>1 skips the table's header row.
                tr_list = html.xpath("//div[@id='main-content']//table/tr[position()>1]")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class WuyouDailiCrawler(ProxyBaseCrawler):  # WuYou proxy (无忧代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                # This site lays out each proxy as a ul of span/li cells rather than a table row.
                ul_list = html.xpath("//div[@class='wlist'][2]//ul[@class='l2']")
                for ul in ul_list:
                    ip = ul.xpath("./span[1]/li/text()")[0] if len(ul.xpath("./span[1]/li/text()")) else None
                    port = ul.xpath("./span[2]/li/text()")[0] if len(ul.xpath("./span[2]/li/text()")) else None
                    schema = ul.xpath("./span[4]/li/text()")[0] if len(ul.xpath("./span[4]/li/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not ul_list:  # stop when a page comes back empty
                    has_more = False
class IPhaiDailiCrawler(ProxyBaseCrawler):  # IPhai proxy (IP海代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table//tr[position()>1]")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class YunDailiCrawler(ProxyBaseCrawler):  # YunDaili (云代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                # This site serves GBK-encoded pages, so decode accordingly.
                html = etree.HTML(request_page(url, encoding='gbk'))
                tr_list = html.xpath("//table/tbody/tr")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class XiCiDailiCrawler(ProxyBaseCrawler):  # XiCi proxy (西刺代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table[@id='ip_list']//tr[@class!='subtitle']")
                for tr in tr_list:
                    ip = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    port = tr.xpath("./td[3]/text()")[0] if len(tr.xpath("./td[3]/text()")) else None
                    schema = tr.xpath("./td[6]/text()")[0] if len(tr.xpath("./td[6]/text()")) else None
                    # Guard against a missing schema cell before calling .lower().
                    if schema is not None and schema.lower() in ("http", "https"):
                        proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                        if proxy._check_format():
                            self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
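
The article never shows the ipproxy and proxy_util modules it imports. Below is a minimal sketch of what those helpers might look like, inferred purely from how the crawlers call them; the requests-based fetcher, the IPProxy fields, and the validation rule are all assumptions, not the original implementation.

# Hypothetical stand-ins for the unshown ipproxy / proxy_util modules.
import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("proxy_pool")


def strip(text):
    # Tolerate None: the crawlers pass possibly-missing XPath results straight in.
    return text.strip() if text is not None else None


def request_page(url, encoding='utf-8'):
    # Fetch a page and return its decoded HTML. A plain requests call stands in
    # for whatever retry / User-Agent logic the original proxy_util provides.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.encoding = encoding
    return response.text


class IPProxy(object):
    # Holds one proxy record and checks that schema/ip/port look sane
    # before the record enters the pool.
    def __init__(self, schema=None, ip=None, port=None):
        self.schema = schema.lower() if schema else None
        self.ip = ip
        self.port = port

    def _check_format(self):
        return bool(self.schema and self.ip and self.port and self.port.isdigit())

And a hypothetical way to wire a crawler to a queue for a local test run. The crawlers only ever call queue.push(), so a thin wrapper around collections.deque is enough; the ProxyQueue class and the KuaiDaili listing URL below are assumptions for illustration.

from collections import deque


class ProxyQueue(object):
    def __init__(self):
        self._items = deque()

    def push(self, proxy):
        self._items.append(proxy)


if __name__ == '__main__':
    queue = ProxyQueue()
    crawler = KuaiDailiCrawler(
        queue=queue,
        website='kuaidaili',
        # 'type' only labels the log line; 'page' plus the {} placeholder
        # enables the pagination loop, which runs until a page has no rows.
        urls=[{'type': 'https', 'url': 'https://www.kuaidaili.com/free/inha/{}/', 'page': 1}],
    )
    crawler._start_crawl()
    print(len(queue._items), 'proxies collected')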