How to Build a Proxy Pool for Python Web Scraping
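The module below defines one crawler per free-proxy site. Every crawler subclasses ProxyBaseCrawler and implements _start_crawl(): it walks a list of URL descriptors, pages through a listing whenever the URL template contains a {} placeholder, pulls the IP, port, and schema out of each row with XPath, and pushes every well-formed IPProxy onto a shared queue for later validation.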
# -*- coding: utf-8 -*-
from lxml import etree
from ipproxy import IPProxy
from proxy_util import strip, request_page, logger
class ProxyBaseCrawler(object):
    def __init__(self, queue=None, website=None, urls=None):
        self.queue = queue
        self.website = website
        self.urls = urls or []  # avoid a shared mutable default argument

    def _start_crawl(self):
        raise NotImplementedError
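
# Each site-specific subclass below overrides _start_crawl() with the same
# skeleton: iterate over url_dict entries, page through the listing while
# rows keep coming back, parse each row with XPath, and push well-formed
# proxies onto self.queue. Only the XPath selectors differ per site.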
class KuaiDailiCrawler(ProxyBaseCrawler):  # KuaiDaili (快代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                # Paginate only when the URL template has a {} placeholder and a page counter.
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")
                for tr in tr_list:
                    # data-title='类型' ("type") matches the attribute used in the site's markup.
                    ip = tr.xpath("./td[@data-title='IP']/text()")[0] if len(
                        tr.xpath("./td[@data-title='IP']/text()")) else None
                    port = tr.xpath("./td[@data-title='PORT']/text()")[0] if len(
                        tr.xpath("./td[@data-title='PORT']/text()")) else None
                    schema = tr.xpath("./td[@data-title='类型']/text()")[0] if len(
                        tr.xpath("./td[@data-title='类型']/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # xpath() returns a list, never None; stop when a page comes back empty
                    has_more = False
class FeiyiDailiCrawler(ProxyBaseCrawler):  # FeiYi proxy (飞蚁代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                # position()>1 skips the table's header row.
                tr_list = html.xpath("//div[@id='main-content']//table/tr[position()>1]")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class WuyouDailiCrawler(ProxyBaseCrawler):  # WuYou proxy (无忧代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                # This site lays out each proxy as a ul of span/li cells rather than a table row.
                ul_list = html.xpath("//div[@class='wlist'][2]//ul[@class='l2']")
                for ul in ul_list:
                    ip = ul.xpath("./span[1]/li/text()")[0] if len(ul.xpath("./span[1]/li/text()")) else None
                    port = ul.xpath("./span[2]/li/text()")[0] if len(ul.xpath("./span[2]/li/text()")) else None
                    schema = ul.xpath("./span[4]/li/text()")[0] if len(ul.xpath("./span[4]/li/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not ul_list:  # stop when a page comes back empty
                    has_more = False
class IPhaiDailiCrawler(ProxyBaseCrawler):  # IPhai proxy (IP海代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table//tr[position()>1]")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class YunDailiCrawler(ProxyBaseCrawler):  # YunDaili (云代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                # This site serves GBK-encoded pages, so decode accordingly.
                html = etree.HTML(request_page(url, encoding='gbk'))
                tr_list = html.xpath("//table/tbody/tr")
                for tr in tr_list:
                    ip = tr.xpath("./td[1]/text()")[0] if len(tr.xpath("./td[1]/text()")) else None
                    port = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    schema = tr.xpath("./td[4]/text()")[0] if len(tr.xpath("./td[4]/text()")) else None
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
class XiCiDailiCrawler(ProxyBaseCrawler):  # XiCi proxy (西刺代理)
    def _start_crawl(self):
        for url_dict in self.urls:
            logger.info("Start crawling [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
            has_more = True
            url = None
            while has_more:
                if 'page' in url_dict and '{}' in url_dict['url']:
                    url = url_dict['url'].format(str(url_dict['page']))
                    url_dict['page'] = url_dict['page'] + 1
                else:
                    url = url_dict['url']
                    has_more = False
                html = etree.HTML(request_page(url))
                tr_list = html.xpath("//table[@id='ip_list']//tr[@class!='subtitle']")
                for tr in tr_list:
                    ip = tr.xpath("./td[2]/text()")[0] if len(tr.xpath("./td[2]/text()")) else None
                    port = tr.xpath("./td[3]/text()")[0] if len(tr.xpath("./td[3]/text()")) else None
                    schema = tr.xpath("./td[6]/text()")[0] if len(tr.xpath("./td[6]/text()")) else None
                    # Guard against a missing schema cell before calling .lower().
                    if schema is not None and schema.lower() in ("http", "https"):
                        proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                        if proxy._check_format():
                            self.queue.push(proxy)
                if not tr_list:  # stop when a page comes back empty
                    has_more = False
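
The article never shows the ipproxy and proxy_util modules it imports. Below is a minimal sketch of what those helpers might look like, inferred purely from how the crawlers call them; the requests-based fetcher, the IPProxy fields, and the validation rule are all assumptions, not the original implementation.

# Hypothetical stand-ins for the unshown ipproxy / proxy_util modules.
import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("proxy_pool")


def strip(text):
    # Tolerate None: the crawlers pass possibly-missing XPath results straight in.
    return text.strip() if text is not None else None


def request_page(url, encoding='utf-8'):
    # Fetch a page and return its decoded HTML. A plain requests call stands in
    # for whatever retry / User-Agent logic the original proxy_util provides.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.encoding = encoding
    return response.text


class IPProxy(object):
    # Holds one proxy record and checks that schema/ip/port look sane
    # before the record enters the pool.
    def __init__(self, schema=None, ip=None, port=None):
        self.schema = schema.lower() if schema else None
        self.ip = ip
        self.port = port

    def _check_format(self):
        return bool(self.schema and self.ip and self.port and self.port.isdigit())

And a hypothetical way to wire a crawler to a queue for a local test run. The crawlers only ever call queue.push(), so a thin wrapper around collections.deque is enough; the ProxyQueue class and the KuaiDaili listing URL below are assumptions for illustration.

from collections import deque


class ProxyQueue(object):
    def __init__(self):
        self._items = deque()

    def push(self, proxy):
        self._items.append(proxy)


if __name__ == '__main__':
    queue = ProxyQueue()
    crawler = KuaiDailiCrawler(
        queue=queue,
        website='kuaidaili',
        # 'type' only labels the log line; 'page' plus the {} placeholder
        # enables the pagination loop, which runs until a page has no rows.
        urls=[{'type': 'https', 'url': 'https://www.kuaidaili.com/free/inha/{}/', 'page': 1}],
    )
    crawler._start_crawl()
    print(len(queue._items), 'proxies collected')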