Python web crawler

jiancheng.zhai

Created: 2016-10-12 Wed 16:14

Workflow

flow.png

Making requests

urllib / urllib2

import urllib
import urllib2

values = {}
values['username'] = "1016903103@qq.com"
values['password'] = "XXXX"
headers = {'User-Agent': 'Mozilla/6.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# POST the form data with custom headers
data = urllib.urlencode(values)
request = urllib2.Request("http://www.baidu.com", data, headers=headers)
response = urllib2.urlopen(request)
print response.read()

requests: HTTP for Humans - sending requests

import requests

# Inspect / override the response encoding
r = requests.get("http://httpbin.org/get")
r.encoding                    # 'utf-8'
r.encoding = 'ISO-8859-1'

# Custom headers
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)

# Sending requests with the other HTTP verbs
r = requests.put("http://httpbin.org/put")
r = requests.delete("http://httpbin.org/delete")
r = requests.head("http://httpbin.org/get")
r = requests.options("http://httpbin.org/get")

requests: HTTP for Humans - passing parameters

payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.get("http://httpbin.org/get", params=payload)
print(r.url)
# http://httpbin.org/get?key2=value2&key1=value1

# Read the response content
r = requests.get('https://github.com/timeline.json')
r.text
# u'[{"repository":{"open_issues":0,"url":"https://github.com/...

Parsing

html - beautifulsoup

import urllib2
from bs4 import BeautifulSoup

req = urllib2.Request(url, headers=headers)   # url / headers as in the request section
html = urllib2.urlopen(req, timeout=20).read()
soup = BeautifulSoup(html, "lxml")
title = soup.title.get_text()
tr_list = soup.find_all('table')[0].find_all('tr')

html - pyquery

import requests
from pyquery import PyQuery as pq

# Parse the page with chained CSS selectors
doc = pq(requests.get(url, headers=headers).text)
en_mean_list = doc('div.trans-container')('ul')('p.wordGroup')

a.png

js - selenium

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

base_url = 'http://www.baidu.com/'
browser = webdriver.Chrome()
browser.get(base_url)

# Read the rendered page source and parse it with pyquery
doc = pq(browser.page_source)
doc.xhtml_to_html()
# Click an element
browser.find_element_by_css_selector('.pagination .next').click()
# Get the cookies
cookies = browser.get_cookies()

js - PhantomJS
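
The slide only names the tool; since PhantomJS speaks the same WebDriver protocol, it can be swapped in for Chrome above to render JavaScript pages headlessly (sketch, assuming the phantomjs binary is on PATH):

from selenium import webdriver

# Headless WebKit: same API as webdriver.Chrome(), but no visible window.
browser = webdriver.PhantomJS()
browser.get('http://www.baidu.com/')
print browser.title            # the JS-rendered page is available as usual
html = browser.page_source
browser.quit()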

Optimization

Anti-blocking - proxy pool

import requests
from bs4 import BeautifulSoup

page_num = 10
url_prefix = "http://www.xicidaili.com/wn/{pn}"
url_list = [url_prefix.format(pn=x) for x in range(1, page_num)]
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36',
           'Content-Type': 'application/x-www-form-urlencoded'}

# Scrape one listing page into the pool; loop over url_list for the rest
resp = requests.get(url_list[0], headers=headers)
soup = BeautifulSoup(resp.text, "lxml")
table = soup.find_all('table')[0]
for tr in table.find_all('tr')[1:]:        # skip the header row
    res_list = [td.text.encode('utf8').strip() for td in tr.find_all('td')]
    ip, port, country, proxy_type, protocol = res_list[1:6]
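
Once the pool is filled, requests can rotate through it via its proxies parameter; a hedged sketch (the pool contents and the drop-on-failure policy here are illustrative):

import random
import requests

proxy_pool = ['123.57.76.102:80', '119.28.10.1:8080']     # filled by the scraper above (example values)

def fetch_via_proxy(url, headers):
    # Try random proxies until one answers; drop dead ones from the pool.
    while proxy_pool:
        proxy = random.choice(proxy_pool)
        proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            proxy_pool.remove(proxy)
    raise RuntimeError('proxy pool exhausted')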

Other options: TOR / ADSL redialing
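
For the TOR route, requests (with the SOCKS extra, pip install requests[socks]) can be pointed at the local Tor daemon; a sketch assuming Tor is listening on its default SOCKS port 9050:

import requests

# Route the request through Tor so the site sees the exit node's IP.
tor_proxies = {
    'http':  'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050',
}
r = requests.get('http://httpbin.org/ip', proxies=tor_proxies, timeout=30)
print r.text            # shows the exit-node IP rather than your own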

Anti-blocking - headers

good_header.jpg

bad_header.png
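
Beyond the User-Agent, a fuller browser-like header set (the values below are illustrative) makes requests harder to filter out:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Referer': 'http://www.baidu.com/',
    'Connection': 'keep-alive',
}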

Anti-blocking - fake headers

# Rotate the User-Agent randomly on every request
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}

Anti-blocking - a reasonable crawl rate

import time, random
time.sleep(5 + random.randint(1, 3))   # 6-8 seconds between requests

DNS Server
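
The slide only names the idea; one hedged reading is that DNS lookups become a bottleneck at high request rates, so hostnames can be resolved once and cached locally (the helper below is illustrative, standard library only):

import socket
import urlparse

_dns_cache = {}

def resolve_once(url):
    # Resolve each hostname a single time and reuse the IP afterwards.
    host = urlparse.urlparse(url).hostname
    if host not in _dns_cache:
        _dns_cache[host] = socket.gethostbyname(host)
    return _dns_cache[host]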

cookie

import urllib2
import browsercookie

# Reuse the cookies already stored by the local Chrome profile
cj = browsercookie.chrome()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
response = opener.open(url)
html = response.read()

Chrome extension: EditThisCookie http://www.editthiscookie.com/
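
The same browser cookie jar also plugs straight into requests, which accepts a cookielib CookieJar for its cookies parameter:

import requests
import browsercookie

# Reuse the logged-in Chrome session directly in requests.
r = requests.get(url, cookies=browsercookie.chrome())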

bloom filter

# A plain hash set costs too much memory and time on large url sets
from pybloom import BloomFilter, ScalableBloomFilter

# 0.1% false-positive rate
bf = BloomFilter(capacity=10000, error_rate=0.001)
bf.add('test')
print 'test' in bf          # True

# Grows automatically as more elements are added
sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
sbf.add('dddd')
print 'dddd' in sbf         # True

Concurrency - process pool

from multiprocessing import Pool, Process, Manager

POISON_PILL = "POISON_END_PROCESSING"
manager = Manager()
# Queues shared across processes
working_queue = manager.Queue()
output_queue = manager.Queue()
WORKER_NUM = 20

# Start the writer process
writer_proc = Process(target=write_to_disk, args=(output_queue, ))
writer_proc.start()

# Worker pool; apply() blocks, apply_async() does not
pool = Pool(processes=WORKER_NUM)
pool.apply(crawl_url, (working_queue, output_queue))
#pool.apply_async(crawl_url, (working_queue, output_queue))
pool.close()
pool.join()
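
The pool slide assumes two helpers that are not shown, crawl_url and write_to_disk; the sketch below is one hedged way to fill them in (urls are assumed to have been put on working_queue beforehand, and the main process should put POISON_PILL on output_queue after pool.join()):

import urllib2

def crawl_url(working_queue, output_queue):
    # Drain the shared url queue and hand the fetched pages to the writer.
    while not working_queue.empty():
        url = working_queue.get()
        try:
            html = urllib2.urlopen(url, timeout=10).read()
            output_queue.put((url, html))
        except Exception:
            pass

def write_to_disk(output_queue):
    # Single writer: serialize disk writes, stop when the poison pill arrives.
    with open('crawl_result.txt', 'w') as writer:
        while True:
            item = output_queue.get()
            if item == POISON_PILL:
                break
            writer.write('%s\t%s\n' % item)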

Concurrency - coroutines

import urllib2
from bs4 import BeautifulSoup

def crawl_url():
    send_headers = {'User-Agent': 'Mozilla/6.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    crawl_res = {}
    while True:
        url = yield crawl_res
        req = urllib2.Request(url, headers=send_headers)
        html = urllib2.urlopen(req, timeout=10).read()
        title = BeautifulSoup(html, "lxml").title.get_text()
        crawl_res[url] = title

crawler = crawl_url()
# Prime the generator: run it up to the first yield before sending urls
crawl_res = crawler.next()

result_list = []
for url in url_list:
    crawl_res = crawler.send(url)
    result_list.append(crawl_res)

Concurrency - multiprocessing + coroutines

# multiprocessing.Pool conflicts with gevent, so spawn Process workers directly
from multiprocessing import Process
import gevent
from gevent import monkey; monkey.patch_all()

MAX_TASK_NUM = 100          # greenlets per worker process (example value)

# Each worker process runs its batch of greenlets
def process_start(tasks):
    gevent.joinall(tasks)


with open(crawl_url_file) as url_reader:
    url = url_reader.readline().strip()
    task_list = []
    i = 0
    while url:
        i += 1
        task_list.append(gevent.spawn(crawl_method, url))

        if i >= MAX_TASK_NUM:
            p = Process(target=process_start, args=(task_list, ))
            p.start()
            task_list = []
            i = 0
        url = url_reader.readline().strip()

    # Crawl the remaining urls
    if task_list:
        p = Process(target=process_start, args=(task_list, ))
        p.start()
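
crawl_method above is assumed to be defined elsewhere; because monkey.patch_all() makes the socket layer cooperative, a plain blocking fetch is enough for each greenlet (sketch):

import requests

def crawl_method(url):
    # Blocking I/O here yields to the other greenlets thanks to monkey.patch_all().
    try:
        r = requests.get(url, timeout=10)
        return url, r.status_code, len(r.text)
    except requests.RequestException:
        return url, None, 0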

Concurrency - distributed

from multiprocessing import Process
import redis

redis_conn = redis.StrictRedis(host='192.168.1.108', port=6379)
WORKING_QUEUE = 'WORKING'
OUTPUT_QUEUE = 'OUTPUT'
MAX_TASK_NUM = 100          # urls per worker process (example value)

# Writer process
writer = Process(target=write_to_disk, args=(redis_conn, OUTPUT_QUEUE))
writer.start()

# Worker processes: pop urls off the shared Redis queue in batches
task_list = []
i = 0
while redis_conn.llen(WORKING_QUEUE) > 0:
    i += 1
    task_list.append(redis_conn.lpop(WORKING_QUEUE))

    if i >= MAX_TASK_NUM:
        p = Process(target=crawl_url, args=(task_list, OUTPUT_QUEUE))
        p.start()
        task_list = []
        i = 0

# Crawl the remaining urls
if task_list:
    p = Process(target=crawl_url, args=(task_list, OUTPUT_QUEUE))
    p.start()
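
Two pieces are assumed around this loop: something has to seed WORKING_QUEUE and something has to drain OUTPUT_QUEUE. A hedged sketch of both ends using plain Redis list operations (file names are illustrative):

# Producer: seed the shared working queue from a url file.
with open('url_list.txt') as f:
    for line in f:
        redis_conn.rpush(WORKING_QUEUE, line.strip())

# Writer: block on the output queue and append results to disk.
def write_to_disk(redis_conn, output_queue_name):
    with open('crawl_result.txt', 'a') as writer:
        while True:
            _, item = redis_conn.blpop(output_queue_name)
            writer.write(item + '\n')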

Storage

sqlite

# Lightweight
# Cross-platform
# Fast reads and writes
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Column, Integer, String, FLOAT, DATE
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('sqlite:///proxy.db')
Base = declarative_base()

# Define the data model
class Proxy(Base):
    __tablename__ = 'proxy'

    id = Column(Integer, primary_key=True)
    ip = Column(String(100))
    port = Column(String(100))
    country = Column(String(100))
    proxy_type = Column(String(100))
    protocol = Column(String(100))
    delay = Column(FLOAT)
    date = Column(DATE)

Base.metadata.create_all(engine)
DBSession = sessionmaker(bind=engine)
session = DBSession()
proxy_list = session.query(Proxy).all()
session.close()
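
Writing a scraped proxy back goes through the same session; a short example (field values are illustrative):

import datetime

session = DBSession()
session.add(Proxy(ip='127.0.0.1', port='1080', country='CN',
                  proxy_type='high-anonymity', protocol='HTTP',
                  delay=0.35, date=datetime.date.today()))
session.commit()
session.close()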

mongodb

from pymongo import MongoClient

client = MongoClient()
db = client.proxy_db
proxy = db.get_collection("proxy")
proxy_item = {"ip": "127.0.0.1", "port": 1080}
proxy.insert_one(proxy_item)
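
Reading back is just as direct; for example:

# Look one document up, or iterate over a filtered cursor.
print proxy.find_one({"ip": "127.0.0.1"})
for item in proxy.find({"port": 1080}):
    print item["ip"]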

postgreSQL

# JSONB columns are fully indexable
# Fast and stable performance on large data volumes
# Efficient inserts and queries

SQLALCHEMY_DATABASE_URI = "postgresql://Patrick:123456@localhost/youdao"
from sqlalchemy.dialects.postgresql import JSONB

class YoudaoDict(db.Model):

    __tablename__ = 'youdao_dict'

    id = db.Column(db.Integer, primary_key=True)
    char = db.Column(db.String(20))
    char_unicode = db.Column(db.String(20))
    content = db.Column(JSONB)

    def __repr__(self):
        return "<char : %s>" % self.char

# Query inside the JSONB document (query is the search term)
res_list = YoudaoDict.query.filter(
    YoudaoDict.content['detail_mean'].astext.like("%" + query + "%")).all()[:10]
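
The db object used above is a Flask-SQLAlchemy instance; a minimal setup sketch wiring it to the URI defined in this section:

from flask import Flask
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = SQLALCHEMY_DATABASE_URI
db = SQLAlchemy(app)
db.create_all()          # create youdao_dict if it does not exist yet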

Frameworks

scrapy - architecture

scrapy.png

scrapy - tree

c.png

scrapy - items.py

import scrapy

class DomainItem(scrapy.Item):
    domain = scrapy.Field()
    title = scrapy.Field()
    keywords = scrapy.Field()
    desc = scrapy.Field()

scrapy - spider.py

from scrapy import Spider
from domain_parse.items import DomainItem

class DomainSpider(Spider):
    name = 'domain'
    domain_list = open('domain_list')
    start_urls = [ domain.strip() for domain in domain_list.readlines() ]
    domain_list.close()

    def parse(self, response):
        domain = response.url
        title = response.selector.xpath("//title/text()").extract_first()
        keywords = response.selector.xpath("//meta[@name='keywords']/@content").extract_first()
        desc = response.selector.xpath("//meta[@name='description']/@content").extract_first()
        yield DomainItem(domain=domain, title=title, keywords=keywords, desc=desc)

scrapy - settings.py

BOT_NAME = 'domain_parse'
SPIDER_MODULES = ['domain_parse.spiders']
NEWSPIDER_MODULE = 'domain_parse.spiders'
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 90,
    'domain_parse.spiders.random_proxy.RandomProxy': 100,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'domain_parse.spiders.user_agent_pool.UserAgentPoolMiddleware': 500,
}
PROXY_DB = 'proxy.db'
DOMAIN_LIST = 'domain_list'
CONCURRENT_REQUESTS = 100
REACTOR_THREADPOOL_MAXSIZE = 20
COOKIES_ENABLED = False
REDIRECT_ENABLED = False
AJAXCRAWL_ENABLED = True
DOWNLOAD_HANDLERS = {'s3': None}
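
The DOWNLOADER_MIDDLEWARES above point at two project files that are not shown (random_proxy.py and user_agent_pool.py); the sketch below shows the user-agent half, following scrapy's standard downloader-middleware interface (the pool contents are illustrative):

# domain_parse/spiders/user_agent_pool.py (sketch)
import random

USER_AGENT_POOL = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
]

class UserAgentPoolMiddleware(object):
    def process_request(self, request, spider):
        # Attach a randomly chosen User-Agent to every outgoing request.
        request.headers['User-Agent'] = random.choice(USER_AGENT_POOL)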

pyspider

  • Write script in Python
  • Powerful WebUI with script editor, task monitor, project manager and result viewer
  • MySQL, MongoDB, Redis, SQLite, Elasticsearch; PostgreSQL with SQLAlchemy as database backend
  • RabbitMQ, Beanstalk, Redis and Kombu as message queue
  • Task priority, retry, periodical, recrawl by age, etc…
  • Distributed architecture, Crawl Javascript pages, Python 2&3, etc…

https://github.com/binux/pyspider
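
A pyspider script is a Handler class edited and run from the WebUI; the sketch below follows the project's quick-start pattern (the seed url is chosen here only as an example):

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        # Seed url, re-queued once a day.
        self.crawl('http://www.baidu.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every absolute link found on the page.
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        return {"url": response.url, "title": response.doc('title').text()}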

Q & A