Scrapy 爬虫 - 实战


(案例一)手机 App 抓包爬虫

1. items.py

class DouyuspiderItem(scrapy.Item):
    name = scrapy.Field()# 存储照片的名字
    imagesUrls = scrapy.Field()# 照片的url路径
    imagesPath = scrapy.Field()# 照片保存在本地的路径

2. spiders/douyu.py

import scrapy
import json
from douyuSpider.items import DouyuspiderItem

class DouyuSpider(scrapy.Spider):
    name = "douyu"
    allowd_domains = ["http://capi.douyucdn.cn"]

    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

  def parse(self, response):
      # 返回从json里获取 data段数据集合
      data = json.loads(response.text)["data"]

      for each in data:
          item = DouyuspiderItem()
          item["name"] = each["nickname"]
          item["imagesUrls"] = each["vertical_src"]

          yield item

      self.offset += 20
      yield scrapy.Request(self.url + str(self.offset), callback = self.parse)

3. 设置 setting.py


ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# Images 的存放位置,之后会在pipelines.py里调用
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# user-agent
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'

4. pipelines.py

import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["imagesUrls"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # 固定写法,获取图片路径,同时判断这个路径是否正确,如果正确,就放到 image_path里,ImagesPipeline源码剖析可见
        image_path = [x["path"] for ok, x in results if ok]

        os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["name"] + ".jpg")
        item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"]

        return item

#get_media_requests的作用就是为每一个图片链接生成一个Request对象,这个方法的输出将作为item_completed的输入中的results,results是一个元组,每个元组包括(success, imageinfoorfailure)。如果success=true,imageinfoor_failure是一个字典,包括url/path/checksum三个key。

在项目根目录下新建 main.py 文件,用于调试

from scrapy import cmdline
cmdline.execute('scrapy crawl douyu'.split())

执行程序

py2 main.py

(案例二)阳光热线问政平台

http://wz.sun0769.com/index.php/question/questionType?type=4

爬取投诉帖子的编号、帖子的 url、帖子的标题,和帖子里的内容。

items.py

import scrapy

class DongguanItem(scrapy.Item):
    # 每个帖子的标题
    title = scrapy.Field()
    # 每个帖子的编号
    number = scrapy.Field()
    # 每个帖子的文字内容
    content = scrapy.Field()
    # 每个帖子的url
    url = scrapy.Field()

spiders/sunwz.py

Spider 版本
# -*- coding: utf-8 -*-

import scrapy
from dongguan.items import DongguanItem

class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # 取出每个页面里帖子链接列表
        links = response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()
        # 迭代发送每个帖子的请求,调用parse_item方法处理
        for link in links:
            yield scrapy.Request(link, callback = self.parse_item)
        # 设置页码终止条件,并且每次发送新的页面请求调用parse方法处理
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback = self.parse)

    # 处理每个帖子里
    def parse_item(self, response):
        item = DongguanItem()
        # 标题
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]

        # 编号
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # 文字内容,默认先取出有图片情况下的文字内容列表
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # 如果没有内容,则取出没有图片情况下的文字内容列表
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
            # content为列表,通过join方法拼接为字符串,并去除首尾空格
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()

        # 链接
        item['url'] = response.url

        yield item
CrawlSpider 版本

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem
import time


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # 每一页的匹配规则
    pagelink = LinkExtractor(allow=('type=4'))
    # 每个帖子的匹配规则
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+.shtml')

    rules = [
        # 本案例为特殊情况,需要调用deal_links方法处理每个页面里的链接
        Rule(pagelink, process_links = "deal_links", follow = True),
        Rule(contentlink, callback = 'parse_item')
    ]

    # 需要重新处理每个页面里的链接,将链接里的‘Type&type=4?page=xxx’替换为‘Type?type=4&page=xxx’(或者是Type&page=xxx?type=4’替换为‘Type?page=xxx&type=4’),否则无法发送这个链接
    def deal_links(self, links):
        for link in links:
            link.url = link.url.replace("?","&").replace("Type&", "Type?")
            print link.url
        return links


    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        # 标题
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]

        # 编号
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # 文字内容,默认先取出有图片情况下的文字内容列表
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # 如果没有内容,则取出没有图片情况下的文字内容列表
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
            # content为列表,通过join方法拼接为字符串,并去除首尾空格
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()

        # 链接
        item['url'] = response.url

        yield item

pipelines.py

# -*- coding: utf-8 -*-

# 文件处理类库,可以指定编码格式
import codecs
import json

class JsonWriterPipeline(object):

    def __init__(self):
        # 创建一个只写文件,指定文本编码格式为utf-8
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def spider_closed(self, spider):
        self.file.close()

settings.py

ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}

# 日志文件名和处理等级
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline
cmdline.execute('scrapy crawl sunwz'.split())

执行程序

py2 main.py

(案例三)新浪网分类资讯爬虫

爬取新浪网导航页所有下所有大类、小类、小类里的子链接,以及子链接页面的新闻内容。

新建项目

scrapy startproject sina

items.py

import scrapy
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaItem(scrapy.Item):
    # 大类的标题 和 url
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # 小类的标题 和 子url
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # 小类目录存储路径
    subFilename = scrapy.Field()

    # 小类下的子链接
    sonUrls = scrapy.Field()

    # 文章标题和内容
    head = scrapy.Field()
    content = scrapy.Field()

spiders/sina.py

# -*- coding: utf-8 -*-

from Sina.items import SinaItem
import scrapy
import os

import sys
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaSpider(scrapy.Spider):
    name= "sina"
    allowed_domains= ["sina.com.cn"]
    start_urls= [
       "http://news.sina.com.cn/guide/"
    ]

    def parse(self, response):
        items= []
        # 所有大类的url 和 标题
        parentUrls = response.xpath('//div[@id=\"tab01\"]/div/h3/a/@href').extract()
        parentTitle = response.xpath("//div[@id=\"tab01\"]/div/h3/a/text()").extract()

        # 所有小类的ur 和 标题
        subUrls  = response.xpath('//div[@id=\"tab01\"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id=\"tab01\"]/div/ul/li/a/text()').extract()

        #爬取所有大类
        for i in range(0, len(parentTitle)):
            # 指定大类目录的路径和目录名
            parentFilename = "./Data/" + parentTitle[i]

            #如果目录不存在,则创建目录
            if(not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)

            # 爬取所有小类
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # 保存大类的title和urls
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # 检查小类的url是否以同类别大类url开头,如果是返回True (sports.sina.com.cn 和 sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # 如果属于本大类,将存储目录放在本大类目录下
                if(if_belong):
                    subFilename =parentFilename + '/'+ subTitle[j]
                    # 如果目录不存在,则创建目录
                    if(not os.path.exists(subFilename)):
                        os.makedirs(subFilename)

                    # 存储 小类url、title和filename字段数据
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] =subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        #发送每个小类url的Request请求,得到Response连同包含meta数据 一同交给回调函数 second_parse 方法处理
        for item in items:
            yield scrapy.Request( url = item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    #对于返回的小类的url,再进行递归请求
    def second_parse(self, response):
        # 提取每次Response的meta数据
        meta_1= response.meta['meta_1']

        # 取出小类里所有子链接
        sonUrls = response.xpath('//a/@href').extract()

        items= []
        for i in range(0, len(sonUrls)):
            # 检查每个链接是否以大类url开头、以.shtml结尾,如果是返回True
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # 如果属于本大类,获取字段值放在同一个item下便于传输
            if(if_belong):
                item = SinaItem()
                item['parentTitle'] =meta_1['parentTitle']
                item['parentUrls'] =meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        #发送每个小类下子链接url的Request请求,得到Response后连同包含meta数据 一同交给回调函数 detail_parse 方法处理
        for item in items:
                yield scrapy.Request(url=item['sonUrls'], meta={'meta_2':item}, callback = self.detail_parse)

    # 数据解析方法,获取文章标题和内容
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id=\"main_title\"]/text()')
        content_list = response.xpath('//div[@id=\"artibody\"]/p/text()').extract()

        # 将p标签里的文本内容合并到一起
        for content_one in content_list:
            content += content_one

        item['head']= head
        item['content']= content

        yield item

pipelines.py

from scrapy import signals
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']

        # 文件名为子链接url中间部分,并将 / 替换为 _,保存为 .txt格式
        filename = sonUrls[7:-6].replace('/','_')
        filename += ".txt"

        fp = open(item['subFilename']+'/'+filename, 'w')
        fp.write(item['content'])
        fp.close()

        return item

settings.py

BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline
cmdline.execute('scrapy crawl sina'.split())

执行程序

py2 main.py

(案例四)图片下载器爬虫

items.py

class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

spiders/coser.py

# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from Cosplay.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173'
    )

    def parse(self, response):
        sel = Selector(response)

        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)

        return l.load_item()

pipelines.py

import requests
from Cosplay import settings
import os


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue

                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break

                        handle.write(block)

            item['images'] = images
        return item

settings.py


ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25    # 250 ms of delay

在项目根目录下新建main.py文件,用于调试

from scrapy import cmdline
cmdline.execute('scrapy crawl coser'.split())

执行程序

py2 main.py

(案例五)用 Pymongo 保存数据

爬取豆瓣电影top250movie.douban.com/top250的电影数据,并保存在MongoDB中。

items.py

class DoubanspiderItem(scrapy.Item):
    # 电影标题
    title = scrapy.Field()
    # 电影评分
    score = scrapy.Field()
    # 电影信息
    content = scrapy.Field()
    # 简介
    info = scrapy.Field()

spiders/douban.py

import scrapy
from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):

        item = DoubanspiderItem()

        movies = response.xpath("//div[@class=\'info\']")

        for each in movies:
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # 以;作为分隔,将content列表里所有元素合并成一个新的字符串
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['info'] = info[0]
            # 提交item

            yield item

        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)

pipelines.py


from scrapy.conf import settings
import pymongo

class DoubanspiderPipeline(object):
    def __init__(self):
        # 获取setting主机名、端口号和数据库名
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # pymongo.MongoClient(host, port) 创建MongoDB链接
        client = pymongo.MongoClient(host=host,port=port)

        # 指向指定的数据库
        mdb = client[dbname]
        # 获取数据库里存放数据的表名
        self.post = mdb[settings['MONGODB_DOCNAME']]


    def process_item(self, item, spider):
        data = dict(item)
        # 向指定的表里添加数据
        self.post.insert(data)
        return item

settings.py

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
        'doubanSpider.pipelines.DoubanspiderPipeline' : 300
        }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MONGODB 主机环回地址127.0.0.1
MONGODB_HOST = '127.0.0.1'
# 端口号,默认是27017
MONGODB_PORT = 27017
# 设置数据库名称
MONGODB_DBNAME = 'DouBan'
# 存放本次数据的表名称
MONGODB_DOCNAME = 'DouBanMovies'

运行

启动MongoDB数据库需要两个命令:

mongod:是mongoDB数据库进程本身
mongo:是命令行shell客户端


sudo mongod # 首先启动数据库服务,再执行Scrapy
sudo mongo # 启动数据库shell

在mongo shell下使用命令:

# 查看当前数据库
> db

# 列出所有的数据库
> show dbs

# 连接DouBan数据库
> use DouBan

# 列出所有表
> show collections

# 查看表里的数据
> db.DouBanMoives.find()

(案例六)三种 scrapy 模拟登陆策略

为什么需要模拟登录?
  • 获取 cookies,能够爬取登录后的页面。

requests  是如何实现模拟登录的?
  • 直接携带 cookies 去请求页面
  • 找到相应的接口,然后发送 POST 请求
selenium 是如何模拟登录的?
  • 找到对应的 input 标签,输入文字,点击登录。



注意:模拟登陆时,必须保证settings.py里的 COOKIES_ENABLED(Cookies中间件) 处于开启状态

COOKIES_ENABLED = True 或 # COOKIES_ENABLED = False

策略一:直接POST数据(比如需要登陆的账户信息)

只要是需要提供post数据的,就可以用这种方法。下面示例里post的数据是账户密码:

# -*- coding: utf-8 -*-
import scrapy


class Renren1Spider(scrapy.Spider):
    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest 是Scrapy发送POST请求的方法
        yield scrapy.FormRequest(
                url = url,
                formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},
                callback = self.parse_page)

    def parse_page(self, response):
        with open("mao2.html", "w") as filename:
            filename.write(response.body)

策略二:标准的模拟登陆步骤

正统模拟登录方法:

  1. 首先发送登录页面的get请求,获取到页面里的登录必须的参数(比如说zhihu登陆界面的 _xsrf)

  2. 然后和账户密码一起post到服务器,登录成功

# -*- coding: utf-8 -*-
import scrapy

class Renren2Spider(scrapy.Spider):
    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    # 处理start_urls里的登录url的响应内容,提取登陆需要的参数(如果需要的话)
    def parse(self, response):
        # 提取登陆需要的参数
        #_xsrf = response.xpath("//_xsrf").extract()[0]

        # 发送请求参数,并调用指定回调函数处理
        yield scrapy.FormRequest.from_response(
                response,
                formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},#, "_xsrf" = _xsrf},
                callback = self.parse_page
            )

    # 获取登录成功状态,访问需要登录后才能访问的页面
    def parse_page(self, response):
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback = self.parse_newpage)

    # 处理响应内容
    def parse_newpage(self, response):
        with open("xiao.html", "w") as filename:
            filename.write(response.body)

策略三:直接使用保存登陆状态的 Cookie 模拟登陆

如果实在没办法了,可以用这种方法模拟登录,虽然麻烦一点,但是成功率100%

# -*- coding: utf-8 -*-
import scrapy

class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    cookies = {
    "anonymid" : "ixrna3fysufnwv",
    "_r01_" : "1",
    "ap" : "327550029",
    "JSESSIONID" : "abciwg61A_RvtaRS3GjOv",
    "depovince" : "GW",
    "springskin" : "set",
    "jebe_key" : "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
    "t" : "691808127750a83d33704a565d8340ae9",
    "societyguester" : "691808127750a83d33704a565d8340ae9",
    "id" : "327550029",
    "xnsid" : "f42b25cf",
    "loginfrom" : "syshome"
    }

    # 可以重写Spider类的start_requests方法,附带Cookie值,发送POST请求
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, cookies = self.cookies, callback = self.parse_page)

    # 处理响应内容
    def parse_page(self, response):
        print "===========" + response.url
        with open("deng.html", "w") as filename:
            filename.write(response.body)