问题:在运行scrapy的过程中,如果想主动退出该怎么做?
背景:比如说我只要爬取当日的新闻,那么在遍历的时候,如果出现了超过1条不是当日的新闻,那么就不爬取了,就主动退出爬虫,这个时候该怎么做呢?
代码如下:
import scrapy
from torrentSpider.items.NavigationItem import NavigationItem
from torrentSpider.items.TorrentItem import TorrentItem
import time
import random
import logging
import os
class XxxSpider(scrapy.Spider):
    """Crawl a movie/torrent index site, stopping early once too many
    items that were not updated today have been seen.

    Early-stop strategy: ``parse_links`` compares each item's update
    date with today's date and counts mismatches; once the count exceeds
    1 it asks the crawler engine to close the spider.  Note that requests
    already queued (up to CONCURRENT_REQUESTS) may still complete after
    close is requested — see the discussion following this listing.
    """

    name = "xxx_spider"
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/1.html']
    # Site prefix used to turn relative hrefs into absolute URLs.
    web_pre_url = 'http://xxx.com'
    # Number of items seen whose update date is NOT today.
    count = 0

    def parse(self, response):
        """Parse a listing page: collect navigation data, schedule one
        request per movie row, then follow the "next page" link.

        :param response: scrapy Response for a listing page.
        :return: generator of scrapy.Request objects.
        """
        # Random per-request delay to look less bot-like.
        # NOTE(review): time.sleep blocks Scrapy's event loop; prefer
        # DOWNLOAD_DELAY + RANDOMIZE_DOWNLOAD_DELAY in settings.py.
        time.sleep(random.randint(0, 5))
        # Top navigation bar entries (index 0 skipped — presumably a
        # "home" entry; XPath li[] indices are 1-based, hence n_k+1).
        navigation_type_number = response.xpath('//*[@id="hypoNav"]/div/ul/li/em/a/text()').extract()
        for n_k in range(1, len(navigation_type_number)):
            # NOTE(review): these NavigationItems are populated but never
            # yielded, so no pipeline ever receives them — confirm intent.
            navigation_item = NavigationItem()
            # Site title (logo alt text).
            navigation_item['navigation_title'] = response.xpath('//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Navigation category name.
            navigation_item['navigation_type'] = response.xpath('//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a/text()').extract()[0]
            # Navigation link.
            navigation_item['navigation_url'] = response.xpath('//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a/@href').extract()[0]
        # Sub-navigation bar entries (also built but never yielded).
        sub_navigation_type_number = response.xpath('//*[@id="nodeNav"]/div/ul/li/em/a/span/text()').extract()
        for sub_k in range(1, len(sub_navigation_type_number)):
            sub_navigation_item = NavigationItem()
            # Site title (logo alt text).
            sub_navigation_item['navigation_title'] = response.xpath('//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Sub-navigation category name.
            sub_navigation_item['sub_navigation_type'] = response.xpath('//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a/span/text()').extract()[0]
            # Sub-navigation link.
            sub_navigation_item['sub_navigation_url'] = response.xpath('//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a/@href').extract()[0]
        # Movie rows on this page; schedule a detail-page request each.
        movie_name_tr_array = response.xpath('/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr').extract()
        for i_k in range(1, len(movie_name_tr_array)):
            # Relative detail-page link for row i_k.
            str_sub_url = '/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr[' + str(i_k) + ']/td[1]/a/@href'
            m_link = self.web_pre_url + response.xpath(str_sub_url).extract()[0]
            yield scrapy.Request(url=m_link, callback=self.parse_links, dont_filter=True)
        # Next-page link: when both "prev" and "next" anchors exist the
        # second one is "next"; with a single anchor it is "next".
        next_link = response.xpath('//*[@class="pagegbk"]/@href').extract()
        if next_link:
            if len(next_link) == 1:
                next_link = next_link[0]
            else:
                next_link = next_link[1]
            yield scrapy.Request(self.web_pre_url + next_link, callback=self.parse)

    def parse_links(self, response):
        """Parse one detail page into a TorrentItem.

        Yields the item only when its update date equals today's date;
        otherwise increments ``self.count`` and, once more than one stale
        item has been seen, asks the engine to close the spider.

        :param response: scrapy Response for a detail page.
        """
        torrent_item = TorrentItem()
        # Title
        torrent_item['torrent_title'] = self.check_xpath_value(response, '/html/body/div[2]/table[1]/tbody/tr/td/font/text()')
        # Movie name
        torrent_item['torrent_name'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[1]/text()')
        # Director
        torrent_item['torrent_director'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[2]/text()')
        # Actors
        torrent_item['torrent_actor'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/span/font[2]/text()')
        # Language
        torrent_item['torrent_language'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[3]/text()')
        # Genre
        torrent_item['torrent_type'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[4]/text()')
        # Region
        torrent_item['torrent_region'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[5]/text()')
        # Update time (compared against today's date below).
        torrent_item['torrent_update_time'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[6]/text()')
        # Status
        torrent_item['torrent_status'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[7]/text()')
        # Release date
        torrent_item['torrent_show_time'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[1]/font[8]/text()')
        # Synopsis
        torrent_item['torrent_introduction'] = self.check_xpath_value(response, '/html/body/div[2]/table[2]/tbody/tr/td/div[2]/text()')
        # Download URL
        torrent_item['torrent_url'] = self.check_xpath_value(response, '//*[@id="plist"]/table[2]/tbody/tr[2]/td/ul/li/input/@value')
        # Today's date formatted to match the site's update-time field.
        current_date = time.strftime('%Y-%m-%d', time.localtime())
        print('current_date = %s' % str(current_date))
        print('torrent_update_time = %s' % torrent_item['torrent_update_time'])
        # Yield only items updated today; count everything else.
        if torrent_item['torrent_update_time'] == str(current_date):
            yield torrent_item
        else:
            self.count += 1
            # Actual threshold is 1 stale item (the historical log message
            # below mentions "10"; kept verbatim, it does not affect logic).
            if self.count > 1:
                # Ask the engine to close the spider.  This is NOT
                # immediate: already-queued requests still run.  The
                # documented alternative is raising
                # scrapy.exceptions.CloseSpider(reason).
                self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

    @staticmethod
    def check_xpath_value(response, xpath_url):
        """Return the first match for *xpath_url*, or "null" when the
        XPath matches nothing or the first match is blank/whitespace.

        :param response: scrapy Response to query.
        :param xpath_url: XPath expression string.
        :return: first extracted value (unstripped) or the string "null".
        """
        xpath_value = response.xpath(xpath_url).extract()
        if xpath_value and xpath_value[0].strip() != '':
            return xpath_value[0]
        return "null"
注意以上代码中的这一行关键代码(parse_links 末尾的停止逻辑):
self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')
1,此行代码是写在spider文件中的
2,虽然这一行代码会停止爬虫,但它并不是立即生效的
原因是因为当我们不更改爬虫的setting.py文件的时候,默认配置是:
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
含义就是:Scrapy downloader 并发请求(concurrent requests)的最大值,默认: 16
那么这个时候的问题来了,按照以上的写法,在队列里就已经有十几个请求了,你停止之后,这十几个请求依旧会执行下去,所以并不是立即停止,如果想改变的话,就必须改变此项配置,设为:
CONCURRENT_REQUESTS = 1
seo优化_前端开发_渗透技术



