# -*- coding: utf-8 -*-
import scrapy

from youni_spiders.items import BudejieItem


class BudejieSpider(scrapy.Spider):
    """Crawl budejie.com list pages and emit one ``BudejieItem`` per post.

    Crawling starts at ``start_urls`` and continues through the
    ``a.pagenxt`` link for as long as a page exposes one.
    """

    name = 'budejie'
    allowed_domains = ['budejie.com']
    start_urls = ['http://budejie.com/']
    # Overrides ITEM_PIPELINES for this spider so that its items are
    # handled by BuDeJiePipeline.
    custom_settings = {
        'ITEM_PIPELINES': {'youni_spiders.pipelines.BuDeJiePipeline': 1}
    }

    def parse(self, response):
        """Extract every post on a list page, then follow the next page.

        Args:
            response: The Scrapy response for a list page.

        Yields:
            A ``BudejieItem`` for each ``<li>`` under ``div.j-r-list > ul``
            with these fields:

            * ``id`` -- the ``data-id`` attribute of the post's tool bar,
              or ``None`` when it is absent.
            * ``username`` -- text of the ``a.u-user-name`` link, or
              ``None`` when it is absent.
            * ``content`` -- text of all links inside the description
              block, joined with newlines (empty string if there are none).
            * ``img`` -- list of ``data-original`` attribute values of the
              post's images (possibly empty).

            After the items, a ``scrapy.Request`` for the next page when
            the page contains an ``a.pagenxt`` link.
        """
        for post in response.css('div.j-r-list >ul >li'):
            # Named post_id rather than id so the builtin id() is not
            # shadowed; the item field itself is still called "id".
            post_id = post.css(
                'div.j-r-list-tool::attr(data-id)').extract_first()
            username = post.css('a.u-user-name::text').extract_first()
            content = '\n'.join(
                post.css('div.j-r-list-c-desc a::text').extract())
            # The URL is read from data-original rather than src
            # (presumably because the page lazy-loads images -- confirm
            # against the live markup).
            img = post.css(
                'div.j-r-list-c-img >a >img::attr(data-original)').extract()
            yield BudejieItem(
                id=post_id, username=username, content=content, img=img)

        # Pagination is handled once per response, after all posts have
        # been emitted.
        next_page = response.css('a.pagenxt::attr(href)').extract_first()
        if next_page is not None:
            # The href may be relative; resolve it against the current URL.
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)