1234567891011121314151617181920212223242526 |
- # -*- coding: utf-8 -*-
- import scrapy
- from youni_spiders.items import BudejieItem
class BudejieSpider(scrapy.Spider):
    """Spider that walks budejie.com list pages and emits one item per post.

    Each ``<li>`` under ``div.j-r-list > ul`` is turned into a
    ``BudejieItem``; the "next page" link, when present, is followed with
    this same callback.
    """

    name = 'budejie'
    allowed_domains = ['budejie.com']
    start_urls = ['http://budejie.com/']
    # Per-spider settings: register BuDeJiePipeline (order value 1) as the
    # item pipeline for this spider.
    custom_settings = {
        'ITEM_PIPELINES': {'youni_spiders.pipelines.BuDeJiePipeline': 1}
    }

    def parse(self, response):
        """Yield a ``BudejieItem`` per list entry, then the next-page request.

        Item fields:
            id: ``data-id`` attribute of the entry's tool bar (``None`` if
                the element is missing).
            username: text of the user-name link (``None`` if missing).
            content: all description link texts joined with newlines.
            img: list of ``data-original`` attribute values of the entry's
                images (empty list if there are none).
        """
        for entry in response.css('div.j-r-list >ul >li'):
            text_parts = entry.css('div.j-r-list-c-desc a::text').extract()
            image_urls = entry.css(
                'div.j-r-list-c-img >a >img::attr(data-original)'
            ).extract()
            yield BudejieItem(
                id=entry.css('div.j-r-list-tool::attr(data-id)').extract_first(),
                username=entry.css('a.u-user-name::text').extract_first(),
                content='\n'.join(text_parts),
                img=image_urls,
            )

        next_href = response.css('a.pagenxt::attr(href)').extract_first()
        if next_href is None:
            # No pagination link on this page: nothing more to schedule.
            return
        # The href may be relative, so resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
|