budejie.py 1.0 KB

1234567891011121314151617181920212223242526
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from youni_spiders.items import BudejieItem
  4. class BudejieSpider(scrapy.Spider):
  5. name = 'budejie'
  6. allowed_domains = ['budejie.com']
  7. start_urls = ['http://budejie.com/']
  8. custom_settings = {
  9. 'ITEM_PIPELINES': {'youni_spiders.pipelines.BuDeJiePipeline': 1}
  10. }
  11. def parse(self, response):
  12. lies = response.css('div.j-r-list >ul >li')
  13. for li in lies:
  14. id = li.css('div.j-r-list-tool::attr(data-id)').extract_first()
  15. username = li.css('a.u-user-name::text').extract_first()
  16. content = '\n'.join(li.css('div.j-r-list-c-desc a::text').extract())
  17. img = li.css('div.j-r-list-c-img >a >img::attr(data-original)').extract()
  18. yield BudejieItem(id=id, username=username, content=content, img=img)
  19. next_page = response.css('a.pagenxt::attr(href)').extract_first()
  20. if next_page is not None:
  21. next_page = response.urljoin(next_page)
  22. yield scrapy.Request(next_page, callback=self.parse)