vice.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. import json
  4. import re
  5. from scrapy import Request
  6. from youni_spiders.items import ViceItem
  7. class ViceSpider(scrapy.Spider):
  8. name = 'vice'
  9. allowed_domains = ['vice.cn']
  10. custom_settings = {
  11. 'ITEM_PIPELINES': {'youni_spiders.pipelines.VicePipeline': 1,}
  12. }
  13. headers = {
  14. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
  15. }
  16. def start_requests(self):
  17. url = 'http://www.vice.cn/ajax/dndmore?ids_not_in='
  18. yield Request(url, headers=self.headers)
  19. def parse(self, response):
  20. datas = json.loads(response.body)
  21. item = ViceItem()
  22. if datas:
  23. ids = ''
  24. for data in datas['items']:
  25. item['id'] = data['id']
  26. item['content'] = data['preview']
  27. item['img'] = data['portrait']
  28. ids += data['id']+','
  29. yield item
  30. next_url = response.url+ids
  31. yield Request(next_url,headers=self.headers)