xielin 5 years ago
commit
38aed20c6b

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 7 - 0
.idea/misc.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/youni_spiders.iml" filepath="$PROJECT_DIR$/.idea/youni_spiders.iml" />
+    </modules>
+  </component>
+</project>

+ 59 - 0
.idea/workspace.xml

@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="f980d344-a8e5-444b-b699-bc3ae98c67d4" name="Default Changelist" comment="" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="ProjectId" id="1PjFJbuVMVAUcNG8Q0LIqn1rCbM" />
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/youni_spiders" />
+  </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/youni_spiders" />
+      <recent name="$PROJECT_DIR$/youni_spiders/spiders" />
+    </key>
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="f980d344-a8e5-444b-b699-bc3ae98c67d4" name="Default Changelist" comment="" />
+      <created>1566379527770</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1566379527770</updated>
+      <workItem from="1566379531265" duration="2906000" />
+      <workItem from="1566545104251" duration="20656000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+</project>

+ 12 - 0
.idea/youni_spiders.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="projectConfiguration" value="Twisted Trial" />
+    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
+  </component>
+</module>

File diff suppressed because it is too large
+ 1002 - 0
aa.json


+ 11 - 0
main.py

@@ -0,0 +1,11 @@
+# -*- coding:utf-8 -*-
+
+import sys
+import os
+import time
+
+while True:
+    os.system("scrapy crawl budejie")
+    os.system("scrapy crawl vice")
+    print("程序开始休眠...")
+    # time.sleep(3600)  # 休眠 1 小时后继续爬取
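
As an alternative to shelling out with os.system, both spiders could also be run in one process through Scrapy's CrawlerProcess API. A minimal sketch, assuming it is run from the project root so scrapy.cfg is found (run_once.py is a hypothetical helper, not part of this commit; it does one crawl pass per process, since the Twisted reactor cannot be restarted):

# run_once.py -- hypothetical helper, not part of this commit
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_once():
    # get_project_settings() loads youni_spiders.settings via scrapy.cfg
    process = CrawlerProcess(get_project_settings())
    process.crawl('budejie')  # spider name defined in budejie.py
    process.crawl('vice')     # spider name defined in vice.py
    process.start()           # blocks until both spiders finish

if __name__ == '__main__':
    run_once()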

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = youni_spiders.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = youni_spiders

File diff suppressed because it is too large
+ 207 - 0
v.json


+ 0 - 0
youni_spiders/__init__.py


BIN
youni_spiders/__pycache__/__init__.cpython-37.pyc


BIN
youni_spiders/__pycache__/items.cpython-37.pyc


BIN
youni_spiders/__pycache__/pipelines.cpython-37.pyc


BIN
youni_spiders/__pycache__/settings.cpython-37.pyc


+ 25 - 0
youni_spiders/items.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class YouniSpidersItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+class BudejieItem(scrapy.Item):
+    id = scrapy.Field()
+    username = scrapy.Field()
+    content = scrapy.Field()
+    img = scrapy.Field()
+
+class ViceItem(scrapy.Item):
+    id = scrapy.Field()
+    content = scrapy.Field()
+    img = scrapy.Field()

+ 103 - 0
youni_spiders/middlewares.py

@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class YouniSpidersSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class YouniSpidersDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 122 - 0
youni_spiders/pipelines.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from .settings import MYSQL_HOST
+from .settings import MYSQL_DBNAME
+from .settings import MYSQL_USER
+from .settings import MYSQL_PASSWORD
+import pymysql.cursors
+import logging
+import time
+
+
+class YouniSpidersPipeline(object):
+    def process_item(self, item, spider):
+        return item
+
+
+class BuDeJiePipeline(object):
+    def __init__(self):
+        # Connect to the database
+        self.connect = pymysql.connect(
+            host=MYSQL_HOST,  # database host
+            port=3306,  # database port
+            db=MYSQL_DBNAME,  # database name
+            user=MYSQL_USER,  # database user
+            passwd=MYSQL_PASSWORD,  # database password
+            charset='utf8',  # character set
+            use_unicode=True)
+        # Run inserts/updates/queries through this cursor
+        self.cursor = self.connect.cursor()
+
+    def process_item(self, item, spider):
+        main_img = ''
+        imgs = []
+        if isinstance(item['img'], str):
+            logging.debug('main_img is str type')
+            main_img = item['img']
+            imgs = [item['img']]
+        elif isinstance(item['img'], list):
+            logging.debug('main_img is list type')
+            main_img = item['img'][0]
+            imgs = item['img']
+        logging.debug(type(item['img']))
+        logging.debug('main_img=' + main_img)
+        select_sql = "SELECT * FROM post_store WHERE source = %s and  source_id=%s"
+        logging.debug("select=" + select_sql)
+        insert_sql = "insert into post_store(source, source_id, type, img ,video, title,content,created_at,updated_at) value (%s, %s, %s, %s, %s, %s,%s,%s,%s)"
+        imgs_sql = "insert into post_store_imgs(post_store_id,img,created_at,updated_at) value (%s,%s,%s,%s)"
+        logging.debug("insert=" + insert_sql)
+        try:
+            self.cursor.execute(select_sql, ('budejie', item['id']))
+            logging.debug(select_sql)
+            row = self.cursor.fetchone()
+            logging.debug(row)
+            if row is None:
+                self.cursor.execute(insert_sql, (
+                    'budejie', item['id'], 'image', main_img, '', '', item['content'],
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                lastrowid = int(self.cursor.lastrowid)
+                if lastrowid:
+                    imgs_data = []
+                    for i in imgs:
+                        imgs_data.append((lastrowid, i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                    self.cursor.executemany(imgs_sql, imgs_data)
+                # Commit the SQL statements
+                self.connect.commit()
+        except Exception as e:
+            logging.error('exception=' + repr(e))
+            self.connect.rollback()
+        return item  # must return the item
+
+
+class VicePipeline(object):
+    def __init__(self):
+        # Connect to the database
+        self.connect = pymysql.connect(
+            host=MYSQL_HOST,  # database host
+            port=3306,  # database port
+            db=MYSQL_DBNAME,  # database name
+            user=MYSQL_USER,  # database user
+            passwd=MYSQL_PASSWORD,  # database password
+            charset='utf8',  # character set
+            use_unicode=True)
+        # Run inserts/updates/queries through this cursor
+        self.cursor = self.connect.cursor()
+
+    def process_item(self, item, spider):
+        logging.debug("pipeline")
+        main_img = ''
+        imgs = []
+        if isinstance(item['img'], str):
+            main_img = item['img']
+            imgs = [item['img']]
+        elif isinstance(item['img'], list):
+            main_img = item['img'][0]
+            imgs = item['img']
+        select_sql = "SELECT * FROM post_store WHERE source = %s and  source_id=%s"
+        insert_sql = "insert into post_store(source, source_id, type, img ,video, title,content,created_at,updated_at) value (%s, %s, %s, %s, %s, %s,%s,%s,%s)"
+        imgs_sql = "insert into post_store_imgs(post_store_id,img,created_at,updated_at) value (%s,%s,%s,%s)"
+        try:
+            self.cursor.execute(select_sql, ('vice', item['id']))
+            row = self.cursor.fetchone()
+            if row is None:
+                self.cursor.execute(insert_sql, (
+                    'vice', item['id'], 'image', main_img, '', '', item['content'],
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                lastrowid = int(self.cursor.lastrowid)
+                if lastrowid:
+                    imgs_data = []
+                    for i in imgs:
+                        imgs_data.append((lastrowid, i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                    self.cursor.executemany(imgs_sql, imgs_data)
+            # Commit the SQL statements
+            self.connect.commit()
+        except Exception as e:
+            logging.error('exception=' + repr(e))
+            self.connect.rollback()
+        return item  # must return the item
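
Both pipelines insert into post_store and post_store_imgs, but the table definitions are not part of this commit. A rough sketch of tables that would satisfy the INSERT statements above, with column types that are assumptions rather than confirmed by the source (post_store needs an auto-increment primary key because the pipelines rely on cursor.lastrowid):

# create_tables.py -- hypothetical setup script; column types are guesses
# inferred from the INSERT statements in pipelines.py.
import pymysql
from youni_spiders.settings import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD

DDL = [
    """CREATE TABLE IF NOT EXISTS post_store (
        id INT AUTO_INCREMENT PRIMARY KEY,
        source VARCHAR(32),
        source_id VARCHAR(64),
        type VARCHAR(16),
        img TEXT,
        video TEXT,
        title TEXT,
        content TEXT,
        created_at DATETIME,
        updated_at DATETIME
    )""",
    """CREATE TABLE IF NOT EXISTS post_store_imgs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        post_store_id INT,
        img TEXT,
        created_at DATETIME,
        updated_at DATETIME
    )""",
]

conn = pymysql.connect(host=MYSQL_HOST, port=3306, db=MYSQL_DBNAME,
                       user=MYSQL_USER, passwd=MYSQL_PASSWORD, charset='utf8')
with conn.cursor() as cursor:
    for statement in DDL:
        cursor.execute(statement)
conn.commit()
conn.close()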

+ 95 - 0
youni_spiders/settings.py

@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for youni_spiders project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'youni_spiders'
+
+SPIDER_MODULES = ['youni_spiders.spiders']
+NEWSPIDER_MODULE = 'youni_spiders.spiders'
+
+MYSQL_HOST = "47.92.174.125"
+MYSQL_DBNAME = "platform-community"
+MYSQL_USER = "caihongxingqiu"
+MYSQL_PASSWORD = "Dzj6rBKIWg1WIGfc"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'youni_spiders (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'youni_spiders.middlewares.YouniSpidersSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'youni_spiders.middlewares.YouniSpidersDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'youni_spiders.pipelines.BuDeJiePipeline': 1,
+   'youni_spiders.pipelines.VicePipeline': 1,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
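
The MySQL credentials above are committed in plain text. One hedged alternative is to read them from environment variables and keep only harmless fallbacks in the file; the YOUNI_* variable names below are assumptions, not something this commit defines:

# Sketch: environment-based credentials for settings.py
import os

MYSQL_HOST = os.environ.get('YOUNI_MYSQL_HOST', 'localhost')
MYSQL_DBNAME = os.environ.get('YOUNI_MYSQL_DBNAME', 'platform-community')
MYSQL_USER = os.environ.get('YOUNI_MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('YOUNI_MYSQL_PASSWORD', '')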

+ 4 - 0
youni_spiders/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

BIN
youni_spiders/spiders/__pycache__/__init__.cpython-37.pyc


BIN
youni_spiders/spiders/__pycache__/budejie.cpython-37.pyc


BIN
youni_spiders/spiders/__pycache__/vice.cpython-37.pyc


+ 25 - 0
youni_spiders/spiders/budejie.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from youni_spiders.items import BudejieItem
+
+
+class BudejieSpider(scrapy.Spider):
+    name = 'budejie'
+    allowed_domains = ['budejie.com']
+    start_urls = ['http://budejie.com/']
+    custom_settings = {
+        'ITEM_PIPELINES': {'youni_spiders.pipelines.BuDeJiePipeline': 1}
+    }
+
+    def parse(self, response):
+        lis = response.css('div.j-r-list > ul > li')
+        for li in lis:
+            id = li.css('div.j-r-list-tool::attr(data-id)').extract_first()
+            username = li.css('a.u-user-name::text').extract_first()
+            content = '\n'.join(li.css('div.j-r-list-c-desc a::text').extract())
+            img = li.css('div.j-r-list-c-img >a >img::attr(data-original)').extract()
+            yield BudejieItem(id=id, username=username, content=content, img=img)
+        next_page = response.css('a.pagenxt::attr(href)').extract_first()
+        if next_page is not None:
+            next_page = response.urljoin(next_page)
+            yield scrapy.Request(next_page, callback=self.parse)
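
The parse() method can be exercised offline against a handcrafted response. The sketch below feeds it a minimal HTML fragment built from the CSS classes the selectors expect; the fragment is an assumption about budejie.com's markup, not a copy of the real page:

# test_budejie_parse.py -- hypothetical offline check of the selectors
from scrapy.http import HtmlResponse
from youni_spiders.spiders.budejie import BudejieSpider

HTML = b"""
<div class="j-r-list"><ul><li>
  <div class="j-r-list-tool" data-id="42"></div>
  <a class="u-user-name">alice</a>
  <div class="j-r-list-c-desc"><a>hello world</a></div>
  <div class="j-r-list-c-img"><a><img data-original="http://example.com/a.jpg"></a></div>
</li></ul></div>
"""

response = HtmlResponse(url='http://budejie.com/', body=HTML, encoding='utf-8')
spider = BudejieSpider()
for result in spider.parse(response):
    print(dict(result))  # expect id='42', username='alice', one image URL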

+ 36 - 0
youni_spiders/spiders/vice.py

@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+import re
+from scrapy import Request
+from youni_spiders.items import ViceItem
+
+
+class ViceSpider(scrapy.Spider):
+    name = 'vice'
+    allowed_domains = ['vice.cn']
+    custom_settings = {
+        'ITEM_PIPELINES': {'youni_spiders.pipelines.VicePipeline': 1,}
+    }
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
+    }
+
+    def start_requests(self):
+        url = 'http://www.vice.cn/ajax/dndmore?ids_not_in='
+        yield Request(url, headers=self.headers)
+
+    def parse(self, response):
+        datas = json.loads(response.body)
+        if datas:
+            ids = ''
+            for data in datas['items']:
+                item = ViceItem()
+                item['id'] = data['id']
+                item['content'] = data['preview']
+                item['img'] = data['portrait']
+                ids += str(data['id']) + ','
+                yield item
+
+            next_url = response.url + ids
+            yield Request(next_url, headers=self.headers)
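
The vice spider assumes the ajax endpoint returns JSON with an "items" list whose entries carry "id", "preview" and "portrait" fields, and it pages by appending the ids it has already seen to ids_not_in=. A small self-contained illustration of that assumed payload shape (the sample values are made up, not taken from vice.cn):

# Illustration only: mimics the shape parse() expects.
import json

sample = json.dumps({
    "items": [
        {"id": 101, "preview": "first post", "portrait": "http://example.com/1.jpg"},
        {"id": 102, "preview": "second post", "portrait": "http://example.com/2.jpg"},
    ]
})

datas = json.loads(sample)
ids = ''
for data in datas['items']:
    ids += str(data['id']) + ','

base = 'http://www.vice.cn/ajax/dndmore?ids_not_in='
print(base + ids)  # -> ...ids_not_in=101,102,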