xielin 5 years ago
commit
38aed20c6b

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 7 - 0
.idea/misc.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/youni_spiders.iml" filepath="$PROJECT_DIR$/.idea/youni_spiders.iml" />
+    </modules>
+  </component>
+</project>

+ 59 - 0
.idea/workspace.xml

@@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="f980d344-a8e5-444b-b699-bc3ae98c67d4" name="Default Changelist" comment="" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="ProjectId" id="1PjFJbuVMVAUcNG8Q0LIqn1rCbM" />
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/youni_spiders" />
+  </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/youni_spiders" />
+      <recent name="$PROJECT_DIR$/youni_spiders/spiders" />
+    </key>
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="f980d344-a8e5-444b-b699-bc3ae98c67d4" name="Default Changelist" comment="" />
+      <created>1566379527770</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1566379527770</updated>
+      <workItem from="1566379531265" duration="2906000" />
+      <workItem from="1566545104251" duration="20656000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+</project>

+ 12 - 0
.idea/youni_spiders.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="projectConfiguration" value="Twisted Trial" />
+    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
+  </component>
+</module>

File diff suppressed because it is too large
+ 1002 - 0
aa.json


+ 11 - 0
main.py

@@ -0,0 +1,11 @@
+# -*- coding:utf-8 -*-
+
+import sys
+import os
+import time
+
+while True:
+    os.system("scrapy crawl budejie")
+    os.system("scrapy crawl vice")
+    print("程序开始休眠...")
+    # time.sleep(3600)  # 休眠 1 小时后继续爬取
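
As an alternative to shelling out with os.system, both spiders could also be run in one process through Scrapy's CrawlerProcess API. A minimal sketch, assuming it is run from the project root so scrapy.cfg is found (run_once.py is a hypothetical helper, not part of this commit; it does one crawl pass per process, since the Twisted reactor cannot be restarted):

# run_once.py -- hypothetical helper, not part of this commit
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_once():
    # get_project_settings() loads youni_spiders.settings via scrapy.cfg
    process = CrawlerProcess(get_project_settings())
    process.crawl('budejie')  # spider name defined in budejie.py
    process.crawl('vice')     # spider name defined in vice.py
    process.start()           # blocks until both spiders finish

if __name__ == '__main__':
    run_once()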

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = youni_spiders.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = youni_spiders

File diff suppressed because it is too large
+ 207 - 0
v.json


+ 0 - 0
youni_spiders/__init__.py


BIN
youni_spiders/__pycache__/__init__.cpython-37.pyc


BIN
youni_spiders/__pycache__/items.cpython-37.pyc


BIN
youni_spiders/__pycache__/pipelines.cpython-37.pyc


BIN
youni_spiders/__pycache__/settings.cpython-37.pyc


+ 25 - 0
youni_spiders/items.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class YouniSpidersItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+class BudejieItem(scrapy.Item):
+    id = scrapy.Field()
+    username = scrapy.Field()
+    content = scrapy.Field()
+    img = scrapy.Field()
+
+class ViceItem(scrapy.Item):
+    id = scrapy.Field()
+    content = scrapy.Field()
+    img = scrapy.Field()

+ 103 - 0
youni_spiders/middlewares.py

@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class YouniSpidersSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class YouniSpidersDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)

+ 122 - 0
youni_spiders/pipelines.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from .settings import MYSQL_HOST
+from .settings import MYSQL_DBNAME
+from .settings import MYSQL_USER
+from .settings import MYSQL_PASSWORD
+import pymysql.cursors
+import logging
+import time
+
+
+class YouniSpidersPipeline(object):
+    def process_item(self, item, spider):
+        return item
+
+
+class BuDeJiePipeline(object):
+    def __init__(self):
+        # Connect to the database
+        self.connect = pymysql.connect(
+            host=MYSQL_HOST,  # database host
+            port=3306,  # database port
+            db=MYSQL_DBNAME,  # database name
+            user=MYSQL_USER,  # database user
+            passwd=MYSQL_PASSWORD,  # database password
+            charset='utf8',  # character set
+            use_unicode=True)
+        # Run inserts/updates/queries through this cursor
+        self.cursor = self.connect.cursor()
+
+    def process_item(self, item, spider):
+        main_img = ''
+        imgs = []
+        if isinstance(item['img'], str):
+            logging.debug('main_img is str type')
+            main_img = item['img']
+            imgs = [item['img']]
+        elif isinstance(item['img'], list):
+            logging.debug('main_img is list type')
+            main_img = item['img'][0]
+            imgs = item['img']
+        logging.debug(type(item['img']))
+        logging.debug('main_img=' + main_img)
+        select_sql = "SELECT * FROM post_store WHERE source = %s and  source_id=%s"
+        logging.debug("select=" + select_sql)
+        insert_sql = "insert into post_store(source, source_id, type, img ,video, title,content,created_at,updated_at) value (%s, %s, %s, %s, %s, %s,%s,%s,%s)"
+        imgs_sql = "insert into post_store_imgs(post_store_id,img,created_at,updated_at) value (%s,%s,%s,%s)"
+        logging.debug("insert=" + insert_sql)
+        try:
+            self.cursor.execute(select_sql, ('budejie', item['id']))
+            logging.debug(select_sql)
+            row = self.cursor.fetchone()
+            logging.debug(row)
+            if row is None:
+                self.cursor.execute(insert_sql, (
+                    'budejie', item['id'], 'image', main_img, '', '', item['content'],
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                lastrowid = int(self.cursor.lastrowid)
+                if lastrowid:
+                    imgs_data = []
+                    for i in imgs:
+                        imgs_data.append((lastrowid, i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                    self.cursor.executemany(imgs_sql, imgs_data)
+                # Commit the SQL statements
+                self.connect.commit()
+        except Exception as e:
+            logging.error('exception=' + repr(e))
+            self.connect.rollback()
+        return item  # must return the item
+
+
+class VicePipeline(object):
+    def __init__(self):
+        # Connect to the database
+        self.connect = pymysql.connect(
+            host=MYSQL_HOST,  # database host
+            port=3306,  # database port
+            db=MYSQL_DBNAME,  # database name
+            user=MYSQL_USER,  # database user
+            passwd=MYSQL_PASSWORD,  # database password
+            charset='utf8',  # character set
+            use_unicode=True)
+        # Run inserts/updates/queries through this cursor
+        self.cursor = self.connect.cursor()
+
+    def process_item(self, item, spider):
+        logging.debug("pipeline")
+        main_img = ''
+        imgs = []
+        if isinstance(item['img'], str):
+            main_img = item['img']
+            imgs = [item['img']]
+        elif isinstance(item['img'], list):
+            main_img = item['img'][0]
+            imgs = item['img']
+        select_sql = "SELECT * FROM post_store WHERE source = %s and  source_id=%s"
+        insert_sql = "insert into post_store(source, source_id, type, img ,video, title,content,created_at,updated_at) value (%s, %s, %s, %s, %s, %s,%s,%s,%s)"
+        imgs_sql = "insert into post_store_imgs(post_store_id,img,created_at,updated_at) value (%s,%s,%s,%s)"
+        try:
+            self.cursor.execute(select_sql, ('vice', item['id']))
+            row = self.cursor.fetchone()
+            if row is None:
+                self.cursor.execute(insert_sql, (
+                    'vice', item['id'], 'image', main_img, '', '', item['content'],
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                lastrowid = int(self.cursor.lastrowid)
+                if lastrowid:
+                    imgs_data = []
+                    for i in imgs:
+                        imgs_data.append((lastrowid, i, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                                          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
+                    self.cursor.executemany(imgs_sql, imgs_data)
+            # Commit the SQL statements
+            self.connect.commit()
+        except Exception as e:
+            logging.error('exception=' + repr(e))
+            self.connect.rollback()
+        return item  # must return the item
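
Both pipelines insert into post_store and post_store_imgs, but the table definitions are not part of this commit. A rough sketch of tables that would satisfy the INSERT statements above, with column types that are assumptions rather than confirmed by the source (post_store needs an auto-increment primary key because the pipelines rely on cursor.lastrowid):

# create_tables.py -- hypothetical setup script; column types are guesses
# inferred from the INSERT statements in pipelines.py.
import pymysql
from youni_spiders.settings import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD

DDL = [
    """CREATE TABLE IF NOT EXISTS post_store (
        id INT AUTO_INCREMENT PRIMARY KEY,
        source VARCHAR(32),
        source_id VARCHAR(64),
        type VARCHAR(16),
        img TEXT,
        video TEXT,
        title TEXT,
        content TEXT,
        created_at DATETIME,
        updated_at DATETIME
    )""",
    """CREATE TABLE IF NOT EXISTS post_store_imgs (
        id INT AUTO_INCREMENT PRIMARY KEY,
        post_store_id INT,
        img TEXT,
        created_at DATETIME,
        updated_at DATETIME
    )""",
]

conn = pymysql.connect(host=MYSQL_HOST, port=3306, db=MYSQL_DBNAME,
                       user=MYSQL_USER, passwd=MYSQL_PASSWORD, charset='utf8')
with conn.cursor() as cursor:
    for statement in DDL:
        cursor.execute(statement)
conn.commit()
conn.close()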

+ 95 - 0
youni_spiders/settings.py

@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for youni_spiders project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'youni_spiders'
+
+SPIDER_MODULES = ['youni_spiders.spiders']
+NEWSPIDER_MODULE = 'youni_spiders.spiders'
+
+MYSQL_HOST = "47.92.174.125"
+MYSQL_DBNAME = "platform-community"
+MYSQL_USER = "caihongxingqiu"
+MYSQL_PASSWORD = "Dzj6rBKIWg1WIGfc"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'youni_spiders (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'youni_spiders.middlewares.YouniSpidersSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'youni_spiders.middlewares.YouniSpidersDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'youni_spiders.pipelines.BuDeJiePipeline': 1,
+   'youni_spiders.pipelines.VicePipeline': 1,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
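
The MySQL credentials above are committed in plain text. One hedged alternative is to read them from environment variables and keep only harmless fallbacks in the file; the YOUNI_* variable names below are assumptions, not something this commit defines:

# Sketch: environment-based credentials for settings.py
import os

MYSQL_HOST = os.environ.get('YOUNI_MYSQL_HOST', 'localhost')
MYSQL_DBNAME = os.environ.get('YOUNI_MYSQL_DBNAME', 'platform-community')
MYSQL_USER = os.environ.get('YOUNI_MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('YOUNI_MYSQL_PASSWORD', '')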

+ 4 - 0
youni_spiders/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

BIN
youni_spiders/spiders/__pycache__/__init__.cpython-37.pyc


BIN
youni_spiders/spiders/__pycache__/budejie.cpython-37.pyc


BIN
youni_spiders/spiders/__pycache__/vice.cpython-37.pyc


+ 25 - 0
youni_spiders/spiders/budejie.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from youni_spiders.items import BudejieItem
+
+
+class BudejieSpider(scrapy.Spider):
+    name = 'budejie'
+    allowed_domains = ['budejie.com']
+    start_urls = ['http://budejie.com/']
+    custom_settings = {
+        'ITEM_PIPELINES': {'youni_spiders.pipelines.BuDeJiePipeline': 1}
+    }
+
+    def parse(self, response):
+        lis = response.css('div.j-r-list > ul > li')
+        for li in lis:
+            id = li.css('div.j-r-list-tool::attr(data-id)').extract_first()
+            username = li.css('a.u-user-name::text').extract_first()
+            content = '\n'.join(li.css('div.j-r-list-c-desc a::text').extract())
+            img = li.css('div.j-r-list-c-img >a >img::attr(data-original)').extract()
+            yield BudejieItem(id=id, username=username, content=content, img=img)
+        next_page = response.css('a.pagenxt::attr(href)').extract_first()
+        if next_page is not None:
+            next_page = response.urljoin(next_page)
+            yield scrapy.Request(next_page, callback=self.parse)
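
The parse() method can be exercised offline against a handcrafted response. The sketch below feeds it a minimal HTML fragment built from the CSS classes the selectors expect; the fragment is an assumption about budejie.com's markup, not a copy of the real page:

# test_budejie_parse.py -- hypothetical offline check of the selectors
from scrapy.http import HtmlResponse
from youni_spiders.spiders.budejie import BudejieSpider

HTML = b"""
<div class="j-r-list"><ul><li>
  <div class="j-r-list-tool" data-id="42"></div>
  <a class="u-user-name">alice</a>
  <div class="j-r-list-c-desc"><a>hello world</a></div>
  <div class="j-r-list-c-img"><a><img data-original="http://example.com/a.jpg"></a></div>
</li></ul></div>
"""

response = HtmlResponse(url='http://budejie.com/', body=HTML, encoding='utf-8')
spider = BudejieSpider()
for result in spider.parse(response):
    print(dict(result))  # expect id='42', username='alice', one image URL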

+ 36 - 0
youni_spiders/spiders/vice.py

@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+import re
+from scrapy import Request
+from youni_spiders.items import ViceItem
+
+
+class ViceSpider(scrapy.Spider):
+    name = 'vice'
+    allowed_domains = ['vice.cn']
+    custom_settings = {
+        'ITEM_PIPELINES': {'youni_spiders.pipelines.VicePipeline': 1,}
+    }
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
+    }
+
+    def start_requests(self):
+        url = 'http://www.vice.cn/ajax/dndmore?ids_not_in='
+        yield Request(url, headers=self.headers)
+
+    def parse(self, response):
+        datas = json.loads(response.body)
+        if datas:
+            ids = ''
+            for data in datas['items']:
+                item = ViceItem()
+                item['id'] = data['id']
+                item['content'] = data['preview']
+                item['img'] = data['portrait']
+                ids += str(data['id']) + ','
+                yield item
+
+            next_url = response.url + ids
+            yield Request(next_url, headers=self.headers)
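
The vice spider assumes the ajax endpoint returns JSON with an "items" list whose entries carry "id", "preview" and "portrait" fields, and it pages by appending the ids it has already seen to ids_not_in=. A small self-contained illustration of that assumed payload shape (the sample values are made up, not taken from vice.cn):

# Illustration only: mimics the shape parse() expects.
import json

sample = json.dumps({
    "items": [
        {"id": 101, "preview": "first post", "portrait": "http://example.com/1.jpg"},
        {"id": 102, "preview": "second post", "portrait": "http://example.com/2.jpg"},
    ]
})

datas = json.loads(sample)
ids = ''
for data in datas['items']:
    ids += str(data['id']) + ','

base = 'http://www.vice.cn/ajax/dndmore?ids_not_in='
print(base + ids)  # -> ...ids_not_in=101,102,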