Quellcode durchsuchen

add doc for crawl

blackbinbin vor 6 Jahren
Ursprung
Commit
370ab8163c

+ 9 - 3
SUMMARY.md

@@ -1,4 +1,10 @@
-# Summary
-
-* [Introduction](README.md)
+# 多玩可视化爬虫系统
 
+* [简介](README.md)
+* 安装以及配置
+    * [安装](install/README.md#install)
+    * [配置](install/README.md#config)
+* 目录结构
+    * [目录](code/README.md#code)
+* 部署
+    * [部署](run/README.md#run)

+ 70 - 0
_book/code/README.md

@@ -0,0 +1,70 @@
+# code
+
+目录结构
+```
+│  app.js	//处理路由和异常捕捉
+│  common.js	//配置环境config
+│  index.js		//多进程启动
+│
+├─bin
+│  │  checkProxyPool.js		//从redis代理池中拿出ip直接请求某个网址,如果失效则删除
+│  │  checkZombieChrome.js	//使用headless浏览器模式会出现很多僵尸chrome进程,用此脚本杀死
+│  │  checkZombieSpider.js	//检查所有crawlWorker并记录到表proc_log,并且杀死proc_log中运行太久的僵尸程序
+│  │  crawl.js		//获取浏览器模式和普通模式的规则然后进行任务
+│  │  crawlMaster.js 	//抓取任务的master分配入redis队列
+│  │  crawlWorker.js	//从redis获取抓取任务
+│  │  fetchPage.js		
+│  │  test.js
+│  │
+│  ├─linux_bash
+│  │      crontab.sh
+│  │      supervisor.ini
+│  │
+│  └─NameClient
+│          subNsEvent.js
+│
+├─conf	//配置目录
+│  │  code.inc.js
+│  │  config.dev.inc.js
+│  │  config.form.inc.js
+│  │  config.inc.js
+│  │  r2m_config.inc.js
+│  │
+│  └─conf_ns	//名字服务器配置
+│          config.code.inc.js
+│          config.globals.inc.js
+│          config.r2m.inc.js
+│          config.shop.inc.js
+│
+├─controllers	//爬虫开放的api,用于预览爬取获取页面和网页上执行任务查看任务执行情况
+│      DefaultController.js
+│
+├─extensions
+│      function_extend.js
+│
+├─models
+│      AmcMsg.js	//爬虫爬取报警上报
+│      Browser.js	//headless浏览器模式下的浏览器类
+│      JTool.js		//选择器使用的类工具,例如格式化时间等
+│      MapData.js	//名字服务中配置的数据库表内字段的操作类
+│      ProxyPool.js		//代理池类
+│      Spider.js	//爬虫类,非常重要,包含了爬取过程中的一系列函数
+│
+└─views
+    │  doc.ejs
+    │  error.ejs
+    │  index.ejs
+    │
+    └─name_server
+            js.ejs
+```
+
+---
+# 系统示意图
+![图片](/img/图1-2.jpg)
+
+# 请求代理示意图
+![图片](/img/图1-1.jpg)
+
+# 可视化流程示意图
+![图片](/img/图1-3.jpg)

+ 394 - 0
_book/code/index.html

@@ -0,0 +1,394 @@
+
+<!DOCTYPE HTML>
+<html lang="" >
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>目录 · GitBook</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.3">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
+
+    
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+
+    
+
+    
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                    简介
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" >
+            
+                <span>
+            
+                    
+                    安装以及配置
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" data-path="../install/">
+            
+                <a href="../install/#install">
+            
+                    
+                    安装
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" data-path="../install/">
+            
+                <a href="../install/#config">
+            
+                    
+                    配置
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" >
+            
+                <span>
+            
+                    
+                    目录结构
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" data-path="./">
+            
+                <a href="./#code">
+            
+                    
+                    目录
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" >
+            
+                <span>
+            
+                    
+                    部署
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="../run/">
+            
+                <a href="../run/#run">
+            
+                    
+                    部署
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >目录</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <h1 id="code">code</h1>
+<p>&#x76EE;&#x5F55;&#x7ED3;&#x6784;</p>
+<pre><code>&#x2502;  app.js    //&#x5904;&#x7406;&#x8DEF;&#x7531;&#x548C;&#x5F02;&#x5E38;&#x6355;&#x6349;
+&#x2502;  common.js    //&#x914D;&#x7F6E;&#x73AF;&#x5883;config
+&#x2502;  index.js        //&#x591A;&#x8FDB;&#x7A0B;&#x542F;&#x52A8;
+&#x2502;
+&#x251C;&#x2500;bin
+&#x2502;  &#x2502;  checkProxyPool.js        //&#x4ECE;redis&#x4EE3;&#x7406;&#x6C60;&#x4E2D;&#x62FF;&#x51FA;ip&#x76F4;&#x63A5;&#x8BF7;&#x6C42;&#x67D0;&#x4E2A;&#x7F51;&#x5740;&#xFF0C;&#x5982;&#x679C;&#x5931;&#x6548;&#x5219;&#x5220;&#x9664;
+&#x2502;  &#x2502;  checkZombieChrome.js    //&#x4F7F;&#x7528;headless&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;&#x4F1A;&#x51FA;&#x73B0;&#x5F88;&#x591A;&#x50F5;&#x5C38;chrome&#x8FDB;&#x7A0B;&#xFF0C;&#x7528;&#x6B64;&#x811A;&#x672C;&#x6740;&#x6B7B;
+&#x2502;  &#x2502;  checkZombieSpider.js    //&#x68C0;&#x67E5;&#x6240;&#x6709;crawlworker&#x5E76;&#x8BB0;&#x5F55;&#x5230;&#x8868;proc_log&#xFF0C;&#x5E76;&#x4E14;&#x6740;&#x6B7B;proc_log&#x4E2D;&#x8FD0;&#x884C;&#x592A;&#x4E45;&#x7684;&#x50F5;&#x5C38;&#x7A0B;&#x5E8F;
+&#x2502;  &#x2502;  crawl.js        //&#x83B7;&#x53D6;&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;&#x548C;&#x666E;&#x901A;&#x6A21;&#x5F0F;&#x7684;&#x89C4;&#x5219;&#x7136;&#x540E;&#x8FDB;&#x884C;&#x4EFB;&#x52A1;
+&#x2502;  &#x2502;  crawlMaster.js     //&#x6293;&#x53D6;&#x4EFB;&#x52A1;&#x7684;master&#x5206;&#x914D;&#x5165;redis&#x961F;&#x5217;
+&#x2502;  &#x2502;  crawlWorker.js    //&#x4ECE;redis&#x83B7;&#x53D6;&#x6293;&#x53D6;&#x4EFB;&#x52A1;
+&#x2502;  &#x2502;  fetchPage.js        
+&#x2502;  &#x2502;  test.js
+&#x2502;  &#x2502;
+&#x2502;  &#x251C;&#x2500;linux_bash
+&#x2502;  &#x2502;      crontab.sh
+&#x2502;  &#x2502;      supervisor.ini
+&#x2502;  &#x2502;
+&#x2502;  &#x2514;&#x2500;NameClient
+&#x2502;          subNsEvent.js
+&#x2502;
+&#x251C;&#x2500;conf    //&#x914D;&#x7F6E;&#x76EE;&#x5F55;
+&#x2502;  &#x2502;  code.inc.js
+&#x2502;  &#x2502;  config.dev.inc.js
+&#x2502;  &#x2502;  config.form.inc.js
+&#x2502;  &#x2502;  config.inc.js
+&#x2502;  &#x2502;  r2m_config.inc.js
+&#x2502;  &#x2502;
+&#x2502;  &#x2514;&#x2500;conf_ns    //&#x540D;&#x5B57;&#x670D;&#x52A1;&#x5668;&#x914D;&#x7F6E;
+&#x2502;          config.code.inc.js
+&#x2502;          config.globals.inc.js
+&#x2502;          config.r2m.inc.js
+&#x2502;          config.shop.inc.js
+&#x2502;
+&#x251C;&#x2500;controllers    //&#x722C;&#x866B;&#x5F00;&#x653E;&#x7684;api&#xFF0C;&#x7528;&#x4E8E;&#x9884;&#x89C8;&#x722C;&#x53D6;&#x83B7;&#x53D6;&#x9875;&#x9762;&#x548C;&#x7F51;&#x9875;&#x4E0A;&#x6267;&#x884C;&#x4EFB;&#x52A1;&#x67E5;&#x770B;&#x4EFB;&#x52A1;&#x6267;&#x884C;&#x60C5;&#x51B5;
+&#x2502;      DefaultController.js
+&#x2502;
+&#x251C;&#x2500;extensions
+&#x2502;      function_extend.js
+&#x2502;
+&#x251C;&#x2500;models
+&#x2502;      AmcMsg.js    //&#x722C;&#x866B;&#x722C;&#x53D6;&#x62A5;&#x8B66;&#x4E0A;&#x62A5;
+&#x2502;      Browser.js    //headless&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;&#x4E0B;&#x7684;&#x6D4F;&#x89C8;&#x5668;&#x7C7B;
+&#x2502;      JTool.js        //&#x9009;&#x62E9;&#x5668;&#x4F7F;&#x7528;&#x7684;&#x7C7B;&#x5DE5;&#x5177;&#xFF0C;&#x4F8B;&#x5982;&#x683C;&#x5F0F;&#x5316;&#x65F6;&#x95F4;&#x7B49;
+&#x2502;      MapData.js    //&#x540D;&#x5B57;&#x670D;&#x52A1;&#x4E2D;&#x914D;&#x7F6E;&#x7684;&#x6570;&#x636E;&#x5E93;&#x8868;&#x5185;&#x5B57;&#x6BB5;&#x7684;&#x64CD;&#x4F5C;&#x7C7B;
+&#x2502;      ProxyPool.js        //&#x4EE3;&#x7406;&#x6C60;&#x7C7B;
+&#x2502;      Spider.js    //&#x722C;&#x866B;&#x7C7B;&#xFF0C;&#x975E;&#x5E38;&#x91CD;&#x8981;&#xFF0C;&#x5305;&#x542B;&#x4E86;&#x722C;&#x53D6;&#x8FC7;&#x7A0B;&#x4E2D;&#x7684;&#x4E00;&#x7CFB;&#x5217;&#x51FD;&#x6570;
+&#x2502;
+&#x2514;&#x2500;views
+    &#x2502;  doc.ejs
+    &#x2502;  error.ejs
+    &#x2502;  index.ejs
+    &#x2502;
+    &#x2514;&#x2500;name_server
+            js.ejs
+</code></pre><hr>
+<h1 id="&#x7CFB;&#x7EDF;&#x793A;&#x610F;&#x56FE;">&#x7CFB;&#x7EDF;&#x793A;&#x610F;&#x56FE;</h1>
+<p><img src="../img/&#x56FE;1-2.jpg" alt="&#x56FE;&#x7247;"></p>
+<h1 id="&#x8BF7;&#x6C42;&#x4EE3;&#x7406;&#x793A;&#x610F;&#x56FE;">&#x8BF7;&#x6C42;&#x4EE3;&#x7406;&#x793A;&#x610F;&#x56FE;</h1>
+<p><img src="../img/&#x56FE;1-1.jpg" alt="&#x56FE;&#x7247;"></p>
+<h1 id="&#x53EF;&#x89C6;&#x5316;&#x6D41;&#x7A0B;&#x793A;&#x610F;&#x56FE;">&#x53EF;&#x89C6;&#x5316;&#x6D41;&#x7A0B;&#x793A;&#x610F;&#x56FE;</h1>
+<p><img src="../img/&#x56FE;1-3.jpg" alt="&#x56FE;&#x7247;"></p>
+
+                                
+                                </section>
+                            
+    </div>
+    <div class="search-results">
+        <div class="has-results">
+            
+            <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
+            <ul class="search-results-list"></ul>
+            
+        </div>
+        <div class="no-results">
+            
+            <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
+            
+        </div>
+    </div>
+</div>
+
+                        </div>
+                    </div>
+                
+            </div>
+
+            
+                
+                
+            
+        
+    </div>
+
+    <script>
+        var gitbook = gitbook || [];
+        gitbook.push(function() {
+            gitbook.page.hasChanged({"page":{"title":"目录","level":"1.3.1","depth":2,"next":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"previous":{"title":"目录结构","level":"1.3","depth":1,"ref":"","articles":[{"title":"目录","level":"1.3.1","depth":2,"anchor":"#code","path":"code/README.md","ref":"code/README.md#code","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"code/README.md","mtime":"2018-10-10T06:52:47.360Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+        });
+    </script>
+</div>
+
+        
+    <script src="../gitbook/gitbook.js"></script>
+    <script src="../gitbook/theme.js"></script>
+    
+        
+        <script src="../gitbook/gitbook-plugin-livereload/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
+        
+    
+
+    </body>
+</html>
+

BIN
_book/img/图1-1.jpg


BIN
_book/img/图1-2.jpg


BIN
_book/img/图1-3.jpg


BIN
_book/img/图2-1.jpg


+ 113 - 4
_book/index.html

@@ -4,7 +4,7 @@
     <head>
         <meta charset="UTF-8">
         <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
-        <title>Introduction · GitBook</title>
+        <title>简介 · GitBook</title>
         <meta http-equiv="X-UA-Compatible" content="IE=edge" />
         <meta name="description" content="">
         <meta name="generator" content="GitBook 3.2.3">
@@ -90,7 +90,7 @@
                 <a href="./">
             
                     
-                    Introduction
+                    简介
             
                 </a>
             
@@ -98,6 +98,115 @@
             
         </li>
     
+        <li class="chapter " data-level="1.2" >
+            
+                <span>
+            
+                    
+                    安装以及配置
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" data-path="install/">
+            
+                <a href="install/#install">
+            
+                    
+                    安装
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" data-path="install/">
+            
+                <a href="install/#config">
+            
+                    
+                    配置
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" >
+            
+                <span>
+            
+                    
+                    目录结构
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" data-path="code/">
+            
+                <a href="code/#code">
+            
+                    
+                    目录
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" >
+            
+                <span>
+            
+                    
+                    部署
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="run/">
+            
+                <a href="run/#run">
+            
+                    
+                    部署
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
 
     
 
@@ -128,7 +237,7 @@
     <!-- Title -->
     <h1>
         <i class="fa fa-circle-o-notch fa-spin"></i>
-        <a href="." >Introduction</a>
+        <a href="." >简介</a>
     </h1>
 </div>
 
@@ -180,7 +289,7 @@
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"Introduction","level":"1.1","depth":1,"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"README.md","mtime":"2018-10-09T09:29:15.034Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-09T09:31:09.950Z"},"basePath":".","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"简介","level":"1.1","depth":1,"next":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"README.md","mtime":"2018-10-09T09:29:15.034Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":".","book":{"language":""}});
         });
     </script>
 </div>

+ 74 - 0
_book/install/README.md

@@ -0,0 +1,74 @@
+# install
+
+安装node相关依赖,node要求v8.0以上
+```
+npm install
+```
+如果puppeteer安装出现问题,请参考如下
+```
+安装 1.15.0 版本的 puppeteer
+1.
+下载安装包
+https://github.com/GoogleChrome/puppeteer/releases
+
+2.这里的路径根据项目地址来
+安装包解压到/data/webapps/test.spider.duowan.com/protected/node_modules目录下的puppeteer
+
+3.
+进入puppeteer
+执行:
+sudo npm install -d package.json  安装好依赖模块
+sudo node install.js              安装puppeteer 服务(这个命令会自己安装chromium)
+
+缺什么模块用 sudo npm install 模块名
+```
+
+
+# config
+配置项目
+
+* 设置端口
+  /protected/conf/config.${env}.inc.js	其中 ${env} 是环境,分别为 dev-开发,form-正式
+```
+process.env.PORT = 端口号;
+```
+
+* 配置数据库
+  /protected/conf/config.${env}.inc.js
+
+```
+let dbInfo = {};
+//数据库配置
+dbInfo['Web'] = {
+    host : '61.160.36.225',
+    user : 'ojiatest',
+    password : 'ojia305',
+    database : 'Web',
+    port : 3306,
+    connectionLimit : 100
+};
+
+//redis配置
+let redisInfo = {};
+redisInfo['name_serv'] = {
+    'host' : '61.160.36.225',
+    'port' : 6405,
+    'pwd' : 'ojia123',
+    'db' : 1,
+    'connet_timeout' : 0
+};
+```
+
+**需要注意的是名字服务器发布后,会自动更新到 /protected/conf/conf_ns ,但是需要重启node进程,否则不会生效**
+
+
+
+* 代理
+  爬虫需要代理池,所有的代理ip获取都是通过访问 cjms 管理后台的接口:/protected/models/ProxyPool.js
+
+   
+
+     1.获取ip列表:getXProxyList()
+     2.获取某个域名效果最好的代理:getXProxyBest(domain)
+     3.每次上报使用的代理好坏情况:reportProxy(domain, proxy, score)
+

+ 394 - 0
_book/install/index.html

@@ -0,0 +1,394 @@
+
+<!DOCTYPE HTML>
+<html lang="" >
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>安装 · GitBook</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.3">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
+
+    
+    <link rel="next" href="./" />
+    
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+
+    
+
+    
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                    简介
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" >
+            
+                <span>
+            
+                    
+                    安装以及配置
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" data-path="./">
+            
+                <a href="./#install">
+            
+                    
+                    安装
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" data-path="./">
+            
+                <a href="./#config">
+            
+                    
+                    配置
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" >
+            
+                <span>
+            
+                    
+                    目录结构
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" data-path="../code/">
+            
+                <a href="../code/#code">
+            
+                    
+                    目录
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" >
+            
+                <span>
+            
+                    
+                    部署
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="../run/">
+            
+                <a href="../run/#run">
+            
+                    
+                    部署
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >安装</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <h1 id="install">install</h1>
+<p>&#x5B89;&#x88C5;node&#x76F8;&#x5173;&#x4F9D;&#x8D56;&#xFF0C;node&#x8981;&#x6C42;v8.0&#x4EE5;&#x4E0A;</p>
+<pre><code>npm install
+</code></pre><p>&#x5982;&#x679C;puppeteer&#x5B89;&#x88C5;&#x51FA;&#x73B0;&#x95EE;&#x9898;&#xFF0C;&#x8BF7;&#x53C2;&#x8003;&#x5982;&#x4E0B;</p>
+<pre><code>&#x5B89;&#x88C5; 1.15.0 &#x7248;&#x672C;&#x7684; puppeteer
+1.
+&#x4E0B;&#x8F7D;&#x5B89;&#x88C5;&#x5305;
+https://github.com/GoogleChrome/puppeteer/releases
+
+2.&#x8FD9;&#x91CC;&#x7684;&#x8DEF;&#x5F84;&#x6839;&#x636E;&#x9879;&#x76EE;&#x5730;&#x5740;&#x6765;
+&#x5B89;&#x88C5;&#x5305;&#x89E3;&#x538B;&#x5230;/data/webapps/test.spider.duowan.com/protected/node_modules&#x76EE;&#x5F55;&#x4E0B;&#x7684;puppeteer
+
+3.
+&#x8FDB;&#x5165;puppeteer
+&#x6267;&#x884C;&#xFF1A;
+sudo npm install -d package.json  &#x5B89;&#x88C5;&#x597D;&#x4F9D;&#x8D56;&#x6A21;&#x5757;
+sudo node install.js              &#x5B89;&#x88C5;puppeteer &#x670D;&#x52A1;&#xFF08;&#x8FD9;&#x4E2A;&#x547D;&#x4EE4;&#x4F1A;&#x81EA;&#x5DF1;&#x5B89;&#x88C5;chromium)
+
+&#x7F3A;&#x4EC0;&#x4E48;&#x6A21;&#x5757;&#x7528; sudo npm install &#x6A21;&#x5757;&#x540D;
+</code></pre><h1 id="config">config</h1>
+<p>&#x914D;&#x7F6E;&#x9879;&#x76EE;</p>
+<ul>
+<li><p>&#x8BBE;&#x7F6E;&#x7AEF;&#x53E3;
+/protected/conf/config.${env}.inc.js    &#x5176;&#x4E2D; ${env} &#x662F;&#x73AF;&#x5883;&#xFF0C;&#x5206;&#x522B;&#x4E3A; dev-&#x5F00;&#x53D1;&#xFF0C;form-&#x6B63;&#x5F0F;</p>
+<pre><code>process.env.PORT = &#x7AEF;&#x53E3;&#x53F7;;
+</code></pre></li>
+<li><p>&#x914D;&#x7F6E;&#x6570;&#x636E;&#x5E93;
+/protected/conf/config.${env}.inc.js</p>
+</li>
+</ul>
+<pre><code>let dbInfo = {};
+//&#x6570;&#x636E;&#x5E93;&#x914D;&#x7F6E;
+dbInfo[&apos;Web&apos;] = {
+    host : &apos;61.160.36.225&apos;,
+    user : &apos;ojiatest&apos;,
+    password : &apos;ojia305&apos;,
+    database : &apos;Web&apos;,
+    port : 3306,
+    connectionLimit : 100
+};
+
+//redis&#x914D;&#x7F6E;
+let redisInfo = {};
+redisInfo[&apos;name_serv&apos;] = {
+    &apos;host&apos; : &apos;61.160.36.225&apos;,
+    &apos;port&apos; : 6405,
+    &apos;pwd&apos; : &apos;ojia123&apos;,
+    &apos;db&apos; : 1,
+    &apos;connet_timeout&apos; : 0
+};
+</code></pre><p><strong>&#x9700;&#x8981;&#x6CE8;&#x610F;&#x7684;&#x662F;&#x540D;&#x5B57;&#x670D;&#x52A1;&#x5668;&#x53D1;&#x5E03;&#x540E;&#xFF0C;&#x4F1A;&#x81EA;&#x52A8;&#x66F4;&#x65B0;&#x5230; /protected/conf/conf_ns &#xFF0C;&#x4F46;&#x662F;&#x9700;&#x8981;&#x91CD;&#x542F;node&#x8FDB;&#x7A0B;&#xFF0C;&#x5426;&#x5219;&#x4E0D;&#x4F1A;&#x751F;&#x6548;</strong></p>
+<ul>
+<li>&#x4EE3;&#x7406;
+&#x722C;&#x866B;&#x9700;&#x8981;&#x4EE3;&#x7406;&#x6C60;&#xFF0C;&#x6240;&#x6709;&#x7684;&#x4EE3;&#x7406;ip&#x83B7;&#x53D6;&#x90FD;&#x662F;&#x901A;&#x8FC7;&#x8BBF;&#x95EE; cjms &#x7BA1;&#x7406;&#x540E;&#x53F0;&#x7684;&#x63A5;&#x53E3;&#xFF1A;/protected/models/ProxyPool.js</li>
+</ul>
+<pre><code> 1.&#x83B7;&#x53D6;ip&#x5217;&#x8868;&#xFF1A;getXProxyList()
+ 2.&#x83B7;&#x53D6;&#x67D0;&#x4E2A;&#x57DF;&#x540D;&#x6548;&#x679C;&#x6700;&#x597D;&#x7684;&#x4EE3;&#x7406;&#xFF1A;getXProxyBest(domain)
+ 3.&#x6BCF;&#x6B21;&#x4E0A;&#x62A5;&#x4F7F;&#x7528;&#x7684;&#x4EE3;&#x7406;&#x597D;&#x574F;&#x60C5;&#x51B5;&#xFF1A;reportProxy(domain, proxy, score)
+</code></pre>
+                                
+                                </section>
+                            
+    </div>
+    <div class="search-results">
+        <div class="has-results">
+            
+            <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
+            <ul class="search-results-list"></ul>
+            
+        </div>
+        <div class="no-results">
+            
+            <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
+            
+        </div>
+    </div>
+</div>
+
+                        </div>
+                    </div>
+                
+            </div>
+
+            
+                
+                
+                <a href="./#config" class="navigation navigation-next navigation-unique" aria-label="Next page: 配置">
+                    <i class="fa fa-angle-right"></i>
+                </a>
+                
+            
+        
+    </div>
+
+    <script>
+        var gitbook = gitbook || [];
+        gitbook.push(function() {
+            gitbook.page.hasChanged({"page":{"title":"安装","level":"1.2.1","depth":2,"next":{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]},"previous":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"install/README.md","mtime":"2018-10-10T06:42:56.680Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+        });
+    </script>
+</div>
+
+        
+    <script src="../gitbook/gitbook.js"></script>
+    <script src="../gitbook/theme.js"></script>
+    
+        
+        <script src="../gitbook/gitbook-plugin-livereload/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
+        
+    
+
+    </body>
+</html>
+

+ 28 - 0
_book/run/README.md

@@ -0,0 +1,28 @@
+#run
+
+用 supervisor 守护进程来启动node爬虫进程
+```
+vim /data/services/supervisor.conf
+```
+
+举个栗子:
+```
+[program:node1]
+command=node /data/webapps/test.spider.duowan.com/protected/index.js
+process_name=WEB_test.spider.duowan.com
+directory=/data/webapps/test.spider.duowan.com/protected/
+numprocs=1
+autostart=true
+autorestart=true
+stdout_logfile=/tmp/WEB_test.spider.duowan.com.log
+
+```
+
+设置进程后,查看:
+```
+sudo supervisorctl
+```
+可以看到
+![图片](/img/图2-1.jpg)
+
+**主要启动的脚本为 index.js checkProxyPool.js crawlMaster.js crawlWorker.js**

+ 347 - 0
_book/run/index.html

@@ -0,0 +1,347 @@
+
+<!DOCTYPE HTML>
+<html lang="" >
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>部署 · GitBook</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.3">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
+
+    
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+
+    
+
+    
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                    简介
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" >
+            
+                <span>
+            
+                    
+                    安装以及配置
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" data-path="../install/">
+            
+                <a href="../install/#install">
+            
+                    
+                    安装
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" data-path="../install/">
+            
+                <a href="../install/#config">
+            
+                    
+                    配置
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" >
+            
+                <span>
+            
+                    
+                    目录结构
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" data-path="../code/">
+            
+                <a href="../code/#code">
+            
+                    
+                    目录
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" >
+            
+                <span>
+            
+                    
+                    部署
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="./">
+            
+                <a href="./#run">
+            
+                    
+                    部署
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >部署</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <h1 id="run">run</h1>
+<p>&#x7528; supervisor &#x5B88;&#x62A4;&#x8FDB;&#x7A0B;&#x6765;&#x542F;&#x52A8;node&#x722C;&#x866B;&#x8FDB;&#x7A0B;</p>
+<pre><code>vim /data/services/supervisor.conf
+</code></pre><p>&#x4E3E;&#x4E2A;&#x6817;&#x5B50;&#xFF1A;</p>
+<pre><code>[program:node1]
+command=node /data/webapps/test.spider.duowan.com/protected/index.js
+process_name=WEB_test.spider.duowan.com
+directory=/data/webapps/test.spider.duowan.com/protected/
+numprocs=1
+autostart=true
+autorestart=true
+stdout_logfile=/tmp/WEB_test.spider.duowan.com.log
+</code></pre><p>&#x8BBE;&#x7F6E;&#x8FDB;&#x7A0B;&#x540E;&#xFF0C;&#x67E5;&#x770B;&#xFF1A;</p>
+<pre><code>sudo supervisorctl
+</code></pre><p>&#x53EF;&#x4EE5;&#x770B;&#x5230;
+<img src="../img/&#x56FE;2-1.jpg" alt="&#x56FE;&#x7247;"></p>
+<p><strong>&#x4E3B;&#x8981;&#x542F;&#x52A8;&#x7684;&#x811A;&#x672C;&#x4E3A; index.js checkProxyPool.js crawlMaster.js crawlWorker.js</strong></p>
+
+                                
+                                </section>
+                            
+    </div>
+    <div class="search-results">
+        <div class="has-results">
+            
+            <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
+            <ul class="search-results-list"></ul>
+            
+        </div>
+        <div class="no-results">
+            
+            <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
+            
+        </div>
+    </div>
+</div>
+
+                        </div>
+                    </div>
+                
+            </div>
+
+            
+                
+                
+            
+        
+    </div>
+
+    <script>
+        var gitbook = gitbook || [];
+        gitbook.push(function() {
+            gitbook.page.hasChanged({"page":{"title":"部署","level":"1.4.1","depth":2,"previous":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"run/README.md","mtime":"2018-10-10T07:06:11.974Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+        });
+    </script>
+</div>
+
+        
+    <script src="../gitbook/gitbook.js"></script>
+    <script src="../gitbook/theme.js"></script>
+    
+        
+        <script src="../gitbook/gitbook-plugin-livereload/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
+        
+    
+
+    </body>
+</html>
+

Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 0
_book/search_index.json


+ 70 - 0
code/README.md

@@ -0,0 +1,70 @@
+#code
+
+目录结构
+```
+│  app.js	//处理路由和异常捕捉
+│  common.js	//配置环境config
+│  index.js		//多进程启动
+│
+├─bin
+│  │  checkProxyPool.js		//从redis代理池中拿出ip直接请求某个网址,如果失效则删除
+│  │  checkZombieChrome.js	//使用headless浏览器模式会出现很多僵尸chrome进程,用此脚本杀死
+│  │  checkZombieSpider.js	//检查所有crawlworker并记录到表proc_log,并且杀死proc_log中运行太久的僵尸程序
+│  │  crawl.js		//获取浏览器模式和普通模式的规则然后进行任务
+│  │  crawlMaster.js 	//抓取任务的master,负责将任务分配入redis队列
+│  │  crawlWorker.js	//从redis获取抓取任务
+│  │  fetchPage.js		
+│  │  test.js
+│  │
+│  ├─linux_bash
+│  │      crontab.sh
+│  │      supervisor.ini
+│  │
+│  └─NameClient
+│          subNsEvent.js
+│
+├─conf	//配置目录
+│  │  code.inc.js
+│  │  config.dev.inc.js
+│  │  config.form.inc.js
+│  │  config.inc.js
+│  │  r2m_config.inc.js
+│  │
+│  └─conf_ns	//名字服务器配置
+│          config.code.inc.js
+│          config.globals.inc.js
+│          config.r2m.inc.js
+│          config.shop.inc.js
+│
+├─controllers	//爬虫开放的api,用于预览爬取获取页面和网页上执行任务查看任务执行情况
+│      DefaultController.js
+│
+├─extensions
+│      function_extend.js
+│
+├─models
+│      AmcMsg.js	//爬虫爬取报警上报
+│      Browser.js	//headless浏览器模式下的浏览器类
+│      JTool.js		//选择器使用的类工具,例如格式化时间等
+│      MapData.js	//名字服务中配置的数据库表内字段的操作类
+│      ProxyPool.js		//代理池类
+│      Spider.js	//爬虫类,非常重要,包含了爬取过程中的一系列函数
+│
+└─views
+    │  doc.ejs
+    │  error.ejs
+    │  index.ejs
+    │
+    └─name_server
+            js.ejs
+```
+
+---
+#系统示意图
+![图片](/img/图1-2.jpg)
+
+#请求代理示意图
+![图片](/img/图1-1.jpg)
+
+#可视化流程示意图
+![图片](/img/图1-3.jpg)

BIN
img/图1-1.jpg


BIN
img/图1-2.jpg


BIN
img/图1-3.jpg


BIN
img/图2-1.jpg


+ 74 - 0
install/README.md

@@ -0,0 +1,74 @@
+#install
+
+安装node相关依赖,node要求v8.0以上
+```
+npm install
+```
+如果 puppeteer 安装出现问题,请参考如下
+```
+安装 1.15.0 版本的 puppeteer
+1.
+下载安装包
+https://github.com/GoogleChrome/puppeteer/releases
+
+2.这里的路径根据项目地址来
+安装包解压到/data/webapps/test.spider.duowan.com/protected/node_modules目录下的puppeteer
+
+3.
+进入puppeteer
+执行:
+sudo npm install -d package.json  安装好依赖模块
+sudo node install.js              安装puppeteer 服务(这个命令会自己安装chromium)
+
+缺什么模块用 sudo npm install 模块名
+```
+
+
+#config
+配置项目
+
+* 设置端口
+  /protected/conf/config.${env}.inc.js	其中 ${env} 是环境,分别为 dev-开发,form-正式
+```
+process.env.PORT = 端口号;
+```
+
+* 配置数据库
+  /protected/conf/config.${env}.inc.js
+
+```
+let dbInfo = {};
+//数据库配置
+dbInfo['Web'] = {
+    host : '61.160.36.225',
+    user : 'ojiatest',
+    password : 'ojia305',
+    database : 'Web',
+    port : 3306,
+    connectionLimit : 100
+};
+
+//redis配置
+let redisInfo = {};
+redisInfo['name_serv'] = {
+    'host' : '61.160.36.225',
+    'port' : 6405,
+    'pwd' : 'ojia123',
+    'db' : 1,
+    'connet_timeout' : 0
+};
+```
+
+**需要注意的是名字服务器发布后,会自动更新到 /protected/conf/conf_ns ,但是需要重启node进程,否则不会生效**
+
+
+
+* 代理
+  爬虫需要代理池,所有的代理ip获取都是通过访问 cjms 管理后台的接口:/protected/models/ProxyPool.js
+
+   
+
+     1.获取ip列表:getXProxyList()
+     2.获取某个域名效果最好的代理:getXProxyBest(domain)
+     3.每次上报使用的代理好坏情况:reportProxy(domain, proxy, score)
+

+ 28 - 0
run/README.md

@@ -0,0 +1,28 @@
+#run
+
+用 supervisor 守护进程来启动node爬虫进程
+```
+vim /data/services/supervisor.conf
+```
+
+举个栗子:
+```
+[program:node1]
+command=node /data/webapps/test.spider.duowan.com/protected/index.js
+process_name=WEB_test.spider.duowan.com
+directory=/data/webapps/test.spider.duowan.com/protected/
+numprocs=1
+autostart=true
+autorestart=true
+stdout_logfile=/tmp/WEB_test.spider.duowan.com.log
+
+```
+
+设置进程后,查看:
+```
+sudo supervisorctl
+```
+可以看到
+![图片](/img/图2-1.jpg)
+
+**主要启动的脚本为:index.js、checkProxyPool.js、crawlMaster.js、crawlWorker.js**

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.