blackbinbin il y a 6 ans
Parent
commit
a279a8a2ad

+ 3 - 1
SUMMARY.md

@@ -7,4 +7,6 @@
 * 目录结构
     * [目录](code/README.md#code)
 * 部署
-    * [部署](run/README.md#run)
+    * [部署](run/README.md#run)
+* 使用
+    * [新建规则](use/README.md#规则)

+ 33 - 1
_book/code/index.html

@@ -207,6 +207,38 @@
             
         </li>
     
+        <li class="chapter " data-level="1.5" >
+            
+                <span>
+            
+                    
+                    使用
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="../use/">
+            
+                <a href="../use/#规则">
+            
+                    
+                    新建规则
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
 
     
 
@@ -351,7 +383,7 @@
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"目录","level":"1.3.1","depth":2,"next":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"previous":{"title":"目录结构","level":"1.3","depth":1,"ref":"","articles":[{"title":"目录","level":"1.3.1","depth":2,"anchor":"#code","path":"code/README.md","ref":"code/README.md#code","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"code/README.md","mtime":"2018-10-10T06:52:47.360Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"目录","level":"1.3.1","depth":2,"next":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"previous":{"title":"目录结构","level":"1.3","depth":1,"ref":"","articles":[{"title":"目录","level":"1.3.1","depth":2,"anchor":"#code","path":"code/README.md","ref":"code/README.md#code","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"code/README.md","mtime":"2018-10-10T06:52:47.360Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T09:33:12.148Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

BIN
_book/img/图3-1.jpg


BIN
_book/img/图3-2.jpg


BIN
_book/img/图3-3.jpg


BIN
_book/img/图4-1.jpg


BIN
_book/img/图4-2.jpg


BIN
_book/img/图4-3.jpg


BIN
_book/img/图4-4.jpg


BIN
_book/img/图4-5.jpg


BIN
_book/img/图4-6.jpg


BIN
_book/img/图4-7.jpg


+ 33 - 1
_book/index.html

@@ -207,6 +207,38 @@
             
         </li>
     
+        <li class="chapter " data-level="1.5" >
+            
+                <span>
+            
+                    
+                    使用
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="use/">
+            
+                <a href="use/#规则">
+            
+                    
+                    新建规则
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
 
     
 
@@ -289,7 +321,7 @@
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"简介","level":"1.1","depth":1,"next":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"README.md","mtime":"2018-10-09T09:29:15.034Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":".","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"简介","level":"1.1","depth":1,"next":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"README.md","mtime":"2018-10-09T09:29:15.034Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T09:33:12.148Z"},"basePath":".","book":{"language":""}});
         });
     </script>
 </div>

+ 33 - 1
_book/install/index.html

@@ -209,6 +209,38 @@
             
         </li>
     
+        <li class="chapter " data-level="1.5" >
+            
+                <span>
+            
+                    
+                    使用
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="../use/">
+            
+                <a href="../use/#规则">
+            
+                    
+                    新建规则
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
 
     
 
@@ -351,7 +383,7 @@ redisInfo[&apos;name_serv&apos;] = {
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"安装","level":"1.2.1","depth":2,"next":{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]},"previous":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"install/README.md","mtime":"2018-10-10T06:42:56.680Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"安装","level":"1.2.1","depth":2,"next":{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]},"previous":{"title":"安装以及配置","level":"1.2","depth":1,"ref":"","articles":[{"title":"安装","level":"1.2.1","depth":2,"anchor":"#install","path":"install/README.md","ref":"install/README.md#install","articles":[]},{"title":"配置","level":"1.2.2","depth":2,"anchor":"#config","path":"install/README.md","ref":"install/README.md#config","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"install/README.md","mtime":"2018-10-10T06:42:56.680Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T09:33:12.148Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

+ 33 - 1
_book/run/index.html

@@ -207,6 +207,38 @@
             
         </li>
     
+        <li class="chapter " data-level="1.5" >
+            
+                <span>
+            
+                    
+                    使用
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="../use/">
+            
+                <a href="../use/#规则">
+            
+                    
+                    新建规则
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
 
     
 
@@ -304,7 +336,7 @@ stdout_logfile=/tmp/WEB_test.spider.duowan.com.log
     <script>
         var gitbook = gitbook || [];
         gitbook.push(function() {
-            gitbook.page.hasChanged({"page":{"title":"部署","level":"1.4.1","depth":2,"previous":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"run/README.md","mtime":"2018-10-10T07:06:11.974Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T07:06:25.421Z"},"basePath":"..","book":{"language":""}});
+            gitbook.page.hasChanged({"page":{"title":"部署","level":"1.4.1","depth":2,"next":{"title":"使用","level":"1.5","depth":1,"ref":"","articles":[{"title":"新建规则","level":"1.5.1","depth":2,"anchor":"#规则","path":"use/README.md","ref":"use/README.md#规则","articles":[]}]},"previous":{"title":"部署","level":"1.4","depth":1,"ref":"","articles":[{"title":"部署","level":"1.4.1","depth":2,"anchor":"#run","path":"run/README.md","ref":"run/README.md#run","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"run/README.md","mtime":"2018-10-10T07:06:11.974Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T09:33:12.148Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>

Fichier diff supprimé car celui-ci est trop grand
+ 0 - 0
_book/search_index.json


+ 119 - 0
_book/use/README.md

@@ -0,0 +1,119 @@
+#规则
+>新建规则,规则对应需要抓取的相同类型的网页,比如列表页,详情页
+>无论是 /list?page=1 还是 /list?page=n 只要是抓取这个类型的页面获取的数据结构一样的都可以归类成一个规则内
+
+![图片](/img/图3-1.jpg)
+
+* 规则id是唯一的
+
+* 示例url是此规则的举例url,在此url上写采集规则
+
+* 请求模式:普通和浏览器模式,默认普通模式,如果页面是js渲染的则用浏览器模式
+
+* 数据类型:html/json,如果是爬取的api那么使用json,如果是普通页面则用html
+
+* 等待条件,如果是请求模式选择浏览器模式,那么爬虫会根据这里设置的等待条件等待页面加载出这个等待的元素后才回调返回此页面的渲染好的html
+
+* 页面预处理,可以在爬虫获取返回html后先处理页面html,然后再接下去用选择器获取页面数据,比如有些页面是404,此404是一张图片,如果是直接传下去给选择器,那么会报错缺少字段,如果是预处理时就判断是否404图片,那么直接返回当做抓取成功不会进入选择器。$html, $, page, _task, JTool, Tool 这几个变量可以直接使用,在爬虫内部定义如下:
+
+```
+	async _preprocess(content, page) {
+        if (this.rule.data_type === 'json') {
+            content = content.trim();
+            let lastChar = php.substr(content, -1);
+            if (lastChar === ')') {
+                let pos = content.indexOf('(');
+                content = content.substr(pos + 1, content.length - pos - 2);
+            }
+        }
+
+        let preprocess = this.rule.preprocess && this.rule.preprocess.trim();
+        if (preprocess) {
+            let func = php.create_function('$html, $, page, _task, JTool, Tool', preprocess);
+
+            let $ = null;
+            let $html = null;
+            if (this.rule.data_type === 'html') {
+                $ = cheerio.load(content, { decodeEntities: false });
+                JTool.initJquery($);
+
+                $html = $('html');
+                let flag = func($html, $, page, this.task, JTool, Tool);
+                if (flag === false) {
+                    this.skip = true;
+                }
+                return $('<div></div>').html($html).html();
+            } else if (this.rule.data_type === 'json') {
+                $html = content;
+                return func($html, $, page, JTool, Tool);
+            }
+        }
+        
+        return content;
+    }
+```
+
+---
+
+#选择器
+>选择器负责拾取页面的数据,在浏览器上用js调试,可以立马生效查看获取的数据,这是可视化的基础,而不必如同其他爬虫一样需要每个网页手写不同的选择器
+
+![图片](/img/图3-2.jpg)
+
+* 选择器:查找页面元素并返回
+* 解析$el:上面选择器获取的节点元素,返回的变量用 $el 表示,和jquery操作节点元素一致
+```
+可用变量如下
+_task:任务对象,[ 'url', 'rule_id', 'task_id', 'task_key' ]
+JTool.fixColspan($table, $): 展开表格colspan,删除不合法的列
+JTool.formatDate(date): 把int/Date类型的数据,格式化为:Y-m-d
+JTool.formatDateTime(date): 把int/Date类型的数据,格式化为:Y-m-d H:i:s
+JTool.formatUrl(url): 把相对地址变化成绝对地址
+JTool.formaRichText(content): 富文本的图片相对地址变化成绝对地址,去掉script标签
+JTool.md5(str): md5加密
+```
+* Next规则id:指定当爬取完此规则后进入的下一个规则,可以设置上下文逻辑,比如列表页面的下一个规则应该是详情页
+* 选择 单项/多项:如果是爬取的数据是多行的则选择多项,如果是只有一项那么选择单项。这里插入的逻辑是,如果是多项的取各个列名字段相同下标的数据处理组成一条数据记录插入数据库例如:
+```
+a 列名:选择多项获取到的是一个数组 ['a', 'b', 'c']
+b 列名:选择多项获取到的是一个数组 ['d', 'e', 'f']
+
+爬虫会将上面的数据格式化成:
+[['a','d'], ['b','e'], ['c','f']]
+逐条插入/更新到数据库
+```
+* 数据-仅插入/更新/仅更新:仅插入(insert)是当数据是新数据的时候插入到数据库,第二次爬取的时候数据不会再更新入库,更新(replace)是当第二次爬取的时候会执行更新到数据库,仅更新(update)是第二次爬取的数据和当前的数据库的数据不一致才会入库更新
+* 必填:如果是在选择器分析页面数据中没有此字段,那么会报错,如果是可选,则不会报错
+* 开关:开启后选择器会解析页面抓取数据,否则不进入此选择器的逻辑
+* 转存:如果是图片,可以将此图片转存至自己服务器然后更新到数据库,这是一个异步的过程,入库后,转存脚本会扫描需要转存的数据,如果没有转存则将其下载到bs2然后重新将新的资源url地址更新到数据库内
+* 只填充:只填充的字段,只在更新模式有效,选择器获取的数据会和临时表中上次抓取的老数据对比,老数据不存在时才填充
+
+---
+
+#配置库表
+>爬虫的数据库表从名字服务器内的配置拉取到表 db_table,data_db中
+
+![图片](/img/图4-3.jpg)
+![图片](/img/图4-2.jpg)
+
+---
+
+#任务
+>当规则添加后,需要配置任务,爬虫才能进行入库 
+
+![图片](/img/图4-1.jpg)
+* 爬虫的地址:指的是爬虫的初始入口url,爬虫任务根据此url,开始爬取,如果你配置了Next规则id,那么爬虫当爬取完此规则后会自动进入下一规则,这样就实现了上下文逻辑的网页爬取
+
+当添加完任务后还可以在网页上尝试点击执行,网页返回的是此任务执行期间的打印的日志
+![图片](/img/图4-4.jpg)
+![图片](/img/图4-7.jpg)
+
+---
+
+#调试
+>通过日志和网页运行爬取任务可以调试一个规则是否有报错
+
+![图片](/img/图4-6.jpg)
+![图片](/img/图4-5.jpg)
+
+

+ 476 - 0
_book/use/index.html

@@ -0,0 +1,476 @@
+
+<!DOCTYPE HTML>
+<html lang="" >
+    <head>
+        <meta charset="UTF-8">
+        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+        <title>新建规则 · GitBook</title>
+        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+        <meta name="description" content="">
+        <meta name="generator" content="GitBook 3.2.3">
+        
+        
+        
+    
+    <link rel="stylesheet" href="../gitbook/style.css">
+
+    
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
+                
+            
+                
+                <link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
+                
+            
+        
+
+    
+
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+        
+    
+
+        
+    
+    
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
+    <link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
+
+    
+    
+
+    </head>
+    <body>
+        
+<div class="book">
+    <div class="book-summary">
+        
+            
+<div id="book-search-input" role="search">
+    <input type="text" placeholder="Type to search" />
+</div>
+
+            
+                <nav role="navigation">
+                
+
+
+<ul class="summary">
+    
+    
+
+    
+
+    
+        
+        
+    
+        <li class="chapter " data-level="1.1" data-path="../">
+            
+                <a href="../">
+            
+                    
+                    简介
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2" >
+            
+                <span>
+            
+                    
+                    安装以及配置
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.2.1" data-path="../install/">
+            
+                <a href="../install/#install">
+            
+                    
+                    安装
+            
+                </a>
+            
+
+            
+        </li>
+    
+        <li class="chapter " data-level="1.2.2" data-path="../install/">
+            
+                <a href="../install/#config">
+            
+                    
+                    配置
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.3" >
+            
+                <span>
+            
+                    
+                    目录结构
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.3.1" data-path="../code/">
+            
+                <a href="../code/#code">
+            
+                    
+                    目录
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.4" >
+            
+                <span>
+            
+                    
+                    部署
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.4.1" data-path="../run/">
+            
+                <a href="../run/#run">
+            
+                    
+                    部署
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+        <li class="chapter " data-level="1.5" >
+            
+                <span>
+            
+                    
+                    使用
+            
+                </span>
+            
+
+            
+            <ul class="articles">
+                
+    
+        <li class="chapter " data-level="1.5.1" data-path="./">
+            
+                <a href="./#规则">
+            
+                    
+                    新建规则
+            
+                </a>
+            
+
+            
+        </li>
+    
+
+            </ul>
+            
+        </li>
+    
+
+    
+
+    <li class="divider"></li>
+
+    <li>
+        <a href="https://www.gitbook.com" target="blank" class="gitbook-link">
+            Published with GitBook
+        </a>
+    </li>
+</ul>
+
+
+                </nav>
+            
+        
+    </div>
+
+    <div class="book-body">
+        
+            <div class="body-inner">
+                
+                    
+
+<div class="book-header" role="navigation">
+    
+
+    <!-- Title -->
+    <h1>
+        <i class="fa fa-circle-o-notch fa-spin"></i>
+        <a href=".." >新建规则</a>
+    </h1>
+</div>
+
+
+
+
+                    <div class="page-wrapper" tabindex="-1" role="main">
+                        <div class="page-inner">
+                            
+<div id="book-search-results">
+    <div class="search-noresults">
+    
+                                <section class="normal markdown-section">
+                                
+                                <h1 id="&#x89C4;&#x5219;">&#x89C4;&#x5219;</h1>
+<blockquote>
+<p>&#x65B0;&#x5EFA;&#x89C4;&#x5219;&#xFF0C;&#x89C4;&#x5219;&#x5BF9;&#x5E94;&#x9700;&#x8981;&#x6293;&#x53D6;&#x7684;&#x76F8;&#x540C;&#x7C7B;&#x578B;&#x7684;&#x7F51;&#x9875;&#xFF0C;&#x6BD4;&#x5982;&#x5217;&#x8868;&#x9875;&#xFF0C;&#x8BE6;&#x60C5;&#x9875;
+&#x65E0;&#x8BBA;&#x662F; /list?page=1 &#x8FD8;&#x662F; /list?page=n &#x53EA;&#x8981;&#x662F;&#x6293;&#x53D6;&#x8FD9;&#x4E2A;&#x7C7B;&#x578B;&#x7684;&#x9875;&#x9762;&#x83B7;&#x53D6;&#x7684;&#x6570;&#x636E;&#x7ED3;&#x6784;&#x4E00;&#x6837;&#x7684;&#x90FD;&#x53EF;&#x4EE5;&#x5F52;&#x7C7B;&#x6210;&#x4E00;&#x4E2A;&#x89C4;&#x5219;&#x5185;</p>
+</blockquote>
+<p><img src="../img/&#x56FE;3-1.jpg" alt="&#x56FE;&#x7247;"></p>
+<ul>
+<li><p>&#x89C4;&#x5219;id&#x662F;&#x552F;&#x4E00;&#x7684;</p>
+</li>
+<li><p>&#x793A;&#x4F8B;url&#x662F;&#x6B64;&#x89C4;&#x5219;&#x7684;&#x4E3E;&#x4F8B;url&#xFF0C;&#x5728;&#x6B64;url&#x4E0A;&#x5199;&#x91C7;&#x96C6;&#x89C4;&#x5219;</p>
+</li>
+<li><p>&#x8BF7;&#x6C42;&#x6A21;&#x5F0F;&#xFF1A;&#x666E;&#x901A;&#x548C;&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;&#xFF0C;&#x9ED8;&#x8BA4;&#x666E;&#x901A;&#x6A21;&#x5F0F;&#xFF0C;&#x5982;&#x679C;&#x9875;&#x9762;&#x662F;js&#x6E32;&#x67D3;&#x7684;&#x5219;&#x7528;&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;</p>
+</li>
+<li><p>&#x6570;&#x636E;&#x7C7B;&#x578B;&#xFF1A;html/json&#xFF0C;&#x5982;&#x679C;&#x662F;&#x722C;&#x53D6;&#x7684;api&#x90A3;&#x4E48;&#x4F7F;&#x7528;json&#xFF0C;&#x5982;&#x679C;&#x662F;&#x666E;&#x901A;&#x9875;&#x9762;&#x5219;&#x7528;html</p>
+</li>
+<li><p>&#x7B49;&#x5F85;&#x6761;&#x4EF6;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x8BF7;&#x6C42;&#x6A21;&#x5F0F;&#x9009;&#x62E9;&#x6D4F;&#x89C8;&#x5668;&#x6A21;&#x5F0F;&#xFF0C;&#x90A3;&#x4E48;&#x722C;&#x866B;&#x4F1A;&#x6839;&#x636E;&#x8FD9;&#x91CC;&#x8BBE;&#x7F6E;&#x7684;&#x7B49;&#x5F85;&#x6761;&#x4EF6;&#x7B49;&#x5F85;&#x9875;&#x9762;&#x52A0;&#x8F7D;&#x51FA;&#x8FD9;&#x4E2A;&#x7B49;&#x5F85;&#x7684;&#x5143;&#x7D20;&#x540E;&#x624D;&#x56DE;&#x8C03;&#x8FD4;&#x56DE;&#x6B64;&#x9875;&#x9762;&#x7684;&#x6E32;&#x67D3;&#x597D;&#x7684;html</p>
+</li>
+<li><p>&#x9875;&#x9762;&#x9884;&#x5904;&#x7406;&#xFF0C;&#x53EF;&#x4EE5;&#x5728;&#x722C;&#x866B;&#x83B7;&#x53D6;&#x8FD4;&#x56DE;html&#x540E;&#x5148;&#x5904;&#x7406;&#x9875;&#x9762;html&#xFF0C;&#x7136;&#x540E;&#x518D;&#x63A5;&#x4E0B;&#x53BB;&#x7528;&#x9009;&#x62E9;&#x5668;&#x83B7;&#x53D6;&#x53BB;&#x9875;&#x9762;&#x6570;&#x636E;&#xFF0C;&#x6BD4;&#x5982;&#x6709;&#x4E9B;&#x9875;&#x9762;&#x662F;404&#xFF0C;&#x6B64;404&#x662F;&#x4E00;&#x5F20;&#x56FE;&#x7247;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x76F4;&#x63A5;&#x4F20;&#x4E0B;&#x53BB;&#x7ED9;&#x9009;&#x62E9;&#x5668;&#xFF0C;&#x90A3;&#x4E48;&#x4F1A;&#x62A5;&#x9519;&#x7F3A;&#x5C11;&#x5B57;&#x6BB5;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x9884;&#x5904;&#x7406;&#x65F6;&#x5C31;&#x5224;&#x65AD;&#x662F;&#x5426;404&#x56FE;&#x7247;&#xFF0C;&#x90A3;&#x4E48;&#x76F4;&#x63A5;&#x8FD4;&#x56DE;&#x5F53;&#x505A;&#x6293;&#x53D6;&#x6210;&#x529F;&#x4E0D;&#x4F1A;&#x8FDB;&#x5165;&#x9009;&#x62E9;&#x5668;&#x3002;$html, $, page,_task, JTool, Tool &#x8FD9;&#x51E0;&#x4E2A;&#x53D8;&#x91CF;&#x53EF;&#x4EE5;&#x76F4;&#x63A5;&#x4F7F;&#x7528;&#xFF0C;&#x5728;&#x722C;&#x866B;&#x5185;&#x90E8;&#x5B9A;&#x4E49;&#x5982;&#x4E0B;&#xFF1A;</p>
+</li>
+</ul>
+<pre><code>    /**
+     * Normalise a fetched response and run the optional preprocess script of
+     * the rule on it before the selectors see it.
+     *
+     * The script body comes from this.rule.preprocess and is compiled into a
+     * function whose parameters are: $html, $, page, _task, JTool, Tool.
+     *
+     * @param {string} content Raw response text.
+     * @param {*} page Passed through to the script untouched (presumably the
+     *     browser page used in browser mode; confirm against the caller).
+     * @returns {Promise} For html rules, the markup serialised after the script
+     *     ran; for json rules, whatever the script returns; the (unwrapped)
+     *     content when the rule has no preprocess script.
+     */
+    async _preprocess(content, page) {
+        // JSON may arrive wrapped like JSONP, e.g. callback({...}). When the
+        // trimmed text ends with a closing parenthesis, keep only what sits
+        // between the first opening parenthesis and that last character.
+        if (this.rule.data_type === &apos;json&apos;) {
+            content = content.trim();
+            let lastChar = php.substr(content, -1);
+            if (lastChar === &apos;)&apos;) {
+                let pos = content.indexOf(&apos;(&apos;);
+                content = content.substr(pos + 1, content.length - pos - 2);
+            }
+        }
+
+        let preprocess = this.rule.preprocess &amp;&amp; this.rule.preprocess.trim();
+        if (preprocess) {
+            let func = php.create_function(&apos;$html, $, page, _task, JTool, Tool&apos;, preprocess);
+
+            let $ = null;
+            let $html = null;
+            if (this.rule.data_type === &apos;html&apos;) {
+                $ = cheerio.load(content, { decodeEntities: false });
+                JTool.initJquery($);
+
+                $html = $(&apos;html&apos;);
+                let flag = func($html, $, page, this.task, JTool, Tool);
+                // A script that returns exactly false sets this.skip.
+                if (flag === false) {
+                    this.skip = true;
+                }
+                // Whatever the script did to $html is what gets serialised here.
+                return $(&apos;&lt;div&gt;&lt;/div&gt;&apos;).html($html).html();
+            } else if (this.rule.data_type === &apos;json&apos;) {
+                // No DOM for json rules, so $ stays null. The task must still
+                // fill the fourth slot: the compiled function takes six
+                // parameters, and leaving it out shifts JTool into _task and
+                // Tool into JTool.
+                $html = content;
+                return func($html, $, page, this.task, JTool, Tool);
+            }
+        }
+
+        return content;
+    }
+</code></pre><hr>
+<h1 id="&#x9009;&#x62E9;&#x5668;">&#x9009;&#x62E9;&#x5668;</h1>
+<blockquote>
+<p>&#x9009;&#x62E9;&#x5668;&#x8D1F;&#x8D23;&#x62FE;&#x53D6;&#x9875;&#x9762;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x5728;&#x6D4F;&#x89C8;&#x5668;&#x4E0A;&#x7528;js&#x8C03;&#x8BD5;&#xFF0C;&#x53EF;&#x4EE5;&#x7ACB;&#x9A6C;&#x751F;&#x6548;&#x67E5;&#x770B;&#x83B7;&#x53D6;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x8FD9;&#x662F;&#x53EF;&#x89C6;&#x5316;&#x7684;&#x57FA;&#x7840;&#xFF0C;&#x800C;&#x4E0D;&#x5FC5;&#x5982;&#x540C;&#x5176;&#x4ED6;&#x722C;&#x866B;&#x4E00;&#x6837;&#x9700;&#x8981;&#x6BCF;&#x4E2A;&#x7F51;&#x9875;&#x624B;&#x5199;&#x4E0D;&#x540C;&#x7684;&#x9009;&#x62E9;&#x5668;</p>
+</blockquote>
+<p><img src="../img/&#x56FE;3-2.jpg" alt="&#x56FE;&#x7247;"></p>
+<ul>
+<li>&#x9009;&#x62E9;&#x5668;&#xFF1A;&#x67E5;&#x627E;&#x9875;&#x9762;&#x5143;&#x7D20;&#x5E76;&#x8FD4;&#x56DE;</li>
+<li>&#x89E3;&#x6790;$el&#xFF1A;&#x4E0A;&#x9762;&#x9009;&#x62E9;&#x5668;&#x83B7;&#x53D6;&#x7684;&#x8282;&#x70B9;&#x5143;&#x7D20;&#xFF0C;&#x8FD4;&#x56DE;&#x7684;&#x53D8;&#x91CF;&#x7528; $el &#x8868;&#x793A;&#xFF0C;&#x548C;jquery&#x64CD;&#x4F5C;&#x8282;&#x70B9;&#x5143;&#x7D20;&#x4E00;&#x81F4;<pre><code>&#x53EF;&#x7528;&#x53D8;&#x91CF;&#x5982;&#x4E0B;
+_task&#xFF1A;&#x4EFB;&#x52A1;&#x5BF9;&#x8C61;&#xFF0C;[ &apos;url&apos;, &apos;rule_id&apos;, &apos;task_id&apos;, &apos;task_key&apos; ]
+JTool.fixColspan($table, $): &#x5C55;&#x5F00;&#x8868;&#x683C;colspan&#xFF0C;&#x5220;&#x9664;&#x4E0D;&#x5408;&#x6CD5;&#x7684;&#x5217;
+JTool.formatDate(date): &#x628A;int/Date&#x7C7B;&#x578B;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x683C;&#x5F0F;&#x5316;&#x4E3A;&#xFF1A;Y-m-d
+JTool.formatDateTime(date): &#x628A;int/Date&#x7C7B;&#x578B;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x683C;&#x5F0F;&#x5316;&#x4E3A;&#xFF1A;Y-m-d H:i:s
+JTool.formatUrl(url): &#x628A;&#x76F8;&#x5BF9;&#x5730;&#x5740;&#x53D8;&#x5316;&#x6210;&#x7EDD;&#x5BF9;&#x5730;&#x5740;
+JTool.formaRichText(content): &#x5BCC;&#x6587;&#x672C;&#x7684;&#x56FE;&#x7247;&#x76F8;&#x5BF9;&#x5730;&#x5740;&#x53D8;&#x5316;&#x6210;&#x7EDD;&#x5BF9;&#x5730;&#x5740;&#xFF0C;&#x53BB;&#x6389;script&#x6807;&#x7B7E;
+JTool.md5(str): md5&#x52A0;&#x5BC6;
+</code></pre></li>
+<li>Next&#x89C4;&#x5219;id&#xFF1A;&#x6307;&#x5B9A;&#x5F53;&#x722C;&#x53D6;&#x5B8C;&#x6B64;&#x89C4;&#x5219;&#x540E;&#x8FDB;&#x5165;&#x7684;&#x4E0B;&#x4E00;&#x4E2A;&#x89C4;&#x5219;&#xFF0C;&#x53EF;&#x4EE5;&#x8BBE;&#x7F6E;&#x4E0A;&#x4E0B;&#x6587;&#x903B;&#x8F91;&#xFF0C;&#x6BD4;&#x5982;&#x5217;&#x8868;&#x9875;&#x9762;&#x7684;&#x4E0B;&#x4E00;&#x4E2A;&#x89C4;&#x5219;&#x5E94;&#x8BE5;&#x662F;&#x8BE6;&#x60C5;&#x9875;</li>
+<li>&#x9009;&#x62E9; &#x5355;&#x9879;/&#x591A;&#x9879;&#xFF1A;&#x5982;&#x679C;&#x662F;&#x722C;&#x53D6;&#x7684;&#x6570;&#x636E;&#x662F;&#x591A;&#x884C;&#x7684;&#x5219;&#x9009;&#x62E9;&#x591A;&#x9879;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x53EA;&#x6709;&#x4E00;&#x9879;&#x90A3;&#x4E48;&#x9009;&#x62E9;&#x5355;&#x9879;&#x3002;&#x8FD9;&#x91CC;&#x63D2;&#x5165;&#x7684;&#x903B;&#x8F91;&#x662F;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x591A;&#x9879;&#x7684;&#x53D6;&#x5404;&#x4E2A;&#x5217;&#x540D;&#x5B57;&#x6BB5;&#x76F8;&#x540C;&#x4E0B;&#x6807;&#x7684;&#x6570;&#x636E;&#x5904;&#x7406;&#x7EC4;&#x6210;&#x4E00;&#x6761;&#x6570;&#x636E;&#x8BB0;&#x5F55;&#x63D2;&#x5165;&#x6570;&#x636E;&#x5E93;&#x4F8B;&#x5982;&#xFF1A;
+<pre><code>a &#x5217;&#x540D;&#xFF1A;&#x9009;&#x62E9;&#x591A;&#x9879;&#x83B7;&#x53D6;&#x5230;&#x7684;&#x662F;&#x4E00;&#x4E2A;&#x6570;&#x7EC4; [&apos;a&apos;, &apos;b&apos;, &apos;c&apos;]
+b &#x5217;&#x540D;&#xFF1A;&#x9009;&#x62E9;&#x591A;&#x9879;&#x83B7;&#x53D6;&#x5230;&#x7684;&#x662F;&#x4E00;&#x4E2A;&#x6570;&#x7EC4; [&apos;d&apos;, &apos;e&apos;, &apos;f&apos;]
+
+&#x722C;&#x866B;&#x4F1A;&#x5C06;&#x4E0A;&#x9762;&#x7684;&#x6570;&#x636E;&#x683C;&#x5F0F;&#x5316;&#x6210;:
+[[&apos;a&apos;,&apos;d&apos;], [&apos;b&apos;,&apos;e&apos;], [&apos;c&apos;,&apos;f&apos;]]
+&#x9010;&#x6761;&#x63D2;&#x5165;/&#x66F4;&#x65B0;&#x5230;&#x6570;&#x636E;&#x5E93;
+</code></pre></li>
+<li>&#x6570;&#x636E;-&#x4EC5;&#x63D2;&#x5165;/&#x66F4;&#x65B0;/&#x4EC5;&#x66F4;&#x65B0;&#xFF1A;&#x4EC5;&#x63D2;&#x5165;(insert)&#x662F;&#x5F53;&#x6570;&#x636E;&#x662F;&#x65B0;&#x6570;&#x636E;&#x7684;&#x65F6;&#x5019;&#x63D2;&#x5165;&#x5230;&#x6570;&#x636E;&#x5E93;&#xFF0C;&#x7B2C;&#x4E8C;&#x6B21;&#x722C;&#x53D6;&#x7684;&#x65F6;&#x5019;&#x6570;&#x636E;&#x4E0D;&#x4F1A;&#x518D;&#x66F4;&#x65B0;&#x5165;&#x5E93;&#xFF0C;&#x66F4;&#x65B0;(replace)&#x662F;&#x5F53;&#x7B2C;&#x4E8C;&#x6B21;&#x722C;&#x53D6;&#x7684;&#x65F6;&#x5019;&#x4F1A;&#x6267;&#x884C;&#x66F4;&#x65B0;&#x5230;&#x6570;&#x636E;&#x5E93;&#xFF0C;&#x4EC5;&#x66F4;&#x65B0;(update)&#x662F;&#x7B2C;&#x4E8C;&#x6B21;&#x722C;&#x53D6;&#x7684;&#x6570;&#x636E;&#x548C;&#x5F53;&#x524D;&#x7684;&#x6570;&#x636E;&#x5E93;&#x7684;&#x6570;&#x636E;&#x4E0D;&#x4E00;&#x81F4;&#x624D;&#x4F1A;&#x5165;&#x5E93;&#x66F4;&#x65B0;</li>
+<li>&#x5FC5;&#x586B;&#xFF1A;&#x5982;&#x679C;&#x662F;&#x5728;&#x9009;&#x62E9;&#x5668;&#x5206;&#x6790;&#x9875;&#x9762;&#x6570;&#x636E;&#x4E2D;&#x6CA1;&#x6709;&#x6B64;&#x5B57;&#x6BB5;&#xFF0C;&#x90A3;&#x4E48;&#x4F1A;&#x62A5;&#x9519;&#xFF0C;&#x5982;&#x679C;&#x662F;&#x53EF;&#x9009;&#xFF0C;&#x5219;&#x4E0D;&#x4F1A;&#x62A5;&#x9519;</li>
+<li>&#x5F00;&#x5173;&#xFF1A;&#x5F00;&#x542F;&#x540E;&#x9009;&#x62E9;&#x5668;&#x4F1A;&#x89E3;&#x6790;&#x9875;&#x9762;&#x6293;&#x53D6;&#x6570;&#x636E;&#xFF0C;&#x5426;&#x5219;&#x4E0D;&#x8FDB;&#x5165;&#x6B64;&#x9009;&#x62E9;&#x5668;&#x7684;&#x903B;&#x8F91;</li>
+<li>&#x8F6C;&#x5B58;&#xFF1A;&#x5982;&#x679C;&#x662F;&#x56FE;&#x7247;&#xFF0C;&#x53EF;&#x4EE5;&#x5C06;&#x6B64;&#x56FE;&#x7247;&#x8F6C;&#x5B58;&#x81F3;&#x81EA;&#x5DF1;&#x670D;&#x52A1;&#x5668;&#x7136;&#x540E;&#x66F4;&#x65B0;&#x5230;&#x6570;&#x636E;&#x5E93;&#xFF0C;&#x8FD9;&#x662F;&#x4E00;&#x4E2A;&#x5F02;&#x6B65;&#x7684;&#x8FC7;&#x7A0B;&#xFF0C;&#x5165;&#x5E93;&#x540E;&#xFF0C;&#x8F6C;&#x5B58;&#x811A;&#x672C;&#x4F1A;&#x626B;&#x63CF;&#x9700;&#x8981;&#x8F6C;&#x5B58;&#x7684;&#x6570;&#x636E;&#xFF0C;&#x5982;&#x679C;&#x6CA1;&#x6709;&#x8F6C;&#x5B58;&#x5219;&#x5C06;&#x5176;&#x4E0B;&#x8F7D;&#x5230;bs2&#x7136;&#x540E;&#x91CD;&#x65B0;&#x5C06;&#x65B0;&#x7684;&#x8D44;&#x6E90;url&#x5730;&#x5740;&#x66F4;&#x65B0;&#x5230;&#x6570;&#x636E;&#x5E93;&#x5185;</li>
+<li>&#x53EA;&#x586B;&#x5145;&#xFF1A;&#x53EA;&#x586B;&#x5145;&#x7684;&#x5B57;&#x6BB5;&#xFF0C;&#x53EA;&#x5728;&#x66F4;&#x65B0;&#x6A21;&#x5F0F;&#x6709;&#x6548;&#xFF0C;&#x9009;&#x62E9;&#x5668;&#x83B7;&#x53D6;&#x7684;&#x6570;&#x636E;&#x4F1A;&#x548C;&#x4E34;&#x65F6;&#x8868;&#x4E2D;&#x4E0A;&#x6B21;&#x6293;&#x53D6;&#x7684;&#x8001;&#x6570;&#x636E;&#x5BF9;&#x6BD4;&#xFF0C;&#x8001;&#x6570;&#x636E;&#x4E0D;&#x5B58;&#x5728;&#x65F6;&#x624D;&#x586B;&#x5145;</li>
+</ul>
+<hr>
+<h1 id="&#x914D;&#x7F6E;&#x5E93;&#x8868;">&#x914D;&#x7F6E;&#x5E93;&#x8868;</h1>
+<blockquote>
+<p>&#x722C;&#x866B;&#x7684;&#x6570;&#x636E;&#x5E93;&#x8868;&#x4ECE;&#x540D;&#x5B57;&#x670D;&#x52A1;&#x5668;&#x5185;&#x7684;&#x914D;&#x7F6E;&#x62C9;&#x53D6;&#x5230;&#x8868; db_table&#xFF0C;data_db&#x4E2D;</p>
+</blockquote>
+<p><img src="../img/&#x56FE;4-3.jpg" alt="&#x56FE;&#x7247;">
+<img src="../img/&#x56FE;4-2.jpg" alt="&#x56FE;&#x7247;"></p>
+<hr>
+<h1 id="&#x4EFB;&#x52A1;">&#x4EFB;&#x52A1;</h1>
+<blockquote>
+<p>&#x5F53;&#x89C4;&#x5219;&#x6DFB;&#x52A0;&#x540E;&#xFF0C;&#x9700;&#x8981;&#x914D;&#x7F6E;&#x4EFB;&#x52A1;&#xFF0C;&#x722C;&#x866B;&#x624D;&#x80FD;&#x8FDB;&#x884C;&#x5165;&#x5E93; </p>
+</blockquote>
+<p><img src="../img/&#x56FE;4-1.jpg" alt="&#x56FE;&#x7247;"></p>
+<ul>
+<li>&#x722C;&#x866B;&#x7684;&#x5730;&#x5740;&#xFF1A;&#x6307;&#x7684;&#x722C;&#x866B;&#x7684;&#x521D;&#x59CB;&#x5165;&#x53E3;url&#xFF0C;&#x722C;&#x866B;&#x4EFB;&#x52A1;&#x6839;&#x636E;&#x6B64;url&#xFF0C;&#x5F00;&#x59CB;&#x722C;&#x53D6;&#xFF0C;&#x5982;&#x679C;&#x4F60;&#x914D;&#x7F6E;&#x4E86;Next&#x89C4;&#x5219;id&#xFF0C;&#x90A3;&#x4E48;&#x722C;&#x866B;&#x5F53;&#x722C;&#x53D6;&#x5B8C;&#x6B64;&#x89C4;&#x5219;&#x540E;&#x4F1A;&#x81EA;&#x52A8;&#x8FDB;&#x5165;&#x4E0B;&#x4E00;&#x89C4;&#x5219;&#xFF0C;&#x8FD9;&#x6837;&#x5C31;&#x5B9E;&#x73B0;&#x4E86;&#x4E0A;&#x4E0B;&#x6587;&#x903B;&#x8F91;&#x7684;&#x7F51;&#x9875;&#x722C;&#x53D6;</li>
+</ul>
+<p>&#x5F53;&#x6DFB;&#x52A0;&#x5B8C;&#x4EFB;&#x52A1;&#x540E;&#x8FD8;&#x53EF;&#x4EE5;&#x5728;&#x7F51;&#x9875;&#x4E0A;&#x5C1D;&#x8BD5;&#x70B9;&#x51FB;&#x6267;&#x884C;&#xFF0C;&#x7F51;&#x9875;&#x8FD4;&#x56DE;&#x7684;&#x662F;&#x6B64;&#x4EFB;&#x52A1;&#x6267;&#x884C;&#x671F;&#x95F4;&#x7684;&#x6253;&#x5370;&#x7684;&#x65E5;&#x5FD7;
+<img src="../img/&#x56FE;4-4.jpg" alt="&#x56FE;&#x7247;">
+<img src="../img/&#x56FE;4-7.jpg" alt="&#x56FE;&#x7247;"></p>
+<hr>
+<h1 id="&#x8C03;&#x8BD5;">&#x8C03;&#x8BD5;</h1>
+<blockquote>
+<p>&#x901A;&#x8FC7;&#x65E5;&#x5FD7;&#x548C;&#x7F51;&#x9875;&#x8FD0;&#x884C;&#x722C;&#x53D6;&#x4EFB;&#x52A1;&#x53EF;&#x4EE5;&#x8C03;&#x8BD5;&#x4E00;&#x4E2A;&#x89C4;&#x5219;&#x662F;&#x5426;&#x6709;&#x62A5;&#x9519;</p>
+</blockquote>
+<p><img src="../img/&#x56FE;4-6.jpg" alt="&#x56FE;&#x7247;">
+<img src="../img/&#x56FE;4-5.jpg" alt="&#x56FE;&#x7247;"></p>
+
+                                
+                                </section>
+                            
+    </div>
+    <div class="search-results">
+        <div class="has-results">
+            
+            <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
+            <ul class="search-results-list"></ul>
+            
+        </div>
+        <div class="no-results">
+            
+            <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
+            
+        </div>
+    </div>
+</div>
+
+                        </div>
+                    </div>
+                
+            </div>
+
+            
+                
+                
+            
+        
+    </div>
+
+    <script>
+        var gitbook = gitbook || [];
+        gitbook.push(function() {
+            gitbook.page.hasChanged({"page":{"title":"新建规则","level":"1.5.1","depth":2,"previous":{"title":"使用","level":"1.5","depth":1,"ref":"","articles":[{"title":"新建规则","level":"1.5.1","depth":2,"anchor":"#规则","path":"use/README.md","ref":"use/README.md#规则","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["livereload"],"pluginsConfig":{"livereload":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"use/README.md","mtime":"2018-10-10T09:32:56.540Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2018-10-10T09:33:12.148Z"},"basePath":"..","book":{"language":""}});
+        });
+    </script>
+</div>
+
+        
+    <script src="../gitbook/gitbook.js"></script>
+    <script src="../gitbook/theme.js"></script>
+    
+        
+        <script src="../gitbook/gitbook-plugin-livereload/plugin.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-search/search.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
+        
+    
+        
+        <script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
+        
+    
+
+    </body>
+</html>
+

BIN
img/图3-1.jpg


BIN
img/图3-2.jpg


BIN
img/图3-3.jpg


BIN
img/图4-1.jpg


BIN
img/图4-2.jpg


BIN
img/图4-3.jpg


BIN
img/图4-4.jpg


BIN
img/图4-5.jpg


BIN
img/图4-6.jpg


BIN
img/图4-7.jpg


+ 119 - 0
use/README.md

@@ -0,0 +1,119 @@
+# 规则
+>新建规则,规则对应需要抓取的相同类型的网页,比如列表页,详情页
+>无论是 /list?page=1 还是 /list?page=n 只要是抓取这个类型的页面获取的数据结构一样的都可以归类成一个规则内
+
+![图片](/img/图3-1.jpg)
+
+* 规则id是唯一的
+
+* 示例url是此规则的举例url,在此url上写采集规则
+
+* 请求模式:普通和浏览器模式,默认普通模式,如果页面是js渲染的则用浏览器模式
+
+* 数据类型:html/json,如果是爬取的api那么使用json,如果是普通页面则用html
+
+* 等待条件,如果是请求模式选择浏览器模式,那么爬虫会根据这里设置的等待条件等待页面加载出这个等待的元素后才回调返回此页面的渲染好的html
+
+* 页面预处理,可以在爬虫获取返回html后先处理页面html,然后再接下去用选择器获取页面数据,比如有些页面是404,此404是一张图片,如果是直接传下去给选择器,那么会报错缺少字段,如果是预处理时就判断是否404图片,那么直接返回当做抓取成功不会进入选择器。$html, $, page, _task, JTool, Tool 这几个变量可以直接使用,在爬虫内部定义如下:
+
+```
+	async _preprocess(content, page) {
+        if (this.rule.data_type === 'json') {
+            content = content.trim();
+            let lastChar = php.substr(content, -1);
+            if (lastChar === ')') {
+                let pos = content.indexOf('(');
+                content = content.substr(pos + 1, content.length - pos - 2);
+            }
+        }
+
+        let preprocess = this.rule.preprocess && this.rule.preprocess.trim();
+        if (preprocess) {
+            let func = php.create_function('$html, $, page, _task, JTool, Tool', preprocess);
+
+            let $ = null;
+            let $html = null;
+            if (this.rule.data_type === 'html') {
+                $ = cheerio.load(content, { decodeEntities: false });
+                JTool.initJquery($);
+
+                $html = $('html');
+                let flag = func($html, $, page, this.task, JTool, Tool);
+                if (flag === false) {
+                    this.skip = true;
+                }
+                return $('<div></div>').html($html).html();
+            } else if (this.rule.data_type === 'json') {
+                $html = content;
+                return func($html, $, page, JTool, Tool);
+            }
+        }
+        
+        return content;
+    }
+```
+
+---
+
+# 选择器
+>选择器负责拾取页面的数据,在浏览器上用js调试,可以立马生效查看获取的数据,这是可视化的基础,而不必如同其他爬虫一样需要每个网页手写不同的选择器
+
+![图片](/img/图3-2.jpg)
+
+* 选择器:查找页面元素并返回
+* 解析$el:上面选择器获取的节点元素,返回的变量用 $el 表示,和jquery操作节点元素一致
+```
+可用变量如下
+_task:任务对象,[ 'url', 'rule_id', 'task_id', 'task_key' ]
+JTool.fixColspan($table, $): 展开表格colspan,删除不合法的列
+JTool.formatDate(date): 把int/Date类型的数据,格式化为:Y-m-d
+JTool.formatDateTime(date): 把int/Date类型的数据,格式化为:Y-m-d H:i:s
+JTool.formatUrl(url): 把相对地址变化成绝对地址
+JTool.formaRichText(content): 富文本的图片相对地址变化成绝对地址,去掉script标签
+JTool.md5(str): md5加密
+```
+* Next规则id:指定当爬取完此规则后进入的下一个规则,可以设置上下文逻辑,比如列表页面的下一个规则应该是详情页
+* 选择 单项/多项:如果爬取的数据是多行的则选择多项,如果只有一项那么选择单项。这里插入的逻辑是:如果是多项,则取各个列名字段相同下标的数据组成一条数据记录插入数据库,例如:
+
+```
+a 列名:选择多项获取到的是一个数组 ['a', 'b', 'c']
+b 列名:选择多项获取到的是一个数组 ['d', 'e', 'f']
+
+爬虫会将上面的数据格式化成:
+[['a','d'], ['b','e'], ['c','f']]
+逐条插入/更新到数据库
+```
+
+* 数据-仅插入/更新/仅更新:仅插入(insert)是当数据是新数据的时候插入到数据库,第二次爬取的时候数据不会再更新入库,更新(replace)是当第二次爬取的时候会执行更新到数据库,仅更新(update)是第二次爬取的数据和当前的数据库的数据不一致才会入库更新
+* 必填:如果是在选择器分析页面数据中没有此字段,那么会报错,如果是可选,则不会报错
+* 开关:开启后选择器会解析页面抓取数据,否则不进入此选择器的逻辑
+* 转存:如果是图片,可以将此图片转存至自己服务器然后更新到数据库,这是一个异步的过程,入库后,转存脚本会扫描需要转存的数据,如果没有转存则将其下载到bs2然后重新将新的资源url地址更新到数据库内
+* 只填充:只填充的字段,只在更新模式有效,选择器获取的数据会和临时表中上次抓取的老数据对比,老数据不存在时才填充
+
+---
+
+# 配置库表
+>爬虫的数据库表从名字服务器内的配置拉取到表 db_table,data_db中
+
+![图片](/img/图4-3.jpg)
+![图片](/img/图4-2.jpg)
+
+---
+
+# 任务
+>当规则添加后,需要配置任务,爬虫才能进行入库 
+
+![图片](/img/图4-1.jpg)
+* 爬虫的地址:指的是爬虫的初始入口url,爬虫任务根据此url,开始爬取,如果你配置了Next规则id,那么爬虫当爬取完此规则后会自动进入下一规则,这样就实现了上下文逻辑的网页爬取
+
+当添加完任务后还可以在网页上尝试点击执行,网页返回的是此任务执行期间的打印的日志
+![图片](/img/图4-4.jpg)
+![图片](/img/图4-7.jpg)
+
+---
+
+# 调试
+>通过日志和网页运行爬取任务可以调试一个规则是否有报错
+
+![图片](/img/图4-6.jpg)
+![图片](/img/图4-5.jpg)
+
+

Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff