饭叔的知识整理

pyspider 运行分析

在 web 界面上点击 run 时，浏览器发送如下请求。
POST FORMDATA:

webdav_mode:false
script:#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-08-26 08:49:37
# Project: aaa

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('http://www.baidu.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
task:{
  "process": {
    "callback": "on_start"
  },
  "project": "aaa",
  "taskid": "data:,on_start",
  "url": "data:,on_start"
}

@app.route('/debug/<project>/run', methods=['POST', ])

第一次:

task=
{
    u'process': {u'callback': u'on_start'}, 
    u'project': u'aaa', 
    u'url': u'data:,on_start', 
    u'taskid': u'data:,on_start'
}

project=
{
    'status': 'DEBUG', 
    'name': u'aaa', 
    'script': u'#!/usr/bin/env python\n# -*- encoding: utf-8 -*-\n# Created on 2015-08-26 08:49:37\n# Project: aaa\n\nfrom pyspider.libs.base_handler import *\n\n\nclass Handler(BaseHandler):\n    crawl_config = {\n    }\n\n    @every(minutes=24 * 60)\n    def on_start(self):\n        self.crawl(\'http://www.baidu.com\', callback=self.index_page)\n\n    @config(age=10 * 24 * 60 * 60)\n    def index_page(self, response):\n        for each in response.doc(\'a[href^="http"]\').items():\n            self.crawl(each.attr.href, callback=self.detail_page)\n\n    @config(priority=2)\n    def detail_page(self, response):\n        return {\n            "url": response.url,\n            "title": response.doc(\'title\').text(),\n        }\n'
}

fetch_result = app.config['fetch'](task)
fetch_result=
{
    u'cookies': {}, 
    u'url': u'data:,on_start', 
    u'orig_url': u'data:,on_start', 
    u'time': 0, 
    u'content': u'on_start', 
    u'headers': {}, 
    u'status_code': 200, 
    u'save': None
}

ret = module['instance'].run_task(module['module'], task, response)
ret=<pyspider.processor.processor.ProcessorResult object at 0x10ee0b0d0>
ret.follows=[{'schedule': {'age': 864000}, 'process': {'callback': 'index_page'}, 'project': u'aaa', 'url': 'http://www.baidu.com/', 'taskid': 'f03f5717616221de41881be555473a02', 'fetch': {}}]

result=
{
    'fetch_result': 
        {    
            u'cookies': {}, 
            u'url': u'data:,on_start', 
            u'orig_url': u'data:,on_start', 
            u'time': 0, 
            u'content': u'on_start', 
            u'headers': {}, 
            u'status_code': 200, 
            u'save': None
        }, 
    'logs': u'', 
    'follows': 
        [{
            'schedule': {'age': 864000}, 
            'process': {'callback': 'index_page'}, 
            'project': u'aaa', 
            'url': 'http://www.baidu.com/', 
            'taskid': 'f03f5717616221de41881be555473a02', 
            'fetch': {}
        }], 
    'messages': [], 
    'result': None, 
    'time': 828.4301750659943
}

result 被序列化后返回到 web 前端。
RECEIVE:

{  
   "fetch_result":{  
      "content":"on_start",
      "cookies":{  

      },
      "headers":{  

      },
      "orig_url":"data:,on_start",
      "save":null,
      "status_code":200,
      "time":0,
      "url":"data:,on_start"
   },
   "follows":[  
      {  
         "fetch":{  

         },
         "process":{  
            "callback":"index_page"
         },
         "project":"aaa",
         "schedule":{  
            "age":864000
         },
         "taskid":"f03f5717616221de41881be555473a02",
         "url":"http://www.baidu.com/"
      }
   ],
   "logs":"",
   "messages":[  

   ],
   "result":null,
   "time":828.4301750659943
}