Scrapy

Scrapy CheatSheet

递归爬取

class DmozSpider(CrawlSpider):
    """Recursively crawl ``mydomain.nl`` pages whose URL contains "Zuid-Holland".

    Starting from ``start_urls``, the single crawl rule extracts links that
    match the ``allow`` pattern, keeps following them (``follow=True``) and
    hands every matching response to :meth:`parse_winkel`.
    """

    name = "dmoz"
    allowed_domains = ["mydomain.nl"]
    start_urls = [
        "http://www.mydomain.nl/Zuid-Holland",
    ]

    # ``allow`` entries are regular expressions, not shell globs. The former
    # '*Zuid-Holland*' is not a valid regex (a leading '*' has nothing to
    # repeat); a plain substring pattern selects the URLs the glob was
    # meant to select.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'Zuid-Holland',)),
            callback='parse_winkel',
            follow=True,
        ),
    )

    def parse_winkel(self, response):
        """Build one ``WinkelItem`` per ``<li>`` of the ``itemsList`` list.

        Args:
            response: The page response passed in by the crawl rule.

        Returns:
            A list of ``WinkelItem`` objects. Each item's ``adres`` field is a
            3-tuple of lists: the link texts inside the ``<li>``, the
            ``<li>``'s own direct text nodes, and the part of the page's
            ``<h1>`` text captured after "winkel".
        """
        sel = Selector(response)

        # The heading is a page-level query that does not depend on the
        # individual list entry, so evaluate it once instead of per item.
        heading = sel.xpath('//h1/text()').re(r'winkel\s*(.*)')

        items = []
        for site in sel.xpath('//ul[@id="itemsList"]/li'):
            item = WinkelItem()
            item['adres'] = (
                site.xpath('.//a/text()').extract(),
                site.xpath('text()').extract(),
                # Copy so every item owns its own list, as it did when the
                # query was re-run for each entry.
                list(heading),
            )
            items.append(item)
        return items