import scrapy


class DownFilesItem(scrapy.Item):
    """Item for scrapy's file-download pipeline.

    Follows the standard FilesPipeline contract: the spider fills
    ``file_urls``; the pipeline writes download results into ``files``.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    # file_urls holds the URLs of the files to be downloaded
    file_urls = scrapy.Field()
    # files: download results — presumably written back by FilesPipeline
    # (standard scrapy convention; confirm pipeline is enabled in settings)
    files = scrapy.Field()
import scrapy
from scrapy.linkextractors import LinkExtractor

from Down_Files.items import DownFilesItem


class DownFileSpider(scrapy.Spider):
    """Crawl the matplotlib examples index and yield items whose
    ``file_urls`` point at the downloadable example source files."""

    name = 'Down_File'
    allowed_domains = ['matplotlib.org']
    start_urls = ['http://matplotlib.org/examples/index.html']

    def parse(self, response):
        """Extract links to the individual example pages.

        Index pages are excluded via ``deny``; each extracted link is
        followed with :meth:`parse_files` as the callback.
        """
        le = LinkExtractor(
            restrict_xpaths='//*[@id="matplotlib-examples"]/div/ul/li/ul/li/a',
            deny='/index.html$',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_files)

    def parse_files(self, response):
        """Build a DownFilesItem for the source-file link on an example page.

        Returns None when the page has no matching link: ``extract_first()``
        yields None there, and passing None to ``response.urljoin`` would
        raise a TypeError in the original code.
        """
        href = response.xpath('//div[@class="body"]/div/p/a/@href').extract_first()
        if href is None:
            return None  # no downloadable file on this page; skip it
        item = DownFilesItem()
        item['file_urls'] = [response.urljoin(href)]
        return item
成果图如下
ImagesPipeline
下载So_Image的图片
items.py
import scrapy


class DownSoImageItem(scrapy.Item):
    """Item for scrapy's image-download pipeline (So_Image spider)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image_urls holds the URLs of the images to download, same pattern
    # as file downloads — presumably consumed by ImagesPipeline (confirm
    # the pipeline is enabled in settings)
    image_urls = scrapy.Field()