使用scrapy爬取苏宁易购图书信息

发布时间:2025-12-09 19:43:57 浏览次数:4

理论基础详见:https://blog.csdn.net/apollo_miracle/article/details/84987459

# -*- coding: utf-8 -*-
import re
from copy import deepcopy

import scrapy


class BookSpider(scrapy.Spider):
    """Crawl suning.com books: category tree -> listing pages -> detail pages -> price API."""

    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        """Parse the home page: walk the big/middle/small category tree and
        request every small category's listing page (both halves of page one).

        NOTE(review): the 'p' elements in these XPaths look like 'div' tags
        mangled by the blog this code was copied from -- verify the selectors
        against the live page.
        """
        # Groups for the big categories and their matching sub-menus.
        menu_list = response.xpath("//p[@class='left-menu-container']/p[@class='menu-list']/p[@class='menu-item']")
        sub_menu_list = response.xpath("//p[@class='left-menu-container']/p[@class='menu-list']/p[@class='menu-sub']")
        # BUGFIX: the original reused the names `p_list`/`p` for the inner
        # middle-category loop, clobbering the outer loop's variables so
        # `p_list.index(p)` was wrong (or raised) from the second big
        # category onward. Distinct names + enumerate fix that.
        for index, menu in enumerate(menu_list):
            item = {}
            # Name of the big category.
            item["b_cate"] = menu.xpath(".//h3/a/text()").extract_first()
            # Sub-menu node aligned with the current big category.
            current_sub = sub_menu_list[index]
            # Groups for the middle categories.
            middle_list = current_sub.xpath(".//p[@class='submenu-left']/p")
            for middle in middle_list:
                # Name of the middle category.
                item["m_cate"] = middle.xpath(".//a/text()").extract_first()
                # Groups for the small categories.
                small_list = middle.xpath("./following-sibling::ul[1]/li")
                for small in small_list:
                    # Name and URL of the small category.
                    item["s_cate"] = small.xpath(".//a/text()").extract_first()
                    item["s_href"] = small.xpath(".//a/@href").extract_first()
                    # Request the book listing page (first half).
                    yield scrapy.Request(item["s_href"],
                                         callback=self.parse_book_list,
                                         meta={"item": deepcopy(item)})
                    # Request the AJAX-loaded second half of page one.
                    next_url_temp = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
                    # The `ci` parameter is embedded in the category URL.
                    ci = item["s_href"].split("-")[1]
                    next_url = next_url_temp.format(ci)
                    yield scrapy.Request(next_url,
                                         callback=self.parse_book_list,
                                         meta={"item": deepcopy(item)})

    def parse_book_list(self, response):
        """Parse one listing page: yield a detail request per book, then
        request the next page (each page is served in two halves)."""
        item = response.meta["item"]
        # Groups for the books on the listing page.
        # book_list = response.xpath("//p[@id='filter-results']/ul/li")
        book_list = response.xpath("//li[contains(@class,'product book')]")
        for book in book_list:
            # Title, relative detail URL, and shop name.
            item["book_name"] = book.xpath(".//p[@class='sell-point']/a/text()").extract_first()
            item["book_href"] = book.xpath(".//p[@class='sell-point']/a/@href").extract_first()
            item["book_store"] = book.xpath(".//p[@class='seller oh no-more ']/a/text()").extract_first()
            # response.follow resolves the relative URL against the page URL.
            yield response.follow(item["book_href"],
                                  callback=self.parse_book_detail,
                                  meta={"item": deepcopy(item)})
        # Pagination templates for the two halves of the next page.
        next_page_url_1 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010"
        next_page_url_2 = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAAB&id=IDENTIFYING&cc=010&paging=1&sub=0"
        ci = item["s_href"].split("-")[1]
        # Decode the body once instead of once per regex.
        body = response.body.decode()
        # Current and total page numbers embedded in the page's inline JS.
        current_page = re.findall(r'param.currentPage = "(.*?)";', body)[0]
        total_page = re.findall(r'param.pageNumbers = "(.*?)";', body)[0]
        # print(total_page, "*" * 30)
        # BUGFIX: the original used `while`, but `current_page` never changes
        # inside the loop, so it yielded the same two requests forever. A
        # single `if` advances one page per response, which is all that is
        # needed because each next page re-enters this callback.
        if int(current_page) < int(total_page):
            next_page_num = int(current_page) + 1
            # First half of the next page.
            yield scrapy.Request(next_page_url_1.format(ci, next_page_num),
                                 callback=self.parse_book_list,
                                 meta={"item": item})
            # Second half of the next page.
            yield scrapy.Request(next_page_url_2.format(ci, next_page_num),
                                 callback=self.parse_book_list,
                                 meta={"item": item})

    def parse_book_detail(self, response):
        """Parse a book detail page: assemble and request the price-API URL."""
        item = response.meta["item"]
        price_url_temp = "https://pas.suning.com/nspcsale_0_000000000{}_000000000{}_{}_10_010_0100101_226503_1000000_9017_10106_Z001___{}_{}___.html"
        # Decode the body once for both regex lookups.
        body = response.body.decode()
        # Pieces of the price URL: product id, shop id, and two fields from
        # the page's inline JS.
        p1 = response.url.split("/")[-1].split(".")[-2]
        p3 = response.url.split("/")[-2]
        p4 = re.findall(r'"catenIds":"(.*?)"', body)
        p5 = re.findall(r'"weight":"(.*?)"', body)
        # BUGFIX: only build the URL when both fields were found. The original
        # indexed `p5[0]` unconditionally (IndexError when "weight" is absent)
        # and formatted `p4` as a raw list when "catenIds" was missing.
        if p4 and p5:
            price_url = price_url_temp.format(p1, p1, p3, p4[0], p5[0])
            # Callback name keeps the original's "pirce" typo: it is a
            # self-consistent internal name and renaming would change the
            # spider's interface.
            yield scrapy.Request(price_url,
                                 callback=self.parse_book_pirce,
                                 meta={"item": item})

    def parse_book_pirce(self, response):
        """Extract the book price from the price-API response and yield the item."""
        item = response.meta["item"]
        price = re.findall(r'"netPrice":"(.*?)"', response.body.decode())
        if price:
            item["book_price"] = price[0]
        print(item)
        yield item

 

需要做网站?需要网络推广?欢迎咨询客户经理 13272073477