Scraping web pages with Python (how to crawl web content with Scrapy in Python)


The main spider program:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from zjf.FsmzItems import FsmzItem
from scrapy.selector import Selector

# Quanquan (圈圈): emotion & lifestyle channel
class MySpider(scrapy.Spider):
    # spider name
    name = "MySpider"
    # allowed domain
    allowed_domains = ["nvsheng.com"]
    # start URLs (filled in from the command line, see __init__ below)
    start_urls = []
    # flag: only follow pagination links from the first response
    x = 0

    # init: take the start URL as a runtime argument
    # command-line usage: scrapy crawl MySpider -a start_url="http://some_url"
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]

    # parse callback
    def parse(self, response):
        item = FsmzItem()
        sel = Selector(response)
        item['title'] = sel.xpath('//h2/text()').extract()
        item['text'] = sel.xpath('//*[@class="content"]/p/text()').extract()
        item['imags'] = sel.xpath('//p[@id="content"]/p/a/img/@src|//p[@id="content"]/p/img/@src').extract()
        # on the first response only, queue the remaining pages
        if MySpider.x == 0:
            page_list = self.getUrl(response)
            for page_single in page_list:
                yield Request(page_single)
        MySpider.x += 1
        yield item

    # collect the pagination links (every pager link except the "next" button)
    def getUrl(self, response):
        url_list = []
        select = Selector(response)
        page_list_tmp = select.xpath('//p[@class="viewnewpages"]/a[not(@class="next")]/@href').extract()
        for page_tmp in page_list_tmp:
            if page_tmp not in url_list:
                url_list.append("http://www.nvsheng.com/emotion/px/" + page_tmp)
        return url_list
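Before wiring XPath expressions like these into a spider, it is worth checking them interactively. Scrapy ships a shell for exactly this purpose; a quick sketch (the article URL below is a placeholder, not one from the original post):

scrapy shell "http://www.nvsheng.com/emotion/px/some-article.html"
>>> response.xpath('//h2/text()').extract()
>>> response.xpath('//*[@class="content"]/p/text()').extract()

If either expression returns an empty list here, the page markup differs from what the spider expects and the XPath needs adjusting first.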

The Pipelines class:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from zjf import settings
import json, os, re, random
import urllib.request
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder

class MyPipeline(object):
    flag = 1
    post_title = ''
    post_text = []
    post_text_imageUrl_list = []
    cs = []
    user_id = ''

    def __init__(self):
        # pick a random user_id to simulate the poster
        MyPipeline.user_id = MyPipeline.getRandomUser('37619,18441390,18441391')

    # process the data
    def process_item(self, item, spider):
        user_id = MyPipeline.user_id
        # join the body paragraphs into a single string
        text_str_tmp = ""
        for s in item['text']:
            text_str_tmp = text_str_tmp + s
        # take the title from the first page only
        if MyPipeline.flag == 1:
            title = item['title']
            MyPipeline.post_title = MyPipeline.post_title + title[0]
        # save each image locally, then upload it
        text_insert_pic = ''
        text_insert_pic_w = ''
        text_insert_pic_h = ''
        for imag_url in item['imags']:
            img_name = imag_url.replace('/', '').replace('.', '').replace('|', '').replace(':', '')
            pic_dir = settings.IMAGES_STORE + '%s.jpg' % (img_name)
            urllib.request.urlretrieve(imag_url, pic_dir)
            # the image upload endpoint returns JSON
            upload_img_result = MyPipeline.uploadImage(pic_dir, 'image/jpeg')
            # pull the stored image path and dimensions out of the JSON
            text_insert_pic = upload_img_result['result']['image_url']
            text_insert_pic_w = upload_img_result['result']['w']
            text_insert_pic_h = upload_img_result['result']['h']
        # assemble the JSON fragment for this page
        if MyPipeline.flag == 1:
            cs_json = {"c": text_str_tmp, "i": "", "w": text_insert_pic_w, "h": text_insert_pic_h}
        else:
            cs_json = {"c": text_str_tmp, "i": text_insert_pic, "w": text_insert_pic_w, "h": text_insert_pic_h}
        MyPipeline.cs.append(cs_json)
        MyPipeline.flag += 1
        return item

    # called when the spider opens
    def open_spider(self, spider):
        pass

    # called when the spider closes
    def close_spider(self, spider):
        strcs = json.dumps(MyPipeline.cs)
        jsonData = {"apisign": "99ea3eda4b45549162c4a741d58baa60",
                    "user_id": MyPipeline.user_id,
                    "gid": 30,
                    "t": MyPipeline.post_title,
                    "cs": strcs}
        MyPipeline.uploadPost(jsonData)

    # upload an image
    @staticmethod
    def uploadImage(img_path, content_type):
        """uploadImage function"""
        # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage"
        UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage"
        m = MultipartEncoder(
            fields={'user_id': MyPipeline.user_id,
                    'apisign': '99ea3eda4b45549162c4a741d58baa60',
                    'image': ('filename', open(img_path, 'rb'), 'image/jpeg')})
        r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type})
        return r.json()

    # create the post
    @staticmethod
    def uploadPost(jsonData):
        CREATE_POST_URL = "http://api.douguo.net/robot/uploadimagespost"
        reqPost = requests.post(CREATE_POST_URL, data=jsonData)

    # pick a random user id from a comma-separated string
    @staticmethod
    def getRandomUser(userStr):
        user_list = str(userStr).split(',')
        userId_idx = random.randint(1, len(user_list))
        return user_list[userId_idx - 1]
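As the header comment says, the pipeline must be registered under ITEM_PIPELINES, and it also reads IMAGES_STORE from the project settings when saving images. A minimal sketch of the relevant settings.py entries, assuming the project module is zjf and the pipeline lives in zjf/pipelines.py (both the module path and the priority value here are assumptions):

# settings.py (sketch: module path and priority value are assumptions)
ITEM_PIPELINES = {
    'zjf.pipelines.MyPipeline': 300,   # lower numbers run earlier
}
# process_item concatenates IMAGES_STORE with the file name,
# so the value should end with a path separator
IMAGES_STORE = 'D:/pics/'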

The Items class that holds the scraped fields:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class FsmzItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    # tutor = scrapy.Field()
    # strongText = scrapy.Field()
    text = scrapy.Field()
    imags = scrapy.Field()
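Scrapy items behave like dictionaries, which is what lets the spider write fields by name and the pipeline read them back. A quick sketch (the field values below are made up for illustration):

from zjf.FsmzItems import FsmzItem

item = FsmzItem()
item['title'] = ['Some article title']                    # hypothetical values
item['text'] = ['First paragraph.', 'Second paragraph.']
item['imags'] = ['http://img.nvsheng.com/uploads/a.jpg']
print(dict(item))  # items convert cleanly to plain dicts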

Type the following at the command line:

scrapy crawl MySpider -a start_url="http://www.aaa.com"
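Scrapy forwards every -a name=value pair to the spider's __init__ as a keyword argument, which is why kwargs.get('start_url') in MySpider picks the value up. Note that the value must be a full URL including the http:// scheme, since the spider puts it into start_urls unchanged.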