采集代理ip 地址【西刺,快代理】

发布时间:2025-12-10 11:31:46 浏览次数:2

# 因为经常需要使用代理去抓取一些东西,所以写了下面这段代码。第一版不是很好,后面用到了再来优化。

# Scrape high-anonymity proxies from xicidaili.com and store the fast ones
# (response time under one second) in the MySQL table `ip`.
import re
import time
from urllib.request import Request
from urllib.request import urlopen

import pymysql
import redis

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# One table row of the listing: (ip, port, protocol, response time in seconds).
# Raw strings keep the regex escapes intact, and the decimal point is escaped
# so that only real decimal numbers reach float().
PROXY_ROW_RE = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+'
    r'<a href="/.*?">[\u4e00-\u9fa5]+</a>\s+</td>\s+'
    r'<td class="country">高匿</td>\s+<td>(\w+)</td>\s+'
    r'<td class="country">\s+<p title="(\d+\.\d+)秒"'
)

db = pymysql.connect(host='127.0.0.1', user='root', password="liu", database='test', port=3306, charset='utf8')
cur = db.cursor()


def url_response(url, cur):
    """Fetch one listing page and insert its fast, not-yet-stored proxies.

    Args:
        url: Address of the listing page to download.
        cur: Open database cursor used for the duplicate check and the
            insert. Committing is left to the caller.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(Request(url, headers=headers)) as resp:
        html = resp.read().decode()

    for row in PROXY_ROW_RE.findall(html):
        host, port, protocol, seconds = row
        ip_ = host + ':' + port
        # Scraped text is untrusted: let the driver quote the parameters
        # instead of interpolating them into the SQL string.
        cur.execute('select ip_ from ip where ip_ = %s', (ip_,))
        if cur.fetchone():
            print('重复数据跳过')
            continue
        out_time = float(seconds)
        if out_time < 1:
            cur.execute(
                'insert into ip(ip_,time_,xy_) values(%s,%s,%s)',
                (ip_, str(out_time), protocol),
            )
            print('插入成功,', row)


try:
    for i in range(1, 3):
        url_response(url + str(i), cur)
        db.commit()
        # Pause between pages to avoid hammering the site.
        time.sleep(2)
finally:
    # Release the cursor and the connection even if a page fails.
    cur.close()
    db.close()

 

# Storing the first version's results in MySQL was not very useful, so this
# version switches to Redis.
# A thread pool is added to speed up the crawl.
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request
from urllib.request import urlopen

import redis

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# One table row of the listing: (ip, port, protocol, response time in seconds).
# Raw strings keep the regex escapes intact, and the decimal point is escaped
# so that only real decimal numbers reach float().
PROXY_ROW_RE = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+'
    r'<a href="/.*?">[\u4e00-\u9fa5]+</a>\s+</td>\s+'
    r'<td class="country">高匿</td>\s+<td>(\w+)</td>\s+'
    r'<td class="country">\s+<p title="(\d+\.\d+)秒"'
)


class R(object):
    """Small wrapper around a pooled connection to the local Redis server."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        """Store `value` under `name`, expiring after `time` seconds."""
        # Passed by keyword so the TTL and the value cannot be swapped:
        # redis-py's positional order is (name, time, value), not the
        # (name, value, time) order of this wrapper.
        return self.redis_obj.setex(name=name, time=time, value=value)

    def get(self, name):
        """Return the value stored under `name` (None when absent)."""
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Fetch one listing page and cache its fast, not-yet-seen proxies.

    Args:
        url: Address of the listing page to download.
        redis_obj: Object exposing `get(name)` and `setex(name, value, time)`,
            such as an `R` instance.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(Request(url, headers=headers)) as resp:
        html = resp.read().decode()

    for host, port, _protocol, seconds in PROXY_ROW_RE.findall(html):
        ip_ = host + ':' + port
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if float(seconds) < 1:
            # Keep the proxy for 30 minutes.
            redis_obj.setex(ip_, 1, 60 * 30)
            print('插入成功,', ip_)


r = R()
futures = []
# Leaving the `with` block waits for every submitted page to finish.
with ThreadPoolExecutor(4) as T:
    for i in range(1, 5):
        page_url = url + str(i)
        futures.append((page_url, T.submit(url_response, page_url, r)))
# A failure inside a worker is otherwise lost silently; report it.
for page_url, future in futures:
    error = future.exception()
    if error is not None:
        print('抓取失败,', page_url, error)
print('执行完成 ')
# Caption: update, second version - uses Redis (xicidaili).

 



 

2018-12-17:

  第二版出错更新:第 15 行应改为 return self.redis_obj.setex(name, time, value)(redis-py 的 setex 参数顺序为 name、time、value)。

# python 3.7
import time

import redis
import requests
from lxml import etree


class Kuai_IP(object):
    """Scrape the kuaidaili.com free proxy list and cache entries in Redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/',
        }
        # URL template of a listing page; %s is the page number.
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1')
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Download listing page `page_index` and pass its HTML to parse()."""
        # The Referer is set to the listing root for page 1 and to the
        # previous page otherwise.
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Extract the proxy rows from the HTML text `res` and store them."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if not r_list:
            print(r_list)
            return
        # The flat cell list is consumed seven cells per row; cell 0 is used
        # as the host, cell 1 as the port and cell 3 as the scheme.
        # Start at offset 0: starting at row 1 dropped the first proxy of
        # every page.
        for start in range(0, len(r_list) // 7 * 7, 7):
            row = r_list[start:start + 7]
            key = row[3] + '://' + row[0] + ':' + row[1]
            # Keywords keep the TTL (30 seconds) and the value from being
            # swapped by a different positional order.
            self.r.setex(name=key, time=30, value=row[3])
            print(row)

    def work_on(self):
        """Crawl the listing pages one after another."""
        page_index = 2  # number of pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            # Pause between pages to avoid hammering the site.
            time.sleep(2)


if __name__ == '__main__':
    ip = Kuai_IP()
    ip.work_on()
# Caption: kuaidaili proxy IPs - no filtering applied.

 


 


 

2018-12-20

  嗯,这次由于用到的代理比较多,就把西刺和快代理的代码合到了一起,没做什么大的改进,

  1 代理ip格式全部成为 requests代理的形式{'http':'xxx://xx.xx.xx.xx:xxx'} 方便requests的调用

 

# -*- coding:utf-8 -*-
# @time:2018-12-20 22:23
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request
from urllib.request import urlopen

import redis
import requests
from lxml import etree

# Number of pages to crawl. A kuaidaili page lists fewer IPs, so that crawler
# adds 10 to this number.
page = 10
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# One table row of the xicidaili listing:
# (ip, port, protocol, response time in seconds).
# Raw strings keep the regex escapes intact, and the decimal point is escaped
# so that only real decimal numbers reach float().
PROXY_ROW_RE = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+'
    r'<a href="/.*?">[\u4e00-\u9fa5]+</a>\s+</td>\s+'
    r'<td class="country">高匿</td>\s+<td>(\w+)</td>\s+'
    r'<td class="country">\s+<p title="(\d+\.\d+)秒"'
)


class R(object):
    """Small wrapper around a pooled connection to the local Redis server."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        """Store `value` under `name`, expiring after `time` seconds."""
        # Passed by keyword so the TTL and the value cannot be swapped:
        # redis-py's positional order is (name, time, value), not the
        # (name, value, time) order of this wrapper.
        return self.redis_obj.setex(name=name, time=time, value=value)

    def get(self, name):
        """Return the value stored under `name` (None when absent)."""
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Fetch one xicidaili page and cache its fast, not-yet-seen proxies.

    Args:
        url: Address of the listing page to download.
        redis_obj: Object exposing `get(name)` and `setex(name, value, time)`,
            such as an `R` instance.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(Request(url, headers=headers)) as resp:
        html = resp.read().decode()

    for host, port, _protocol, seconds in PROXY_ROW_RE.findall(html):
        # Build the stored key first so the duplicate check looks up the same
        # key that is written; checking the bare "ip:port" never matched.
        ip_ = "HTTP://" + host + ':' + port
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if float(seconds) < 1:
            redis_obj.setex(ip_, 1, 60 * 30 * 20)
            print('插入成功,', ip_)


class Kuai_IP(object):
    """Scrape the kuaidaili.com free proxy list and cache entries in Redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/',
        }
        # URL template of a listing page; %s is the page number.
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Download listing page `page_index` and pass its HTML to parse()."""
        # The Referer is set to the listing root for page 1 and to the
        # previous page otherwise.
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Extract the proxy rows from the HTML text `res` and store them."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if not r_list:
            print(r_list)
            return
        # The flat cell list is consumed seven cells per row; cell 0 is used
        # as the host, cell 1 as the port and cell 3 as the scheme.
        # Start at offset 0: starting at row 1 dropped the first proxy of
        # every page.
        for start in range(0, len(r_list) // 7 * 7, 7):
            row = r_list[start:start + 7]
            key = row[3] + '://' + row[0] + ':' + row[1]
            # The TTL must be the integer. Passing the scheme string in the
            # TTL position made Redis reject the command with
            # "value is not an integer or out of range".
            self.r.setex(name=key, time=30 * 60 * 24, value=row[3])
            print(row)

    def work_on(self):
        """Crawl the listing pages one after another."""
        page_index = page + 10  # number of pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            # Pause between pages to avoid hammering the site.
            time.sleep(2)


if __name__ == '__main__':
    r = R()
    futures = []
    # Leaving the `with` block waits for every submitted page to finish.
    with ThreadPoolExecutor(4) as T:
        for i in range(1, page):
            page_url = url + str(i)
            futures.append((page_url, T.submit(url_response, page_url, r)))
    # A failure inside a worker is otherwise lost silently; report it.
    for page_url, future in futures:
        error = future.exception()
        if error is not None:
            print('抓取失败,', page_url, error)
    print('执行完成 ')
    ip = Kuai_IP()
    ip.work_on()
# Caption: third version, both sources merged; close to 500 IPs in total,
# which should be enough.

 



 

 

2018-12-24

  嗯,上面代码在第 80 行出现错误:因为快代理的这个超时时间有时出现的是数字+文字,然后在设置缓存的时候就出现了【value is not an integer or out of range】。

# -*- coding:utf-8 -*-
# @time:2018-12-18 22:23
# @Auther:1043453579@qq.com
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request
from urllib.request import urlopen

import redis
import requests
from lxml import etree

# Number of pages to crawl; the kuaidaili crawler adds 10 to this number.
page = 10
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# One table row of the xicidaili listing:
# (ip, port, protocol, response time in seconds).
# Raw strings keep the regex escapes intact, and the decimal point is escaped
# so that only real decimal numbers reach float().
PROXY_ROW_RE = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+'
    r'<a href="/.*?">[\u4e00-\u9fa5]+</a>\s+</td>\s+'
    r'<td class="country">高匿</td>\s+<td>(\w+)</td>\s+'
    r'<td class="country">\s+<p title="(\d+\.\d+)秒"'
)


class R(object):
    """Small wrapper around a pooled connection to the local Redis server."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        """Store `value` under `name`, expiring after `time` seconds."""
        # Passed by keyword so the TTL and the value cannot be swapped.
        return self.redis_obj.setex(name=name, time=time, value=value)

    def get(self, name):
        """Return the value stored under `name` (None when absent)."""
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Fetch one xicidaili page and cache its fast, not-yet-seen proxies.

    Args:
        url: Address of the listing page to download.
        redis_obj: Object exposing `get(name)` and `setex(name, value, time)`,
            such as an `R` instance.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(Request(url, headers=headers)) as resp:
        html = resp.read().decode()

    for host, port, _protocol, seconds in PROXY_ROW_RE.findall(html):
        # Build the stored key first so the duplicate check looks up the same
        # key that is written; checking the bare "ip:port" never matched.
        ip_ = "HTTP://" + host + ':' + port
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if float(seconds) < 1:
            redis_obj.setex(ip_, 1, 60 * 30 * 20)
            print('插入成功,', ip_)


class Kuai_IP(object):
    """Scrape the kuaidaili.com free proxy list and cache entries in Redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/',
        }
        # URL template of a listing page; %s is the page number.
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Download listing page `page_index` and pass its HTML to parse()."""
        # The Referer is set to the listing root for page 1 and to the
        # previous page otherwise.
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Extract the proxy rows from the HTML text `res` and store them."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if not r_list:
            print(r_list)
            return
        # The flat cell list is consumed seven cells per row; cell 0 is used
        # as the host, cell 1 as the port and cell 3 as the scheme.
        # Start at offset 0: starting at row 1 dropped the first proxy of
        # every page.
        for start in range(0, len(r_list) // 7 * 7, 7):
            row = r_list[start:start + 7]
            print('1', row)
            key = row[3] + '://' + row[0] + ':' + row[1]
            # The proxy URL lives in the key; the stored value is simply the
            # same number as the TTL.
            self.r.setex(name=key, time=30 * 60 * 24, value=30 * 60 * 24)

    def work_on(self):
        """Crawl the listing pages one after another."""
        page_index = page + 10  # number of pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            # Pause between pages to avoid hammering the site.
            time.sleep(2)


r = R()
futures = []
# Leaving the `with` block waits for every submitted page to finish.
with ThreadPoolExecutor(4) as T:
    for i in range(1, page):
        page_url = url + str(i)
        futures.append((page_url, T.submit(url_response, page_url, r)))
# A failure inside a worker is otherwise lost silently; report it.
for page_url, future in futures:
    error = future.exception()
    if error is not None:
        print('抓取失败,', page_url, error)
print('执行完成 ')

ip = Kuai_IP()
ip.work_on()
# Caption: fourth version, merged; 500+ IPs, which should be enough.

 

转载于:https://www.cnblogs.com/Skyda/p/9706315.html

需要做网站?需要网络推广?欢迎咨询客户经理 13272073477