# -*- coding: utf-8 -*-
"""Free-proxy scrapers (recovered from a scraped blog post titled "IP代理").

Each IPspiderN function scrapes a public proxy-list site and builds
requests-style proxy dicts ({'http': 'host:port', 'https': 'host:port'}).
IPspider2/IPspider3 additionally probe each proxy against a test URL and
keep only the ones that respond.

NOTE(review): the target sites (proxy360.cn, xicidaili.com) are long dead,
so these functions are of historical interest; the structure is kept as-is.
"""
import re

import requests
from bs4 import BeautifulSoup


def IPspider1():
    """Scrape proxy360.cn and return a list of proxy dicts (no validation)."""
    response = requests.get('http://www.proxy360.cn/default.aspx')
    soup = BeautifulSoup(response.text, 'lxml')
    proxyList = []
    for item in soup.find_all('div', class_='proxylistitem'):
        # First two 'tbBottomLine' spans in each row are host and port.
        spans = item.find_all('span', class_='tbBottomLine')
        addr = spans[0].text.strip() + ':' + spans[1].text.strip()
        proxyList.append({'http': addr, 'https': addr})
    return proxyList


def IPspider2(url):
    """Scrape the xicidaili-style table at *url*, probe each proxy against
    httpbin.org, print the result for each, and return the working ones.

    :param url: listing page whose first <table> holds rows of class 'odd'
                with host in td[1] and port in td[2].
    :return: list of proxy dicts that answered the probe with HTTP 200.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.text, 'lxml')
    iptable = soup.find('table')
    proxyList = []
    ips = iptable.find_all('tr', class_='odd')
    # Fix: the original reassigned the *url* parameter inside the loop;
    # use a separate name for the probe endpoint.
    check_url = "http://httpbin.org/ip"
    for row in ips:
        tds = row.find_all('td')  # hoisted: was re-queried for every access
        addr = tds[1].text.strip() + ':' + tds[2].text.strip()
        proxy = {'http': addr, 'https': addr}
        try:
            response = requests.get(check_url, proxies=proxy, timeout=5)
            if response.status_code == requests.codes.ok:
                print('http://' + addr + "有效的IP地址")
                # Fix: original printed response.encoding.text, which raises
                # AttributeError (encoding is a str); the body was intended.
                print(response.text)
                proxyList.append(proxy)
            else:
                print('http://' + addr + "无效的IP地址")
        except requests.RequestException:
            # Fix: bare `except:` swallowed everything (even KeyboardInterrupt)
            # and masked the AttributeError above; only skip network failures.
            continue
    print(proxyList)
    print("本次从西刺代理爬取{}个代理IP".format(len(ips)))
    return proxyList


def IPspider3(url):
    """Like IPspider2 but reuses one session and validates against ip138;
    stops after collecting 10 working proxies.

    :param url: listing page with <tr> rows whose class matches (odd)|().
    :return: list of up to 10 proxy dicts that answered the probe.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    session = requests.session()
    page = session.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    proxyList = []
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    # Fix: the original URL string contained literal single quotes
    # ("'http://...'"), an invalid URL that made every probe fail silently.
    check_url = "http://1212.ip138.com/ic.asp"
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        addr = tdlist[1].string + ':' + tdlist[2].string
        proxy = {'http': addr, 'https': addr}
        try:
            session.get(check_url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 10:
                break
        except requests.RequestException:  # fix: was a bare except
            continue
    print(proxyList)
    return proxyList


if __name__ == "__main__":
    # Guarded so importing this module no longer fires a network request.
    # IPspider1()
    IPspider2("http://www.xicidaili.com/")
    # IPspider3("http://www.xicidaili.com/nn")
# Comments: none (residual page footer from the scraped source)