import sys
import threading
import Queue
import requests
from bs4 import BeautifulSoup as bs
import re
# Desktop Chrome User-Agent so Baidu serves the normal HTML results page
# instead of blocking or throttling the default python-requests UA.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
class Baidu_spider(threading.Thread):
    """Worker thread that pulls result-page URLs off a shared queue and scrapes them."""

    def __init__(self, queue):
        """queue: Queue.Queue of Baidu search-result page URLs to crawl."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        # Drain the queue without blocking.  The original empty()-then-get()
        # pattern is racy with multiple workers: another thread can take the
        # last item between the two calls and leave this one blocked forever.
        # A non-blocking get that treats Empty as "done" has no such window.
        while True:
            try:
                url = self.queue.get(block=False)
            except Queue.Empty:
                break
            try:
                spider(url)
            except Exception as e:
                # Best-effort crawl: report the failure and move on.
                print(e)
def spider(url):
    """Fetch one Baidu result page, follow each result link, and record URLs.

    Appends every successfully resolved result URL to url_para.txt and each
    newly seen scheme://host root to url_index.txt (which main pre-creates).

    NOTE(review): the read-check-append on url_index.txt is not atomic, so
    concurrent worker threads can still record duplicate hosts occasionally.
    """
    r = requests.get(url, headers=header)
    soup = bs(r.content, 'lxml')
    # Organic results carry a 'data-click' attribute; requiring class=None
    # filters out ads/navigation links in Baidu's markup.
    links = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None})
    for link in links:
        href = link.get('href')
        if not href:
            # Skip anchors without a target instead of raising KeyError.
            continue
        url_r = requests.get(href, headers=header)
        if url_r.status_code != 200:
            continue
        # Baidu links redirect; url_r.url is the final destination.
        parts = url_r.url.split('/')
        url_index = parts[0] + '//' + parts[2]  # scheme://host
        print('%s ...' % url_index)
        # with-statements guarantee the handles close even if a write fails.
        with open('url_para.txt', 'a+') as f_para:
            f_para.write(url_r.url + '\n')
        with open('url_index.txt') as f_seen:
            seen = f_seen.read()
        if url_index not in seen:
            with open('url_index.txt', 'a+') as f_idx:
                f_idx.write(url_index + '\n')
def keyword(key, thread_count=3):
    """Crawl the first 11 Baidu result pages for *key* with worker threads.

    key: the search keyword (inserted into the query string as-is; callers
         should pre-escape anything that is not URL-safe).
    thread_count: number of Baidu_spider workers (default 3, as before).
    """
    work = Queue.Queue()
    # Baidu pages results 10 per page via the 'pn' offset: 0, 10, ..., 100.
    for offset in range(0, 101, 10):
        work.put("https://www.baidu.com/s?wd=%s&pn=%s" % (key, str(offset)))
    workers = [Baidu_spider(work) for _ in range(thread_count)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
if __name__ == '__main__':
    # Validate arguments BEFORE touching the output files: the original
    # truncated both files first, so a bad invocation wiped previous results.
    if len(sys.argv) != 2:
        print("example:url_key.py keyword")
    else:
        # Truncate (or create) both output files for a fresh run; spider()
        # expects url_index.txt to exist before it reads it.
        open('url_index.txt', 'w').close()
        open('url_para.txt', 'w').close()
        keyword(sys.argv[1])
# Screenshot of the script in action (运行效果图 — see the original post).
# This code was written following a course on i春秋 (ichunqiu); the approach
# and content are straightforward to understand.