Python Baidu URL Collector
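
A small multi-threaded collector: for a given keyword it queues up Baidu search result pages, resolves each result link with requests, and writes the target URLs out to text files.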

import sys
import threading
import Queue
import requests
from bs4 import BeautifulSoup as bs
import re


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}


class Baidu_spider(threading.Thread):
    """Worker thread: pulls Baidu result-page URLs off the queue and crawls them."""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                # Non-blocking get: avoids hanging if another thread drains the queue
                # between our empty() check and get().
                urls = self.queue.get(block=False)
            except Queue.Empty:
                break
            try:
                spider(urls)
            except Exception, e:
                print e


def spider(url):
    # Fetch one Baidu results page and parse it.
    r = requests.get(url, headers=header)
    soup = bs(r.content, 'lxml')
    # Result links are <a> tags that carry a data-click attribute and no class.
    urls = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None})
    for i in urls:
        # Baidu wraps each result in a redirect link; requests follows the
        # redirect, so url_r.url is the real destination.
        url_r = requests.get(i['href'], headers=header)
        if url_r.status_code == 200:
            url_index = url_r.url.split('/')[0] + '//' + url_r.url.split('/')[2]  # scheme://host
            url_para = url_r.url  # full URL including parameters
            print url_index, '...'
            f2 = open('url_para.txt', 'a+')
            f2.write(url_para + '\n')
            f2.close()
            # Only record a site root we have not seen before.
            with open('url_index.txt') as f:
                if url_index not in f.read():
                    f1 = open('url_index.txt', 'a+')
                    f1.write(url_index + '\n')
                    f1.close()



def keyword(key):
    # Queue the result pages pn=0 through pn=100 in steps of 10 (11 pages).
    queue = Queue.Queue()
    for i in range(0, 101, 10):
        queue.put("https://www.baidu.com/s?wd=%s&pn=%s" % (key, str(i)))
    threads = []
    threading_count = 3
    for i in range(threading_count):
        threads.append(Baidu_spider(queue))
    for i in range(threading_count):
        threads[i].start()
    for i in range(threading_count):
        threads[i].join()



if __name__ == '__main__':
    # Truncate the output files left over from any previous run.
    f1 = open('url_index.txt', 'w')
    f1.close()
    f2 = open('url_para.txt', 'w')
    f2.close()
    if len(sys.argv) != 2:
        print "example: url_key.py keyword"
    else:
        keyword(sys.argv[1])
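

To run it, pass a single search keyword on the command line (the keyword below is just an illustration). The script collects the first eleven result pages, appends every resolved result URL to url_para.txt, and keeps a deduplicated list of site roots in url_index.txt:

python url_key.py test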


[Screenshot of a sample run]

The code was written by following along with a course on i春秋 (ichunqiu); both the approach and the content are easy to understand.

2017-06-04
