多线程爬虫实现

优化爬取糗事百科段子的爬虫

(代码清单如下)
import threading
import time
import requests
from lxml import etree
import json
from queue import Queue
class QiushiSpider:
    """Multi-threaded crawler for Qiushibaike joke pages.

    Pipeline of three daemon worker threads connected by queues:
        url_queue -> parse_url (fetch) -> html_queue
                  -> get_content_list (parse) -> content_queue
                  -> save_content_list (write qiushi.txt)
    run() fills the URL queue, starts the workers, and blocks until
    every queue has been fully processed.
    """

    def __init__(self):
        print(time.time())
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',}
        self.url_temp = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.url_queue = Queue()      # page URLs waiting to be fetched
        self.html_queue = Queue()     # decoded HTML waiting to be parsed
        self.content_queue = Queue()  # parsed item lists waiting to be saved

    def get_url_list(self):
        """Enqueue the 13 list-page URLs (pages 1..13) onto url_queue."""
        for page in range(1, 14):
            self.url_queue.put(self.url_temp.format(page))

    def parse_url(self):
        """Worker: fetch each queued URL and enqueue the decoded response body."""
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        """Worker: parse queued HTML into a list of item dicts, enqueue the list.

        Each item carries: content (text lines), author_gender, author_age,
        content_img, author_img. Missing fields become None.
        """
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []
            for div in div_list:
                item = {}
                item['content'] = [i.replace('\n', '') for i in div.xpath('.//div[@class="content"]/span/text()')]
                # class looks like "articleGender womenIcon" -> gender is "women"
                gender = div.xpath('.//div[contains(@class,"articleGender")]/@class')
                item['author_gender'] = gender[0].split(' ')[-1].replace('Icon', '') if gender else None
                age = div.xpath('.//div[contains(@class,"articleGender")]/text()')
                item['author_age'] = age[0] if age else None
                # image src attributes are protocol-relative; prepend the scheme
                img = div.xpath('.//div[@class="thumb"]/a/img/@src')
                item['content_img'] = 'https:' + img[0] if img else None
                avatar = div.xpath('.//div[@class="author clearfix"]/a/img/@src')
                item['author_img'] = 'https:' + avatar[0] if avatar else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Worker: append each parsed item to qiushi.txt as one JSON line."""
        while True:
            content_list = self.content_queue.get()
            # Open the file once per batch instead of once per item.
            with open('qiushi.txt', 'a', encoding='utf-8') as f:
                for item in content_list:
                    f.write(json.dumps(item, ensure_ascii=False))
                    f.write('\n')
            self.content_queue.task_done()

    def run(self):
        """Start the worker threads and block until all queues are drained.

        Fix: the original ran get_url_list in its own thread, so the
        url_queue.join() below could be reached while the queue was still
        empty (zero unfinished tasks) and return immediately, letting the
        program exit before any page was crawled. Filling the queue
        synchronously first closes that race.
        """
        self.get_url_list()
        workers = [
            threading.Thread(target=self.parse_url),
            threading.Thread(target=self.get_content_list),
            threading.Thread(target=self.save_content_list),
        ]
        for t in workers:
            t.daemon = True  # Thread.setDaemon() is deprecated since Python 3.10
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print(time.time())


if __name__ == '__main__':
    # Build the spider and run the full crawl pipeline to completion.
    spider = QiushiSpider()
    spider.run()
    print('完成噜!!!')
文章作者: Mr joe
文章链接: http://mrjoe.cc/2018/06/24/多线程爬虫实现/
版权声明: 本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Mrjoe的博客
欢迎关注哦