Crawling a Multi-Level Photo Gallery Site with Multiprocessing and Multithreading

  I spend a lot of time wandering around the web and run into all kinds of sites. I "happened" to come across one with no ads and a huge collection of photo sets of pretty girls; after browsing for a while I couldn't help grabbing a tissue... to wipe down my screen.

  For a developer who loves scraping data, a site like this is hard to resist. A quick look showed plenty of categories, loads of photo sets under each one, and somewhere around several hundred thousand images overall. What was I waiting for? Crawl it! Crawl it until the hard drive is full!!!

  After inspecting a number of pages I worked out the overall structure of the site and sketched it as the mind map below.

  The crawl path is: category index (all categories) -> a single category -> the photo sets under that category (paginated) -> the images inside each set (paginated). For a site with this many levels and this many pages, the only option is to dig down level by level, looping through the pagination at each layer.
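  In code terms that walk is just four nested loops over paginated listings. The sketch below only illustrates that shape; it reuses the CSS classes that appear in the full script further down (jigou, hezi, pages, content) but glosses over the pagination quirks (relative links, the special first-page URL) that the real version handles.

import requests
from bs4 import BeautifulSoup


def get_soup(url):
    resp = requests.get(url, timeout=10)
    resp.encoding = 'GBK'
    return BeautifulSoup(resp.text, 'html.parser')


def links(url, selector):
    # href of every <a> matched by the given CSS selector
    return [a.get('href') for a in get_soup(url).select(selector)]


def iter_image_urls(root_url):
    for category in links(root_url + '/html/3/', 'div.jigou li a'):      # level 1: all categories
        for listing in links(category, 'div#pages a'):                    # level 2: a category's listing pages
            for album in links(listing, 'div.hezi li a'):                 # level 3: photo sets on one page
                for img_page in links(album, 'div#pages a'):              # level 4: pages inside one set
                    for img in get_soup(img_page).select('div.content img'):
                        yield img.get('src')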

  With the plan settled, the code came together quickly. Below is the first version, without any multithreading: images are downloaded one at a time, each one has to finish before the next starts, and the site itself responds slowly (traffic throttling?), so every single image takes a long time.

from saveDB import conndb
import requests
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import uuid

# URL masked to keep the lawyers away
root_url = 'https://www.******.net'
img_path = 'F:\\reptile\\meizi'


def parse_(url):
    # Fetch the URL and parse the response
    req = requests.get(url)
    # Set the encoding; to check a site's encoding in the browser: F12, type document.characterSet in the console and press Enter
    req.encoding = 'GBK'
    # Parse the whole page
    soup = BeautifulSoup(req.text, 'html.parser')
    return soup


# Replace characters that Windows does not allow in folder names with underscores
def validateTitle(title):
    pattern = r'[\\/:*?"<>|\r\n]+'
    new_title = re.sub(pattern, "_", title)
    return new_title


# Crawl structure:
# category index (all categories) --> single category --> photo sets in the category (paginated) --> images in each set (paginated)
if __name__ == '__main__':
    conn, cur = conndb.conn_db()
    # Category index page
    url = '{}/html/3/'.format(root_url)
    soup = parse_(url)

    find_all = soup.find_all(name='div', attrs={"class": "jigou"})[0]('li')

    for i in find_all:
        group_url = i(['a'])[0].get('href')
        b = parse_(group_url)
        b_find_all = b.find_all(name='div', attrs={"class": "hezi"})[0]('li')

        for x in b_find_all:
            img_group_url = x(['a'])[0].get('href')
            img_index = x(['a'])[0]('img')[0].get('src')
            img_title1 = x(['a'])[1].text.strip()

            beautiful_soup = parse_(img_group_url)
            strip2 = beautiful_soup.find_all(name='div', attrs={"id": "pages"})[0]('a')
            member = []
            for pp in strip2:
                member.append(pp)
            member2 = member[2:len(member) - 1]
            member2.append(1)

            uuid1 = uuid.uuid4()

            staA = conndb.exe_update(cur,
                                     "insert into meizitu_list_1 (uuid,title,img_id, thumb_src_min) values('%s', '%s', '%s','%s')" % (
                                         uuid1, img_title1, '999999', img_index
                                     ))
            print('11:{} saved'.format(img_title1))

            img_count = 0
            for p in member2:
                if p == 1:
                    img_detail_url = img_group_url
                else:
                    img_detail_url = '{}{}'.format(group_url, p.get('href'))
                parse_1 = parse_(img_detail_url)
                content_ = parse_1.find_all(name='div', attrs={"class": "content"})[0]('img')

                title = validateTitle(img_title1)

                img_path2 = img_path + '\\' + title
                if not os.path.exists(img_path2):
                    os.makedirs(img_path2)

                for img in content_:
                    img_count += 1
                    img_url = img.get('src')
                    img_title = img.get('alt')
                    try:
                        uuid2 = uuid.uuid4()
                        satB = conndb.exe_update(cur,
                                                 "insert into meizitu_datail_1 (uuid,img_id,img_url,img_title) values ('%s', '%s','%s', '%s') " % (
                                                     uuid2, uuid1, img_url, img_title
                                                 ))

                        print('22:{}{} saved'.format(img_detail_url, img_title))

                        res = requests.get(img_url)
                        with open("{}\\{}{}".format(img_path2, img_count, '.jpg'), 'wb') as f:
                            f.write(res.content)
                    except:
                        print('------------------------------------------------------>')

    conndb.conn_close(conn, cur)
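  One thing the listing above does not do is guard against the slow, occasionally stalling responses mentioned earlier: every requests.get opens a fresh connection and has no timeout. A small optional improvement (not part of the original script) is a shared Session with a timeout and automatic retries, roughly like this:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session: connection reuse plus automatic retries against a slow server
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})
session.mount('https://', HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))


def fetch(url):
    # (connect timeout, read timeout) so one stalled image never blocks the crawl forever
    return session.get(url, timeout=(5, 30))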

  Below is the version reworked after some initial study of multithreading. Pardon the rough edges. With the change, all the images on a page download at the same time and the speed jumped roughly 8x. Why 8x? Because the site shows 8 images per page, and downloading is I/O-bound, so one thread per image keeps all of them busy.

import requests
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import uuid
import threading
import multiprocessing

# URL masked to keep the lawyers away
root_url = 'https://www.******.net'
img_path = 'F:\\reptile\\meizi'


def parse_(url):
    # Fetch the URL and parse the response
    req = requests.get(url)
    # Set the encoding; to check a site's encoding in the browser: F12, type document.characterSet in the console and press Enter
    req.encoding = 'GBK'
    # Parse the whole page
    soup = BeautifulSoup(req.text, 'html.parser')
    return soup


# Replace characters that Windows does not allow in folder names with underscores
def validateTitle(title):
    pattern = r'[\\/:*?"<>|\r\n]+'
    new_title = re.sub(pattern, "_", title)
    return new_title


# One page of a photo set (a set spans multiple pages)
def page_url(p, img_group_url, group_url, img_title1, img_path):
    if p == 1:
        img_detail_url = img_group_url
    else:
        img_detail_url = '{}{}'.format(group_url, p.get('href'))
    parse_1 = parse_(img_detail_url)
    content_ = parse_1.find_all(name='div', attrs={"class": "content"})[0]('img')

    title = validateTitle(img_title1)

    img_path2 = img_path + '\\' + title
    if not os.path.exists(img_path2):
        os.makedirs(img_path2)

    # Spawn one thread per image on this page and download them all concurrently
    threads = []
    for img in content_:
        img_url = img.get('src')
        img_title = img.get('alt')
        t = threading.Thread(target=download_img, args=(img_detail_url, img_url, img_path2, img_title))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


# Parse a single photo set (one <li> from a listing page)
def type_url(x):
    img_group_url = x(['a'])[0].get('href')
    img_title1 = x(['a'])[1].text.strip()

    beautiful_soup = parse_(img_group_url)
    strip2 = beautiful_soup.find_all(name='div', attrs={"id": "pages"})[0]('a')
    member = []
    for pp in strip2:
        member.append(pp)
    member2 = member[2:len(member) - 1]
    member2.append(1)

    print('11:{} started saving'.format(img_title1))

    # Submitting each page to the process pool is commented out: the image server responds
    # slowly, downloads are slow, tasks keep piling up, and past a certain amount it may also
    # hit stack/queue overflow errors
    for p in member2:
        # pool.apply_async(func=page_url, args=(p, img_group_url, group_url, img_title1, img_path))
        page_url(p, img_group_url, group_url, img_title1, img_path)


def download_img(img_detail_url, img_url, img_path2, img_title):
    try:
        uuid2 = uuid.uuid4()
        print('22:{}{} saved'.format(img_detail_url, img_title))
        res = requests.get(img_url)
        with open("{}\\{}{}".format(img_path2, uuid2, '.jpg'), 'wb') as f:
            f.write(res.content)
    except:
        print('------------------------------------------------------>')


# One specific listing page of photo sets within a category
def type_page_url(url):
    beautiful_soup = parse_(url)
    b_find_all = beautiful_soup.find_all(name='div', attrs={"class": "hezi"})[0]('li')

    # Loop over every photo set on this listing page
    for x in b_find_all:
        type_url(x)


# Crawl structure:
# category index (all categories) --> single category --> photo sets in the category (paginated) --> images in each set (paginated)
if __name__ == '__main__':
    # Process pool, set to 4 worker processes here
    pool = multiprocessing.Pool(processes=4)
    # Category index page
    url = '{}/html/3/'.format(root_url)
    soup = parse_(url)

    # Parse the category index and collect the category links
    find_all = soup.find_all(name='div', attrs={"class": "jigou"})[0]('li')

    # Loop over every category
    for i in find_all:
        group_url = i(['a'])[0].get('href')
        b = parse_(group_url)
        # Find how many listing pages of photo sets this category has
        strip2 = b.find_all(name='div', attrs={"id": "pages"})[0]('a')
        member = []
        for pp in strip2:
            member.append(pp)
        member2 = member[2:len(member) - 3]
        member2.append(1)

        # Fetch the photo sets on each listing page
        for p in member2:
            # 1 marks the first page of the category; build its URL explicitly
            if p == 1:
                num = re.findall(r"\d+\.?\d*", group_url)
                url = '{}list_{}_1.html'.format(group_url, num[1])
            else:
                url = group_url + p.attrs['href']
            type_page_url(url)

    pool.close()
    pool.join()

  Finally, the spoils. After one night of crawling, nearly 20 GB and well over a hundred thousand images had been saved...

  That's about it. The code is fairly simple and commented throughout, so there isn't much more to explain. The main technical points are the two places that use multiprocessing and multithreading.
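  As a side note, the per-page download threads can also be written with concurrent.futures, which caps the number of worker threads instead of starting one per image and makes it easy to wait on the whole page. A rough sketch, reusing the download_img function from the script above:

from concurrent.futures import ThreadPoolExecutor


# Same idea as the threading.Thread loop in page_url, but with a bounded pool:
# at most 8 downloads in flight; f.result() waits for each one to finish
# (and would surface any exception not already caught inside download_img).
def download_page(images, img_detail_url, img_path2):
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(download_img, img_detail_url, img.get('src'), img_path2, img.get('alt'))
            for img in images
        ]
        for f in futures:
            f.result()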

