python爬取网页

爬取最新电影,内容包括标题、年代、产地、类别、评分、片长、导演、主演、下载链接。

代码实现

from lxml import etree
import requests

# Root of the target site; used as the base for links scraped from listing pages.
base_domain = "https://www.dytt8.net"

# Browser-like request headers passed to every requests.get() call in this file.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 Edg/80.0.361.111"
    ),
}


def get_detail_url(url, timeout=10):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: Address of a listing page.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        A list of absolute URLs, one per ``<a href>`` found inside
        ``<table class="tbspan">`` elements, in document order.
    """
    from urllib.parse import urljoin

    resp = requests.get(url, headers=headers, timeout=timeout)
    # Decoding this page as GBK raised "illegal multibyte sequence" and no
    # other codec helped (short of errors="ignore"), so rely on the decoding
    # requests performs for resp.text instead.
    html = etree.HTML(resp.text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin copes with both site-relative and already-absolute hrefs,
    # whereas plain concatenation would corrupt the latter.
    return [urljoin(base_domain, href) for href in hrefs]


def get_html(url, timeout=10):
    """Download *url* and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        The result of ``etree.HTML`` applied to the decoded page text.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # The body is decoded as GBK; errors="ignore" silently drops any bytes
    # that are not valid GBK instead of raising UnicodeDecodeError.
    text = resp.content.decode("gbk", errors="ignore")
    return etree.HTML(text)


def parse_info(info, rule):
    """Strip the field label *rule* out of *info* and trim the result.

    Args:
        info: Raw text of one line, e.g. a label followed by its value.
        rule: The label text to remove.

    Returns:
        *info* with every occurrence of *rule* removed and leading/trailing
        whitespace stripped.
    """
    return info.replace(rule, "").strip()


def _collect_until(lines, start, stop_prefix):
    """Return stripped, non-blank entries of lines[start:] before the first one starting with stop_prefix."""
    collected = []
    for raw in lines[start:]:
        text = raw.strip()
        if text.startswith(stop_prefix):
            break
        if text:
            collected.append(text)
    return collected


def parse_detail_page(url):
    """Fetch one movie detail page and extract its fields into a dict.

    Args:
        url: Address of the movie detail page.

    Returns:
        A dict that always contains ``title``, ``poster`` (list of image
        URLs) and ``down_url`` (list of hrefs found in tables), plus whichever
        of ``year``, ``country``, ``category``, ``score``, ``timelength``,
        ``director``, ``actors`` (list of str) and ``profile`` (str) have a
        matching label in the page text.

    Raises:
        IndexError: If the page has no title node or no ``<div id="Zoom">``.
    """
    # Labels whose value sits on the same line, mapped to their result key.
    simple_fields = (
        ("◎年  代", "year"),
        ("◎产  地", "country"),
        ("◎类  别", "category"),
        ("◎豆瓣评分", "score"),
        ("◎片  长", "timelength"),
        ("◎导  演", "director"),
    )

    movie = {}
    html = get_html(url)
    movie["title"] = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()"
    )[0]
    zoom = html.xpath("//div[@id='Zoom']")[0]
    movie["poster"] = zoom.xpath(".//img/@src")

    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎主  演"):
            # The first actor shares the label's line; the rest follow one
            # per text node until the next "◎" label.
            first_actor = parse_info(info, "◎主  演")
            actors = [first_actor] if first_actor else []
            actors.extend(_collect_until(infos, index + 1, "◎"))
            movie["actors"] = actors
        elif info.startswith("◎简  介"):
            # The synopsis spans the following text nodes up to the first
            # "【" marker; keep all of them rather than only the last one.
            movie["profile"] = "\n".join(_collect_until(infos, index + 1, "【"))
        else:
            for label, key in simple_fields:
                if info.startswith(label):
                    movie[key] = parse_info(info, label)
                    break

    movie["down_url"] = zoom.xpath(".//table//a/@href")
    return movie


def spider():
    """Crawl every listing page, print each parsed movie and return them all.

    Returns:
        A list of the dicts produced by ``parse_detail_page``, in crawl order.
    """
    index_url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"

    # The number of <option> entries in the page selector on the index page
    # is used as the number of listing pages to visit.
    html = get_html(index_url)
    pages = html.xpath("//select[@name='sldd']/option")

    movies = []
    for page_number in range(1, len(pages) + 1):
        listing_url = base_url.format(page_number)
        # Visit every movie linked from the current listing page.
        for detail_url in get_detail_url(listing_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Hand the collected results back so callers can use them; the original
    # built this list but discarded it.
    return movies


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider()