1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
| from lxml import etree import requests
# Site root; relative links scraped from listing pages are resolved against it.
base_domain = "https://www.dytt8.net"

# Request headers sent with every HTTP call: a desktop browser User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 Edg/80.0.361.111"
    ),
}
def get_detail_url(url, timeout=10):
    """Fetch a listing page and return the detail-page URLs it links to.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        A list of URLs: every ``href`` found under ``table.tbspan`` resolved
        against ``base_domain``. Empty if the page yields no parse tree.
    """
    # Local import keeps this block self-contained within the file.
    from urllib.parse import urljoin

    resp = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(resp.text)
    if html is None:
        # Guard: nothing parseable came back, so there are no links to report.
        return []
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin matches plain concatenation for "/path" hrefs and also copes
    # with hrefs that are already absolute or lack the leading slash.
    # A list (not a one-shot map object) lets callers iterate more than once.
    return [urljoin(base_domain, href) for href in hrefs]
def get_html(url, timeout=10):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        The result of ``etree.HTML`` applied to the decoded page text.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # The body is decoded as GBK by hand; undecodable bytes are dropped
    # rather than aborting the whole page.
    text = resp.content.decode("gbk", errors="ignore")
    return etree.HTML(text)
def parse_info(info, rule):
    """Remove every occurrence of ``rule`` from ``info`` and trim the result.

    Args:
        info: Raw text, typically a label followed by its value.
        rule: Substring (the label) to delete.

    Returns:
        ``info`` without ``rule`` and without leading/trailing whitespace.
    """
    without_label = info.replace(rule, "")
    return without_label.strip()
# Label prefix -> movie key, for fields whose value sits on the label's line.
_SINGLE_LINE_FIELDS = {
    "◎年 代": "year",
    "◎产 地": "country",
    "◎类 别": "category",
    "◎豆瓣评分": "score",
    "◎片 长": "timelength",
    "◎导 演": "director",
}
_ACTORS_LABEL = "◎主 演"
_PROFILE_LABEL = "◎简 介"


def _collect_following(infos, index, stop_prefix):
    """Return stripped texts after ``index`` up to one starting with ``stop_prefix``."""
    collected = []
    for raw in infos[index + 1:]:
        text = raw.strip()
        if text.startswith(stop_prefix):
            break
        collected.append(text)
    return collected


def parse_detail_page(url):
    """Scrape one movie detail page into a dict.

    Args:
        url: Address of the detail page.

    Returns:
        A dict that always has ``title`` (``None`` if not found), ``poster``
        and ``down_url`` (lists, empty if the ``Zoom`` container is missing).
        It also has whichever of ``year``, ``country``, ``category``,
        ``score``, ``timelength``, ``director``, ``actors`` (list) and
        ``profile`` (str) were present among the container's text nodes.
    """
    movie = {}
    html = get_html(url)

    # A page that did not parse or lacks the expected nodes yields a partial
    # record instead of an IndexError/AttributeError that aborts the crawl.
    titles = [] if html is None else html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()"
    )
    movie["title"] = titles[0] if titles else None

    zoom_nodes = [] if html is None else html.xpath("//div[@id='Zoom']")
    if not zoom_nodes:
        movie["poster"] = []
        movie["down_url"] = []
        return movie
    zoom = zoom_nodes[0]
    movie["poster"] = zoom.xpath(".//img/@src")

    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        label = next(
            (lbl for lbl in _SINGLE_LINE_FIELDS if info.startswith(lbl)), None
        )
        if label is not None:
            movie[_SINGLE_LINE_FIELDS[label]] = parse_info(info, label)
        elif info.startswith(_ACTORS_LABEL):
            # First actor shares the label's line; the rest follow one per
            # text node until the next "◎" label.
            first_actor = parse_info(info, _ACTORS_LABEL)
            movie["actors"] = [first_actor] + _collect_following(infos, index, "◎")
        elif info.startswith(_PROFILE_LABEL):
            # Keep the whole synopsis: every non-empty text node up to the
            # "【" marker, not just the last node seen before it.
            lines = [parse_info(info, _PROFILE_LABEL)]
            lines.extend(_collect_following(infos, index, "【"))
            movie["profile"] = "\n".join(line for line in lines if line)

    movie["down_url"] = zoom.xpath(".//table//a/@href")
    return movie
def spider():
    """Crawl every listing page, print each scraped movie, and return them all.

    The number of listing pages is taken from the count of ``option``
    elements in the ``sldd`` select box on the index page.

    Returns:
        A list of the dicts produced by ``parse_detail_page``, in crawl order.
    """
    index_url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    html = get_html(index_url)
    pages = html.xpath("//select[@name='sldd']/option")
    movies = []
    # Listing pages are numbered from 1.
    for page_no in range(1, len(pages) + 1):
        for detail_url in get_detail_url(base_url.format(page_no)):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # Hand the collected records back instead of discarding them.
    return movies
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__': spider()
|