您的位置：首页 > 脚本大全 > 正文

如何用python爬取最新电影（使用python实现抓取腾讯视频所有电影的爬虫）

更多 时间：2021-10-16 00:37:04 类别：脚本大全 浏览量：2284

如何用python爬取最新电影

使用python实现抓取腾讯视频所有电影的爬虫

用python实现的抓取腾讯视频所有电影的爬虫

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

# -*- coding: utf-8 -*-

import re
import string
import time
import urllib2

import pymongo
from bs4 import BeautifulSoup

# Module-level crawl state.
num = 0         # global: number of movies
m_type = u''    # global: movie genre
m_site = u'qq'  # global: movie site

def gethtml(url):
    """Download the page at ``url`` and return its raw body.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``read()`` yields for the HTTP response.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` on failure.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection, even when reading fails.
        response.close()

def gettags(html):
    """Extract the movie genre links from the genre list page.

    Args:
        html: Markup of the genre list page.

    Returns:
        A dict mapping each genre title to its listing URL, both decoded
        from UTF-8. Empty when the genre list or its links are not found.

    Side effects:
        Rebinds the module-level ``m_type`` to each genre title in turn,
        so it ends up holding the last title that was matched.
    """
    global m_type

    # Bound up front so the "nothing found" paths can still return a dict.
    tags_url = {}

    soup = BeautifulSoup(html)
    # Target markup: <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all(
        'ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
    if not tags_all:
        print("not find")
        return tags_url

    # Target markup:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="..." title="..."
    #    tvalue="0">...</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)
    tags = p.findall(str(tags_all[0]))
    if not tags:
        print("not find")
        return tags_url

    for tag in tags:
        tag_url = tag[0].decode('utf-8')
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag_url
    return tags_url

def get_pages(tag_url):
    """Return the number of listing pages for one genre.

    Args:
        tag_url: URL of the genre's listing page.

    Returns:
        The page count as an ``int``. Falls back to 1 when the page has no
        pager element or the pager holds fewer than two matching links.

    Raises:
        ValueError: If the pager entry used as the count is not numeric.
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)
    # Target markup: <li class="mod_pagenav" id="pager">
    li_page = soup.find_all('li', {'class': 'mod_pagenav', 'id': 'pager'})
    if not li_page:
        return 1

    # Target markup:
    # <a class="c_txt6" href="..." title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(li_page[0]))
    if len(pages) > 1:
        # The second-to-last link text is taken as the page count.
        # NOTE(review): presumably the last link is a "next page" control
        # -- confirm against the live markup.
        return int(pages[-2])
    return 1

def getmovielist(html):
    """Hand every movie list found in a listing page to ``getmovie``.

    Args:
        html: Markup of one listing page.
    """
    soup = BeautifulSoup(html)
    # Target markup: <ul class="mod_list_pic_130">
    lis = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for li_html in lis:
        # Newlines are stripped before the fragment is passed on.
        getmovie(str(li_html).replace('\n', ''))

def getmovie(html):
    """Store every movie found in a listing fragment in MongoDB.

    Each match becomes one document in the ``playlinks`` collection of the
    ``dianying`` database on ``localhost:27017``, holding the movie title,
    its URL and the current module-level ``m_site`` and ``m_type``.

    Args:
        html: Markup of one movie list, with newlines already removed.

    Side effects:
        Increments the module-level ``num`` once per stored movie and
        prints progress output.
    """
    global num

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if not movies:
        return

    conn = pymongo.MongoClient('localhost', 27017)
    try:
        playlinks = conn.dianying.playlinks
        for movie in movies:
            # Count each movie exactly once so num stays a true total.
            num += 1
            print("%s : %d" % ("=" * 70, num))
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # One client is opened per call; close it so sockets do not pile up.
        conn.close()

def getmovieinfo(url):
    """Collect the album links listed on a movie's page.

    Args:
        url: Address of the movie page.

    Returns:
        A list of ``(href, title)`` tuples taken from the first album
        block on the page. Empty when the block or its links are missing.
    """
    html = gethtml(url)
    soup = BeautifulSoup(html)
    # Target markup: <li class="pack pack_album album_cover">
    lis = soup.find_all('li', {'class': 'pack pack_album album_cover'})
    if not lis:
        print("not find movie info")
        return []

    # Target markup:
    # <a href="..." target="new" title="..." wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(lis[0]))
    if not m_info:
        print("not find movie info")
    return m_info

def insertdb(movieinfo):
    """Insert ``movieinfo`` into ``movies`` in the ``dianying_at`` database.

    The database is reached through the module-level ``conn`` object.

    Args:
        movieinfo: Value handed unchanged to the collection's ``insert``.
    """
    target = conn.dianying_at.movies
    target.insert(movieinfo)

if __name__ == "__main__":
    # Script entry: crawl every genre and store each listing page's movies.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for tag_name, tag_url in tag_urls.items():
        # getmovie() stamps each record with the module-level m_type, so it
        # has to name the genre currently being crawled.
        m_type = tag_name

        first_page = tag_url.encode('utf-8')
        print(first_page)
        maxpage = int(get_pages(first_page))
        print(maxpage)

        # Listing URLs look like
        # http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        # Cut the fixed tail once; the page index is spliced back in below.
        m_url = first_page.replace('0_20_0_-1_0.html', '')
        for x in range(0, maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            movie_html = gethtml(movie_url)
            getmovielist(movie_html)
            # Short pause between page requests.
            time.sleep(0.1)

总结

以上所述是小编给大家介绍的使用python实现抓取腾讯视频所有电影的爬虫，希望对大家有所帮助，如果大家有任何疑问欢迎给我留言，小编会及时回复大家的！

原文链接：https://blog.csdn.net/qq_40196321/article/details/89190327

标签：爬虫 Python 抓取

上一篇：docker给镜像加名称（Alpine Docker镜像字体的问题解决操作）

下一篇：美国云服务器的优点（美国哪个机房的云主机比较稳定？）

您可能感兴趣

如何用python爬取最新电影（使用python实现抓取腾讯视频所有电影的爬虫）

如何用python爬取最新电影

热门推荐

排行榜