您的位置：首页 > 脚本大全 > > 正文

python3爬虫实例代码（python3通过selenium爬虫获取到dj商品的实例代码）

更多时间：2021-10-12 00:13:22 类别：脚本大全浏览量：2945

python3爬虫实例代码

python3通过selenium爬虫获取到dj商品的实例代码

先给大家介绍下python3 selenium使用

selenium 其实就相当于模拟人的点击、输入等操作，让程序自动、连续地操作浏览器。打个比方：如果你玩过王者荣耀，2016 年一月份的版本里就有一个 bug——安卓手机下载一个按键精灵，就可以在冒险模式里设置按键，让手机自动闯关，一局 19 个金币，一晚上就能攒出一个英雄。不过程序员也不是吃素的，后来给每个星期设置了大概 4000 金币的上限。有兴趣的可以去试试。（注：手机需要 root）

进入正题：

1

2

3

4

5 from selenium import webdriver

from selenium.webdriver.common.by import by

from selenium.webdriver.common.keys import keys

from selenium.webdriver.support import expected_conditions as ec

from selenium.webdriver.support.wait import webdriverwait

在写之前需要先安装 selenium 模块（例如：pip install selenium）

1

2

3

4

5

6

7

8

9

10

11

12

13

# Demo: type "python" into Baidu's search box and dump the result page.
# The imports are repeated here so the snippet runs on its own.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

brguge = webdriver.Chrome()  # create the driver object (starts Chrome)
try:
    brguge.get('https://www.baidu.com')  # send the GET request
    search_box = brguge.find_element(By.ID, 'kw')  # locate the search input
    search_box.send_keys('python')  # type the keyword
    search_box.send_keys(Keys.ENTER)  # press Enter to submit
    wait = WebDriverWait(brguge, 10)  # wait up to 10 s for the results
    # The condition takes ONE locator tuple, not two positional arguments.
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(brguge.current_url)  # URL of the result page
    print(brguge.get_cookies())  # all cookies; get_cookie() needs a name
    print(brguge.page_source)  # HTML source of the result page
finally:
    # quit() ends the whole driver session; close() only closes the window.
    brguge.quit()

下面是一些selenium模块的基本用法

查找元素

单个元素

1

2

3

4

5

6

7

8

9

10

11

12

13

14

(from selenium import webdriver)

brguge.find_element_by_id('q')  # 查找 id 为 q 的元素

brguge.find_element_by_css_selector('#q')  # 用 CSS 选择器 #q 查找（同样是 id 为 q 的元素）

brguge.find_element_by_xpath('//*[@id="q"]')  # 用 XPath 查找；以上三种写法效果一样

brguge.find_element_by_name()  # 通过 name 属性查找

brguge.find_element_by_link_text()  # 通过链接的完整文本查找

brguge.find_element_by_partial_link_text()  # 通过链接的部分文本查找

brguge.find_element_by_tag_name()  # 通过标签名查找

brguge.find_element_by_class_name()  # 通过 class 查找

from selenium import webdriver

from selenium.webdriver.common.by import By

brguge.find_element(By.ID, 'q')  # 通用查找方式（find_element_by_* 系列在 Selenium 4.3 中已被移除，新版本只能用这种写法）

    多个元素（find_elements，比查找单个元素的方法多了一个 s）
        返回的结果是一个列表
        brguge.find_elements_by_css_selector('.service-bd li')  查找 .service-bd 下的所有 li 元素
        brguge.find_elements(By.CSS_SELECTOR, '.service-bd li')  两种写法作用一样
        （利用索引就可以从列表中取出单个或多个元素）
    元素交互操作（先获取元素，再给它发指令）
        选择输入框 --> send_keys('输入文字') --> clear() 清空输入框 --> 再输入别的内容 --> 找到搜索按钮 --> click() 点击
        input.clear()  清空输入框
    交互动作（将动作附加到动作链中串行执行）
        switch_to.frame('iframeresult')  先切换到目标 iframe（旧写法 switch_to_frame 在 Selenium 4 中已移除）
        用 CSS 选择器分别找到要交互的两个元素（source 和 target）
        创建动作链 ActionChains(brguge)
        drag_and_drop(source, target)  把第一个元素拖到第二个元素上
        perform()  执行动作链

下面看一下 python3 通过 selenium 爬取京东（JD）商品信息的实例代码。

具体代码如下所示：

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

import json
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

# Page the scraper loads right after the browser starts; the product search
# is then typed into that page's search box.
jd_url_login = "https://www.jd.com/"

class customizeexception(Exception):
    """Scraper-specific error carrying a numeric status code and a message.

    Args:
        status: Numeric code chosen by the raiser.
        msg: Human-readable description of what happened.
    """

    def __init__(self, status, msg):
        # Forward both values so ``args`` and ``str()`` stay informative.
        super().__init__(status, msg)
        self.status = status
        self.msg = msg

class jd:
    """Headless-Chrome scraper that stores JD search results as JSON lines.

    For every product keyword the scraper types the keyword into the search
    box, walks through the result pages and appends one JSON object per
    product to ``jd-<keyword>.json``.
    """

    def __init__(self):
        # Pre-set every attribute so clean-up is safe even if start-up fails.
        self.browser = None
        self.wait = None
        self.file = None
        self.__init_browser()

    def __init_browser(self):
        """Start headless Chrome, open the start page and build the wait helper."""
        options = Options()
        options.add_argument("--headless")
        # Chrome option keys are case-sensitive ("excludeSwitches").
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Do not load images.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        # Maximize the browser window.
        self.browser.maximize_window()
        # Implicit wait of 3 s for element lookups.
        self.browser.implicitly_wait(3)
        self.browser.get(jd_url_login)
        self.wait = WebDriverWait(self.browser, 10)

    def __close_file(self):
        """Close the current output file, if one is open."""
        handle = getattr(self, "file", None)
        if handle is not None:
            self.file = None
            handle.close()

    def __search_goods(self, goods):
        """Open the output file for *goods* and submit it in the search box."""
        # Release the previous product's file before opening the next one.
        self.__close_file()
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        self.wait.until(ec.presence_of_all_elements_located((By.ID, "key")))
        search_input = self.browser.find_element(By.ID, "key")
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    def __get_goods_info(self, page_source):
        """Yield one dict per product card found in *page_source*.

        Every field is read relative to its own card, so a card that lacks a
        field yields an empty string for it instead of shifting the values of
        the following products.
        """
        tree = etree.HTML(page_source)
        # NOTE(review): the listing this was recovered from had ``li`` in
        # place of ``div`` in these XPaths; JD result cards are believed to be
        # <div class="gl-i-wrap"> -- confirm against the live page.
        for card in tree.xpath("//div[@class='gl-i-wrap']"):
            yield {
                # Uses the title attribute of the link; the visible text would
                # be a better source for the product name.
                "goods_name": card.xpath("string(.//div[contains(@class,'p-name')]/a/@title)"),
                "goods_price": card.xpath("string(.//div[@class='p-price']/strong/i)"),
                "comment_num": card.xpath("string(.//div[@class='p-commit']/strong)"),
                "shop_name": card.xpath("string(.//a[@class='curr-shop'])"),
            }

    def __swipe_page(self):
        """Scroll down until the page stops growing, then return its source."""
        read_height = "return document.body.scrollHeight;"
        height = self.browser.execute_script(read_height)
        self.browser.execute_script("window.scrollTo(0, {});".format(height))
        while True:
            # Give lazily loaded items time to be appended to the page.
            time.sleep(1)
            now_height = self.browser.execute_script(read_height)
            if height == now_height:
                return self.browser.page_source
            self.browser.execute_script("window.scrollTo(0, {});".format(now_height))
            height = now_height

    def __is_element_exists(self, xpath):
        """Return True when *xpath* matches an element on the current page."""
        try:
            self.browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    def __click_next_page(self):
        """Click the "next page" link.

        Raises:
            customizeexception: status 10000 when there is no further page.
        """
        try:
            self.wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
        except TimeoutException as exc:
            # No pager appeared at all: treat it as the end of this product.
            raise customizeexception(10000, "该商品访问完毕") from exc
        # Exact class match: a link carrying any extra class is not clicked.
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise customizeexception(10000, "该商品访问完毕")
        self.browser.find_element(By.XPATH, xpath).click()

    def __write_to_json(self, dic: dict):
        """Append *dic* to the current output file as one JSON line."""
        self.file.write(json.dumps(dic, ensure_ascii=False) + "\n")

    def __crawl_goods(self, goods):
        """Search for *goods* and save every result page until the last one."""
        self.__search_goods(goods)
        page = 1
        while True:
            print("正在爬取商品 <{}>---第{}页......".format(goods, page))
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except customizeexception:
                return
            page += 1

    def run(self, goods, pending=None):
        """Scrape *goods*, then every keyword still queued in *pending*.

        Args:
            goods: First product keyword to scrape.
            pending: List of further keywords; it is consumed from the front.
                When omitted, the module-level ``goods_list`` is used if it
                exists, which is how the original script queued its work.

        Products are processed in a loop rather than by recursion, so a
        finished product never resumes paging and no page is written twice.
        """
        if pending is None:
            pending = globals().get("goods_list", [])
        while True:
            self.__crawl_goods(goods)
            try:
                goods = pending.pop(0)
            except IndexError:
                return

    def close(self):
        """Quit the browser and close the output file; safe to call repeatedly."""
        browser = getattr(self, "browser", None)
        try:
            if browser is not None:
                self.browser = None
                # quit() ends the driver session, not just the window.
                browser.quit()
        finally:
            self.__close_file()

    def __del__(self):
        self.close()

if __name__ == '__main__':
    # Keywords to scrape. The list stays at module level because the scraper
    # takes its follow-up keywords from ``goods_list``.
    goods_list = ["纯牛奶", "酸奶", "奶茶", "床上用品", "电磁炉", "电视", "小米笔记本", "华硕笔记本", "联想笔记本", "男士洗面奶", "女士洗面奶", "沐浴露", "洗发露",
                  "牙刷", "牙膏", "拖鞋", "剃须刀", "水手服", "运动服", "红龙果", "苹果", "香蕉", "洗衣液", "电饭煲"]
    # Validate before starting a browser that would then have to be torn down.
    if not goods_list:
        raise customizeexception(20000, "goods_list不能为空")
    # A separate name keeps the class ``jd`` from being overwritten.
    spider = jd()
    try:
        spider.run(goods_list.pop(0))
    finally:
        # Dropping the last reference triggers the scraper's clean-up.
        del spider

总结

以上所述是小编给大家介绍的 python3 通过 selenium 爬取京东（JD）商品信息的实例代码，希望对大家有所帮助。如果大家有任何疑问请给我留言，小编会及时回复大家的。在此也非常感谢大家对开心学习网网站的支持！

如果你觉得本文对你有帮助，欢迎转载，烦请注明出处，谢谢！

原文链接：https://www.cnblogs.com/zhuchunyu/archive/2019/04/25/10765875.html

标签：爬虫 Python3 selenium

上一篇：阿里云虚拟主机带公网地址吗（阿里云虚拟主机被搜索引擎爬虫访问耗费大量流量解决方法）

下一篇：html5中的语义化标签有哪些（详解HTML5常用的语义化标签）

您可能感兴趣

python3爬虫实例代码（python3通过selenium爬虫获取到dj商品的实例代码）

python3爬虫实例代码

热门推荐

排行榜