您的位置：首页 > 脚本大全 > > 正文

python有什么好的微信公众号（python下载微信公众号相关文章）

更多时间：2022-01-20 00:41:47 类别：脚本大全浏览量：1631

python有什么好的微信公众号

python下载微信公众号相关文章

本文实例为大家分享了python下载微信公众号相关文章的具体代码，供大家参考，具体内容如下

目的：从“从零开始学自动化测试”公众号中下载“pytest”系列文档

1、按关键字搜索微信公众号文章

2、对搜索结果前n页进行解析，获取文章标题和对应url

主要使用的是 requests 和 bs4 中的 BeautifulSoup

weixin.py

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60 import requests

from urllib.parse import quote

from bs4 import beautifulsoup

import re

from weixinspider.html2doc import myhtmlparser

class weixinspider(object):
    """Collect WeChat article links from Sogou's WeChat article search.

    Usage: call ``get_page_url()`` to build the search-result page URLs,
    then ``get_article_url()`` to fill ``article_list``.

    Attributes:
        gzh_name: Search query sent to Sogou (URL-quoted when used).
        pageno: Number of result pages to scan, starting at page 1.
        keyword: Lower-cased substring an article title must contain.
        page_url: Search-result page URLs built by ``get_page_url``.
        article_list: List of single-entry dicts ``{clean_title: href}``,
            one per matching article.
    """

    def __init__(self, gzh_name, pageno, keyword):
        """Store the search parameters and initialise empty result lists.

        Args:
            gzh_name: Query string for the Sogou WeChat search.
            pageno: How many result pages to scan.
            keyword: Title filter; compared case-insensitively.
        """
        self.gzh_name = gzh_name
        self.pageno = pageno
        self.keyword = keyword.lower()
        self.page_url = []
        self.article_list = []
        self.headers = {
            'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
        self.timeout = 5
        # Character class matching one or more of \ / : * ? " < > | CR LF.
        # These are removed from titles because the title is later used to
        # build a file name.
        self.pattern = r'[\\/:*?"<>|\r\n]+'

    def get_page_url(self):
        """Append the URLs of result pages 1..pageno to ``self.page_url``."""
        # Example:
        # https://weixin.sogou.com/weixin?query=...&_sug_type_=&s_from=input&_sug_=n&type=2&page=2&ie=utf8
        query = quote(self.gzh_name)
        for page in range(1, self.pageno + 1):
            url = "https://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%s&ie=utf8" \
                % (query, page)
            self.page_url.append(url)

    def get_article_url(self):
        """Fetch every page in ``self.page_url`` and record matching articles.

        A link matches when its text contains ``self.keyword``
        (case-insensitive). The title is cleaned with ``self.pattern`` and
        appended to ``self.article_list`` as ``{clean_title: href}``. Each
        title is recorded once; the first occurrence wins.

        Raises:
            requests.RequestException: If a search page cannot be fetched.
        """
        # Imported locally so this class does not depend on the exact
        # spelling of the file-level imports.
        import requests
        from bs4 import BeautifulSoup

        # Titles already collected (also covers repeated calls).
        seen = {title for entry in self.article_list for title in entry}
        for url in self.page_url:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            result = BeautifulSoup(response.text, 'html.parser')
            # NOTE(review): 'li > li[class="txt-box"]' is kept verbatim; an
            # <li> directly inside an <li> is unusual, so this may have been
            # 'div[class="txt-box"]' originally. Confirm against the live
            # page markup.
            articles = result.select('ul[class="news-list"] > li > li[class="txt-box"] > h3 > a ')
            for a in articles:
                if self.keyword not in a.text.lower():
                    continue
                new_text = re.sub(self.pattern, "", a.text)
                if new_text in seen:
                    continue
                seen.add(new_text)
                # A fresh dict per article: appending one shared dict
                # repeatedly would make callers process every article many
                # times over.
                self.article_list.append({new_text: a["href"]})

# Driver script: search Sogou for the account's pytest articles, then save
# each matching article through myhtmlparser.
headers = {'user-agent':
           'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
timeout = 5

gzh_name = 'pytest文档'
my_gzh = weixinspider(gzh_name, 5, 'pytest')
my_gzh.get_page_url()
my_gzh.get_article_url()

for article in my_gzh.article_list:
    for (key, value) in article.items():
        url = value
        html_response = requests.get(url, headers=headers, timeout=timeout)
        # Bind the instance to its own name. Rebinding `myhtmlparser`
        # itself would replace the class with an instance and make the
        # next iteration fail with "object is not callable".
        parser = myhtmlparser(key)
        parser.feed(html_response.text)
        parser.doc.save(parser.docfile)

html2doc.py

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

import re
from html.parser import HTMLParser

import docx
import requests
from docx import Document
from docx.shared import RGBColor

class myhtmlparser(HTMLParser):
    """Convert an article's HTML into a python-docx document while parsing.

    Feed the HTML with ``feed()``; headings, paragraphs, code and images
    are added to ``self.doc``. The caller saves the result with
    ``self.doc.save(self.docfile)``.

    Attributes:
        docname: Base name used for the document and its image files.
        docfile: Target path of the document.
        doc: The python-docx document being built.
        title: True while inside an ``h<digit>`` element.
        code: True while inside a ``code`` element.
        text: Text accumulated for the current ``p`` element.
        processing: ``"p"`` while a paragraph is open, else None.
        codeprocessing: Last ``code`` tag name seen, or None.
        picindex: Running index used to number downloaded images.
    """

    def __init__(self, docname):
        """Create an empty document and reset the parsing state.

        Args:
            docname: Base name for the output document and image files.
        """
        super().__init__()
        self.docname = docname
        self.docfile = r"d:\pytest\%s.doc" % self.docname
        self.doc = Document()
        self.title = False
        self.code = False
        self.text = ''
        self.processing = None
        self.codeprocessing = None
        self.picindex = 1
        self.headers = {
            'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
        self.timeout = 5

    def handle_startendtag(self, tag, attrs):
        """Download a self-closing ``<img/>`` and insert it into the document.

        The image URL comes from ``data-src`` and the file extension from
        ``data-type``. Images lacking either attribute are skipped, because
        no file name or URL can be built for them.

        Args:
            tag: Lower-cased tag name.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != "img":
            return
        attributes = dict(attrs)
        pic_type = attributes.get("data-type")
        pic_src = attributes.get("data-src")
        if not pic_type or not pic_src:
            return

        picname = r"d:\pytest\%s%s.%s" % (self.docname, self.picindex, pic_type)
        picdata = requests.get(pic_src, headers=self.headers, timeout=self.timeout)
        self.picindex = self.picindex + 1
        with open(picname, "wb") as pic:
            pic.write(picdata.content)
        try:
            self.doc.add_picture(picname)
        except docx.image.exceptions.UnexpectedEndOfFileError as e:
            # Best effort: report an unreadable image and carry on.
            print(e)

    def handle_starttag(self, tag, attrs):
        """Record which kind of element was just opened.

        Args:
            tag: Lower-cased tag name.
            attrs: List of ``(name, value)`` attribute pairs (unused).
        """
        if re.match(r"h(\d)", tag):
            self.title = True
        if tag == "p":
            self.processing = tag
        if tag == "code":
            self.code = True
            self.codeprocessing = tag

    def handle_data(self, data):
        """Route text to a heading, the paragraph buffer, or a code run.

        Args:
            data: Text content found between tags.
        """
        if self.title:
            self.doc.add_heading(data, level=2)
        if self.processing:
            # Buffered until the closing </p> so the paragraph is written once.
            self.text = self.text + data
        if self.code:
            p = self.doc.add_paragraph()
            run = p.add_run(data)
            run.font.color.rgb = RGBColor(111, 111, 111)

    def handle_endtag(self, tag):
        """Close the current element and flush a finished paragraph.

        Args:
            tag: Lower-cased tag name being closed.
        """
        # Any closing tag ends heading mode.
        self.title = False
        if tag == self.processing:
            self.doc.add_paragraph(self.text)
            self.processing = None
            self.text = ''
        if tag == self.codeprocessing:
            self.code = False

运行结果：

python有什么好的微信公众号（python下载微信公众号相关文章）

缺少部分文档，如pytest文档4，是因为搜狗微信文章搜索结果中就没有

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持开心学习网。

原文链接：https://blog.csdn.net/yaoliuwei1426/article/details/84707163

标签：微信公众号 Python 下载

上一篇：python 文本文件读取方法（Python逐行读取文件中内容的简单方法）

下一篇：mysql创建存储过程的代码（MySQL修改存储过程的详细步骤）

您可能感兴趣

python有什么好的微信公众号（python下载微信公众号相关文章）

python有什么好的微信公众号

热门推荐

排行榜