import os
import re
import urllib.request
from urllib import parse
from urllib.request import urlopen

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def loadPage(url, filename):
    """
    Send a request to the given url and return the server's response.
    url: the url to crawl
    filename: name of the file being processed (used for logging)
    """
    print("Downloading " + filename)
    html = urlopen(url).read().decode("utf-8")
    return html
def writePage(html, filename):
    """
    Write the html content to a local file.
    html: the response body returned by the server
    """
    print("Saving " + filename)
    with open(new_file_name + kw + filename, "w", encoding="utf-8") as f:
        f.write(html)
    print("-" * 30)
def tiebaSpider(url, beginPage, endPage):
    """
    Scheduler for the Tieba crawler: builds and fetches the url of every page.
    url: the fixed prefix of the Tieba url
    beginPage: first page to fetch
    endPage: last page to fetch
    """
    for page in range(beginPage, endPage + 1):
        # Tieba shows 50 posts per page: page 1 -> pn=0, page 2 -> pn=50, ...
        pn = (page - 1) * 50
        filename = "page_" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
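# For example, with kw == "python" the crawler above requests URLs of the form
# http://tieba.baidu.com/f?kw=python&pn=50 (page 2), matching Tieba's paging scheme.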
def newpage(url):
    """
    Fetch the forum's front page and extract the latest posts.
    fullurl: complete url
    data_topic_name: topic authors
    data_made_time: creation times
    data_main_info: topic titles
    """
    fullurl = url + key + "&ie=utf-8&pn="
    # Install an opener that sends a browser User-Agent, otherwise Tieba may reject the request.
    header = ("User-Agent",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [header]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(fullurl).read().decode()
    # These patterns match attributes in Tieba's HTML, so the Chinese text must stay as-is.
    data_topic_name = re.compile('title="主题作者:(.*?)"').findall(data)        # topic author
    data_made_time = re.compile('title="创建时间">(.*?)</span>').findall(data)  # creation time
    data_main_info = re.compile('<a rel="noreferrer" href="/p/.*?" title="(.*?)"').findall(data)  # topic title
    data_end_name = re.compile('title="最后回复人:(.*?)"').findall(data)        # last replier
    data_img_picture = re.compile('class="thumbnail vpic_wrap"><img src="" attr=".*?" data-original="(.*?)"').findall(data)  # thumbnail urls
    # Download every thumbnail image.
    for picture, imgurl in enumerate(data_img_picture):
        file_name = new_file_name + kw + str(picture) + ".jpg"
        urllib.request.urlretrieve(imgurl, filename=file_name)
    # Write the scraped data into the markdown table, one row per field.
    mademakdowntitle(biaochang)
    markdownbiao(biaochang, data_made_time)
    markdownbiao(biaochang, data_topic_name)
    markdownbiao(biaochang, data_main_info)
    markdownbiao(biaochang, data_end_name)
    tiezhun = "top_posters"
    tiezi = "hot_posts"
    huoyue = "active_users"
    tiebayuntu(data_topic_name, tiezhun)
    tiebayuntu(data_main_info, tiezi)
    tiebayuntu(data_end_name, huoyue)
def markdownbiao(biaochang, dataname):
    """
    Write one markdown table row, filling the cells from dataname.
    biaochang: number of cell separators per row (table width + 1)
    dataname: list of scraped strings for this row
    """
    for i in range(0, biaochang):
        if i == 0:
            file.write(str(jianduan))
        else:
            # Guard against rows shorter than the table width.
            cell = dataname[i - 1] if i - 1 < len(dataname) else ""
            file.write(cell + str(jianduan))
    file.write("\n")
def mademakdowntitle(biaochang):
    """Write the markdown table header row and the separator row below it."""
    # Header row: |1|2|3|...
    for biaoge in range(0, biaochang):
        if biaoge == 0:
            file.write(jianduan)
        else:
            file.write(str(biaoge) + jianduan)
    file.write("\n")
    # Separator row: |----|----|... (must have as many cells as the header).
    for timebiao in range(0, biaochang):
        if timebiao == 0:
            file.write(jianduan)
        else:
            file.write("----" + jianduan)
    file.write("\n")
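# Illustrative sketch of the table these two helpers write to yanshi.md. With a
# user-entered width of 3 (biaochang becomes 4 after the "+ 1" in the main block),
# mademakdowntitle() emits the header and separator rows, and each markdownbiao()
# call in newpage() appends one data row; the cell contents below are hypothetical:
#
#   |1|2|3|
#   |----|----|----|
#   |<creation time 1>|<creation time 2>|<creation time 3>|
#   |<topic author 1>|<topic author 2>|<topic author 3>|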
def mknewdir(name):
    """
    Create the directory if it does not already exist.
    name: path of the directory to create
    """
    isExists = os.path.exists(name)
    if not isExists:
        os.makedirs(name)
        print(name + " directory created")
        return True
    else:
        print(name + " directory already exists")
        return False
def tiebayuntu(data_name, picture_name):
    """Build a word cloud from the scraped strings and save it as a PNG."""
    text = str(data_name)
    # Segment the Chinese text with jieba so WordCloud can count individual words.
    cut_text = jieba.cut(text)
    result = " ".join(cut_text)
    wc = WordCloud(
        font_path='FZMengRTJW.TTF',  # a font file with Chinese glyphs must be present
        background_color='white',
        width=1920,
        height=1080,
        max_font_size=100,
        min_font_size=10,
        max_words=1000
    )
    wc.generate(result)
    wc.to_file(new_file_name + kw + picture_name + ".png")
    # Optionally display the cloud on screen; the PNG is already saved above.
    plt.figure(picture_name)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    new_file_name = "./" + kw + "/"
    mknewdir(new_file_name)
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    # Use `and`, not the bitwise `&`, for the boolean test.
    if beginPage == 1 and endPage == 1:
        biaochang = int(input("Enter the markdown table width: "))
    else:
        biaochang = 30
    file = open(new_file_name + kw + "yanshi.md", "w", encoding="utf-8")
    jianduan = "|"             # markdown cell separator
    biaochang = biaochang + 1  # one extra separator is needed to close each row
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    print("key=", key)
    newpage(url)
    tiebaSpider(fullurl, beginPage, endPage)
    file.close()
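# A minimal example session (hypothetical values): entering "python" as the forum
# name with start page 1 and end page 1 creates ./python/ and produces
# ./python/pythonpage_1.html, ./python/pythonyanshi.md, the downloaded thumbnail
# .jpg files, and three word-cloud PNGs (top_posters, hot_posts, active_users).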