周末时间利用 Python 抓取了百度文库小学一年级的例题。
以下为代码，仅供学习，不得用于其他用途。
# -*- coding:UTF-8 -*-
"""Download the page images of search results from abg.baidu.com.

The script walks the search-result listing, opens every document it has not
seen before, extracts the page list embedded in the document's HTML and
saves each page image next to the script as ``<name>-<page>.jpg``.
"""
from urllib.request import urlretrieve
import re
import json

# Pattern of the JavaScript assignment that embeds the document metadata.
# The closing brace is consumed by the pattern and re-added after matching.
PAGE_DATA_RE = re.compile(r'var pageData = (.+)};')

# Search listing URL, split around the value of the "pn" query parameter.
SEARCH_URL_PREFIX = "https://abg.baidu.com/abg/search/getsearchlist?query=%E4%B8%80%E5%B9%B4%E7%BA%A7%E6%80%9D%E7%BB%B4%E7%BB%83%E4%B9%A0%E9%A2%98&fileType=&cid1=0&cid2=0&order=4&pn="
SEARCH_URL_SUFFIX = "&rn=24&showManual=0"

# Number of listing requests to issue ("pn" runs from 0 to PAGE_COUNT - 1).
PAGE_COUNT = 100


def getHtml(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The decoded response body.
    """
    # Imported here so the pure parsing helpers stay importable even when
    # the third-party ``requests`` package is not installed.
    import requests

    response = requests.get(url=url, timeout=timeout)
    return response.text


def parse_page_list(html):
    """Extract the ``pageList`` entries embedded in a document page.

    Args:
        html: HTML text of a document view page.

    Returns:
        The list stored under ``xReader -> readerInfo -> pageList`` of the
        embedded ``pageData`` JSON object, or an empty list when the page
        carries no ``pageData`` assignment or no page list.

    Raises:
        ValueError: If the embedded ``pageData`` text is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    match = PAGE_DATA_RE.search(html)
    if match is None:
        return []
    # The pattern swallows the final "}" of the object; put it back.
    page_data = json.loads(match.group(1) + "}")
    reader_info = page_data.get("xReader", {}).get("readerInfo", {})
    return reader_info.get("pageList") or []


def getImg(html, name):
    """Download every page image referenced by *html*.

    Each entry of the embedded page list is saved to the current working
    directory as ``<name>-<page>.jpg``. Pages without an embedded page list
    are skipped silently instead of aborting the crawl.

    Args:
        html: HTML text of a document view page.
        name: File name prefix for the saved images.
    """
    for page in parse_page_list(html):
        urlretrieve(page["url"], '%s-%s.jpg' % (name, page["page"]))


def download(url, name):
    """Fetch the document page at *url* and save its images under *name*."""
    print(url)
    getImg(getHtml(url), name)


def main():
    """Walk the search listing and download every distinct document once."""
    seen_ids = set()
    for page_number in range(PAGE_COUNT):
        listing_url = SEARCH_URL_PREFIX + str(page_number) + SEARCH_URL_SUFFIX
        listing = json.loads(getHtml(listing_url))
        for item in listing["data"]["list"]:
            template_id = item["templateId"]
            # The same document can show up on several listing pages.
            if template_id in seen_ids:
                continue
            seen_ids.add(template_id)
            download(
                "https://abg.baidu.com/view/" + template_id + "?fr=search-income-top3page",
                "思维训练题-" + template_id,
            )
    print(len(seen_ids))


if __name__ == "__main__":
    main()
请赞赏
朋友,创作不易;为犒赏小编的辛勤劳动,请她喝杯咖啡吧!
给她赞赏,您将财运亨通