"""Download Luogu problem statements and save each one as a Markdown file.

For every problem number from P1000 up to ``maxn`` the script fetches the
problem page under ``baseUrl``, converts the page's ``<article>`` element
into rough Markdown and writes it below ``savePath`` as
``P<number> <title>.md``.

Requires the third-party ``bs4`` (BeautifulSoup) package for HTML parsing.
"""

import os
import re
import time
import urllib.error
import urllib.request

baseUrl = "https://www.luogu.com.cn/problem/P"
savePath = "C:\\Users\\666\\Desktop\\p\\"  # output directory (must end with a separator)
maxn = 7000  # highest problem number to crawl

# Matches any opening or closing HTML tag, e.g. "<p>", "</div>", "<a href=...>".
_TAG_RE = re.compile(r"</?[a-zA-Z]+[^<>]*>")

# Characters that are not allowed in Windows file names, plus control chars.
_INVALID_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

# Heading tags and the Markdown prefix each one is rewritten to.
# NOTE(review): "<h3>" maps to four hashes, not three; kept as in the original.
_HEADING_PREFIXES = (
    ("<h1>", "# "),
    ("<h2>", "## "),
    ("<h3>", "#### "),
)


def main():
    """Crawl problems P1000..P<maxn> and save each one as a Markdown file.

    A failure on a single problem (network error, missing element, file
    error, ...) is reported on stdout and the crawl continues with the next
    problem number.

    Raises:
        ImportError: if ``bs4`` is not installed.
        OSError: if the output directory ``savePath`` cannot be created.
    """
    # Fail fast when the HTML parser is missing; otherwise every single
    # problem would fail with the same ImportError inside the loop.
    import bs4  # noqa: F401

    # Without the output directory every save below would fail.
    os.makedirs(savePath, exist_ok=True)

    print("计划爬取到P{}".format(maxn))
    for i in range(1000, maxn + 1):
        # Pause between requests (presumably to go easy on the server).
        time.sleep(0.2)
        print("正在爬取P{}".format(i))
        try:
            html = getHTML(baseUrl + str(i))
            problemMD = getMD(html)
            # Titles may contain characters that are illegal in file names.
            name = sanitizeName(getName(html))
            saveData(problemMD, "P" + str(i) + " " + name + ".md")
        except Exception as exc:
            # Best effort: skip this problem, but say why instead of
            # silently swallowing the error.
            print("P{}爬取失败: {!r}".format(i, exc))
    print("爬取完毕")


def getHTML(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the decoded body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the connection before giving up, so one
            stalled request cannot hang the whole crawl.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: on network or HTTP errors.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.114 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')


def _parseHTML(html):
    """Parse *html* with BeautifulSoup using the "html.parser" backend."""
    # Imported here rather than at module level so the pure text helpers
    # (func, sanitizeName, saveData) stay importable without bs4 installed.
    import bs4
    return bs4.BeautifulSoup(html, "html.parser")


def getName(html):
    """Return the text of the first ``<h1>`` element of *html*, tags removed.

    Raises:
        IndexError: if the page contains no ``<h1>`` element.
    """
    heading = str(_parseHTML(html).select("h1")[0])
    # The generic tag pattern also removes the "<h1>"/"</h1>" wrapper itself.
    return _TAG_RE.sub("", heading)


def getMD(html):
    """Convert the first ``<article>`` element of *html* to rough Markdown.

    ``<h1>``/``<h2>``/``<h3>`` openers become Markdown heading prefixes, all
    remaining tags are dropped, and ``$`` delimiters are wrapped in backticks
    by :func:`func`.

    Raises:
        IndexError: if the page contains no ``<article>`` element.
    """
    md = str(_parseHTML(html).select("article")[0])
    # Headings must be rewritten before the generic tag strip removes them.
    for tag, prefix in _HEADING_PREFIXES:
        md = md.replace(tag, prefix)
    md = _TAG_RE.sub("", md)
    return func(md)


def func(s: str) -> str:
    """Wrap ``$...$`` spans of *s* in backticks.

    Dollar signs are treated as strictly alternating opening and closing
    delimiters: the 1st, 3rd, ... ``$`` becomes "`$" and the 2nd, 4th, ...
    becomes "$`". All other characters are copied unchanged.

    Note: because of the strict alternation, ``$$x$$`` comes out as
    "`$$`x`$$`", and an unpaired ``$`` leaves a dangling opening backtick.
    """
    parts = []
    opening = True
    for ch in s:
        if ch != "$":
            parts.append(ch)
            continue
        parts.append("`$" if opening else "$`")
        opening = not opening
    # Join once at the end instead of growing a string character by character.
    return "".join(parts)


def sanitizeName(name, replacement="_"):
    """Return *name* with characters that are illegal in file names replaced.

    Replaces ``\\ / : * ? " < > |`` and ASCII control characters, which
    Windows rejects in file names, with *replacement*.
    """
    # A callable avoids backslash-escape processing of *replacement*.
    return _INVALID_FILENAME_CHARS_RE.sub(lambda _match: replacement, name)


def saveData(data, filename):
    """Write *data* to ``savePath + filename`` as UTF-8 text.

    Args:
        data: A string, or an iterable whose items are strings or iterables
            of strings.
        filename: File name appended directly to ``savePath``.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    cfilename = savePath + filename
    # "with" guarantees the handle is closed even when a write fails.
    with open(cfilename, "w", encoding="utf-8") as file:
        if isinstance(data, str):
            file.write(data)
        else:
            for chunk in data:
                file.writelines(chunk)


if __name__ == '__main__':
    main()

# Page text that surrounded this script in the original post (not code):
#   Last modification: September 10, 2021
#   © Allow specification reprint
#   If you find my article useful, feel free to leave a tip.
#   Comments are closed.