洛谷题库爬虫

洛谷题库爬虫

mi0e

September 10, 2021

1111 views

No comments

2055 words

Python 编程

```
import re
import urllib.request, urllib.error
import bs4
import time

# URL prefix of a problem page; the numeric problem id is appended to it.
baseUrl = "https://www.luogu.com.cn/problem/P"
savePath = "C:\\Users\\666\\Desktop\\p\\"	# directory prefix the output files are written under
maxn = 7000	# highest problem number to crawl (inclusive)

def main():
    """Crawl problems P1000 through P<maxn> and save each one as Markdown.

    For every problem number the page is downloaded, converted to Markdown,
    and written to a file named ``P<number> <title>.md``. The crawl is
    best-effort: a failure on one problem is reported on stderr and the loop
    moves on to the next number instead of aborting.
    """
    # Imported locally so this function does not depend on a file-level import.
    import sys

    print("计划爬取到P{}".format(maxn))
    for i in range(1000, maxn + 1):
        # Short pause between requests to throttle the crawl.
        time.sleep(0.2)
        try:
            print("正在爬取P{}".format(i))
            html = getHTML(baseUrl + str(i))
            problemMD = getMD(html)
            name = getName(html)
            saveData(problemMD, "P" + str(i) + " " + name + ".md")
        except Exception as err:
            # Report the failure instead of swallowing it silently, so that
            # missing pages, parse errors and invalid file names are visible.
            print("爬取P{}失败: {!r}".format(i, err), file=sys.stderr)
    print("爬取完毕")

def getHTML(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``user-agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` is a
            subclass and is raised for HTTP error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5"
                      "37.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def getName(html):
    """Return the text of the first ``<h1>`` element found in *html*.

    The text is taken from the parsed tree rather than from the serialized
    markup, so nested tags are dropped and character entities such as
    ``&amp;`` come back as the characters they stand for. Leading and
    trailing whitespace is removed because the result is used as part of a
    file name.

    Args:
        html: HTML source of the page.

    Returns:
        The heading text as a ``str``.

    Raises:
        IndexError: If the document contains no ``<h1>`` element.
    """
    bs = bs4.BeautifulSoup(html, "html.parser")
    heading = bs.select("h1")[0]
    return heading.get_text().strip()

def getMD(html):
    """Convert the first ``<article>`` element of *html* into Markdown text.

    Opening ``<h1>``/``<h2>``/``<h3>`` tags become Markdown heading markers,
    every remaining tag is stripped, character entities are decoded, and the
    result is passed through ``func`` to wrap ``$`` delimiters in backticks.

    Args:
        html: HTML source of the page.

    Returns:
        The converted Markdown as a ``str``.

    Raises:
        IndexError: If the document contains no ``<article>`` element.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import unescape

    bs = bs4.BeautifulSoup(html, "html.parser")
    core = bs.select("article")[0]
    md = str(core)
    md = re.sub(r"<h1>", "# ", md)
    md = re.sub(r"<h2>", "## ", md)
    # NOTE(review): <h3> maps to four hashes, not three -- confirm intended.
    md = re.sub(r"<h3>", "#### ", md)
    md = re.sub(r"</?[a-zA-Z]+[^<>]*>", "", md)
    # Serializing the tag re-escapes &, < and >; decode them only after the
    # tags are gone so that text like "a &lt; b" is not mistaken for markup.
    md = unescape(md)
    md = func(md)
    return md

def func(s: str) -> str:
    """Wrap alternating ``$`` characters of *s* with backticks.

    The 1st, 3rd, 5th, ... ``$`` is emitted as "`$" and the 2nd, 4th,
    6th, ... as "$`", so a ``$...$`` pair ends up enclosed in backticks.
    All other characters are copied unchanged.
    """
    segments = s.split("$")
    pieces = [segments[0]]
    for position, segment in enumerate(segments[1:]):
        # Even positions are opening delimiters, odd positions closing ones.
        opening = position % 2 == 0
        pieces.append("`$" if opening else "$`")
        pieces.append(segment)
    return "".join(pieces)

def saveData(data, filename):
    """Write *data* to the file ``savePath + filename`` as UTF-8 text.

    An existing file with the same name is overwritten.

    Args:
        data: Either a ``str`` to write as-is, or an iterable whose items
            are each passed to ``writelines`` in order.
        filename: File name appended to the module-level ``savePath`` prefix.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    cfilename = savePath + filename
    # ``with`` guarantees the handle is closed even when a write fails.
    with open(cfilename, "w", encoding="utf-8") as file:
        if isinstance(data, str):
            # One write instead of one call per character.
            file.write(data)
        else:
            for d in data:
                file.writelines(d)

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

```

Last modified: September 10, 2021