Kwonpop-Webbrain-ko / crawl_wiki.py
kwonpop's picture
Upload 8 files
62dbf75 verified
import requests
from bs4 import BeautifulSoup
import re
# Korean Wikipedia page titles to crawl:
# 김정은 (Kim Jong-un), 북한 (North Korea), 핵 (nuclear), 제재 (sanctions).
# NOTE(review): the previous literals were mojibake (UTF-8 bytes rendered
# through a legacy single-byte code page) and pointed at non-existent pages;
# they are restored here to the Korean titles those bytes decode to.
# Confirm against the originally uploaded file.
KEYWORDS = ["김정은", "북한", "핵", "제재"]

# Maximum number of characters of page text kept per keyword.
MAX_CHARS = 3000

# Seconds to wait for the HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# The context manager closes wiki.txt even if fetching or parsing raises,
# so already-written lines are flushed and the handle is not leaked.
with open("wiki.txt", "w", encoding="utf-8") as out:
    for kw in KEYWORDS:
        url = f"https://ko.wikipedia.org/wiki/{kw}"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx responses as failures instead of storing the
            # text of an error page as if it were the article.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One failed page should not abort the remaining keywords.
            print(f"skip {kw}: {exc}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # Collapse every whitespace run (including newlines) to one space so
        # each keyword's text occupies exactly one line of the output file.
        text = re.sub(r"\s+", " ", soup.get_text())
        # Output line format: "<keyword> <first MAX_CHARS chars of text>".
        out.write(kw + " " + text[:MAX_CHARS] + "\n")

print("wiki done")