Spaces:

cyberspyde
/

chatbot-team4

Sleeping

chatbot-team4 / utils /scrape_JBNU_FOCUS.py

Cyberspyde

Final Update

ce24d59 over 2 years ago

702 Bytes

	import requests, re
	from bs4 import BeautifulSoup

	def scrape_page(url):
	    """Download a page and return its text collapsed onto a single line.

	    The response body is parsed with BeautifulSoup's "html.parser"; the
	    extracted text is stripped of surrounding whitespace and every newline
	    character is removed. Non-empty text is also echoed to stdout.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The extracted text, or None when the page yields no text.

	    Raises:
	        requests.RequestException: If the request fails or times out.
	    """
	    # Without a timeout a single unresponsive server would block forever.
	    response = requests.get(url, timeout=30)
	    soup = BeautifulSoup(response.content, "html.parser")
	    text = soup.get_text().strip().replace("\n", "")

	    if not text:
	        # Callers test for None to decide whether there is anything to save.
	        return None

	    print(text)
	    return text

	def scrape_recursive(url, output_file):
	    """Scrape one page and save its text to ``output_file`` as UTF-8.

	    Despite the name, no links are followed: exactly one call to
	    ``scrape_page`` is made. Nothing is written when that call returns None.

	    Args:
	        url: Address of the page to scrape.
	        output_file: Path of the file to create or overwrite with the text.
	    """
	    import os  # local import: only needed for the directory creation below

	    text = scrape_page(url)
	    if text is None:
	        return

	    # Create the destination directory on demand so that a missing folder
	    # does not abort the run with FileNotFoundError.
	    parent = os.path.dirname(output_file)
	    if parent:
	        os.makedirs(parent, exist_ok=True)

	    with open(output_file, "w", encoding="utf-8") as f:
	        f.write(text)


	# Base address; the page number is appended as the value of the trailing
	# "no=" query parameter.
	url = "https://www.jbnu.ac.kr/eng/?menuID=350&mode=view&no="

	# Scrape pages 1 through 319, writing each one to its own numbered file.
	for k in range(1, 320):
	    scrape_recursive(
	        "{}{}".format(url, k),
	        "data/output{}.txt".format(k),
	    )