# NOTE: captured from a Hugging Face Spaces page ("Spaces: Running" status chrome removed).
| from bs4 import BeautifulSoup | |
| import requests | |
| from selenium.webdriver import Chrome | |
| from selenium.webdriver.chrome.options import Options | |
| import os | |
# Directory of this file; the scrape cache lives beside it.
dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = f"{dir_path}/prescraped/usaco/"
# IDs (file names) already present in the on-disk caches.
scraped_problems = os.listdir(f"{SAVE_PATH}Problems")
scraped_editorials = os.listdir(f"{SAVE_PATH}Editorials")
def anti_scrape(soup):
    """Defeat the USACO "Just a moment..." RCPC cookie challenge.

    If *soup* is the anti-scraping interstitial, recover the AES
    key / IV / ciphertext embedded in the page's last <script> tag,
    decrypt the RCPC cookie value, re-request the real page with that
    cookie set, and return a soup of the real page.  If *soup* is not
    the interstitial it is returned unchanged.

    Bug fix: the original rebound the local name ``soup`` and returned
    nothing, so callers never received the bypassed page.
    """
    if soup.text != "Just a moment...Enable JavaScript and cookies to continue":
        return soup
    print("Bypassing anti-scrap protection...")
    scr = soup.find_all("script")[-1].string
    scr = scr[scr.index("var a=toNumbers"):].split(';')
    # The first statement holds three toNumbers("<hex>") calls: key, IV, ciphertext.
    line = scr[0]
    abc = []
    while "toNumbers" in line:
        line = line[line.index("toNumbers") + 11:]  # skip past 'toNumbers("'
        abc.append(line[:line.index('"')])
    # PyCryptodome is only needed on this rare code path, so import lazily.
    from Crypto.Cipher import AES

    def to_numbers(hex_str):
        # Hex string -> raw bytes, two hex digits per byte.
        return bytes(int(hex_str[i:i + 2], 16) for i in range(0, len(hex_str), 2))

    key, iv, cipher = map(to_numbers, abc)
    rcpc = AES.new(key, AES.MODE_CBC, iv).decrypt(cipher).hex()
    print(f"RCPC = {rcpc}")
    # The reload URL is quoted inside the second-to-last statement.
    url = scr[-2]
    url = url[url.index('"') + 1:-1]
    r = requests.get(url, cookies={"RCPC": rcpc})
    return BeautifulSoup(r.text, "html.parser")
def read(file_path):
    """Return the full contents of the text file at *file_path*."""
    with open(file_path, 'r') as f:
        return f.read()
def from_url(url):
    """Return the final path component of *url*, used as a cache file ID."""
    return url.rsplit('/', 1)[-1]
def problem(url):
    """Fetch (or load from the on-disk cache) a USACO problem statement.

    Returns ``{"statement": <text>}``.  Statements are cached under
    ``SAVE_PATH/Problems/<pid>``; a non-empty cached file short-circuits
    the network fetch.  Sample input/output is stripped from the text,
    but a trailing SCORING section (when present) is preserved.
    """
    pid = from_url(url)
    if pid in scraped_problems:
        statement = read(SAVE_PATH + "Problems/" + pid)
        if statement:  # empty cache file => earlier scrape failed; re-fetch
            return {"statement": statement}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    soup = soup.find_all(class_='problem-text')[0]
    while soup.pre is not None:  # removes all code / sample-data blocks
        soup.pre.decompose()
    prob = soup.text
    # Cut everything from the first "SAMPLE INPUT" onward, then splice the
    # SCORING section back in if one followed the samples.  The original
    # reassembled unconditionally, which injected a spurious "SCORING:"
    # marker (and leftover sample text) when either heading was missing.
    head, sep, tail = prob.partition("SAMPLE INPUT")
    if sep:
        scoring = tail.rsplit("SCORING:", 1)
        prob = (head + "SCORING:" + scoring[-1]) if len(scoring) > 1 else head
    with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
        f.write(prob)
    scraped_problems.append(pid)
    return {"statement": prob}
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Fetch (or load from the on-disk cache) a USACO editorial as text.

    Only <p> tags that are direct children of <body> are kept, after all
    <pre> (code) blocks are removed.  The result is cached under
    ``SAVE_PATH/Editorials/<pid>`` and returned as a single string.

    ``bot`` / ``query_func`` exist for a currently disabled
    post-processing hook (see the commented call below).

    TODO: Fix random line breaks in the scrapes.
    """
    pid = from_url(edi_url)
    if pid in scraped_editorials:
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if edi:  # empty cache file => earlier scrape failed; re-fetch
            return edi
    response = requests.get(edi_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    while soup.pre is not None:  # removes all code
        soup.pre.decompose()
    paragraphs = []
    for tag in soup.find_all('p'):
        # Nested <p> tags (sidebars, footers, ...) are not editorial prose.
        if tag.parent.name != 'body':
            continue
        paragraphs.append(tag.text)
    edi = '\n'.join(paragraphs)
    # Disabled LLM post-processing hook:
    # if bot:
    #     edi = bot.chat(query_func(problem(prob_url), edi))
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
# Example usage (note: editorial() requires both the problem URL and the editorial URL):
# print(editorial('https://usaco.org/index.php?page=viewproblem2&cpid=1428', 'https://usaco.org/current/data/sol_prob2_platinum_open24.html'))
# print(problem('https://usaco.org/index.php?page=viewproblem2&cpid=1428')['statement'])