# NOTE: captured from a Hugging Face Spaces page ("Spaces: Running" status chrome removed).
| from bs4 import BeautifulSoup | |
| import requests | |
| from selenium.webdriver import Chrome | |
| from selenium.webdriver.chrome.options import Options | |
| import os | |
# Directory of this file; the scrape cache lives beside it.
dir_path = os.path.dirname(os.path.realpath(__file__))
SAVE_PATH = f"{dir_path}/prescraped/usaco/"
# IDs (file names) already present in the on-disk caches.
scraped_problems = os.listdir(f"{SAVE_PATH}Problems")
scraped_editorials = os.listdir(f"{SAVE_PATH}Editorials")
def anti_scrape(soup):
    """Defeat the USACO "Just a moment..." RCPC cookie challenge.

    If *soup* is the anti-scraping interstitial, recover the AES
    key / IV / ciphertext embedded in the page's last <script> tag,
    decrypt the RCPC cookie value, re-request the real page with that
    cookie set, and return a soup of the real page.  If *soup* is not
    the interstitial it is returned unchanged.

    Bug fix: the original rebound the local name ``soup`` and returned
    nothing, so callers never received the bypassed page.
    """
    if soup.text != "Just a moment...Enable JavaScript and cookies to continue":
        return soup
    print("Bypassing anti-scrap protection...")
    scr = soup.find_all("script")[-1].string
    scr = scr[scr.index("var a=toNumbers"):].split(';')
    # The first statement holds three toNumbers("<hex>") calls: key, IV, ciphertext.
    line = scr[0]
    abc = []
    while "toNumbers" in line:
        line = line[line.index("toNumbers") + 11:]  # skip past 'toNumbers("'
        abc.append(line[:line.index('"')])
    # PyCryptodome is only needed on this rare code path, so import lazily.
    from Crypto.Cipher import AES

    def to_numbers(hex_str):
        # Hex string -> raw bytes, two hex digits per byte.
        return bytes(int(hex_str[i:i + 2], 16) for i in range(0, len(hex_str), 2))

    key, iv, cipher = map(to_numbers, abc)
    rcpc = AES.new(key, AES.MODE_CBC, iv).decrypt(cipher).hex()
    print(f"RCPC = {rcpc}")
    # The reload URL is quoted inside the second-to-last statement.
    url = scr[-2]
    url = url[url.index('"') + 1:-1]
    r = requests.get(url, cookies={"RCPC": rcpc})
    return BeautifulSoup(r.text, "html.parser")
def read(file_path):
    """Return the full contents of the text file at *file_path*."""
    with open(file_path, 'r') as f:
        return f.read()
def from_url(url):
    """Return the final path component of *url*, used as a cache file ID."""
    return url.rsplit('/', 1)[-1]
def problem(url):
    """Fetch (or load from the on-disk cache) a USACO problem statement.

    Returns ``{"statement": <text>}``.  Statements are cached under
    ``SAVE_PATH/Problems/<pid>``; a non-empty cached file short-circuits
    the network fetch.  Sample input/output is stripped from the text,
    but a trailing SCORING section (when present) is preserved.
    """
    pid = from_url(url)
    if pid in scraped_problems:
        statement = read(SAVE_PATH + "Problems/" + pid)
        if statement:  # empty cache file => earlier scrape failed; re-fetch
            return {"statement": statement}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    soup = soup.find_all(class_='problem-text')[0]
    while soup.pre is not None:  # removes all code / sample-data blocks
        soup.pre.decompose()
    prob = soup.text
    # Cut everything from the first "SAMPLE INPUT" onward, then splice the
    # SCORING section back in if one followed the samples.  The original
    # reassembled unconditionally, which injected a spurious "SCORING:"
    # marker (and leftover sample text) when either heading was missing.
    head, sep, tail = prob.partition("SAMPLE INPUT")
    if sep:
        scoring = tail.rsplit("SCORING:", 1)
        prob = (head + "SCORING:" + scoring[-1]) if len(scoring) > 1 else head
    with open(SAVE_PATH + 'Problems/' + pid, 'w') as f:
        f.write(prob)
    scraped_problems.append(pid)
    return {"statement": prob}
def editorial(prob_url, edi_url, bot=None, query_func=None):
    """Fetch (or load from the on-disk cache) a USACO editorial as text.

    Only <p> tags that are direct children of <body> are kept, after all
    <pre> (code) blocks are removed.  The result is cached under
    ``SAVE_PATH/Editorials/<pid>`` and returned as a single string.

    ``bot`` / ``query_func`` exist for a currently disabled
    post-processing hook (see the commented call below).

    TODO: Fix random line breaks in the scrapes.
    """
    pid = from_url(edi_url)
    if pid in scraped_editorials:
        edi = read(SAVE_PATH + "Editorials/" + pid)
        if edi:  # empty cache file => earlier scrape failed; re-fetch
            return edi
    response = requests.get(edi_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    while soup.pre is not None:  # removes all code
        soup.pre.decompose()
    paragraphs = []
    for tag in soup.find_all('p'):
        # Nested <p> tags (sidebars, footers, ...) are not editorial prose.
        if tag.parent.name != 'body':
            continue
        paragraphs.append(tag.text)
    edi = '\n'.join(paragraphs)
    # Disabled LLM post-processing hook:
    # if bot:
    #     edi = bot.chat(query_func(problem(prob_url), edi))
    with open(SAVE_PATH + 'Editorials/' + pid, 'w') as f:
        f.write(edi)
    scraped_editorials.append(pid)
    return edi
# Example usage (note: editorial() requires both the problem URL and the editorial URL):
# print(editorial('https://usaco.org/index.php?page=viewproblem2&cpid=1428', 'https://usaco.org/current/data/sol_prob2_platinum_open24.html'))
# print(problem('https://usaco.org/index.php?page=viewproblem2&cpid=1428')['statement'])