"""Scrape random AoPS contest problems (AMC 12 / AIME) and render them to PDF."""

from bs4 import BeautifulSoup
import requests
import numpy as np
from datetime import datetime
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
import io
from PIL import Image


def gen_link():
    """Return the URL of a random AoPS wiki problem page (AMC 12 or AIME).

    A coin flip picks the contest family; the problem number is drawn from a
    truncated normal distribution so mid-to-hard problems are favoured.
    """
    if np.random.choice([True, False]):
        # AMC 12 branch.
        np.random.seed()  # reseed from OS entropy each call (original behavior)
        year = np.random.randint(2015, 2023)  # upper bound exclusive: 2015-2022
        AB = np.random.choice(['A', 'B'])
        # Sample problem numbers ~N(18, 5), keep only 10..25 (the harder half).
        mu, sigma = 18, 5
        s = np.round(np.random.normal(mu, sigma, 1000))
        s = s[(s >= 10) & (s <= 25)]
        q = int(np.random.choice(s))
        link = (f'https://artofproblemsolving.com/wiki/index.php/'
                f'{year}_AMC_12{AB}_Problems/Problem_{q}')
    else:
        # AIME branch.
        np.random.seed()
        year = np.random.randint(2005, 2023)  # 2005-2022
        I = np.random.choice(['I', 'II'])
        # Problem numbers ~N(6, 4), clipped to the valid range 1..15.
        mu, sigma = 6, 4
        s = np.round(np.random.normal(mu, sigma, 1000))
        s = s[(s >= 1) & (s <= 15)]
        q = int(np.random.choice(s))
        link = (f'https://artofproblemsolving.com/wiki/index.php/'
                f'{year}_AIME_{I}_Problems/Problem_{q}')
    return link


def convert_to_renderable_html(text):
    """Make protocol-relative AoPS LaTeX image URLs absolute so they render
    outside a browser context."""
    return text.replace('//latex.artofproblemsolving.com',
                        'https://latex.artofproblemsolving.com')


def get_problem(url):
    """Fetch *url* and return its 'Problem' section as an HTML string.

    Collects every <p> sibling after the 'Problem' headline up to the next
    <h2> section heading. Returns None (after printing a notice) when the
    page has no 'Problem' headline.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    # timeout added: a bare requests.get can hang indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    problem_headline = soup.find('span', {'class': 'mw-headline', 'id': 'Problem'})
    if not problem_headline:
        print("No problem found")
        return None
    problem_content = []
    for sibling in problem_headline.parent.find_next_siblings():
        if sibling.name == 'h2':
            break  # reached the next section ("Solution", etc.)
        if sibling.name == 'p':
            problem_content.append(convert_to_renderable_html(str(sibling)))
    return " ".join(problem_content)


def gen_html(num):
    """Collect up to *num* scraped problems as a single HTML string.

    Each problem is preceded by an anchor back to its source page. Gives up
    after 21 attempts so a run of scrape failures cannot loop forever.
    """
    all_q = ""
    num_tried = 0
    num_succ = 0
    while True:
        try:
            link = gen_link()
            print(link)
            # Bug fix: the original built f'to link' with no placeholder, so
            # no <a> tag was ever emitted even though create_pdf/
            # generate_pdf_content explicitly handle anchors. Emit a real one.
            hype = f'<a href="{link}">to link</a>'
            qhtml = get_problem(link)
            # Explicit None check instead of relying on a TypeError from
            # `str + None` to reach the except-clause.
            if qhtml:
                all_q += hype + qhtml
                num_succ += 1
        except Exception as e:
            print(f"Error: {e}")
        num_tried += 1
        if num_succ >= num or num_tried > 20:
            break
    all_q = f'''
{all_q} '''
    return all_q


def generate_pdf_content(html_content):
    """Parse *html_content* into a flat list of drawable elements.

    Returns tuples of the forms:
      ('text', str), ('link', href, text), ('image', PIL.Image.Image)
    Images are downloaded eagerly from their src URLs.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    elements = []
    for tag in soup.find_all(['p', 'a', 'img']):
        if tag.name == 'p':
            elements.append(('text', tag.get_text()))
        elif tag.name == 'a':
            elements.append(('link', tag.get('href'), tag.get_text()))
        elif tag.name == 'img':
            img_url = tag.get('src')
            # timeout added: a bare requests.get can hang indefinitely.
            response = requests.get(img_url, timeout=30)
            img = Image.open(io.BytesIO(response.content))
            elements.append(('image', img))
    return elements


def create_pdf(filename, content):
    """Render *content* (as produced by generate_pdf_content) to a
    letter-size PDF at *filename*, flowing top-to-bottom with page breaks."""
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    y = height - 40  # cursor starts just below the top margin
    for elem in content:
        # Bug fix: the original only checked for overflow *after* drawing,
        # so tall elements (images) could be rendered below the bottom
        # margin. Reserve the needed vertical space before drawing.
        needed = elem[1].size[1] + 20 if elem[0] == 'image' else 20
        if y - needed < 50:
            c.showPage()
            y = height - 40
        if elem[0] == 'text':
            c.drawString(30, y, elem[1])
            y -= 20
        elif elem[0] == 'link':
            c.drawString(30, y, f'{elem[2]}: {elem[1]}')
            y -= 20
        elif elem[0] == 'image':
            img = elem[1]
            img_reader = ImageReader(img)
            c.drawImage(img_reader, 30, y - img.size[1],
                        width=img.size[0], height=img.size[1])
            y -= img.size[1] + 20
    c.save()


def convert_html_to_pdf(html_content, output_filename):
    """Parse *html_content* and write its elements to *output_filename* as PDF."""
    content = generate_pdf_content(html_content)
    create_pdf(output_filename, content)