Spaces:
Runtime error
Runtime error
| # This module pulls the MPEP data from the internet | |
| # Saves each chapter as a separate document | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| import markdownify | |
| from tqdm import tqdm | |
# Scrape the MPEP chapter index pages (0100, 0200, ..., 2900) from uspto.gov
# and save every section linked from each index as Markdown text in
# data/<chapter>/<section href>.txt.
BASE_URL = "https://www.uspto.gov/web/offices/pac/mpep"
REQUEST_TIMEOUT = 30  # seconds; requests blocks indefinitely without a timeout


def _fetch_soup(url):
    """Return the parsed HTML at *url*, or None if the request fails.

    Failures (connection errors, timeouts, non-2xx responses) are reported
    through ``tqdm.write`` so the progress bars are not corrupted, and the
    caller can skip the page instead of aborting the whole scrape.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        tqdm.write(f"Skipping {url}: {exc}")
        return None
    return BeautifulSoup(response.content, "html.parser")


chapter_numbers = list(range(100, 3000, 100))
for chapter in tqdm(chapter_numbers, desc=" outer", position=0):
    chapter_url = f"{BASE_URL}/mpep-{chapter:04}.html"  # zero-pad to four digits
    chapter_soup = _fetch_soup(chapter_url)
    if chapter_soup is None:
        continue

    # The chapter's table of contents is expected inside <div id="article">.
    article = chapter_soup.find("div", {"id": "article"})
    if article is None:
        tqdm.write(f"Skipping {chapter_url}: no <div id='article'> found")
        continue

    # makedirs creates the parent "data" directory when needed and does not
    # fail when the script is re-run over an existing output tree.
    chapter_dir = f"data/{chapter:04}"
    os.makedirs(chapter_dir, exist_ok=True)

    sections = article.find_all("li")
    for section in tqdm(sections, desc=" inner loop", position=1, leave=False):
        anchor = section.find("a")
        section_number = anchor.get("href") if anchor is not None else None
        if not section_number:
            # List item without a usable link: nothing to download.
            continue

        section_soup = _fetch_soup(f"{BASE_URL}/{section_number}")
        if section_soup is None:
            continue

        # find_all returns a list-like ResultSet; stringifying it directly
        # would wrap the HTML in "[...]" and join the elements with ", ", so
        # concatenate the individual elements instead.
        section_divs = section_soup.find_all("div", class_="Section")
        html = "".join(str(section_div) for section_div in section_divs)
        h = markdownify.markdownify(html, heading_style="ATX")

        filename = f"{chapter_dir}/{section_number}.txt"
        # Explicit UTF-8: the text contains non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(filename, "w", encoding="utf-8") as file:
            file.write(h)