| import os |
| import requests |
| import concurrent.futures |
|
|
| |
# Inclusive range of patent numbers to fetch; both endpoints are requested.
start_patent = 4000000
end_patent = 11600000

# Output directory for the saved HTML pages. exist_ok=True replaces the
# check-then-create sequence, which could raise FileExistsError if the
# directory appeared between the check and the makedirs call.
os.makedirs("data", exist_ok=True)
|
|
| |
def download_page(patent_number):
    """Fetch one patent page from patents.justia.com and save it to disk.

    The page is written to ``data/<patent_number>.html`` as UTF-8 text. The
    ``data`` directory must already exist. A progress line is printed for
    every outcome (downloaded, skipped, or failed).

    Args:
        patent_number: Number interpolated into both the URL and the output
            file name.

    Returns:
        None. Nothing is written when the request fails or the server
        answers with an error status.
    """
    url = f"https://patents.justia.com/patent/{patent_number}"
    try:
        # Without a timeout a stalled connection would block this worker
        # thread indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Network-level failure (DNS, connection reset, timeout, ...): report
        # it and move on instead of letting the worker die silently.
        print(f"Failed {url} ({exc})")
        return
    if response.status_code == 404:
        print(f"Skipping {url} (404 Not Found)")
        return
    if not response.ok:
        # Other 4xx/5xx answers (rate limiting, server errors) carry an error
        # body, not the patent page, so they must not be saved as data.
        print(f"Skipping {url} (HTTP {response.status_code})")
        return
    filename = f"data/{patent_number}.html"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Downloaded {url} to {filename}")
|
|
| |
# Upper bound on the number of submitted-but-unfinished downloads. Submitting
# the whole patent range up front would allocate millions of Future objects
# before any work completes.
max_pending_downloads = 1000


def _report_failures(finished):
    """Print the exception of every future in *finished* that raised."""
    for future in finished:
        error = future.exception()
        if error is not None:
            print(f"Download task failed: {error!r}")


with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = set()
    for patent_number in range(start_patent, end_patent + 1):
        pending.add(executor.submit(download_page, patent_number))
        if len(pending) < max_pending_downloads:
            continue
        # Window is full: block until at least one download finishes, then
        # keep only the still-running futures.
        finished, pending = concurrent.futures.wait(
            pending, return_when=concurrent.futures.FIRST_COMPLETED
        )
        _report_failures(finished)

    # Drain whatever is still in flight after the last submission.
    finished, pending = concurrent.futures.wait(pending)
    _report_failures(finished)