# Source: patent-ai / download_script.py
# Uploaded by: debaghtk
# Commit message: download 1000 patents
# Commit: 98c6817
import os
import requests
import concurrent.futures
# Inclusive range of patent numbers to download
start_patent = 4000000
end_patent = 11600000

# Create the data directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs("data", exist_ok=True)
# Define a function to download a single page
def download_page(patent_number, timeout=30):
    """Download one patent page and save it as ``data/<patent_number>.html``.

    Nothing is written when the request fails or the server answers with
    an error status; a message is printed instead, so a bulk run keeps
    going and no error page is stored as if it were a patent.

    Args:
        patent_number: Number appended to the patents.justia.com URL and
            used as the output file name.
        timeout: Seconds passed to ``requests.get`` so a stalled
            connection cannot block the calling thread forever.

    Returns:
        None. The result is reported on stdout and, on success, written
        to disk. Network errors (``requests.RequestException``) are
        caught and reported rather than raised.
    """
    url = f"https://patents.justia.com/patent/{patent_number}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported and skipped.
        print(f"Failed to fetch {url}: {exc}")
        return

    if response.status_code == 404:
        print(f"Skipping {url} (404 Not Found)")
        return
    if not response.ok:
        # Other 4xx/5xx replies (rate limiting, server errors) carry an
        # error body, not a patent page, so they are not saved.
        print(f"Skipping {url} (HTTP {response.status_code})")
        return

    filename = f"data/{patent_number}.html"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Downloaded {url} to {filename}")
# Create a thread pool to download pages in parallel.
# The range covers roughly 7.6 million patent numbers, so tasks are submitted
# in bounded batches instead of creating one Future per number up front.
batch_size = 1000
with concurrent.futures.ThreadPoolExecutor() as executor:
    for batch_start in range(start_patent, end_patent + 1, batch_size):
        # end_patent is inclusive, hence the +1 on the exclusive upper bound.
        batch_stop = min(batch_start + batch_size, end_patent + 1)
        # Submit download tasks for each patent number in this batch
        futures = [
            executor.submit(download_page, patent_number)
            for patent_number in range(batch_start, batch_stop)
        ]
        # Wait for the batch to complete and report any task that raised;
        # concurrent.futures.wait() alone would drop those exceptions.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f"Download task failed: {error!r}")