Spaces:

debaghtk
/

patent-ai

Runtime error

patent-ai / download_script.py

download 1000 patents

98c6817 over 2 years ago

1.05 kB

	import os
	import requests
	import concurrent.futures

	# First and last patent numbers to fetch; the download loop includes both ends.
	start_patent = 4000000
	end_patent = 11600000

	# Create the output directory. exist_ok=True makes this a no-op when the
	# directory is already there and avoids the check-then-create race of
	# testing os.path.exists() first.
	os.makedirs("data", exist_ok=True)

	# Define a function to download a single page
	def download_page(patent_number):
	    """Download one Justia patent page and save it as data/<number>.html.

	    Args:
	        patent_number: Patent number interpolated into the Justia URL and
	            into the output filename.

	    Returns:
	        None. Progress and failures are reported with print(); nothing is
	        written to disk when the request fails or returns an error status.
	    """
	    url = f"https://patents.justia.com/patent/{patent_number}"
	    try:
	        # Without a timeout a stalled connection would block this worker
	        # thread indefinitely.
	        response = requests.get(url, timeout=30)
	    except requests.RequestException as exc:
	        # Report network failures here; an exception raised in a worker
	        # would otherwise sit unread inside its Future.
	        print(f"Skipping {url} ({exc})")
	        return
	    if response.status_code == 404:
	        print(f"Skipping {url} (404 Not Found)")
	        return
	    if not response.ok:
	        # Do not save error pages (rate limiting, server errors) as if they
	        # were patent HTML.
	        print(f"Skipping {url} (HTTP {response.status_code})")
	        return
	    filename = f"data/{patent_number}.html"
	    with open(filename, "w", encoding="utf-8") as f:
	        f.write(response.text)
	    print(f"Downloaded {url} to {filename}")

	# Create a thread pool to download pages in parallel
	with concurrent.futures.ThreadPoolExecutor() as executor:
	    # Submit work in fixed-size batches: queuing one Future per patent for
	    # the whole range up front would keep millions of Future objects alive
	    # at once.
	    batch_size = 1000
	    for batch_start in range(start_patent, end_patent + 1, batch_size):
	        # end_patent is inclusive, so the final batch stops at end_patent + 1.
	        batch_stop = min(batch_start + batch_size, end_patent + 1)
	        futures = {
	            executor.submit(download_page, patent_number): patent_number
	            for patent_number in range(batch_start, batch_stop)
	        }
	        # Drain the batch before submitting the next one, and report any
	        # exception a worker raised instead of leaving it unread in the Future.
	        for future in concurrent.futures.as_completed(futures):
	            error = future.exception()
	            if error is not None:
	                print(f"Failed to download patent {futures[future]}: {error}")