Spaces:

DrishtiSharma
/

chat-w-google-patents

Runtime error

App Files Files Community

chat-w-google-patents / patent_downloader.py

DrishtiSharma

Create patent_downloader.py

a417f74 verified 12 months ago

raw

history blame

4.48 kB

	from typing import List, Union, Optional
	import os
	import requests
	import re
	import time
	import shutil
	import subprocess
	import pandas as pd
	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	from bs4 import BeautifulSoup
	import chromedriver_autoinstaller

	class PatentDownloader:
	    """
	    Download patent documents as PDF files from Google Patents.

	    A headless Chrome session (driven through Selenium) submits the patent
	    number to the search box on ``url``; the resulting page is scanned for a
	    link ending in "pdf" that contains the patent number, and that link is
	    fetched with ``requests``.
	    """

	    url = "https://patents.google.com"

	    # Timeout in seconds handed to ``requests.get`` for the PDF download, so
	    # a stalled connection cannot block the caller forever.
	    request_timeout = 60

	    # Column preferred when patent numbers are read from a CSV file; the
	    # first column is used when it is absent.
	    # NOTE(review): the column name is an assumption -- confirm against the
	    # CSV files callers actually pass in.
	    csv_column = "patent_number"

	    def __init__(self, verbose: bool = False):
	        """
	        Parameters
	        ----------
	        verbose : bool
	            Print additional debug information.
	        """
	        self.verbose = verbose
	        self.chrome_path = self.install_chrome()

	    def install_chrome(self) -> str:
	        """
	        Locate Google Chrome, downloading and installing it when missing.

	        Returns
	        -------
	        str: Path to the Chrome binary as resolved through ``PATH``.

	        Raises
	        ------
	        subprocess.CalledProcessError
	            If ``wget`` or ``apt-get`` exits with a non-zero status.
	        ValueError
	            If Chrome still cannot be found after the installation.
	        """
	        chrome_path = shutil.which("google-chrome")
	        if chrome_path:
	            return chrome_path

	        print("Downloading and installing Google Chrome...")
	        deb_file = "chrome.deb"
	        try:
	            # Argument lists instead of shell strings: nothing is passed
	            # through a shell, and every step is checked on its own.
	            subprocess.run(
	                [
	                    "wget",
	                    "https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
	                    "-O",
	                    deb_file,
	                ],
	                check=True,
	            )
	            subprocess.run(["apt-get", "update"], check=True)
	            subprocess.run(
	                ["apt-get", "install", "-y", f"./{deb_file}"],
	                check=True,
	            )
	        finally:
	            # Remove the installer even when one of the steps above failed.
	            if os.path.exists(deb_file):
	                os.remove(deb_file)

	        chrome_path = shutil.which("google-chrome")
	        if not chrome_path:
	            raise ValueError("Google Chrome installation failed!")
	        return chrome_path

	    def download(self, patent: Union[str, List[str]], output_path: str = "./",
	                 waiting_time: int = 6, remove_kind_codes: Optional[List[str]] = None) -> None:
	        """
	        Download patent document(s) as PDF.

	        Parameters
	        ----------
	        patent : str or list of str
	            A single patent number, a list of patent numbers, or the path to
	            an existing file that lists patent numbers.
	        output_path : str
	            Directory the PDF files are written to.
	        waiting_time : int
	            Seconds to wait for the search result page before parsing it.
	        remove_kind_codes : list of str, optional
	            Suffixes removed from the end of each patent number.
	        """
	        if isinstance(patent, list) or os.path.isfile(patent):
	            self.get_pdfs(patent, output_path, waiting_time, remove_kind_codes)
	        else:
	            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)

	    def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./",
	                 waiting_time: int = 6, remove_kind_codes: Optional[List[str]] = None) -> None:
	        """
	        Download several patent PDFs, one after another.

	        Parameters
	        ----------
	        patents : list of str or str
	            Patent numbers, or the path to a ``.txt`` file (one number per
	            line) or ``.csv`` file containing them.
	        output_path : str
	            Directory the PDF files are written to.
	        waiting_time : int
	            Seconds to wait for each search result page.
	        remove_kind_codes : list of str, optional
	            Suffixes removed from the end of each patent number.

	        Raises
	        ------
	        ValueError
	            If ``patents`` is a path with an unsupported file extension.
	        """
	        if isinstance(patents, str):
	            patents = self._read_patent_file(patents)

	        total = len(patents)
	        for index, patent in enumerate(patents):
	            if self.verbose:
	                print(f"{total - index} patent(s) remaining.")
	            self.get_pdf(patent, output_path, waiting_time, remove_kind_codes)

	    def _read_patent_file(self, path: str) -> List[str]:
	        """
	        Read patent numbers from a ``.txt`` or ``.csv`` file.

	        Blank entries are skipped and surrounding whitespace is stripped.
	        """
	        suffix = os.path.splitext(path)[1].lower()
	        if suffix == ".csv":
	            # dtype=str keeps purely numeric patent numbers from being
	            # parsed as integers or floats.
	            frame = pd.read_csv(path, dtype=str)
	            if frame.shape[1] == 0:
	                return []
	            if self.csv_column in frame.columns:
	                column = self.csv_column
	            else:
	                column = frame.columns[0]
	            return [value.strip() for value in frame[column].dropna() if value.strip()]
	        if suffix == ".txt":
	            with open(path, "r", encoding="utf-8") as handle:
	                return [line.strip() for line in handle if line.strip()]
	        raise ValueError(f"Unsupported file type: {path}")

	    def get_pdf(self, patent: str, output_path: str = "./", waiting_time: int = 6,
	                remove_kind_codes: Optional[List[str]] = None) -> None:
	        """
	        Download a single patent PDF.

	        Parameters
	        ----------
	        patent : str
	            Patent number typed into the search box.
	        output_path : str
	            Directory the PDF is written to as ``<patent>.pdf``.
	        waiting_time : int
	            Seconds to wait after submitting the search before the page
	            source is parsed.
	        remove_kind_codes : list of str, optional
	            Each entry is used as a regular-expression pattern anchored at
	            the end of the patent number; a match is removed.
	        """
	        if remove_kind_codes:
	            for kind_code in remove_kind_codes:
	                patent = re.sub(kind_code + "$", "", patent)

	        # Automatically install ChromeDriver
	        chromedriver_autoinstaller.install()

	        # Set up Chrome options
	        chrome_options = Options()
	        chrome_options.binary_location = self.chrome_path
	        chrome_options.add_argument("--headless")
	        chrome_options.add_argument("--no-sandbox")
	        chrome_options.add_argument("--disable-dev-shm-usage")

	        # Initialize Selenium WebDriver
	        driver = webdriver.Chrome(service=Service(), options=chrome_options)
	        try:
	            # Navigation happens inside the try block so the browser is shut
	            # down even when the start page cannot be loaded.
	            driver.get(self.url)

	            # Search for the patent
	            search_box = driver.find_element("css selector", "input[type='search']")
	            search_box.send_keys(patent)
	            search_box.send_keys(Keys.RETURN)
	            time.sleep(waiting_time)

	            # Parse HTML while the session is still alive
	            soup = BeautifulSoup(driver.page_source, "html.parser")
	        finally:
	            driver.quit()

	        pdf_link = self.get_pdf_link(soup, patent)
	        if not pdf_link:
	            print(f"Error: PDF link for patent {patent} not found!")
	            return
	        if self.verbose:
	            print(f"PDF link for patent {patent}: {pdf_link}")

	        # Download the PDF
	        response = requests.get(pdf_link, timeout=self.request_timeout)
	        if not response.ok:
	            # Do not store an HTTP error page under a .pdf file name.
	            print(
	                f"Error: download of patent {patent} failed "
	                f"(HTTP status {response.status_code})!"
	            )
	            return

	        validate_directory(output_path)
	        with open(os.path.join(output_path, f"{patent}.pdf"), "wb") as file:
	            file.write(response.content)
	        print(f">>> Patent {patent} successfully downloaded <<<")

	    @staticmethod
	    def get_pdf_link(soup: "BeautifulSoup", patent: str) -> Optional[str]:
	        """
	        Extract the PDF link from parsed HTML.

	        Returns the ``href`` of the first anchor whose target ends in "pdf"
	        and contains the patent number (both compared case-insensitively),
	        or ``None`` when no anchor qualifies.
	        """
	        wanted = patent.lower()
	        for anchor in soup.find_all("a", href=True):
	            href = anchor["href"]
	            lowered = href.lower()
	            if lowered.endswith("pdf") and wanted in lowered:
	                return href
	        return None


	def validate_directory(directory: str) -> None:
	    """
	    Ensure the output directory exists.

	    Parameters
	    ----------
	    directory : str
	        Directory to create, including missing parent directories. An empty
	        string is left alone, because ``os.path.join("", name)`` already
	        resolves to the current working directory.

	    Raises
	    ------
	    FileExistsError
	        If ``directory`` exists but is not a directory.
	    """
	    if directory:
	        # exist_ok=True avoids the check-then-create race where another
	        # process creates the directory between the test and the call.
	        os.makedirs(directory, exist_ok=True)