Spaces:
Runtime error
Runtime error
Update patent_downloader.py
Browse files
- patent_downloader.py +10 -35
patent_downloader.py
CHANGED
|
@@ -13,7 +13,6 @@ from selenium import webdriver
|
|
| 13 |
from selenium.webdriver.common.keys import Keys
|
| 14 |
from selenium.webdriver.chrome.service import Service
|
| 15 |
from selenium.webdriver.chrome.options import Options
|
| 16 |
-
from bs4 import BeautifulSoup
|
| 17 |
from selenium.webdriver.common.by import By
|
| 18 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 19 |
from selenium.webdriver.support import expected_conditions as EC
|
|
@@ -91,7 +90,6 @@ class PatentDownloader:
|
|
| 91 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
| 92 |
|
| 93 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
| 94 |
-
pdf_link = None
|
| 95 |
|
| 96 |
try:
|
| 97 |
print(f"Navigating to Google Patents...")
|
|
@@ -104,27 +102,24 @@ class PatentDownloader:
|
|
| 104 |
search_input.send_keys(patent)
|
| 105 |
search_input.send_keys(Keys.RETURN)
|
| 106 |
|
| 107 |
-
print("Waiting for the
|
| 108 |
-
|
| 109 |
-
|
|
|
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
if not pdf_link:
|
| 115 |
-
raise FileNotFoundError("Could not find a valid PDF link.")
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
|
| 119 |
|
| 120 |
except Exception as e:
|
| 121 |
print(f"An error occurred: {e}")
|
|
|
|
| 122 |
finally:
|
| 123 |
driver.quit()
|
| 124 |
|
| 125 |
-
# Download and validate the PDF
|
| 126 |
-
self.download_and_validate_pdf(pdf_link, file_path)
|
| 127 |
-
|
| 128 |
def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
|
| 129 |
"""
|
| 130 |
Download multiple patent PDFs.
|
|
@@ -142,26 +137,6 @@ class PatentDownloader:
|
|
| 142 |
print(f"Downloading patent: {patent}")
|
| 143 |
self.get_pdf(patent, output_path, waiting_time)
|
| 144 |
|
| 145 |
-
@staticmethod
|
| 146 |
-
def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
|
| 147 |
-
"""
|
| 148 |
-
Extract the PDF link from the parsed HTML.
|
| 149 |
-
Prioritizes the 'Download PDF' button and storage links.
|
| 150 |
-
"""
|
| 151 |
-
# Look for "Download PDF" button
|
| 152 |
-
download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
|
| 153 |
-
if download_button and download_button.get("href"):
|
| 154 |
-
return download_button["href"]
|
| 155 |
-
|
| 156 |
-
# Fallback: Search for links containing 'patentimages.storage.googleapis.com'
|
| 157 |
-
pdf_links = [link['href'] for link in soup.find_all('a', href=True)
|
| 158 |
-
if 'patentimages.storage.googleapis.com' in link['href']]
|
| 159 |
-
if pdf_links:
|
| 160 |
-
print(f"Found PDF link: {pdf_links[0]}")
|
| 161 |
-
return pdf_links[0]
|
| 162 |
-
|
| 163 |
-
return None
|
| 164 |
-
|
| 165 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
|
| 166 |
"""
|
| 167 |
Download the PDF and validate its integrity.
|
|
|
|
| 13 |
from selenium.webdriver.common.keys import Keys
|
| 14 |
from selenium.webdriver.chrome.service import Service
|
| 15 |
from selenium.webdriver.chrome.options import Options
|
|
|
|
| 16 |
from selenium.webdriver.common.by import By
|
| 17 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 18 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
| 90 |
driver = webdriver.Chrome(service=service, options=chrome_options)
|
| 91 |
|
| 92 |
file_path = os.path.join(output_path, f"{patent}.pdf")
|
|
|
|
| 93 |
|
| 94 |
try:
|
| 95 |
print(f"Navigating to Google Patents...")
|
|
|
|
| 102 |
search_input.send_keys(patent)
|
| 103 |
search_input.send_keys(Keys.RETURN)
|
| 104 |
|
| 105 |
+
print("Waiting for the 'Download PDF' button...")
|
| 106 |
+
pdf_button_xpath = "//a[contains(text(),'Download PDF')]"
|
| 107 |
+
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, pdf_button_xpath)))
|
| 108 |
+
pdf_link_element = driver.find_element(By.XPATH, pdf_button_xpath)
|
| 109 |
|
| 110 |
+
# Extract the PDF link
|
| 111 |
+
pdf_link = pdf_link_element.get_attribute("href")
|
| 112 |
+
print(f"PDF link found: {pdf_link}")
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
# Download and validate the PDF
|
| 115 |
+
self.download_and_validate_pdf(pdf_link, file_path)
|
| 116 |
|
| 117 |
except Exception as e:
|
| 118 |
print(f"An error occurred: {e}")
|
| 119 |
+
raise FileNotFoundError("Could not find a valid PDF link.")
|
| 120 |
finally:
|
| 121 |
driver.quit()
|
| 122 |
|
|
|
|
|
|
|
|
|
|
| 123 |
def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
|
| 124 |
"""
|
| 125 |
Download multiple patent PDFs.
|
|
|
|
| 137 |
print(f"Downloading patent: {patent}")
|
| 138 |
self.get_pdf(patent, output_path, waiting_time)
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def download_and_validate_pdf(self, pdf_link: str, file_path: str):
|
| 141 |
"""
|
| 142 |
Download the PDF and validate its integrity.
|