Spaces:

DrishtiSharma
/

chat-w-google-patents

Runtime error

App Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

26ad68f

verified ·

1 Parent(s): 6d7c7fc

Update patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +10 -35

patent_downloader.py CHANGED Viewed

@@ -13,7 +13,6 @@ from selenium import webdriver
 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
@@ -91,7 +90,6 @@ class PatentDownloader:
         driver = webdriver.Chrome(service=service, options=chrome_options)
         file_path = os.path.join(output_path, f"{patent}.pdf")
-        pdf_link = None
         try:
             print(f"Navigating to Google Patents...")
@@ -104,27 +102,24 @@ class PatentDownloader:
             search_input.send_keys(patent)
             search_input.send_keys(Keys.RETURN)
-            print("Waiting for the results page...")
-            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
-            time.sleep(waiting_time)
-            print("Extracting PDF link...")
-            soup = BeautifulSoup(driver.page_source, "html.parser")
-            pdf_link = self.get_pdf_link(soup)
-            if not pdf_link:
-                raise FileNotFoundError("Could not find a valid PDF link.")
-            # Handle absolute PDF link
-            print(f"PDF link extracted: {pdf_link}")
         except Exception as e:
             print(f"An error occurred: {e}")
         finally:
             driver.quit()
-        # Download and validate the PDF
-        self.download_and_validate_pdf(pdf_link, file_path)
     def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
         """
         Download multiple patent PDFs.
@@ -142,26 +137,6 @@ class PatentDownloader:
             print(f"Downloading patent: {patent}")
             self.get_pdf(patent, output_path, waiting_time)
-    @staticmethod
-    def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
-        """
-        Extract the PDF link from the parsed HTML.
-        Prioritizes the 'Download PDF' button and storage links.
-        """
-        # Look for "Download PDF" button
-        download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
-        if download_button and download_button.get("href"):
-            return download_button["href"]
-        # Fallback: Search for links containing 'patentimages.storage.googleapis.com'
-        pdf_links = [link['href'] for link in soup.find_all('a', href=True)
-                     if 'patentimages.storage.googleapis.com' in link['href']]
-        if pdf_links:
-            print(f"Found PDF link: {pdf_links[0]}")
-            return pdf_links[0]
-        return None
     def download_and_validate_pdf(self, pdf_link: str, file_path: str):
         """
         Download the PDF and validate its integrity.

 from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
         driver = webdriver.Chrome(service=service, options=chrome_options)
         file_path = os.path.join(output_path, f"{patent}.pdf")
         try:
             print(f"Navigating to Google Patents...")
             search_input.send_keys(patent)
             search_input.send_keys(Keys.RETURN)
+            print("Waiting for the 'Download PDF' button...")
+            pdf_button_xpath = "//a[contains(text(),'Download PDF')]"
+            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, pdf_button_xpath)))
+            pdf_link_element = driver.find_element(By.XPATH, pdf_button_xpath)
+            # Extract the PDF link
+            pdf_link = pdf_link_element.get_attribute("href")
+            print(f"PDF link found: {pdf_link}")
+            # Download and validate the PDF
+            self.download_and_validate_pdf(pdf_link, file_path)
         except Exception as e:
             print(f"An error occurred: {e}")
+            raise FileNotFoundError("Could not find a valid PDF link.")
         finally:
             driver.quit()
     def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
         """
         Download multiple patent PDFs.
             print(f"Downloading patent: {patent}")
             self.get_pdf(patent, output_path, waiting_time)
     def download_and_validate_pdf(self, pdf_link: str, file_path: str):
         """
         Download the PDF and validate its integrity.