DrishtiSharma committed on
Commit
26ad68f
·
verified ·
1 Parent(s): 6d7c7fc

Update patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +10 -35
patent_downloader.py CHANGED
@@ -13,7 +13,6 @@ from selenium import webdriver
13
  from selenium.webdriver.common.keys import Keys
14
  from selenium.webdriver.chrome.service import Service
15
  from selenium.webdriver.chrome.options import Options
16
- from bs4 import BeautifulSoup
17
  from selenium.webdriver.common.by import By
18
  from selenium.webdriver.support.ui import WebDriverWait
19
  from selenium.webdriver.support import expected_conditions as EC
@@ -91,7 +90,6 @@ class PatentDownloader:
91
  driver = webdriver.Chrome(service=service, options=chrome_options)
92
 
93
  file_path = os.path.join(output_path, f"{patent}.pdf")
94
- pdf_link = None
95
 
96
  try:
97
  print(f"Navigating to Google Patents...")
@@ -104,27 +102,24 @@ class PatentDownloader:
104
  search_input.send_keys(patent)
105
  search_input.send_keys(Keys.RETURN)
106
 
107
- print("Waiting for the results page...")
108
- WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
109
- time.sleep(waiting_time)
 
110
 
111
- print("Extracting PDF link...")
112
- soup = BeautifulSoup(driver.page_source, "html.parser")
113
- pdf_link = self.get_pdf_link(soup)
114
- if not pdf_link:
115
- raise FileNotFoundError("Could not find a valid PDF link.")
116
 
117
- # Handle absolute PDF link
118
- print(f"PDF link extracted: {pdf_link}")
119
 
120
  except Exception as e:
121
  print(f"An error occurred: {e}")
 
122
  finally:
123
  driver.quit()
124
 
125
- # Download and validate the PDF
126
- self.download_and_validate_pdf(pdf_link, file_path)
127
-
128
  def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
129
  """
130
  Download multiple patent PDFs.
@@ -142,26 +137,6 @@ class PatentDownloader:
142
  print(f"Downloading patent: {patent}")
143
  self.get_pdf(patent, output_path, waiting_time)
144
 
145
- @staticmethod
146
- def get_pdf_link(soup: BeautifulSoup) -> Optional[str]:
147
- """
148
- Extract the PDF link from the parsed HTML.
149
- Prioritizes the 'Download PDF' button and storage links.
150
- """
151
- # Look for "Download PDF" button
152
- download_button = soup.find("a", string=re.compile("Download PDF", re.IGNORECASE))
153
- if download_button and download_button.get("href"):
154
- return download_button["href"]
155
-
156
- # Fallback: Search for links containing 'patentimages.storage.googleapis.com'
157
- pdf_links = [link['href'] for link in soup.find_all('a', href=True)
158
- if 'patentimages.storage.googleapis.com' in link['href']]
159
- if pdf_links:
160
- print(f"Found PDF link: {pdf_links[0]}")
161
- return pdf_links[0]
162
-
163
- return None
164
-
165
  def download_and_validate_pdf(self, pdf_link: str, file_path: str):
166
  """
167
  Download the PDF and validate its integrity.
 
13
  from selenium.webdriver.common.keys import Keys
14
  from selenium.webdriver.chrome.service import Service
15
  from selenium.webdriver.chrome.options import Options
 
16
  from selenium.webdriver.common.by import By
17
  from selenium.webdriver.support.ui import WebDriverWait
18
  from selenium.webdriver.support import expected_conditions as EC
 
90
  driver = webdriver.Chrome(service=service, options=chrome_options)
91
 
92
  file_path = os.path.join(output_path, f"{patent}.pdf")
 
93
 
94
  try:
95
  print(f"Navigating to Google Patents...")
 
102
  search_input.send_keys(patent)
103
  search_input.send_keys(Keys.RETURN)
104
 
105
+ print("Waiting for the 'Download PDF' button...")
106
+ pdf_button_xpath = "//a[contains(text(),'Download PDF')]"
107
+ WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, pdf_button_xpath)))
108
+ pdf_link_element = driver.find_element(By.XPATH, pdf_button_xpath)
109
 
110
+ # Extract the PDF link
111
+ pdf_link = pdf_link_element.get_attribute("href")
112
+ print(f"PDF link found: {pdf_link}")
 
 
113
 
114
+ # Download and validate the PDF
115
+ self.download_and_validate_pdf(pdf_link, file_path)
116
 
117
  except Exception as e:
118
  print(f"An error occurred: {e}")
119
+ raise FileNotFoundError("Could not find a valid PDF link.")
120
  finally:
121
  driver.quit()
122
 
 
 
 
123
  def get_pdfs(self, patents: Union[List[str], str], output_path: str = "./", waiting_time: int = 10) -> None:
124
  """
125
  Download multiple patent PDFs.
 
137
  print(f"Downloading patent: {patent}")
138
  self.get_pdf(patent, output_path, waiting_time)
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  def download_and_validate_pdf(self, pdf_link: str, file_path: str):
141
  """
142
  Download the PDF and validate its integrity.