Spaces:

usmanyousaf
/

AI-WebScraper-App

Paused

App Files Files Community

usmanyousaf committed on Sep 29, 2024

Commit

135b855

verified ·

1 Parent(s): bce428a

Update scrape.py

Browse files

Files changed (1) hide show

scrape.py +23 -16

scrape.py CHANGED Viewed

@@ -1,41 +1,48 @@
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
-from selenium import webdriver  # type: ignore
-from selenium.webdriver.chrome.service import Service  # type: ignore
-from selenium.webdriver.chrome.options import Options  # type: ignore
-from bs4 import BeautifulSoup  # type: ignore
 import time
-# Define the ChromeDriver path directly
-CHROME_DRIVER_PATH = "./chrome"
 def scrape_website(website, captcha_timeout=60, poll_interval=5):
     """Load *website* in headless Chrome and return the page's HTML source.

     Args:
         website: URL handed to ``driver.get``.
         captcha_timeout: Maximum number of seconds to keep polling while the
             page source contains the text "captcha". The browser runs
             headless, so an unbounded wait could never be ended by a person;
             once the budget is spent the current page is returned as-is.
         poll_interval: Seconds to sleep between polls of the page source.

     Returns:
         The page source string reported by the driver.

     Raises:
         Whatever the WebDriver raises; exceptions are not caught here. The
         driver is always quit before the exception propagates.
     """
     print("Connecting to Chrome Browser...")
     # Setup ChromeDriver service and options
     service = Service(CHROME_DRIVER_PATH)
     options = Options()
     options.add_argument("--headless")  # Run in headless mode for deployment
     driver = webdriver.Chrome(service=service, options=options)
     try:
         driver.get(website)
         print("Waiting for CAPTCHA to be solved manually (if present)...")
         deadline = time.monotonic() + captcha_timeout
         # Poll until the marker text disappears or the time budget runs out.
         while "captcha" in driver.page_source.lower():
             remaining = deadline - time.monotonic()
             if remaining <= 0:
                 print("CAPTCHA still detected after timeout. Scraping page content anyway...")
                 break
             print("CAPTCHA detected, waiting...")
             time.sleep(min(poll_interval, remaining))
         else:
             # Reached only when the loop ended without hitting the timeout.
             print("CAPTCHA solved or not present. Scraping page content...")
         return driver.page_source
     finally:
         driver.quit()
 def extract_body_content(html_content):
     """Return the ``<body>`` element of *html_content* as a string.

     Args:
         html_content: HTML markup to parse, or ``None``.

     Returns:
         ``str(soup.body)`` when the parsed document has a body element;
         otherwise an empty string. ``None`` or empty input also yields an
         empty string without invoking the parser.
     """
     if not html_content:
         return ""  # Nothing to parse
     soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body
     return str(body_content) if body_content else ""

 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from bs4 import BeautifulSoup
 import time
 def scrape_website(website, captcha_timeout=60, poll_interval=5):
     """Load *website* in headless Chrome and return the page's HTML source.

     Args:
         website: URL handed to ``wd.get``.
         captcha_timeout: Maximum number of seconds to keep polling while the
             page source contains the text "captcha". The browser runs
             headless, so an unbounded wait could never be ended by a person;
             once the budget is spent the current page is returned as-is.
         poll_interval: Seconds to sleep between polls of the page source.

     Returns:
         The page source string, or ``None`` when a ``WebDriverException``
         was raised while starting the browser or loading the page.
     """
     print("Connecting to Chrome Browser...")
     # Setup ChromeDriver options
     options = Options()
     options.add_argument("--headless")  # Run in headless mode for deployment
     options.add_argument('--no-sandbox')  # Overcome limited resource problems
     options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems
     # Initialize the driver without a specified service (assumes ChromeDriver is in PATH)
     wd = None
     try:
         wd = webdriver.Chrome(options=options)
         wd.set_window_size(1080, 720)  # Set the window size
         wd.get(website)
         # NOTE(review): implicit waits govern element lookups; this function
         # performs none, so the call is kept only for fidelity -- confirm.
         wd.implicitly_wait(10)
         print("Waiting for CAPTCHA to be solved manually (if present)...")
         deadline = time.monotonic() + captcha_timeout
         # Poll until the marker text disappears or the time budget runs out.
         while "captcha" in wd.page_source.lower():
             remaining = deadline - time.monotonic()
             if remaining <= 0:
                 print("CAPTCHA still detected after timeout. Scraping page content anyway...")
                 break
             print("CAPTCHA detected, waiting...")
             time.sleep(min(poll_interval, remaining))
         else:
             # Reached only when the loop ended without hitting the timeout.
             print("CAPTCHA solved or not present. Scraping page content...")
         html = wd.page_source
         return html
     except WebDriverException as e:
         print(f"WebDriverException occurred: {e}")
         return None  # Callers must handle a missing page
     finally:
         # wd stays None when the browser itself failed to start.
         if wd:
             wd.quit()
 def extract_body_content(html_content):
     """Return the ``<body>`` element of *html_content* as a string.

     Args:
         html_content: HTML markup to parse, or ``None``.

     Returns:
         ``str(soup.body)`` when the parsed document has a body element;
         otherwise an empty string. ``None`` or empty input also yields an
         empty string without invoking the parser.
     """
     if not html_content:
         return ""  # Return empty if there is no content
     soup = BeautifulSoup(html_content, "html.parser")
     body_content = soup.body
     return str(body_content) if body_content else ""