Spaces:

usmanyousaf
/

AI-WebScraper-App

Paused

App Files Files Community

usmanyousaf committed on Sep 29, 2024

Commit

59e7319

verified ·

1 Parent(s): de916e4

Update scrape.py

Browse files

Files changed (1) hide show

scrape.py +14 -11

scrape.py CHANGED Viewed

@@ -1,23 +1,23 @@
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
-from dotenv import load_dotenv
-import os
 import time
-load_dotenv()
-CHROME_DRIVER_PATH = os.getenv("./chromedriver")
 def scrape_website(website):
     print("Connecting to Chrome Browser...")
-    # Setup ChromeDriver service and options
-    service = Service(CHROME_DRIVER_PATH)
-    options = Options()
-    driver = webdriver.Chrome(service=service, options=options)
     try:
         driver.get(website)
         print("Waiting for CAPTCHA to be solved manually (if present)...")
@@ -44,9 +44,11 @@ def extract_body_content(html_content):
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -55,6 +57,7 @@ def clean_body_content(body_content):
     return cleaned_content
 def split_dom_content(dom_content, max_length=6000):
     return [
         dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
     ]

 from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
 import time
+# No need for explicit CHROME_DRIVER_PATH or .env usage, WebDriverManager handles it.
+options = Options()
+options.add_argument("--headless")
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
+# Use WebDriverManager to automatically download and install the correct version
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
 def scrape_website(website):
     print("Connecting to Chrome Browser...")
     try:
         driver.get(website)
         print("Waiting for CAPTCHA to be solved manually (if present)...")
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
+    # Remove all <script> and <style> elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
+    # Extract and clean text
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
     return cleaned_content
 def split_dom_content(dom_content, max_length=6000):
+    # Split the content into chunks of max_length characters
     return [
         dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
     ]