Spaces:

usmanyousaf
/

AI-WebScraper-App

Paused

App Files Files Community

usmanyousaf committed on Sep 29, 2024

Commit

2a0bf8d

verified ·

1 Parent(s): 8d0e46c

Update scrape.py

Browse files

Files changed (1) hide show

scrape.py +34 -14

scrape.py CHANGED Viewed

@@ -1,21 +1,43 @@
-import requests
-from bs4 import BeautifulSoup
-from dotenv import load_dotenv
 import os
 load_dotenv()
 def scrape_website(website):
-    print(f"Fetching website content from {website}...")
-    headers = {"User-Agent": "Mozilla/5.0"}
-    response = requests.get(website, headers=headers)
-    if response.status_code == 200:
-        print("Website content fetched successfully!")
-        return response.text
-    else:
-        print(f"Failed to fetch website: Status code {response.status_code}")
-        return ""
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
@@ -27,11 +49,9 @@ def extract_body_content(html_content):
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
-    # Remove script and style elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
-    # Extract text and clean up the content
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()

+from selenium import webdriver  # type: ignore
+from selenium.webdriver.chrome.service import Service  # type: ignore
+from selenium.webdriver.chrome.options import Options  # type: ignore
+from bs4 import BeautifulSoup  # type: ignore
+from dotenv import load_dotenv  # type: ignore
 import os
+import time
+# Load environment variables
 load_dotenv()
+CHROME_DRIVER_PATH = os.getenv("CHROME_DRIVER_PATH", "./chrome")
 def scrape_website(website):
+    print("Connecting to Chrome Browser...")
+    # Setup ChromeDriver service and options for headless scraping
+    service = Service(CHROME_DRIVER_PATH)
+    options = Options()
+    options.add_argument("--headless")  # Run Chrome in headless mode
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
+    driver = webdriver.Chrome(service=service, options=options)
+    try:
+        driver.get(website)
+        print("Waiting for CAPTCHA to be solved manually (if present)...")
+        # Optional waiting loop for manual CAPTCHA solving
+        while "captcha" in driver.page_source.lower():
+            print("CAPTCHA detected, waiting...")
+            time.sleep(5)
+        print("CAPTCHA solved or not present. Scraping page content...")
+        html = driver.page_source
+        return html
+    finally:
+        driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()