Spaces:

usmanyousaf
/

AI-WebScraper-App

Paused

App Files Files Community

usmanyousaf committed on Sep 29, 2024

Commit

6a10786

verified ·

1 Parent(s): 54cd769

Update scrape.py

Browse files

Files changed (1) hide show

scrape.py +17 -35

scrape.py CHANGED Viewed

@@ -1,38 +1,21 @@
-from selenium import webdriver
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
-import time
-# No need for explicit CHROME_DRIVER_PATH or .env usage, WebDriverManager handles it.
-options = Options()
-options.add_argument("--headless")
-options.add_argument("--no-sandbox")
-options.add_argument("--disable-dev-shm-usage")
-# Use WebDriverManager to automatically download and install the correct version
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
 def scrape_website(website):
-    print("Connecting to Chrome Browser...")
-    try:
-        driver.get(website)
-        print("Waiting for CAPTCHA to be solved manually (if present)...")
-        # Optional waiting loop for manual CAPTCHA solving
-        while "captcha" in driver.page_source.lower():
-            print("CAPTCHA detected, waiting...")
-            time.sleep(5)
-        print("CAPTCHA solved or not present. Scraping page content...")
-        html = driver.page_source
-        return html
-    finally:
-        driver.quit()
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
@@ -44,11 +27,11 @@ def extract_body_content(html_content):
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
-    # Remove all <script> and <style> elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
-    # Extract and clean text
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -57,7 +40,6 @@ def clean_body_content(body_content):
     return cleaned_content
 def split_dom_content(dom_content, max_length=6000):
-    # Split the content into chunks of max_length characters
     return [
         dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
     ]

+import requests
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+import os
+load_dotenv()
 def scrape_website(website, timeout=10):
     """Fetch the raw HTML of ``website`` with a plain HTTP GET.

     Args:
         website: URL to request.
         timeout: Seconds to wait for the server before giving up.

     Returns:
         The response body as text when the server answers with status 200;
         otherwise an empty string (non-200 status, timeout, or any other
         request-level failure).
     """
     print(f"Fetching website content from {website}...")
     headers = {"User-Agent": "Mozilla/5.0"}

     try:
         # requests applies no timeout by default, so a stalled server would
         # otherwise block the caller forever.
         response = requests.get(website, headers=headers, timeout=timeout)
     except requests.RequestException as exc:
         # Report network-level failures the same way as a bad status code:
         # print a message and hand back an empty string.
         print(f"Failed to fetch website: {exc}")
         return ""

     if response.status_code != 200:
         print(f"Failed to fetch website: Status code {response.status_code}")
         return ""

     print("Website content fetched successfully!")
     return response.text
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
+    # Remove script and style elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
+    # Extract text and clean up the content
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
     return cleaned_content
 def split_dom_content(dom_content, max_length=6000):
     """Cut ``dom_content`` into consecutive slices of at most ``max_length`` items.

     Returns a list of slices in their original order; the final slice holds
     whatever remains and may be shorter than ``max_length``.
     """
     offsets = range(0, len(dom_content), max_length)
     return [dom_content[start : start + max_length] for start in offsets]