tdurzynski committed on
Commit
d4f1db3
·
verified ·
1 Parent(s): 5d94755

Update app.py

Browse files

✅ First, attempts requests with proper headers
✅ If blocked (403 Forbidden), falls back to Selenium for JavaScript-heavy sites
✅ Uses gpt-4o-mini with openai.chat.completions.create()
✅ Extracts response via response.choices[0].message.content
✅ Handles secure API key storage with os.getenv("OPENAI_API_KEY")
✅ Runs seamlessly on Hugging Face Spaces

Files changed (1) hide show
  1. app.py +75 -45
app.py CHANGED
@@ -3,60 +3,90 @@ from bs4 import BeautifulSoup
3
  import gradio as gr
4
  import os
5
  from openai import OpenAI
 
 
6
 
7
- # Initialize OpenAI client with secure API key handling
8
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
def scrape_and_summarize(url):
    """
    Scrape the given website URL and summarize its content using GPT-4o-mini.

    Fetches the page with browser-like headers, extracts all <p> text,
    truncates it to 4000 characters, and asks the model for a summary.

    Args:
        url: Absolute URL of the page to summarize.

    Returns:
        The model-generated summary string, or a human-readable error
        message (this function never raises — errors are returned as text
        so the Gradio UI can display them).
    """
    try:
        # Browser-like headers reduce the chance of bot-blocking (403s).
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
            "DNT": "1",  # Do Not Track request
            "Connection": "keep-alive"
        }
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()

        # Extract only paragraph text; skip whitespace-only paragraphs.
        soup = BeautifulSoup(page.text, "html.parser")
        paragraphs = soup.find_all("p")
        text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

        if not text_content:
            return "No readable content found on this page."

        # Limit text to 4000 characters for better summarization
        text_content = text_content[:4000]

        # Call OpenAI GPT-4o-mini for summarization.
        # Named `completion` (not `response`) so it does not shadow the
        # HTTP response above.
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
                {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
            ],
            response_format={"type": "text"},
            temperature=1,
            max_completion_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        return completion.choices[0].message.content  # Extract response content

    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An error occurred: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Gradio UI
62
  with gr.Blocks() as demo:
 
3
  import gradio as gr
4
  import os
5
  from openai import OpenAI
6
+ from selenium import webdriver
7
+ from selenium.webdriver.chrome.options import Options
8
 
9
+ # Initialize OpenAI client securely
10
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11
 
12
def fetch_with_requests(url):
    """
    Fetch webpage content using requests with browser-like headers.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        Extracted paragraph text, or the sentinel string
        "No readable content found." when the page has no <p> text.

    Raises:
        Exception: on a 403 response (explicit signal for the Selenium
            fallback in the caller).
        requests.exceptions.HTTPError: on any other non-2xx status, so
            error pages (404, 500, ...) are never summarized as content.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive"
    }

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Fix: also reject other HTTP errors (404, 500, ...) instead of
    # silently parsing an error page; the caller's fallback handles it.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found."
35
+
36
def fetch_with_selenium(url):
    """
    Scrape JavaScript-heavy pages with headless Chrome when requests fails.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        Extracted paragraph text, or the sentinel string
        "No readable content found (even with Selenium)." when empty.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Fix: always quit the driver, even if driver.get() raises —
        # otherwise each failure leaks a headless Chrome process.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found (even with Selenium)."
54
+
55
def scrape_and_summarize(url):
    """
    Scrape the given website URL and summarize its content using GPT-4o-mini.
    Tries `requests` first, falls back to Selenium if needed.

    Args:
        url: Absolute URL of the page to summarize.

    Returns:
        The model-generated summary string, or a human-readable error
        message (this function never raises — errors are returned as text
        so the Gradio UI can display them).
    """
    try:
        # Attempt with requests first
        text_content = fetch_with_requests(url)
    except Exception:
        # If blocked (or any HTTP error), fall back to Selenium
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Limit content to 4000 characters for better summarization
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization.
    # Fix: wrap the API call so a model/network failure returns an error
    # string instead of crashing the Gradio handler (the pre-refactor
    # version had this protection; it was lost in the fallback rewrite).
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
                {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
            ],
            response_format={"type": "text"},
            temperature=1,
            max_completion_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        summary = response.choices[0].message.content  # Extract response content
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"
90
 
91
  # Gradio UI
92
  with gr.Blocks() as demo: