File size: 3,816 Bytes
4290320
 
 
 
b342761
d4f1db3
5892725
d4f1db3
4290320
d4f1db3
b342761
4290320
d4f1db3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5892725
d4f1db3
 
 
5892725
 
 
 
 
 
d4f1db3
 
 
 
 
 
 
 
 
 
4290320
 
b342761
d4f1db3
4290320
 
d4f1db3
 
4290320
d4f1db3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4290320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b342761
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import requests
from bs4 import BeautifulSoup
import gradio as gr
import os
from openai import OpenAI
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options

# Initialize OpenAI client securely.
# Reads the key from the OPENAI_API_KEY environment variable so no secret is
# hard-coded; NOTE(review): if the variable is unset, api_key is None and the
# first API call in scrape_and_summarize will fail — confirm deployment sets it.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def fetch_with_requests(url):
    """
    Fetch webpage content using ``requests`` with browser-like headers.

    Parameters
    ----------
    url : str
        The page to download.

    Returns
    -------
    str
        Newline-joined text of all non-empty ``<p>`` elements, or a
        "No readable content found." placeholder when none exist.

    Raises
    ------
    Exception
        On a 403 response (explicit signal to switch to Selenium).
    requests.HTTPError
        On any other non-2xx status, so the caller's fallback also
        triggers instead of summarizing an error page.
    requests.RequestException
        On timeouts / connection failures.
    """
    # Browser-like headers reduce the chance of being blocked as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive"
    }

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Bug fix: previously only 403 triggered the Selenium fallback; any other
    # HTTP error (404, 429, 500, ...) was silently parsed as if it were the
    # real page. Raise so the caller falls back for those statuses too.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    # Keep only paragraphs with visible text.
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found."

def fetch_with_selenium(url):
    """
    Scrape a JavaScript-heavy page with an undetected headless Chrome driver.

    Parameters
    ----------
    url : str
        The page to load and scrape.

    Returns
    -------
    str
        Newline-joined text of all non-empty ``<p>`` elements, or a
        placeholder message when the page has no readable paragraphs.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)

    # Bug fix: quit() must run even when get()/page_source raises, otherwise
    # every failed fetch leaks a headless Chrome process.
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found (even with Selenium)."

def scrape_and_summarize(url):
    """
    Scrape a website URL and summarize its content with GPT-4o-mini.

    Tries the lightweight ``requests`` path first; on any failure it falls
    back to the Selenium-based fetcher. If both fail, an error string is
    returned instead of a summary.

    Parameters
    ----------
    url : str
        The page to scrape and summarize.

    Returns
    -------
    str
        The model-generated summary, or an error message when both
        fetch strategies fail.
    """
    # First attempt: plain HTTP fetch. Any failure (403, timeout, ...)
    # sends us down the Selenium path.
    try:
        text_content = fetch_with_requests(url)
    except Exception:
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Cap the prompt at 4000 characters to keep the summarization focused.
    text_content = text_content[:4000]

    prompt_messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
        {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
    ]

    # Ask GPT-4o-mini for the summary and return its text directly.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=prompt_messages,
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response.choices[0].message.content

# Gradio UI: a single text input wired to scrape_and_summarize, with the
# result shown in a read-only textbox.
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")

    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    # interactive=False makes the summary box display-only.
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")

    # Clicking the button runs the full scrape-and-summarize pipeline.
    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch Gradio App only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()