File size: 3,816 Bytes
4290320
 
 
 
b342761
d4f1db3
5892725
d4f1db3
4290320
d4f1db3
b342761
4290320
d4f1db3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5892725
d4f1db3
 
 
5892725
 
 
 
 
 
d4f1db3
 
 
 
 
 
 
 
 
 
4290320
 
b342761
d4f1db3
4290320
 
d4f1db3
 
4290320
d4f1db3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4290320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b342761
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import requests
from bs4 import BeautifulSoup
import gradio as gr
import os
from openai import OpenAI
from selenium import webdriver
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options

# Initialize OpenAI client securely.
# Reads the key from the OPENAI_API_KEY environment variable so no secret is
# hard-coded; NOTE(review): if the variable is unset, api_key is None and the
# first API call in scrape_and_summarize will fail — confirm deployment sets it.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def fetch_with_requests(url):
    """
    Fetch webpage content using ``requests`` with browser-like headers.

    Parameters
    ----------
    url : str
        The page to download.

    Returns
    -------
    str
        Newline-joined text of all non-empty ``<p>`` elements, or a
        "No readable content found." placeholder when none exist.

    Raises
    ------
    Exception
        On a 403 response (explicit signal to switch to Selenium).
    requests.HTTPError
        On any other non-2xx status, so the caller's fallback also
        triggers instead of summarizing an error page.
    requests.RequestException
        On timeouts / connection failures.
    """
    # Browser-like headers reduce the chance of being blocked as a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive"
    }

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Bug fix: previously only 403 triggered the Selenium fallback; any other
    # HTTP error (404, 429, 500, ...) was silently parsed as if it were the
    # real page. Raise so the caller falls back for those statuses too.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    # Keep only paragraphs with visible text.
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found."

def fetch_with_selenium(url):
    """
    Scrape a JavaScript-heavy page with an undetected headless Chrome driver.

    Parameters
    ----------
    url : str
        The page to load and scrape.

    Returns
    -------
    str
        Newline-joined text of all non-empty ``<p>`` elements, or a
        placeholder message when the page has no readable paragraphs.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)

    # Bug fix: quit() must run even when get()/page_source raises, otherwise
    # every failed fetch leaks a headless Chrome process.
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found (even with Selenium)."

def scrape_and_summarize(url):
    """
    Scrape a website URL and summarize its content with GPT-4o-mini.

    Tries the lightweight ``requests`` path first; on any failure it falls
    back to the Selenium-based fetcher. If both fail, an error string is
    returned instead of a summary.

    Parameters
    ----------
    url : str
        The page to scrape and summarize.

    Returns
    -------
    str
        The model-generated summary, or an error message when both
        fetch strategies fail.
    """
    # First attempt: plain HTTP fetch. Any failure (403, timeout, ...)
    # sends us down the Selenium path.
    try:
        text_content = fetch_with_requests(url)
    except Exception:
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Cap the prompt at 4000 characters to keep the summarization focused.
    text_content = text_content[:4000]

    prompt_messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
        {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
    ]

    # Ask GPT-4o-mini for the summary and return its text directly.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=prompt_messages,
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response.choices[0].message.content

# Gradio UI: a single text input wired to scrape_and_summarize, with the
# result shown in a read-only textbox.
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")

    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    # interactive=False makes the summary box display-only.
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")

    # Clicking the button runs the full scrape-and-summarize pipeline.
    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch Gradio App only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()