"""Web Page Summarizer.

Scrapes a URL (plain ``requests`` first, Selenium fallback for blocked or
JavaScript-heavy pages) and summarizes the extracted text with GPT-4o-mini,
exposed through a small Gradio UI.
"""

import os

import gradio as gr
import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from openai import OpenAI
from selenium import webdriver  # noqa: F401 -- kept for parity with original module
from selenium.webdriver.chrome.options import Options

# Initialize OpenAI client securely: the key comes from the environment,
# never from source code.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Cap on how much scraped text is sent to the model, to keep the prompt small.
MAX_CONTENT_CHARS = 4000


def _extract_paragraphs(html, empty_message):
    """Extract visible <p> text from *html*.

    Returns the non-empty paragraphs joined by newlines, or *empty_message*
    when the page contains no readable paragraph text.
    """
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(
        p.get_text() for p in paragraphs if p.get_text().strip()
    )
    return text_content if text_content else empty_message


def fetch_with_requests(url):
    """Fetch *url* with ``requests`` using browser-like headers.

    Returns the extracted paragraph text. Raises on 403 (so the caller can
    fall back to Selenium) and on any other HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Bug fix: surface other error statuses (404, 500, ...) instead of
    # silently scraping the error page as if it were real content.
    response.raise_for_status()
    return _extract_paragraphs(response.text, "No readable content found.")


def fetch_with_selenium(url):
    """Fetch *url* with an undetected headless Chrome driver.

    Fallback path for JavaScript-heavy or bot-protected pages.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible window
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Bug fix: always release the browser process, even when driver.get()
        # raises -- the original leaked a Chrome instance on failure.
        driver.quit()
    return _extract_paragraphs(
        html, "No readable content found (even with Selenium)."
    )


def scrape_and_summarize(url):
    """Scrape *url* and return a GPT-4o-mini summary of its content.

    Tries ``requests`` first and falls back to Selenium when the plain
    request fails (e.g. 403 bot blocking). Returns an error string rather
    than raising, so the Gradio UI always gets something to display.
    """
    if not url or not url.strip():
        # Robustness: avoid pointless scrape attempts on an empty input box.
        return "Please enter a URL to summarize."

    try:
        # Attempt the cheap path first.
        text_content = fetch_with_requests(url)
    except Exception:
        # Blocked or failed: retry with a real browser.
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Limit content so the prompt stays well within the model's context.
    text_content = text_content[:MAX_CONTENT_CHARS]

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that summarizes webpage content.",
                },
                {
                    "role": "user",
                    "content": f"Summarize the following webpage content:\n\n{text_content}",
                },
            ],
            response_format={"type": "text"},
            temperature=1,
            max_completion_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
    except Exception as api_error:
        # Robustness: a failed API call previously crashed the Gradio handler.
        return f"Summarization failed: {api_error}"

    return response.choices[0].message.content  # Extract response content


# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")

    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")

    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()