# NOTE(review): this file was recovered from a Hugging Face Spaces
# "Runtime error" page paste; the original banner text stood here.
# --- Imports (regrouped per PEP 8: stdlib, then third-party) ---------------
import os

import requests
from bs4 import BeautifulSoup
import gradio as gr
from openai import OpenAI
from selenium import webdriver  # NOTE(review): unused below; kept to avoid breaking unseen callers
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options

# Initialize the OpenAI client once at import time, reading the key from the
# environment rather than hard-coding it.
# NOTE(review): if OPENAI_API_KEY is unset this passes None and only fails
# later at request time — consider failing fast here; verify desired behavior.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def fetch_with_requests(url):
    """
    Fetch a webpage with ``requests`` using browser-like headers.

    Args:
        url: The page URL to fetch.

    Returns:
        str: Newline-joined text of all non-empty ``<p>`` tags, or a
        fallback message when no readable paragraphs are found.

    Raises:
        Exception: on a 403 response (explicit message so the caller
            falls back to Selenium).
        requests.HTTPError: on any other non-2xx status.
        requests.RequestException: on network errors/timeouts.
    """
    headers = {
        # Mimic a real browser: many sites 403 the default requests UA.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Fix: the original ignored every other error status (404, 500, ...)
    # and would parse the error page as if it were content. Raise instead,
    # so the caller's Selenium fallback kicks in.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())
    return text_content if text_content else "No readable content found."
def fetch_with_selenium(url):
    """
    Scrape a JavaScript-heavy page with an undetected headless Chrome driver.

    Args:
        url: The page URL to load.

    Returns:
        str: Newline-joined text of all non-empty ``<p>`` tags, or a
        fallback message when no readable paragraphs are found.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Fix: the original skipped quit() when get()/page_source raised,
        # leaking a headless Chrome process per failed request.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())
    return text_content if text_content else "No readable content found (even with Selenium)."
def scrape_and_summarize(url):
    """
    Scrape *url* and summarize its content using GPT-4o-mini.

    Tries a plain ``requests`` fetch first; on any failure (e.g. a 403
    from bot detection) falls back to Selenium. Scraped text is truncated
    to 4000 characters before being sent to the model.

    Args:
        url: Website URL entered by the user.

    Returns:
        str: The model's summary, or an error message when both fetch
        strategies failed.
    """
    try:
        # Attempt the cheap path first.
        text_content = fetch_with_requests(url)
    except Exception:  # fix: exception was bound `as e` but never used
        # If blocked (or any other failure), fall back to Selenium.
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Cap prompt size to keep the request cheap and within context limits.
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
            {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"},
        ],
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content
# ---------------------------------------------------------------------------
# Gradio UI: a URL box, a Summarize button, and a read-only summary box.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")

    url_box = gr.Textbox(label="Website URL", placeholder="https://example.com")
    summary_box = gr.Textbox(label="Summary", interactive=False)
    go_btn = gr.Button("Summarize")

    # Clicking the button runs the scraper/summarizer on the entered URL.
    go_btn.click(scrape_and_summarize, inputs=[url_box], outputs=[summary_box])

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()