Update app.py

app.py CHANGED
@@ -1,179 +1,74 @@

Removed (old version: LangChain/CrewAI pipeline; regions collapsed in the diff view are shown as `…`):

  import os
  import requests
  from bs4 import BeautifulSoup
  from dotenv import load_dotenv
- …
- from langchain_openai import ChatOpenAI
- import gradio as gr
- from tenacity import retry, stop_after_attempt, wait_exponential
- import logging
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)

  # Load environment variables
  load_dotenv()

- # …
-             return super()._call(*args, **kwargs)
-         except Exception as e:
-             logger.error(f"OpenAI API Error: {str(e)}")
-             if "Incorrect API key" in str(e):
-                 raise ValueError("Invalid OpenAI API key configuration")
-             raise ConnectionError("OpenAI service unavailable. Please try again later.")
-
- try:
-     llm = SafeChatOpenAI(
-         model="gpt-3.5-turbo",
-         temperature=0.5,  # More deterministic output
-         request_timeout=60,
-         max_retries=2
-     )
- except Exception as e:
-     logger.critical(f"LLM initialization failed: {str(e)}")
-     raise RuntimeError("Failed to initialize AI services")

- …
-         # Define agents
-         self.scraper_agent = Agent(
-             role='Senior Web Scraper',
-             goal='Extract clean content from any webpage',
-             backstory="""Expert in extracting information from complex websites,
-             adept at handling various structures and formats.""",
-             verbose=False,
-             llm=llm
-         )
-
-         self.analyst_agent = Agent(
-             role='Content Analyst',
-             goal='Provide clear, concise summaries',
-             backstory="""Specializes in analyzing and summarizing web content
-             into key points and actionable insights.""",
-             verbose=False,
-             llm=llm
-         )
-     except Exception as e:
-         logger.error(f"Agent creation failed: {str(e)}")
-         raise

- …
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.5'
-         }
-
-         response = requests.get(url, headers=headers, timeout=20)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-
-         # Remove unwanted elements
-         for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
-             element.decompose()
-
-         # Get clean text
-         text = soup.get_text(separator='\n', strip=True)
-         return text[:3000]  # Limit to avoid token limits
-     except Exception as e:
-         logger.warning(f"Failed to scrape {url}: {str(e)}")
-         raise ConnectionError(f"Couldn't access this website. Error: {str(e)}")

- …
-         )
- …
-             verbose=False,
-             process=Process.sequential
-         )
- …
-         raise RuntimeError(f"Analysis error: {str(e)}")
- …
-     try:
-         # Step 1: Scrape
-         content = scraper.scrape_website(url)
-         # Step 2: Analyze
-         return scraper.analyze_content(content)
-     except Exception as e:
-         return f"❌ Error: {str(e)}"
-
- …
-     gr.Markdown("""
-     # 🌐 AI-Powered Web Scraper
-     *Extract and summarize content from any website*
-     """)
-
-     with gr.Row():
-         url_input = gr.Textbox(
-             label="Enter Website URL",
-             placeholder="https://example.com",
-             max_lines=1
-         )
-         submit_btn = gr.Button("Analyze", variant="primary")
-
-     output = gr.Markdown(
-         label="Analysis Results",
-         elem_classes=["output-box"]
-     )
-
-     submit_btn.click(
-         fn=process_url,
-         inputs=url_input,
-         outputs=output
-     )
-
-     gr.Examples(
-         examples=[
-             ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-             ["https://www.nasa.gov/about/index.html"],
-             ["https://www.w3schools.com/python/"]
-         ],
-         inputs=url_input,
-         label="Try these examples"
-     )
-
-     return app

  if __name__ == "__main__":
- …
-     app.launch(
-         server_name="0.0.0.0",
-         server_port=7860
-     )

Added (new version: plain OpenAI SDK, no agent framework):

+ # web_summarizer_app.py
+
  import os
+ import gradio as gr
  import requests
  from bs4 import BeautifulSoup
  from dotenv import load_dotenv
+ import openai

  # Load environment variables
  load_dotenv()
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")

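Note: the rewrite drops langchain_openai, tenacity, logging, and the agent framework, so the Space's requirements.txt can shrink to the five imports above. The actual file isn't part of this diff; a plausible minimal set (the openai<1.0 pin is an assumption, needed by the legacy ChatCompletion call below):

    gradio
    requests
    beautifulsoup4
    python-dotenv
    openai<1.0
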
+ # 🌐 Web Scraper
+ def scrape_text_from_url(url):
+     try:
+         response = requests.get(url, timeout=10)
+         soup = BeautifulSoup(response.content, "html.parser")
+
+         # Remove scripts and style
+         for tag in soup(["script", "style"]):
+             tag.decompose()
+
+         # Extract visible text
+         text = " ".join(chunk.strip() for chunk in soup.stripped_strings)
+         return text[:5000]  # limit to avoid token overflow
+     except Exception as e:
+         return f"❌ Error scraping the page: {str(e)}"

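Note: unlike the old scraper, requests.get(url, timeout=10) here sends the default python-requests User-Agent and never calls raise_for_status(), so some sites will block it and HTTP error pages get scraped silently. If that bites, a small hardening sketch reusing the old version's header values:

    # Optional hardening (header values come from the removed code above).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.5",
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # turns 4xx/5xx into exceptions the except branch reports
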
+ # 🧠 LLM Summarizer
+ def summarize_with_gpt(text):
+     try:
+         response = openai.ChatCompletion.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "You are a helpful assistant that summarizes articles."},
+                 {"role": "user", "content": f"Please summarize the following content:\n\n{text}"}
+             ],
+             temperature=0.7,
+             max_tokens=500
+         )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         return f"❌ Error from OpenAI: {str(e)}"

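Note: openai.ChatCompletion.create is the legacy (pre-1.0) interface of the openai package; with openai>=1.0 installed this call fails with a removed-API error. The diff doesn't show the Space's pins, so if the dependency is ever unpinned, a v1-style equivalent would look roughly like this (a sketch assuming openai>=1.0, not part of this commit):

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def summarize_with_gpt(text):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes articles."},
                    {"role": "user", "content": f"Please summarize the following content:\n\n{text}"}
                ],
                temperature=0.7,
                max_tokens=500
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"❌ Error from OpenAI: {str(e)}"
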
+ # 🔗 Combined Function
+ def scrape_and_summarize(url):
+     raw_text = scrape_text_from_url(url)
+     if "❌" in raw_text:
+         return raw_text, ""
+     summary = summarize_with_gpt(raw_text)
+     return raw_text, summary

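Note: scrape_and_summarize detects scrape failures by searching the returned text for "❌", so any page whose own content contains that character is misreported as an error. A hypothetical rework (not in this commit) that signals errors explicitly:

    # Sketch: return a (text, error) pair instead of the "❌" substring sentinel.
    def scrape_text_from_url(url):
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.content, "html.parser")
            for tag in soup(["script", "style"]):
                tag.decompose()
            return " ".join(soup.stripped_strings)[:5000], None
        except Exception as e:
            return None, f"❌ Error scraping the page: {str(e)}"

    def scrape_and_summarize(url):
        raw_text, err = scrape_text_from_url(url)
        if err:
            return err, ""
        return raw_text, summarize_with_gpt(raw_text)
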
+ # 🎨 Gradio UI
+ with gr.Blocks(title="🌐 Web Summarizer with AI") as demo:
+     gr.Markdown("## 🧠🌐 Web Article Summarizer")
+     gr.Markdown("Enter a webpage URL below. The AI will scrape and summarize the content.")
+
+     with gr.Row():
+         url_input = gr.Textbox(label="🔗 Enter URL", placeholder="https://example.com", scale=4)
+         btn = gr.Button("Summarize", variant="primary")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             raw_output = gr.Textbox(label="📄 Raw Scraped Text", lines=15, interactive=False)
+         with gr.Column(scale=1):
+             summary_output = gr.Textbox(label="📝 AI Summary", lines=15, interactive=False)
+
+     btn.click(scrape_and_summarize, inputs=[url_input], outputs=[raw_output, summary_output])

+ # 🚀 Launch app
  if __name__ == "__main__":
+     demo.launch()
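
Note: the old version launched with an explicit bind (server_name="0.0.0.0", server_port=7860). On a Gradio-SDK Space the platform supplies the host and port, so the bare demo.launch() should behave the same there; for other deployments (e.g. a Docker Space or a plain container), the explicit form from the removed code would likely need to return:

    demo.launch(server_name="0.0.0.0", server_port=7860)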