Spaces:

bala00712200502
/

webscraping

Sleeping

App Files Files Community

bala00712200502 committed on Apr 15, 2025

Commit

a29b87f

verified ·

1 Parent(s): 542ebc2

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -72

app.py CHANGED Viewed

@@ -1,51 +1,50 @@
 import os
 from dotenv import load_dotenv
 from crewai import Agent, Task, Crew, Process
 from langchain_openai import ChatOpenAI
 import gradio as gr
-import requests
-from bs4 import BeautifulSoup
 # Load environment variables
 load_dotenv()
-# Set up OpenAI API key
-os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
-# Initialize LLM
-llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
 class WebScraper:
     def __init__(self):
-        # Define agents
         self.scraper_agent = Agent(
-            role='Senior Web Scraper',
-            goal='Extract content from web pages accurately and efficiently',
-            backstory="""You are an expert web scraper with years of experience in extracting
-            information from various websites. You know how to handle different website structures
-            and can adapt to different content formats.""",
-            verbose=True,  # Changed to boolean
             allow_delegation=False,
             llm=llm
         )
         self.analyst_agent = Agent(
             role='Content Analyst',
-            goal='Analyze and summarize scraped content effectively',
-            backstory="""You are a skilled content analyst who can take raw scraped data and
-            transform it into meaningful, organized information. You excel at summarizing,
-            categorizing, and extracting key insights from web content.""",
-            verbose=True,  # Changed to boolean
             allow_delegation=False,
             llm=llm
         )
     def scrape_website(self, url):
-        """Basic web scraping function"""
         try:
             headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
-            response = requests.get(url, headers=headers)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')
@@ -54,92 +53,80 @@ class WebScraper:
             for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
                 element.decompose()
-            # Get text content
             text = soup.get_text(separator='\n', strip=True)
             return {
                 'status': 'success',
                 'url': url,
-                'content': text[:10000]  # Limit to first 10k characters to avoid token limits
             }
         except Exception as e:
             return {
                 'status': 'error',
                 'url': url,
-                'error': str(e)
             }
     def analyze_content(self, content):
-        """Process the scraped content with CrewAI"""
-        # Define tasks
         scrape_task = Task(
-            description=f"Extract and clean the content from the provided web page data.",
-            expected_output="A clean, well-formatted text containing the main content of the web page if posible give in table formet.",
             agent=self.scraper_agent
         )
         analyze_task = Task(
-            description="Analyze the scraped content and provide a comprehensive summary and key points.",
-            expected_output="A detailed summary of the content with bullet points of key information.",
             agent=self.analyst_agent
         )
-        # Create crew
         crew = Crew(
             agents=[self.scraper_agent, self.analyst_agent],
             tasks=[scrape_task, analyze_task],
-            verbose=True,  # Changed from 2 to True
             process=Process.sequential
         )
-        # Execute tasks
-        result = crew.kickoff(inputs={'content': content})
-        return result
 def process_url(url):
-    """Process URL through scraping and analysis"""
     scraper = WebScraper()
-    # Step 1: Scrape the website
     scraped_data = scraper.scrape_website(url)
     if scraped_data['status'] == 'error':
-        return f"Error scraping website: {scraped_data['error']}"
-    # Step 2: Analyze content
-    analysis_result = scraper.analyze_content(scraped_data['content'])
-    return analysis_result
-# Gradio Interface
-def create_gradio_interface():
-    with gr.Blocks() as demo:
-        gr.Markdown("# Web Scraping Agent with CrewAI")
-        gr.Markdown("Enter a URL to scrape and analyze its content")
-        with gr.Row():
-            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
-            submit_btn = gr.Button("Scrape & Analyze")
-        output = gr.Textbox(label="Analysis Results", lines=20, interactive=False)
-        submit_btn.click(
-            fn=process_url,
-            inputs=url_input,
-            outputs=output
-        )
-        gr.Examples(
-            examples=[
-                ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-                ["https://www.nytimes.com"],
-                ["https://www.bbc.com/news/technology"]
-            ],
-            inputs=url_input
-        )
-    return demo
 if __name__ == "__main__":
-    demo = create_gradio_interface()
     demo.launch()

 import os
+import requests
+from bs4 import BeautifulSoup
+from functools import lru_cache
 from dotenv import load_dotenv
 from crewai import Agent, Task, Crew, Process
 from langchain_openai import ChatOpenAI
 import gradio as gr
 # Load environment variables (e.g. OPENAI_API_KEY) from a local .env file, if present.
 load_dotenv()
 # Shared chat model used by every agent defined in this module.
 # The model name can be overridden with the OPENAI_MODEL_NAME environment
 # variable; when it is unset the previous hard-coded default is used, so
 # existing deployments behave exactly as before.
 # request_timeout bounds each OpenAI request (seconds) so a stalled call
 # cannot hang the app indefinitely.
 llm = ChatOpenAI(
     model=os.getenv("OPENAI_MODEL_NAME", "gpt-3.5-turbo"),
     temperature=0.7,
     request_timeout=60,
 )
 class WebScraper:
     """Scrape a web page and summarize it with a two-agent CrewAI pipeline.

     One agent cleans the scraped text and a second agent summarizes it. Both
     agents use the module-level ``llm``.
     """

     # Upper bound on the scraped text handed to the LLM (keeps prompts small).
     MAX_CONTENT_CHARS = 5000
     # Seconds to wait for the target site before giving up.
     REQUEST_TIMEOUT = 15

     def __init__(self):
         """Create the scraper and analyst agents."""
         self.scraper_agent = Agent(
             role='Web Scraper',
             goal='Extract clean content from web pages',
             backstory="Expert in extracting information from websites.",
             verbose=False,  # Disable verbose in production
             allow_delegation=False,
             llm=llm
         )
         self.analyst_agent = Agent(
             role='Content Analyst',
             goal='Summarize scraped content',
             backstory="Skilled at analyzing and summarizing content.",
             verbose=False,
             allow_delegation=False,
             llm=llm
         )

     def scrape_website(self, url):
         """Download ``url`` and return its visible text.

         Args:
             url: Address of the page to fetch.

         Returns:
             dict: ``{'status': 'success', 'url': url, 'content': text}`` with
             the text truncated to ``MAX_CONTENT_CHARS`` characters, or
             ``{'status': 'error', 'url': url, 'error': message}`` when the
             request fails or the page yields no text.
         """
         try:
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
             }
             response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
             response.raise_for_status()
             # When the server declares no charset, requests decodes text/*
             # responses as ISO-8859-1, which garbles UTF-8 pages. Fall back
             # to the encoding detected from the body in that case.
             if 'charset' not in response.headers.get('Content-Type', '').lower():
                 response.encoding = response.apparent_encoding
             soup = BeautifulSoup(response.text, 'html.parser')
             # Drop elements that carry no readable page content.
             for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
                 element.decompose()
             text = soup.get_text(separator='\n', strip=True)
         except Exception as e:
             # Deliberately broad: this is the boundary between untrusted
             # network input and the UI, so every failure is reported as an
             # error result instead of crashing the request handler.
             return {
                 'status': 'error',
                 'url': url,
                 'error': f"Scraping failed: {e}"
             }
         if not text:
             # Nothing to analyze; report it rather than sending an empty
             # prompt to the LLM.
             return {
                 'status': 'error',
                 'url': url,
                 'error': "Scraping failed: no text content found on the page"
             }
         return {
             'status': 'success',
             'url': url,
             'content': text[:self.MAX_CONTENT_CHARS]  # Smaller limit for Spaces
         }

     def analyze_content(self, content):
         """Run the clean-then-summarize crew on ``content``.

         Args:
             content: Scraped page text to clean and summarize.

         Returns:
             The value returned by ``crew.kickoff`` on success, or an
             ``"Analysis failed: ..."`` string if the crew raises.
         """
         scrape_task = Task(
             # The {content} placeholder is filled in from the kickoff inputs.
             # Without it the scraped text is never shown to the agents.
             description=(
                 "Extract and clean the web page content.\n\n"
                 "Web page content:\n{content}"
             ),
             expected_output="Clean text content in markdown format.",
             agent=self.scraper_agent
         )
         # Runs after scrape_task in the sequential crew below.
         analyze_task = Task(
             description="Summarize the content with key points.",
             expected_output="Bullet point summary with main ideas.",
             agent=self.analyst_agent
         )
         crew = Crew(
             agents=[self.scraper_agent, self.analyst_agent],
             tasks=[scrape_task, analyze_task],
             verbose=False,
             process=Process.sequential
         )
         try:
             return crew.kickoff(inputs={'content': content})
         except Exception as e:
             # LLM/network failures are reported to the caller as text.
             return f"Analysis failed: {e}"
+@lru_cache(maxsize=32)
 def process_url(url):
     scraper = WebScraper()
     scraped_data = scraper.scrape_website(url)
     if scraped_data['status'] == 'error':
+        return f"Error: {scraped_data['error']}"
+    return scraper.analyze_content(scraped_data['content'])
+with gr.Blocks() as demo:
+    gr.Markdown("# Web Scraping Agent")
+    gr.Markdown("Enter a URL to analyze (simple informational sites work best)")
+    with gr.Row():
+        url_input = gr.Textbox(label="URL", placeholder="https://example.com")
+        submit_btn = gr.Button("Analyze")
+    output = gr.Markdown(label="Results")
+    submit_btn.click(
+        fn=process_url,
+        inputs=url_input,
+        outputs=output
+    )
+    gr.Examples(
+        examples=[
+            ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+            ["https://www.bbc.com/news/technology-68639847"],
+            ["https://www.nasa.gov/about/index.html"]
+        ],
+        inputs=url_input
+    )
 if __name__ == "__main__":
     demo.launch()