Spaces:

bala00712200502
/

webscraping

Sleeping

App Files Files Community

bala00712200502 committed on Apr 15, 2025

Commit

255ebde

verified ·

1 Parent(s): 33cc970

Upload 2 files

Browse files

Files changed (2) hide show

requirements.txt +30 -0
webscrapagent.py +145 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+# Each package is listed once; where the same package appeared both pinned
+# and unpinned, the pinned specifier is the one that is kept.
+streamlit==1.32.2
+gradio
+pydantic
+crewai==0.22.2
+langchain
+langchain-community
+langchainhub
+python-dotenv
+# NOTE(review): `grok` and `groqcloud` look like mistaken names for `groq`
+# (listed further down) - confirm, and remove them if nothing imports them.
+grok
+groqcloud
+beautifulsoup4
+requests
+pandas
+openai
+chromadb==0.4.24
+groq
+langchain-groq
+litellm
+yfinance
+plotly
+plotly-express
+langchain-openai

webscrapagent.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import os
+from dotenv import load_dotenv
+from crewai import Agent, Task, Crew, Process
+from langchain_openai import ChatOpenAI
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+# Load environment variables
+load_dotenv()
+# Set up OpenAI API key
+os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+# Initialize LLM
+llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
+class WebScraper:
+    def __init__(self):
+        # Define agents
+        self.scraper_agent = Agent(
+            role='Senior Web Scraper',
+            goal='Extract content from web pages accurately and efficiently',
+            backstory="""You are an expert web scraper with years of experience in extracting
+            information from various websites. You know how to handle different website structures
+            and can adapt to different content formats.""",
+            verbose=True,  # Changed to boolean
+            allow_delegation=False,
+            llm=llm
+        )
+        self.analyst_agent = Agent(
+            role='Content Analyst',
+            goal='Analyze and summarize scraped content effectively',
+            backstory="""You are a skilled content analyst who can take raw scraped data and
+            transform it into meaningful, organized information. You excel at summarizing,
+            categorizing, and extracting key insights from web content.""",
+            verbose=True,  # Changed to boolean
+            allow_delegation=False,
+            llm=llm
+        )
+    def scrape_website(self, url):
+        """Basic web scraping function"""
+        try:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
+                element.decompose()
+            # Get text content
+            text = soup.get_text(separator='\n', strip=True)
+            return {
+                'status': 'success',
+                'url': url,
+                'content': text[:10000]  # Limit to first 10k characters to avoid token limits
+            }
+        except Exception as e:
+            return {
+                'status': 'error',
+                'url': url,
+                'error': str(e)
+            }
+    def analyze_content(self, content):
+        """Process the scraped content with CrewAI"""
+        # Define tasks
+        scrape_task = Task(
+            description=f"Extract and clean the content from the provided web page data.",
+            expected_output="A clean, well-formatted text containing the main content of the web page if posible give in table formet.",
+            agent=self.scraper_agent
+        )
+        analyze_task = Task(
+            description="Analyze the scraped content and provide a comprehensive summary and key points.",
+            expected_output="A detailed summary of the content with bullet points of key information.",
+            agent=self.analyst_agent
+        )
+        # Create crew
+        crew = Crew(
+            agents=[self.scraper_agent, self.analyst_agent],
+            tasks=[scrape_task, analyze_task],
+            verbose=True,  # Changed from 2 to True
+            process=Process.sequential
+        )
+        # Execute tasks
+        result = crew.kickoff(inputs={'content': content})
+        return result
+def process_url(url):
+    """Process URL through scraping and analysis"""
+    scraper = WebScraper()
+    # Step 1: Scrape the website
+    scraped_data = scraper.scrape_website(url)
+    if scraped_data['status'] == 'error':
+        return f"Error scraping website: {scraped_data['error']}"
+    # Step 2: Analyze content
+    analysis_result = scraper.analyze_content(scraped_data['content'])
+    return analysis_result
+# Gradio Interface
+def create_gradio_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Web Scraping Agent with CrewAI")
+        gr.Markdown("Enter a URL to scrape and analyze its content")
+        with gr.Row():
+            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
+            submit_btn = gr.Button("Scrape & Analyze")
+        output = gr.Textbox(label="Analysis Results", lines=20, interactive=False)
+        submit_btn.click(
+            fn=process_url,
+            inputs=url_input,
+            outputs=output
+        )
+        gr.Examples(
+            examples=[
+                ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+                ["https://www.nytimes.com"],
+                ["https://www.bbc.com/news/technology"]
+            ],
+            inputs=url_input
+        )
+    return demo
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    demo = create_gradio_interface()  # build the Blocks UI
+    demo.launch()  # start serving the interface