bala00712200502 committed on
Commit
255ebde
·
verified ·
1 Parent(s): 33cc970

Upload 2 files

Files changed (2)
  1. requirements.txt +30 -0
  2. webscrapagent.py +145 -0
requirements.txt ADDED
@@ -0,0 +1,30 @@
+streamlit
+gradio
+pydantic
+crewai
+langchain
+langchain-community
+langchainhub
+python-dotenv
+groq
+groqcloud
+
+beautifulsoup4
+requests
+pandas
+openai
+chromadb
+streamlit==1.32.2
+crewai==0.22.2
+chromadb==0.4.24
+langchain
+groq
+langchain-groq
+litellm
+yfinance
+pandas
+plotly
+yfinance
+plotly-express
+requests
+langchain-openai
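
As a setup note (not part of the commit), these dependencies would typically be installed from the repository root with pip install -r requirements.txt; the commit itself does not pin a Python version.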
webscrapagent.py ADDED
@@ -0,0 +1,145 @@
+import os
+from dotenv import load_dotenv
+from crewai import Agent, Task, Crew, Process
+from langchain_openai import ChatOpenAI
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+
+# Load environment variables
+load_dotenv()
+
+# Set up OpenAI API key
+os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+
+# Initialize LLM
+llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
+
+class WebScraper:
+    def __init__(self):
+        # Define agents
+        self.scraper_agent = Agent(
+            role='Senior Web Scraper',
+            goal='Extract content from web pages accurately and efficiently',
+            backstory="""You are an expert web scraper with years of experience in extracting
+            information from various websites. You know how to handle different website structures
+            and can adapt to different content formats.""",
+            verbose=True,  # Changed to boolean
+            allow_delegation=False,
+            llm=llm
+        )
+
+        self.analyst_agent = Agent(
+            role='Content Analyst',
+            goal='Analyze and summarize scraped content effectively',
+            backstory="""You are a skilled content analyst who can take raw scraped data and
+            transform it into meaningful, organized information. You excel at summarizing,
+            categorizing, and extracting key insights from web content.""",
+            verbose=True,  # Changed to boolean
+            allow_delegation=False,
+            llm=llm
+        )
+
+    def scrape_website(self, url):
+        """Basic web scraping function"""
+        try:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+            response = requests.get(url, headers=headers)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
+                element.decompose()
+
+            # Get text content
+            text = soup.get_text(separator='\n', strip=True)
+
+            return {
+                'status': 'success',
+                'url': url,
+                'content': text[:10000]  # Limit to first 10k characters to avoid token limits
+            }
+        except Exception as e:
+            return {
+                'status': 'error',
+                'url': url,
+                'error': str(e)
+            }
+
+    def analyze_content(self, content):
+        """Process the scraped content with CrewAI"""
+        # Define tasks; {content} is interpolated from crew.kickoff(inputs=...)
+        scrape_task = Task(
+            description="Extract and clean the content from the provided web page data: {content}",
+            expected_output="A clean, well-formatted text containing the main content of the web page, in table format where possible.",
+            agent=self.scraper_agent
+        )
+
+        analyze_task = Task(
+            description="Analyze the scraped content and provide a comprehensive summary and key points.",
+            expected_output="A detailed summary of the content with bullet points of key information.",
+            agent=self.analyst_agent
+        )
+
+        # Create crew
+        crew = Crew(
+            agents=[self.scraper_agent, self.analyst_agent],
+            tasks=[scrape_task, analyze_task],
+            verbose=True,  # Changed from 2 to True
+            process=Process.sequential
+        )
+
+        # Execute tasks
+        result = crew.kickoff(inputs={'content': content})
+        return result
+
+def process_url(url):
+    """Process URL through scraping and analysis"""
+    scraper = WebScraper()
+
+    # Step 1: Scrape the website
+    scraped_data = scraper.scrape_website(url)
+
+    if scraped_data['status'] == 'error':
+        return f"Error scraping website: {scraped_data['error']}"
+
+    # Step 2: Analyze content
+    analysis_result = scraper.analyze_content(scraped_data['content'])
+
+    return analysis_result
+
+# Gradio Interface
+def create_gradio_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Web Scraping Agent with CrewAI")
+        gr.Markdown("Enter a URL to scrape and analyze its content")
+
+        with gr.Row():
+            url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
+            submit_btn = gr.Button("Scrape & Analyze")
+
+        output = gr.Textbox(label="Analysis Results", lines=20, interactive=False)
+
+        submit_btn.click(
+            fn=process_url,
+            inputs=url_input,
+            outputs=output
+        )
+
+        gr.Examples(
+            examples=[
+                ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+                ["https://www.nytimes.com"],
+                ["https://www.bbc.com/news/technology"]
+            ],
+            inputs=url_input
+        )
+
+    return demo
+
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.launch()
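
For a quick check outside the Gradio UI, a minimal sketch along these lines should work, assuming the dependencies above are installed and OPENAI_API_KEY is available in the environment or a local .env file; only the webscrapagent module and its process_url function come from this commit, the rest is illustrative.

    # Illustrative smoke test, not part of the commit.
    # webscrapagent reads OPENAI_API_KEY at import time via load_dotenv().
    from webscrapagent import process_url

    # Scrapes the page, then runs the two CrewAI agents sequentially and returns the analysis.
    result = process_url("https://example.com")
    print(result)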