bala00712200502 committed
Commit a29b87f · verified · 1 Parent(s): 542ebc2

Update app.py

Files changed (1)
  1. app.py +59 -72
app.py CHANGED
@@ -1,51 +1,50 @@
  import os
+ import requests
+ from bs4 import BeautifulSoup
+ from functools import lru_cache
  from dotenv import load_dotenv
  from crewai import Agent, Task, Crew, Process
  from langchain_openai import ChatOpenAI
  import gradio as gr
- import requests
- from bs4 import BeautifulSoup

  # Load environment variables
  load_dotenv()

- # Set up OpenAI API key
- os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
-
- # Initialize LLM
- llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
+ # Initialize LLM with timeout
+ llm = ChatOpenAI(
+     model="gpt-3.5-turbo",
+     temperature=0.7,
+     request_timeout=60
+ )

  class WebScraper:
      def __init__(self):
-         # Define agents
          self.scraper_agent = Agent(
-             role='Senior Web Scraper',
-             goal='Extract content from web pages accurately and efficiently',
-             backstory="""You are an expert web scraper with years of experience in extracting
-             information from various websites. You know how to handle different website structures
-             and can adapt to different content formats.""",
-             verbose=True, # Changed to boolean
+             role='Web Scraper',
+             goal='Extract clean content from web pages',
+             backstory="Expert in extracting information from websites.",
+             verbose=False, # Disable verbose in production
              allow_delegation=False,
              llm=llm
          )

          self.analyst_agent = Agent(
              role='Content Analyst',
-             goal='Analyze and summarize scraped content effectively',
-             backstory="""You are a skilled content analyst who can take raw scraped data and
-             transform it into meaningful, organized information. You excel at summarizing,
-             categorizing, and extracting key insights from web content.""",
-             verbose=True, # Changed to boolean
+             goal='Summarize scraped content',
+             backstory="Skilled at analyzing and summarizing content.",
+             verbose=False,
              allow_delegation=False,
              llm=llm
          )

      def scrape_website(self, url):
-         """Basic web scraping function"""
          try:
              headers = {
-                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
-             response = requests.get(url, headers=headers)
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+             }
+
+             response = requests.get(url, headers=headers, timeout=15)
              response.raise_for_status()

              soup = BeautifulSoup(response.text, 'html.parser')
@@ -54,92 +53,80 @@ class WebScraper:
              for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'noscript']):
                  element.decompose()

-             # Get text content
              text = soup.get_text(separator='\n', strip=True)

              return {
                  'status': 'success',
                  'url': url,
-                 'content': text[:10000] # Limit to first 10k characters to avoid token limits
+                 'content': text[:5000] # Smaller limit for Spaces
              }
          except Exception as e:
              return {
                  'status': 'error',
                  'url': url,
-                 'error': str(e)
+                 'error': f"Scraping failed: {str(e)}"
              }

      def analyze_content(self, content):
-         """Process the scraped content with CrewAI"""
-         # Define tasks
          scrape_task = Task(
-             description=f"Extract and clean the content from the provided web page data.",
-             expected_output="A clean, well-formatted text containing the main content of the web page if posible give in table formet.",
+             description="Extract and clean the web page content.",
+             expected_output="Clean text content in markdown format.",
              agent=self.scraper_agent
          )

          analyze_task = Task(
-             description="Analyze the scraped content and provide a comprehensive summary and key points.",
-             expected_output="A detailed summary of the content with bullet points of key information.",
+             description="Summarize the content with key points.",
+             expected_output="Bullet point summary with main ideas.",
              agent=self.analyst_agent
          )

-         # Create crew
          crew = Crew(
              agents=[self.scraper_agent, self.analyst_agent],
              tasks=[scrape_task, analyze_task],
-             verbose=True, # Changed from 2 to True
+             verbose=False,
              process=Process.sequential
          )

-         # Execute tasks
-         result = crew.kickoff(inputs={'content': content})
-         return result
+         try:
+             result = crew.kickoff(inputs={'content': content})
+             return result
+         except Exception as e:
+             return f"Analysis failed: {str(e)}"

+ @lru_cache(maxsize=32)
  def process_url(url):
-     """Process URL through scraping and analysis"""
      scraper = WebScraper()
-
-     # Step 1: Scrape the website
      scraped_data = scraper.scrape_website(url)

      if scraped_data['status'] == 'error':
-         return f"Error scraping website: {scraped_data['error']}"
+         return f"Error: {scraped_data['error']}"

-     # Step 2: Analyze content
-     analysis_result = scraper.analyze_content(scraped_data['content'])
-
-     return analysis_result
+     return scraper.analyze_content(scraped_data['content'])

- # Gradio Interface
- def create_gradio_interface():
-     with gr.Blocks() as demo:
-         gr.Markdown("# Web Scraping Agent with CrewAI")
-         gr.Markdown("Enter a URL to scrape and analyze its content")
+ with gr.Blocks() as demo:
+     gr.Markdown("# Web Scraping Agent")
+     gr.Markdown("Enter a URL to analyze (simple informational sites work best)")

-         with gr.Row():
-             url_input = gr.Textbox(label="Enter URL", placeholder="https://example.com")
-             submit_btn = gr.Button("Scrape & Analyze")
+     with gr.Row():
+         url_input = gr.Textbox(label="URL", placeholder="https://example.com")
+         submit_btn = gr.Button("Analyze")

-         output = gr.Textbox(label="Analysis Results", lines=20, interactive=False)
+     output = gr.Markdown(label="Results")

-         submit_btn.click(
-             fn=process_url,
-             inputs=url_input,
-             outputs=output
-         )
+     submit_btn.click(
+         fn=process_url,
+         inputs=url_input,
+         outputs=output
+     )

-         gr.Examples(
-             examples=[
-                 ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
-                 ["https://www.nytimes.com"],
-                 ["https://www.bbc.com/news/technology"]
-             ],
-             inputs=url_input
-         )
+     gr.Examples(
+         examples=[
+             ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
+             ["https://www.bbc.com/news/technology-68639847"],
+             ["https://www.nasa.gov/about/index.html"]
+         ],
+         inputs=url_input
+     )

-     return demo

  if __name__ == "__main__":
-     demo = create_gradio_interface()
      demo.launch()
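
Note on the removed key-handling line: the old `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` assignment was redundant when the variable was already set (and raised a TypeError when it was not), and langchain_openai's ChatOpenAI reads OPENAI_API_KEY from the environment on its own, so load_dotenv() plus a .env file (or a Spaces secret) is enough. A minimal sketch of the assumed .env layout, with a placeholder value:

# .env (assumed layout; python-dotenv loads it, ChatOpenAI reads the variable by default)
OPENAI_API_KEY=sk-your-key-here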
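
One behavioral change worth noting: with @lru_cache(maxsize=32), repeat submissions of the same URL are served from an in-process cache instead of being re-scraped, and error strings are cached the same way until the cache is cleared. A minimal usage sketch with the standard functools helpers (the URL is illustrative):

# process_url is memoized by its URL string, so the second call skips both
# the scrape and the crew run and returns the cached result.
first = process_url("https://example.com")   # miss: scrape + analyze
second = process_url("https://example.com")  # hit: served from cache

print(process_url.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=32, currsize=1)
process_url.cache_clear()        # also evicts any cached error messages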