MainStreet123 committed on
Commit
2543503
·
verified ·
1 Parent(s): 547bdb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -101
app.py CHANGED
@@ -1,121 +1,253 @@
1
  import os
 
 
2
  import sys
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
6
-
7
- # Allow importing smolagents from hf-smolagents venv when running from project root
8
- _root = os.path.dirname(os.path.abspath(__file__))
9
- _venv_lib = os.path.join(_root, "hf-smolagents", ".venv", "lib")
10
- if os.path.isdir(_venv_lib):
11
- for _name in os.listdir(_venv_lib):
12
- if _name.startswith("python"):
13
- _sp = os.path.join(_venv_lib, _name, "site-packages")
14
- if os.path.isdir(_sp):
15
- sys.path.insert(0, _sp)
16
- break
17
- else:
18
- _sp = None
19
- else:
20
- _sp = None
21
-
22
- from smolagents import CodeAgent, InferenceClientModel
23
- from smolagents.default_tools import (
24
- DuckDuckGoSearchTool,
25
- FinalAnswerTool,
26
- PythonInterpreterTool,
27
- UserInputTool,
28
- )
29
 
30
  # (Keep Constants as is)
31
  # --- Constants ---
32
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
33
 
34
- # --- Multi-Agent System (smolagents) ---
35
- def _create_model():
36
- token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
37
- return InferenceClientModel(
38
- model_id="Qwen/Qwen2.5-Coder-7B-Instruct",
39
- token=token,
40
- )
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- def _create_code_agent(model):
44
- """Code agent: Python interpreter + final answer. For math, calculations, data."""
45
- return CodeAgent(
46
- tools=[PythonInterpreterTool(), UserInputTool()],
47
- model=model,
48
- name="code_agent",
49
- description="Use for math, calculations, data processing, or when the task can be solved by writing and running Python code. Call with a single clear task.",
50
- max_steps=15,
51
- )
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- def _create_web_agent(model):
55
- """Web agent: DuckDuckGo search + final answer. For factual/search tasks."""
56
- return CodeAgent(
57
- tools=[DuckDuckGoSearchTool(), UserInputTool()],
58
- model=model,
59
- name="web_agent",
60
- description="Use for factual questions, current events, or when you need to search the web. Give one search task per call; you can call me multiple times with different queries.",
61
- max_steps=15,
62
- )
 
 
 
 
 
 
63
 
64
 
65
- def _create_evaluator_agent(model):
66
- """Evaluator: picks best answer from multiple candidates, returns via final_answer."""
67
- return CodeAgent(
68
- tools=[FinalAnswerTool(), UserInputTool()],
69
- model=model,
70
- name="evaluator_agent",
71
- description="Use to pick the single best answer from multiple candidate answers. Pass a task containing the original question and the list of candidate answers; return only the chosen best answer (concise, factual).",
72
- max_steps=5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  )
 
 
 
 
 
 
74
 
75
 
76
- def _create_manager_agent(model):
77
- """Manager: decides code_agent vs web_agent; if web, runs multiple web queries then evaluator then final_answer."""
78
- code_agent = _create_code_agent(model)
79
- web_agent = _create_web_agent(model)
80
- evaluator_agent = _create_evaluator_agent(model)
81
- return CodeAgent(
82
- tools=[UserInputTool()],
83
- model=model,
84
- managed_agents=[code_agent, web_agent, evaluator_agent],
85
- name="manager",
86
- description="Orchestrator: decide whether to use code_agent or web_agent. For web tasks, call web_agent multiple times with different search queries, then call evaluator_agent with the question and all answers to pick the best one, then call final_answer with that result.",
87
- max_steps=25,
88
- planning_interval=3,
89
- )
 
 
 
 
 
90
 
 
 
 
 
 
 
 
 
91
 
92
- class BasicAgent:
93
- """Wrapper: builds manager-led multi-agent system and returns final answer string per question."""
94
 
 
95
  def __init__(self):
96
- print("Multi-agent system: initializing model and manager...")
97
- self._model = _create_model()
98
- self._manager = _create_manager_agent(self._model)
99
- print("Multi-agent system initialized (manager + code_agent, web_agent, evaluator_agent).")
100
 
101
  def __call__(self, question: str) -> str:
102
- print(f"Agent received question (first 80 chars): {question[:80]}...")
103
- try:
104
- result = self._manager.run(
105
- question,
106
- reset=True,
107
- stream=False,
108
- return_full_result=False,
109
- )
110
- if result is None:
111
- out = "No answer produced."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  else:
113
- out = str(result).strip()
114
- print(f"Agent returning answer (first 80 chars): {out[:80]}...")
115
- return out
116
- except Exception as e:
117
- print(f"Agent error: {e}")
118
- return f"AGENT ERROR: {e}"
 
 
 
 
 
 
 
119
 
120
  def run_and_submit_all( profile: gr.OAuthProfile | None):
121
  """
@@ -136,9 +268,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
136
  questions_url = f"{api_url}/questions"
137
  submit_url = f"{api_url}/submit"
138
 
139
- # 1. Instantiate Agent ( modify this part to create your agent)
140
  try:
141
- agent = BasicAgent()
142
  except Exception as e:
143
  print(f"Error instantiating agent: {e}")
144
  return f"Error initializing agent: {e}", None
@@ -240,17 +372,20 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
240
 
241
  # --- Build Gradio Interface using Blocks ---
242
  with gr.Blocks() as demo:
243
- gr.Markdown("# Basic Agent Evaluation Runner")
244
  gr.Markdown(
245
  """
 
 
246
  **Instructions:**
247
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
 
248
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
249
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
250
  ---
251
  **Disclaimers:**
252
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
253
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
254
  """
255
  )
256
 
@@ -289,5 +424,4 @@ if __name__ == "__main__":
289
  print("-"*(60 + len(" App Starting ")) + "\n")
290
 
291
  print("Launching Gradio Interface for Basic Agent Evaluation...")
292
- demo.launch(debug=True, share=False)
293
-
 
1
  import os
2
+ import re
3
+ import io
4
  import sys
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
+ from duckduckgo_search import DDGS
9
+ from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
# (Keep Constants as is)
# --- Constants ---
# Scoring endpoint for the HF Agents course (serves questions, accepts submissions).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Base URL of the Hugging Face serverless Inference API.
HF_INFERENCE_URL = "https://api-inference.huggingface.co/models"
# Model used for routing questions and drafting Python code.
ROUTER_MODEL = "HuggingFaceH4/zephyr-7b-beta"
# Model used for answer synthesis and accuracy judging.
EVALUATOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
# Upper bound on the manager's retry loop (it breaks earlier once both agents were tried).
MAX_MANAGER_ITERATIONS = 5

# --- Tools (used by agents) ---
 
 
 
 
 
 
20
 
21
def python_interpreter_tool(code: str) -> str:
    """Execute a Python snippet and return captured stdout plus its `result`.

    The snippet runs in one shared namespace passed as both globals and
    locals.  (Passing separate dicts — the previous behavior — broke any
    snippet whose top-level helper functions call each other, because name
    resolution inside a function body only consults globals.)

    Returns:
        Captured stdout, with `str(result)` appended when the snippet assigns
        a non-None `result`; "(no output)" when nothing was produced; or an
        "Error: ..." string when execution raised.
    """
    old_stdout = sys.stdout
    sys.stdout = buf = io.StringIO()
    try:
        # SECURITY: exec runs arbitrary (LLM-generated) code with full
        # builtins.  Acceptable only because this Space is a throwaway demo;
        # do not reuse this pattern on untrusted input elsewhere.
        namespace = {"__builtins__": __builtins__}
        # Single dict for globals AND locals so top-level defs are resolved
        # in the same namespace they were defined in.
        exec(code, namespace)
        out = buf.getvalue()
        if namespace.get("result") is not None:
            out = (out + "\n" + str(namespace["result"])).strip()
        return out or "(no output)"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always restore stdout, even when exec raised.
        sys.stdout = old_stdout
37
 
 
 
 
 
 
 
 
 
 
38
 
39
def duckduckgo_search_tool(query: str, max_results: int = 5) -> str:
    """Run a DuckDuckGo text search and return markdown-ish snippet blocks.

    Each hit is rendered as "[title](url)" followed by its body snippet;
    blocks are separated by blank lines.  Errors are reported as a string
    rather than raised, so callers can treat the result uniformly as text.
    """
    try:
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
        if not hits:
            return "No search results found."
        formatted = [
            f"[{hit.get('title', '')}]({hit.get('href', '')})\n{hit.get('body', '')}"
            for hit in hits
        ]
        return "\n\n".join(formatted)
    except Exception as e:
        return f"Search error: {e}"
55
 
56
+
57
def visit_web_page_tool(url: str, max_chars: int = 8000) -> str:
    """Download a page and return its visible text, capped at *max_chars*.

    Scripts and style tags are stripped before extracting text; runs of three
    or more newlines are collapsed to two.  Network or parse failures are
    returned as a "Visit error: ..." string instead of raising.
    """
    try:
        resp = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"},
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # Drop non-content tags before text extraction.
        for removable in soup(["script", "style"]):
            removable.decompose()
        cleaned = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True))
        return cleaned[:max_chars]
    except Exception as e:
        return f"Visit error: {e}"
71
 
72
 
73
def _llm_call(prompt: str, model: str, max_new_tokens: int = 150) -> str:
    """Single text-generation call via the Hugging Face Inference API.

    Returns the generated text, or "" on any failure: missing token,
    non-200 response, unexpected payload shape, or a raised exception.
    Callers treat "" as "no LLM available" and fall back to heuristics.
    """
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
    if not token:
        return ""
    try:
        resp = requests.post(
            f"{HF_INFERENCE_URL}/{model}",
            headers={"Authorization": f"Bearer {token}"},
            json={
                "inputs": prompt,
                "parameters": {"max_new_tokens": max_new_tokens, "return_full_text": False},
            },
            timeout=30,
        )
        if resp.status_code != 200:
            return ""
        payload = resp.json()
        # The API returns either a list of generations or a single dict.
        if isinstance(payload, list) and payload:
            return (payload[0].get("generated_text") or "").strip()
        if isinstance(payload, dict) and payload.get("generated_text"):
            return str(payload["generated_text"]).strip()
    except Exception:
        # Deliberate best-effort: any transport/parse error degrades to "".
        pass
    return ""
96
+
97
+
98
def manager_route_question(question: str) -> str:
    """Decide whether to use code agent or web search agent. Returns 'code' or 'web'."""
    lowered = question.lower()
    # Cheap keyword screen first; the LLM router is only consulted when no
    # code-ish keyword matches.
    code_keywords = (
        "calculate", "compute", "python", "code", "program", "script", "function",
        "how many", "number of", "formula", "equation", "sum", "multiply", "divide",
        "percentage", "average", "median", "prime", "fibonacci", "factorial",
        "run code", "execute", "output of", "result of"
    )
    for keyword in code_keywords:
        if keyword in lowered:
            return "code"
    prompt = f'Given this question, reply with exactly one word: "code" or "web". Question: {question[:300]}'
    verdict = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=10).lower()
    # Ambiguous or empty router replies default to web search.
    return "code" if "code" in verdict else "web"
116
+
117
+
118
def evaluate_accuracy_tool(question: str, answer: str) -> bool:
    """Use LLM to judge if answer looks mostly accurate. If no LLM, accept non-empty non-error answers."""
    # Reject obviously broken answers up front (empty, or error markers).
    looks_broken = not answer or "Error:" in answer or "error:" in answer[:200]
    if looks_broken:
        return False
    prompt = (
        f'Question: {question}\nProposed answer: {answer[:800]}\n'
        'Does this answer look mostly correct and complete? Reply with exactly "yes" or "no".'
    )
    verdict = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=5).lower()
    if "yes" in verdict:
        return True
    if "no" in verdict:
        return False
    # No usable LLM verdict: accept anything non-trivial that isn't a "not found".
    return len(answer.strip()) > 10 and "not found" not in answer.lower()[:100]
132
 
133
 
134
def final_answer_tool(answer: str) -> str:
    """Commit the final answer (manager returns this as the answer)."""
    trimmed = answer.strip()
    return trimmed
137
+
138
+
139
+ # --- Code Agent (has Python interpreter tool) ---
140
+
141
+ def _extract_python_code(text: str) -> str:
142
+ if not text:
143
+ return ""
144
+ text = text.strip()
145
+ for marker in ["```python", "```"]:
146
+ if marker in text:
147
+ parts = text.split(marker, 1)
148
+ if len(parts) > 1:
149
+ rest = parts[1].split("```", 1)[0]
150
+ return rest.strip()
151
+ return text
152
+
153
 
154
+ def _heuristic_code_from_question(question: str) -> str:
155
+ numbers = re.findall(r"\d+(?:\.\d+)?", question)
156
+ q = question.lower()
157
+ if "how many" in q or "number of" in q:
158
+ return "result = ' (code agent could not compute; try web search)'"
159
+ if numbers and ("sum" in q or "total" in q or "+" in question):
160
+ return f"result = {' + '.join(numbers)}"
161
+ return "result = ' (no code generated; try web search)'"
162
 
 
 
163
 
164
class CodeAgent:
    """Agent that answers a question by drafting Python code and running it.

    The LLM is asked for a code block that stores its answer in `result`;
    when no LLM output is available, a simple heuristic generator is used
    instead.  The extracted code is executed via python_interpreter_tool.
    """

    def __init__(self):
        print("CodeAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"CodeAgent received (first 50 chars): {question[:50]}...")
        prompt = (
            f"Question: {question}\n\n"
            "Write a single Python code block to answer this. Use a variable 'result' for the final answer. "
            "Only output valid Python code, no explanation."
        )
        generated = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=400)
        if not generated:
            # LLM unavailable or empty reply: fall back to the heuristic.
            generated = _heuristic_code_from_question(question)
        code = _extract_python_code(generated)
        if not code:
            return "Could not generate code for this question."
        return python_interpreter_tool(code)
182
+
183
+
184
+ # --- Web Search Agent (DuckDuckGo + visit web page tools) ---
185
+
186
class WebSearchAgent:
    """Agent that answers via DuckDuckGo snippets plus one page visit.

    Collects search snippets, enriches them with the first linked page's
    text when available, and asks an LLM to distill a short answer.  When
    no LLM output comes back, it returns the first substantial snippet.
    """

    def __init__(self):
        print("WebSearchAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"WebSearchAgent received (first 50 chars): {question[:50]}...")
        snippets = duckduckgo_search_tool(question, max_results=5)
        if not snippets or "No search results" in snippets:
            return "No search results found."

        # Pull the first markdown-style "(http...)" link out of the snippets.
        url_matches = (
            re.search(r"\((https?://[^)]+)\)", line) for line in snippets.split("\n")
        )
        first_url = next((m.group(1) for m in url_matches if m), None)
        if first_url:
            page_text = visit_web_page_tool(first_url, max_chars=4000)
            if "Visit error" not in page_text:
                snippets = snippets + "\n\n--- Page content ---\n" + page_text[:3000]

        prompt = (
            f"Question: {question}\n\nRelevant information:\n{snippets[:6000]}\n\n"
            "Provide a concise, direct answer (string or number). No preamble."
        )
        answer = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=200)
        if answer:
            return answer.strip()

        # No LLM: hand back the first reasonably long snippet block instead.
        candidates = [b.strip() for b in snippets.split("\n\n") if len(b.strip()) > 20]
        return candidates[0][:500] if candidates else snippets[:500]
214
+
215
+
216
+ # --- Manager Agent (user input = question; routes code/web; evaluates accuracy; final answer or retry) ---
217
+
218
class ManagerAgent:
    """Orchestrator: routes a question to the code or web agent.

    Each candidate reply is screened for error markers and judged by
    evaluate_accuracy_tool; an accepted reply is returned immediately via
    final_answer_tool.  If the preferred agent's reply is rejected, the
    other agent is tried once; after both have run, the best surviving
    reply (or a failure message) is returned.
    """

    def __init__(self):
        self.code_agent = CodeAgent()
        self.web_agent = WebSearchAgent()
        print("ManagerAgent initialized.")

    def __call__(self, question: str) -> str:
        print(f"Manager received question (first 50 chars): {question[:50]}...")
        # Route once up front: the decision depends only on the question, and
        # manager_route_question may issue a network LLM call — re-evaluating
        # it on every loop iteration (as before) wasted a remote call per try.
        route = manager_route_question(question)
        best_answer = None
        tried_code = False
        tried_web = False
        for _ in range(MAX_MANAGER_ITERATIONS):
            if route == "code" and not tried_code:
                tried_code = True
                reply = self.code_agent(question)
            elif route == "web" and not tried_web:
                tried_web = True
                reply = self.web_agent(question)
            else:
                # Preferred agent already tried: fall back to the other one,
                # or stop once both have been exhausted.
                if not tried_code:
                    tried_code = True
                    reply = self.code_agent(question)
                elif not tried_web:
                    tried_web = True
                    reply = self.web_agent(question)
                else:
                    break
            # Keep the reply only if it doesn't open with an error marker.
            if reply and "Error:" not in reply[:100] and "Could not" not in reply[:100]:
                best_answer = reply
                if evaluate_accuracy_tool(question, reply):
                    return final_answer_tool(reply)
        return final_answer_tool(best_answer) if best_answer else "I could not determine a reliable answer."
251
 
252
  def run_and_submit_all( profile: gr.OAuthProfile | None):
253
  """
 
268
  questions_url = f"{api_url}/questions"
269
  submit_url = f"{api_url}/submit"
270
 
271
+ # 1. Instantiate Agent (multi-agent: Manager with Code + Web Search agents)
272
  try:
273
+ agent = ManagerAgent()
274
  except Exception as e:
275
  print(f"Error instantiating agent: {e}")
276
  return f"Error initializing agent: {e}", None
 
372
 
373
  # --- Build Gradio Interface using Blocks ---
374
  with gr.Blocks() as demo:
375
+ gr.Markdown("# Multi-Agent GAIA Evaluation Runner")
376
  gr.Markdown(
377
  """
378
+ **Architecture:** Manager Agent routes each question to either a **Code Agent** (Python interpreter) or **Web Search Agent** (DuckDuckGo + visit web page). The manager evaluates answer accuracy via an LLM; if mostly accurate it returns the final answer, otherwise it tries the other agent. Goal: score above 30 on GAIA.
379
+
380
  **Instructions:**
381
+
382
+ 1. Clone this space, then modify the code to tune agents, tools, or add an API token (HF_TOKEN or HUGGING_FACE_HUB_TOKEN) for LLM routing and evaluation.
383
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
384
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the multi-agent system, submit answers, and see the score.
385
+
386
  ---
387
  **Disclaimers:**
388
+ Running the evaluation can take a long time while the agent processes all questions. For better GAIA scores, set HF_TOKEN in Space secrets for LLM-based routing and accuracy checks.
 
389
  """
390
  )
391
 
 
424
  print("-"*(60 + len(" App Starting ")) + "\n")
425
 
426
  print("Launching Gradio Interface for Basic Agent Evaluation...")
427
+ demo.launch(debug=True, share=False)