MainStreet123 committed on
Commit
993aee8
·
verified ·
1 Parent(s): 00d93b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -301
app.py CHANGED
@@ -1,321 +1,193 @@
1
  import os
2
  import re
3
- import io
4
- import sys
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
- from duckduckgo_search import DDGS
9
  from bs4 import BeautifulSoup
 
 
 
10
 
11
  # (Keep Constants as is)
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
- HF_INFERENCE_URL = "https://api-inference.huggingface.co/models"
15
- ROUTER_MODEL = "HuggingFaceH4/zephyr-7b-beta"
16
- EVALUATOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
17
- CODE_MODEL = "HuggingFaceH4/zephyr-7b-beta"
18
- EXTRACTOR_MODEL = "HuggingFaceH4/zephyr-7b-beta"
19
- MAX_MANAGER_ITERATIONS = 5
20
- MAX_WEB_PAGES_TO_VISIT = 3
21
- MAX_WEB_SEARCH_ROUNDS = 2
22
-
23
- # --- Tools (used by agents) ---
24
-
25
- def python_interpreter_tool(code: str) -> str:
26
- """Execute Python code and return stdout + result."""
27
- try:
28
- old_stdout = sys.stdout
29
- sys.stdout = buf = io.StringIO()
30
- try:
31
- local = {}
32
- exec(code, {"__builtins__": __builtins__}, local)
33
- out = buf.getvalue()
34
- if local.get("result") is not None:
35
- out = (out + "\n" + str(local["result"])).strip()
36
- return out or "(no output)"
37
- finally:
38
- sys.stdout = old_stdout
39
- except Exception as e:
40
- return f"Error: {e}"
41
 
42
 
43
- def duckduckgo_search_tool(query: str, max_results: int = 5) -> str:
44
- """Search DuckDuckGo and return snippets."""
 
45
  try:
46
- with DDGS() as ddgs:
47
- results = list(ddgs.text(query, max_results=max_results))
48
  if not results:
49
  return "No search results found."
50
- parts = []
51
- for r in results:
52
- title = r.get("title", "")
53
- body = r.get("body", "")
54
- href = r.get("href", "")
55
- parts.append(f"[{title}]({href})\n{body}")
56
- return "\n\n".join(parts)
57
  except Exception as e:
58
- return f"Search error: {e}"
59
 
60
 
61
- def visit_web_page_tool(url: str, max_chars: int = 8000) -> str:
62
- """Fetch a URL and return main text content."""
63
  try:
64
- headers = {"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"}
65
- resp = requests.get(url, timeout=15, headers=headers)
66
- resp.raise_for_status()
67
- soup = BeautifulSoup(resp.text, "html.parser")
68
- for tag in soup(["script", "style"]):
69
  tag.decompose()
70
  text = soup.get_text(separator="\n", strip=True)
71
- text = re.sub(r"\n{3,}", "\n\n", text)
72
- return text[:max_chars] if len(text) > max_chars else text
73
  except Exception as e:
74
- return f"Visit error: {e}"
75
-
76
-
77
- def _llm_call(prompt: str, model: str, max_new_tokens: int = 150) -> str:
78
- """Single LLM call via Hugging Face Inference API."""
79
- token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
80
- if not token:
81
- return ""
82
- url = f"{HF_INFERENCE_URL}/{model}"
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- r = requests.post(
85
- url,
86
- headers={"Authorization": f"Bearer {token}"},
87
- json={"inputs": prompt, "parameters": {"max_new_tokens": max_new_tokens, "return_full_text": False}},
88
- timeout=30,
89
- )
90
- if r.status_code != 200:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  return ""
92
- data = r.json()
93
- if isinstance(data, list) and len(data) > 0:
94
- return (data[0].get("generated_text") or "").strip()
95
- if isinstance(data, dict) and data.get("generated_text"):
96
- return str(data["generated_text"]).strip()
97
- return ""
98
- except Exception:
99
- return ""
100
-
101
-
102
- def manager_route_question(question: str) -> str:
103
- """Decide whether to use code agent or web search agent. Returns 'code' or 'web'."""
104
- q = question.lower()
105
- code_keywords = (
106
- "calculate", "compute", "python", "code", "program", "script", "function",
107
- "how many", "number of", "formula", "equation", "sum", "multiply", "divide",
108
- "percentage", "average", "median", "prime", "fibonacci", "factorial",
109
- "run code", "execute", "output of", "result of"
110
- )
111
- if any(k in q for k in code_keywords):
112
- return "code"
113
- prompt = f'Given this question, reply with exactly one word: "code" or "web". Question: {question[:300]}'
114
- out = _llm_call(prompt, ROUTER_MODEL, max_new_tokens=10).lower()
115
- if "code" in out:
116
- return "code"
117
- if "web" in out:
118
- return "web"
119
- return "web"
120
-
121
-
122
- def evaluate_accuracy_tool(question: str, answer: str) -> bool:
123
- """Use LLM to judge if answer looks mostly accurate. If no LLM, accept non-empty non-error answers."""
124
- if not answer or "Error:" in answer or "error:" in answer[:200]:
125
- return False
126
- prompt = (
127
- f'Question: {question}\nProposed answer: {answer[:800]}\n'
128
- 'Does this answer look mostly correct and complete? Reply with exactly "yes" or "no".'
129
- )
130
- out = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=5).lower()
131
- if "yes" in out:
132
- return True
133
- if "no" in out:
134
- return False
135
- return len(answer.strip()) > 10 and "not found" not in answer.lower()[:100]
136
-
137
-
138
- def final_answer_tool(answer: str) -> str:
139
- """Commit the final answer (manager returns this as the answer)."""
140
- return answer.strip()
141
-
142
-
143
- def _looks_like_number(s: str) -> bool:
144
- s = s.strip().rstrip("%")
145
- try:
146
- float(s.replace(",", ""))
147
- return True
148
- except ValueError:
149
- return False
150
-
151
-
152
- def normalize_to_gaia_answer(question: str, raw_answer: str) -> str:
153
- """Extract a short, GAIA-style answer: one word, number, or short comma-separated list."""
154
- if not raw_answer or not raw_answer.strip():
155
- return raw_answer.strip() if raw_answer else ""
156
- raw = raw_answer.strip()
157
- lines = [ln.strip() for ln in raw.split("\n") if ln.strip()]
158
- for candidate in reversed(lines):
159
- if 1 <= len(candidate) <= 120 and "Error" not in candidate and "Could not" not in candidate:
160
- if candidate[0].isdigit() or (not candidate.startswith("(") and "http" not in candidate.lower()):
161
- if "," in candidate and len(candidate) < 80:
162
- return candidate
163
- if candidate.isdigit() or _looks_like_number(candidate):
164
- return candidate
165
- if len(candidate.split()) <= 8:
166
- return candidate
167
- numbers = re.findall(r"\b\d+(?:\.\d+)?%?\b", raw)
168
- if numbers:
169
- return numbers[-1]
170
- prompt = (
171
- f"Question: {question}\n\nLong answer or context:\n{raw[:1000]}\n\n"
172
- "Output ONLY the final answer: one word, one number, or a short comma-separated list (no explanation, no period at end). "
173
- "Example: Paris | 42 | apple, banana"
174
- )
175
- out = _llm_call(prompt, EXTRACTOR_MODEL, max_new_tokens=50).strip()
176
- if out:
177
- out = out.rstrip(".")
178
- if len(out) <= 150:
179
- return out
180
- for seg in re.split(r"[\n.!?]", raw):
181
- seg = seg.strip()
182
- if 1 <= len(seg) <= 100 and "Error" not in seg:
183
- return seg
184
- return raw[:200].strip()
185
-
186
-
187
- # --- Code Agent (has Python interpreter tool) ---
188
-
189
- def _extract_python_code(text: str) -> str:
190
- if not text:
191
- return ""
192
- text = text.strip()
193
- for marker in ["```python", "```"]:
194
- if marker in text:
195
- parts = text.split(marker, 1)
196
- if len(parts) > 1:
197
- rest = parts[1].split("```", 1)[0]
198
- return rest.strip()
199
- return text
200
-
201
-
202
- def _heuristic_code_from_question(question: str) -> str:
203
- numbers = re.findall(r"\d+(?:\.\d+)?", question)
204
- q = question.lower()
205
- if "how many" in q or "number of" in q:
206
- return "result = ' (code agent could not compute; try web search)'"
207
- if numbers and ("sum" in q or "total" in q or "+" in question):
208
- return f"result = {' + '.join(numbers)}"
209
- return "result = ' (no code generated; try web search)'"
210
-
211
-
212
- class CodeAgent:
213
- def __init__(self):
214
- print("CodeAgent initialized.")
215
-
216
- def __call__(self, question: str) -> str:
217
- print(f"CodeAgent received (first 50 chars): {question[:50]}...")
218
- prompt = (
219
- f"Question: {question}\n\n"
220
- "Write a single Python code block to answer this. Use a variable 'result' for the final answer. "
221
- "The value of 'result' must be a single number, one word, or a short phrase (GAIA format: no long explanation). "
222
- "Only output valid Python code, no explanation."
223
- )
224
- code = _llm_call(prompt, CODE_MODEL, max_new_tokens=400)
225
- if not code:
226
- code = _heuristic_code_from_question(question)
227
- code = _extract_python_code(code)
228
- if not code:
229
- return "Could not generate code for this question."
230
- return python_interpreter_tool(code)
231
-
232
-
233
- # --- Web Search Agent (DuckDuckGo + visit web page tools) ---
234
-
235
- def _urls_from_snippets(snippets: str, max_urls: int = 5) -> list:
236
- urls = []
237
- for line in snippets.split("\n"):
238
- m = re.search(r"\((https?://[^)]+)\)", line)
239
- if m:
240
- u = m.group(1)
241
- if u not in urls:
242
- urls.append(u)
243
- if len(urls) >= max_urls:
244
- break
245
- return urls
246
-
247
-
248
- class WebSearchAgent:
249
- def __init__(self):
250
- print("WebSearchAgent initialized.")
251
 
252
  def __call__(self, question: str) -> str:
253
- print(f"WebSearchAgent received (first 50 chars): {question[:50]}...")
254
- combined = ""
255
- for round_num in range(MAX_WEB_SEARCH_ROUNDS):
256
- query = question if round_num == 0 else f"{question} answer"
257
- snippets = duckduckgo_search_tool(query, max_results=6)
258
- if not snippets or "No search results" in snippets:
259
- if round_num == 0:
260
- return "No search results found."
261
- break
262
- combined += "\n\n--- Search round {} ---\n{}".format(round_num + 1, snippets)
263
- urls = _urls_from_snippets(snippets, max_urls=MAX_WEB_PAGES_TO_VISIT)
264
- for url in urls:
265
- page_text = visit_web_page_tool(url, max_chars=3500)
266
- if "Visit error" not in page_text:
267
- combined += "\n\n--- Page ---\n" + page_text[:3000]
268
- if round_num == 0 and len(combined) > 500:
269
- break
270
- if not combined:
271
- return "No search results found."
272
- prompt = (
273
- f"Question: {question}\n\nRelevant information:\n{combined[:7000]}\n\n"
274
- "Provide ONLY the final answer in GAIA format: one word, one number, or a short comma-separated list. No preamble, no explanation, no period at end."
275
  )
276
- answer = _llm_call(prompt, EVALUATOR_MODEL, max_new_tokens=200)
277
- if answer:
278
- return answer.strip()
279
- blocks = [b.strip() for b in combined.split("\n\n") if len(b.strip()) > 20]
280
- return blocks[0][:400] if blocks else combined[:400]
281
-
282
-
283
- # --- Manager Agent (user input = question; routes code/web; evaluates accuracy; final answer or retry) ---
284
-
285
- class ManagerAgent:
286
- def __init__(self):
287
- self.code_agent = CodeAgent()
288
- self.web_agent = WebSearchAgent()
289
- print("ManagerAgent initialized.")
290
-
291
- def __call__(self, question: str) -> str:
292
- print(f"Manager received question (first 50 chars): {question[:50]}...")
293
- best_answer = None
294
- tried_code = False
295
- tried_web = False
296
- for _ in range(MAX_MANAGER_ITERATIONS):
297
- route = manager_route_question(question)
298
- if route == "code" and not tried_code:
299
- tried_code = True
300
- reply = self.code_agent(question)
301
- elif route == "web" and not tried_web:
302
- tried_web = True
303
- reply = self.web_agent(question)
304
- else:
305
- if not tried_code:
306
- tried_code = True
307
- reply = self.code_agent(question)
308
- elif not tried_web:
309
- tried_web = True
310
- reply = self.web_agent(question)
311
- else:
312
- break
313
- if reply and "Error:" not in reply[:100] and "Could not" not in reply[:100]:
314
- best_answer = reply
315
- if evaluate_accuracy_tool(question, reply):
316
- return normalize_to_gaia_answer(question, final_answer_tool(reply))
317
- out = final_answer_tool(best_answer) if best_answer else "I could not determine a reliable answer."
318
- return normalize_to_gaia_answer(question, out)
319
 
320
  def run_and_submit_all( profile: gr.OAuthProfile | None):
321
  """
@@ -336,9 +208,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
336
  questions_url = f"{api_url}/questions"
337
  submit_url = f"{api_url}/submit"
338
 
339
- # 1. Instantiate Agent (multi-agent: Manager with Code + Web Search agents)
340
  try:
341
- agent = ManagerAgent()
342
  except Exception as e:
343
  print(f"Error instantiating agent: {e}")
344
  return f"Error initializing agent: {e}", None
@@ -440,20 +312,17 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
440
 
441
  # --- Build Gradio Interface using Blocks ---
442
  with gr.Blocks() as demo:
443
- gr.Markdown("# Multi-Agent GAIA Evaluation Runner")
444
  gr.Markdown(
445
  """
446
- **Architecture:** Manager Agent routes each question to either a **Code Agent** (Python interpreter) or **Web Search Agent** (DuckDuckGo + visit web page). The manager evaluates answer accuracy via an LLM; if mostly accurate it returns the final answer, otherwise it tries the other agent. Goal: score above 30 on GAIA.
447
-
448
  **Instructions:**
449
-
450
- 1. Clone this space, then modify the code to tune agents, tools, or add an API token (HF_TOKEN or HUGGING_FACE_HUB_TOKEN) for LLM routing and evaluation.
451
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
452
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run the multi-agent system, submit answers, and see the score.
453
-
454
  ---
455
  **Disclaimers:**
456
- Running the evaluation can take a long time while the agent processes all questions. For better GAIA scores, set HF_TOKEN in Space secrets for LLM-based routing and accuracy checks.
 
457
  """
458
  )
459
 
 
1
  import os
2
  import re
3
+ import json
 
4
  import gradio as gr
5
  import requests
6
  import pandas as pd
7
+ from urllib.parse import quote
8
  from bs4 import BeautifulSoup
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
 
13
  # (Keep Constants as is)
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
17
+ REACT_MAX_STEPS = 10
18
+ LLM_MODEL = "Qwen/Qwen2.5-7B-Instruct"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
+ # --- Tools (DuckDuckGo search, web page view, code agent) ---
22
def tool_web_search(query: str, max_results: int = 5) -> str:
    """Search the web using DuckDuckGo. Input: search query string."""
    try:
        # Imported lazily so the module still loads when the package is absent.
        from duckduckgo_search import DDGS
        hits = list(DDGS().text(query, max_results=max_results))
        if not hits:
            return "No search results found."
        # One numbered entry per hit: title, URL, snippet.
        formatted = [
            f"{idx}. {hit.get('title', '')}\n URL: {hit.get('href', '')}\n {hit.get('body', '')}"
            for idx, hit in enumerate(hits, 1)
        ]
        return "\n\n".join(formatted)
    except Exception as e:
        # Any failure (network, rate limit, missing package) becomes a string
        # observation so the ReAct loop can keep going.
        return f"Web search error: {e}"
35
 
36
 
37
def tool_web_page_view(url: str) -> str:
    """View the main text content of a web page. Input: full URL string."""
    try:
        response = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0 (compatible; ReActAgent/1.0)"},
        )
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, "html.parser")
        # Drop boilerplate elements before extracting the readable text.
        for element in parsed(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        content = parsed.get_text(separator="\n", strip=True)
        # Cap the observation size so it fits in the LLM context.
        if len(content) > 8000:
            return content[:8000]
        return content or "No text content found."
    except Exception as e:
        # Errors are returned as strings so the agent loop never crashes.
        return f"Web page view error: {e}"
50
+
51
+
52
def tool_code_agent(code: str) -> str:
    """Run Python code to compute an answer. Input: a single Python expression or block (e.g. print(2+2)). No file or network access.

    Returns the captured stdout (stripped), a placeholder message when the
    code produced no output, or an error string starting with "Code error:".

    Fix vs. previous version: the old implementation blindly wrapped any input
    lacking "print(" in print(...), which raised a SyntaxError for
    multi-statement blocks (e.g. "x = 5\nx * 2"). Now a bare expression is
    compiled in 'eval' mode and its value printed; anything else is executed
    as a statement block unchanged.
    """
    import builtins
    import io
    import sys
    # Whitelisted builtins only: no __import__, open, eval, exec, etc.
    safe_builtins = {
        "abs": builtins.abs, "all": builtins.all, "any": builtins.any,
        "bin": builtins.bin, "bool": builtins.bool, "chr": builtins.chr,
        "dict": builtins.dict, "divmod": builtins.divmod, "enumerate": builtins.enumerate,
        "filter": builtins.filter, "float": builtins.float, "format": builtins.format,
        "hash": builtins.hash, "int": builtins.int, "len": builtins.len,
        "list": builtins.list, "map": builtins.map, "max": builtins.max,
        "min": builtins.min, "next": builtins.next, "pow": builtins.pow,
        "print": builtins.print, "range": builtins.range, "repr": builtins.repr,
        "reversed": builtins.reversed, "round": builtins.round, "set": builtins.set,
        "sorted": builtins.sorted, "str": builtins.str, "sum": builtins.sum,
        "tuple": builtins.tuple, "zip": builtins.zip,
    }
    try:
        code = code.strip()
        # Capture everything the snippet prints.
        buf = io.StringIO()
        old_stdout = sys.stdout
        sys.stdout = buf
        try:
            try:
                # If the input is a single expression, evaluate it and print
                # the value so "2+2" answers "4" without an explicit print.
                expr = compile(code, "<agent>", "eval")
            except SyntaxError:
                # Statement block: execute as-is; output comes from print().
                exec(compile(code, "<agent>", "exec"),
                     {"__builtins__": safe_builtins, "print": builtins.print}, {})
            else:
                result = eval(expr, {"__builtins__": safe_builtins}, {})
                if result is not None:
                    print(result)
        finally:
            # Always restore stdout, even when the snippet raises.
            sys.stdout = old_stdout
        return buf.getvalue().strip() or "Code ran (no printed output)."
    except Exception as e:
        return f"Code error: {e}"
84
+
85
+
86
# Registry mapping tool names (exactly as the LLM writes them after
# "Action:") to the callables that implement them.
TOOLS = {
    "web_search": tool_web_search,
    "web_page_view": tool_web_page_view,
    "code_agent": tool_code_agent,
}

# Plain-text tool list injected into the system prompt so the model knows
# which Action names are valid and what input each one expects. The names
# here must stay in sync with the TOOLS keys above.
TOOL_DESCRIPTIONS = """Available tools:
- web_search: search the web with DuckDuckGo. Input: search query (string).
- web_page_view: get main text from a web page. Input: URL (string).
- code_agent: run Python code (math, string ops). Input: code (string)."""
96
+
97
+
98
+ # --- ReAct Agent: Plan -> Act -> Observe -> Reflect ---
99
# --- ReAct Agent: Plan -> Act -> Observe -> Reflect ---
class ReActAgent:
    """ReAct-style agent: the LLM alternates Thought/Action/Action Input
    turns (executed against TOOLS) with Observation feedback, until it emits
    "Final Answer: ..." or max_steps is exhausted.
    """

    def __init__(self, token: str | None = None, model: str = LLM_MODEL, max_steps: int = REACT_MAX_STEPS):
        # Explicit token wins; falls back to the HF_TOKEN env constant.
        self.token = (token or HF_TOKEN or "").strip()
        self.model = model
        self.max_steps = max_steps
        print("ReActAgent initialized (plan -> act -> observe -> reflect).")

    def _llm(self, messages: list[dict]) -> str:
        """Call the HF Inference API with the flattened chat and return the
        generated text, or an error string (never raises)."""
        if not self.token:
            return "Error: HF_TOKEN not set. Add your token in .env to use the LLM."
        url = f"https://api-inference.huggingface.co/models/{self.model}"
        headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
        # Chat messages are flattened to a single prompt string because this
        # endpoint is called with the raw text-generation payload format.
        payload = {"inputs": self._messages_to_prompt(messages), "parameters": {"max_new_tokens": 512, "return_full_text": False}}
        try:
            r = requests.post(url, json=payload, headers=headers, timeout=60)
            r.raise_for_status()
            data = r.json()
            # The API may return either a list of generations or a dict.
            if isinstance(data, list) and len(data) > 0:
                return (data[0].get("generated_text") or "").strip()
            if isinstance(data, dict) and "generated_text" in data:
                return (data["generated_text"] or "").strip()
            return ""
        except Exception as e:
            return f"LLM error: {e}"

    def _messages_to_prompt(self, messages: list[dict]) -> str:
        """Flatten chat-style messages into a "System:/User:/Assistant:"
        transcript, ending with "Assistant:" to cue the model's reply."""
        out = []
        for m in messages:
            role = m.get("role", "user")
            content = m.get("content", "")
            if role == "system":
                out.append(f"System: {content}")
            elif role == "user":
                out.append(f"User: {content}")
            else:
                out.append(f"Assistant: {content}")
        out.append("Assistant:")
        return "\n\n".join(out)

    def _parse_action(self, text: str) -> tuple[str | None, str | None, str | None]:
        """Returns (thought, action, action_input) or (None, None, final_answer)."""
        text = text.strip()
        # A Final Answer takes precedence over any Action in the same reply.
        final_match = re.search(r"Final Answer\s*:\s*(.+?)(?=\n\n|\Z)", text, re.DOTALL | re.IGNORECASE)
        if final_match:
            return None, None, final_match.group(1).strip()
        action_match = re.search(r"Action\s*:\s*(\w+)", text, re.IGNORECASE)
        # Action Input runs until a blank line, the next Thought, or EOF.
        input_match = re.search(r"Action Input\s*:\s*(.+?)(?=\n\n|\nThought:|\Z)", text, re.DOTALL | re.IGNORECASE)
        thought = None
        thought_match = re.search(r"Thought\s*:\s*(.+?)(?=\nAction:|\Z)", text, re.DOTALL | re.IGNORECASE)
        if thought_match:
            thought = thought_match.group(1).strip()
        action = action_match.group(1).strip() if action_match else None
        action_input = input_match.group(1).strip() if input_match else None
        if action_input:
            # Models often quote the input; strip surrounding quotes.
            action_input = action_input.strip().strip('"\'')
        return thought, action, action_input

    def __call__(self, question: str) -> str:
        """Run the ReAct loop on one question and return the final answer
        string (or a fallback message if no answer was reached)."""
        print(f"ReAct agent received question (first 50 chars): {question[:50]}...")
        if not self.token:
            return "HF_TOKEN not set. Add your Hugging Face token in .env to run the ReAct agent."
        system = (
            "You are a ReAct agent. For each turn you must either:\n"
            "1. Output: Thought: <reasoning> then Action: <tool_name> then Action Input: <input>\n"
            "2. Or when you have the answer: Final Answer: <your answer>\n\n"
            + TOOL_DESCRIPTIONS
        )
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": f"Question: {question}\n\nFirst, plan which tool(s) to use, then take action, then observe, then reflect. Give your final answer when done."},
        ]
        for step in range(self.max_steps):
            response = self._llm(messages)
            thought, action, action_input = self._parse_action(response)
            # (None, None, text) is the final-answer signature of _parse_action.
            if thought is None and action is None and action_input is not None:
                return action_input  # Final Answer
            if not action or action not in TOOLS:
                # Malformed reply: keep it in history and nudge the model
                # toward the required Thought/Action/Final Answer format.
                messages.append({"role": "assistant", "content": response})
                messages.append({"role": "user", "content": "You must use one of the tools (Action: tool_name, Action Input: input) or give Final Answer: your answer. Try again."})
                continue
            try:
                observation = TOOLS[action](action_input)
            except Exception as e:
                observation = f"Tool error: {e}"
            # Truncate long observations to keep the prompt within budget.
            observation = (observation[:3000] + "...") if len(observation) > 3000 else observation
            messages.append({"role": "assistant", "content": response})
            messages.append({"role": "user", "content": f"Observation: {observation}\n\nReflect: does this answer the question? If yes, reply with Final Answer: <answer>. If not, use another tool (Thought / Action / Action Input)."})
        # Max steps exhausted: salvage a Final Answer from the last reply if present.
        last_assistant = next((m["content"] for m in reversed(messages) if m.get("role") == "assistant"), "")
        final = self._parse_action(last_assistant)
        if final[2] and final[0] is None and final[1] is None:
            return final[2]
        return last_assistant[:500] if last_assistant else "ReAct agent reached max steps without a final answer."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  def run_and_submit_all( profile: gr.OAuthProfile | None):
193
  """
 
208
  questions_url = f"{api_url}/questions"
209
  submit_url = f"{api_url}/submit"
210
 
211
+ # 1. Instantiate Agent ( modify this part to create your agent)
212
  try:
213
+ agent = ReActAgent(token=os.getenv("HF_TOKEN"), max_steps=REACT_MAX_STEPS)
214
  except Exception as e:
215
  print(f"Error instantiating agent: {e}")
216
  return f"Error initializing agent: {e}", None
 
312
 
313
  # --- Build Gradio Interface using Blocks ---
314
  with gr.Blocks() as demo:
315
+ gr.Markdown("# Basic Agent Evaluation Runner")
316
  gr.Markdown(
317
  """
 
 
318
  **Instructions:**
319
+ 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
 
320
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
321
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
322
  ---
323
  **Disclaimers:**
324
+ Once you click the "Run Evaluation & Submit All Answers" button, it can take quite some time (this is the time for the agent to go through all the questions).
325
+ This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to address the delay after clicking submit, you could cache the answers and submit them in a separate action, or answer the questions asynchronously.
326
  """
327
  )
328