Gianluca Tessitore committed on
Commit
bfff34c
·
1 Parent(s): 81917a3

upload Agent

Browse files
Files changed (3) hide show
  1. .gitignore +4 -0
  2. app.py +488 -73
  3. requirements.txt +8 -1
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ **/.vscode/
2
+ .venv
3
+ .claude
4
+ .env
app.py CHANGED
@@ -1,34 +1,459 @@
1
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
3
  import requests
4
- import inspect
5
  import pandas as pd
 
6
 
7
- # (Keep Constants as is)
8
  # --- Constants ---
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
- class BasicAgent:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def __init__(self):
15
- print("BasicAgent initialized.")
16
- def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
21
-
22
- def run_and_submit_all( profile: gr.OAuthProfile | None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  """
24
- Fetches all questions, runs the BasicAgent on them, submits all answers,
25
  and displays the results.
26
  """
27
- # --- Determine HF Space Runtime URL and Repo URL ---
28
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
29
 
30
  if profile:
31
- username= f"{profile.username}"
32
  print(f"User logged in: {username}")
33
  else:
34
  print("User not logged in.")
@@ -38,13 +463,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
38
  questions_url = f"{api_url}/questions"
39
  submit_url = f"{api_url}/submit"
40
 
41
- # 1. Instantiate Agent ( modify this part to create your agent)
42
  try:
43
- agent = BasicAgent()
44
  except Exception as e:
45
  print(f"Error instantiating agent: {e}")
46
  return f"Error initializing agent: {e}", None
47
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
48
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
49
  print(agent_code)
50
 
@@ -55,24 +480,18 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
55
  response.raise_for_status()
56
  questions_data = response.json()
57
  if not questions_data:
58
- print("Fetched questions list is empty.")
59
- return "Fetched questions list is empty or invalid format.", None
60
  print(f"Fetched {len(questions_data)} questions.")
61
  except requests.exceptions.RequestException as e:
62
- print(f"Error fetching questions: {e}")
63
  return f"Error fetching questions: {e}", None
64
- except requests.exceptions.JSONDecodeError as e:
65
- print(f"Error decoding JSON response from questions endpoint: {e}")
66
- print(f"Response text: {response.text[:500]}")
67
- return f"Error decoding server response for questions: {e}", None
68
  except Exception as e:
69
- print(f"An unexpected error occurred fetching questions: {e}")
70
  return f"An unexpected error occurred fetching questions: {e}", None
71
 
72
- # 3. Run your Agent
73
  results_log = []
74
  answers_payload = []
75
  print(f"Running agent on {len(questions_data)} questions...")
 
76
  for item in questions_data:
77
  task_id = item.get("task_id")
78
  question_text = item.get("question")
@@ -80,23 +499,30 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
80
  print(f"Skipping item with missing task_id or question: {item}")
81
  continue
82
  try:
83
- submitted_answer = agent(question_text)
84
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
86
  except Exception as e:
87
- print(f"Error running agent on task {task_id}: {e}")
88
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
89
 
90
  if not answers_payload:
91
- print("Agent did not produce any answers to submit.")
92
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
93
 
94
- # 4. Prepare Submission
95
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
96
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
97
- print(status_update)
98
-
99
- # 5. Submit
100
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
101
  try:
102
  response = requests.post(submit_url, json=submission_data, timeout=60)
@@ -110,51 +536,46 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
110
  f"Message: {result_data.get('message', 'No message received.')}"
111
  )
112
  print("Submission successful.")
113
- results_df = pd.DataFrame(results_log)
114
- return final_status, results_df
115
  except requests.exceptions.HTTPError as e:
116
  error_detail = f"Server responded with status {e.response.status_code}."
117
  try:
118
  error_json = e.response.json()
119
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
120
- except requests.exceptions.JSONDecodeError:
121
  error_detail += f" Response: {e.response.text[:500]}"
122
  status_message = f"Submission Failed: {error_detail}"
123
  print(status_message)
124
- results_df = pd.DataFrame(results_log)
125
- return status_message, results_df
126
  except requests.exceptions.Timeout:
127
  status_message = "Submission Failed: The request timed out."
128
  print(status_message)
129
- results_df = pd.DataFrame(results_log)
130
- return status_message, results_df
131
  except requests.exceptions.RequestException as e:
132
  status_message = f"Submission Failed: Network error - {e}"
133
  print(status_message)
134
- results_df = pd.DataFrame(results_log)
135
- return status_message, results_df
136
  except Exception as e:
137
  status_message = f"An unexpected error occurred during submission: {e}"
138
  print(status_message)
139
- results_df = pd.DataFrame(results_log)
140
- return status_message, results_df
141
 
142
 
143
- # --- Build Gradio Interface using Blocks ---
144
  with gr.Blocks() as demo:
145
- gr.Markdown("# Basic Agent Evaluation Runner")
146
  gr.Markdown(
147
- """
148
  **Instructions:**
149
 
150
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
151
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
152
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
153
 
154
  ---
155
- **Disclaimers:**
156
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
157
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
 
158
  """
159
  )
160
 
@@ -163,19 +584,14 @@ with gr.Blocks() as demo:
163
  run_button = gr.Button("Run Evaluation & Submit All Answers")
164
 
165
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
166
- # Removed max_rows=10 from DataFrame constructor
167
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
168
 
169
- run_button.click(
170
- fn=run_and_submit_all,
171
- outputs=[status_output, results_table]
172
- )
173
 
174
  if __name__ == "__main__":
175
- print("\n" + "-"*30 + " App Starting " + "-"*30)
176
- # Check for SPACE_HOST and SPACE_ID at startup for information
177
  space_host_startup = os.getenv("SPACE_HOST")
178
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
179
 
180
  if space_host_startup:
181
  print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -183,14 +599,13 @@ if __name__ == "__main__":
183
  else:
184
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
185
 
186
- if space_id_startup: # Print repo URLs if SPACE_ID is found
187
  print(f"✅ SPACE_ID found: {space_id_startup}")
188
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
189
  print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
190
  else:
191
  print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
192
 
193
- print("-"*(60 + len(" App Starting ")) + "\n")
194
-
195
- print("Launching Gradio Interface for Basic Agent Evaluation...")
196
- demo.launch(debug=True, share=False)
 
1
  import os
2
+ import sys
3
+ import json
4
+
5
+ # Load .env file if present (local development)
6
+ try:
7
+ from dotenv import load_dotenv
8
+ load_dotenv()
9
+ except ImportError:
10
+ pass
11
+ import re
12
+ import base64
13
+ from io import StringIO
14
+
15
  import gradio as gr
16
  import requests
 
17
  import pandas as pd
18
+ from huggingface_hub import InferenceClient
19
 
 
20
  # --- Constants ---
21
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
22
 
23
+ # --- Tool Functions ---
24
+
25
def web_search(query: str, max_results: int = 5) -> str:
    """Search the web using DuckDuckGo."""
    try:
        from ddgs import DDGS
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
        if not hits:
            return "No search results found."
        # One formatted block per hit, blank-line separated.
        blocks = [
            f"Title: {hit.get('title', '')}\n"
            f"URL: {hit.get('href', '')}\n"
            f"Snippet: {hit.get('body', '')}"
            for hit in hits
        ]
        return "\n\n".join(blocks)
    except Exception as e:
        # Tool errors are reported as strings so the agent loop can continue.
        return f"Search error: {e}"
43
+
44
+
45
def visit_webpage(url: str) -> str:
    """Fetch and return text content of a webpage."""
    try:
        ua = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        page = requests.get(url, headers=ua, timeout=15)
        page.raise_for_status()
        try:
            # Prefer a real HTML parser when BeautifulSoup is installed.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(page.text, "html.parser")
            for chrome in soup(["script", "style", "nav", "footer", "header"]):
                chrome.decompose()
            body_text = soup.get_text(separator=" ", strip=True)
        except ImportError:
            # Crude fallback: strip tags with a regex.
            body_text = re.sub(r"<[^>]+>", " ", page.text)
        body_text = re.sub(r"\s+", " ", body_text).strip()
        # Cap the payload so it fits in the LLM context window.
        return body_text[:12000]
    except Exception as e:
        return f"Error visiting webpage: {e}"
63
+
64
+
65
def wikipedia_search(query: str) -> str:
    """Search Wikipedia for information about a topic."""
    try:
        # Attempt a direct page-summary lookup first.
        slug = requests.utils.quote(query.replace(" ", "_"))
        direct = requests.get(
            f"https://en.wikipedia.org/api/rest_v1/page/summary/{slug}", timeout=10
        )
        if direct.status_code == 200:
            payload = direct.json()
            summary = payload.get("extract", "")
            if summary:
                return f"{payload.get('title', '')}: {summary}"

        # Direct lookup produced nothing: fall back to the MediaWiki search API.
        query_params = {
            "action": "query", "list": "search",
            "srsearch": query, "format": "json",
            "srlimit": 3, "srprop": "snippet",
        }
        found = requests.get(
            "https://en.wikipedia.org/w/api.php", params=query_params, timeout=10
        )
        hits = found.json().get("query", {}).get("search", [])
        if not hits:
            return "No Wikipedia results found."

        # Summarize the top search hit via the REST endpoint.
        top_title = hits[0].get("title", "")
        top_slug = requests.utils.quote(top_title.replace(" ", "_"))
        top = requests.get(
            f"https://en.wikipedia.org/api/rest_v1/page/summary/{top_slug}", timeout=10
        )
        if top.status_code == 200:
            info = top.json()
            return f"{info.get('title', '')}: {info.get('extract', '')}"
        # Last resort: raw snippets from the search results.
        return "\n".join(hit.get("snippet", "") for hit in hits)
    except Exception as e:
        return f"Wikipedia error: {e}"
101
+
102
+
103
def python_interpreter(code: str) -> str:
    """Execute Python code and return its printed output.

    Args:
        code: Source text to execute. Results must be print()ed to be captured.

    Returns:
        Captured stdout, a success placeholder when nothing was printed, or an
        "Error: <ExceptionType>: <message>" string if execution raised.

    SECURITY NOTE: exec() on model-generated code is inherently unsafe;
    tolerable only because this Space runs in an isolated container.
    """
    from contextlib import redirect_stdout

    buffer = StringIO()
    try:
        exec_globals: dict = {}
        # redirect_stdout restores sys.stdout even if exec() raises,
        # replacing the manual save/swap/finally-restore dance.
        with redirect_stdout(buffer):
            exec(code, exec_globals)  # noqa: S102 -- intentional, see NOTE above
        output = buffer.getvalue()
        return output if output else "Executed successfully (no output)."
    except Exception as e:
        return f"Error: {type(e).__name__}: {e}"
116
+
117
+
118
def download_task_file(task_id: str) -> str:
    """Download the file associated with a task and return its content.

    The return value is always a string the agent loop can consume directly:
    tabular files are summarized via pandas, audio is transcribed, images are
    encoded as a special "IMAGE:<media-type>:<base64>" payload, Python files
    are fenced, and anything else is returned as (truncated) text.
    """
    try:
        url = f"{DEFAULT_API_URL}/files/{task_id}"
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        content_type = resp.headers.get("content-type", "")
        filename = ""
        # Pull the original filename from Content-Disposition, if the server sent one.
        if "content-disposition" in resp.headers:
            cd = resp.headers["content-disposition"]
            m = re.search(r'filename=["\']?([^"\';\n]+)', cd)
            if m:
                filename = m.group(1).strip()

        # Determine type by content-type or filename extension
        is_csv = "text/csv" in content_type or filename.endswith(".csv")
        is_excel = filename.endswith((".xlsx", ".xls")) or "spreadsheet" in content_type
        is_image = "image/" in content_type or filename.endswith(
            (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")
        )
        is_python = filename.endswith(".py")

        if is_image:
            media_type = content_type.split(";")[0].strip() or "image/png"
            img_b64 = base64.b64encode(resp.content).decode()
            # Special prefix parsed by the agent to pass as vision content
            return f"IMAGE:{media_type}:{img_b64}"

        if is_csv:
            try:
                import io
                df = pd.read_csv(io.StringIO(resp.text))
                return (
                    f"CSV file: {len(df)} rows × {len(df.columns)} columns.\n"
                    f"Columns: {list(df.columns)}\n\n"
                    f"{df.head(20).to_string()}"
                )
            except Exception:
                # Unparseable CSV: fall back to the raw text, truncated.
                return resp.text[:5000]

        if is_excel:
            try:
                import io
                df = pd.read_excel(io.BytesIO(resp.content))
                return (
                    f"Excel file: {len(df)} rows × {len(df.columns)} columns.\n"
                    f"Columns: {list(df.columns)}\n\n"
                    f"{df.head(20).to_string()}"
                )
            except Exception as e:
                return f"Excel file could not be parsed: {e}"

        is_audio = filename.endswith((".mp3", ".wav", ".ogg", ".flac", ".m4a")) or "audio/" in content_type
        if is_audio:
            try:
                # Transcribe via HF serverless Whisper; requires HF_TOKEN in env.
                asr_client = InferenceClient(api_key=os.environ["HF_TOKEN"])
                transcript = asr_client.automatic_speech_recognition(
                    audio=resp.content,
                    model="openai/whisper-large-v3",
                )
                text_result = transcript.text if hasattr(transcript, "text") else str(transcript)
                return f"Audio transcript:\n{text_result}"
            except Exception as e:
                return f"Audio file (transcription failed: {e}). File size: {len(resp.content)} bytes."

        if is_python:
            return f"Python file:\n```python\n{resp.text[:4000]}\n```"

        # Default: try to decode as text
        try:
            return resp.content.decode("utf-8")[:6000]
        except Exception:
            return f"Binary file ({len(resp.content)} bytes, type: {content_type})"

    except requests.exceptions.HTTPError as e:
        # 404 means the task simply has no attachment — not a failure.
        if e.response.status_code == 404:
            return "No file associated with this task."
        return f"Error downloading file: {e}"
    except Exception as e:
        return f"Error: {e}"
199
+
200
+
201
+ # --- Agent Definition ---
202
+
203
class GAIAAgent:
    """
    ReAct-style agent using plain chat completions (no native tool-calling API).
    Works with any instruction-following model on HF's free serverless inference.

    The loop alternates model turns and tool executions: the model emits
    Thought/Action/Action Input text, the agent parses it, runs the named
    tool, and feeds the result back as an "Observation:" user message until
    a "Final Answer:" line appears or the iteration cap is hit.
    """

    # System prompt defining the available tools and the strict ReAct format.
    SYSTEM_PROMPT = """You are an expert AI assistant solving questions from the GAIA benchmark.
You have access to these tools:

- web_search(query): Search the web via DuckDuckGo for current facts, people, events, statistics.
- visit_webpage(url): Fetch and read the text content of a specific webpage.
- wikipedia_search(query): Search Wikipedia for background information on a topic.
- python_interpreter(code): Execute Python code. Always use print() to output results.
- download_task_file(task_id): Download the file attached to the current task (image, CSV, Excel, text, etc.).

Use this EXACT format for every step:

Thought: [your reasoning]
Action: [tool_name]
Action Input: {"key": "value"}

After receiving the Observation, continue with more Thought/Action steps.
When you have the final answer, write:

Thought: I now know the final answer.
Final Answer: [exact answer]

Important rules:
- "Final Answer:" must contain ONLY the bare answer — no explanation, no "FINAL ANSWER:" prefix.
- Numbers: exact format as requested (integer, decimal, etc.).
- Names: exact spelling as they appear in authoritative sources.
- Lists: comma-separated values unless another format is specified.
- Always use a tool to verify facts rather than relying on memory."""

    MODEL = "moonshotai/Kimi-K2.5:cheapest"

    def __init__(self):
        # HF_TOKEN must be set in the environment (Space secret or local .env).
        self.client = InferenceClient(
            api_key=os.environ["HF_TOKEN"],
        )
        print("GAIAAgent initialized.")

    @staticmethod
    def _strip_think(text: str) -> str:
        """Remove <think>…</think> reasoning blocks (DeepSeek-R1 / o1-style)."""
        return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    def _run_tool(self, name: str, tool_input: dict) -> str:
        """Execute a named tool and return its result as a string."""
        import time
        t0 = time.time()
        try:
            if name == "web_search":
                query = tool_input.get("query", "")
                if not query:
                    return "Error: 'query' parameter is required."
                return web_search(query)
            if name == "visit_webpage":
                url = tool_input.get("url", "")
                if not url or not url.startswith("http"):
                    print(f"  [TOOL ERROR] visit_webpage called with invalid url: {url!r}")
                    return "Error: valid 'url' parameter is required."
                return visit_webpage(url)
            if name == "wikipedia_search":
                query = tool_input.get("query", "")
                if not query:
                    return "Error: 'query' parameter is required."
                return wikipedia_search(query)
            if name == "python_interpreter":
                code = tool_input.get("code", "")
                if not code:
                    print(f"  [TOOL ERROR] python_interpreter called with empty code. Full input: {tool_input!r}")
                    return "Error: 'code' parameter is required."
                return python_interpreter(code)
            if name == "download_task_file":
                return download_task_file(tool_input.get("task_id", ""))
            print(f"  [TOOL ERROR] Unknown tool called: {name!r}")
            return f"Unknown tool: {name}"
        except Exception as e:
            # Tool failures are returned as strings so the ReAct loop survives.
            print(f"  [TOOL EXCEPTION] {name} raised {type(e).__name__}: {e}")
            return f"Tool error: {e}"
        finally:
            print(f"  [TOOL TIMING] {name} completed in {time.time() - t0:.2f}s")

    @staticmethod
    def _extract_json(text: str, start: int) -> dict:
        """
        Extract a JSON object starting at `start` (which must be '{') by
        counting braces — handles nested dicts/code strings safely.
        """
        depth = 0
        in_string = False
        escape = False
        for i in range(start, len(text)):
            ch = text[i]
            if escape:
                escape = False
                continue
            if ch == "\\" and in_string:
                escape = True
                continue
            if ch == '"':
                in_string = not in_string
                continue
            if in_string:
                # Braces inside JSON strings (e.g. Python code) must not count.
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    raw = text[start : i + 1]
                    try:
                        return json.loads(raw)
                    except json.JSONDecodeError as e:
                        print(f"  [PARSE ERROR] JSON decode failed: {e} | raw={raw[:200]!r}")
                        return {}
        print(f"  [PARSE ERROR] Unmatched braces — no closing '}}' found from pos {start}")
        return {}

    def _parse_action(self, text: str):
        """
        Return (tool_name, tool_input_dict) for the last Action block in text,
        or (None, None) if none is found.
        """
        action_matches = list(re.finditer(r"Action:\s*(\w+)", text))
        if not action_matches:
            return None, None

        # The model may emit several Action blocks; honor only the last one.
        tool_name = action_matches[-1].group(1).strip()
        tool_input: dict = {}

        ai_matches = list(re.finditer(r"Action Input:\s*", text))
        if not ai_matches:
            print(f"  [PARSE WARN] Action '{tool_name}' found but no 'Action Input:' block.")
        else:
            pos = ai_matches[-1].end()
            if pos < len(text) and text[pos] == "{":
                tool_input = self._extract_json(text, pos)
                if not tool_input:
                    print(f"  [PARSE WARN] Action Input for '{tool_name}' parsed as empty dict.")
            else:
                snippet = text[pos : pos + 80].replace("\n", "\\n")
                print(f"  [PARSE WARN] Action Input for '{tool_name}' does not start with '{{': {snippet!r}")

        return tool_name, tool_input

    def __call__(self, question: str, task_id: str | None = None) -> str:
        """Run the ReAct loop on one question; returns the bare final answer.

        Args:
            question: The task's question text.
            task_id: Optional task identifier, surfaced to the model so it
                can call download_task_file for any attachment.
        """
        import time
        print(f"\nAgent processing task {task_id}: {question[:80]}...")

        user_content = f"Task ID: {task_id}\n\nQuestion: {question}" if task_id else question
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]

        # Hard cap of 15 model turns to bound cost and latency per question.
        for iteration in range(15):
            t_llm = time.time()
            response = None
            # Up to 3 attempts; only transient HTTP statuses are retried.
            for attempt in range(3):
                try:
                    response = self.client.chat.completions.create(
                        model=self.MODEL,
                        messages=messages,
                        max_tokens=4096,
                        temperature=0.1,
                    )
                    break
                except Exception as e:
                    is_retryable = any(code in str(e) for code in ("504", "502", "503", "429"))
                    print(f"  [{iteration}] [LLM ERROR attempt {attempt+1}/3] {type(e).__name__}: {str(e)[:120]}")
                    if is_retryable and attempt < 2:
                        wait = 15 * (attempt + 1)  # linear backoff: 15s, 30s
                        print(f"  [{iteration}] Retrying in {wait}s...")
                        time.sleep(wait)
                    else:
                        raise
            if response is None:
                raise RuntimeError("LLM returned no response after retries")
            llm_elapsed = time.time() - t_llm

            raw_output = (response.choices[0].message.content or "").strip()
            think_stripped = len(raw_output) - len(self._strip_think(raw_output))
            output = self._strip_think(raw_output)

            usage = response.usage
            print(
                f"  [{iteration}] LLM {llm_elapsed:.1f}s | "
                f"tokens in={getattr(usage, 'prompt_tokens', '?')} "
                f"out={getattr(usage, 'completion_tokens', '?')} | "
                f"think_stripped={think_stripped}chars"
            )
            print(f"  [{iteration}] Model output: {output[:300]}{'...' if len(output) > 300 else ''}")

            # ── Final answer found (must be at line start, not inside code/JSON) ──
            fa_match = re.search(r"(?:^|\n)Final Answer:\s*(.+?)(?:\n|$)", output)
            if fa_match:
                answer = fa_match.group(1).strip()
                print(f"  [{iteration}] => Final Answer: {answer!r}")
                return answer

            # ── Tool call found ──
            tool_name, tool_input = self._parse_action(output)
            if tool_name:
                print(f"  [{iteration}] Tool call: {tool_name}({json.dumps(tool_input)[:200]})")
                result = self._run_tool(tool_name, tool_input)
                result_preview = result[:200].replace("\n", " ")
                print(f"  [{iteration}] Tool result ({len(result)} chars): {result_preview}{'...' if len(result) > 200 else ''}")

                # Keep the raw (un-stripped) assistant turn in the transcript.
                messages.append({"role": "assistant", "content": raw_output})

                if result.startswith("IMAGE:"):
                    # download_task_file encodes images as IMAGE:<media-type>:<b64>.
                    parts = result.split(":", 2)
                    media_type, img_b64 = parts[1], parts[2]
                    print(f"  [{iteration}] Image received: type={media_type}, size={len(img_b64)} b64 chars")
                    messages.append({
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Observation: Here is the downloaded image. Analyse it to answer the question."},
                            {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{img_b64}"}},
                        ],
                    })
                else:
                    # Truncate observations so the context window is not exhausted.
                    messages.append({
                        "role": "user",
                        "content": f"Observation: {result[:6000]}",
                    })
            else:
                # Neither a Final Answer nor a parseable Action: nudge the model.
                print(f"  [{iteration}] No tool call and no Final Answer — prompting model to conclude.")
                messages.append({"role": "assistant", "content": raw_output})
                messages.append({
                    "role": "user",
                    "content": (
                        "You haven't provided a Final Answer yet. "
                        "Please conclude with:\nFinal Answer: [answer]"
                    ),
                })

        print(f"  [MAX ITERATIONS] Reached iteration limit for task {task_id}.")
        return "Unable to determine answer."
444
+
445
+
446
+ # --- Gradio App ---
447
+
448
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
449
  """
450
+ Fetches all questions, runs the GAIAAgent on them, submits all answers,
451
  and displays the results.
452
  """
453
+ space_id = os.getenv("SPACE_ID")
 
454
 
455
  if profile:
456
+ username = profile.username
457
  print(f"User logged in: {username}")
458
  else:
459
  print("User not logged in.")
 
463
  questions_url = f"{api_url}/questions"
464
  submit_url = f"{api_url}/submit"
465
 
466
+ # 1. Instantiate Agent
467
  try:
468
+ agent = GAIAAgent()
469
  except Exception as e:
470
  print(f"Error instantiating agent: {e}")
471
  return f"Error initializing agent: {e}", None
472
+
473
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
474
  print(agent_code)
475
 
 
480
  response.raise_for_status()
481
  questions_data = response.json()
482
  if not questions_data:
483
+ return "Fetched questions list is empty or invalid format.", None
 
484
  print(f"Fetched {len(questions_data)} questions.")
485
  except requests.exceptions.RequestException as e:
 
486
  return f"Error fetching questions: {e}", None
 
 
 
 
487
  except Exception as e:
 
488
  return f"An unexpected error occurred fetching questions: {e}", None
489
 
490
+ # 3. Run Agent
491
  results_log = []
492
  answers_payload = []
493
  print(f"Running agent on {len(questions_data)} questions...")
494
+
495
  for item in questions_data:
496
  task_id = item.get("task_id")
497
  question_text = item.get("question")
 
499
  print(f"Skipping item with missing task_id or question: {item}")
500
  continue
501
  try:
502
+ submitted_answer = agent(question_text, task_id=task_id)
503
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
504
+ results_log.append({
505
+ "Task ID": task_id,
506
+ "Question": question_text,
507
+ "Submitted Answer": submitted_answer,
508
+ })
509
  except Exception as e:
510
+ print(f"Error running agent on task {task_id}: {e}")
511
+ results_log.append({
512
+ "Task ID": task_id,
513
+ "Question": question_text,
514
+ "Submitted Answer": f"AGENT ERROR: {e}",
515
+ })
516
 
517
  if not answers_payload:
 
518
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
519
 
520
+ # 4. Submit
521
+ submission_data = {
522
+ "username": username.strip(),
523
+ "agent_code": agent_code,
524
+ "answers": answers_payload,
525
+ }
526
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
527
  try:
528
  response = requests.post(submit_url, json=submission_data, timeout=60)
 
536
  f"Message: {result_data.get('message', 'No message received.')}"
537
  )
538
  print("Submission successful.")
539
+ return final_status, pd.DataFrame(results_log)
 
540
  except requests.exceptions.HTTPError as e:
541
  error_detail = f"Server responded with status {e.response.status_code}."
542
  try:
543
  error_json = e.response.json()
544
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
545
+ except Exception:
546
  error_detail += f" Response: {e.response.text[:500]}"
547
  status_message = f"Submission Failed: {error_detail}"
548
  print(status_message)
549
+ return status_message, pd.DataFrame(results_log)
 
550
  except requests.exceptions.Timeout:
551
  status_message = "Submission Failed: The request timed out."
552
  print(status_message)
553
+ return status_message, pd.DataFrame(results_log)
 
554
  except requests.exceptions.RequestException as e:
555
  status_message = f"Submission Failed: Network error - {e}"
556
  print(status_message)
557
+ return status_message, pd.DataFrame(results_log)
 
558
  except Exception as e:
559
  status_message = f"An unexpected error occurred during submission: {e}"
560
  print(status_message)
561
+ return status_message, pd.DataFrame(results_log)
 
562
 
563
 
564
+ # --- Build Gradio Interface ---
565
  with gr.Blocks() as demo:
566
+ gr.Markdown("# GAIA Agent Evaluation Runner")
567
  gr.Markdown(
568
+ f"""
569
  **Instructions:**
570
 
571
+ 1. Log in to your Hugging Face account using the button below.
572
+ 2. Click **Run Evaluation & Submit All Answers** to fetch questions, run the agent, submit answers, and see the score.
 
573
 
574
  ---
575
+ **Notes:**
576
+ - The agent uses models via HF InferenceClient (provider=auto) with a ReAct loop: web search, Wikipedia, Python interpreter, and file download tools.
577
+ - Targets ≥30% on GAIA level-1 questions.
578
+ - Submission can take several minutes while the agent processes each question.
579
  """
580
  )
581
 
 
584
  run_button = gr.Button("Run Evaluation & Submit All Answers")
585
 
586
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
587
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
588
 
589
+ run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
 
590
 
591
  if __name__ == "__main__":
592
+ print("\n" + "-" * 30 + " App Starting " + "-" * 30)
 
593
  space_host_startup = os.getenv("SPACE_HOST")
594
+ space_id_startup = os.getenv("SPACE_ID")
595
 
596
  if space_host_startup:
597
  print(f"✅ SPACE_HOST found: {space_host_startup}")
 
599
  else:
600
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
601
 
602
+ if space_id_startup:
603
  print(f"✅ SPACE_ID found: {space_id_startup}")
604
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
605
  print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
606
  else:
607
  print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
608
 
609
+ print("-" * (60 + len(" App Starting ")) + "\n")
610
+ print("Launching Gradio Interface for GAIA Agent Evaluation...")
611
+ demo.launch(debug=True, share=False)
 
requirements.txt CHANGED
@@ -1,2 +1,9 @@
1
  gradio
2
- requests
 
 
 
 
 
 
 
 
1
  gradio
2
+ gradio[oauth]
3
+ requests
4
+ pandas
5
+ huggingface_hub
6
+ ddgs
7
+ beautifulsoup4
8
+ openpyxl
9
+ python-dotenv