Files changed (1) hide show
  1. app.py +418 -138
app.py CHANGED
@@ -1,196 +1,476 @@
1
  import os
2
- import gradio as gr
 
 
 
3
  import requests
4
- import inspect
5
  import pandas as pd
 
 
 
 
6
 
7
- # (Keep Constants as is)
8
  # --- Constants ---
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
13
class BasicAgent:
    """Placeholder agent that replies to every question with one canned string."""

    def __init__(self):
        print("BasicAgent initialized.")

    def __call__(self, question: str) -> str:
        """Log a 50-character preview of ``question`` and return the canned reply."""
        preview = question[:50]
        print(f"Agent received question (first 50 chars): {preview}...")
        reply = "This is a default answer."
        print("Agent returning fixed answer: " + reply)
        return reply
21
-
22
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the BasicAgent on them, submits all answers,
    and displays the results.

    Args:
        profile: The logged-in Hugging Face profile supplied by Gradio, or
            None when nobody is logged in.

    Returns:
        A ``(status_message, results)`` tuple. ``results`` is a DataFrame of
        per-question rows, or None when the run stops before any question is
        processed (not logged in, agent construction failed, fetch failed).
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent (modify this part to create your agent)
    try:
        agent = BasicAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # When running as a Hugging Face Space, this link points to the codebase
    # (useful for others, so please keep it public).
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.JSONDecodeError as e:
        # Must come before RequestException: requests' JSONDecodeError is a
        # RequestException subclass, so the broader handler would shadow it.
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # A failed question is logged for display but not submitted.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
141
 
142
 
143
- # --- Build Gradio Interface using Blocks ---
 
 
 
144
with gr.Blocks() as demo:
    # Page header followed by usage instructions.
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**

        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

        ---
        **Disclaimers:**
        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
        """
    )

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    # Output widgets: a read-only status box and the per-question table.
    status_output = gr.Textbox(
        label="Run Status / Submission Result",
        lines=5,
        interactive=False,
    )
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # Clicking the button runs the whole evaluation and fills both outputs.
    evaluation_outputs = [status_output, results_table]
    run_button.click(fn=run_and_submit_all, outputs=evaluation_outputs)
173
 
174
if __name__ == "__main__":
    # Startup banner: report which Space-related environment variables are set.
    dashes = "-" * 30
    print("\n" + dashes + " App Starting " + dashes)

    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if not space_host_startup:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
    else:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")

    if not space_id_startup:
        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
    else:
        repo_url = f"https://huggingface.co/spaces/{space_id_startup}"
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: {repo_url}")
        print(f" Repo Tree URL: {repo_url}/tree/main")

    # Closing rule is as wide as the opening banner line.
    print("-" * (60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)
 
1
  import os
2
+ import sys
3
+ import json
4
+ import base64
5
+ import tempfile
6
  import requests
 
7
  import pandas as pd
8
+ import gradio as gr
9
+ import anthropic
10
+ from io import StringIO
11
+ from pathlib import Path
12
 
 
13
  # --- Constants ---
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
16
+ # ============================================================
17
+ # Tool Implementations
18
+ # ============================================================
19
+
20
def web_search(query: str) -> str:
    """Run a DuckDuckGo text search and return the hits as plain text.

    Up to six results are requested. Each hit is rendered as a
    Title / URL / Snippet triple and hits are separated by a blank line.
    Any failure (including the search package being unavailable) is
    returned as a "Search error: ..." string instead of being raised.
    """
    try:
        from duckduckgo_search import DDGS

        with DDGS() as client:
            hits = list(client.text(query, max_results=6))
        if not hits:
            return "No results found."
        rendered = []
        for hit in hits:
            rendered.append(
                f"Title: {hit['title']}\nURL: {hit['href']}\nSnippet: {hit['body']}"
            )
        return "\n\n".join(rendered)
    except Exception as e:
        return f"Search error: {e}"
34
+
35
+
36
def visit_webpage(url: str) -> str:
    """Download ``url`` and return its visible text, capped at 8000 characters.

    BeautifulSoup is used when it can be imported; otherwise a minimal
    ``html.parser`` based extractor that drops <script>/<style> content is
    used. Whitespace runs are collapsed to single spaces. Any failure is
    returned as a "Failed to fetch ..." string instead of being raised.
    """
    import re

    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; GAIABot/1.0)"},
            timeout=15,
        )
        response.raise_for_status()
        try:
            from bs4 import BeautifulSoup

            document = BeautifulSoup(response.text, "html.parser")
            # Remove page chrome and non-content elements before extracting text.
            for element in document(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            page_text = document.get_text(separator=" ", strip=True)
        except ImportError:
            from html.parser import HTMLParser

            class _TextCollector(HTMLParser):
                """Collect text nodes, muting everything inside script/style."""

                def __init__(self):
                    super().__init__()
                    self.chunks = []
                    self.muted = False

                def handle_starttag(self, tag, attrs):
                    if tag in ("script", "style"):
                        self.muted = True

                def handle_endtag(self, tag):
                    if tag in ("script", "style"):
                        self.muted = False

                def handle_data(self, data):
                    if not self.muted:
                        self.chunks.append(data)

            collector = _TextCollector()
            collector.feed(response.text)
            page_text = " ".join(collector.chunks)

        collapsed = re.sub(r"\s+", " ", page_text).strip()
        return collapsed[:8000]
    except Exception as e:
        return f"Failed to fetch {url}: {e}"
71
+
72
+
73
def run_python(code: str) -> str:
    """Execute ``code`` with ``exec`` and return what it printed.

    NOTE(security): this is NOT a sandbox. The code runs in-process with the
    full builtins (imports, file and network access). It is only as safe as
    the source of ``code``; isolate it (subprocess/container) before exposing
    it to untrusted input.

    Args:
        code: Python source to execute. ``pd`` (pandas) is pre-bound in the
            execution namespace.

    Returns:
        The captured stdout, with captured stderr appended under a
        ``[stderr]:`` marker. If the code raises, the text printed before the
        failure is kept and followed by ``"<ExceptionType>: <message>"``.
        When nothing was printed and nothing failed, a placeholder string is
        returned. This function does not raise for errors in ``code``.
    """
    buf_out, buf_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = buf_out, buf_err
    failure = None
    try:
        namespace = {"pd": pd, "__builtins__": __builtins__}
        exec(code, namespace)
    except (Exception, SystemExit) as exc:
        # SystemExit is not an Exception subclass; without catching it a
        # snippet calling sys.exit() would terminate the calling thread/app.
        failure = f"{type(exc).__name__}: {exc}"
    finally:
        # Always restore the real streams, even on unexpected BaseExceptions.
        sys.stdout, sys.stderr = old_out, old_err

    out = buf_out.getvalue()
    err = buf_err.getvalue()
    if err:
        out += f"\n[stderr]: {err}"
    out = out.strip()

    if failure is not None:
        # Keep whatever was printed before the failure; it is often the
        # most useful debugging signal.
        return f"{out}\n{failure}" if out else failure
    return out or "(executed — no output)"
90
+
91
+
92
def read_file_as_text(file_bytes: bytes, file_name: str) -> str:
    """Convert an attached file into a bounded text representation.

    The handling is chosen from the extension of ``file_name``:

    * plain-text/source formats: decoded as UTF-8 (undecodable bytes
      replaced), capped at 6000 characters;
    * ``.csv`` / ``.xlsx`` / ``.xls``: loaded with pandas and rendered with
      ``to_string(max_rows=50)``, capped at 6000 characters;
    * ``.pdf``: text extracted with ``pypdf`` when it is installed, capped at
      6000 characters;
    * audio formats: a placeholder describing the file;
    * anything else: strict UTF-8 decode capped at 4000 characters, or a
      ``[Binary file: ...]`` placeholder when the bytes are not valid UTF-8.

    Args:
        file_bytes: Raw file contents.
        file_name: Original file name; only its suffix is inspected.

    Returns:
        The text representation, or an ``"Error reading file ..."`` string if
        parsing fails. This function does not raise.
    """
    from io import BytesIO

    ext = Path(file_name).suffix.lower()
    try:
        if ext in (".txt", ".py", ".md", ".json", ".xml", ".html", ".css", ".js"):
            return file_bytes.decode("utf-8", errors="replace")[:6000]

        if ext == ".csv":
            df = pd.read_csv(StringIO(file_bytes.decode("utf-8", errors="replace")))
            # max_rows bounds the row count only; cap the length too so wide
            # tables cannot blow up the prompt (matches the other branches).
            return df.to_string(max_rows=50)[:6000]

        if ext in (".xlsx", ".xls"):
            sheets = pd.read_excel(BytesIO(file_bytes), sheet_name=None)
            parts = [
                f"=== Sheet: {sheet} ===\n{frame.to_string(max_rows=50)}"
                for sheet, frame in sheets.items()
            ]
            return "\n\n".join(parts)[:6000]

        if ext == ".pdf":
            try:
                import pypdf
            except ImportError:
                return "[PDF reading requires pypdf — install with: pip install pypdf]"
            reader = pypdf.PdfReader(BytesIO(file_bytes))
            # `or ""` guards the join against a page yielding no text.
            return "\n".join(p.extract_text() or "" for p in reader.pages)[:6000]

        if ext in (".mp3", ".wav", ".m4a", ".flac"):
            return f"[Audio file: {file_name}, {len(file_bytes):,} bytes — transcription not available without Whisper API]"

        # Unknown extension: try UTF-8 as a last resort. The decode is strict
        # so that genuinely binary data reaches the placeholder below instead
        # of being turned into replacement-character noise.
        try:
            return file_bytes.decode("utf-8")[:4000]
        except UnicodeDecodeError:
            return f"[Binary file: {file_name}, {len(file_bytes):,} bytes]"
    except Exception as e:
        return f"Error reading file {file_name}: {e}"
126
+
127
+
128
+ # ============================================================
129
+ # Tool Schema (for Anthropic tool_use)
130
+ # ============================================================
131
+
132
def _single_string_tool(name, description, arg, arg_description):
    """Build a tool schema entry for a tool that takes one required string argument."""
    return {
        "name": name,
        "description": description,
        "input_schema": {
            "type": "object",
            "properties": {
                arg: {"type": "string", "description": arg_description},
            },
            "required": [arg],
        },
    }


# Tool definitions passed to the model: one entry per callable tool.
TOOLS = [
    _single_string_tool(
        "web_search",
        "Search the web for current information, facts, Wikipedia content, "
        "news, etc. Returns titles, URLs, and snippets.",
        "query",
        "The search query",
    ),
    _single_string_tool(
        "visit_webpage",
        "Fetch the full text of a specific webpage. Use when you need more "
        "detail than a search snippet, e.g. to read a Wikipedia article.",
        "url",
        "Full URL to fetch",
    ),
    _single_string_tool(
        "run_python",
        "Execute Python code. Great for arithmetic, counting, sorting, "
        "string manipulation, or processing data. Use print() for output. "
        "pandas (as pd) is pre-imported.",
        "code",
        "Python code to run. Always use print() to show results.",
    ),
]

# System prompt: research strategy plus the strict answer-format rules.
SYSTEM_PROMPT = "\n".join(
    [
        "You are an expert research assistant solving GAIA benchmark questions.",
        "These are real-world questions requiring careful research and precise answers.",
        "",
        "Strategy:",
        "- Use web_search to find facts; follow up with visit_webpage for detail",
        "- Use run_python for any calculation, counting, sorting, or data manipulation",
        "- For files provided in the question, analyse them carefully",
        "- Cross-check facts when accuracy is critical",
        "",
        "Answer format (VERY IMPORTANT):",
        "- Provide ONLY the final answer — no preamble, no explanation",
        "- Give exactly what is asked: a number, a name, a date, a word, a short phrase",
        "- Numbers: digits only, unless units are part of the question's expected format",
        "- Lists: comma-separated values unless another format is specified",
        '- Yes/No questions: just "Yes" or "No"',
        "",
        "Think step by step, then output your final concise answer.",
    ]
)
198
+
199
+
200
+ # ============================================================
201
+ # Agent
202
+ # ============================================================
203
+
204
+ class GAIAAgent:
205
+ """Agentic loop backed by Claude with tool use."""
206
+
207
+ MAX_ITERATIONS = 15
208
+
209
  def __init__(self):
210
+ api_key = os.getenv("ANTHROPIC_API_KEY")
211
+ if not api_key:
212
+ raise EnvironmentError("ANTHROPIC_API_KEY environment variable not set.")
213
+ self.client = anthropic.Anthropic(api_key=api_key)
214
+ self.model = "claude-sonnet-4-20250514"
215
+ print(f"GAIAAgent initialised (model: {self.model})")
216
+
217
+ # ---- internal helpers ----
218
+
219
+ def _dispatch_tool(self, name: str, inputs: dict) -> str:
220
+ if name == "web_search":
221
+ return web_search(inputs["query"])
222
+ if name == "visit_webpage":
223
+ return visit_webpage(inputs["url"])
224
+ if name == "run_python":
225
+ return run_python(inputs["code"])
226
+ return f"[unknown tool: {name}]"
227
+
228
+ def _build_initial_content(
229
+ self, question: str, file_bytes: bytes | None, file_name: str | None
230
+ ) -> list:
231
+ """Return the content list for the first user message."""
232
+ content = []
233
+
234
+ if file_bytes and file_name:
235
+ ext = Path(file_name).suffix.lower()
236
+ image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
237
+ if ext in image_exts:
238
+ media_map = {
239
+ ".jpg": "image/jpeg", ".jpeg": "image/jpeg",
240
+ ".png": "image/png", ".gif": "image/gif",
241
+ ".webp": "image/webp",
242
+ }
243
+ content.append({
244
+ "type": "image",
245
+ "source": {
246
+ "type": "base64",
247
+ "media_type": media_map[ext],
248
+ "data": base64.b64encode(file_bytes).decode(),
249
+ },
250
+ })
251
+ content.append({
252
+ "type": "text",
253
+ "text": f"The image above is the attached file '{file_name}'.\n\n{question}",
254
+ })
255
+ else:
256
+ file_text = read_file_as_text(file_bytes, file_name)
257
+ content.append({
258
+ "type": "text",
259
+ "text": (
260
+ f"A file named '{file_name}' is attached. Its contents:\n\n"
261
+ f"{file_text}\n\n---\n\nQuestion: {question}"
262
+ ),
263
+ })
264
+ else:
265
+ content.append({"type": "text", "text": question})
266
+
267
+ return content
268
+
269
+ # ---- public interface ----
270
+
271
+ def solve(
272
+ self,
273
+ question: str,
274
+ file_bytes: bytes | None = None,
275
+ file_name: str | None = None,
276
+ ) -> str:
277
+ print(f"\n[Agent] Question: {question[:120]}{'...' if len(question)>120 else ''}")
278
+ messages = [
279
+ {"role": "user", "content": self._build_initial_content(question, file_bytes, file_name)}
280
+ ]
281
+
282
+ for iteration in range(self.MAX_ITERATIONS):
283
+ response = self.client.messages.create(
284
+ model=self.model,
285
+ max_tokens=4096,
286
+ system=SYSTEM_PROMPT,
287
+ tools=TOOLS,
288
+ messages=messages,
289
+ )
290
+
291
+ if response.stop_reason == "end_turn":
292
+ for block in response.content:
293
+ if hasattr(block, "text"):
294
+ answer = block.text.strip()
295
+ print(f"[Agent] Answer: {answer[:100]}")
296
+ return answer
297
+ return "No answer generated."
298
+
299
+ if response.stop_reason == "tool_use":
300
+ tool_results = []
301
+ for block in response.content:
302
+ if block.type == "tool_use":
303
+ print(f" [Tool] {block.name}({json.dumps(block.input)[:80]})")
304
+ result = self._dispatch_tool(block.name, block.input)
305
+ print(f" [Tool] → {result[:120]}")
306
+ tool_results.append({
307
+ "type": "tool_result",
308
+ "tool_use_id": block.id,
309
+ "content": result,
310
+ })
311
+ messages.append({"role": "assistant", "content": response.content})
312
+ messages.append({"role": "user", "content": tool_results})
313
+ else:
314
+ # Unexpected stop reason
315
+ print(f"[Agent] Unexpected stop_reason: {response.stop_reason}")
316
+ break
317
+
318
+ return "Could not determine answer within iteration limit."
319
+
320
  def __call__(self, question: str) -> str:
321
+ """Compatibility shim for the template's agent(question) calls."""
322
+ return self.solve(question)
323
+
324
+
325
+ # ============================================================
326
+ # Evaluation runner
327
+ # ============================================================
328
+
329
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the full evaluation: fetch questions, answer them, submit, report.

    Returns a ``(status_text, results)`` pair. ``results`` is a DataFrame with
    one row per processed question, or None when the run stops before any
    question is processed (not logged in, agent setup failed, fetch failed).
    """
    space_id = os.getenv("SPACE_ID")

    if not profile:
        return "Please log in to Hugging Face first.", None
    username = profile.username
    print(f"Logged in as: {username}")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Build agent
    try:
        agent = GAIAAgent()
    except Exception as init_err:
        return f"Error initialising agent: {init_err}", None

    if space_id:
        agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    else:
        agent_code = "unknown"

    # 2. Fetch questions
    print(f"Fetching questions from {questions_url} …")
    try:
        questions_resp = requests.get(questions_url, timeout=15)
        questions_resp.raise_for_status()
        questions_data = questions_resp.json()
        if not questions_data:
            return "Questions list is empty.", None
        print(f"Fetched {len(questions_data)} questions.")
    except Exception as fetch_err:
        return f"Error fetching questions: {fetch_err}", None

    # 3. Run agent on each question
    results_log = []
    answers_payload = []

    for entry in questions_data:
        task_id = entry.get("task_id")
        question_text = entry.get("question")
        file_name = entry.get("file_name", "")

        if not task_id or question_text is None:
            print(f"Skipping malformed item: {entry}")
            continue

        # Download attached file if present; a failed download is logged and
        # the question is still attempted without the file.
        file_bytes = None
        if file_name:
            try:
                attachment = requests.get(f"{api_url}/files/{task_id}", timeout=30)
                attachment.raise_for_status()
                file_bytes = attachment.content
                print(f" Downloaded '{file_name}' ({len(file_bytes):,} bytes)")
            except Exception as download_err:
                print(f" Could not download file for task {task_id}: {download_err}")

        try:
            submitted_answer = agent.solve(question_text, file_bytes, file_name)
        except Exception as agent_err:
            # The error text itself is recorded (and submitted) as the answer.
            submitted_answer = f"AGENT ERROR: {agent_err}"
            print(f" Agent error on {task_id}: {agent_err}")

        answers_payload.append(
            {"task_id": task_id, "submitted_answer": submitted_answer}
        )
        row = {
            "Task ID": task_id,
            "Question": question_text[:120],
            "File": file_name or "—",
            "Submitted Answer": submitted_answer,
        }
        results_log.append(row)

    if not answers_payload:
        return "Agent produced no answers.", pd.DataFrame(results_log)

    # 4. Submit
    submission = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }
    print(f"Submitting {len(answers_payload)} answers …")
    try:
        submit_resp = requests.post(submit_url, json=submission, timeout=120)
        submit_resp.raise_for_status()
        result = submit_resp.json()
        report_lines = [
            "Submission Successful!",
            f"User: {result.get('username')}",
            f"Score: {result.get('score', 'N/A')}% "
            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)",
            f"Message: {result.get('message', '')}",
        ]
        status = "\n".join(report_lines)
    except requests.exceptions.HTTPError as http_err:
        failed = http_err.response
        try:
            detail = failed.json().get("detail", failed.text)
        except Exception:
            # Body was not the expected JSON object; show a raw excerpt.
            detail = failed.text[:500]
        status = f"Submission failed (HTTP {failed.status_code}): {detail}"
    except Exception as submit_err:
        status = f"Submission error: {submit_err}"

    print(status)
    return status, pd.DataFrame(results_log)
436
 
437
 
438
+ # ============================================================
439
+ # Gradio UI
440
+ # ============================================================
441
+
442
with gr.Blocks() as demo:
    # Title and setup instructions.
    gr.Markdown("# GAIA Agent Evaluation Runner")
    gr.Markdown(
        """
        **Setup:**
        1. Set `ANTHROPIC_API_KEY` as a Space secret.
        2. Log in with your Hugging Face account below.
        3. Click **Run Evaluation** to fetch questions, run the agent, and submit.

        The agent uses Claude with web search, code execution, and file analysis.
        """
    )

    gr.LoginButton()

    run_btn = gr.Button(
        "Run Evaluation & Submit All Answers",
        variant="primary",
    )
    # Outputs: a read-only status box and the per-question results table.
    status_box = gr.Textbox(
        label="Status / Result",
        lines=6,
        interactive=False,
    )
    results_table = gr.DataFrame(label="Questions & Answers", wrap=True)

    # One click runs the whole evaluation and fills both outputs.
    evaluation_outputs = [status_box, results_table]
    run_btn.click(fn=run_and_submit_all, outputs=evaluation_outputs)
 
 
 
 
 
 
 
462
 
463
if __name__ == "__main__":
    # Startup banner: show Space metadata and whether the API key is present.
    rule = "=" * 60
    print("\n" + rule)

    for env_name in ("SPACE_HOST", "SPACE_ID"):
        env_value = os.getenv(env_name)
        if env_value:
            print(f"{env_name} : {env_value}")

    if os.getenv("ANTHROPIC_API_KEY"):
        print(" ANTHROPIC_API_KEY found.")
    else:
        print("⚠️ ANTHROPIC_API_KEY is NOT set — agent will fail.")

    print(rule + "\n")
    demo.launch(debug=True, share=False)