mangubee Claude Sonnet 4.5 committed on
Commit
2a449c8
·
1 Parent(s): 5d84945

Feat: Add markdown export for GAIA evaluation results

Browse files

Added export_results_to_markdown() function that saves evaluation results to ~/Downloads/gaia_results_TIMESTAMP.md with formatted markdown table. Updated all return paths in run_and_submit_all() to export results (success and error cases). Added export_output UI component to display file path.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. CHANGELOG.md +4 -0
  2. app.py +74 -13
CHANGELOG.md CHANGED
@@ -25,6 +25,10 @@
25
  - **app.py**
26
  - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
27
  - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
 
 
 
 
28
 
29
  - **src/tools/__init__.py** (Fixed earlier in session)
30
  - Fixed TOOLS schema bug - Changed parameters from list to dict format
 
25
  - **app.py**
26
  - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
27
  - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
28
+ - Added `export_results_to_markdown(results_log, submission_status)` - Export evaluation results to markdown file
29
+ - Updated `run_and_submit_all()` - ALL return paths now export results to ~/Downloads/gaia_results_TIMESTAMP.md
30
+ - Added export_output UI component - Displays exported file path to user
31
+ - Updated run_button click handler - Now outputs 3 values (status, table, export_path)
32
 
33
  - **src/tools/__init__.py** (Fixed earlier in session)
34
  - Fixed TOOLS schema bug - Changed parameters from list to dict format
app.py CHANGED
@@ -34,6 +34,52 @@ def check_api_keys():
34
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def format_diagnostics(final_state: dict) -> str:
38
  """Format agent state for diagnostic display."""
39
  diagnostics = []
@@ -147,7 +193,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
147
  print(f"User logged in: {username}")
148
  else:
149
  print("User not logged in.")
150
- return "Please Login to Hugging Face with the button.", None
151
 
152
  api_url = DEFAULT_API_URL
153
  questions_url = f"{api_url}/questions"
@@ -161,7 +207,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
161
  except Exception as e:
162
  logger.error(f"Error instantiating agent: {e}")
163
  print(f"Error instantiating agent: {e}")
164
- return f"Error initializing agent: {e}", None
165
  # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
166
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
167
  print(agent_code)
@@ -174,18 +220,18 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
174
  questions_data = response.json()
175
  if not questions_data:
176
  print("Fetched questions list is empty.")
177
- return "Fetched questions list is empty or invalid format.", None
178
  print(f"Fetched {len(questions_data)} questions.")
179
  except requests.exceptions.RequestException as e:
180
  print(f"Error fetching questions: {e}")
181
- return f"Error fetching questions: {e}", None
182
  except requests.exceptions.JSONDecodeError as e:
183
  print(f"Error decoding JSON response from questions endpoint: {e}")
184
  print(f"Response text: {response.text[:500]}")
185
- return f"Error decoding server response for questions: {e}", None
186
  except Exception as e:
187
  print(f"An unexpected error occurred fetching questions: {e}")
188
- return f"An unexpected error occurred fetching questions: {e}", None
189
 
190
  # 3. Run your Agent
191
  results_log = []
@@ -221,7 +267,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
221
 
222
  if not answers_payload:
223
  print("Agent did not produce any answers to submit.")
224
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
 
 
225
 
226
  # 4. Prepare Submission
227
  submission_data = {
@@ -247,7 +296,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
247
  )
248
  print("Submission successful.")
249
  results_df = pd.DataFrame(results_log)
250
- return final_status, results_df
 
 
251
  except requests.exceptions.HTTPError as e:
252
  error_detail = f"Server responded with status {e.response.status_code}."
253
  try:
@@ -258,22 +309,26 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
258
  status_message = f"Submission Failed: {error_detail}"
259
  print(status_message)
260
  results_df = pd.DataFrame(results_log)
261
- return status_message, results_df
 
262
  except requests.exceptions.Timeout:
263
  status_message = "Submission Failed: The request timed out."
264
  print(status_message)
265
  results_df = pd.DataFrame(results_log)
266
- return status_message, results_df
 
267
  except requests.exceptions.RequestException as e:
268
  status_message = f"Submission Failed: Network error - {e}"
269
  print(status_message)
270
  results_df = pd.DataFrame(results_log)
271
- return status_message, results_df
 
272
  except Exception as e:
273
  status_message = f"An unexpected error occurred during submission: {e}"
274
  print(status_message)
275
  results_df = pd.DataFrame(results_log)
276
- return status_message, results_df
 
277
 
278
 
279
  # --- Build Gradio Interface using Blocks ---
@@ -359,7 +414,13 @@ with gr.Blocks() as demo:
359
  # Removed max_rows=10 from DataFrame constructor
360
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
361
 
362
- run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
 
 
 
 
363
 
364
  if __name__ == "__main__":
365
  print("\n" + "-" * 30 + " App Starting " + "-" * 30)
 
34
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
35
 
36
 
37
def _markdown_table_cell(value, max_length: int = 100) -> str:
    """Render *value* as a single-line, pipe-escaped markdown table cell.

    ``None`` becomes ``"N/A"`` and any other non-string value is converted
    with ``str()`` so that a malformed results entry cannot abort the export.
    Text longer than *max_length* is cut and suffixed with ``"..."``.
    """
    text = "N/A" if value is None else str(value)
    # Line breaks would end the table row early; a bare "|" would open a new column.
    text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    text = text.replace("|", "\\|")
    if len(text) > max_length:
        text = text[: max_length - 3] + "..."
    return text


def export_results_to_markdown(results_log: list, submission_status: str) -> str:
    """Export evaluation results to a markdown file in the Downloads folder.

    The file is written to ``~/Downloads/gaia_results_<YYYYmmdd_HHMMSS>.md``
    and contains a header, the submission status and a table with one row per
    entry of *results_log*.

    Args:
        results_log: List of dicts read via the keys ``"Task ID"``,
            ``"Question"`` and ``"Submitted Answer"``; missing keys are
            rendered as ``N/A``. May be empty.
        submission_status: Status text written verbatim under the
            "Submission Status" heading.

    Returns:
        The path of the markdown file that was written.

    Raises:
        OSError: If the Downloads directory cannot be created or the file
            cannot be written.

    Note:
        Relies on the module-level ``os`` import and ``logger`` object, as the
        previous implementation did.
    """
    from datetime import datetime

    # Take the clock reading once so the filename and the header always agree.
    now = datetime.now()

    downloads_dir = os.path.expanduser("~/Downloads")
    # The folder is not guaranteed to exist (e.g. in a container), so create it
    # instead of letting open() fail with FileNotFoundError.
    os.makedirs(downloads_dir, exist_ok=True)
    filename = f"gaia_results_{now.strftime('%Y%m%d_%H%M%S')}.md"
    filepath = os.path.join(downloads_dir, filename)

    lines = [
        "# GAIA Agent Evaluation Results\n\n",
        f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## Submission Status\n\n",
        f"{submission_status}\n\n",
        "## Questions and Answers\n\n",
    ]

    if results_log:
        lines.append("| Task ID | Question | Submitted Answer |\n")
        lines.append("|---------|----------|------------------|\n")
        for result in results_log:
            task_id = result.get("Task ID", "N/A")
            question = _markdown_table_cell(result.get("Question", "N/A"))
            answer = _markdown_table_cell(result.get("Submitted Answer", "N/A"))
            lines.append(f"| {task_id} | {question} | {answer} |\n")
    else:
        lines.append("*No results available*\n")

    # Explicit UTF-8: status text and answers may contain non-ASCII characters
    # that the platform default encoding cannot represent.
    with open(filepath, "w", encoding="utf-8") as f:
        f.writelines(lines)

    # Logged on every path, including the empty-results case.
    logger.info("Results exported to: %s", filepath)
    return filepath
81
+
82
+
83
  def format_diagnostics(final_state: dict) -> str:
84
  """Format agent state for diagnostic display."""
85
  diagnostics = []
 
193
  print(f"User logged in: {username}")
194
  else:
195
  print("User not logged in.")
196
+ return "Please Login to Hugging Face with the button.", None, ""
197
 
198
  api_url = DEFAULT_API_URL
199
  questions_url = f"{api_url}/questions"
 
207
  except Exception as e:
208
  logger.error(f"Error instantiating agent: {e}")
209
  print(f"Error instantiating agent: {e}")
210
+ return f"Error initializing agent: {e}", None, ""
211
  # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
212
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
213
  print(agent_code)
 
220
  questions_data = response.json()
221
  if not questions_data:
222
  print("Fetched questions list is empty.")
223
+ return "Fetched questions list is empty or invalid format.", None, ""
224
  print(f"Fetched {len(questions_data)} questions.")
225
  except requests.exceptions.RequestException as e:
226
  print(f"Error fetching questions: {e}")
227
+ return f"Error fetching questions: {e}", None, ""
228
  except requests.exceptions.JSONDecodeError as e:
229
  print(f"Error decoding JSON response from questions endpoint: {e}")
230
  print(f"Response text: {response.text[:500]}")
231
+ return f"Error decoding server response for questions: {e}", None, ""
232
  except Exception as e:
233
  print(f"An unexpected error occurred fetching questions: {e}")
234
+ return f"An unexpected error occurred fetching questions: {e}", None, ""
235
 
236
  # 3. Run your Agent
237
  results_log = []
 
267
 
268
  if not answers_payload:
269
  print("Agent did not produce any answers to submit.")
270
+ status_message = "Agent did not produce any answers to submit."
271
+ results_df = pd.DataFrame(results_log)
272
+ export_path = export_results_to_markdown(results_log, status_message)
273
+ return status_message, results_df, export_path
274
 
275
  # 4. Prepare Submission
276
  submission_data = {
 
296
  )
297
  print("Submission successful.")
298
  results_df = pd.DataFrame(results_log)
299
+ # Export to markdown
300
+ export_path = export_results_to_markdown(results_log, final_status)
301
+ return final_status, results_df, export_path
302
  except requests.exceptions.HTTPError as e:
303
  error_detail = f"Server responded with status {e.response.status_code}."
304
  try:
 
309
  status_message = f"Submission Failed: {error_detail}"
310
  print(status_message)
311
  results_df = pd.DataFrame(results_log)
312
+ export_path = export_results_to_markdown(results_log, status_message)
313
+ return status_message, results_df, export_path
314
  except requests.exceptions.Timeout:
315
  status_message = "Submission Failed: The request timed out."
316
  print(status_message)
317
  results_df = pd.DataFrame(results_log)
318
+ export_path = export_results_to_markdown(results_log, status_message)
319
+ return status_message, results_df, export_path
320
  except requests.exceptions.RequestException as e:
321
  status_message = f"Submission Failed: Network error - {e}"
322
  print(status_message)
323
  results_df = pd.DataFrame(results_log)
324
+ export_path = export_results_to_markdown(results_log, status_message)
325
+ return status_message, results_df, export_path
326
  except Exception as e:
327
  status_message = f"An unexpected error occurred during submission: {e}"
328
  print(status_message)
329
  results_df = pd.DataFrame(results_log)
330
+ export_path = export_results_to_markdown(results_log, status_message)
331
+ return status_message, results_df, export_path
332
 
333
 
334
  # --- Build Gradio Interface using Blocks ---
 
414
  # Removed max_rows=10 from DataFrame constructor
415
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
416
 
417
+ export_output = gr.Textbox(
418
+ label="Exported Results",
419
+ placeholder="Results will be exported to markdown file in ~/Downloads",
420
+ interactive=False
421
+ )
422
+
423
+ run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table, export_output])
424
 
425
  if __name__ == "__main__":
426
  print("\n" + "-" * 30 + " App Starting " + "-" * 30)