mangubee committed on
Commit
9edb481
·
1 Parent(s): 2fc4228
Files changed (3) hide show
  1. CHANGELOG.md +5 -3
  2. app.py +33 -43
  3. output/gaia_results_20260104_005858.md +35 -0
CHANGELOG.md CHANGED
@@ -25,9 +25,11 @@
25
  - **app.py**
26
  - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
27
  - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
28
- - Added `export_results_to_markdown(results_log, submission_status)` - Export evaluation results with environment detection
29
- - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.md
30
- - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.md (fixes cloud deployment issue)
 
 
31
  - Updated `run_and_submit_all()` - ALL return paths now export results
32
  - Added gr.File download button - Users can directly download results (better UX than textbox)
33
  - Updated run_button click handler - Now outputs 3 values (status, table, export_path)
 
25
  - **app.py**
26
  - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
27
  - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
28
+ - Added `export_results_to_json(results_log, submission_status)` - Export evaluation results as JSON
29
+ - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
30
+ - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json (fixes cloud deployment issue)
31
+ - JSON format: No special-character escaping issues, full (untruncated) error messages, easy to process in code
32
+ - Pretty-printed with indent=2 and ensure_ascii=False for readability
33
  - Updated `run_and_submit_all()` - ALL return paths now export results
34
  - Added gr.File download button - Users can directly download results (better UX than textbox)
35
  - Updated run_button click handler - Now outputs 3 values (status, table, export_path)
app.py CHANGED
@@ -34,16 +34,17 @@ def check_api_keys():
34
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
35
 
36
 
37
- def export_results_to_markdown(results_log: list, submission_status: str) -> str:
38
- """Export evaluation results to markdown file.
39
 
40
- - Local: Saves to ~/Downloads
41
- - HF Spaces: Saves to ./exports/ (for Gradio file download)
 
42
  """
43
  from datetime import datetime
44
 
45
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
46
- filename = f"gaia_results_{timestamp}.md"
47
 
48
  # Detect environment: HF Spaces or local
49
  if os.getenv("SPACE_ID"):
@@ -56,38 +57,27 @@ def export_results_to_markdown(results_log: list, submission_status: str) -> str
56
  downloads_dir = os.path.expanduser("~/Downloads")
57
  filepath = os.path.join(downloads_dir, filename)
58
 
59
- with open(filepath, 'w') as f:
60
- # Header
61
- f.write("# GAIA Agent Evaluation Results\n\n")
62
- f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
63
-
64
- # Submission status
65
- f.write("## Submission Status\n\n")
66
- f.write(f"{submission_status}\n\n")
67
-
68
- # Results table
69
- f.write("## Questions and Answers\n\n")
70
-
71
- if not results_log:
72
- f.write("*No results available*\n")
73
- return filepath
74
-
75
- # Create markdown table
76
- f.write("| Task ID | Question | Submitted Answer |\n")
77
- f.write("|---------|----------|------------------|\n")
78
-
79
- for result in results_log:
80
- task_id = result.get("Task ID", "N/A")
81
- question = result.get("Question", "N/A").replace("\n", " ").replace("|", "\\|")
82
- answer = result.get("Submitted Answer", "N/A").replace("\n", " ").replace("|", "\\|")
83
-
84
- # Truncate long text for readability
85
- if len(question) > 100:
86
- question = question[:97] + "..."
87
- if len(answer) > 100:
88
- answer = answer[:97] + "..."
89
 
90
- f.write(f"| {task_id} | {question} | {answer} |\n")
 
 
91
 
92
  logger.info(f"Results exported to: {filepath}")
93
  return filepath
@@ -282,7 +272,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
282
  print("Agent did not produce any answers to submit.")
283
  status_message = "Agent did not produce any answers to submit."
284
  results_df = pd.DataFrame(results_log)
285
- export_path = export_results_to_markdown(results_log, status_message)
286
  return status_message, results_df, export_path
287
 
288
  # 4. Prepare Submission
@@ -309,8 +299,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
309
  )
310
  print("Submission successful.")
311
  results_df = pd.DataFrame(results_log)
312
- # Export to markdown
313
- export_path = export_results_to_markdown(results_log, final_status)
314
  return final_status, results_df, export_path
315
  except requests.exceptions.HTTPError as e:
316
  error_detail = f"Server responded with status {e.response.status_code}."
@@ -322,25 +312,25 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
322
  status_message = f"Submission Failed: {error_detail}"
323
  print(status_message)
324
  results_df = pd.DataFrame(results_log)
325
- export_path = export_results_to_markdown(results_log, status_message)
326
  return status_message, results_df, export_path
327
  except requests.exceptions.Timeout:
328
  status_message = "Submission Failed: The request timed out."
329
  print(status_message)
330
  results_df = pd.DataFrame(results_log)
331
- export_path = export_results_to_markdown(results_log, status_message)
332
  return status_message, results_df, export_path
333
  except requests.exceptions.RequestException as e:
334
  status_message = f"Submission Failed: Network error - {e}"
335
  print(status_message)
336
  results_df = pd.DataFrame(results_log)
337
- export_path = export_results_to_markdown(results_log, status_message)
338
  return status_message, results_df, export_path
339
  except Exception as e:
340
  status_message = f"An unexpected error occurred during submission: {e}"
341
  print(status_message)
342
  results_df = pd.DataFrame(results_log)
343
- export_path = export_results_to_markdown(results_log, status_message)
344
  return status_message, results_df, export_path
345
 
346
 
 
34
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
35
 
36
 
37
+ def export_results_to_json(results_log: list, submission_status: str) -> str:
38
+ """Export evaluation results to JSON file for easy processing.
39
 
40
+ - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
41
+ - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
42
+ - Format: Clean JSON with full error messages, no truncation
43
  """
44
  from datetime import datetime
45
 
46
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
47
+ filename = f"gaia_results_{timestamp}.json"
48
 
49
  # Detect environment: HF Spaces or local
50
  if os.getenv("SPACE_ID"):
 
57
  downloads_dir = os.path.expanduser("~/Downloads")
58
  filepath = os.path.join(downloads_dir, filename)
59
 
60
+ # Build JSON structure
61
+ export_data = {
62
+ "metadata": {
63
+ "generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
64
+ "timestamp": timestamp,
65
+ "total_questions": len(results_log)
66
+ },
67
+ "submission_status": submission_status,
68
+ "results": [
69
+ {
70
+ "task_id": result.get("Task ID", "N/A"),
71
+ "question": result.get("Question", "N/A"),
72
+ "submitted_answer": result.get("Submitted Answer", "N/A")
73
+ }
74
+ for result in results_log
75
+ ]
76
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Write JSON file with pretty formatting
79
+ with open(filepath, 'w', encoding='utf-8') as f:
80
+ json.dump(export_data, f, indent=2, ensure_ascii=False)
81
 
82
  logger.info(f"Results exported to: {filepath}")
83
  return filepath
 
272
  print("Agent did not produce any answers to submit.")
273
  status_message = "Agent did not produce any answers to submit."
274
  results_df = pd.DataFrame(results_log)
275
+ export_path = export_results_to_json(results_log, status_message)
276
  return status_message, results_df, export_path
277
 
278
  # 4. Prepare Submission
 
299
  )
300
  print("Submission successful.")
301
  results_df = pd.DataFrame(results_log)
302
+ # Export to JSON
303
+ export_path = export_results_to_json(results_log, final_status)
304
  return final_status, results_df, export_path
305
  except requests.exceptions.HTTPError as e:
306
  error_detail = f"Server responded with status {e.response.status_code}."
 
312
  status_message = f"Submission Failed: {error_detail}"
313
  print(status_message)
314
  results_df = pd.DataFrame(results_log)
315
+ export_path = export_results_to_json(results_log, status_message)
316
  return status_message, results_df, export_path
317
  except requests.exceptions.Timeout:
318
  status_message = "Submission Failed: The request timed out."
319
  print(status_message)
320
  results_df = pd.DataFrame(results_log)
321
+ export_path = export_results_to_json(results_log, status_message)
322
  return status_message, results_df, export_path
323
  except requests.exceptions.RequestException as e:
324
  status_message = f"Submission Failed: Network error - {e}"
325
  print(status_message)
326
  results_df = pd.DataFrame(results_log)
327
+ export_path = export_results_to_json(results_log, status_message)
328
  return status_message, results_df, export_path
329
  except Exception as e:
330
  status_message = f"An unexpected error occurred during submission: {e}"
331
  print(status_message)
332
  results_df = pd.DataFrame(results_log)
333
+ export_path = export_results_to_json(results_log, status_message)
334
  return status_message, results_df, export_path
335
 
336
 
output/gaia_results_20260104_005858.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GAIA Agent Evaluation Results
2
+
3
+ **Generated:** 2026-01-04 00:58:58
4
+
5
+ ## Submission Status
6
+
7
+ Submission Successful!
8
+ User: mangoobee
9
+ Overall Score: 5.0% (1/20 correct)
10
+ Message: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
11
+
12
+ ## Questions and Answers
13
+
14
+ | Task ID | Question | Submitted Answer |
15
+ |---------|----------|------------------|
16
+ | 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can ... | Unable to answer |
17
+ | a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird spec... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
18
+ | 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI | ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword ... |
19
+ | cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the correct next mov... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
20
+ | 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted i... | FunkMonk |
21
+ | 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|---\|---\... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
22
+ | 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Teal'c say in respon... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
23
+ | cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry mate... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
24
+ | 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler w... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
25
+ | 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need fo... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
26
+ | 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play i... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
27
+ | f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
28
+ | 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season have that same sea... | 589 |
29
+ | 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study fo... | ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type:... |
30
+ | 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This art... | Unable to answer |
31
+ | bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually... | Unable to answer |
32
+ | cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
33
+ | a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
34
+ | 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food chain. What were t... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
35
+ | 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Century (after 1977)... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |