mangubee committed on
Commit
1bf8fc5
·
1 Parent(s): 44d1862
Files changed (3) hide show
  1. CHANGELOG.md +56 -2
  2. app.py +28 -14
  3. exports/gaia_results_20260105_153616.json +85 -0
CHANGELOG.md CHANGED
@@ -476,8 +476,62 @@ results_log (single source)
476
 
477
  **Verification:**
478
 
479
- - Testing with validation set to verify columns appear correctly
480
- - Verify annotator metadata format in UI table and JSON
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
483
 
 
476
 
477
  **Verification:**
478
 
479
+ - UI table shows annotator metadata as JSON string
480
+ - JSON export includes ground_truth_answer and annotator_metadata fields
481
+ - ⏳ Full testing pending to verify format is correct
482
+
483
+ ### [BUGFIX: Annotator Metadata Display and JSON Export]
484
+
485
+ **Problem:**
486
+
487
+ 1. UI table shows "[object Object]" for annotator metadata (dict can't be displayed)
488
+ 2. JSON export missing ground_truth_answer and annotator_metadata fields
489
+
490
+ **Root Cause:**
491
+
492
+ 1. Annotator metadata stored as dict; the UI table renders dict cells as "[object Object]"
493
+ 2. JSON export function explicitly constructed only specific fields, ignoring new ground truth fields
494
+
495
+ **Modified Files:**
496
+
497
+ - **app.py** (~25 lines modified)
498
+ - Updated results collection (lines 425-428)
499
+ - Convert annotator_metadata dict to JSON string for UI display: `json.dumps(annotator_metadata)`
500
+ - Store raw dict in `_annotator_metadata_raw` for JSON export
501
+ - Updated `export_results_to_json()` function (lines 101-128)
502
+ - Changed from list comprehension to explicit loop for better control
503
+ - Added conditional field addition for ground truth data
504
+ - Added `ground_truth_answer` field to JSON export
505
+ - Added `annotator_metadata` field to JSON export (from raw dict)
506
+ - Only includes fields if they exist in results_log
507
+
508
+ **Solution:**
509
+
510
+ - UI table: Shows annotator metadata as JSON string (readable format)
511
+ - JSON export: Includes `ground_truth_answer` and `annotator_metadata` objects
512
+ - Dual storage: String for UI, raw dict for JSON
513
+
514
+ **JSON Export Format:**
515
+
516
+ ```json
517
+ {
518
+ "task_id": "...",
519
+ "question": "...",
520
+ "submitted_answer": "...",
521
+ "correct": true/false (key omitted when no correctness result is available),
522
+ "ground_truth_answer": "expected answer",
523
+ "annotator_metadata": {
524
+ "steps": ["step 1", "step 2"],
525
+ "tools": ["web_search"],
526
+ "reasoning": "..."
527
+ }
528
+ }
529
+ ```
530
+
531
+ **Verification:**
532
+
533
+ - ✅ UI table displays annotator metadata as readable JSON string
534
+ - ✅ JSON export includes all ground truth fields properly formatted
535
 
536
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
537
 
app.py CHANGED
@@ -98,21 +98,33 @@ def export_results_to_json(
98
  metadata["correct_count"] = submission_response.get("correct_count")
99
  metadata["total_attempted"] = submission_response.get("total_attempted")
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  export_data = {
102
  "metadata": metadata,
103
  "submission_status": submission_status,
104
- "results": [
105
- {
106
- "task_id": result.get("Task ID", "N/A"),
107
- "question": result.get("Question", "N/A"),
108
- "submitted_answer": result.get("Submitted Answer", "N/A"),
109
- # Use ground truth comparison if available, otherwise null
110
- "correct": True if result.get("Correct?") == "✅ Yes"
111
- else False if result.get("Correct?") == "❌ No"
112
- else None,
113
- }
114
- for result in results_log
115
- ],
116
  }
117
 
118
  # Write JSON file with pretty formatting
@@ -410,8 +422,10 @@ def run_and_submit_all(
410
  if is_correct is not None:
411
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
412
  result_entry["Ground Truth Answer"] = gt_answer
413
- # Add annotator metadata as dict for JSON, string for UI table
414
- result_entry["Annotator Metadata"] = annotator_metadata
 
 
415
 
416
  results_log.append(result_entry)
417
 
 
98
  metadata["correct_count"] = submission_response.get("correct_count")
99
  metadata["total_attempted"] = submission_response.get("total_attempted")
100
 
101
+ # Build results array with all fields from results_log
102
+ results_array = []
103
+ for result in results_log:
104
+ result_dict = {
105
+ "task_id": result.get("Task ID", "N/A"),
106
+ "question": result.get("Question", "N/A"),
107
+ "submitted_answer": result.get("Submitted Answer", "N/A"),
108
+ }
109
+
110
+ # Add correctness if available
111
+ if result.get("Correct?"):
112
+ result_dict["correct"] = True if result.get("Correct?") == "✅ Yes" else False
113
+
114
+ # Add ground truth answer if available
115
+ if result.get("Ground Truth Answer"):
116
+ result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
117
+
118
+ # Add annotator metadata if available (use raw dict)
119
+ if result.get("_annotator_metadata_raw"):
120
+ result_dict["annotator_metadata"] = result.get("_annotator_metadata_raw")
121
+
122
+ results_array.append(result_dict)
123
+
124
  export_data = {
125
  "metadata": metadata,
126
  "submission_status": submission_status,
127
+ "results": results_array,
 
 
 
 
 
 
 
 
 
 
 
128
  }
129
 
130
  # Write JSON file with pretty formatting
 
422
  if is_correct is not None:
423
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
424
  result_entry["Ground Truth Answer"] = gt_answer
425
+ # Convert annotator metadata to JSON string for UI table display
426
+ result_entry["Annotator Metadata"] = json.dumps(annotator_metadata, ensure_ascii=False) if annotator_metadata else ""
427
+ # Store raw dict for JSON export (will be extracted in export function)
428
+ result_entry["_annotator_metadata_raw"] = annotator_metadata
429
 
430
  results_log.append(result_entry)
431
 
exports/gaia_results_20260105_153616.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "generated": "2026-01-05 15:36:16",
4
+ "timestamp": "20260105_153616",
5
+ "total_questions": 5,
6
+ "execution_time_seconds": 51.51,
7
+ "execution_time_formatted": "0m 51s",
8
+ "score_percent": 0.0,
9
+ "correct_count": 0,
10
+ "total_attempted": 5
11
+ },
12
+ "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 0.0% (0/5 correct)\nMessage: Score calculated successfully: 0/20 total questions answered correctly (5 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
13
+ "results": [
14
+ {
15
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
16
+ "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
17
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword matching; Tool calculator failed: SyntaxError: Invalid expression syntax: invalid syntax (<unknown>, line 1)",
18
+ "correct": false,
19
+ "ground_truth_answer": "Right",
20
+ "annotator_metadata": {
21
+ "Steps": "1. Read the instructions in reverse",
22
+ "Number of steps": "1",
23
+ "How long did this take?": "1 minute",
24
+ "Tools": "1. A word reversal tool / script",
25
+ "Number of tools": "0"
26
+ }
27
+ },
28
+ {
29
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
30
+ "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
31
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
32
+ "correct": false,
33
+ "ground_truth_answer": "Rd5",
34
+ "annotator_metadata": {
35
+ "Steps": "Step 1: Evaluate the position of the pieces in the chess position\nStep 2: Report the best move available for black: \"Rd5\"",
36
+ "Number of steps": "2",
37
+ "How long did this take?": "10 minutes",
38
+ "Tools": "1. Image recognition tools",
39
+ "Number of tools": "1"
40
+ }
41
+ },
42
+ {
43
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
44
+ "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
45
+ "submitted_answer": "Unable to answer",
46
+ "correct": false,
47
+ "ground_truth_answer": "3",
48
+ "annotator_metadata": {
49
+ "Steps": "1. Navigate to the YouTube link.\n2. Watch the video to see the highest number of bird species.\n3. Note the number.",
50
+ "Number of steps": "3",
51
+ "How long did this take?": "3 minutes",
52
+ "Tools": "1. Web browser\n2. Video parsing",
53
+ "Number of tools": "2"
54
+ }
55
+ },
56
+ {
57
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
58
+ "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
59
+ "submitted_answer": "",
60
+ "correct": false,
61
+ "ground_truth_answer": "3",
62
+ "annotator_metadata": {
63
+ "Steps": "1. I did a search for Mercedes Sosa\n2. I went to the Wikipedia page for her\n3. I scrolled down to \"Studio albums\"\n4. I counted the ones between 2000 and 2009",
64
+ "Number of steps": "4",
65
+ "How long did this take?": "5 minutes",
66
+ "Tools": "1. web browser\n2. google search",
67
+ "Number of tools": "2"
68
+ }
69
+ },
70
+ {
71
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
72
+ "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
73
+ "submitted_answer": "",
74
+ "correct": false,
75
+ "ground_truth_answer": "FunkMonk",
76
+ "annotator_metadata": {
77
+ "Steps": "1. Search \"Wikipedia featured articles promoted in november 2016\"\n2. Click through to the appropriate page and find the person who nominated Giganotosaurus.",
78
+ "Number of steps": "2",
79
+ "How long did this take?": "5 minutes",
80
+ "Tools": "1. web browser\n2. search engine",
81
+ "Number of tools": "2"
82
+ }
83
+ }
84
+ ]
85
+ }