mangubee committed on
Commit
dc583a7
·
1 Parent(s): 65a5dc6

Add Ground Truth Answer Column

Browse files
CHANGELOG.md CHANGED
@@ -437,6 +437,48 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
437
  - ⏳ Verify exact match comparison works correctly
438
  - ⏳ Check performance with dataset caching
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
441
 
442
  **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
 
437
  - ⏳ Verify exact match comparison works correctly
438
  - ⏳ Check performance with dataset caching
439
 
440
+ ### [ENHANCEMENT: Add Ground Truth Answer and Annotator Metadata to Results]
441
+
442
+ **Problem:** Results only show if answer is correct/incorrect, but don't show what the correct answer should be or how to solve it. Makes error analysis difficult.
443
+
444
+ **Solution:** Add ground truth answer and annotator metadata to results_log (single source of truth for both UI and JSON).
445
+
446
+ **Modified Files:**
447
+
448
+ - **src/utils/ground_truth.py** (~5 lines modified)
449
+ - Added `self.metadata: Dict[str, dict] = {}` to store full item data (line 29)
450
+ - Updated `load_validation_set()` to store full dataset items in metadata dict (lines 62-63)
451
+ - Enables access to all GAIA dataset fields (Level, Annotator Metadata, file_name, etc.)
452
+
453
+ - **app.py** (~10 lines modified)
454
+ - Updated results collection loop (lines 397-414)
455
+ - Added `gt_answer = ground_truth.get_answer(task_id)` to fetch ground truth answer
456
+ - Added `annotator_metadata = metadata_item.get("Annotator Metadata", {})` to fetch solving steps
457
+ - Added "Ground Truth Answer" column to results_log when ground truth available
458
+ - Added "Annotator Metadata" column to results_log when ground truth available
459
+ - Both UI table and JSON export automatically get these columns (same source: results_log)
460
+
461
+ **Benefits:**
462
+
463
+ - ✅ **Error analysis:** See what correct answer should be when agent fails
464
+ - ✅ **Debugging hints:** Annotator metadata shows how question should be solved
465
+ - ✅ **Single source:** Modify results_log once, both UI and JSON get the data
466
+ - ✅ **UI table:** New columns appear in results DataFrame
467
+ - ✅ **JSON export:** New fields automatically included in export
468
+
469
+ **Data Flow:**
470
+
471
+ ```
472
+ results_log (single source)
473
+ ├─> pd.DataFrame(results_log) → UI table
474
+ └─> export_results_to_json(results_log) → JSON export
475
+ ```
476
+
477
+ **Verification:**
478
+
479
+ - ⏳ Testing with validation set to verify columns appear correctly
480
+ - ⏳ Verify annotator metadata format in UI table and JSON
481
+
482
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
483
 
484
  **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
app.py CHANGED
@@ -394,6 +394,11 @@ def run_and_submit_all(
394
  # Compare with ground truth if available
395
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
396
 
 
 
 
 
 
397
  # Add to results log
398
  result_entry = {
399
  "Task ID": result["task_id"],
@@ -401,9 +406,12 @@ def run_and_submit_all(
401
  "Submitted Answer": result["answer"],
402
  }
403
 
404
- # Add "Correct?" column if ground truth available
405
  if is_correct is not None:
406
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
 
 
 
407
 
408
  results_log.append(result_entry)
409
 
 
394
  # Compare with ground truth if available
395
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
396
 
397
+ # Get ground truth answer and metadata
398
+ gt_answer = ground_truth.get_answer(result["task_id"])
399
+ metadata_item = ground_truth.metadata.get(result["task_id"], {})
400
+ annotator_metadata = metadata_item.get("Annotator Metadata", {})
401
+
402
  # Add to results log
403
  result_entry = {
404
  "Task ID": result["task_id"],
 
406
  "Submitted Answer": result["answer"],
407
  }
408
 
409
+ # Add ground truth data if available
410
  if is_correct is not None:
411
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
412
+ result_entry["Ground Truth Answer"] = gt_answer
413
+ # Add annotator metadata as dict for JSON, string for UI table
414
+ result_entry["Annotator Metadata"] = annotator_metadata
415
 
416
  results_log.append(result_entry)
417
 
output/gaia_results_20260104_221732.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "generated": "2026-01-04 22:17:32",
4
+ "timestamp": "20260104_221732",
5
+ "total_questions": 6,
6
+ "execution_time_seconds": 23.92,
7
+ "execution_time_formatted": "0m 23s",
8
+ "score_percent": 5.0,
9
+ "correct_count": 1,
10
+ "total_attempted": 6
11
+ },
12
+ "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/6 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (6 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
13
+ "results": [
14
+ {
15
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
16
+ "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
17
+ "submitted_answer": "Unable to answer",
18
+ "correct": false
19
+ },
20
+ {
21
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
22
+ "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
23
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
24
+ "correct": false
25
+ },
26
+ {
27
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
28
+ "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
29
+ "submitted_answer": "FunkMonk",
30
+ "correct": true
31
+ },
32
+ {
33
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
34
+ "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
35
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
36
+ "correct": false
37
+ },
38
+ {
39
+ "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
40
+ "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
41
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: FileNotFoundError: Text file not found: path/to/the/given/table.csv",
42
+ "correct": false
43
+ },
44
+ {
45
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
46
+ "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
47
+ "submitted_answer": "Unable to answer",
48
+ "correct": false
49
+ }
50
+ ]
51
+ }
src/utils/ground_truth.py CHANGED
@@ -25,7 +25,8 @@ class GAIAGroundTruth:
25
 
26
  def __init__(self):
27
  """Initialize ground truth loader."""
28
- self.ground_truth: Dict[str, str] = {}
 
29
  self._loaded = False
30
 
31
  def load_validation_set(self) -> bool:
@@ -51,13 +52,15 @@ class GAIAGroundTruth:
51
  cache_dir=CACHE_DIR
52
  )
53
 
54
- # Build task_id -> final_answer mapping
55
  for item in dataset:
56
  task_id = item.get("task_id")
57
  final_answer = item.get("Final answer")
58
 
59
  if task_id and final_answer:
60
  self.ground_truth[task_id] = str(final_answer).strip()
 
 
61
 
62
  self._loaded = True
63
  logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")
 
25
 
26
  def __init__(self):
27
  """Initialize ground truth loader."""
28
+ self.ground_truth: Dict[str, str] = {} # task_id -> final_answer
29
+ self.metadata: Dict[str, dict] = {} # task_id -> full item data
30
  self._loaded = False
31
 
32
  def load_validation_set(self) -> bool:
 
52
  cache_dir=CACHE_DIR
53
  )
54
 
55
+ # Build task_id -> final_answer mapping and metadata
56
  for item in dataset:
57
  task_id = item.get("task_id")
58
  final_answer = item.get("Final answer")
59
 
60
  if task_id and final_answer:
61
  self.ground_truth[task_id] = str(final_answer).strip()
62
+ # Store full item for metadata access
63
+ self.metadata[task_id] = dict(item)
64
 
65
  self._loaded = True
66
  logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")