agentbee

Running

App Files Community

mangubee committed 22 days ago

Commit

dc583a7

1 Parent(s): 65a5dc6

Add Groundtruth Answer Column

Browse files

Files changed (4) hide show

CHANGELOG.md +42 -0
app.py +9 -1
output/gaia_results_20260104_221732.json +51 -0
src/utils/ground_truth.py +5 -2

CHANGELOG.md CHANGED Viewed

@@ -437,6 +437,48 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
 - ⏳ Verify exact match comparison works correctly
 - ⏳ Check performance with dataset caching
 ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
 **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.

 - ⏳ Verify exact match comparison works correctly
 - ⏳ Check performance with dataset caching
+### [ENHANCEMENT: Add Ground Truth Answer and Annotator Metadata to Results]
+**Problem:** Results only show whether an answer is correct or incorrect; they don't show what the correct answer should be or how to solve the question. This makes error analysis difficult.
+**Solution:** Add ground truth answer and annotator metadata to results_log (single source of truth for both UI and JSON).
+**Modified Files:**
+- **src/utils/ground_truth.py** (~5 lines modified)
+  - Added `self.metadata: Dict[str, dict] = {}` to store full item data (line 29)
+  - Updated `load_validation_set()` to store full dataset items in metadata dict (lines 62-63)
+  - Enables access to all GAIA dataset fields (Level, Annotator Metadata, file_name, etc.)
+- **app.py** (~10 lines modified)
+  - Updated results collection loop (lines 397-414)
+  - Added `gt_answer = ground_truth.get_answer(task_id)` to fetch ground truth answer
+  - Added `annotator_metadata = metadata_item.get("Annotator Metadata", {})` to fetch solving steps
+  - Added "Ground Truth Answer" column to results_log when ground truth available
+  - Added "Annotator Metadata" column to results_log when ground truth available
+  - Both UI table and JSON export automatically get these columns (same source: results_log)
+**Benefits:**
+- ✅ **Error analysis:** See what the correct answer should be when the agent fails
+- ✅ **Debugging hints:** Annotator metadata shows how the question should be solved
+- ✅ **Single source:** Modify results_log once, both UI and JSON get the data
+- ✅ **UI table:** New columns appear in results DataFrame
+- ✅ **JSON export:** New fields automatically included in export
+**Data Flow:**
+```
+results_log (single source)
+    ├─> pd.DataFrame(results_log) → UI table
+    └─> export_results_to_json(results_log) → JSON export
+```
+**Verification:**
+- ⏳ Test with the validation set to verify that the new columns appear correctly
+- ⏳ Verify annotator metadata format in UI table and JSON
 ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
 **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.

app.py CHANGED Viewed

@@ -394,6 +394,11 @@ def run_and_submit_all(
             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
             # Add to results log
             result_entry = {
                 "Task ID": result["task_id"],
@@ -401,9 +406,12 @@ def run_and_submit_all(
                 "Submitted Answer": result["answer"],
             }
-            # Add "Correct?" column if ground truth available
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
             results_log.append(result_entry)

             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
+            # Get ground truth answer and metadata
+            gt_answer = ground_truth.get_answer(result["task_id"])
+            metadata_item = ground_truth.metadata.get(result["task_id"], {})
+            annotator_metadata = metadata_item.get("Annotator Metadata", {})
             # Add to results log
             result_entry = {
                 "Task ID": result["task_id"],
                 "Submitted Answer": result["answer"],
             }
+            # Add ground truth data if available
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
+                result_entry["Ground Truth Answer"] = gt_answer
+                # Add annotator metadata as a dict (the same value feeds both the UI table and the JSON export)
+                result_entry["Annotator Metadata"] = annotator_metadata
             results_log.append(result_entry)

output/gaia_results_20260104_221732.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "metadata": {
+    "generated": "2026-01-04 22:17:32",
+    "timestamp": "20260104_221732",
+    "total_questions": 6,
+    "execution_time_seconds": 23.92,
+    "execution_time_formatted": "0m 23s",
+    "score_percent": 5.0,
+    "correct_count": 1,
+    "total_attempted": 6
+  },
+  "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/6 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (6 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
+  "results": [
+    {
+      "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
+      "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
+      "submitted_answer": "Unable to answer",
+      "correct": false
+    },
+    {
+      "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
+      "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
+      "correct": false
+    },
+    {
+      "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
+      "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
+      "submitted_answer": "FunkMonk",
+      "correct": true
+    },
+    {
+      "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+      "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
+      "correct": false
+    },
+    {
+      "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
+      "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
+      "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: FileNotFoundError: Text file not found: path/to/the/given/table.csv",
+      "correct": false
+    },
+    {
+      "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
+      "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
+      "submitted_answer": "Unable to answer",
+      "correct": false
+    }
+  ]
+}

src/utils/ground_truth.py CHANGED Viewed

@@ -25,7 +25,8 @@ class GAIAGroundTruth:
     def __init__(self):
         """Initialize ground truth loader."""
-        self.ground_truth: Dict[str, str] = {}
         self._loaded = False
     def load_validation_set(self) -> bool:
@@ -51,13 +52,15 @@ class GAIAGroundTruth:
                 cache_dir=CACHE_DIR
             )
-            # Build task_id -> final_answer mapping
             for item in dataset:
                 task_id = item.get("task_id")
                 final_answer = item.get("Final answer")
                 if task_id and final_answer:
                     self.ground_truth[task_id] = str(final_answer).strip()
             self._loaded = True
             logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")

     def __init__(self):
         """Initialize ground truth loader."""
+        self.ground_truth: Dict[str, str] = {}  # task_id -> final_answer
+        self.metadata: Dict[str, dict] = {}  # task_id -> full item data
         self._loaded = False
     def load_validation_set(self) -> bool:
                 cache_dir=CACHE_DIR
             )
+            # Build task_id -> final_answer mapping and metadata
             for item in dataset:
                 task_id = item.get("task_id")
                 final_answer = item.get("Final answer")
                 if task_id and final_answer:
                     self.ground_truth[task_id] = str(final_answer).strip()
+                    # Store full item for metadata access
+                    self.metadata[task_id] = dict(item)
             self._loaded = True
             logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")