Add Ground Truth Answer Column
Browse files- CHANGELOG.md +42 -0
- app.py +9 -1
- output/gaia_results_20260104_221732.json +51 -0
- src/utils/ground_truth.py +5 -2
CHANGELOG.md
CHANGED
|
@@ -437,6 +437,48 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 441 |
|
| 442 |
**Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
|
|
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
| 440 |
+
### [ENHANCEMENT: Add Ground Truth Answer and Annotator Metadata to Results]
|
| 441 |
+
|
| 442 |
+
**Problem:** Results only show whether an answer is correct or incorrect; they don't show what the correct answer should be or how to solve the question, which makes error analysis difficult.
|
| 443 |
+
|
| 444 |
+
**Solution:** Add ground truth answer and annotator metadata to results_log (single source of truth for both UI and JSON).
|
| 445 |
+
|
| 446 |
+
**Modified Files:**
|
| 447 |
+
|
| 448 |
+
- **src/utils/ground_truth.py** (~5 lines modified)
|
| 449 |
+
- Added `self.metadata: Dict[str, dict] = {}` to store full item data (line 29)
|
| 450 |
+
- Updated `load_validation_set()` to store full dataset items in metadata dict (lines 62-63)
|
| 451 |
+
- Enables access to all GAIA dataset fields (Level, Annotator Metadata, file_name, etc.)
|
| 452 |
+
|
| 453 |
+
- **app.py** (~10 lines modified)
|
| 454 |
+
- Updated results collection loop (lines 397-414)
|
| 455 |
+
- Added `gt_answer = ground_truth.get_answer(result["task_id"])` to fetch ground truth answer
|
| 456 |
+
- Added `annotator_metadata = metadata_item.get("Annotator Metadata", {})` to fetch solving steps
|
| 457 |
+
- Added "Ground Truth Answer" column to results_log when ground truth available
|
| 458 |
+
- Added "Annotator Metadata" column to results_log when ground truth available
|
| 459 |
+
- Both UI table and JSON export automatically get these columns (same source: results_log)
|
| 460 |
+
|
| 461 |
+
**Benefits:**
|
| 462 |
+
|
| 463 |
+
- ✅ **Error analysis:** See what correct answer should be when agent fails
|
| 464 |
+
- ✅ **Debugging hints:** Annotator metadata shows how question should be solved
|
| 465 |
+
- ✅ **Single source:** Modify results_log once, both UI and JSON get the data
|
| 466 |
+
- ✅ **UI table:** New columns appear in results DataFrame
|
| 467 |
+
- ✅ **JSON export:** New fields automatically included in export
|
| 468 |
+
|
| 469 |
+
**Data Flow:**
|
| 470 |
+
|
| 471 |
+
```
|
| 472 |
+
results_log (single source)
|
| 473 |
+
├─> pd.DataFrame(results_log) → UI table
|
| 474 |
+
└─> export_results_to_json(results_log) → JSON export
|
| 475 |
+
```
|
| 476 |
+
|
| 477 |
+
**Verification:**
|
| 478 |
+
|
| 479 |
+
- ⏳ Test with validation set to verify columns appear correctly
|
| 480 |
+
- ⏳ Verify annotator metadata format in UI table and JSON
|
| 481 |
+
|
| 482 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 483 |
|
| 484 |
**Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
|
app.py
CHANGED
|
@@ -394,6 +394,11 @@ def run_and_submit_all(
|
|
| 394 |
# Compare with ground truth if available
|
| 395 |
is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
# Add to results log
|
| 398 |
result_entry = {
|
| 399 |
"Task ID": result["task_id"],
|
|
@@ -401,9 +406,12 @@ def run_and_submit_all(
|
|
| 401 |
"Submitted Answer": result["answer"],
|
| 402 |
}
|
| 403 |
|
| 404 |
-
# Add
|
| 405 |
if is_correct is not None:
|
| 406 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
|
|
|
|
|
|
|
|
|
| 407 |
|
| 408 |
results_log.append(result_entry)
|
| 409 |
|
|
|
|
| 394 |
# Compare with ground truth if available
|
| 395 |
is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
|
| 396 |
|
| 397 |
+
# Get ground truth answer and metadata
|
| 398 |
+
gt_answer = ground_truth.get_answer(result["task_id"])
|
| 399 |
+
metadata_item = ground_truth.metadata.get(result["task_id"], {})
|
| 400 |
+
annotator_metadata = metadata_item.get("Annotator Metadata", {})
|
| 401 |
+
|
| 402 |
# Add to results log
|
| 403 |
result_entry = {
|
| 404 |
"Task ID": result["task_id"],
|
|
|
|
| 406 |
"Submitted Answer": result["answer"],
|
| 407 |
}
|
| 408 |
|
| 409 |
+
# Add ground truth data if available
|
| 410 |
if is_correct is not None:
|
| 411 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
| 412 |
+
result_entry["Ground Truth Answer"] = gt_answer
|
| 413 |
+
# Add annotator metadata as a dict (stored as-is; no string conversion is done here)
|
| 414 |
+
result_entry["Annotator Metadata"] = annotator_metadata
|
| 415 |
|
| 416 |
results_log.append(result_entry)
|
| 417 |
|
output/gaia_results_20260104_221732.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"generated": "2026-01-04 22:17:32",
|
| 4 |
+
"timestamp": "20260104_221732",
|
| 5 |
+
"total_questions": 6,
|
| 6 |
+
"execution_time_seconds": 23.92,
|
| 7 |
+
"execution_time_formatted": "0m 23s",
|
| 8 |
+
"score_percent": 5.0,
|
| 9 |
+
"correct_count": 1,
|
| 10 |
+
"total_attempted": 6
|
| 11 |
+
},
|
| 12 |
+
"submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/6 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (6 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
|
| 13 |
+
"results": [
|
| 14 |
+
{
|
| 15 |
+
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 16 |
+
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 17 |
+
"submitted_answer": "Unable to answer",
|
| 18 |
+
"correct": false
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 22 |
+
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 23 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 24 |
+
"correct": false
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 28 |
+
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 29 |
+
"submitted_answer": "FunkMonk",
|
| 30 |
+
"correct": true
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 34 |
+
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 35 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 36 |
+
"correct": false
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
|
| 40 |
+
"question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
|
| 41 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: FileNotFoundError: Text file not found: path/to/the/given/table.csv",
|
| 42 |
+
"correct": false
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 46 |
+
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 47 |
+
"submitted_answer": "Unable to answer",
|
| 48 |
+
"correct": false
|
| 49 |
+
}
|
| 50 |
+
]
|
| 51 |
+
}
|
src/utils/ground_truth.py
CHANGED
|
@@ -25,7 +25,8 @@ class GAIAGroundTruth:
|
|
| 25 |
|
| 26 |
def __init__(self):
|
| 27 |
"""Initialize ground truth loader."""
|
| 28 |
-
self.ground_truth: Dict[str, str] = {}
|
|
|
|
| 29 |
self._loaded = False
|
| 30 |
|
| 31 |
def load_validation_set(self) -> bool:
|
|
@@ -51,13 +52,15 @@ class GAIAGroundTruth:
|
|
| 51 |
cache_dir=CACHE_DIR
|
| 52 |
)
|
| 53 |
|
| 54 |
-
# Build task_id -> final_answer mapping
|
| 55 |
for item in dataset:
|
| 56 |
task_id = item.get("task_id")
|
| 57 |
final_answer = item.get("Final answer")
|
| 58 |
|
| 59 |
if task_id and final_answer:
|
| 60 |
self.ground_truth[task_id] = str(final_answer).strip()
|
|
|
|
|
|
|
| 61 |
|
| 62 |
self._loaded = True
|
| 63 |
logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")
|
|
|
|
| 25 |
|
| 26 |
def __init__(self):
|
| 27 |
"""Initialize ground truth loader."""
|
| 28 |
+
self.ground_truth: Dict[str, str] = {} # task_id -> final_answer
|
| 29 |
+
self.metadata: Dict[str, dict] = {} # task_id -> full item data
|
| 30 |
self._loaded = False
|
| 31 |
|
| 32 |
def load_validation_set(self) -> bool:
|
|
|
|
| 52 |
cache_dir=CACHE_DIR
|
| 53 |
)
|
| 54 |
|
| 55 |
+
# Build task_id -> final_answer mapping and metadata
|
| 56 |
for item in dataset:
|
| 57 |
task_id = item.get("task_id")
|
| 58 |
final_answer = item.get("Final answer")
|
| 59 |
|
| 60 |
if task_id and final_answer:
|
| 61 |
self.ground_truth[task_id] = str(final_answer).strip()
|
| 62 |
+
# Store full item for metadata access
|
| 63 |
+
self.metadata[task_id] = dict(item)
|
| 64 |
|
| 65 |
self._loaded = True
|
| 66 |
logger.info(f"Loaded {len(self.ground_truth)} ground truth answers")
|