Fix bug
Browse files- CHANGELOG.md +56 -2
- app.py +28 -14
- exports/gaia_results_20260105_153616.json +85 -0
CHANGELOG.md
CHANGED
|
@@ -476,8 +476,62 @@ results_log (single source)
|
|
| 476 |
|
| 477 |
**Verification:**
|
| 478 |
|
| 479 |
-
-
|
| 480 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 483 |
|
|
|
|
| 476 |
|
| 477 |
**Verification:**
|
| 478 |
|
| 479 |
+
- ✅ UI table shows annotator metadata as JSON string
|
| 480 |
+
- ✅ JSON export includes ground_truth_answer and annotator_metadata fields
|
| 481 |
+
- ⏳ Full testing pending to verify format is correct
|
| 482 |
+
|
| 483 |
+
### [BUGFIX: Annotator Metadata Display and JSON Export]
|
| 484 |
+
|
| 485 |
+
**Problem:**
|
| 486 |
+
|
| 487 |
+
1. UI table shows "[object Object]" for annotator metadata (dict can't be displayed)
|
| 488 |
+
2. JSON export missing ground_truth_answer and annotator_metadata fields
|
| 489 |
+
|
| 490 |
+
**Root Cause:**
|
| 491 |
+
|
| 492 |
+
1. Annotator metadata stored as dict, pandas displays as "[object Object]"
|
| 493 |
+
2. JSON export function explicitly constructed only specific fields, ignoring new ground truth fields
|
| 494 |
+
|
| 495 |
+
**Modified Files:**
|
| 496 |
+
|
| 497 |
+
- **app.py** (~25 lines modified)
|
| 498 |
+
- Updated results collection (lines 413-416)
|
| 499 |
+
- Convert annotator_metadata dict to JSON string for UI display: `json.dumps(annotator_metadata)`
|
| 500 |
+
- Store raw dict in `_annotator_metadata_raw` for JSON export
|
| 501 |
+
- Updated `export_results_to_json()` function (lines 101-128)
|
| 502 |
+
- Changed from list comprehension to explicit loop for better control
|
| 503 |
+
- Added conditional field addition for ground truth data
|
| 504 |
+
- Added `ground_truth_answer` field to JSON export
|
| 505 |
+
- Added `annotator_metadata` field to JSON export (from raw dict)
|
| 506 |
+
- Only includes fields if they exist in results_log
|
| 507 |
+
|
| 508 |
+
**Solution:**
|
| 509 |
+
|
| 510 |
+
- UI table: Shows annotator metadata as JSON string (readable format)
|
| 511 |
+
- JSON export: Includes `ground_truth_answer` and `annotator_metadata` objects
|
| 512 |
+
- Dual storage: String for UI, raw dict for JSON
|
| 513 |
+
|
| 514 |
+
**JSON Export Format:**
|
| 515 |
+
|
| 516 |
+
```json
|
| 517 |
+
{
|
| 518 |
+
"task_id": "...",
|
| 519 |
+
"question": "...",
|
| 520 |
+
"submitted_answer": "...",
|
| 521 |
+
"correct": true/false/null,
|
| 522 |
+
"ground_truth_answer": "expected answer",
|
| 523 |
+
"annotator_metadata": {
|
| 524 |
+
"steps": ["step 1", "step 2"],
|
| 525 |
+
"tools": ["web_search"],
|
| 526 |
+
"reasoning": "..."
|
| 527 |
+
}
|
| 528 |
+
}
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
**Verification:**
|
| 532 |
+
|
| 533 |
+
- ✅ UI table displays annotator metadata as readable JSON string
|
| 534 |
+
- ✅ JSON export includes all ground truth fields properly formatted
|
| 535 |
|
| 536 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 537 |
|
app.py
CHANGED
|
@@ -98,21 +98,33 @@ def export_results_to_json(
|
|
| 98 |
metadata["correct_count"] = submission_response.get("correct_count")
|
| 99 |
metadata["total_attempted"] = submission_response.get("total_attempted")
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
export_data = {
|
| 102 |
"metadata": metadata,
|
| 103 |
"submission_status": submission_status,
|
| 104 |
-
"results":
|
| 105 |
-
{
|
| 106 |
-
"task_id": result.get("Task ID", "N/A"),
|
| 107 |
-
"question": result.get("Question", "N/A"),
|
| 108 |
-
"submitted_answer": result.get("Submitted Answer", "N/A"),
|
| 109 |
-
# Use ground truth comparison if available, otherwise null
|
| 110 |
-
"correct": True if result.get("Correct?") == "✅ Yes"
|
| 111 |
-
else False if result.get("Correct?") == "❌ No"
|
| 112 |
-
else None,
|
| 113 |
-
}
|
| 114 |
-
for result in results_log
|
| 115 |
-
],
|
| 116 |
}
|
| 117 |
|
| 118 |
# Write JSON file with pretty formatting
|
|
@@ -410,8 +422,10 @@ def run_and_submit_all(
|
|
| 410 |
if is_correct is not None:
|
| 411 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
| 412 |
result_entry["Ground Truth Answer"] = gt_answer
|
| 413 |
-
#
|
| 414 |
-
result_entry["Annotator Metadata"] = annotator_metadata
|
|
|
|
|
|
|
| 415 |
|
| 416 |
results_log.append(result_entry)
|
| 417 |
|
|
|
|
| 98 |
metadata["correct_count"] = submission_response.get("correct_count")
|
| 99 |
metadata["total_attempted"] = submission_response.get("total_attempted")
|
| 100 |
|
| 101 |
+
# Build results array with all fields from results_log
|
| 102 |
+
results_array = []
|
| 103 |
+
for result in results_log:
|
| 104 |
+
result_dict = {
|
| 105 |
+
"task_id": result.get("Task ID", "N/A"),
|
| 106 |
+
"question": result.get("Question", "N/A"),
|
| 107 |
+
"submitted_answer": result.get("Submitted Answer", "N/A"),
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
# Add correctness if available
|
| 111 |
+
if result.get("Correct?"):
|
| 112 |
+
result_dict["correct"] = True if result.get("Correct?") == "✅ Yes" else False
|
| 113 |
+
|
| 114 |
+
# Add ground truth answer if available
|
| 115 |
+
if result.get("Ground Truth Answer"):
|
| 116 |
+
result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
|
| 117 |
+
|
| 118 |
+
# Add annotator metadata if available (use raw dict)
|
| 119 |
+
if result.get("_annotator_metadata_raw"):
|
| 120 |
+
result_dict["annotator_metadata"] = result.get("_annotator_metadata_raw")
|
| 121 |
+
|
| 122 |
+
results_array.append(result_dict)
|
| 123 |
+
|
| 124 |
export_data = {
|
| 125 |
"metadata": metadata,
|
| 126 |
"submission_status": submission_status,
|
| 127 |
+
"results": results_array,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
}
|
| 129 |
|
| 130 |
# Write JSON file with pretty formatting
|
|
|
|
| 422 |
if is_correct is not None:
|
| 423 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
| 424 |
result_entry["Ground Truth Answer"] = gt_answer
|
| 425 |
+
# Convert annotator metadata to JSON string for UI table display
|
| 426 |
+
result_entry["Annotator Metadata"] = json.dumps(annotator_metadata, ensure_ascii=False) if annotator_metadata else ""
|
| 427 |
+
# Store raw dict for JSON export (will be extracted in export function)
|
| 428 |
+
result_entry["_annotator_metadata_raw"] = annotator_metadata
|
| 429 |
|
| 430 |
results_log.append(result_entry)
|
| 431 |
|
exports/gaia_results_20260105_153616.json
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"generated": "2026-01-05 15:36:16",
|
| 4 |
+
"timestamp": "20260105_153616",
|
| 5 |
+
"total_questions": 5,
|
| 6 |
+
"execution_time_seconds": 51.51,
|
| 7 |
+
"execution_time_formatted": "0m 51s",
|
| 8 |
+
"score_percent": 0.0,
|
| 9 |
+
"correct_count": 0,
|
| 10 |
+
"total_attempted": 5
|
| 11 |
+
},
|
| 12 |
+
"submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 0.0% (0/5 correct)\nMessage: Score calculated successfully: 0/20 total questions answered correctly (5 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
|
| 13 |
+
"results": [
|
| 14 |
+
{
|
| 15 |
+
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 16 |
+
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 17 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword matching; Tool calculator failed: SyntaxError: Invalid expression syntax: invalid syntax (<unknown>, line 1)",
|
| 18 |
+
"correct": false,
|
| 19 |
+
"ground_truth_answer": "Right",
|
| 20 |
+
"annotator_metadata": {
|
| 21 |
+
"Steps": "1. Read the instructions in reverse",
|
| 22 |
+
"Number of steps": "1",
|
| 23 |
+
"How long did this take?": "1 minute",
|
| 24 |
+
"Tools": "1. A word reversal tool / script",
|
| 25 |
+
"Number of tools": "0"
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 30 |
+
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 31 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 32 |
+
"correct": false,
|
| 33 |
+
"ground_truth_answer": "Rd5",
|
| 34 |
+
"annotator_metadata": {
|
| 35 |
+
"Steps": "Step 1: Evaluate the position of the pieces in the chess position\nStep 2: Report the best move available for black: \"Rd5\"",
|
| 36 |
+
"Number of steps": "2",
|
| 37 |
+
"How long did this take?": "10 minutes",
|
| 38 |
+
"Tools": "1. Image recognition tools",
|
| 39 |
+
"Number of tools": "1"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 44 |
+
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 45 |
+
"submitted_answer": "Unable to answer",
|
| 46 |
+
"correct": false,
|
| 47 |
+
"ground_truth_answer": "3",
|
| 48 |
+
"annotator_metadata": {
|
| 49 |
+
"Steps": "1. Navigate to the YouTube link.\n2. Watch the video to see the highest number of bird species.\n3. Note the number.",
|
| 50 |
+
"Number of steps": "3",
|
| 51 |
+
"How long did this take?": "3 minutes",
|
| 52 |
+
"Tools": "1. Web browser\n2. Video parsing",
|
| 53 |
+
"Number of tools": "2"
|
| 54 |
+
}
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 58 |
+
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 59 |
+
"submitted_answer": "",
|
| 60 |
+
"correct": false,
|
| 61 |
+
"ground_truth_answer": "3",
|
| 62 |
+
"annotator_metadata": {
|
| 63 |
+
"Steps": "1. I did a search for Mercedes Sosa\n2. I went to the Wikipedia page for her\n3. I scrolled down to \"Studio albums\"\n4. I counted the ones between 2000 and 2009",
|
| 64 |
+
"Number of steps": "4",
|
| 65 |
+
"How long did this take?": "5 minutes",
|
| 66 |
+
"Tools": "1. web browser\n2. google search",
|
| 67 |
+
"Number of tools": "2"
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 72 |
+
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 73 |
+
"submitted_answer": "",
|
| 74 |
+
"correct": false,
|
| 75 |
+
"ground_truth_answer": "FunkMonk",
|
| 76 |
+
"annotator_metadata": {
|
| 77 |
+
"Steps": "1. Search \"Wikipedia featured articles promoted in november 2016\"\n2. Click through to the appropriate page and find the person who nominated Giganotosaurus.",
|
| 78 |
+
"Number of steps": "2",
|
| 79 |
+
"How long did this take?": "5 minutes",
|
| 80 |
+
"Tools": "1. web browser\n2. search engine",
|
| 81 |
+
"Number of tools": "2"
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
]
|
| 85 |
+
}
|