Fix bug
Browse files- CHANGELOG.md +56 -2
- app.py +28 -14
- exports/gaia_results_20260105_153616.json +85 -0
CHANGELOG.md
CHANGED
|
@@ -476,8 +476,62 @@ results_log (single source)
|
|
| 476 |
|
| 477 |
**Verification:**
|
| 478 |
|
| 479 |
-
-
|
| 480 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 483 |
|
|
|
|
| 476 |
|
| 477 |
**Verification:**
|
| 478 |
|
| 479 |
+
- ✅ UI table shows annotator metadata as JSON string
|
| 480 |
+
- ✅ JSON export includes ground_truth_answer and annotator_metadata fields
|
| 481 |
+
- ⏳ Full testing pending to verify format is correct
|
| 482 |
+
|
| 483 |
+
### [BUGFIX: Annotator Metadata Display and JSON Export]
|
| 484 |
+
|
| 485 |
+
**Problem:**
|
| 486 |
+
|
| 487 |
+
1. UI table shows "[object Object]" for annotator metadata (dict can't be displayed)
|
| 488 |
+
2. JSON export missing ground_truth_answer and annotator_metadata fields
|
| 489 |
+
|
| 490 |
+
**Root Cause:**
|
| 491 |
+
|
| 492 |
+
1. Annotator metadata stored as dict, pandas displays as "[object Object]"
|
| 493 |
+
2. JSON export function explicitly constructed only specific fields, ignoring new ground truth fields
|
| 494 |
+
|
| 495 |
+
**Modified Files:**
|
| 496 |
+
|
| 497 |
+
- **app.py** (~25 lines modified)
|
| 498 |
+
- Updated results collection (lines 413-416)
|
| 499 |
+
- Convert annotator_metadata dict to JSON string for UI display: `json.dumps(annotator_metadata)`
|
| 500 |
+
- Store raw dict in `_annotator_metadata_raw` for JSON export
|
| 501 |
+
- Updated `export_results_to_json()` function (lines 101-128)
|
| 502 |
+
- Changed from list comprehension to explicit loop for better control
|
| 503 |
+
- Added conditional field addition for ground truth data
|
| 504 |
+
- Added `ground_truth_answer` field to JSON export
|
| 505 |
+
- Added `annotator_metadata` field to JSON export (from raw dict)
|
| 506 |
+
- Only includes fields if they exist in results_log
|
| 507 |
+
|
| 508 |
+
**Solution:**
|
| 509 |
+
|
| 510 |
+
- UI table: Shows annotator metadata as JSON string (readable format)
|
| 511 |
+
- JSON export: Includes `ground_truth_answer` and `annotator_metadata` objects
|
| 512 |
+
- Dual storage: String for UI, raw dict for JSON
|
| 513 |
+
|
| 514 |
+
**JSON Export Format:**
|
| 515 |
+
|
| 516 |
+
```json
|
| 517 |
+
{
|
| 518 |
+
"task_id": "...",
|
| 519 |
+
"question": "...",
|
| 520 |
+
"submitted_answer": "...",
|
| 521 |
+
"correct": true/false/null,
|
| 522 |
+
"ground_truth_answer": "expected answer",
|
| 523 |
+
"annotator_metadata": {
|
| 524 |
+
"steps": ["step 1", "step 2"],
|
| 525 |
+
"tools": ["web_search"],
|
| 526 |
+
"reasoning": "..."
|
| 527 |
+
}
|
| 528 |
+
}
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
**Verification:**
|
| 532 |
+
|
| 533 |
+
- ✅ UI table displays annotator metadata as readable JSON string
|
| 534 |
+
- ✅ JSON export includes all ground truth fields properly formatted
|
| 535 |
|
| 536 |
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 537 |
|
app.py
CHANGED
|
@@ -98,21 +98,33 @@ def export_results_to_json(
|
|
| 98 |
metadata["correct_count"] = submission_response.get("correct_count")
|
| 99 |
metadata["total_attempted"] = submission_response.get("total_attempted")
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
export_data = {
|
| 102 |
"metadata": metadata,
|
| 103 |
"submission_status": submission_status,
|
| 104 |
-
"results":
|
| 105 |
-
{
|
| 106 |
-
"task_id": result.get("Task ID", "N/A"),
|
| 107 |
-
"question": result.get("Question", "N/A"),
|
| 108 |
-
"submitted_answer": result.get("Submitted Answer", "N/A"),
|
| 109 |
-
# Use ground truth comparison if available, otherwise null
|
| 110 |
-
"correct": True if result.get("Correct?") == "✅ Yes"
|
| 111 |
-
else False if result.get("Correct?") == "❌ No"
|
| 112 |
-
else None,
|
| 113 |
-
}
|
| 114 |
-
for result in results_log
|
| 115 |
-
],
|
| 116 |
}
|
| 117 |
|
| 118 |
# Write JSON file with pretty formatting
|
|
@@ -410,8 +422,10 @@ def run_and_submit_all(
|
|
| 410 |
if is_correct is not None:
|
| 411 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
| 412 |
result_entry["Ground Truth Answer"] = gt_answer
|
| 413 |
-
#
|
| 414 |
-
result_entry["Annotator Metadata"] = annotator_metadata
|
|
|
|
|
|
|
| 415 |
|
| 416 |
results_log.append(result_entry)
|
| 417 |
|
|
|
|
| 98 |
metadata["correct_count"] = submission_response.get("correct_count")
|
| 99 |
metadata["total_attempted"] = submission_response.get("total_attempted")
|
| 100 |
|
| 101 |
+
# Build results array with all fields from results_log
|
| 102 |
+
results_array = []
|
| 103 |
+
for result in results_log:
|
| 104 |
+
result_dict = {
|
| 105 |
+
"task_id": result.get("Task ID", "N/A"),
|
| 106 |
+
"question": result.get("Question", "N/A"),
|
| 107 |
+
"submitted_answer": result.get("Submitted Answer", "N/A"),
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
# Add correctness if available
|
| 111 |
+
if result.get("Correct?"):
|
| 112 |
+
result_dict["correct"] = True if result.get("Correct?") == "✅ Yes" else False
|
| 113 |
+
|
| 114 |
+
# Add ground truth answer if available
|
| 115 |
+
if result.get("Ground Truth Answer"):
|
| 116 |
+
result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
|
| 117 |
+
|
| 118 |
+
# Add annotator metadata if available (use raw dict)
|
| 119 |
+
if result.get("_annotator_metadata_raw"):
|
| 120 |
+
result_dict["annotator_metadata"] = result.get("_annotator_metadata_raw")
|
| 121 |
+
|
| 122 |
+
results_array.append(result_dict)
|
| 123 |
+
|
| 124 |
export_data = {
|
| 125 |
"metadata": metadata,
|
| 126 |
"submission_status": submission_status,
|
| 127 |
+
"results": results_array,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
}
|
| 129 |
|
| 130 |
# Write JSON file with pretty formatting
|
|
|
|
| 422 |
if is_correct is not None:
|
| 423 |
result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
|
| 424 |
result_entry["Ground Truth Answer"] = gt_answer
|
| 425 |
+
# Convert annotator metadata to JSON string for UI table display
|
| 426 |
+
result_entry["Annotator Metadata"] = json.dumps(annotator_metadata, ensure_ascii=False) if annotator_metadata else ""
|
| 427 |
+
# Store raw dict for JSON export (will be extracted in export function)
|
| 428 |
+
result_entry["_annotator_metadata_raw"] = annotator_metadata
|
| 429 |
|
| 430 |
results_log.append(result_entry)
|
| 431 |
|
exports/gaia_results_20260105_153616.json
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"generated": "2026-01-05 15:36:16",
|
| 4 |
+
"timestamp": "20260105_153616",
|
| 5 |
+
"total_questions": 5,
|
| 6 |
+
"execution_time_seconds": 51.51,
|
| 7 |
+
"execution_time_formatted": "0m 51s",
|
| 8 |
+
"score_percent": 0.0,
|
| 9 |
+
"correct_count": 0,
|
| 10 |
+
"total_attempted": 5
|
| 11 |
+
},
|
| 12 |
+
"submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 0.0% (0/5 correct)\nMessage: Score calculated successfully: 0/20 total questions answered correctly (5 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
|
| 13 |
+
"results": [
|
| 14 |
+
{
|
| 15 |
+
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 16 |
+
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 17 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword matching; Tool calculator failed: SyntaxError: Invalid expression syntax: invalid syntax (<unknown>, line 1)",
|
| 18 |
+
"correct": false,
|
| 19 |
+
"ground_truth_answer": "Right",
|
| 20 |
+
"annotator_metadata": {
|
| 21 |
+
"Steps": "1. Read the instructions in reverse",
|
| 22 |
+
"Number of steps": "1",
|
| 23 |
+
"How long did this take?": "1 minute",
|
| 24 |
+
"Tools": "1. A word reversal tool / script",
|
| 25 |
+
"Number of tools": "0"
|
| 26 |
+
}
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 30 |
+
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 31 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 32 |
+
"correct": false,
|
| 33 |
+
"ground_truth_answer": "Rd5",
|
| 34 |
+
"annotator_metadata": {
|
| 35 |
+
"Steps": "Step 1: Evaluate the position of the pieces in the chess position\nStep 2: Report the best move available for black: \"Rd5\"",
|
| 36 |
+
"Number of steps": "2",
|
| 37 |
+
"How long did this take?": "10 minutes",
|
| 38 |
+
"Tools": "1. Image recognition tools",
|
| 39 |
+
"Number of tools": "1"
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 44 |
+
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 45 |
+
"submitted_answer": "Unable to answer",
|
| 46 |
+
"correct": false,
|
| 47 |
+
"ground_truth_answer": "3",
|
| 48 |
+
"annotator_metadata": {
|
| 49 |
+
"Steps": "1. Navigate to the YouTube link.\n2. Watch the video to see the highest number of bird species.\n3. Note the number.",
|
| 50 |
+
"Number of steps": "3",
|
| 51 |
+
"How long did this take?": "3 minutes",
|
| 52 |
+
"Tools": "1. Web browser\n2. Video parsing",
|
| 53 |
+
"Number of tools": "2"
|
| 54 |
+
}
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 58 |
+
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 59 |
+
"submitted_answer": "",
|
| 60 |
+
"correct": false,
|
| 61 |
+
"ground_truth_answer": "3",
|
| 62 |
+
"annotator_metadata": {
|
| 63 |
+
"Steps": "1. I did a search for Mercedes Sosa\n2. I went to the Wikipedia page for her\n3. I scrolled down to \"Studio albums\"\n4. I counted the ones between 2000 and 2009",
|
| 64 |
+
"Number of steps": "4",
|
| 65 |
+
"How long did this take?": "5 minutes",
|
| 66 |
+
"Tools": "1. web browser\n2. google search",
|
| 67 |
+
"Number of tools": "2"
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 72 |
+
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 73 |
+
"submitted_answer": "",
|
| 74 |
+
"correct": false,
|
| 75 |
+
"ground_truth_answer": "FunkMonk",
|
| 76 |
+
"annotator_metadata": {
|
| 77 |
+
"Steps": "1. Search \"Wikipedia featured articles promoted in november 2016\"\n2. Click through to the appropriate page and find the person who nominated Giganotosaurus.",
|
| 78 |
+
"Number of steps": "2",
|
| 79 |
+
"How long did this take?": "5 minutes",
|
| 80 |
+
"Tools": "1. web browser\n2. search engine",
|
| 81 |
+
"Number of tools": "2"
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
]
|
| 85 |
+
}
|