agentbee

Running

App Files Files Community

mangubee committed 21 days ago

Commit

5731c0c

1 Parent(s): 24396e3

Update

Browse files

Files changed (2) hide show

CHANGELOG.md +25 -0
app.py +17 -14

CHANGELOG.md CHANGED Viewed

@@ -551,6 +551,31 @@ results_log (single source)
 - UI table columns: Task ID, Question, Submitted Answer, Correct?, Ground Truth Answer
 - JSON export fields: task_id, question, submitted_answer, correct, ground_truth_answer, annotator_metadata
 ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
 **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.

 - UI table columns: Task ID, Question, Submitted Answer, Correct?, Ground Truth Answer
 - JSON export fields: task_id, question, submitted_answer, correct, ground_truth_answer, annotator_metadata
+### [CLEANUP: Remove _annotator_metadata_raw from UI Table]
+**Problem:** The internal `_annotator_metadata_raw` field was showing up in the UI table as a confusing extra column.
+**Solution:** Pass the `ground_truth` object to `export_results_to_json()` instead of storing the metadata in each `result_entry`.
+**Modified Files:**
+- **app.py** (~20 lines modified)
+  - Removed `_annotator_metadata_raw` from result_entry (line 426 removed)
+  - Removed now-unneeded local variables `metadata_item` and `annotator_metadata` (lines 411-412 removed)
+  - Updated `export_results_to_json()` signature (line 52)
+    - Added `ground_truth = None` parameter
+  - Updated JSON export logic (lines 120-126)
+    - Fetch annotator_metadata from ground_truth.metadata during export
+    - No longer relies on result.get("_annotator_metadata_raw")
+  - Updated all 6 calls to export_results_to_json (lines 453, 493, 507, 516, 525, 534)
+    - Added ground_truth as final parameter
+**Result:**
+- UI table: Clean - no internal/hidden fields
+- JSON export: Still includes annotator_metadata (fetched from ground_truth object)
+- Better separation of concerns: UI uses results_log, export uses ground_truth object
 ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
 **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.

app.py CHANGED Viewed

@@ -49,6 +49,7 @@ def export_results_to_json(
     submission_status: str,
     execution_time: float = None,
     submission_response: dict = None,
 ) -> str:
     """Export evaluation results to JSON file for easy processing.
@@ -61,6 +62,7 @@ def export_results_to_json(
         submission_status: Status message from submission
         execution_time: Total execution time in seconds
         submission_response: Response from GAIA API with correctness info
     """
     from datetime import datetime
@@ -115,9 +117,13 @@ def export_results_to_json(
         if result.get("Ground Truth Answer"):
             result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
-        # Add annotator metadata if available (use raw dict)
-        if result.get("_annotator_metadata_raw"):
-            result_dict["annotator_metadata"] = result.get("_annotator_metadata_raw")
         results_array.append(result_dict)
@@ -406,10 +412,8 @@ def run_and_submit_all(
             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
-            # Get ground truth answer and metadata
             gt_answer = ground_truth.get_answer(result["task_id"])
-            metadata_item = ground_truth.metadata.get(result["task_id"], {})
-            annotator_metadata = metadata_item.get("Annotator Metadata", {})
             # Add to results log
             result_entry = {
@@ -422,8 +426,7 @@ def run_and_submit_all(
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
                 result_entry["Ground Truth Answer"] = gt_answer
-                # Store raw dict for JSON export (NOT displayed in UI table)
-                result_entry["_annotator_metadata_raw"] = annotator_metadata
             results_log.append(result_entry)
@@ -444,7 +447,7 @@ def run_and_submit_all(
         results_df = pd.DataFrame(results_log)
         execution_time = time.time() - start_time
         export_path = export_results_to_json(
-            results_log, status_message, execution_time
         )
         return status_message, results_df, export_path
@@ -485,7 +488,7 @@ def run_and_submit_all(
         results_df = pd.DataFrame(results_log)
         # Export to JSON with execution time and submission response
         export_path = export_results_to_json(
-            results_log, final_status, execution_time, result_data
         )
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
@@ -500,7 +503,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time
         )
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
@@ -509,7 +512,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time
         )
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
@@ -518,7 +521,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time
         )
         return status_message, results_df, export_path
     except Exception as e:
@@ -527,7 +530,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time
         )
         return status_message, results_df, export_path

     submission_status: str,
     execution_time: float = None,
     submission_response: dict = None,
+    ground_truth = None,
 ) -> str:
     """Export evaluation results to JSON file for easy processing.
         submission_status: Status message from submission
         execution_time: Total execution time in seconds
         submission_response: Response from GAIA API with correctness info
+        ground_truth: GAIAGroundTruth object for metadata access
     """
     from datetime import datetime
         if result.get("Ground Truth Answer"):
             result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
+        # Add annotator metadata if available (from ground_truth object)
+        if ground_truth and result.get("Task ID"):
+            task_id = result.get("Task ID")
+            metadata_item = ground_truth.metadata.get(task_id, {})
+            annotator_metadata = metadata_item.get("Annotator Metadata", {})
+            if annotator_metadata:
+                result_dict["annotator_metadata"] = annotator_metadata
         results_array.append(result_dict)
             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
+            # Get ground truth answer
             gt_answer = ground_truth.get_answer(result["task_id"])
             # Add to results log
             result_entry = {
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
                 result_entry["Ground Truth Answer"] = gt_answer
+                # Note: metadata NOT added to UI table, only used in JSON export
             results_log.append(result_entry)
         results_df = pd.DataFrame(results_log)
         execution_time = time.time() - start_time
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
         results_df = pd.DataFrame(results_log)
         # Export to JSON with execution time and submission response
         export_path = export_results_to_json(
+            results_log, final_status, execution_time, result_data, ground_truth
         )
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except Exception as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path