agentbee

Sleeping

App Files Files Community

mangubee committed on Jan 5

Commit

87c4c82

1 Parent(s): 5731c0c

Update

Browse files

Files changed (2) hide show

CHANGELOG.md +14 -0
app.py +16 -18

CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,20 @@
 ## Changes Made
 ### [PROBLEM: LLM Quota Exhaustion - Retry Logic]
 **Modified Files:**

 ## Changes Made
+### [PROBLEM: Ground Truth Architecture - Single Source Simplification]
+**Modified Files:**
+- **app.py** (~10 lines modified)
+  - Removed `ground_truth` parameter from `export_results_to_json()` function signature
+  - Removed duplicate work: the export function no longer accesses `ground_truth.metadata`
+  - Changed `_annotator_metadata` to `annotator_metadata` (removed underscore prefix)
+  - Updated all 6 call sites to stop passing the `ground_truth` argument (lines 448, 489, 504, 513, 522, 531)
+  - Updated comment: "both UI and JSON show identical data" (line 426)
+  - Updated docstring: "Single source: Both UI and JSON use identical results_log data" (line 58)
+  - Simplified JSON export to use `result.get("annotator_metadata")` instead of accessing metadata again (lines 119-121)
+  - Result: One object (results_log) → Two formats (UI table + JSON), both identical, no filtering
 ### [PROBLEM: LLM Quota Exhaustion - Retry Logic]
 **Modified Files:**

app.py CHANGED Viewed

@@ -49,20 +49,19 @@ def export_results_to_json(
     submission_status: str,
     execution_time: float = None,
     submission_response: dict = None,
-    ground_truth = None,
 ) -> str:
     """Export evaluation results to JSON file for easy processing.
     - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
     - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
     - Format: Clean JSON with full error messages, no truncation
     Args:
-        results_log: List of question results
         submission_status: Status message from submission
         execution_time: Total execution time in seconds
         submission_response: Response from GAIA API with correctness info
-        ground_truth: GAIAGroundTruth object for metadata access
     """
     from datetime import datetime
@@ -117,13 +116,9 @@ def export_results_to_json(
         if result.get("Ground Truth Answer"):
             result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
-        # Add annotator metadata if available (from ground_truth object)
-        if ground_truth and result.get("Task ID"):
-            task_id = result.get("Task ID")
-            metadata_item = ground_truth.metadata.get(task_id, {})
-            annotator_metadata = metadata_item.get("Annotator Metadata", {})
-            if annotator_metadata:
-                result_dict["annotator_metadata"] = annotator_metadata
         results_array.append(result_dict)
@@ -412,8 +407,10 @@ def run_and_submit_all(
             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
-            # Get ground truth answer
             gt_answer = ground_truth.get_answer(result["task_id"])
             # Add to results log
             result_entry = {
@@ -426,7 +423,8 @@ def run_and_submit_all(
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
                 result_entry["Ground Truth Answer"] = gt_answer
-                # Note: metadata NOT added to UI table, only used in JSON export
             results_log.append(result_entry)
@@ -447,7 +445,7 @@ def run_and_submit_all(
         results_df = pd.DataFrame(results_log)
         execution_time = time.time() - start_time
         export_path = export_results_to_json(
-            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
@@ -488,7 +486,7 @@ def run_and_submit_all(
         results_df = pd.DataFrame(results_log)
         # Export to JSON with execution time and submission response
         export_path = export_results_to_json(
-            results_log, final_status, execution_time, result_data, ground_truth
         )
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
@@ -503,7 +501,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
@@ -512,7 +510,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
@@ -521,7 +519,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path
     except Exception as e:
@@ -530,7 +528,7 @@ def run_and_submit_all(
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
-            results_log, status_message, execution_time, None, ground_truth
         )
         return status_message, results_df, export_path

     submission_status: str,
     execution_time: float = None,
     submission_response: dict = None,
 ) -> str:
     """Export evaluation results to JSON file for easy processing.
     - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
     - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
     - Format: Clean JSON with full error messages, no truncation
+    - Single source: Both UI and JSON use identical results_log data
     Args:
+        results_log: List of question results (single source of truth)
         submission_status: Status message from submission
         execution_time: Total execution time in seconds
         submission_response: Response from GAIA API with correctness info
     """
     from datetime import datetime
         if result.get("Ground Truth Answer"):
             result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
+        # Add annotator metadata if available (already stored in results_log)
+        if result.get("annotator_metadata"):
+            result_dict["annotator_metadata"] = result.get("annotator_metadata")
         results_array.append(result_dict)
             # Compare with ground truth if available
             is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
+            # Get ground truth answer and metadata (fetch once)
             gt_answer = ground_truth.get_answer(result["task_id"])
+            metadata_item = ground_truth.metadata.get(result["task_id"], {})
+            annotator_metadata = metadata_item.get("Annotator Metadata", {})
             # Add to results log
             result_entry = {
             if is_correct is not None:
                 result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
                 result_entry["Ground Truth Answer"] = gt_answer
+                # Store metadata (both UI and JSON show identical data)
+                result_entry["annotator_metadata"] = annotator_metadata
             results_log.append(result_entry)
         results_df = pd.DataFrame(results_log)
         execution_time = time.time() - start_time
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None
         )
         return status_message, results_df, export_path
         results_df = pd.DataFrame(results_log)
         # Export to JSON with execution time and submission response
         export_path = export_results_to_json(
+            results_log, final_status, execution_time, result_data
         )
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None
         )
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None
         )
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None
         )
         return status_message, results_df, export_path
     except Exception as e:
         execution_time = time.time() - start_time
         results_df = pd.DataFrame(results_log)
         export_path = export_results_to_json(
+            results_log, status_message, execution_time, None
         )
         return status_message, results_df, export_path