mangubee commited on
Commit
5731c0c
·
1 Parent(s): 24396e3
Files changed (2) hide show
  1. CHANGELOG.md +25 -0
  2. app.py +17 -14
CHANGELOG.md CHANGED
@@ -551,6 +551,31 @@ results_log (single source)
551
  - UI table columns: Task ID, Question, Submitted Answer, Correct?, Ground Truth Answer
552
  - JSON export fields: task_id, question, submitted_answer, correct, ground_truth_answer, annotator_metadata
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
555
 
556
  **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
 
551
  - UI table columns: Task ID, Question, Submitted Answer, Correct?, Ground Truth Answer
552
  - JSON export fields: task_id, question, submitted_answer, correct, ground_truth_answer, annotator_metadata
553
 
554
+ ### [CLEANUP: Remove _annotator_metadata_raw from UI Table]
555
+
556
+ **Problem:** The internal `_annotator_metadata_raw` field was showing up in the UI table as a confusing column.
557
+
558
+ **Solution:** Pass the ground_truth object to the export function instead of storing metadata in each result_entry.
559
+
560
+ **Modified Files:**
561
+
562
+ - **app.py** (~20 lines modified)
563
+ - Removed `_annotator_metadata_raw` from result_entry (line 426 removed)
564
+ - Removed unused local variables: metadata_item, annotator_metadata (lines 411-412 removed)
565
+ - Updated `export_results_to_json()` signature (line 52)
566
+ - Added `ground_truth = None` parameter
567
+ - Updated JSON export logic (lines 120-126)
568
+ - Fetch annotator_metadata from ground_truth.metadata during export
569
+ - No longer relies on result.get("_annotator_metadata_raw")
570
+ - Updated all 6 calls to export_results_to_json (lines 453, 493, 507, 516, 525, 534)
571
+ - Added ground_truth as final parameter
572
+
573
+ **Result:**
574
+
575
+ - UI table: Clean - no internal/hidden fields
576
+ - JSON export: Still includes annotator_metadata (fetched from ground_truth object)
577
+ - Better separation of concerns: UI uses results_log, export uses ground_truth object
578
+
579
  ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
580
 
581
  **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
app.py CHANGED
@@ -49,6 +49,7 @@ def export_results_to_json(
49
  submission_status: str,
50
  execution_time: float = None,
51
  submission_response: dict = None,
 
52
  ) -> str:
53
  """Export evaluation results to JSON file for easy processing.
54
 
@@ -61,6 +62,7 @@ def export_results_to_json(
61
  submission_status: Status message from submission
62
  execution_time: Total execution time in seconds
63
  submission_response: Response from GAIA API with correctness info
 
64
  """
65
  from datetime import datetime
66
 
@@ -115,9 +117,13 @@ def export_results_to_json(
115
  if result.get("Ground Truth Answer"):
116
  result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
117
 
118
- # Add annotator metadata if available (use raw dict)
119
- if result.get("_annotator_metadata_raw"):
120
- result_dict["annotator_metadata"] = result.get("_annotator_metadata_raw")
 
 
 
 
121
 
122
  results_array.append(result_dict)
123
 
@@ -406,10 +412,8 @@ def run_and_submit_all(
406
  # Compare with ground truth if available
407
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
408
 
409
- # Get ground truth answer and metadata
410
  gt_answer = ground_truth.get_answer(result["task_id"])
411
- metadata_item = ground_truth.metadata.get(result["task_id"], {})
412
- annotator_metadata = metadata_item.get("Annotator Metadata", {})
413
 
414
  # Add to results log
415
  result_entry = {
@@ -422,8 +426,7 @@ def run_and_submit_all(
422
  if is_correct is not None:
423
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
424
  result_entry["Ground Truth Answer"] = gt_answer
425
- # Store raw dict for JSON export (NOT displayed in UI table)
426
- result_entry["_annotator_metadata_raw"] = annotator_metadata
427
 
428
  results_log.append(result_entry)
429
 
@@ -444,7 +447,7 @@ def run_and_submit_all(
444
  results_df = pd.DataFrame(results_log)
445
  execution_time = time.time() - start_time
446
  export_path = export_results_to_json(
447
- results_log, status_message, execution_time
448
  )
449
  return status_message, results_df, export_path
450
 
@@ -485,7 +488,7 @@ def run_and_submit_all(
485
  results_df = pd.DataFrame(results_log)
486
  # Export to JSON with execution time and submission response
487
  export_path = export_results_to_json(
488
- results_log, final_status, execution_time, result_data
489
  )
490
  return final_status, results_df, export_path
491
  except requests.exceptions.HTTPError as e:
@@ -500,7 +503,7 @@ def run_and_submit_all(
500
  execution_time = time.time() - start_time
501
  results_df = pd.DataFrame(results_log)
502
  export_path = export_results_to_json(
503
- results_log, status_message, execution_time
504
  )
505
  return status_message, results_df, export_path
506
  except requests.exceptions.Timeout:
@@ -509,7 +512,7 @@ def run_and_submit_all(
509
  execution_time = time.time() - start_time
510
  results_df = pd.DataFrame(results_log)
511
  export_path = export_results_to_json(
512
- results_log, status_message, execution_time
513
  )
514
  return status_message, results_df, export_path
515
  except requests.exceptions.RequestException as e:
@@ -518,7 +521,7 @@ def run_and_submit_all(
518
  execution_time = time.time() - start_time
519
  results_df = pd.DataFrame(results_log)
520
  export_path = export_results_to_json(
521
- results_log, status_message, execution_time
522
  )
523
  return status_message, results_df, export_path
524
  except Exception as e:
@@ -527,7 +530,7 @@ def run_and_submit_all(
527
  execution_time = time.time() - start_time
528
  results_df = pd.DataFrame(results_log)
529
  export_path = export_results_to_json(
530
- results_log, status_message, execution_time
531
  )
532
  return status_message, results_df, export_path
533
 
 
49
  submission_status: str,
50
  execution_time: float = None,
51
  submission_response: dict = None,
52
+ ground_truth = None,
53
  ) -> str:
54
  """Export evaluation results to JSON file for easy processing.
55
 
 
62
  submission_status: Status message from submission
63
  execution_time: Total execution time in seconds
64
  submission_response: Response from GAIA API with correctness info
65
+ ground_truth: GAIAGroundTruth object for metadata access
66
  """
67
  from datetime import datetime
68
 
 
117
  if result.get("Ground Truth Answer"):
118
  result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
119
 
120
+ # Add annotator metadata if available (from ground_truth object)
121
+ if ground_truth and result.get("Task ID"):
122
+ task_id = result.get("Task ID")
123
+ metadata_item = ground_truth.metadata.get(task_id, {})
124
+ annotator_metadata = metadata_item.get("Annotator Metadata", {})
125
+ if annotator_metadata:
126
+ result_dict["annotator_metadata"] = annotator_metadata
127
 
128
  results_array.append(result_dict)
129
 
 
412
  # Compare with ground truth if available
413
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
414
 
415
+ # Get ground truth answer
416
  gt_answer = ground_truth.get_answer(result["task_id"])
 
 
417
 
418
  # Add to results log
419
  result_entry = {
 
426
  if is_correct is not None:
427
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
428
  result_entry["Ground Truth Answer"] = gt_answer
429
+ # Note: metadata NOT added to UI table, only used in JSON export
 
430
 
431
  results_log.append(result_entry)
432
 
 
447
  results_df = pd.DataFrame(results_log)
448
  execution_time = time.time() - start_time
449
  export_path = export_results_to_json(
450
+ results_log, status_message, execution_time, None, ground_truth
451
  )
452
  return status_message, results_df, export_path
453
 
 
488
  results_df = pd.DataFrame(results_log)
489
  # Export to JSON with execution time and submission response
490
  export_path = export_results_to_json(
491
+ results_log, final_status, execution_time, result_data, ground_truth
492
  )
493
  return final_status, results_df, export_path
494
  except requests.exceptions.HTTPError as e:
 
503
  execution_time = time.time() - start_time
504
  results_df = pd.DataFrame(results_log)
505
  export_path = export_results_to_json(
506
+ results_log, status_message, execution_time, None, ground_truth
507
  )
508
  return status_message, results_df, export_path
509
  except requests.exceptions.Timeout:
 
512
  execution_time = time.time() - start_time
513
  results_df = pd.DataFrame(results_log)
514
  export_path = export_results_to_json(
515
+ results_log, status_message, execution_time, None, ground_truth
516
  )
517
  return status_message, results_df, export_path
518
  except requests.exceptions.RequestException as e:
 
521
  execution_time = time.time() - start_time
522
  results_df = pd.DataFrame(results_log)
523
  export_path = export_results_to_json(
524
+ results_log, status_message, execution_time, None, ground_truth
525
  )
526
  return status_message, results_df, export_path
527
  except Exception as e:
 
530
  execution_time = time.time() - start_time
531
  results_df = pd.DataFrame(results_log)
532
  export_path = export_results_to_json(
533
+ results_log, status_message, execution_time, None, ground_truth
534
  )
535
  return status_message, results_df, export_path
536