mangubee committed on
Commit
87c4c82
·
1 Parent(s): 5731c0c
Files changed (2) hide show
  1. CHANGELOG.md +14 -0
  2. app.py +16 -18
CHANGELOG.md CHANGED
@@ -4,6 +4,20 @@
4
 
5
  ## Changes Made
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  ### [PROBLEM: LLM Quota Exhaustion - Retry Logic]
8
 
9
  **Modified Files:**
 
4
 
5
  ## Changes Made
6
 
7
+ ### [PROBLEM: Ground Truth Architecture - Single Source Simplification]
8
+
9
+ **Modified Files:**
10
+
11
+ - **app.py** (~10 lines modified)
12
+ - Removed `ground_truth` parameter from `export_results_to_json()` function signature
13
+ - Removed duplicate work: the export function no longer accesses `ground_truth.metadata`
14
+ - Changed `_annotator_metadata` to `annotator_metadata` (removed underscore prefix)
15
+ - Updated all 6 call sites of `export_results_to_json()` to drop the `ground_truth` argument (lines 448, 489, 504, 513, 522, 531)
16
+ - Updated comment: "both UI and JSON show identical data" (line 426)
17
+ - Updated docstring: "Single source: Both UI and JSON use identical results_log data" (line 58)
18
+ - Simplified JSON export to use `result.get("annotator_metadata")` instead of accessing metadata again (lines 119-121)
19
+ - Result: one object (`results_log`) → two formats (UI table + JSON) with identical content and no filtering
20
+
21
  ### [PROBLEM: LLM Quota Exhaustion - Retry Logic]
22
 
23
  **Modified Files:**
app.py CHANGED
@@ -49,20 +49,19 @@ def export_results_to_json(
49
  submission_status: str,
50
  execution_time: float = None,
51
  submission_response: dict = None,
52
- ground_truth = None,
53
  ) -> str:
54
  """Export evaluation results to JSON file for easy processing.
55
 
56
  - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
57
  - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
58
  - Format: Clean JSON with full error messages, no truncation
 
59
 
60
  Args:
61
- results_log: List of question results
62
  submission_status: Status message from submission
63
  execution_time: Total execution time in seconds
64
  submission_response: Response from GAIA API with correctness info
65
- ground_truth: GAIAGroundTruth object for metadata access
66
  """
67
  from datetime import datetime
68
 
@@ -117,13 +116,9 @@ def export_results_to_json(
117
  if result.get("Ground Truth Answer"):
118
  result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
119
 
120
- # Add annotator metadata if available (from ground_truth object)
121
- if ground_truth and result.get("Task ID"):
122
- task_id = result.get("Task ID")
123
- metadata_item = ground_truth.metadata.get(task_id, {})
124
- annotator_metadata = metadata_item.get("Annotator Metadata", {})
125
- if annotator_metadata:
126
- result_dict["annotator_metadata"] = annotator_metadata
127
 
128
  results_array.append(result_dict)
129
 
@@ -412,8 +407,10 @@ def run_and_submit_all(
412
  # Compare with ground truth if available
413
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
414
 
415
- # Get ground truth answer
416
  gt_answer = ground_truth.get_answer(result["task_id"])
 
 
417
 
418
  # Add to results log
419
  result_entry = {
@@ -426,7 +423,8 @@ def run_and_submit_all(
426
  if is_correct is not None:
427
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
428
  result_entry["Ground Truth Answer"] = gt_answer
429
- # Note: metadata NOT added to UI table, only used in JSON export
 
430
 
431
  results_log.append(result_entry)
432
 
@@ -447,7 +445,7 @@ def run_and_submit_all(
447
  results_df = pd.DataFrame(results_log)
448
  execution_time = time.time() - start_time
449
  export_path = export_results_to_json(
450
- results_log, status_message, execution_time, None, ground_truth
451
  )
452
  return status_message, results_df, export_path
453
 
@@ -488,7 +486,7 @@ def run_and_submit_all(
488
  results_df = pd.DataFrame(results_log)
489
  # Export to JSON with execution time and submission response
490
  export_path = export_results_to_json(
491
- results_log, final_status, execution_time, result_data, ground_truth
492
  )
493
  return final_status, results_df, export_path
494
  except requests.exceptions.HTTPError as e:
@@ -503,7 +501,7 @@ def run_and_submit_all(
503
  execution_time = time.time() - start_time
504
  results_df = pd.DataFrame(results_log)
505
  export_path = export_results_to_json(
506
- results_log, status_message, execution_time, None, ground_truth
507
  )
508
  return status_message, results_df, export_path
509
  except requests.exceptions.Timeout:
@@ -512,7 +510,7 @@ def run_and_submit_all(
512
  execution_time = time.time() - start_time
513
  results_df = pd.DataFrame(results_log)
514
  export_path = export_results_to_json(
515
- results_log, status_message, execution_time, None, ground_truth
516
  )
517
  return status_message, results_df, export_path
518
  except requests.exceptions.RequestException as e:
@@ -521,7 +519,7 @@ def run_and_submit_all(
521
  execution_time = time.time() - start_time
522
  results_df = pd.DataFrame(results_log)
523
  export_path = export_results_to_json(
524
- results_log, status_message, execution_time, None, ground_truth
525
  )
526
  return status_message, results_df, export_path
527
  except Exception as e:
@@ -530,7 +528,7 @@ def run_and_submit_all(
530
  execution_time = time.time() - start_time
531
  results_df = pd.DataFrame(results_log)
532
  export_path = export_results_to_json(
533
- results_log, status_message, execution_time, None, ground_truth
534
  )
535
  return status_message, results_df, export_path
536
 
 
49
  submission_status: str,
50
  execution_time: float = None,
51
  submission_response: dict = None,
 
52
  ) -> str:
53
  """Export evaluation results to JSON file for easy processing.
54
 
55
  - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
56
  - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
57
  - Format: Clean JSON with full error messages, no truncation
58
+ - Single source: Both UI and JSON use identical results_log data
59
 
60
  Args:
61
+ results_log: List of question results (single source of truth)
62
  submission_status: Status message from submission
63
  execution_time: Total execution time in seconds
64
  submission_response: Response from GAIA API with correctness info
 
65
  """
66
  from datetime import datetime
67
 
 
116
  if result.get("Ground Truth Answer"):
117
  result_dict["ground_truth_answer"] = result.get("Ground Truth Answer")
118
 
119
+ # Add annotator metadata if available (already stored in results_log)
120
+ if result.get("annotator_metadata"):
121
+ result_dict["annotator_metadata"] = result.get("annotator_metadata")
 
 
 
 
122
 
123
  results_array.append(result_dict)
124
 
 
407
  # Compare with ground truth if available
408
  is_correct = ground_truth.compare_answer(result["task_id"], result["answer"])
409
 
410
+ # Get ground truth answer and metadata (fetch once)
411
  gt_answer = ground_truth.get_answer(result["task_id"])
412
+ metadata_item = ground_truth.metadata.get(result["task_id"], {})
413
+ annotator_metadata = metadata_item.get("Annotator Metadata", {})
414
 
415
  # Add to results log
416
  result_entry = {
 
423
  if is_correct is not None:
424
  result_entry["Correct?"] = "✅ Yes" if is_correct else "❌ No"
425
  result_entry["Ground Truth Answer"] = gt_answer
426
+ # Store metadata (both UI and JSON show identical data)
427
+ result_entry["annotator_metadata"] = annotator_metadata
428
 
429
  results_log.append(result_entry)
430
 
 
445
  results_df = pd.DataFrame(results_log)
446
  execution_time = time.time() - start_time
447
  export_path = export_results_to_json(
448
+ results_log, status_message, execution_time, None
449
  )
450
  return status_message, results_df, export_path
451
 
 
486
  results_df = pd.DataFrame(results_log)
487
  # Export to JSON with execution time and submission response
488
  export_path = export_results_to_json(
489
+ results_log, final_status, execution_time, result_data
490
  )
491
  return final_status, results_df, export_path
492
  except requests.exceptions.HTTPError as e:
 
501
  execution_time = time.time() - start_time
502
  results_df = pd.DataFrame(results_log)
503
  export_path = export_results_to_json(
504
+ results_log, status_message, execution_time, None
505
  )
506
  return status_message, results_df, export_path
507
  except requests.exceptions.Timeout:
 
510
  execution_time = time.time() - start_time
511
  results_df = pd.DataFrame(results_log)
512
  export_path = export_results_to_json(
513
+ results_log, status_message, execution_time, None
514
  )
515
  return status_message, results_df, export_path
516
  except requests.exceptions.RequestException as e:
 
519
  execution_time = time.time() - start_time
520
  results_df = pd.DataFrame(results_log)
521
  export_path = export_results_to_json(
522
+ results_log, status_message, execution_time, None
523
  )
524
  return status_message, results_df, export_path
525
  except Exception as e:
 
528
  execution_time = time.time() - start_time
529
  results_df = pd.DataFrame(results_log)
530
  export_path = export_results_to_json(
531
+ results_log, status_message, execution_time, None
532
  )
533
  return status_message, results_df, export_path
534