Borchmann committed on
Commit
dfedb16
·
1 Parent(s): 4098fb3

Add effort validation for agentic submissions and uniform-effort display

Browse files
Files changed (2) hide show
  1. app.py +118 -4
  2. eval/metrics.py +5 -5
app.py CHANGED
@@ -48,6 +48,7 @@ try:
48
  confidence_interval,
49
  citation_f1,
50
  kuiper_statistic,
 
51
  LLM_JUDGE_SPECIFICITY,
52
  LLM_JUDGE_SENSITIVITY
53
  )
@@ -967,6 +968,48 @@ def _extract_timestamp_from_filename(filename: str) -> str:
967
  return match.group(1) if match else "00000000_000000"
968
 
969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  @st.cache_data(ttl=300) # Cache for 5 minutes
971
  def load_eval_results() -> pd.DataFrame:
972
  """Load evaluation results from JSON files, keeping only the most recent per model."""
@@ -1024,6 +1067,13 @@ def load_eval_results() -> pd.DataFrame:
1024
 
1025
  anls_acc = overall.get("anls", 0.0)
1026
 
 
 
 
 
 
 
 
1027
  result_dict = {
1028
  "Model": model_name,
1029
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
@@ -1042,7 +1092,8 @@ def load_eval_results() -> pd.DataFrame:
1042
  "Attribution (Page F1)": overall.get("page_f1", 0.0),
1043
  "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
1044
  # Calibration metric
1045
- "Effort (Kuiper)": overall.get("kuiper", 0.0),
 
1046
  "Submission Date": data.get("submission_date", ""),
1047
  "Link": data.get("link", ""),
1048
  "Description": data.get("description", metadata.get("description", "")) or
@@ -1340,12 +1391,15 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1340
  tags = row.get("Tags", [])
1341
  is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
1342
  if is_conventional_rag:
1343
- cell_html = "—" # Not applicable for conventional RAG
1344
  else:
1345
  try:
1346
  cell_html = f"{float(value):.1f}" if value else "0"
1347
  except (ValueError, TypeError):
1348
  cell_html = str(value)
 
 
 
1349
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1350
  elif col == "Organization":
1351
  cell_html = str(value) if value else ""
@@ -1739,6 +1793,9 @@ def show_model_details(model_name: str):
1739
  with col3:
1740
  if is_conventional_rag:
1741
  st.metric("Effort (Kuiper)", "—")
 
 
 
1742
  else:
1743
  kuiper = model_data.get('Effort (Kuiper)', 0)
1744
  st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
@@ -1775,6 +1832,35 @@ def show_model_details(model_name: str):
1775
  st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
1776
 
1777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1778
  def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
1779
  """Validate JSONL submission format and return parsed predictions."""
1780
  try:
@@ -2168,8 +2254,12 @@ def submit_results_fragment():
2168
 
2169
  **Optional fields (for full metrics):**
2170
  - `citations`: List of `{"file": "...", "page": N}` for attribution metrics
2171
- - `iterations`, `search_history`, `llm_calls`, or `effort`: For effort/calibration metrics
2172
  - `id`: Question ID (fallback matching)
 
 
 
 
 
2173
  """)
2174
 
2175
  # Initialize session state for evaluation results
@@ -2314,8 +2404,28 @@ def submit_results_fragment():
2314
  st.markdown("#### Step 3: Submit to Leaderboard")
2315
 
2316
  if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
 
 
2317
  if not model_name or not organization or not model_type:
2318
- st.error("Please fill in all required fields (Model Name, Organization, Model Type)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2319
  else:
2320
  # Get current user for submission tracking
2321
  hf_user = get_hf_user()
@@ -2989,6 +3099,10 @@ The task is characterized by six formal properties:
2989
 
2990
  ##### Effort (Kuiper)
2991
  - **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
 
 
 
 
2992
  """)
2993
 
2994
  # ===== SUBMIT TAB =====
 
48
  confidence_interval,
49
  citation_f1,
50
  kuiper_statistic,
51
+ get_effort_value,
52
  LLM_JUDGE_SPECIFICITY,
53
  LLM_JUDGE_SENSITIVITY
54
  )
 
968
  return match.group(1) if match else "00000000_000000"
969
 
970
 
def _detect_effort_uniform(result_file: Path, data: dict) -> bool:
    """Return True when every prediction in the companion JSONL reports one
    and the same (positive) effort value.

    The predictions file is located either via the explicit
    ``source_predictions_file`` key in *data* (relative to
    ``EVAL_RESULTS_PATH``) or by deriving its name from *result_file*
    (``_results_`` -> ``_predictions_``, ``.json`` -> ``.jsonl``).
    Any read/parse problem is treated as "not uniform" (best-effort check).
    """
    rel = data.get("source_predictions_file")
    if rel:
        candidate = Path(EVAL_RESULTS_PATH) / rel
    else:
        candidate = Path(
            str(result_file).replace("_results_", "_predictions_").replace(".json", ".jsonl")
        )

    if not candidate.exists():
        return False

    try:
        seen = set()
        with open(candidate) as handle:
            for raw in handle:
                raw = raw.strip()
                if not raw:
                    continue
                record = json.loads(raw)

                # Effort proxy #1: number of searches; fall back to 'iterations'.
                history = record.get('search_history', [])
                step_count = len(history) if isinstance(history, list) and history else 0
                if step_count == 0:
                    step_count = record.get('iterations', 0)
                try:
                    step_count = float(step_count) if step_count else 0
                except (TypeError, ValueError):
                    step_count = 0

                # Other effort proxies may live top-level or inside 'trajectory'.
                trajectory = record.get('trajectory', {}) or {}
                effort = get_effort_value({
                    'steps': step_count,
                    'llm_calls': record.get('llm_calls') or trajectory.get('llm_calls'),
                    'effort': record.get('effort') or trajectory.get('effort'),
                })
                if effort > 0:
                    seen.add(effort)
                # Two distinct values already disprove uniformity — stop early.
                if len(seen) > 1:
                    return False
        # Uniform only if exactly one positive effort value was observed.
        return len(seen) == 1
    except Exception:
        # Best-effort detection: malformed files simply count as non-uniform.
        return False
1012
+
1013
  @st.cache_data(ttl=300) # Cache for 5 minutes
1014
  def load_eval_results() -> pd.DataFrame:
1015
  """Load evaluation results from JSON files, keeping only the most recent per model."""
 
1067
 
1068
  anls_acc = overall.get("anls", 0.0)
1069
 
1070
+ # Detect effort uniformity for Agentic models with Kuiper
1071
+ kuiper_val = overall.get("kuiper", 0.0)
1072
+ is_agentic = "Agentic" in tags if isinstance(tags, list) else False
1073
+ effort_uniform = False
1074
+ if is_agentic and kuiper_val and EVAL_AVAILABLE:
1075
+ effort_uniform = _detect_effort_uniform(result_file, data)
1076
+
1077
  result_dict = {
1078
  "Model": model_name,
1079
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
 
1092
  "Attribution (Page F1)": overall.get("page_f1", 0.0),
1093
  "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
1094
  # Calibration metric
1095
+ "Effort (Kuiper)": kuiper_val,
1096
+ "_effort_uniform": effort_uniform,
1097
  "Submission Date": data.get("submission_date", ""),
1098
  "Link": data.get("link", ""),
1099
  "Description": data.get("description", metadata.get("description", "")) or
 
1391
  tags = row.get("Tags", [])
1392
  is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
1393
  if is_conventional_rag:
1394
+ cell_html = "—"
1395
  else:
1396
  try:
1397
  cell_html = f"{float(value):.1f}" if value else "0"
1398
  except (ValueError, TypeError):
1399
  cell_html = str(value)
1400
+ if row.get("_effort_uniform", False) and cell_html != "0":
1401
+ tooltip = "This agent uses the same effort for all samples, so effort-invariance metric is not meaningful."
1402
+ cell_html = f'<span style="color: #888; cursor: help;" title="{tooltip}">({cell_html})</span>'
1403
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1404
  elif col == "Organization":
1405
  cell_html = str(value) if value else ""
 
1793
  with col3:
1794
  if is_conventional_rag:
1795
  st.metric("Effort (Kuiper)", "—")
1796
+ elif model_data.get('_effort_uniform', False):
1797
+ kuiper = model_data.get('Effort (Kuiper)', 0)
1798
+ st.metric("Effort (Kuiper)", f"({kuiper:.2f})" if kuiper else "N/A", help="This agent uses the same effort for all samples, so effort-invariance metric is not meaningful.")
1799
  else:
1800
  kuiper = model_data.get('Effort (Kuiper)', 0)
1801
  st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
 
1832
  st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
1833
 
1834
 
1835
+ def _prediction_has_effort(pred: dict) -> bool:
1836
+ """Check if a prediction contains at least one valid effort measure."""
1837
+ search_history = pred.get('search_history', [])
1838
+ if isinstance(search_history, list) and len(search_history) > 0:
1839
+ return True
1840
+
1841
+ for key in ('iterations', 'steps', 'llm_calls', 'effort'):
1842
+ val = pred.get(key)
1843
+ if val is not None:
1844
+ try:
1845
+ if float(val) > 0:
1846
+ return True
1847
+ except (TypeError, ValueError):
1848
+ pass
1849
+
1850
+ trajectory = pred.get('trajectory', {})
1851
+ if isinstance(trajectory, dict):
1852
+ for key in ('llm_calls', 'effort'):
1853
+ val = trajectory.get(key)
1854
+ if val is not None:
1855
+ try:
1856
+ if float(val) > 0:
1857
+ return True
1858
+ except (TypeError, ValueError):
1859
+ pass
1860
+
1861
+ return False
1862
+
1863
+
1864
  def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
1865
  """Validate JSONL submission format and return parsed predictions."""
1866
  try:
 
2254
 
2255
  **Optional fields (for full metrics):**
2256
  - `citations`: List of `{"file": "...", "page": N}` for attribution metrics
 
2257
  - `id`: Question ID (fallback matching)
2258
+
2259
+ **Effort fields (required for Agentic submissions, at least one per sample):**
2260
+ - `steps`: Number of agentic steps taken (positive integer)
2261
+ - `search_history`: List of search queries performed (e.g. `["query1", "query2"]`)
2262
+ - `effort`: Generic effort measure (positive number), should be proportional to the number of searches, LLM calls, or reasoning tokens generated, in this order of preference
2263
  """)
2264
 
2265
  # Initialize session state for evaluation results
 
2404
  st.markdown("#### Step 3: Submit to Leaderboard")
2405
 
2406
  if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
2407
+ # Validate required fields
2408
+ submit_error = None
2409
  if not model_name or not organization or not model_type:
2410
+ submit_error = "Please fill in all required fields (Model Name, Organization, Model Type)"
2411
+ elif "Agentic" in selected_tags and st.session_state.predictions:
2412
+ missing_effort = [
2413
+ (i + 1, p.get('question', '')[:60])
2414
+ for i, p in enumerate(st.session_state.predictions)
2415
+ if not _prediction_has_effort(p)
2416
+ ]
2417
+ if missing_effort:
2418
+ samples = "; ".join(f"line {ln}: {q}..." for ln, q in missing_effort[:5])
2419
+ extra = f" (and {len(missing_effort) - 5} more)" if len(missing_effort) > 5 else ""
2420
+ submit_error = (
2421
+ f"**Agentic submissions require effort data for every sample.** "
2422
+ f"{len(missing_effort)} prediction(s) are missing effort information "
2423
+ f"(e.g. `iterations`, `steps`, `llm_calls`, `effort`, or `search_history`). "
2424
+ f"Examples: {samples}{extra}"
2425
+ )
2426
+
2427
+ if submit_error:
2428
+ st.error(submit_error)
2429
  else:
2430
  # Get current user for submission tracking
2431
  hf_user = get_hf_user()
 
3099
 
3100
  ##### Effort (Kuiper)
3101
  - **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
3102
+
3103
+ ---
3104
+
3105
+ **Contact:** [lukasz.borchmann@snowflake.com](mailto:lukasz.borchmann@snowflake.com)
3106
  """)
3107
 
3108
  # ===== SUBMIT TAB =====
eval/metrics.py CHANGED
@@ -632,7 +632,7 @@ def citation_f1(
632
  return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
633
 
634
 
635
- def _get_effort_value(result: Dict) -> float:
636
  """Extract effort value with fallbacks.
637
 
638
  Priority: steps -> llm_calls -> effort
@@ -672,7 +672,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
672
  """
673
  if np is None:
674
  raise ImportError("numpy is required for kuiper_statistic; please install numpy")
675
- valid = [r for r in results if _get_effort_value(r) > 0]
676
 
677
  if not valid:
678
  return {
@@ -685,7 +685,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
685
  }
686
 
687
  # Sort by effort (steps -> llm_calls -> effort)
688
- sorted_results = sorted(valid, key=_get_effort_value)
689
  correctness = [1 if r['correct'] else 0 for r in sorted_results]
690
 
691
  y_bar = np.mean(correctness)
@@ -734,8 +734,8 @@ def wasted_effort_ratio(results: List[Dict]) -> Dict[str, float]:
734
  Returns:
735
  Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
736
  """
737
- correct_steps = [_get_effort_value(r) for r in results if r.get('correct') and _get_effort_value(r) > 0]
738
- incorrect_steps = [_get_effort_value(r) for r in results if not r.get('correct') and _get_effort_value(r) > 0]
739
 
740
  mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
741
  mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0
 
632
  return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
633
 
634
 
635
+ def get_effort_value(result: Dict) -> float:
636
  """Extract effort value with fallbacks.
637
 
638
  Priority: steps -> llm_calls -> effort
 
672
  """
673
  if np is None:
674
  raise ImportError("numpy is required for kuiper_statistic; please install numpy")
675
+ valid = [r for r in results if get_effort_value(r) > 0]
676
 
677
  if not valid:
678
  return {
 
685
  }
686
 
687
  # Sort by effort (steps -> llm_calls -> effort)
688
+ sorted_results = sorted(valid, key=get_effort_value)
689
  correctness = [1 if r['correct'] else 0 for r in sorted_results]
690
 
691
  y_bar = np.mean(correctness)
 
734
  Returns:
735
  Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
736
  """
737
+ correct_steps = [get_effort_value(r) for r in results if r.get('correct') and get_effort_value(r) > 0]
738
+ incorrect_steps = [get_effort_value(r) for r in results if not r.get('correct') and get_effort_value(r) > 0]
739
 
740
  mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
741
  mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0