Spaces:
Running
Running
Add effort validation for agentic submissions and uniform-effort display
Browse files- app.py +118 -4
- eval/metrics.py +5 -5
app.py
CHANGED
|
@@ -48,6 +48,7 @@ try:
|
|
| 48 |
confidence_interval,
|
| 49 |
citation_f1,
|
| 50 |
kuiper_statistic,
|
|
|
|
| 51 |
LLM_JUDGE_SPECIFICITY,
|
| 52 |
LLM_JUDGE_SENSITIVITY
|
| 53 |
)
|
|
@@ -967,6 +968,48 @@ def _extract_timestamp_from_filename(filename: str) -> str:
|
|
| 967 |
return match.group(1) if match else "00000000_000000"
|
| 968 |
|
| 969 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 971 |
def load_eval_results() -> pd.DataFrame:
|
| 972 |
"""Load evaluation results from JSON files, keeping only the most recent per model."""
|
|
@@ -1024,6 +1067,13 @@ def load_eval_results() -> pd.DataFrame:
|
|
| 1024 |
|
| 1025 |
anls_acc = overall.get("anls", 0.0)
|
| 1026 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
result_dict = {
|
| 1028 |
"Model": model_name,
|
| 1029 |
"Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
|
|
@@ -1042,7 +1092,8 @@ def load_eval_results() -> pd.DataFrame:
|
|
| 1042 |
"Attribution (Page F1)": overall.get("page_f1", 0.0),
|
| 1043 |
"Attribution (Doc F1)": overall.get("doc_f1", 0.0),
|
| 1044 |
# Calibration metric
|
| 1045 |
-
"Effort (Kuiper)":
|
|
|
|
| 1046 |
"Submission Date": data.get("submission_date", ""),
|
| 1047 |
"Link": data.get("link", ""),
|
| 1048 |
"Description": data.get("description", metadata.get("description", "")) or
|
|
@@ -1340,12 +1391,15 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
|
|
| 1340 |
tags = row.get("Tags", [])
|
| 1341 |
is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
|
| 1342 |
if is_conventional_rag:
|
| 1343 |
-
cell_html = "—"
|
| 1344 |
else:
|
| 1345 |
try:
|
| 1346 |
cell_html = f"{float(value):.1f}" if value else "0"
|
| 1347 |
except (ValueError, TypeError):
|
| 1348 |
cell_html = str(value)
|
|
|
|
|
|
|
|
|
|
| 1349 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1350 |
elif col == "Organization":
|
| 1351 |
cell_html = str(value) if value else ""
|
|
@@ -1739,6 +1793,9 @@ def show_model_details(model_name: str):
|
|
| 1739 |
with col3:
|
| 1740 |
if is_conventional_rag:
|
| 1741 |
st.metric("Effort (Kuiper)", "—")
|
|
|
|
|
|
|
|
|
|
| 1742 |
else:
|
| 1743 |
kuiper = model_data.get('Effort (Kuiper)', 0)
|
| 1744 |
st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
|
|
@@ -1775,6 +1832,35 @@ def show_model_details(model_name: str):
|
|
| 1775 |
st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
|
| 1776 |
|
| 1777 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1778 |
def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
|
| 1779 |
"""Validate JSONL submission format and return parsed predictions."""
|
| 1780 |
try:
|
|
@@ -2168,8 +2254,12 @@ def submit_results_fragment():
|
|
| 2168 |
|
| 2169 |
**Optional fields (for full metrics):**
|
| 2170 |
- `citations`: List of `{"file": "...", "page": N}` for attribution metrics
|
| 2171 |
-
- `iterations`, `search_history`, `llm_calls`, or `effort`: For effort/calibration metrics
|
| 2172 |
- `id`: Question ID (fallback matching)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2173 |
""")
|
| 2174 |
|
| 2175 |
# Initialize session state for evaluation results
|
|
@@ -2314,8 +2404,28 @@ def submit_results_fragment():
|
|
| 2314 |
st.markdown("#### Step 3: Submit to Leaderboard")
|
| 2315 |
|
| 2316 |
if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
|
|
|
|
|
|
|
| 2317 |
if not model_name or not organization or not model_type:
|
| 2318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2319 |
else:
|
| 2320 |
# Get current user for submission tracking
|
| 2321 |
hf_user = get_hf_user()
|
|
@@ -2989,6 +3099,10 @@ The task is characterized by six formal properties:
|
|
| 2989 |
|
| 2990 |
##### Effort (Kuiper)
|
| 2991 |
- **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2992 |
""")
|
| 2993 |
|
| 2994 |
# ===== SUBMIT TAB =====
|
|
|
|
| 48 |
confidence_interval,
|
| 49 |
citation_f1,
|
| 50 |
kuiper_statistic,
|
| 51 |
+
get_effort_value,
|
| 52 |
LLM_JUDGE_SPECIFICITY,
|
| 53 |
LLM_JUDGE_SENSITIVITY
|
| 54 |
)
|
|
|
|
| 968 |
return match.group(1) if match else "00000000_000000"
|
| 969 |
|
| 970 |
|
| 971 |
+
def _detect_effort_uniform(result_file: Path, data: dict) -> bool:
|
| 972 |
+
"""Check if all predictions in the companion JSONL have the same effort value."""
|
| 973 |
+
pred_rel = data.get("source_predictions_file")
|
| 974 |
+
if pred_rel:
|
| 975 |
+
pred_path = Path(EVAL_RESULTS_PATH) / pred_rel
|
| 976 |
+
else:
|
| 977 |
+
pred_path = Path(str(result_file).replace("_results_", "_predictions_").replace(".json", ".jsonl"))
|
| 978 |
+
|
| 979 |
+
if not pred_path.exists():
|
| 980 |
+
return False
|
| 981 |
+
|
| 982 |
+
try:
|
| 983 |
+
effort_values = set()
|
| 984 |
+
with open(pred_path) as f:
|
| 985 |
+
for line in f:
|
| 986 |
+
line = line.strip()
|
| 987 |
+
if not line:
|
| 988 |
+
continue
|
| 989 |
+
pred = json.loads(line)
|
| 990 |
+
search_history = pred.get('search_history', [])
|
| 991 |
+
steps = len(search_history) if isinstance(search_history, list) and search_history else 0
|
| 992 |
+
if steps == 0:
|
| 993 |
+
steps = pred.get('iterations', 0)
|
| 994 |
+
try:
|
| 995 |
+
steps = float(steps) if steps else 0
|
| 996 |
+
except (TypeError, ValueError):
|
| 997 |
+
steps = 0
|
| 998 |
+
effort_dict = {
|
| 999 |
+
'steps': steps,
|
| 1000 |
+
'llm_calls': pred.get('llm_calls') or (pred.get('trajectory', {}) or {}).get('llm_calls'),
|
| 1001 |
+
'effort': pred.get('effort') or (pred.get('trajectory', {}) or {}).get('effort'),
|
| 1002 |
+
}
|
| 1003 |
+
val = get_effort_value(effort_dict)
|
| 1004 |
+
if val > 0:
|
| 1005 |
+
effort_values.add(val)
|
| 1006 |
+
if len(effort_values) > 1:
|
| 1007 |
+
return False
|
| 1008 |
+
return len(effort_values) == 1
|
| 1009 |
+
except Exception:
|
| 1010 |
+
return False
|
| 1011 |
+
|
| 1012 |
+
|
| 1013 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 1014 |
def load_eval_results() -> pd.DataFrame:
|
| 1015 |
"""Load evaluation results from JSON files, keeping only the most recent per model."""
|
|
|
|
| 1067 |
|
| 1068 |
anls_acc = overall.get("anls", 0.0)
|
| 1069 |
|
| 1070 |
+
# Detect effort uniformity for Agentic models with Kuiper
|
| 1071 |
+
kuiper_val = overall.get("kuiper", 0.0)
|
| 1072 |
+
is_agentic = "Agentic" in tags if isinstance(tags, list) else False
|
| 1073 |
+
effort_uniform = False
|
| 1074 |
+
if is_agentic and kuiper_val and EVAL_AVAILABLE:
|
| 1075 |
+
effort_uniform = _detect_effort_uniform(result_file, data)
|
| 1076 |
+
|
| 1077 |
result_dict = {
|
| 1078 |
"Model": model_name,
|
| 1079 |
"Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
|
|
|
|
| 1092 |
"Attribution (Page F1)": overall.get("page_f1", 0.0),
|
| 1093 |
"Attribution (Doc F1)": overall.get("doc_f1", 0.0),
|
| 1094 |
# Calibration metric
|
| 1095 |
+
"Effort (Kuiper)": kuiper_val,
|
| 1096 |
+
"_effort_uniform": effort_uniform,
|
| 1097 |
"Submission Date": data.get("submission_date", ""),
|
| 1098 |
"Link": data.get("link", ""),
|
| 1099 |
"Description": data.get("description", metadata.get("description", "")) or
|
|
|
|
| 1391 |
tags = row.get("Tags", [])
|
| 1392 |
is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
|
| 1393 |
if is_conventional_rag:
|
| 1394 |
+
cell_html = "—"
|
| 1395 |
else:
|
| 1396 |
try:
|
| 1397 |
cell_html = f"{float(value):.1f}" if value else "0"
|
| 1398 |
except (ValueError, TypeError):
|
| 1399 |
cell_html = str(value)
|
| 1400 |
+
if row.get("_effort_uniform", False) and cell_html != "0":
|
| 1401 |
+
tooltip = "This agent uses the same effort for all samples, so effort-invariance metric is not meaningful."
|
| 1402 |
+
cell_html = f'<span style="color: #888; cursor: help;" title="{tooltip}">({cell_html})</span>'
|
| 1403 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1404 |
elif col == "Organization":
|
| 1405 |
cell_html = str(value) if value else ""
|
|
|
|
| 1793 |
with col3:
|
| 1794 |
if is_conventional_rag:
|
| 1795 |
st.metric("Effort (Kuiper)", "—")
|
| 1796 |
+
elif model_data.get('_effort_uniform', False):
|
| 1797 |
+
kuiper = model_data.get('Effort (Kuiper)', 0)
|
| 1798 |
+
st.metric("Effort (Kuiper)", f"({kuiper:.2f})" if kuiper else "N/A", help="This agent uses the same effort for all samples, so effort-invariance metric is not meaningful.")
|
| 1799 |
else:
|
| 1800 |
kuiper = model_data.get('Effort (Kuiper)', 0)
|
| 1801 |
st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
|
|
|
|
| 1832 |
st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
|
| 1833 |
|
| 1834 |
|
| 1835 |
+
def _prediction_has_effort(pred: dict) -> bool:
|
| 1836 |
+
"""Check if a prediction contains at least one valid effort measure."""
|
| 1837 |
+
search_history = pred.get('search_history', [])
|
| 1838 |
+
if isinstance(search_history, list) and len(search_history) > 0:
|
| 1839 |
+
return True
|
| 1840 |
+
|
| 1841 |
+
for key in ('iterations', 'steps', 'llm_calls', 'effort'):
|
| 1842 |
+
val = pred.get(key)
|
| 1843 |
+
if val is not None:
|
| 1844 |
+
try:
|
| 1845 |
+
if float(val) > 0:
|
| 1846 |
+
return True
|
| 1847 |
+
except (TypeError, ValueError):
|
| 1848 |
+
pass
|
| 1849 |
+
|
| 1850 |
+
trajectory = pred.get('trajectory', {})
|
| 1851 |
+
if isinstance(trajectory, dict):
|
| 1852 |
+
for key in ('llm_calls', 'effort'):
|
| 1853 |
+
val = trajectory.get(key)
|
| 1854 |
+
if val is not None:
|
| 1855 |
+
try:
|
| 1856 |
+
if float(val) > 0:
|
| 1857 |
+
return True
|
| 1858 |
+
except (TypeError, ValueError):
|
| 1859 |
+
pass
|
| 1860 |
+
|
| 1861 |
+
return False
|
| 1862 |
+
|
| 1863 |
+
|
| 1864 |
def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
|
| 1865 |
"""Validate JSONL submission format and return parsed predictions."""
|
| 1866 |
try:
|
|
|
|
| 2254 |
|
| 2255 |
**Optional fields (for full metrics):**
|
| 2256 |
- `citations`: List of `{"file": "...", "page": N}` for attribution metrics
|
|
|
|
| 2257 |
- `id`: Question ID (fallback matching)
|
| 2258 |
+
|
| 2259 |
+
**Effort fields (required for Agentic submissions, at least one per sample):**
|
| 2260 |
+
- `steps`: Number of agentic steps taken (positive integer)
|
| 2261 |
+
- `search_history`: List of search queries performed (e.g. `["query1", "query2"]`)
|
| 2262 |
+
- `effort`: Generic effort measure (positive number), should be proportional to the number of searches, LLM calls, or reasoning tokens generated, in this order of preference
|
| 2263 |
""")
|
| 2264 |
|
| 2265 |
# Initialize session state for evaluation results
|
|
|
|
| 2404 |
st.markdown("#### Step 3: Submit to Leaderboard")
|
| 2405 |
|
| 2406 |
if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
|
| 2407 |
+
# Validate required fields
|
| 2408 |
+
submit_error = None
|
| 2409 |
if not model_name or not organization or not model_type:
|
| 2410 |
+
submit_error = "Please fill in all required fields (Model Name, Organization, Model Type)"
|
| 2411 |
+
elif "Agentic" in selected_tags and st.session_state.predictions:
|
| 2412 |
+
missing_effort = [
|
| 2413 |
+
(i + 1, p.get('question', '')[:60])
|
| 2414 |
+
for i, p in enumerate(st.session_state.predictions)
|
| 2415 |
+
if not _prediction_has_effort(p)
|
| 2416 |
+
]
|
| 2417 |
+
if missing_effort:
|
| 2418 |
+
samples = "; ".join(f"line {ln}: {q}..." for ln, q in missing_effort[:5])
|
| 2419 |
+
extra = f" (and {len(missing_effort) - 5} more)" if len(missing_effort) > 5 else ""
|
| 2420 |
+
submit_error = (
|
| 2421 |
+
f"**Agentic submissions require effort data for every sample.** "
|
| 2422 |
+
f"{len(missing_effort)} prediction(s) are missing effort information "
|
| 2423 |
+
f"(e.g. `iterations`, `steps`, `llm_calls`, `effort`, or `search_history`). "
|
| 2424 |
+
f"Examples: {samples}{extra}"
|
| 2425 |
+
)
|
| 2426 |
+
|
| 2427 |
+
if submit_error:
|
| 2428 |
+
st.error(submit_error)
|
| 2429 |
else:
|
| 2430 |
# Get current user for submission tracking
|
| 2431 |
hf_user = get_hf_user()
|
|
|
|
| 3099 |
|
| 3100 |
##### Effort (Kuiper)
|
| 3101 |
- **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
|
| 3102 |
+
|
| 3103 |
+
---
|
| 3104 |
+
|
| 3105 |
+
**Contact:** [lukasz.borchmann@snowflake.com](mailto:lukasz.borchmann@snowflake.com)
|
| 3106 |
""")
|
| 3107 |
|
| 3108 |
# ===== SUBMIT TAB =====
|
eval/metrics.py
CHANGED
|
@@ -632,7 +632,7 @@ def citation_f1(
|
|
| 632 |
return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
|
| 633 |
|
| 634 |
|
| 635 |
-
def
|
| 636 |
"""Extract effort value with fallbacks.
|
| 637 |
|
| 638 |
Priority: steps -> llm_calls -> effort
|
|
@@ -672,7 +672,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
|
|
| 672 |
"""
|
| 673 |
if np is None:
|
| 674 |
raise ImportError("numpy is required for kuiper_statistic; please install numpy")
|
| 675 |
-
valid = [r for r in results if
|
| 676 |
|
| 677 |
if not valid:
|
| 678 |
return {
|
|
@@ -685,7 +685,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
|
|
| 685 |
}
|
| 686 |
|
| 687 |
# Sort by effort (steps -> llm_calls -> effort)
|
| 688 |
-
sorted_results = sorted(valid, key=
|
| 689 |
correctness = [1 if r['correct'] else 0 for r in sorted_results]
|
| 690 |
|
| 691 |
y_bar = np.mean(correctness)
|
|
@@ -734,8 +734,8 @@ def wasted_effort_ratio(results: List[Dict]) -> Dict[str, float]:
|
|
| 734 |
Returns:
|
| 735 |
Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
|
| 736 |
"""
|
| 737 |
-
correct_steps = [
|
| 738 |
-
incorrect_steps = [
|
| 739 |
|
| 740 |
mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
|
| 741 |
mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0
|
|
|
|
| 632 |
return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
|
| 633 |
|
| 634 |
|
| 635 |
+
def get_effort_value(result: Dict) -> float:
|
| 636 |
"""Extract effort value with fallbacks.
|
| 637 |
|
| 638 |
Priority: steps -> llm_calls -> effort
|
|
|
|
| 672 |
"""
|
| 673 |
if np is None:
|
| 674 |
raise ImportError("numpy is required for kuiper_statistic; please install numpy")
|
| 675 |
+
valid = [r for r in results if get_effort_value(r) > 0]
|
| 676 |
|
| 677 |
if not valid:
|
| 678 |
return {
|
|
|
|
| 685 |
}
|
| 686 |
|
| 687 |
# Sort by effort (steps -> llm_calls -> effort)
|
| 688 |
+
sorted_results = sorted(valid, key=get_effort_value)
|
| 689 |
correctness = [1 if r['correct'] else 0 for r in sorted_results]
|
| 690 |
|
| 691 |
y_bar = np.mean(correctness)
|
|
|
|
| 734 |
Returns:
|
| 735 |
Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
|
| 736 |
"""
|
| 737 |
+
correct_steps = [get_effort_value(r) for r in results if r.get('correct') and get_effort_value(r) > 0]
|
| 738 |
+
incorrect_steps = [get_effort_value(r) for r in results if not r.get('correct') and get_effort_value(r) > 0]
|
| 739 |
|
| 740 |
mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
|
| 741 |
mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0
|