Spaces:

Snowflake
/

MADQA-Leaderboard

Running

App Files Files

Borchmann committed 26 days ago

Commit

dfedb16

1 Parent(s): 4098fb3

Add effort validation for agentic submissions and uniform-effort display

Browse files

Files changed (2) hide show

app.py +118 -4
eval/metrics.py +5 -5

app.py CHANGED Viewed

@@ -48,6 +48,7 @@ try:
         confidence_interval,
         citation_f1,
         kuiper_statistic,
         LLM_JUDGE_SPECIFICITY,
         LLM_JUDGE_SENSITIVITY
     )
@@ -967,6 +968,48 @@ def _extract_timestamp_from_filename(filename: str) -> str:
     return match.group(1) if match else "00000000_000000"
 @st.cache_data(ttl=300)  # Cache for 5 minutes
 def load_eval_results() -> pd.DataFrame:
     """Load evaluation results from JSON files, keeping only the most recent per model."""
@@ -1024,6 +1067,13 @@ def load_eval_results() -> pd.DataFrame:
                     anls_acc = overall.get("anls", 0.0)
                     result_dict = {
                         "Model": model_name,
                         "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
@@ -1042,7 +1092,8 @@ def load_eval_results() -> pd.DataFrame:
                         "Attribution (Page F1)": overall.get("page_f1", 0.0),
                         "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
                         # Calibration metric
-                        "Effort (Kuiper)": overall.get("kuiper", 0.0),
                         "Submission Date": data.get("submission_date", ""),
                         "Link": data.get("link", ""),
                         "Description": data.get("description", metadata.get("description", "")) or
@@ -1340,12 +1391,15 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
                 tags = row.get("Tags", [])
                 is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
                 if is_conventional_rag:
-                    cell_html = "—"  # Not applicable for conventional RAG
                 else:
                     try:
                         cell_html = f"{float(value):.1f}" if value else "0"
                     except (ValueError, TypeError):
                         cell_html = str(value)
                 cells.append(f'<td style="text-align: center;">{cell_html}</td>')
             elif col == "Organization":
                 cell_html = str(value) if value else ""
@@ -1739,6 +1793,9 @@ def show_model_details(model_name: str):
     with col3:
         if is_conventional_rag:
             st.metric("Effort (Kuiper)", "—")
         else:
             kuiper = model_data.get('Effort (Kuiper)', 0)
             st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
@@ -1775,6 +1832,35 @@ def show_model_details(model_name: str):
         st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
 def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
     """Validate JSONL submission format and return parsed predictions."""
     try:
@@ -2168,8 +2254,12 @@ def submit_results_fragment():
         **Optional fields (for full metrics):**
         - `citations`: List of `{"file": "...", "page": N}` for attribution metrics
-        - `iterations`, `search_history`, `llm_calls`, or `effort`: For effort/calibration metrics
         - `id`: Question ID (fallback matching)
         """)
     # Initialize session state for evaluation results
@@ -2314,8 +2404,28 @@ def submit_results_fragment():
             st.markdown("#### Step 3: Submit to Leaderboard")
             if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
                 if not model_name or not organization or not model_type:
-                    st.error("Please fill in all required fields (Model Name, Organization, Model Type)")
                 else:
                     # Get current user for submission tracking
                     hf_user = get_hf_user()
@@ -2989,6 +3099,10 @@ The task is characterized by six formal properties:
 ##### Effort (Kuiper)
 - **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
             """)
     # ===== SUBMIT TAB =====

         confidence_interval,
         citation_f1,
         kuiper_statistic,
+        get_effort_value,
         LLM_JUDGE_SPECIFICITY,
         LLM_JUDGE_SENSITIVITY
     )
     return match.group(1) if match else "00000000_000000"
+def _detect_effort_uniform(result_file: Path, data: dict) -> bool:
+    """Check if all predictions in the companion JSONL have the same effort value.
+
+    The predictions file is ``data["source_predictions_file"]`` resolved against
+    ``EVAL_RESULTS_PATH`` when that key is set; otherwise it is derived from
+    ``result_file`` by swapping ``_results_`` -> ``_predictions_`` and
+    ``.json`` -> ``.jsonl`` in the file name.
+
+    Args:
+        result_file: Path of the results JSON currently being loaded.
+        data: Parsed contents of that results JSON.
+
+    Returns:
+        True only when at least one prediction yields a positive effort value
+        and every positive effort value in the file is identical. False when
+        the predictions file is missing, unreadable or malformed, or when no
+        prediction yields a positive effort value.
+    """
+    # Deliberately best-effort: this runs while the leaderboard table is being
+    # built, so any failure (bad metadata, unreadable file, malformed line)
+    # degrades to "not uniform" instead of propagating to the caller. Path
+    # resolution lives inside the try for the same reason.
+    try:
+        pred_rel = data.get("source_predictions_file")
+        if pred_rel:
+            pred_path = Path(EVAL_RESULTS_PATH) / pred_rel
+        else:
+            # Rewrite only the file name; a directory component that happens
+            # to contain "_results_" or ".json" must stay untouched.
+            result_path = Path(result_file)
+            pred_name = result_path.name.replace("_results_", "_predictions_").replace(".json", ".jsonl")
+            pred_path = result_path.with_name(pred_name)
+        if not pred_path.exists():
+            return False
+        effort_values = set()
+        # Read as UTF-8 explicitly rather than relying on the platform default.
+        with open(pred_path, encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                pred = json.loads(line)
+                search_history = pred.get('search_history', [])
+                steps = len(search_history) if isinstance(search_history, list) else 0
+                if steps == 0:
+                    # NOTE(review): a top-level `steps` field is not consulted
+                    # here although _prediction_has_effort accepts it; confirm
+                    # this mirrors how the scoring pipeline derives effort.
+                    steps = pred.get('iterations', 0)
+                    try:
+                        steps = float(steps) if steps else 0
+                    except (TypeError, ValueError):
+                        steps = 0
+                # A non-dict trajectory carries no usable fallback values and
+                # must not abort the scan of the whole file.
+                trajectory = pred.get('trajectory')
+                if not isinstance(trajectory, dict):
+                    trajectory = {}
+                effort_dict = {
+                    'steps': steps,
+                    'llm_calls': pred.get('llm_calls') or trajectory.get('llm_calls'),
+                    'effort': pred.get('effort') or trajectory.get('effort'),
+                }
+                val = get_effort_value(effort_dict)
+                # Only positive effort values take part in the comparison.
+                if val > 0:
+                    effort_values.add(val)
+                    if len(effort_values) > 1:
+                        # Two distinct values already prove non-uniformity.
+                        return False
+        return len(effort_values) == 1
+    except Exception:
+        return False
 @st.cache_data(ttl=300)  # Cache for 5 minutes
 def load_eval_results() -> pd.DataFrame:
     """Load evaluation results from JSON files, keeping only the most recent per model."""
                     anls_acc = overall.get("anls", 0.0)
+                    # Detect effort uniformity for Agentic models with Kuiper
+                    kuiper_val = overall.get("kuiper", 0.0)
+                    is_agentic = "Agentic" in tags if isinstance(tags, list) else False
+                    effort_uniform = False
+                    if is_agentic and kuiper_val and EVAL_AVAILABLE:
+                        effort_uniform = _detect_effort_uniform(result_file, data)
                     result_dict = {
                         "Model": model_name,
                         "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
                         "Attribution (Page F1)": overall.get("page_f1", 0.0),
                         "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
                         # Calibration metric
+                        "Effort (Kuiper)": kuiper_val,
+                        "_effort_uniform": effort_uniform,
                         "Submission Date": data.get("submission_date", ""),
                         "Link": data.get("link", ""),
                         "Description": data.get("description", metadata.get("description", "")) or
                 tags = row.get("Tags", [])
                 is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
                 if is_conventional_rag:
+                    cell_html = "—"
                 else:
                     try:
                         cell_html = f"{float(value):.1f}" if value else "0"
                     except (ValueError, TypeError):
                         cell_html = str(value)
+                    if row.get("_effort_uniform", False) and cell_html != "0":
+                        tooltip = "This agent uses the same effort for all samples, so effort-invariance metric is not meaningful."
+                        cell_html = f'<span style="color: #888; cursor: help;" title="{tooltip}">({cell_html})</span>'
                 cells.append(f'<td style="text-align: center;">{cell_html}</td>')
             elif col == "Organization":
                 cell_html = str(value) if value else ""
     with col3:
         if is_conventional_rag:
             st.metric("Effort (Kuiper)", "—")
+        elif model_data.get('_effort_uniform', False):
+            kuiper = model_data.get('Effort (Kuiper)', 0)
+            st.metric("Effort (Kuiper)", f"({kuiper:.2f})" if kuiper else "N/A", help="This agent uses the same effort for all samples, so effort-invariance metric is not meaningful.")
         else:
             kuiper = model_data.get('Effort (Kuiper)', 0)
             st.metric("Effort (Kuiper)", f"{kuiper:.2f}" if kuiper else "N/A")
         st.info("Per-domain breakdown not available for this submission. Newer submissions will include this data.")
+def _prediction_has_effort(pred: dict) -> bool:
+    """Tell whether a prediction carries at least one usable effort measure.
+
+    A prediction qualifies when any of the following holds:
+      * ``search_history`` is a non-empty list;
+      * one of the top-level ``iterations``, ``steps``, ``llm_calls`` or
+        ``effort`` fields converts to a float greater than zero;
+      * ``trajectory`` is a dict whose ``llm_calls`` or ``effort`` field
+        converts to a float greater than zero.
+
+    Values that are ``None`` or that ``float()`` rejects with ``TypeError`` /
+    ``ValueError`` are treated as absent.
+    """
+    def _is_positive(raw) -> bool:
+        # Missing or non-numeric values simply do not count as effort.
+        if raw is None:
+            return False
+        try:
+            return float(raw) > 0
+        except (TypeError, ValueError):
+            return False
+
+    history = pred.get('search_history', [])
+    if isinstance(history, list) and history:
+        return True
+    if any(_is_positive(pred.get(name)) for name in ('iterations', 'steps', 'llm_calls', 'effort')):
+        return True
+    nested = pred.get('trajectory', {})
+    if not isinstance(nested, dict):
+        return False
+    return any(_is_positive(nested.get(name)) for name in ('llm_calls', 'effort'))
 def validate_jsonl_submission(file_content: str) -> tuple[bool, str, list]:
     """Validate JSONL submission format and return parsed predictions."""
     try:
         **Optional fields (for full metrics):**
         - `citations`: List of `{"file": "...", "page": N}` for attribution metrics
         - `id`: Question ID (fallback matching)
+        **Effort fields (required for Agentic submissions, at least one per sample):**
+        - `steps`: Number of agentic steps taken (positive integer)
+        - `search_history`: List of search queries performed (e.g. `["query1", "query2"]`)
+        - `effort`: Generic effort measure (positive number), should be proportional to the number of searches, LLM calls, or reasoning tokens generated, in this order of preference
         """)
     # Initialize session state for evaluation results
             st.markdown("#### Step 3: Submit to Leaderboard")
             if st.button("Submit to Leaderboard", type="primary", disabled=not (model_name and organization and model_type)):
+                # Validate required fields
+                submit_error = None
                 if not model_name or not organization or not model_type:
+                    submit_error = "Please fill in all required fields (Model Name, Organization, Model Type)"
+                elif "Agentic" in selected_tags and st.session_state.predictions:
+                    missing_effort = [
+                        (i + 1, p.get('question', '')[:60])
+                        for i, p in enumerate(st.session_state.predictions)
+                        if not _prediction_has_effort(p)
+                    ]
+                    if missing_effort:
+                        samples = "; ".join(f"line {ln}: {q}..." for ln, q in missing_effort[:5])
+                        extra = f" (and {len(missing_effort) - 5} more)" if len(missing_effort) > 5 else ""
+                        submit_error = (
+                            f"**Agentic submissions require effort data for every sample.** "
+                            f"{len(missing_effort)} prediction(s) are missing effort information "
+                            f"(e.g. `iterations`, `steps`, `llm_calls`, `effort`, or `search_history`). "
+                            f"Examples: {samples}{extra}"
+                        )
+                if submit_error:
+                    st.error(submit_error)
                 else:
                     # Get current user for submission tracking
                     hf_user = get_hf_user()
 ##### Effort (Kuiper)
 - **Effort (Kuiper)**: Measures whether computational effort correlates with problem difficulty. Lower values indicate better calibration—the system "knows what it knows" and doesn't waste effort on unsolvable queries
+---
+**Contact:** [lukasz.borchmann@snowflake.com](mailto:lukasz.borchmann@snowflake.com)
             """)
     # ===== SUBMIT TAB =====

eval/metrics.py CHANGED Viewed

@@ -632,7 +632,7 @@ def citation_f1(
     return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
-def _get_effort_value(result: Dict) -> float:
     """Extract effort value with fallbacks.
     Priority: steps -> llm_calls -> effort
@@ -672,7 +672,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
     """
     if np is None:
         raise ImportError("numpy is required for kuiper_statistic; please install numpy")
-    valid = [r for r in results if _get_effort_value(r) > 0]
     if not valid:
         return {
@@ -685,7 +685,7 @@ def kuiper_statistic(results: List[Dict]) -> Dict[str, Any]:
         }
     # Sort by effort (steps -> llm_calls -> effort)
-    sorted_results = sorted(valid, key=_get_effort_value)
     correctness = [1 if r['correct'] else 0 for r in sorted_results]
     y_bar = np.mean(correctness)
@@ -734,8 +734,8 @@ def wasted_effort_ratio(results: List[Dict]) -> Dict[str, float]:
     Returns:
         Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
     """
-    correct_steps = [_get_effort_value(r) for r in results if r.get('correct') and _get_effort_value(r) > 0]
-    incorrect_steps = [_get_effort_value(r) for r in results if not r.get('correct') and _get_effort_value(r) > 0]
     mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
     mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0

     return {'precision': precision, 'recall': recall, 'f1': f1, 'support': len(gt_set)}
+def get_effort_value(result: Dict) -> float:
     """Extract effort value with fallbacks.
     Priority: steps -> llm_calls -> effort
     """
     if np is None:
         raise ImportError("numpy is required for kuiper_statistic; please install numpy")
+    valid = [r for r in results if get_effort_value(r) > 0]
     if not valid:
         return {
         }
     # Sort by effort (steps -> llm_calls -> effort)
+    sorted_results = sorted(valid, key=get_effort_value)
     correctness = [1 if r['correct'] else 0 for r in sorted_results]
     y_bar = np.mean(correctness)
     Returns:
         Dict with 'ratio', 'mean_steps_correct', 'mean_steps_incorrect'
     """
+    correct_steps = [get_effort_value(r) for r in results if r.get('correct') and get_effort_value(r) > 0]
+    incorrect_steps = [get_effort_value(r) for r in results if not r.get('correct') and get_effort_value(r) > 0]
     mean_correct = float(np.mean(correct_steps)) if correct_steps else 0.0
     mean_incorrect = float(np.mean(incorrect_steps)) if incorrect_steps else 0.0