rohan-arora-ibm committed on
Commit
0b73078
·
unverified ·
1 Parent(s): 7225d57

bump: bringing in evaluation code from @ptoroisaza

Browse files
analysis_src/__pycache__/consistency.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
analysis_src/__pycache__/extract_consistency_data.cpython-310.pyc ADDED
Binary file (6.73 kB). View file
 
analysis_src/__pycache__/extract_discovery_trajectory.cpython-310.pyc ADDED
Binary file (21.9 kB). View file
 
analysis_src/__pycache__/extract_inference_data.cpython-310.pyc ADDED
Binary file (14.5 kB). View file
 
analysis_src/__pycache__/extract_majority_vote_data.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
analysis_src/__pycache__/extract_tool_failures.cpython-310.pyc ADDED
Binary file (13 kB). View file
 
analysis_src/__pycache__/model_styles.cpython-310.pyc ADDED
Binary file (4.52 kB). View file
 
analysis_src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.85 kB). View file
 
analysis_src/consistency.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ # TODO: Needs to be integrated into the itbench_leaderboard module
4
+ # This script calculates ICC (Intraclass Correlation Coefficient) and other
5
+ # consistency metrics for agent evaluation results.
6
+
7
+ Consistency Analysis for Agent Leaderboard Results.
8
+
9
+ Computes ICC (Intraclass Correlation Coefficient) to measure the reliability
10
+ and consistency of agent responses across multiple trials per scenario.
11
+
12
+ ICC answers: "Of all the variance observed, how much is due to actual scenario
13
+ difficulty (signal) vs. random model variability (noise/flakiness)?"
14
+
15
+ Interpretation:
16
+ ICC > 0.9: Excellent consistency
17
+ ICC 0.75-0.9: Good consistency
18
+ ICC 0.5-0.75: Moderate consistency
19
+ ICC < 0.5: Poor consistency (high flakiness)
20
+
21
+ Usage:
22
+ python -m itbench_leaderboard.consistency --results-dir leaderboard_results/results
23
+ python -m itbench_leaderboard.consistency --results-file path/to/results.json
24
+ """
25
+
26
+ import argparse
27
+ import json
28
+ import sys
29
+ from dataclasses import dataclass, field
30
+ from pathlib import Path
31
+ from typing import Optional
32
+
33
+ import numpy as np
34
+
35
+
36
@dataclass
class ConsistencyMetrics:
    """Container for all consistency metrics.

    Produced by ``compute_icc`` and rendered via ``__str__`` by
    ``print_detailed_report``.
    """

    # Core ICC metrics
    icc: float  # ICC(1,1); NaN when it cannot be estimated
    flakiness_ratio: float  # 1 - ICC

    # ANOVA components
    msb: float  # Mean Square Between (between-scenario variance)
    msw: float  # Mean Square Within (within-scenario variance)

    # Within-scenario consistency
    mean_within_std: float  # average per-scenario std of trial scores
    mean_agreement_rate: float  # average pairwise agreement across scenarios
    repeatability_coefficient: float  # 1.96 * sqrt(2 * MSW)

    # Summary stats
    n_scenarios: int
    n_trials: int  # trials per scenario used in the computation
    n_flaky_scenarios: int
    flaky_scenarios: list = field(default_factory=list)  # (scenario_id, std) pairs

    # Per-scenario breakdown
    scenario_details: dict = field(default_factory=dict)  # scenario_id -> stats dict

    def __str__(self) -> str:
        # Multi-line human-readable summary.
        return (
            f"ICC: {self.icc:.4f} (flakiness: {self.flakiness_ratio:.4f})\n"
            f"MSB (between): {self.msb:.4f}, MSW (within): {self.msw:.4f}\n"
            f"Mean within-std: {self.mean_within_std:.4f}\n"
            f"Agreement rate: {self.mean_agreement_rate:.4f}\n"
            f"Repeatability coef: {self.repeatability_coefficient:.4f}\n"
            f"Flaky scenarios: {self.n_flaky_scenarios}/{self.n_scenarios}"
        )
71
+
72
+
73
def load_results(filepath: Path) -> dict:
    """Parse and return the JSON results document at *filepath*."""
    raw = Path(filepath).read_text()
    return json.loads(raw)
77
+
78
+
79
def extract_trial_scores(
    results: dict,
    metric: str = "root_cause_entity_f1"
) -> dict[str, list[float]]:
    """
    Extract per-trial scores for a given metric from results.

    Args:
        results: Loaded JSON results
        metric: The metric name to extract (default: root_cause_entity_f1)

    Returns:
        Dict mapping scenario_id -> list of trial scores; scenarios with no
        runs are omitted, and missing/null scores count as 0.0.
    """
    def _run_score(run: dict) -> float:
        value = run.get("scores", {}).get(metric)
        # A null score is treated as a zero-valued trial.
        return 0.0 if value is None else float(value)

    per_scenario = {
        scenario_id: [_run_score(run) for run in data.get("runs", [])]
        for scenario_id, data in results.get("scenarios", {}).items()
    }

    # Keep only scenarios that recorded at least one run.
    return {sid: trials for sid, trials in per_scenario.items() if trials}
113
+
114
+
115
def calculate_agreement_rate(trials: list[float], tolerance: float = 0.1) -> float:
    """
    Calculate agreement rate between trial pairs.

    Args:
        trials: List of trial scores
        tolerance: Maximum difference to consider as "agreement"

    Returns:
        Fraction of trial pairs that agree (0-1); 1.0 when fewer than
        two trials exist (nothing to disagree about).
    """
    from itertools import combinations

    if len(trials) < 2:
        return 1.0

    total_pairs = 0
    agreeing_pairs = 0
    for first, second in combinations(trials, 2):
        total_pairs += 1
        if abs(first - second) <= tolerance:
            agreeing_pairs += 1

    return agreeing_pairs / total_pairs
134
+
135
+
136
def compute_icc(scenario_trials: dict[str, list[float]]) -> ConsistencyMetrics:
    """
    Compute ICC(1,1) - one-way random effects model.

    The ICC formula:
        ICC = (MSB - MSW) / (MSB + (k-1) * MSW)

    Where:
        MSB = k * Var(scenario_means)   [between-scenario variance]
        MSW = Mean(Var(trials per scenario))   [within-scenario variance]
        k = number of trials per scenario

    Args:
        scenario_trials: Dict mapping scenario_id -> list of trial scores

    Returns:
        ConsistencyMetrics with ICC and related metrics
    """
    # Convert to numpy array
    scenarios = list(scenario_trials.keys())

    # Ensure all scenarios have same number of trials
    n_trials_list = [len(trials) for trials in scenario_trials.values()]
    if len(set(n_trials_list)) > 1:
        # Pad or truncate to minimum
        # (in practice only truncation happens: each scenario is cut to the
        # shortest trial count so `scores` is rectangular)
        k = min(n_trials_list)
        scores = np.array([scenario_trials[s][:k] for s in scenarios])
    else:
        k = n_trials_list[0] if n_trials_list else 0
        scores = np.array([scenario_trials[s] for s in scenarios])

    n_scenarios = len(scenarios)

    # Degenerate input (no scenarios or zero trials): ICC is undefined -> NaN.
    if n_scenarios == 0 or k == 0:
        return ConsistencyMetrics(
            icc=float('nan'),
            flakiness_ratio=float('nan'),
            msb=0.0,
            msw=0.0,
            mean_within_std=0.0,
            mean_agreement_rate=1.0,
            repeatability_coefficient=0.0,
            n_scenarios=0,
            n_trials=0,
            n_flaky_scenarios=0,
        )

    # Calculate scenario means
    scenario_means = np.mean(scores, axis=1)

    # Between-scenario variance (MSB)
    # MSB = k * Var(scenario means); a sample variance needs >= 2 scenarios
    msb = k * np.var(scenario_means, ddof=1) if n_scenarios > 1 else 0.0

    # Within-scenario variance (MSW)
    # MSW = average of within-scenario variances; zero when k == 1
    within_vars = np.var(scores, axis=1, ddof=1) if k > 1 else np.zeros(n_scenarios)
    msw = np.mean(within_vars)

    # ICC(1,1) formula
    denominator = msb + (k - 1) * msw
    if denominator > 0:
        icc = (msb - msw) / denominator
        icc = max(0.0, icc)  # ICC can be negative, clip to 0
    else:
        # Zero variance everywhere carries no information -> NaN; otherwise 0.
        icc = float('nan') if msw == 0 and msb == 0 else 0.0

    # Within-scenario standard deviations
    within_stds = np.std(scores, axis=1, ddof=1) if k > 1 else np.zeros(n_scenarios)
    mean_within_std = np.mean(within_stds)

    # Agreement rates
    # NOTE(review): these use the full (untruncated) trial lists, while the
    # variance terms above may have been truncated to k trials — confirm intended.
    agreement_rates = [
        calculate_agreement_rate(scenario_trials[s])
        for s in scenarios
    ]
    mean_agreement_rate = np.mean(agreement_rates)

    # Repeatability coefficient (95% of repeat differences < RC)
    rc = 1.96 * np.sqrt(2 * msw) if msw > 0 else 0.0

    # Identify flaky scenarios (high within-variance)
    flaky_threshold = 0.3
    flaky_scenarios = [
        (s, float(std))
        for s, std in zip(scenarios, within_stds)
        if std > flaky_threshold
    ]

    # Per-scenario details
    scenario_details = {}
    for i, s in enumerate(scenarios):
        scenario_details[s] = {
            "trials": scenario_trials[s],
            "mean": float(scenario_means[i]),
            "std": float(within_stds[i]),
            "agreement_rate": agreement_rates[i],
            "is_flaky": within_stds[i] > flaky_threshold,
        }

    return ConsistencyMetrics(
        icc=float(icc),
        flakiness_ratio=float(1 - icc) if not np.isnan(icc) else float('nan'),
        msb=float(msb),
        msw=float(msw),
        mean_within_std=float(mean_within_std),
        mean_agreement_rate=float(mean_agreement_rate),
        repeatability_coefficient=float(rc),
        n_scenarios=n_scenarios,
        n_trials=k,
        n_flaky_scenarios=len(flaky_scenarios),
        flaky_scenarios=flaky_scenarios,
        scenario_details=scenario_details,
    )
250
+
251
+
252
def analyze_results_file(
    filepath: Path,
    metrics: list[str] | None = None,
) -> dict[str, ConsistencyMetrics]:
    """
    Analyze a single results file for multiple metrics.

    Args:
        filepath: Path to the results JSON file
        metrics: List of metrics to analyze. Defaults to common metrics.

    Returns:
        Dict mapping metric_name -> ConsistencyMetrics (metrics with no
        extracted trials are omitted)
    """
    chosen = metrics if metrics is not None else [
        "root_cause_entity_f1",
        "root_cause_proximity_with_fp_f1",
        "propagation_chain",
    ]

    results = load_results(filepath)

    analysis: dict[str, ConsistencyMetrics] = {}
    for name in chosen:
        trials = extract_trial_scores(results, name)
        if not trials:
            continue
        analysis[name] = compute_icc(trials)

    return analysis
282
+
283
+
284
def compare_models(
    results_dir: Path,
    model_patterns: list[str],
    metric: str = "root_cause_entity_f1",
) -> dict[str, ConsistencyMetrics]:
    """
    Compare ICC across multiple models.

    Args:
        results_dir: Directory containing results JSON files
        model_patterns: List of model name patterns to match
        metric: The metric to analyze

    Returns:
        Dict mapping model_name -> ConsistencyMetrics
    """
    comparison: dict[str, ConsistencyMetrics] = {}

    for pattern in model_patterns:
        # Find matching file
        candidates = list(results_dir.glob(f"*{pattern}*.json"))
        if not candidates:
            print(f"Warning: No file found for pattern '{pattern}'", file=sys.stderr)
            continue

        # Use the first file glob returns for the pattern.
        result_path = candidates[0]
        print(f"Analyzing: {result_path.name}")

        data = load_results(result_path)
        trials = extract_trial_scores(data, metric)
        if not trials:
            continue

        label = data.get("agent_name", result_path.stem)
        comparison[label] = compute_icc(trials)

    return comparison
321
+
322
+
323
def print_comparison_table(
    comparison: dict[str, ConsistencyMetrics],
    metric: str,
) -> None:
    """Render the per-model ICC comparison as a fixed-width text table."""
    print(f"\n{'='*80}")
    print(f"ICC Comparison for metric: {metric}")
    print(f"{'='*80}\n")

    # Header
    print(f"{'Model':<55} {'ICC':>8} {'Flaky%':>8} {'Std':>8} {'Agree%':>8}")
    print("-" * 91)

    def _rank(item):
        # NaN ICCs sink to the bottom of the descending sort.
        value = item[1].icc
        return value if not np.isnan(value) else -1

    # Sort by ICC descending
    for model, metrics in sorted(comparison.items(), key=_rank, reverse=True):
        # Truncate model name if too long
        display_name = model[:52] + "..." if len(model) > 55 else model

        icc_str = f"{metrics.icc:.4f}" if not np.isnan(metrics.icc) else "N/A"
        flaky_pct = f"{metrics.flakiness_ratio*100:.1f}%" if not np.isnan(metrics.flakiness_ratio) else "N/A"

        print(
            f"{display_name:<55} "
            f"{icc_str:>8} "
            f"{flaky_pct:>8} "
            f"{metrics.mean_within_std:>8.4f} "
            f"{metrics.mean_agreement_rate*100:>7.1f}%"
        )

    print("\nInterpretation:")
    print(" ICC > 0.9: Excellent consistency")
    print(" ICC 0.75-0.9: Good consistency")
    print(" ICC 0.5-0.75: Moderate consistency")
    print(" ICC < 0.5: Poor consistency (high flakiness)")
363
+
364
+
365
def print_detailed_report(
    model_name: str,
    metrics_analysis: dict[str, ConsistencyMetrics],
) -> None:
    """Print a per-metric consistency report, highlighting flaky scenarios."""
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Detailed Consistency Report: {model_name}")
    print(f"{banner}\n")

    for metric_name, cm in metrics_analysis.items():
        print(f"\n--- {metric_name} ---")
        print(cm)

        if not cm.flaky_scenarios:
            continue

        print(f"\nFlaky scenarios (std > 0.3):")
        # Show at most the ten scenarios with the highest within-trial std.
        worst_first = sorted(cm.flaky_scenarios, key=lambda pair: pair[1], reverse=True)
        for scenario, std in worst_first[:10]:
            trials = cm.scenario_details.get(scenario, {}).get("trials", [])
            print(f" {scenario}: std={std:.3f}, trials={trials}")
384
+
385
+
386
def main():
    """CLI entry point: run single-file analysis or multi-model comparison.

    Two modes:
      * --results-file: detailed report for one results JSON.
      * default: compare models matched by --models patterns in --results-dir.
    Optionally writes a summary JSON via --output-json.
    """
    parser = argparse.ArgumentParser(
        description="Calculate ICC and consistency metrics for leaderboard results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("leaderboard_results/results"),
        help="Directory containing results JSON files",
    )

    parser.add_argument(
        "--results-file",
        type=Path,
        help="Analyze a single results file",
    )

    parser.add_argument(
        "--models",
        nargs="+",
        default=[
            "react with code_Azure_o4-mini",
            "react with code_Azure_gpt-5.1-2025-11-13",
            "react with code_gcp_gemini-3-pro-preview",
            "react with code_GCP_gemini-2.5-pro",
        ],
        help="Model name patterns to compare",
    )

    parser.add_argument(
        "--metric",
        type=str,
        default="root_cause_entity_f1",
        help="Metric to analyze (default: root_cause_entity_f1)",
    )

    parser.add_argument(
        "--all-metrics",
        action="store_true",
        help="Analyze all common metrics",
    )

    parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed per-scenario breakdown",
    )

    parser.add_argument(
        "--output-json",
        type=Path,
        help="Save results to JSON file",
    )

    args = parser.parse_args()

    # Determine metrics to analyze
    if args.all_metrics:
        metrics = [
            "root_cause_entity_f1",
            "root_cause_entity_precision",
            "root_cause_entity_recall",
            "root_cause_proximity_with_fp_f1",
            "propagation_chain",
            "fault_localization_component_identification",
        ]
    else:
        metrics = [args.metric]

    # model_name -> metric -> summary-dict, serialized at the end if requested.
    results_to_save = {}

    if args.results_file:
        # Single file analysis
        print(f"Analyzing: {args.results_file}")
        analysis = analyze_results_file(args.results_file, metrics)

        results = load_results(args.results_file)
        model_name = results.get("agent_name", args.results_file.stem)

        print_detailed_report(model_name, analysis)

        # NOTE(review): unlike the comparison branch below, NaN ICC values are
        # stored as-is here; json.dump emits them as non-standard `NaN` literals.
        results_to_save[model_name] = {
            m: {
                "icc": cm.icc,
                "flakiness_ratio": cm.flakiness_ratio,
                "mean_within_std": cm.mean_within_std,
                "mean_agreement_rate": cm.mean_agreement_rate,
                "n_flaky_scenarios": cm.n_flaky_scenarios,
                "n_scenarios": cm.n_scenarios,
            }
            for m, cm in analysis.items()
        }
    else:
        # Multi-model comparison
        for metric in metrics:
            comparison = compare_models(args.results_dir, args.models, metric)
            print_comparison_table(comparison, metric)

            # Store results (NaN mapped to None for valid JSON)
            for model, cm in comparison.items():
                if model not in results_to_save:
                    results_to_save[model] = {}
                results_to_save[model][metric] = {
                    "icc": cm.icc if not np.isnan(cm.icc) else None,
                    "flakiness_ratio": cm.flakiness_ratio if not np.isnan(cm.flakiness_ratio) else None,
                    "mean_within_std": cm.mean_within_std,
                    "mean_agreement_rate": cm.mean_agreement_rate,
                    "n_flaky_scenarios": cm.n_flaky_scenarios,
                    "n_scenarios": cm.n_scenarios,
                }

            if args.detailed:
                for model, cm in comparison.items():
                    print_detailed_report(model, {metric: cm})

    # Save to JSON if requested
    if args.output_json:
        with open(args.output_json, "w") as f:
            json.dump(results_to_save, f, indent=2)
        print(f"\nResults saved to: {args.output_json}")


if __name__ == "__main__":
    main()
513
+
analysis_src/extract_consistency_data.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract consistency (ICC) and performance data for all 'react with code' agents.
4
+
5
+ This script reads directly from the run directories (not JSON result files)
6
+ to ensure all trials are captured.
7
+
8
+ Output is saved to paper_analysis/react with code/resources/figures/consistency/ as CSV files for plotting.
9
+ """
10
+
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+ from dataclasses import dataclass, field
15
+ import numpy as np
16
+ import pandas as pd
17
+ from tqdm import tqdm
18
+
19
+ # Add project root to path
20
+ PROJECT_ROOT = Path(__file__).parent.parent
21
+ sys.path.insert(0, str(PROJECT_ROOT))
22
+
23
+ from src.consistency import (
24
+ compute_icc,
25
+ ConsistencyMetrics,
26
+ )
27
+ from src.utils import (
28
+ get_model_name,
29
+ find_react_with_code_dirs,
30
+ read_judge_outputs_from_dir,
31
+ extract_trial_scores_from_judge_outputs,
32
+ get_runs_stats,
33
+ filter_scenarios_with_min_runs,
34
+ )
35
+
36
# Paths
LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"
# NOTE(review): RESULTS_JSON_DIR is not referenced elsewhere in this script's
# visible code — confirm whether it is still needed.
RESULTS_JSON_DIR = LEADERBOARD_DIR / "results"
OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "consistency"

# Minimum runs per scenario required for inclusion
MIN_RUNS_PER_SCENARIO = 3

# Minimum scenarios needed after filtering (must have at least this many with 3+ runs)
MIN_QUALIFYING_SCENARIOS = 20

# Metrics to analyze
METRICS = [
    "root_cause_entity_f1",
    "root_cause_entity_precision",
    "root_cause_entity_recall",
    "root_cause_proximity_with_fp_f1",
    "propagation_chain",
    "fault_localization_component_identification",
]

# Short names for display (raw metric name -> table/plot label)
METRIC_SHORT_NAMES = {
    "root_cause_entity_f1": "RC Entity F1",
    "root_cause_entity_precision": "RC Entity Prec",
    "root_cause_entity_recall": "RC Entity Rec",
    "root_cause_proximity_with_fp_f1": "RC Proximity F1",
    "propagation_chain": "Prop. Chain",
    "fault_localization_component_identification": "Fault Loc.",
}
66
+
67
def extract_all_data() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Extract ICC and performance data for all agents by reading from directories.

    Returns:
        - icc_df: ICC scores per model per metric
        - perf_df: Performance averages per model per metric
        - scenario_df: Per-scenario breakdown (root_cause_entity_f1 only)
    """
    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories:")
    for d in agent_dirs:
        print(f" - {d.name}")

    icc_records = []
    perf_records = []
    scenario_records = []

    valid_models = []
    skipped_models = []  # (model_name, reason, n_qualifying) triples

    for agent_dir in tqdm(agent_dirs, desc="Reading agent data"):
        model_name = get_model_name(agent_dir.name)

        print(f"\nReading: {agent_dir.name}")
        scenario_data = read_judge_outputs_from_dir(agent_dir)

        # NOTE(review): min_runs / max_runs are returned but never used below.
        n_scenarios, min_runs, max_runs, n_qualifying = get_runs_stats(scenario_data, MIN_RUNS_PER_SCENARIO)

        if n_scenarios == 0:
            print(f" SKIPPING {model_name}: No judge outputs found")
            skipped_models.append((model_name, "No data", 0))
            continue

        # Require a minimum population of sufficiently-sampled scenarios.
        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying}/{n_scenarios} scenarios have {MIN_RUNS_PER_SCENARIO}+ runs")
            skipped_models.append((model_name, f"{n_qualifying}/{n_scenarios} qualifying", n_qualifying))
            continue

        # Filter to only include scenarios with enough runs
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        n_scenarios_filtered = len(scenario_data)

        print(f" Processing: {model_name} ({n_scenarios_filtered} scenarios with {MIN_RUNS_PER_SCENARIO}+ runs)")
        valid_models.append(model_name)

        for metric in tqdm(METRICS, desc=f" {model_name} metrics", leave=False):
            # Extract trial scores
            scenario_trials = extract_trial_scores_from_judge_outputs(scenario_data, metric)

            if not scenario_trials:
                continue

            # Calculate performance average across every trial of every scenario
            all_scores = [s for trials in scenario_trials.values() for s in trials]
            perf_avg = np.mean(all_scores) if all_scores else 0.0

            perf_records.append({
                "model": model_name,
                "metric": METRIC_SHORT_NAMES.get(metric, metric),
                "metric_raw": metric,
                "performance": perf_avg,
            })

            # ICC calculation
            try:
                icc_metrics = compute_icc(scenario_trials)

                # NaN ICC is coerced to 0.0 (and flakiness to 1.0) so the
                # downstream CSV stays numeric.
                icc_records.append({
                    "model": model_name,
                    "metric": METRIC_SHORT_NAMES.get(metric, metric),
                    "metric_raw": metric,
                    "icc": icc_metrics.icc if not np.isnan(icc_metrics.icc) else 0.0,
                    "flakiness": icc_metrics.flakiness_ratio if not np.isnan(icc_metrics.flakiness_ratio) else 1.0,
                    "within_std": icc_metrics.mean_within_std,
                    "agreement_rate": icc_metrics.mean_agreement_rate,
                    "n_flaky_scenarios": icc_metrics.n_flaky_scenarios,
                    "n_scenarios": icc_metrics.n_scenarios,
                })

                # Per-scenario data (only for root_cause_entity_f1)
                if metric == "root_cause_entity_f1":
                    for scenario_id, details in icc_metrics.scenario_details.items():
                        scenario_records.append({
                            "model": model_name,
                            "scenario": scenario_id,
                            "mean": details["mean"],
                            "std": details["std"],
                            "trials": details["trials"],
                            "is_flaky": details["is_flaky"],
                        })
            except Exception as e:
                # Best-effort: one failing metric should not abort the model.
                print(f" Error computing ICC for {metric}: {e}")
                continue

    if skipped_models:
        print(f"\n⚠️ Skipped {len(skipped_models)} models:")
        for name, reason, _ in skipped_models:
            print(f" - {name}: {reason}")

    print(f"\n✓ Included {len(valid_models)} models: {valid_models}")

    icc_df = pd.DataFrame(icc_records)
    perf_df = pd.DataFrame(perf_records)
    scenario_df = pd.DataFrame(scenario_records)

    return icc_df, perf_df, scenario_df
174
+
175
+
176
def save_data(icc_df: pd.DataFrame, perf_df: pd.DataFrame, scenario_df: pd.DataFrame):
    """Save extracted data to CSV files plus a small summary JSON."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    outputs = [
        (OUTPUT_DIR / "icc_data.csv", icc_df),
        (OUTPUT_DIR / "performance_data.csv", perf_df),
        (OUTPUT_DIR / "scenario_data.csv", scenario_df),
    ]
    for path, frame in outputs:
        frame.to_csv(path, index=False)

    print(f"\nData saved to:")
    for path, _ in outputs:
        print(f" - {path}")

    # Also save a summary JSON
    summary = {
        "models": icc_df["model"].unique().tolist(),
        "metrics": icc_df["metric"].unique().tolist(),
        "n_scenarios": int(icc_df["n_scenarios"].max()) if len(icc_df) > 0 else 0,
        "min_runs_required": MIN_RUNS_PER_SCENARIO,
    }

    summary_path = OUTPUT_DIR / "analysis_summary.json"
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f" - {summary_path}")
205
+
206
+
207
def print_summary(icc_df: pd.DataFrame, perf_df: pd.DataFrame):
    """Print summary tables."""
    rule = "=" * 80

    # --- ICC table ---
    print("\n" + rule)
    print("ICC Summary (root_cause_entity_f1)")
    print(rule)

    icc_rows = (
        icc_df[icc_df["metric_raw"] == "root_cause_entity_f1"]
        .copy()
        .sort_values("icc", ascending=False)
    )

    print(f"\n{'Model':<20} {'ICC':>8} {'Flaky%':>8} {'Std':>8} {'Agree%':>8}")
    print("-" * 56)
    for _, row in icc_rows.iterrows():
        print(f"{row['model']:<20} {row['icc']:>8.4f} {row['flakiness']*100:>7.1f}% {row['within_std']:>8.4f} {row['agreement_rate']*100:>7.1f}%")

    # --- Performance table ---
    print("\n" + rule)
    print("Performance Summary (root_cause_entity_f1)")
    print(rule)

    perf_rows = (
        perf_df[perf_df["metric_raw"] == "root_cause_entity_f1"]
        .copy()
        .sort_values("performance", ascending=False)
    )

    print(f"\n{'Model':<20} {'Avg Score':>12}")
    print("-" * 34)
    for _, row in perf_rows.iterrows():
        print(f"{row['model']:<20} {row['performance']:>12.4f}")
232
+
233
+
234
def main():
    """Entry point: extract, persist, and summarize consistency data."""
    header = [
        "Extracting consistency data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
        f"Minimum runs per scenario: {MIN_RUNS_PER_SCENARIO}",
    ]
    for line in header:
        print(line)

    icc_df, perf_df, scenario_df = extract_all_data()

    # Nothing qualified for analysis — bail out early.
    if len(icc_df) == 0:
        print("No data extracted!")
        return

    save_data(icc_df, perf_df, scenario_df)
    print_summary(icc_df, perf_df)


if __name__ == "__main__":
    main()
analysis_src/extract_discovery_trajectory.py ADDED
@@ -0,0 +1,928 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Root Cause Discovery Trajectory Analysis
4
+
5
+ Analyzes how agents discover root cause entities:
6
+ - T_encounter: When GT entity first appears in tool output
7
+ - T_investigate: When agent actively queries GT entity
8
+ - T_assert: When agent asserts GT entity as root cause
9
+ - T_exonerate: When agent dismisses GT entity (if ever)
10
+ - T_recover: When agent corrects after exoneration
11
+
12
+ Metrics computed:
13
+ - Discovery efficiency (how early GT appears)
14
+ - Investigation delay (turns between seeing and investigating)
15
+ - Assertion delay (turns to confirm after investigating)
16
+ - Recovery rate (% of trials with successful recovery)
17
+ """
18
+
19
+ import json
20
+ import sys
21
+ import re
22
+ import yaml
23
+ from pathlib import Path
24
+ from dataclasses import dataclass, field, asdict
25
+ from typing import Optional, List, Dict, Any
26
+ import numpy as np
27
+ import pandas as pd
28
+ import matplotlib.pyplot as plt
29
+ import seaborn as sns
30
+ import plotly.graph_objects as go
31
+ from tqdm import tqdm
32
+
33
+ PROJECT_ROOT = Path(__file__).parent.parent
34
+ sys.path.insert(0, str(PROJECT_ROOT))
35
+
36
+ from src.utils import find_latest_rollout_file
37
+
38
+ from src.model_styles import (
39
+ get_display_name, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, PLOT_PARAMETERS
40
+ )
41
+
42
# Improved regex to capture standard K8s resource patterns: namespace/Kind/name
# Captures: (namespace, Kind, name)
K8S_ENTITY_PATTERN = re.compile(r'([\w-]+)/(Deployment|Service|Pod|ReplicaSet|ResourceQuota|StatefulSet|DaemonSet|Job|CronJob|ConfigMap|Secret|Endpoints|Ingress|PersistentVolumeClaim|PersistentVolume|ServiceAccount|Role|RoleBinding|ClusterRole|ClusterRoleBinding|NetworkPolicy|HorizontalPodAutoscaler|Node)/([\w-]+)', re.IGNORECASE)

def extract_k8s_entities(text: str) -> List[str]:
    """Extract all K8s entities matching the standard pattern.

    Each match is normalized to the "namespace/Kind/name" form, preserving
    the casing found in *text*.
    """
    return [
        f"{namespace}/{kind}/{name}"
        for namespace, kind, name in K8S_ENTITY_PATTERN.findall(text)
    ]
55
+
56
# Paths
# NOTE(review): PROJECT_ROOT is already defined identically near the top of this
# module (before sys.path is patched); this re-definition is redundant but harmless.
PROJECT_ROOT = Path(__file__).parent.parent
LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"
GT_DIR = PROJECT_ROOT / "data" / "itbench-snapshots"
OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "discovery"
61
+
62
@dataclass
class GroundTruth:
    """Ground truth root cause entity info for one scenario."""
    scenario: str  # scenario identifier
    entity_name: str  # name of the root-cause entity
    entity_kind: str  # K8s kind of the root-cause entity
    group_id: str  # group identifier of the root-cause entity
    filters: List[str]  # regex patterns to match entity
    aliases: List[str]  # related entity group IDs
    propagation_entities: set = field(default_factory=set)  # All entities involved in propagation
    all_entities: list = field(default_factory=list)  # All entities defined in the scenario
    entity_filters: Dict[str, List[str]] = field(default_factory=dict)  # group_id -> filters mapping for all entities
74
+
75
+
76
@dataclass
class EntityMention:
    """A mention of an entity in the agent's trajectory.

    One record per place the ground-truth entity (or a related entity)
    surfaced while replaying a rollout file.
    """
    turn: int            # 1-based turn number the mention occurred in
    mention_type: str  # 'encounter', 'investigate', 'assert', 'exonerate'
    context: str  # 'tool_output', 'tool_args', 'reasoning', 'final_output'
    text_snippet: str    # truncated excerpt of the surrounding text
    sentiment: str  # 'positive', 'negative', 'neutral'
84
+
85
+
86
@dataclass
class TrajectoryAnalysis:
    """Analysis results for a single trial (one rollout of one scenario).

    Produced by ``parse_rollout``; ``model``/``scenario``/``trial`` and the
    judge-derived fields are filled in afterwards by ``analyze_model``.
    """
    model: str
    scenario: str
    trial: int
    total_turns: int
    gt_entity: str       # ground-truth entity name this trial was scored against

    # Key timestamps (turn numbers, None if not found)
    t_encounter: Optional[int] = None    # GT first appeared in a tool OUTPUT (passive)
    t_investigate: Optional[int] = None  # GT first appeared in tool ARGUMENTS (active)
    t_assert: Optional[int] = None       # GT first declared as root cause
    t_exonerate: Optional[int] = None    # GT first ruled out
    t_recover: Optional[int] = None      # assertion after an earlier exoneration

    # Final outcome (from judge scores if available)
    final_success: bool = False  # Did the final answer include GT?
    root_cause_f1: Optional[float] = None

    # Pipeline stage reached (for funnel analysis)
    # 0=none, 1=encounter, 2=investigate, 3=assert, 4=success
    max_stage_reached: int = 0

    # All mentions for detailed analysis
    mentions: List[EntityMention] = field(default_factory=list)

    # Exploration metrics
    total_entities_available: int = 0     # proxy: equals unique_entities_encountered
    unique_entities_encountered: int = 0  # distinct entities seen in tool outputs
    unique_entities_investigated: int = 0 # distinct entities queried via tool args
    exploration_ratio: float = 0.0  # investigated / available

    # Coverage metrics
    on_chain_investigated: int = 0        # groups on the fault-propagation chain that were queried
    off_chain_investigated: int = 0  # Detoured
    propagation_coverage: float = 0.0  # % of chain entities investigated
    detour_rate: float = 0.0  # off_chain / total_investigated

    # Computed metrics
    discovery_efficiency: Optional[float] = None  # t_encounter / total_turns
    investigation_delay: Optional[int] = None  # t_investigate - t_encounter
    assertion_delay: Optional[int] = None  # t_assert - t_investigate
    had_recovery: bool = False  # True when t_exonerate < t_assert
130
+
131
+
132
def check_entity_match(text: str, entity_info: Dict) -> bool:
    """Check if text matches an arbitrary entity definition.

    An entity matches when its ``id`` appears in the text, or when any of
    its ``filter`` patterns (with regex syntax stripped down to a plain
    substring) appears in the text. Comparison is case-insensitive.
    """
    haystack = text.lower()

    # Direct match on the entity id, when one is defined.
    ident = entity_info.get('id', '').lower()
    if ident and ident in haystack:
        return True

    # Fall back to filter patterns: strip the regex machinery so a plain
    # substring test is enough; empty terms never match.
    stripped_terms = (
        pattern.replace('\\b', '').replace('-.*', '').replace('.*', '')
        for pattern in entity_info.get('filter', [])
    )
    return any(term and term.lower() in haystack for term in stripped_terms)
149
+
150
+
151
def load_ground_truth(scenario: str) -> Optional[GroundTruth]:
    """Load and parse ground truth YAML for a scenario.

    Reads ``GT_DIR/<scenario>/ground_truth.yaml`` and returns a populated
    ``GroundTruth``, or None when the file is missing or no group is
    flagged ``root_cause: true``.
    """
    gt_path = GT_DIR / scenario / "ground_truth.yaml"
    if not gt_path.exists():
        return None

    with open(gt_path) as f:
        gt_data = yaml.safe_load(f)

    # Find the root cause group: the first group flagged root_cause.
    root_cause_group = None
    all_groups = gt_data.get('groups', [])

    for group in all_groups:
        if group.get('root_cause', False):
            root_cause_group = group
            break

    if not root_cause_group:
        return None

    # Get fault entity info (only the first fault entry is used).
    fault_list = gt_data.get('fault', [])
    fault_info = fault_list[0] if fault_list else {}
    entity_info = fault_info.get('entity', {})

    # Collect all aliases: every alias group that contains the root-cause
    # id contributes all of its members (the root-cause id itself included).
    aliases = []
    for alias_group in gt_data.get('aliases', []):
        if root_cause_group['id'] in alias_group:
            aliases.extend(alias_group)

    # Collect all entities in the propagation chain (both endpoints of
    # every propagation edge).
    propagation_entities = set()
    for prop in gt_data.get('propagations', []):
        if 'source' in prop:
            propagation_entities.add(prop['source'])
        if 'target' in prop:
            propagation_entities.add(prop['target'])

    # Add root cause itself if not already there (it should be as source)
    propagation_entities.add(root_cause_group['id'])

    # Build entity_filters mapping: group_id -> list of filter patterns.
    # The group id and the 'name' field double as extra filters so plain
    # substring matching can find the entity too.
    entity_filters = {}
    for group in all_groups:
        group_id = group.get('id', '')
        filters = group.get('filter', [])
        # Also use the group id itself and 'name' field as filters
        name = group.get('name', '')
        all_filters = list(filters) if filters else []
        if group_id:
            all_filters.append(group_id)
        if name and name != group_id:
            all_filters.append(name)
        entity_filters[group_id] = all_filters

    gt_obj = GroundTruth(
        scenario=scenario,
        entity_name=entity_info.get('name', root_cause_group['id']),
        entity_kind=root_cause_group.get('kind', 'Unknown'),
        group_id=root_cause_group['id'],
        filters=root_cause_group.get('filter', []),
        aliases=aliases,
        propagation_entities=propagation_entities,
        entity_filters=entity_filters
    )

    # Attach all entities for exploration analysis
    gt_obj.all_entities = all_groups
    return gt_obj
222
+
223
+
224
def entity_matches(text: str, gt: GroundTruth) -> bool:
    """Check if *text* mentions the ground truth entity.

    Matching is deliberately fuzzy and case-insensitive: the entity name,
    the group id (with hyphens normalized to spaces or removed), any filter
    pattern (regex syntax stripped to a substring), and any alias all count
    as a mention.
    """
    text_lower = text.lower()

    # Check direct name match
    if gt.entity_name.lower() in text_lower:
        return True

    # Check group_id match, hyphen-insensitively, so "cart-service",
    # "cart service" and "cartservice" all line up.
    if gt.group_id.lower().replace('-', ' ') in text_lower.replace('-', ' '):
        return True
    if gt.group_id.lower().replace('-', '') in text_lower.replace('-', ''):
        return True

    # Check filter patterns: strip regex syntax and substring-match.
    # Guard against patterns that reduce to the empty string (e.g. ".*"),
    # which would otherwise match EVERY text ('' is a substring of
    # anything) — consistent with check_entity_match.
    for pattern in gt.filters:
        search_term = pattern.replace('\\b', '').replace('-.*', '').replace('.*', '')
        if search_term and search_term.lower() in text_lower:
            return True

    # Check aliases (hyphen-insensitive); skip empty alias strings for the
    # same match-everything reason.
    for alias in gt.aliases:
        alias_term = alias.replace('-', ' ').lower()
        if alias_term and alias_term in text_lower.replace('-', ' '):
            return True

    return False
252
+
253
+
254
def is_entity_on_chain(entity_str: str, gt: GroundTruth) -> Optional[str]:
    """
    Check if an entity string matches any entity in the fault propagation chain.
    Returns the matched group_id if on-chain, None if off-chain.

    entity_str: e.g., "otel-demo/Pod/frontend-abc123" or just "frontend"
    """
    needle = entity_str.lower()

    for group_id in gt.propagation_entities:
        gid = group_id.lower()

        # Substring containment in either direction counts as a match.
        if gid in needle or needle in gid:
            return group_id

        # Otherwise try the group's filter patterns with regex syntax
        # removed; terms of <= 2 chars are skipped to avoid spurious hits.
        for pattern in gt.entity_filters.get(group_id, []):
            term = (pattern.replace('\\b', '')
                           .replace('-.*', '')
                           .replace('.*', '')
                           .replace('\\', ''))
            if len(term) > 2 and term.lower() in needle:
                return group_id

    return None
280
+
281
+
282
def get_entity_group_match(entity_str: str, gt: GroundTruth) -> Optional[str]:
    """
    Check if an entity string matches any entity group in the scenario.
    Returns the matched group_id if found, None otherwise.
    """
    needle = entity_str.lower()

    def overlaps(candidate: str) -> bool:
        # Case-insensitive substring containment in either direction.
        lowered = candidate.lower()
        return lowered in needle or needle in lowered

    for group in gt.all_entities:
        gid = group.get('id', '')

        # Match on the group id itself.
        if gid and overlaps(gid):
            return gid

        # Match on the display name.
        display = group.get('name', '')
        if display and overlaps(display):
            return gid

        # Match on the filter patterns, regex syntax stripped; terms of
        # <= 2 chars are skipped to avoid spurious hits.
        for pattern in group.get('filter', []):
            term = (pattern.replace('\\b', '')
                           .replace('-.*', '')
                           .replace('.*', '')
                           .replace('\\', ''))
            if len(term) > 2 and term.lower() in needle:
                return gid

    return None
310
+
311
+
312
def classify_sentiment(text: str, gt: GroundTruth) -> str:
    """Classify if mention is positive (asserting), negative (exonerating), or neutral.

    Purely lexical: scans the lowercased text for assertion phrases
    ("root cause", "culprit", ...) and exoneration phrases ("ruled out",
    "healthy", ...). Positive wins ties because it is checked first.
    ``gt`` is unused but kept for interface compatibility with callers.
    """
    text_lower = text.lower()

    # Positive indicators (asserting as root cause)
    positive_patterns = [
        r'root\s*cause',
        r'is\s+the\s+cause',
        r'caused\s+by',
        r'source\s+of\s+(the\s+)?problem',
        r'culprit',
        r'responsible\s+for',
        r'likely\s+cause',
        r'appears\s+to\s+be\s+the\s+issue',
        r'primary\s+issue',
        r'main\s+issue',
    ]

    # Negative indicators (exonerating)
    negative_patterns = [
        r'not\s+the\s+(root\s*)?cause',
        r'ruled\s+out',
        r'is\s+not\s+responsible',
        r'working\s+(correctly|normally|fine)',
        r'healthy',
        r'no\s+issues?\s+(found|detected)',
        r'can\s+be\s+excluded',
        r'unlikely\s+to\s+be',
    ]

    if any(re.search(pattern, text_lower) for pattern in positive_patterns):
        return 'positive'
    if any(re.search(pattern, text_lower) for pattern in negative_patterns):
        return 'negative'
    return 'neutral'
354
+
355
+
356
def get_latest_rollout(trial_dir: Path) -> Optional[Path]:
    """Return the newest rollout JSONL under ``trial_dir/sessions``.

    Recursively scans for ``rollout-*.jsonl`` files and picks the one with
    the greatest modification time; None when the sessions directory is
    missing or contains no rollout files.
    """
    sessions = trial_dir / "sessions"
    if not sessions.exists():
        return None

    newest: Optional[Path] = None
    newest_mtime = float("-inf")
    # Track the most recently modified candidate as we walk the tree.
    for candidate in sessions.glob("**/rollout-*.jsonl"):
        mtime = candidate.stat().st_mtime
        if mtime > newest_mtime:
            newest, newest_mtime = candidate, mtime
    return newest
368
+
369
+
370
def get_judge_score(trial_dir: Path) -> Optional[float]:
    """Get root_cause_entity_f1 from judge output.

    Reads ``trial_dir/judge_output.json`` and returns the
    ``flat_scores.root_cause_entity_f1`` value, or None when the file is
    missing, unreadable, malformed, or not shaped as expected.
    """
    judge_path = trial_dir / "judge_output.json"
    if not judge_path.exists():
        return None

    try:
        with open(judge_path) as f:
            judge_data = json.load(f)
        # AttributeError covers payloads where the top level or
        # 'flat_scores' is not a dict.
        return judge_data.get('flat_scores', {}).get('root_cause_entity_f1')
    except (OSError, json.JSONDecodeError, AttributeError):
        # A broken judge file means "no score" for this trial; callers
        # treat None as "not judged" rather than aborting the whole run.
        # (Previously a bare `except:` that also swallowed KeyboardInterrupt.)
        return None
382
+
383
+
384
def parse_rollout(rollout_path: Path, gt: GroundTruth) -> TrajectoryAnalysis:
    """Parse a rollout JSONL file and extract entity mentions and metrics.

    Replays the agent trajectory line by line, tracking:
      * when the ground-truth entity was first seen in a tool OUTPUT
        (encounter), first queried via tool ARGUMENTS (investigate),
        asserted as root cause, or exonerated;
      * the set of K8s entities the agent saw vs. actively queried
        (exploration breadth);
      * which scenario entity groups on/off the fault-propagation chain
        were investigated (coverage / detour metrics).

    Returns a TrajectoryAnalysis with model/scenario/trial left for the
    caller to fill in.
    """
    mentions = []
    turn_num = 0
    total_turns = 0

    # First-occurrence turn numbers for each pipeline event (None = never).
    t_encounter = None
    t_investigate = None
    t_assert = None
    t_exonerate = None
    t_recover = None

    # Exploration tracking: entities are normalized "namespace/Kind/name"
    # strings pulled out of tool outputs (seen) and tool arguments
    # (actively queried) via K8S_ENTITY_PATTERN. The universe of possible
    # entities is unknown here, so "encountered" later doubles as the
    # denominator for the exploration ratio.
    encountered_entities = set()
    investigated_entities = set()

    # Track which entity groups were investigated (on-chain vs off-chain)
    on_chain_groups_investigated = set()
    off_chain_groups_investigated = set()
    all_groups_investigated = set()

    with open(rollout_path) as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines rather than failing the whole trial.
                continue

            # A turn_context record marks the start of a new agent turn.
            if obj.get('type') == 'turn_context':
                turn_num += 1
                total_turns = turn_num

            if obj.get('type') != 'response_item':
                continue

            payload = obj.get('payload', {})

            # Check tool outputs (encounter: passive — the agent did not
            # necessarily ask for this entity).
            if payload.get('type') == 'function_call_output':
                output = str(payload.get('output', ''))

                # Check for root cause match
                if entity_matches(output, gt):
                    sentiment = classify_sentiment(output, gt)
                    mentions.append(EntityMention(
                        turn=turn_num,
                        mention_type='encounter',
                        context='tool_output',
                        text_snippet=output[:200],
                        sentiment=sentiment
                    ))
                    if t_encounter is None:
                        t_encounter = turn_num

                # Broad exploration check using Regex
                found_entities = extract_k8s_entities(output)
                for entity in found_entities:
                    encountered_entities.add(entity)

            # Check tool arguments (investigate: active — the agent
            # explicitly named the entity in a call).
            if payload.get('type') == 'function_call':
                args = payload.get('arguments', {})
                if isinstance(args, str):
                    try:
                        args = json.loads(args)
                    except:
                        # Non-JSON argument strings are wrapped so the
                        # later json.dumps/text search still works.
                        args = {'raw': args}
                args_str = json.dumps(args)

                # Root cause check
                if entity_matches(args_str, gt):
                    mentions.append(EntityMention(
                        turn=turn_num,
                        mention_type='investigate',
                        context='tool_args',
                        text_snippet=args_str[:200],
                        sentiment='neutral'
                    ))
                    if t_investigate is None:
                        t_investigate = turn_num

                # Broad exploration check using Regex
                found_entities = extract_k8s_entities(args_str)
                for entity in found_entities:
                    investigated_entities.add(entity)

                    # Classify as on-chain (part of the fault propagation
                    # chain) or off-chain (a detour into some other
                    # scenario entity).
                    on_chain_group = is_entity_on_chain(entity, gt)
                    if on_chain_group:
                        on_chain_groups_investigated.add(on_chain_group)
                        all_groups_investigated.add(on_chain_group)
                    else:
                        # Check if it matches any entity in scenario at all
                        any_group = get_entity_group_match(entity, gt)
                        if any_group:
                            off_chain_groups_investigated.add(any_group)
                            all_groups_investigated.add(any_group)

                # Check update_plan explanations for assertions/exonerations
                # expressed in the agent's reasoning.
                if payload.get('name') == 'update_plan':
                    explanation = args.get('explanation', '')
                    if entity_matches(explanation, gt):
                        sentiment = classify_sentiment(explanation, gt)
                        mention_type = 'assert' if sentiment == 'positive' else ('exonerate' if sentiment == 'negative' else 'investigate')
                        mentions.append(EntityMention(
                            turn=turn_num,
                            mention_type=mention_type,
                            context='reasoning',
                            text_snippet=explanation[:200],
                            sentiment=sentiment
                        ))

                        if mention_type == 'assert' and t_assert is None:
                            t_assert = turn_num
                        elif mention_type == 'exonerate' and t_exonerate is None:
                            t_exonerate = turn_num

                # Check shell commands for final output generation
                # (e.g. writing output.json with the root-cause verdict).
                if payload.get('name') == 'shell':
                    cmd = args.get('command', [])
                    cmd_str = ' '.join(cmd) if isinstance(cmd, list) else str(cmd)

                    # Look for output generation with root cause assertions
                    if ('output.json' in cmd_str or 'root_cause' in cmd_str.lower()) and entity_matches(cmd_str, gt):
                        sentiment = classify_sentiment(cmd_str, gt)
                        if sentiment == 'positive' or 'root_cause' in cmd_str.lower():
                            mentions.append(EntityMention(
                                turn=turn_num,
                                mention_type='assert',
                                context='final_output',
                                text_snippet=cmd_str[:300],
                                sentiment='positive'
                            ))
                            if t_assert is None:
                                t_assert = turn_num

    # Check for recovery (exoneration followed later by an assertion).
    had_recovery = False
    if t_exonerate is not None and t_assert is not None and t_exonerate < t_assert:
        had_recovery = True
        t_recover = t_assert

    # Compute derived timing metrics (turns are 1-based, so truthiness
    # checks here are safe — a recorded turn is never 0).
    discovery_efficiency = t_encounter / total_turns if t_encounter and total_turns > 0 else None
    investigation_delay = t_investigate - t_encounter if t_investigate and t_encounter else None
    assertion_delay = t_assert - t_investigate if t_assert and t_investigate else None

    # Compute max stage reached (without final success - that comes from judge)
    # 0=none, 1=encounter, 2=investigate, 3=assert
    max_stage = 0
    if t_encounter is None:
        max_stage = 0
    if t_encounter is not None:
        max_stage = 1
    if t_investigate is not None:
        max_stage = 2
    if t_assert is not None:
        max_stage = 3

    # Exploration metrics.
    # The true universe of scenario entities is unknown from the rollout
    # alone, so the encountered set is used as the "available" denominator.
    num_encountered = len(encountered_entities)
    num_investigated = len(investigated_entities)

    # Ratio: What % of things seen were actually investigated?
    expl_ratio = num_investigated / num_encountered if num_encountered > 0 else 0.0

    # Coverage metrics: on-chain (fault propagation) vs off-chain (detoured)
    n_on_chain = len(on_chain_groups_investigated)
    n_off_chain = len(off_chain_groups_investigated)
    total_investigated_groups = len(all_groups_investigated)

    # Propagation coverage: what % of the fault propagation chain was investigated?
    n_propagation_entities = len(gt.propagation_entities)
    prop_coverage = n_on_chain / n_propagation_entities if n_propagation_entities > 0 else 0.0

    # Detour rate: what % of investigated entities were off-chain (not in fault propagation)?
    det_rate = n_off_chain / total_investigated_groups if total_investigated_groups > 0 else 0.0

    return TrajectoryAnalysis(
        model="",  # Set by caller
        scenario="",  # Set by caller
        trial=0,  # Set by caller
        total_turns=total_turns,
        gt_entity=gt.entity_name,
        t_encounter=t_encounter,
        t_investigate=t_investigate,
        t_assert=t_assert,
        t_exonerate=t_exonerate,
        t_recover=t_recover,
        max_stage_reached=max_stage,
        mentions=mentions,
        total_entities_available=num_encountered,  # Using encountered as the "available" set
        unique_entities_encountered=num_encountered,
        unique_entities_investigated=num_investigated,
        exploration_ratio=expl_ratio,
        # Coverage metrics (on-chain vs off-chain)
        on_chain_investigated=n_on_chain,
        off_chain_investigated=n_off_chain,
        propagation_coverage=prop_coverage,
        detour_rate=det_rate,
        # Computed metrics
        discovery_efficiency=discovery_efficiency,
        investigation_delay=investigation_delay,
        assertion_delay=assertion_delay,
        had_recovery=had_recovery
    )
605
+
606
+
607
def analyze_model(model_dir: Path, gt_cache: Dict[str, GroundTruth]) -> List[TrajectoryAnalysis]:
    """Analyze all trials for a model.

    Walks ``model_dir/Scenario-*/<trial>/`` directories, parses the latest
    rollout of each trial against its cached ground truth, and attaches the
    judge's F1 score to decide final success. Trials whose scenario has no
    ground truth or no rollout file are skipped; per-trial parsing errors
    are printed and skipped so one bad trial does not abort the model.
    """
    results = []
    # Directory names look like "react with code_<model>_07ccdb1..."; strip
    # the prefix and commit suffix to recover the bare model name.
    model_name = model_dir.name.replace("react with code_", "").split("_07ccdb1")[0]

    scenario_dirs = [d for d in sorted(model_dir.iterdir()) if d.is_dir() and d.name.startswith("Scenario-")]
    for scenario_dir in tqdm(scenario_dirs, desc=f" {model_name} scenarios"):
        scenario = scenario_dir.name
        gt = gt_cache.get(scenario)
        if gt is None:
            continue

        # Trial directories are plain integers under the scenario.
        trial_dirs = [d for d in sorted(scenario_dir.iterdir()) if d.is_dir() and d.name.isdigit()]
        for trial_dir in tqdm(trial_dirs, desc=f" {scenario} trials"):
            trial_num = int(trial_dir.name)
            rollout_path = find_latest_rollout_file(trial_dir)

            if rollout_path is None:
                continue

            try:
                analysis = parse_rollout(rollout_path, gt)
                analysis.model = model_name
                analysis.scenario = scenario
                analysis.trial = trial_num

                # Get judge score to determine final success; any positive
                # F1 counts as a success and promotes the trial to stage 4.
                f1_score = get_judge_score(trial_dir)
                analysis.root_cause_f1 = f1_score
                if f1_score is not None and f1_score > 0:
                    analysis.final_success = True
                    analysis.max_stage_reached = 4  # Success!

                results.append(analysis)
            except Exception as e:
                print(f"Error processing {model_name}/{scenario}/{trial_num}: {e}")

    return results
645
+
646
+
647
def plot_pipeline_funnel(summary_df: pd.DataFrame):
    """
    Figure 1: Stacked bar showing where trials drop off in the pipeline.

    Pipeline stages:
    - Encounter: GT entity appears in tool OUTPUT (passive - agent didn't ask for it)
    - Investigate: GT entity appears in tool ARGUMENTS (active - agent explicitly queried it)
    - Assert: Agent declares GT as root cause
    - Success: Judge confirms correct answer

    Saves the figure to ``OUTPUT_DIR/fig_conversion_funnel.png``.
    """
    # Filter out models with no data (encounter_rate == 0) and prepare data
    data = summary_df[summary_df['encounter_rate'] > 0].copy()
    data['model_clean'] = data['model'].apply(get_display_name)
    data = data.sort_values('success_rate', ascending=True)

    # Stack: none, encounter_only, investigate_only, assert_only, success
    # Normalize each model's exclusive stage counts to percentages.
    n_trials = data['n_trials']

    none_pct = data['n_stage_0_none'] / n_trials * 100
    enc_pct = data['n_stage_1_encounter_only'] / n_trials * 100
    inv_pct = data['n_stage_2_investigate_only'] / n_trials * 100
    ass_pct = data['n_stage_3_assert_only'] / n_trials * 100
    suc_pct = data['n_stage_4_success'] / n_trials * 100

    n_models = len(data)
    y = np.arange(n_models)
    bar_height = 0.7

    plt.rcParams.update(PLOT_PARAMETERS)

    STAGE_COLORS = {
        'none': '#d73027',        # Red - never encountered GT
        'encounter': '#fc8d59',   # Orange - saw but didn't investigate
        'investigate': '#fee08b', # Yellow - investigated but didn't assert
        'assert': '#d9ef8b',      # Light green - asserted but wrong final answer
        'success': '#1a9850',     # Green - success
    }

    # Create figure sized to fill half column with legend
    fig, ax = plt.subplots(figsize=(DOUBLE_COLUMN_WIDTH, 2.5))

    # Plot stacked bars with GT prefix labels; each segment starts where
    # the previous stages end (left=cumulative sum).
    ax.barh(y, none_pct, height=bar_height, label='RC never seen', color=STAGE_COLORS['none'],
            edgecolor='white', linewidth=0.3)
    ax.barh(y, enc_pct, height=bar_height, left=none_pct, label='RC seen, not queried',
            color=STAGE_COLORS['encounter'], edgecolor='white', linewidth=0.3)
    ax.barh(y, inv_pct, height=bar_height, left=none_pct + enc_pct, label='RC queried, not asserted',
            color=STAGE_COLORS['investigate'], edgecolor='white', linewidth=0.3)
    ax.barh(y, ass_pct, height=bar_height, left=none_pct + enc_pct + inv_pct, label='RC asserted, not in output',
            color=STAGE_COLORS['assert'], edgecolor='white', linewidth=0.3)
    ax.barh(y, suc_pct, height=bar_height, left=none_pct + enc_pct + inv_pct + ass_pct, label='RC asserted, in output',
            color=STAGE_COLORS['success'], edgecolor='white', linewidth=0.3)

    # Add percentage labels centered in each segment; only segments at
    # least min_pct_threshold percent wide get a label so text fits.
    min_pct_threshold = 4
    label_fontsize = MIN_FONT_SIZE - 3

    for i, model_idx in enumerate(y):
        # (segment width, x position of the segment's midpoint)
        segments = [
            (none_pct.iloc[i], none_pct.iloc[i] / 2),
            (enc_pct.iloc[i], none_pct.iloc[i] + enc_pct.iloc[i] / 2),
            (inv_pct.iloc[i], none_pct.iloc[i] + enc_pct.iloc[i] + inv_pct.iloc[i] / 2),
            (ass_pct.iloc[i], none_pct.iloc[i] + enc_pct.iloc[i] + inv_pct.iloc[i] + ass_pct.iloc[i] / 2),
            (suc_pct.iloc[i], none_pct.iloc[i] + enc_pct.iloc[i] + inv_pct.iloc[i] + ass_pct.iloc[i] + suc_pct.iloc[i] / 2)
        ]

        for pct, x_pos in segments:
            if pct >= min_pct_threshold:
                ax.text(x_pos, model_idx, f'{pct:.0f}%',
                        ha='center', va='center', fontsize=label_fontsize,
                        color='black', weight='bold')

    ax.set_yticks(y)
    ax.set_yticklabels(data['model_clean'], fontsize=MIN_FONT_SIZE)
    ax.set_xlabel('Trials (%)', fontsize=MIN_FONT_SIZE)
    ax.set_xlim(0, 100)
    ax.set_ylim(-0.5, n_models - 0.5)
    ax.tick_params(axis='x', labelsize=MIN_FONT_SIZE)

    # Legend below the plot - 2 columns, positioned below x-axis label
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.18), ncol=2,
              frameon=False, fontsize=MIN_FONT_SIZE, columnspacing=0.8,
              handletextpad=0.3, handlelength=1.0)

    # Tight margins - more bottom space for legend
    fig.subplots_adjust(left=0.28, right=0.99, top=0.99, bottom=0.38)

    plt.title("Root Cause Entity Discovery Funnel")
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_conversion_funnel.png")
    plt.close(fig)
    print("Saved: fig_conversion_funnel.png")
740
+
741
+
742
def extract_all_data():
    """Run the full trajectory analysis and write per-trial/per-model CSVs.

    Loads every scenario's ground truth, analyzes every "react with code"
    agent model under LEADERBOARD_DIR, and writes ``discovery_trials.csv``
    (one row per trial) and ``discovery_summary.csv`` (one row per model)
    to OUTPUT_DIR.

    Returns (summary_df, trial_df, total number of trials analyzed).
    """
    # Create output directory
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load all ground truths once; scenarios without a usable ground truth
    # are simply absent from the cache and skipped later.
    print("\nLoading ground truth data...")
    gt_cache = {}
    scenario_dirs = [d for d in GT_DIR.iterdir() if d.is_dir() and d.name.startswith("Scenario-")]
    for scenario_dir in tqdm(scenario_dirs, desc="Loading ground truths"):
        gt = load_ground_truth(scenario_dir.name)
        if gt:
            gt_cache[scenario_dir.name] = gt
    print(f"Loaded {len(gt_cache)} ground truth files")

    # Find react with code agents
    model_dirs = [d for d in LEADERBOARD_DIR.iterdir()
                  if d.is_dir() and d.name.startswith("react with code_")]
    print(f"Found {len(model_dirs)} agent models")

    # Analyze each model
    all_results = []
    for model_dir in tqdm(model_dirs, desc="Analyzing models"):
        model_name = model_dir.name.replace("react with code_", "").split("_07ccdb1")[0]
        print(f"\nAnalyzing {model_name}...")

        results = analyze_model(model_dir, gt_cache)
        all_results.extend(results)

        # Summary stats
        if results:
            encounters = [r for r in results if r.t_encounter is not None]
            asserts = [r for r in results if r.t_assert is not None]
            recoveries = [r for r in results if r.had_recovery]

            print(f" Trials: {len(results)}")
            print(f" Encounters: {len(encounters)} ({100*len(encounters)/len(results):.1f}%)")
            print(f" Assertions: {len(asserts)} ({100*len(asserts)/len(results):.1f}%)")
            print(f" Recoveries: {len(recoveries)} ({100*len(recoveries)/len(results):.1f}%)")

    # Convert to DataFrame
    print("\n" + "=" * 60)
    print("Generating output files...")

    # Summary per trial: flatten each TrajectoryAnalysis into one CSV row.
    trial_data = []
    for r in all_results:
        trial_data.append({
            'model': r.model,
            'scenario': r.scenario,
            'trial': r.trial,
            'total_turns': r.total_turns,
            'gt_entity': r.gt_entity,
            't_encounter': r.t_encounter,
            't_investigate': r.t_investigate,
            't_assert': r.t_assert,
            't_exonerate': r.t_exonerate,
            't_recover': r.t_recover,
            'max_stage_reached': r.max_stage_reached,
            'final_success': r.final_success,
            'root_cause_f1': r.root_cause_f1,
            'discovery_efficiency': r.discovery_efficiency,
            'investigation_delay': r.investigation_delay,
            'assertion_delay': r.assertion_delay,
            'had_recovery': r.had_recovery,
            'n_mentions': len(r.mentions),
            'total_entities_available': r.total_entities_available,
            'unique_entities_encountered': r.unique_entities_encountered,
            'unique_entities_investigated': r.unique_entities_investigated,
            'exploration_ratio': r.exploration_ratio,
            # Coverage metrics (on-chain vs off-chain)
            'on_chain_investigated': r.on_chain_investigated,
            'off_chain_investigated': r.off_chain_investigated,
            'propagation_coverage': r.propagation_coverage,
            'detour_rate': r.detour_rate
        })

    trial_df = pd.DataFrame(trial_data)
    trial_df.to_csv(OUTPUT_DIR / "discovery_trials.csv", index=False)
    print(f"Saved: {OUTPUT_DIR / 'discovery_trials.csv'}")

    # Summary per model
    model_summary = []
    for model in trial_df['model'].unique():
        model_data = trial_df[trial_df['model'] == model]
        n_total = len(model_data)

        # Funnel stages: count trials reaching each stage
        # Stage 0: none, 1: encounter, 2: investigate, 3: assert, 4: success
        stage_counts = model_data['max_stage_reached'].value_counts().to_dict()

        # Cumulative: how many reached AT LEAST this stage
        n_encounter = len(model_data[model_data['max_stage_reached'] >= 1])
        n_investigate = len(model_data[model_data['max_stage_reached'] >= 2])
        n_assert = len(model_data[model_data['max_stage_reached'] >= 3])
        n_success = len(model_data[model_data['max_stage_reached'] >= 4])

        # Filter to trials where we found something (for conditional means
        # below — avoids averaging over NaN-only columns).
        with_encounter = model_data[model_data['t_encounter'].notna()]
        with_assert = model_data[model_data['t_assert'].notna()]
        with_recovery = model_data[model_data['had_recovery'] == True]
        with_success = model_data[model_data['final_success'] == True]

        model_summary.append({
            'model': model,
            'n_trials': n_total,
            'n_scenarios': model_data['scenario'].nunique(),
            # Funnel rates (cumulative, relative to total trials)
            'encounter_rate': n_encounter / n_total if n_total > 0 else 0,
            'investigate_rate': n_investigate / n_total if n_total > 0 else 0,
            'assertion_rate': n_assert / n_total if n_total > 0 else 0,
            'success_rate': n_success / n_total if n_total > 0 else 0,
            # Conversion rate: given encounter, did model declare it as root cause?
            # This handles multi-root-cause scenarios better
            'conversion_rate': n_success / n_encounter if n_encounter > 0 else 0,
            # Drop-off at each stage (exclusive counts)
            'n_stage_0_none': stage_counts.get(0, 0),
            'n_stage_1_encounter_only': stage_counts.get(1, 0),
            'n_stage_2_investigate_only': stage_counts.get(2, 0),
            'n_stage_3_assert_only': stage_counts.get(3, 0),
            'n_stage_4_success': stage_counts.get(4, 0),
            # Legacy metrics
            'recovery_rate': len(with_recovery) / n_total if n_total > 0 else 0,
            'avg_t_encounter': with_encounter['t_encounter'].mean() if len(with_encounter) > 0 else None,
            'avg_t_assert': with_assert['t_assert'].mean() if len(with_assert) > 0 else None,
            'avg_total_turns': model_data['total_turns'].mean(),
            'avg_discovery_efficiency': with_encounter['discovery_efficiency'].mean() if len(with_encounter) > 0 else None,
            'avg_investigation_delay': with_encounter['investigation_delay'].mean() if len(with_encounter) > 0 else None,
            'avg_assertion_delay': with_assert['assertion_delay'].mean() if len(with_assert) > 0 else None,
            'avg_f1': with_success['root_cause_f1'].mean() if len(with_success) > 0 else None,
            'avg_exploration_ratio': model_data['exploration_ratio'].mean(),
            'avg_entities_investigated': model_data['unique_entities_investigated'].mean(),
            # Coverage metrics (fault propagation coverage)
            'avg_on_chain_investigated': model_data['on_chain_investigated'].mean(),
            'avg_off_chain_investigated': model_data['off_chain_investigated'].mean(),
            'avg_propagation_coverage': model_data['propagation_coverage'].mean(),
            'avg_detour_rate': model_data['detour_rate'].mean()
        })

    summary_df = pd.DataFrame(model_summary)
    summary_df.to_csv(OUTPUT_DIR / "discovery_summary.csv", index=False)
    print(f"Saved: {OUTPUT_DIR / 'discovery_summary.csv'}")

    trials_n = len(all_results)

    return summary_df, trial_df, trials_n
887
+
888
+
889
def main():
    """Run the discovery-trajectory analysis and print summary tables."""
    print("=" * 60)
    print("Root Cause Discovery Trajectory Analysis")
    print("=" * 60)

    summary_df, trial_df, trials_n = extract_all_data()

    # Print summary table with funnel (cumulative rates per model).
    print("\n" + "=" * 80)
    print("Discovery Pipeline Funnel:")
    print("-" * 80)
    print(f"{'Model':<25} {'Trials':>7} {'Encntr':>8} {'Invest':>8} {'Assert':>8} {'Success':>8}")
    print("-" * 80)
    for _, row in summary_df.iterrows():
        print(f"{row['model']:<25} {row['n_trials']:>7} "
              f"{row['encounter_rate']*100:>7.0f}% "
              f"{row['investigate_rate']*100:>7.0f}% "
              f"{row['assertion_rate']*100:>7.0f}% "
              f"{row['success_rate']*100:>7.0f}%")

    # Exclusive stage counts: where each trial stopped in the funnel.
    print("\n" + "=" * 80)
    print("Drop-off Analysis (where trials stopped):")
    print("-" * 80)
    print(f"{'Model':<25} {'None':>7} {'Enc→X':>7} {'Inv→X':>7} {'Ass→X':>7} {'✓':>7}")
    print("-" * 80)
    for _, row in summary_df.iterrows():
        print(f"{row['model']:<25} "
              f"{row['n_stage_0_none']:>7} "
              f"{row['n_stage_1_encounter_only']:>7} "
              f"{row['n_stage_2_investigate_only']:>7} "
              f"{row['n_stage_3_assert_only']:>7} "
              f"{row['n_stage_4_success']:>7}")

    print(f"\nTotal trials analyzed: {trials_n}")
    print(f"\nOutput saved to: {OUTPUT_DIR}")
924
+
925
+
926
# Script entry point: run the full analysis when executed directly.
if __name__ == "__main__":
    main()
928
+
analysis_src/extract_exploration.py ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Exploration Breadth Analysis by Diagnosis Correctness
4
+
5
+ Creates a plot comparing exploration breadth between:
6
+ - Correct diagnoses (recall > 0, i.e., root_cause_f1 > 0)
7
+ - Incorrect diagnoses (recall = 0, i.e., root_cause_f1 == 0)
8
+
9
+ Uses semantic entity grouping to avoid counting "frontend deployment" and
10
+ "frontend service" as separate entities.
11
+ """
12
+
13
+ import json
14
+ import sys
15
+ import re
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from typing import Optional, List, Dict, Set, Tuple
19
+ import pandas as pd
20
+ import numpy as np
21
+ import matplotlib.pyplot as plt
22
+ import seaborn as sns
23
+ from tqdm import tqdm
24
+
25
+ # Publication settings - ICML half column
26
+ HALF_COLUMN_WIDTH = 3.25 # inches
27
+ MIN_FONT_SIZE = 8
28
+
29
+ plt.rcParams.update({
30
+ 'font.size': MIN_FONT_SIZE,
31
+ 'font.family': 'serif',
32
+ 'axes.labelsize': MIN_FONT_SIZE,
33
+ 'axes.titlesize': MIN_FONT_SIZE + 1,
34
+ 'xtick.labelsize': MIN_FONT_SIZE,
35
+ 'ytick.labelsize': MIN_FONT_SIZE,
36
+ 'legend.fontsize': MIN_FONT_SIZE,
37
+ 'figure.dpi': 150,
38
+ 'savefig.dpi': 300,
39
+ 'savefig.bbox': 'tight',
40
+ 'axes.spines.top': False,
41
+ 'axes.spines.right': False,
42
+ })
43
+
44
+ # Paths
45
+ PROJECT_ROOT = Path(__file__).parent.parent
46
+ sys.path.insert(0, str(PROJECT_ROOT))
47
+
48
+ from src.utils import find_latest_rollout_file
49
+
50
+ from src.model_styles import (
51
+ get_display_name, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, PLOT_PARAMETERS
52
+ )
53
+
54
+ # Paths
55
+ LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"
56
+ GT_DIR = PROJECT_ROOT / "data" / "itbench-snapshots"
57
+ OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "discovery"
58
+
59
+ # Regex for K8s entities
60
+ K8S_ENTITY_PATTERN = re.compile(
61
+ r'([\w-]+)/(Deployment|Service|Pod|ReplicaSet|ResourceQuota|StatefulSet|'
62
+ r'DaemonSet|Job|CronJob|ConfigMap|Secret|Endpoints|Ingress|'
63
+ r'PersistentVolumeClaim|PersistentVolume|ServiceAccount|Role|RoleBinding|'
64
+ r'ClusterRole|ClusterRoleBinding|NetworkPolicy|HorizontalPodAutoscaler|'
65
+ r'Node|Schedule|NetworkChaos|StressChaos|PodChaos)/([\w-]+)',
66
+ re.IGNORECASE
67
+ )
68
+
69
+ # Service name normalization patterns
70
+ SERVICE_NORMALIZATIONS = {
71
+ # Map specific variations to canonical names
72
+ 'frontend-proxy': 'frontend-proxy',
73
+ 'frontendproxy': 'frontend-proxy',
74
+ 'frontend': 'frontend',
75
+ 'checkout': 'checkout',
76
+ 'checkoutservice': 'checkout',
77
+ 'cart': 'cart',
78
+ 'cartservice': 'cart',
79
+ 'shipping': 'shipping',
80
+ 'shippingservice': 'shipping',
81
+ 'product-catalog': 'product-catalog',
82
+ 'productcatalog': 'product-catalog',
83
+ 'productcatalogservice': 'product-catalog',
84
+ 'recommendation': 'recommendation',
85
+ 'recommendationservice': 'recommendation',
86
+ 'email': 'email',
87
+ 'emailservice': 'email',
88
+ 'payment': 'payment',
89
+ 'paymentservice': 'payment',
90
+ 'currency': 'currency',
91
+ 'currencyservice': 'currency',
92
+ 'ad': 'ad',
93
+ 'adservice': 'ad',
94
+ 'fraud-detection': 'fraud-detection',
95
+ 'frauddetection': 'fraud-detection',
96
+ 'frauddetectionservice': 'fraud-detection',
97
+ 'load-generator': 'load-generator',
98
+ 'loadgenerator': 'load-generator',
99
+ 'flagd': 'flagd',
100
+ 'otel-collector': 'otel-collector',
101
+ 'valkey': 'valkey',
102
+ 'valkey-cart': 'valkey', # valkey instance for cart
103
+ 'redis': 'valkey', # alias
104
+ 'kafka': 'kafka',
105
+ 'quote': 'quote',
106
+ 'quoteservice': 'quote',
107
+ 'accounting': 'accounting',
108
+ 'accountingservice': 'accounting',
109
+ 'otel-demo': 'otel-demo', # namespace
110
+ 'imageprovider': 'imageprovider',
111
+ 'flagdui': 'flagdui',
112
+ 'opensearch': 'opensearch',
113
+ 'grafana': 'grafana',
114
+ 'jaeger': 'jaeger',
115
+ 'prometheus': 'prometheus',
116
+ }
117
+
118
+ # Model name mapping for cleaner labels
119
+ MODEL_NAMES = {
120
+ 'Azure_gpt-5.1-2025-11-13': 'GPT-5.1',
121
+ 'Azure_o4-mini': 'o4-mini',
122
+ 'GCP_gemini-2.5-pro': 'Gemini 2.5 Pro',
123
+ 'gcp_gemini-3-pro-preview': 'Gemini 3 Pro',
124
+ 'gemini-3-pro-preview': 'Gemini 3 Pro',
125
+ 'gemini-3-flash-preview': 'Gemini 3 Flash',
126
+ 'moonshotai_kimi-k2-thinking': 'Kimi K2',
127
+ 'aws_claude-opus-4-5': 'Claude Opus 4.5',
128
+ 'openai_gpt-oss-120b': 'GPT-OSS-120B',
129
+ }
130
+
131
+
132
+ def normalize_entity_to_logical(entity: str) -> str:
133
+ """
134
+ Normalize an entity to its logical/canonical service name.
135
+
136
+ e.g., "otel-demo/Deployment/frontend-abc123" -> "frontend"
137
+ "otel-demo/Service/checkout" -> "checkout"
138
+ "chaos-mesh/NetworkChaos/xyz" -> "chaos:NetworkChaos"
139
+ """
140
+ parts = entity.lower().split('/')
141
+
142
+ # Handle chaos-mesh specially
143
+ if 'chaos-mesh' in parts[0] if parts else '':
144
+ if len(parts) >= 2:
145
+ return f"chaos:{parts[1]}"
146
+ return "chaos"
147
+
148
+ # Get the name part (last component)
149
+ if len(parts) >= 3:
150
+ name = parts[2]
151
+ elif len(parts) >= 1:
152
+ name = parts[-1]
153
+ else:
154
+ return entity.lower()
155
+
156
+ # Strip pod suffixes (e.g., frontend-5d4f6b7c8d-xyz9a -> frontend)
157
+ # Pattern: name followed by hash-like suffixes from ReplicaSets/Pods
158
+ # ReplicaSet adds -<hash10> and Pod adds -<hash5>
159
+ # e.g., frontend-5d4f6b7c8d-xyz9a -> strip -5d4f6b7c8d-xyz9a
160
+ name = re.sub(r'-[a-f0-9]{8,10}-[a-z0-9]{5}$', '', name) # Pod suffix (RS hash + Pod hash)
161
+ name = re.sub(r'-[a-f0-9]{8,10}$', '', name) # ReplicaSet suffix only (10-char hex hash)
162
+
163
+ # Also strip numeric suffixes like -1, -2 from entity names
164
+ name = re.sub(r'-\d+$', '', name)
165
+
166
+ # First check for exact match (most reliable)
167
+ if name in SERVICE_NORMALIZATIONS:
168
+ return SERVICE_NORMALIZATIONS[name]
169
+
170
+ # Try matching with service name variations
171
+ # Sort by length descending so longer patterns match first (frontend-proxy before frontend)
172
+ for pattern in sorted(SERVICE_NORMALIZATIONS.keys(), key=len, reverse=True):
173
+ canonical = SERVICE_NORMALIZATIONS[pattern]
174
+ # Exact match or name starts with pattern followed by typical suffixes
175
+ if name == pattern:
176
+ return canonical
177
+ # e.g., "checkoutservice" starts with "checkout"
178
+ if name.startswith(pattern) and (
179
+ len(name) == len(pattern) or
180
+ name[len(pattern):].startswith('service') or
181
+ name[len(pattern):].startswith('-')
182
+ ):
183
+ return canonical
184
+
185
+ # Fallback: return cleaned name
186
+ return name
187
+
188
+
189
+ def extract_k8s_entities(text: str) -> List[str]:
190
+ """Extract all K8s entities from text."""
191
+ matches = K8S_ENTITY_PATTERN.findall(text)
192
+ entities = []
193
+ for m in matches:
194
+ entity = f"{m[0]}/{m[1]}/{m[2]}"
195
+ entities.append(entity)
196
+ return entities
197
+
198
+
199
+ def extract_logical_entities(text: str) -> Set[str]:
200
+ """Extract and normalize entities to logical names."""
201
+ raw_entities = extract_k8s_entities(text)
202
+ return {normalize_entity_to_logical(e) for e in raw_entities}
203
+
204
+
205
+ def get_latest_rollout(trial_dir: Path) -> Optional[Path]:
206
+ """Get the latest rollout file from a trial directory."""
207
+ sessions_dir = trial_dir / "sessions"
208
+ if not sessions_dir.exists():
209
+ return None
210
+
211
+ rollout_files = list(sessions_dir.glob("**/rollout-*.jsonl"))
212
+ if not rollout_files:
213
+ return None
214
+
215
+ return max(rollout_files, key=lambda p: p.stat().st_mtime)
216
+
217
+
218
+ def get_judge_f1(trial_dir: Path) -> float:
219
+ """Get root_cause_entity_f1 from judge output."""
220
+ judge_path = trial_dir / "judge_output.json"
221
+ if not judge_path.exists():
222
+ return 0.0
223
+
224
+ try:
225
+ with open(judge_path) as f:
226
+ judge_data = json.load(f)
227
+ return judge_data.get('flat_scores', {}).get('root_cause_entity_f1', 0.0) or 0.0
228
+ except:
229
+ return 0.0
230
+
231
+
232
+ def count_semantic_entities_investigated(rollout_path: Path) -> int:
233
+ """
234
+ Count unique semantic entity groups investigated in a rollout.
235
+
236
+ Uses normalization to group similar entities:
237
+ - otel-demo/Deployment/frontend and otel-demo/Service/frontend -> 1 entity ("frontend")
238
+ - otel-demo/Pod/frontend-abc123 and otel-demo/Pod/frontend-xyz456 -> 1 entity ("frontend")
239
+ """
240
+ investigated_logical = set()
241
+
242
+ with open(rollout_path) as f:
243
+ for line in f:
244
+ try:
245
+ obj = json.loads(line)
246
+ except json.JSONDecodeError:
247
+ continue
248
+
249
+ if obj.get('type') != 'response_item':
250
+ continue
251
+
252
+ payload = obj.get('payload', {})
253
+
254
+ # Check tool arguments (investigation = active querying)
255
+ if payload.get('type') == 'function_call':
256
+ args = payload.get('arguments', {})
257
+ if isinstance(args, str):
258
+ try:
259
+ args = json.loads(args)
260
+ except:
261
+ args = {'raw': args}
262
+ args_str = json.dumps(args)
263
+
264
+ # Extract and normalize entities
265
+ logical_entities = extract_logical_entities(args_str)
266
+ investigated_logical.update(logical_entities)
267
+
268
+ return len(investigated_logical)
269
+
270
+
271
+ def analyze_all_trials() -> pd.DataFrame:
272
+ """
273
+ Analyze all trials from react with code agents.
274
+ Returns DataFrame with model, scenario, trial, f1_score, semantic_entities_investigated.
275
+ """
276
+ results = []
277
+
278
+ # Find react with code agents
279
+ model_dirs = [d for d in LEADERBOARD_DIR.iterdir()
280
+ if d.is_dir() and d.name.startswith("react with code_")]
281
+
282
+ print(f"Found {len(model_dirs)} agent models")
283
+
284
+ for model_dir in tqdm(model_dirs, desc="Processing models"):
285
+ model_name = model_dir.name.replace("react with code_", "").split("_07ccdb1")[0]
286
+ print(f"Processing {model_name}...")
287
+
288
+ scenario_dirs = [d for d in sorted(model_dir.iterdir()) if d.is_dir() and d.name.startswith("Scenario-")]
289
+ for scenario_dir in tqdm(scenario_dirs, desc=f" {model_name} scenarios", leave=False):
290
+ scenario = scenario_dir.name
291
+
292
+ trial_dirs = [d for d in sorted(scenario_dir.iterdir()) if d.is_dir() and d.name.isdigit()]
293
+ for trial_dir in tqdm(trial_dirs, desc=f" {scenario} trials", leave=False):
294
+ trial_num = int(trial_dir.name)
295
+ rollout_path = get_latest_rollout(trial_dir)
296
+
297
+ if rollout_path is None:
298
+ continue
299
+
300
+ try:
301
+ f1_score = get_judge_f1(trial_dir)
302
+ semantic_count = count_semantic_entities_investigated(rollout_path)
303
+
304
+ results.append({
305
+ 'model': model_name,
306
+ 'scenario': scenario,
307
+ 'trial': trial_num,
308
+ 'root_cause_f1': f1_score,
309
+ 'is_correct': f1_score > 0,
310
+ 'semantic_entities_investigated': semantic_count
311
+ })
312
+ except Exception as e:
313
+ print(f" Error processing {model_name}/{scenario}/{trial_num}: {e}")
314
+
315
+ return pd.DataFrame(results)
316
+
317
+
318
+ def clean_model_name(name: str) -> str:
319
+ return MODEL_NAMES.get(name, name)
320
+
321
+
322
+ def plot_exploration_by_correctness(df: pd.DataFrame):
323
+ """
324
+ Plot comparing exploration breadth between correct and incorrect diagnoses.
325
+ Creates a grouped bar chart or box plot.
326
+ """
327
+ # Aggregate by model and correctness
328
+ agg = df.groupby(['model', 'is_correct']).agg({
329
+ 'semantic_entities_investigated': ['mean', 'std', 'count']
330
+ }).reset_index()
331
+ agg.columns = ['model', 'is_correct', 'mean_entities', 'std_entities', 'n_trials']
332
+
333
+ # Pivot for easier plotting
334
+ correct_df = agg[agg['is_correct'] == True].set_index('model')
335
+ incorrect_df = agg[agg['is_correct'] == False].set_index('model')
336
+
337
+ # Get all models that have both correct and incorrect trials
338
+ models_both = set(correct_df.index) & set(incorrect_df.index)
339
+
340
+ # Create comparison data
341
+ comparison_data = []
342
+ for model in models_both:
343
+ comparison_data.append({
344
+ 'model': model,
345
+ 'model_clean': clean_model_name(model),
346
+ 'correct_mean': correct_df.loc[model, 'mean_entities'],
347
+ 'correct_std': correct_df.loc[model, 'std_entities'],
348
+ 'correct_n': correct_df.loc[model, 'n_trials'],
349
+ 'incorrect_mean': incorrect_df.loc[model, 'mean_entities'],
350
+ 'incorrect_std': incorrect_df.loc[model, 'std_entities'],
351
+ 'incorrect_n': incorrect_df.loc[model, 'n_trials'],
352
+ })
353
+
354
+ comp_df = pd.DataFrame(comparison_data)
355
+ comp_df = comp_df.sort_values('correct_mean', ascending=True)
356
+
357
+ # === Figure 1: Grouped bar chart ===
358
+ fig, ax = plt.subplots(figsize=(HALF_COLUMN_WIDTH, 3.0))
359
+
360
+ y = np.arange(len(comp_df))
361
+ bar_height = 0.35
362
+
363
+ # Incorrect (red) and Correct (green) bars
364
+ bars_incorrect = ax.barh(y - bar_height/2, comp_df['incorrect_mean'],
365
+ height=bar_height, label='Incorrect (recall=0)',
366
+ color='#d62728', edgecolor='black', linewidth=0.3, alpha=0.8)
367
+ bars_correct = ax.barh(y + bar_height/2, comp_df['correct_mean'],
368
+ height=bar_height, label='Correct (recall>0)',
369
+ color='#2ca02c', edgecolor='black', linewidth=0.3, alpha=0.8)
370
+
371
+ ax.set_yticks(y)
372
+ ax.set_yticklabels(comp_df['model_clean'])
373
+ ax.set_xlabel('Avg. Semantic Entity Groups Investigated')
374
+
375
+ # Add value labels
376
+ for i, (bar_i, bar_c) in enumerate(zip(bars_incorrect, bars_correct)):
377
+ # Incorrect
378
+ ax.text(bar_i.get_width() + 0.1, bar_i.get_y() + bar_i.get_height()/2,
379
+ f'{bar_i.get_width():.1f}', va='center', ha='left',
380
+ fontsize=MIN_FONT_SIZE - 1, color='#d62728')
381
+ # Correct
382
+ ax.text(bar_c.get_width() + 0.1, bar_c.get_y() + bar_c.get_height()/2,
383
+ f'{bar_c.get_width():.1f}', va='center', ha='left',
384
+ fontsize=MIN_FONT_SIZE - 1, color='#2ca02c')
385
+
386
+ ax.legend(loc='lower right', frameon=False, fontsize=MIN_FONT_SIZE)
387
+
388
+ plt.tight_layout()
389
+ fig.savefig(OUTPUT_DIR / "fig_exploration_by_correctness.pdf")
390
+ fig.savefig(OUTPUT_DIR / "fig_exploration_by_correctness.png")
391
+ plt.close(fig)
392
+ print(f"Saved: fig_exploration_by_correctness.pdf/png")
393
+
394
+ # === Figure 2: Box plot distribution ===
395
+ fig2, ax2 = plt.subplots(figsize=(HALF_COLUMN_WIDTH * 1.5, 3.5))
396
+
397
+ # Prepare data for box plot
398
+ df['correctness'] = df['is_correct'].map({True: 'Correct\n(recall>0)', False: 'Incorrect\n(recall=0)'})
399
+ df['model_clean'] = df['model'].apply(clean_model_name)
400
+
401
+ # Order models by overall median exploration
402
+ model_order = df.groupby('model_clean')['semantic_entities_investigated'].median().sort_values().index.tolist()
403
+
404
+ # Create box plot with hue
405
+ sns.boxplot(data=df, x='model_clean', y='semantic_entities_investigated',
406
+ hue='correctness', order=model_order, ax=ax2,
407
+ palette={'Correct\n(recall>0)': '#2ca02c', 'Incorrect\n(recall=0)': '#d62728'},
408
+ linewidth=0.5, fliersize=2)
409
+
410
+ ax2.set_xlabel('')
411
+ ax2.set_ylabel('Semantic Entity Groups Investigated')
412
+ ax2.tick_params(axis='x', rotation=45)
413
+ ax2.legend(title='', loc='upper left', frameon=False, fontsize=MIN_FONT_SIZE)
414
+
415
+ plt.tight_layout()
416
+ fig2.savefig(OUTPUT_DIR / "fig_exploration_by_correctness_boxplot.pdf")
417
+ fig2.savefig(OUTPUT_DIR / "fig_exploration_by_correctness_boxplot.png")
418
+ plt.close(fig2)
419
+ print(f"Saved: fig_exploration_by_correctness_boxplot.pdf/png")
420
+
421
+ # === Figure 3: Aggregated across all models ===
422
+ fig3, ax3 = plt.subplots(figsize=(HALF_COLUMN_WIDTH * 0.8, 2.5))
423
+
424
+ correct_all = df[df['is_correct'] == True]['semantic_entities_investigated']
425
+ incorrect_all = df[df['is_correct'] == False]['semantic_entities_investigated']
426
+
427
+ # Violin plot for overall distribution
428
+ parts = ax3.violinplot([incorrect_all, correct_all], positions=[0, 1],
429
+ showmeans=True, showmedians=True)
430
+
431
+ # Color the violins
432
+ colors = ['#d62728', '#2ca02c']
433
+ for i, pc in enumerate(parts['bodies']):
434
+ pc.set_facecolor(colors[i])
435
+ pc.set_alpha(0.7)
436
+
437
+ # Style the other elements
438
+ for partname in ['cmeans', 'cmedians', 'cbars', 'cmins', 'cmaxes']:
439
+ if partname in parts:
440
+ parts[partname].set_edgecolor('black')
441
+ parts[partname].set_linewidth(0.5)
442
+
443
+ ax3.set_xticks([0, 1])
444
+ ax3.set_xticklabels(['Incorrect\n(recall=0)', 'Correct\n(recall>0)'])
445
+ ax3.set_ylabel('Semantic Entities Investigated')
446
+
447
+ # Add mean values as text
448
+ ax3.text(0, incorrect_all.mean() + 0.5, f'μ={incorrect_all.mean():.1f}',
449
+ ha='center', fontsize=MIN_FONT_SIZE, color='#d62728')
450
+ ax3.text(1, correct_all.mean() + 0.5, f'μ={correct_all.mean():.1f}',
451
+ ha='center', fontsize=MIN_FONT_SIZE, color='#2ca02c')
452
+
453
+ # Add n counts
454
+ ax3.text(0, ax3.get_ylim()[0] + 0.5, f'n={len(incorrect_all)}',
455
+ ha='center', fontsize=MIN_FONT_SIZE - 1)
456
+ ax3.text(1, ax3.get_ylim()[0] + 0.5, f'n={len(correct_all)}',
457
+ ha='center', fontsize=MIN_FONT_SIZE - 1)
458
+
459
+ plt.tight_layout()
460
+ fig3.savefig(OUTPUT_DIR / "fig_exploration_overall_correctness.pdf")
461
+ fig3.savefig(OUTPUT_DIR / "fig_exploration_overall_correctness.png")
462
+ plt.close(fig3)
463
+ print(f"Saved: fig_exploration_overall_correctness.pdf/png")
464
+
465
+ # Print statistics
466
+ print("\n" + "=" * 60)
467
+ print("Exploration Breadth by Diagnosis Correctness")
468
+ print("=" * 60)
469
+ print(f"\nOverall Statistics:")
470
+ print(f" Correct diagnoses (n={len(correct_all)}): mean={correct_all.mean():.2f}, median={correct_all.median():.1f}")
471
+ print(f" Incorrect diagnoses (n={len(incorrect_all)}): mean={incorrect_all.mean():.2f}, median={incorrect_all.median():.1f}")
472
+
473
+ # Statistical test
474
+ from scipy import stats
475
+ stat, pvalue = stats.mannwhitneyu(correct_all, incorrect_all, alternative='two-sided')
476
+ print(f"\n Mann-Whitney U test: U={stat:.0f}, p={pvalue:.4f}")
477
+
478
+ print(f"\nPer-Model Comparison:")
479
+ print(f"{'Model':<20} {'Correct':>12} {'Incorrect':>12} {'Diff':>8}")
480
+ print("-" * 55)
481
+ for _, row in comp_df.sort_values('correct_mean', ascending=False).iterrows():
482
+ diff = row['correct_mean'] - row['incorrect_mean']
483
+ print(f"{row['model_clean']:<20} {row['correct_mean']:>10.1f} (n={int(row['correct_n'])}) "
484
+ f"{row['incorrect_mean']:>10.1f} (n={int(row['incorrect_n'])}) {diff:>+7.1f}")
485
+
486
+ return comp_df
487
+
488
+
489
+ def plot_success_by_exploration_bins(df: pd.DataFrame):
490
+ """
491
+ Plot showing success rate as a function of exploration breadth.
492
+ This shows a clear dose-response relationship.
493
+ """
494
+ # Create exploration bins
495
+ bins = [0, 2, 4, 6, 8, 10, 100]
496
+ labels = ['0-2', '3-4', '5-6', '7-8', '9-10', '11+']
497
+ df['exploration_bin'] = pd.cut(df['semantic_entities_investigated'],
498
+ bins=bins, labels=labels)
499
+
500
+ # Calculate success rate per bin
501
+ bin_stats = []
502
+ for label in labels:
503
+ subset = df[df['exploration_bin'] == label]
504
+ if len(subset) > 0:
505
+ success_rate = (subset['root_cause_f1'] > 0).mean() * 100
506
+ bin_stats.append({
507
+ 'bin': label,
508
+ 'success_rate': success_rate,
509
+ 'n': len(subset)
510
+ })
511
+
512
+ stats_df = pd.DataFrame(bin_stats)
513
+
514
+ # Create figure
515
+ fig, ax = plt.subplots(figsize=(HALF_COLUMN_WIDTH, 2.5))
516
+
517
+ x = np.arange(len(stats_df))
518
+ bars = ax.bar(x, stats_df['success_rate'],
519
+ color='#4a90d9', edgecolor='black', linewidth=0.5)
520
+
521
+ ax.set_xticks(x)
522
+ ax.set_xticklabels(stats_df['bin'])
523
+ ax.set_xlabel('Semantic Entities Investigated')
524
+ ax.set_ylabel('Correct Diagnosis Rate (%)')
525
+
526
+ # Add value labels on bars
527
+ for i, (bar, row) in enumerate(zip(bars, stats_df.itertuples())):
528
+ height = bar.get_height()
529
+ ax.text(bar.get_x() + bar.get_width()/2, height + 1,
530
+ f'{height:.0f}%', ha='center', va='bottom',
531
+ fontsize=MIN_FONT_SIZE)
532
+ ax.text(bar.get_x() + bar.get_width()/2, 2,
533
+ f'n={row.n}', ha='center', va='bottom',
534
+ fontsize=MIN_FONT_SIZE - 1, color='white')
535
+
536
+ ax.set_ylim(0, 60)
537
+
538
+ plt.tight_layout()
539
+ fig.savefig(OUTPUT_DIR / "fig_exploration_success_rate.pdf")
540
+ fig.savefig(OUTPUT_DIR / "fig_exploration_success_rate.png")
541
+ plt.close(fig)
542
+ print(f"Saved: fig_exploration_success_rate.pdf/png")
543
+
544
+ # Also create a combined figure with both views
545
+ fig2, axes = plt.subplots(1, 2, figsize=(HALF_COLUMN_WIDTH * 2 + 0.3, 2.5))
546
+
547
+ # Left: Success rate by exploration bins
548
+ ax1 = axes[0]
549
+ bars1 = ax1.bar(x, stats_df['success_rate'],
550
+ color='#4a90d9', edgecolor='black', linewidth=0.5)
551
+ ax1.set_xticks(x)
552
+ ax1.set_xticklabels(stats_df['bin'])
553
+ ax1.set_xlabel('Entities Investigated')
554
+ ax1.set_ylabel('Correct Diagnosis Rate (%)')
555
+ ax1.set_title('(a) Success vs Exploration', fontsize=MIN_FONT_SIZE + 1)
556
+ for bar, row in zip(bars1, stats_df.itertuples()):
557
+ ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
558
+ f'{bar.get_height():.0f}%', ha='center', va='bottom',
559
+ fontsize=MIN_FONT_SIZE - 1)
560
+ ax1.set_ylim(0, 60)
561
+
562
+ # Right: Exploration distribution by correctness (violin)
563
+ ax2 = axes[1]
564
+ correct = df[df['is_correct'] == True]['semantic_entities_investigated']
565
+ incorrect = df[df['is_correct'] == False]['semantic_entities_investigated']
566
+
567
+ parts = ax2.violinplot([incorrect, correct], positions=[0, 1],
568
+ showmeans=True, showmedians=True)
569
+ colors = ['#d62728', '#2ca02c']
570
+ for i, pc in enumerate(parts['bodies']):
571
+ pc.set_facecolor(colors[i])
572
+ pc.set_alpha(0.7)
573
+ for partname in ['cmeans', 'cmedians', 'cbars', 'cmins', 'cmaxes']:
574
+ if partname in parts:
575
+ parts[partname].set_edgecolor('black')
576
+ parts[partname].set_linewidth(0.5)
577
+
578
+ ax2.set_xticks([0, 1])
579
+ ax2.set_xticklabels(['Incorrect', 'Correct'])
580
+ ax2.set_ylabel('Entities Investigated')
581
+ ax2.set_title('(b) Exploration by Outcome', fontsize=MIN_FONT_SIZE + 1)
582
+ ax2.text(0, incorrect.mean() + 1, f'μ={incorrect.mean():.1f}',
583
+ ha='center', fontsize=MIN_FONT_SIZE - 1, color='#d62728')
584
+ ax2.text(1, correct.mean() + 1, f'μ={correct.mean():.1f}',
585
+ ha='center', fontsize=MIN_FONT_SIZE - 1, color='#2ca02c')
586
+
587
+ plt.tight_layout()
588
+ fig2.savefig(OUTPUT_DIR / "fig_exploration_combined.pdf")
589
+ fig2.savefig(OUTPUT_DIR / "fig_exploration_combined.png")
590
+ plt.close(fig2)
591
+ print(f"Saved: fig_exploration_combined.pdf/png")
592
+
593
+
594
+ def main():
595
+ print("=" * 60)
596
+ print("Exploration Breadth by Diagnosis Correctness Analysis")
597
+ print("=" * 60)
598
+
599
+ # Check if we can use cached data or need to re-extract
600
+ cache_path = OUTPUT_DIR / "exploration_by_correctness.csv"
601
+
602
+ if cache_path.exists():
603
+ print(f"\nLoading cached data from {cache_path}")
604
+ df = pd.read_csv(cache_path)
605
+ else:
606
+ print("\nExtracting data from rollout files (this may take a while)...")
607
+ df = analyze_all_trials()
608
+ df.to_csv(cache_path, index=False)
609
+ print(f"Saved cache to: {cache_path}")
610
+
611
+ print(f"\nLoaded {len(df)} trials from {df['model'].nunique()} models")
612
+
613
+ # Generate plots
614
+ print("\nGenerating figures...")
615
+ plot_exploration_by_correctness(df)
616
+ plot_success_by_exploration_bins(df) # NEW: dose-response plot
617
+
618
+ print(f"\nDone! Figures saved to: {OUTPUT_DIR}")
619
+
620
+
621
+ if __name__ == "__main__":
622
+ main()
623
+
analysis_src/extract_inference_data.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract inference request and token usage data for all 'react with code' agents.
4
+
5
+ This script reads session.jsonl files to count inference requests and estimate token usage.
6
+ """
7
+
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+ from dataclasses import dataclass
12
+ import ast
13
+ import numpy as np
14
+ import pandas as pd
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ from tqdm import tqdm
18
+
19
+
20
+ PROJECT_ROOT = Path(__file__).parent.parent
21
+ sys.path.insert(0, str(PROJECT_ROOT))
22
+
23
+ from src.utils import (
24
+ get_model_name,
25
+ find_react_with_code_dirs,
26
+ get_runs_stats,
27
+ filter_scenarios_with_min_runs,
28
+ find_latest_rollout_file
29
+ )
30
+
31
+ from src.model_styles import (
32
+ get_model_style, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, get_color_palette, PLOT_PARAMETERS
33
+ )
34
+
35
+ # Paths
36
+ LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"
37
+ RESULTS_JSON_DIR = LEADERBOARD_DIR / "results"
38
+ OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "inferences"
39
+
40
+ # Minimum runs per scenario required
41
+ MIN_RUNS_PER_SCENARIO = 3
42
+ MIN_QUALIFYING_SCENARIOS = 20
43
+
44
+ # Token estimation factor (chars per token)
45
+ CHARS_PER_TOKEN = 4
46
+
47
+ def extract_tokens_from_rollout(rollout_file: Path) -> dict:
48
+ """
49
+ Extract token counts and tool usage from a rollout file.
50
+
51
+ Counts:
52
+ - INPUT: system prompt + user messages + tool outputs
53
+ - OUTPUT: assistant messages + tool call arguments
54
+ - TOOLS: counts by tool name, including code execution
55
+ """
56
+ system_prompt_chars = 0
57
+ user_input_chars = 0
58
+ assistant_output_chars = 0
59
+ tool_call_chars = 0
60
+ tool_output_chars = 0
61
+
62
+ assistant_msg_count = 0
63
+ tool_call_count = 0
64
+ tool_counts = {} # tool_name -> count
65
+ code_execution_count = 0 # Specifically track code/python execution
66
+
67
+ # Tool names that indicate code execution
68
+ CODE_TOOLS = ['execute_python', 'run_python', 'python', 'execute_code',
69
+ 'run_code', 'shell', 'bash', 'terminal', 'exec']
70
+
71
+ try:
72
+ with open(rollout_file) as f:
73
+ for line in f:
74
+ try:
75
+ d = json.loads(line)
76
+ msg_type = d.get('type', '')
77
+ payload = d.get('payload', {})
78
+
79
+ if msg_type == 'session_meta':
80
+ # System prompt
81
+ instructions = payload.get('instructions', '')
82
+ system_prompt_chars += len(str(instructions))
83
+
84
+ elif msg_type == 'response_item':
85
+ item_type = payload.get('type', '')
86
+ role = payload.get('role', '')
87
+
88
+ if item_type == 'message':
89
+ content = payload.get('content', [])
90
+ if isinstance(content, list):
91
+ text = ' '.join([
92
+ c.get('text', '') if isinstance(c, dict) else str(c)
93
+ for c in content
94
+ ])
95
+ else:
96
+ text = str(content)
97
+
98
+ if role == 'user':
99
+ user_input_chars += len(text)
100
+ elif role == 'assistant':
101
+ assistant_output_chars += len(text)
102
+ assistant_msg_count += 1
103
+
104
+ elif item_type == 'function_call':
105
+ # Tool call (output)
106
+ name = payload.get('name', '')
107
+ arguments = payload.get('arguments', '')
108
+ tool_call_chars += len(str(name)) + len(str(arguments))
109
+ tool_call_count += 1
110
+
111
+ # Track tool usage
112
+ tool_counts[name] = tool_counts.get(name, 0) + 1
113
+
114
+ # Check if it's code execution
115
+ name_lower = name.lower()
116
+ if any(code_tool in name_lower for code_tool in CODE_TOOLS):
117
+ code_execution_count += 1
118
+ # Also check if arguments contain python code patterns
119
+ args_str = str(arguments).lower()
120
+ if 'python' in name_lower or ('def ' in args_str or 'import ' in args_str):
121
+ code_execution_count += 1
122
+
123
+ elif item_type == 'function_call_output':
124
+ # Tool output (input to model)
125
+ output = payload.get('output', '')
126
+ tool_output_chars += len(str(output))
127
+
128
+ except json.JSONDecodeError:
129
+ continue
130
+ except Exception as e:
131
+ return None
132
+
133
+ # INPUT = system + user + tool outputs (fed back to model)
134
+ input_chars = system_prompt_chars + user_input_chars + tool_output_chars
135
+ # OUTPUT = assistant responses + tool call arguments
136
+ output_chars = assistant_output_chars + tool_call_chars
137
+
138
+ return {
139
+ 'system_prompt_chars': system_prompt_chars,
140
+ 'user_input_chars': user_input_chars,
141
+ 'assistant_output_chars': assistant_output_chars,
142
+ 'tool_call_chars': tool_call_chars,
143
+ 'tool_output_chars': tool_output_chars,
144
+ 'input_chars': input_chars,
145
+ 'output_chars': output_chars,
146
+ 'input_tokens': input_chars // CHARS_PER_TOKEN,
147
+ 'output_tokens': output_chars // CHARS_PER_TOKEN,
148
+ 'assistant_msg_count': assistant_msg_count,
149
+ 'tool_call_count': tool_call_count,
150
+ 'tool_counts': tool_counts,
151
+ 'code_execution_count': code_execution_count,
152
+ }
153
+
154
+
155
def extract_session_stats(session_file: Path) -> dict | None:
    """
    Extract inference stats from session.jsonl and rollout files.

    Token counts are resolved with a two-level fallback:
      1. real API usage parsed from traces/stdout.log (``turn.completed`` events),
      2. otherwise character-based estimates from the latest rollout file.
    Tool-call counts always come from the rollout file when one exists.

    Args:
        session_file: Path to a trial's ``session.jsonl``; its parent is the
            trial directory that also holds ``traces/`` and rollout files.

    Returns:
        Dict of counters (inference_count, input/output/cached token counts,
        tool_call_count, tool_counts, code_execution_count, has_real_tokens),
        or None when the session file is missing or unreadable.
    """
    if not session_file.exists():
        return None

    trial_dir = session_file.parent

    # Count inference requests from session.jsonl
    # Each 'response_item' record corresponds to one model inference request.
    inference_count = 0
    try:
        with open(session_file) as f:
            for line in f:
                try:
                    d = json.loads(line)
                    if d.get('type') == 'response_item':
                        inference_count += 1
                except json.JSONDecodeError:
                    # Tolerate partially-written/corrupt JSONL lines.
                    continue
    except Exception as e:
        print(f" Warning: Error reading {session_file}: {e}")
        return None

    # First check stdout.log for real token counts (OpenAI models)
    stdout_log = trial_dir / "traces" / "stdout.log"
    has_real_tokens = False
    input_tokens = 0
    output_tokens = 0
    cached_input_tokens = 0

    if stdout_log.exists():
        try:
            with open(stdout_log) as f:
                for line in f:
                    try:
                        d = json.loads(line)
                        if d.get('type') == 'turn.completed':
                            usage = d.get('usage', {})
                            input_tokens = usage.get('input_tokens', 0)
                            output_tokens = usage.get('output_tokens', 0)
                            cached_input_tokens = usage.get('cached_input_tokens', 0)
                            if input_tokens > 0 or output_tokens > 0:
                                has_real_tokens = True
                                # NOTE(review): stops at the FIRST turn.completed
                                # event with nonzero usage — presumably usage is
                                # cumulative per turn; confirm the log only ever
                                # carries one relevant event per trial.
                                break
                    except json.JSONDecodeError:
                        continue
        except Exception:
            # stdout.log is optional; fall back to rollout-based estimates.
            pass

    # Extract from latest rollout file for tokens (if needed) and tool counts
    tool_call_count = 0
    tool_counts = {}
    code_execution_count = 0

    latest_rollout = find_latest_rollout_file(trial_dir)
    if latest_rollout:
        rollout_stats = extract_tokens_from_rollout(latest_rollout)
        if rollout_stats:
            # Use rollout tokens if no real API token data
            if not has_real_tokens:
                input_tokens = rollout_stats['input_tokens']
                output_tokens = rollout_stats['output_tokens']

            # Always use rollout for tool counts
            tool_call_count = rollout_stats['tool_call_count']
            tool_counts = rollout_stats['tool_counts']
            code_execution_count = rollout_stats['code_execution_count']

    return {
        'inference_count': inference_count,
        'input_tokens': input_tokens,
        'cached_input_tokens': cached_input_tokens,
        'output_tokens': output_tokens,
        'total_tokens': input_tokens + output_tokens,
        'has_real_tokens': has_real_tokens,
        'tool_call_count': tool_call_count,
        'tool_counts': tool_counts,
        'code_execution_count': code_execution_count,
    }
237
+
238
+
239
def read_agent_stats(agent_dir: Path) -> dict[str, list[dict]]:
    """
    Collect per-trial session stats for every scenario under an agent directory.

    Only subdirectories whose names start with "Scenario" are scanned, and
    scenarios that yield no readable trial stats are dropped from the result.

    Returns:
        Dict mapping scenario_id -> list of stats (one per trial)
    """
    results: dict[str, list[dict]] = {}

    for scenario_path in agent_dir.iterdir():
        if not (scenario_path.is_dir() and scenario_path.name.startswith("Scenario")):
            continue

        # One stats dict per trial subdirectory whose session.jsonl is usable.
        trial_stats = []
        for trial_path in sorted(scenario_path.iterdir()):
            if not trial_path.is_dir():
                continue
            stats = extract_session_stats(trial_path / "session.jsonl")
            if stats:
                trial_stats.append(stats)

        if trial_stats:
            results[scenario_path.name] = trial_stats

    return results
268
+
269
def load_performance_data() -> pd.DataFrame:
    """Load per-model performance (root-cause entity F1) from the consistency analysis.

    Returns an empty DataFrame when the performance CSV has not been generated.
    """
    perf_path = PROJECT_ROOT / "data" / "output" / "consistency" / "performance_data.csv"
    if not perf_path.exists():
        return pd.DataFrame()

    raw = pd.read_csv(perf_path)
    # Keep only the F1 rows, reduced to the columns needed for merging.
    f1_rows = raw[raw["metric_raw"] == "root_cause_entity_f1"]
    return f1_rows[["model", "performance"]]
276
+
277
def extract_all_data() -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Extract inference data for all agents.

    Walks every 'react with code' agent directory under LEADERBOARD_DIR,
    reads per-trial session stats, and aggregates them per model. Agents
    with no session data, or with fewer than MIN_QUALIFYING_SCENARIOS
    scenarios having MIN_RUNS_PER_SCENARIO+ runs, are skipped.

    Returns:
        - summary_df: Aggregated stats per model
        - detail_df: Per-scenario stats
    """
    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories")

    summary_records = []
    detail_records = []

    for agent_dir in tqdm(agent_dirs, desc="Processing agents"):
        model_name = get_model_name(agent_dir.name)

        print(f"\nReading: {agent_dir.name}")
        scenario_data = read_agent_stats(agent_dir)

        n_scenarios, min_runs, max_runs, n_qualifying = get_runs_stats(scenario_data, MIN_RUNS_PER_SCENARIO)

        if n_scenarios == 0:
            print(f" SKIPPING {model_name}: No session data found")
            continue

        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying}/{n_scenarios} scenarios have {MIN_RUNS_PER_SCENARIO}+ runs")
            continue

        # Filter scenarios
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        n_scenarios_filtered = len(scenario_data)

        print(f" Processing: {model_name} ({n_scenarios_filtered} scenarios)")

        # Aggregate across all scenarios and trials
        all_inference_counts = []
        all_input_tokens = []
        all_output_tokens = []
        all_total_tokens = []
        all_cached_tokens = []
        all_tool_call_counts = []
        all_code_execution_counts = []
        aggregated_tool_counts = {}

        for scenario_id, trials in tqdm(scenario_data.items(), desc=f" {model_name} scenarios", leave=False):
            for trial in trials:
                all_inference_counts.append(trial['inference_count'])
                all_input_tokens.append(trial['input_tokens'])
                all_output_tokens.append(trial['output_tokens'])
                all_total_tokens.append(trial['total_tokens'])
                # .get() with defaults: older trial records may predate these keys.
                all_cached_tokens.append(trial.get('cached_input_tokens', 0))
                all_tool_call_counts.append(trial.get('tool_call_count', 0))
                all_code_execution_counts.append(trial.get('code_execution_count', 0))

                # Aggregate tool counts
                for tool_name, count in trial.get('tool_counts', {}).items():
                    aggregated_tool_counts[tool_name] = aggregated_tool_counts.get(tool_name, 0) + count

                # One detail row per (model, scenario, trial).
                detail_records.append({
                    'model': model_name,
                    'scenario': scenario_id,
                    'inference_count': trial['inference_count'],
                    'input_tokens': trial['input_tokens'],
                    'cached_input_tokens': trial.get('cached_input_tokens', 0),
                    'output_tokens': trial['output_tokens'],
                    'total_tokens': trial['total_tokens'],
                    'tool_call_count': trial.get('tool_call_count', 0),
                    'code_execution_count': trial.get('code_execution_count', 0),
                })

        # Summary stats (one row per model, averaged over all trials).
        summary_records.append({
            'model': model_name,
            'n_scenarios': n_scenarios_filtered,
            'n_trials': len(all_inference_counts),
            'avg_inference_count': np.mean(all_inference_counts),
            'std_inference_count': np.std(all_inference_counts),
            'avg_input_tokens': np.mean(all_input_tokens),
            'avg_cached_tokens': np.mean(all_cached_tokens),
            'avg_output_tokens': np.mean(all_output_tokens),
            'avg_total_tokens': np.mean(all_total_tokens),
            'total_inference_count': sum(all_inference_counts),
            'total_tokens': sum(all_total_tokens),
            'avg_tool_call_count': np.mean(all_tool_call_counts) if all_tool_call_counts else 0,
            'total_tool_calls': sum(all_tool_call_counts),
            'avg_code_execution_count': np.mean(all_code_execution_counts) if all_code_execution_counts else 0,
            'total_code_executions': sum(all_code_execution_counts),
            # Ten most frequently used tools for this model, highest first.
            'top_tools': dict(sorted(aggregated_tool_counts.items(), key=lambda x: -x[1])[:10]),
        })

    summary_df = pd.DataFrame(summary_records)
    detail_df = pd.DataFrame(detail_records)

    # Merge with performance data (left join keeps models without scores).
    perf_df = load_performance_data()
    if len(perf_df) > 0:
        summary_df = pd.merge(summary_df, perf_df, on='model', how='left')

    return summary_df, detail_df
378
+
379
+
380
def save_data(summary_df: pd.DataFrame, detail_df: pd.DataFrame):
    """Write the summary and per-scenario detail tables as CSV under OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Destination path -> frame to write there.
    targets = {
        OUTPUT_DIR / "inference_summary.csv": summary_df,
        OUTPUT_DIR / "inference_detail.csv": detail_df,
    }
    for path, frame in targets.items():
        frame.to_csv(path, index=False)

    print(f"\nData saved to:")
    for path in targets:
        print(f" - {path}")
393
+
394
+
395
def print_summary(summary_df: pd.DataFrame):
    """Print a console table of per-model inference statistics, busiest model first."""
    banner = "=" * 80
    print("\n" + banner)
    print("Inference Summary")
    print(banner)

    ordered = summary_df.sort_values("avg_inference_count", ascending=False)

    print(f"\n{'Model':<25} {'Avg Infer':>10} {'Avg Tokens':>12} {'Avg In':>10} {'Avg Out':>10}")
    print("-" * 70)
    for _, record in ordered.iterrows():
        print(
            f"{record['model']:<25} "
            f"{record['avg_inference_count']:>10.1f} "
            f"{record['avg_total_tokens']:>12.0f} "
            f"{record['avg_input_tokens']:>10.0f} "
            f"{record['avg_output_tokens']:>10.0f}"
        )
407
+
408
def plot_tool_usage(summary_df: pd.DataFrame):
    """
    Figure: Tool usage per model - total tool calls and code execution.

    Renders two side-by-side horizontal bar charts (avg. tool calls and avg.
    code executions per scenario) and saves fig_tool_usage.png to OUTPUT_DIR.
    Skips silently (with a message) when no tool data is present.
    """
    plt.rcParams.update(PLOT_PARAMETERS)

    if 'avg_tool_call_count' not in summary_df.columns:
        print("Skipping tool usage: no tool data")
        return

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(SINGLE_COLUMN_WIDTH * 2, 2.5))

    # Ascending sort so barh() places the largest bar at the top of the chart.
    data = summary_df.sort_values("avg_tool_call_count", ascending=True)

    color_palette = get_color_palette(len(data))
    colors = [color_palette[i % len(color_palette)] for i in range(len(data))]

    # Left: Total tool calls
    bars1 = ax1.barh(data["model"], data["avg_tool_call_count"], color=colors,
                     edgecolor='black', linewidth=0.5)
    ax1.set_xlabel("Avg. Tool Calls per Scenario")

    # Value labels just past the end of each bar.
    for bar, val in zip(bars1, data["avg_tool_call_count"]):
        ax1.text(val + 1, bar.get_y() + bar.get_height()/2,
                 f'{val:.0f}', va='center', ha='left', fontsize=MIN_FONT_SIZE - 1)

    # 15% headroom so the value labels are not clipped.
    ax1.set_xlim(0, data["avg_tool_call_count"].max() * 1.15)

    # Right: Code executions
    bars2 = ax2.barh(data["model"], data["avg_code_execution_count"], color=colors,
                     edgecolor='black', linewidth=0.5)
    ax2.set_xlabel("Avg. Code Executions per Scenario")

    for bar, val in zip(bars2, data["avg_code_execution_count"]):
        if val > 0:  # skip labels on zero-length bars
            ax2.text(val + 0.5, bar.get_y() + bar.get_height()/2,
                     f'{val:.0f}', va='center', ha='left', fontsize=MIN_FONT_SIZE - 1)

    # Floor of 1 keeps the axis sane when no model executed any code.
    ax2.set_xlim(0, max(data["avg_code_execution_count"].max() * 1.3, 1))
    ax2.set_yticklabels([])  # model names already shown on the left panel

    # NOTE(review): plt.title() targets the current axes (ax2 here), not the
    # whole figure — fig.suptitle() may have been intended; confirm.
    plt.title("Tool Call Distribution")

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_tool_usage.png")
    plt.close(fig)
    print("Saved: fig_tool_usage.png")
456
+
457
def plot_inference_vs_performance(summary_df: pd.DataFrame):
    """
    Figure 3: Inference count vs Performance scatter.

    One labeled point per model; requires a 'performance' column (merged in
    by extract_all_data from the consistency analysis). Saves
    fig_inference_vs_performance.png to OUTPUT_DIR.
    """
    if 'performance' not in summary_df.columns:
        print("Skipping inference vs performance: no performance data")
        return

    plt.rcParams.update(PLOT_PARAMETERS)

    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, SINGLE_COLUMN_WIDTH))

    # Models without a merged performance score cannot be plotted.
    data = summary_df.dropna(subset=['performance'])

    # Manual label offsets to avoid overlap
    # (dx points, dy points, horizontal alignment, vertical alignment)
    label_offsets = {
        "GPT-5.1": (-5, -8, "right", "top"),
        "o4-mini": (5, -8, "left", "top"),
        "GPT-OSS-120B": (5, 3, "left", "bottom"),
        "Gemini-2.5-Pro": (-5, 3, "right", "bottom"),
        "Gemini-3-Flash": (5, 3, "left", "bottom"),
        "gemini-3-pro-preview": (5, 3, "left", "bottom"),
        "Kimi-K2": (5, 3, "left", "bottom"),
    }

    # Get color palette
    color_palette = get_color_palette(len(data))

    # Scatter plot: one call per model so each point gets its own color.
    for i, (_, row) in enumerate(data.iterrows()):
        ax.scatter(row["avg_inference_count"], row["performance"],
                   c=[color_palette[i % len(color_palette)]], s=60, edgecolors='black',
                   linewidth=0.5, zorder=10)

        # Label with custom offset (falls back to right-of-point placement).
        offset = label_offsets.get(row["model"], (5, 3, "left", "bottom"))
        ax.annotate(row["model"],
                    (row["avg_inference_count"], row["performance"]),
                    xytext=(offset[0], offset[1]), textcoords='offset points',
                    fontsize=MIN_FONT_SIZE - 1, ha=offset[2], va=offset[3])

    ax.set_xlabel("Avg. Inference Requests")
    ax.set_ylabel("Performance (RC Entity F1)")
    ax.set_xlim(0, data["avg_inference_count"].max() * 1.2)
    # Fixed y-range; assumes no model exceeds 0.7 F1 — revisit if scores rise.
    ax.set_ylim(0, 0.7)

    plt.title("Inference Requests vs. Performance")

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_inference_vs_performance.png")
    plt.close(fig)
    print("Saved: fig_inference_vs_performance.png")
510
+
511
def plot_tool_breakdown_heatmap(summary_df: pd.DataFrame):
    """
    Generate a heatmap showing which tools each agent uses most.

    Rows are models (sorted by total tool usage), columns are tools (sorted by
    aggregate usage), cells are average calls per scenario. Saves
    fig_tool_usage_heatmap.png to OUTPUT_DIR.
    """

    # Flatten each model's top_tools mapping into long-format rows.
    tool_usage = []

    for _, row in summary_df.iterrows():
        if pd.isna(row.get('top_tools')):
            print("pd.isna")
            continue

        tools = row['top_tools']
        total_calls = row['total_tool_calls']
        if total_calls == 0:
            print("No tool calls")
            continue

        for tool, count in tools.items():
            tool_usage.append({
                'model': row['model'],
                'tool': tool,
                'count': count,
                'avg_per_scenario': count / row['n_scenarios']
            })

    df = pd.DataFrame(tool_usage)
    if len(df) == 0:
        print("No tool usage data found")
        return

    # Pivot for heatmap: model x tool matrix of avg calls per scenario.
    pivot_df = df.pivot(index='model', columns='tool', values='avg_per_scenario').fillna(0)

    # Order tool columns by aggregate usage across all models (most used first).
    # (A .head(10) cap was considered but intentionally left off.)
    top_tools = pivot_df.sum().sort_values(ascending=False).index
    pivot_df = pivot_df[top_tools]

    # Sort models by total tool usage, using a temporary 'total' column.
    pivot_df['total'] = pivot_df.sum(axis=1)
    pivot_df = pivot_df.sort_values('total', ascending=False).drop('total', axis=1)

    # Plot.
    # FIX: update rcParams from a merged local copy instead of mutating the
    # shared PLOT_PARAMETERS dict — the original wrote font.size=8 into the
    # module-level dict, silently changing every figure rendered afterwards.
    plt.rcParams.update({**PLOT_PARAMETERS, 'font.size': 8})

    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH * 2, 4))

    sns.heatmap(pivot_df, annot=True, fmt='.1f', cmap='YlOrRd', ax=ax,
                cbar_kws={'label': 'Avg. Calls per Scenario'})

    ax.set_xlabel("")
    ax.set_ylabel("")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.title("Tool Call Distribution")

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_tool_usage_heatmap.png")
    plt.close(fig)
    print("Saved: fig_tool_usage_heatmap.png")
577
+
578
def main():
    """Entry point: extract inference data, persist it, and print a summary."""
    for message in (
        "Extracting inference data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
    ):
        print(message)

    summary_df, detail_df = extract_all_data()

    # Nothing qualified — avoid writing empty CSVs.
    if len(summary_df) == 0:
        print("No data extracted!")
        return

    save_data(summary_df, detail_df)
    print_summary(summary_df)


if __name__ == "__main__":
    main()
595
+
analysis_src/extract_majority_vote_data.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract Majority Vote and consistency breakdown data for all 'react with code' agents.
4
+
5
+ This script computes:
6
+ - Pass@k: At least 1 trial succeeds
7
+ - Majority@k: Majority of trials succeed
8
+ - All@k: All trials succeed
9
+ - Consistency breakdown: Consistent Correct, Consistent Wrong, Inconsistent
10
+
11
+ Output is saved to paper_analysis/react with code/resources/figures/consistency/ as CSV files.
12
+ """
13
+
14
+ import json
15
+ import sys
16
+ from pathlib import Path
17
+ from itertools import combinations
18
+ import numpy as np
19
+ import pandas as pd
20
+ import matplotlib.pyplot as plt
21
+ import seaborn as sns
22
+ from tqdm import tqdm
23
+
24
+ # Add project root to path
25
+ PROJECT_ROOT = Path(__file__).parent.parent
26
+ sys.path.insert(0, str(PROJECT_ROOT))
27
+
28
+ from src.utils import (
29
+ get_model_name,
30
+ find_react_with_code_dirs,
31
+ read_judge_outputs_from_dir,
32
+ extract_trial_scores_from_judge_outputs,
33
+ filter_scenarios_with_min_runs,
34
+ )
35
+
36
+ from src.model_styles import (
37
+ get_model_style, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, PLOT_PARAMETERS
38
+ )
39
+
40
+ # Paths
41
+ LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"
42
+ OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "consistency"
43
+
44
+ # Minimum runs per scenario required for inclusion
45
+ MIN_RUNS_PER_SCENARIO = 2
46
+
47
+ # Minimum scenarios needed after filtering
48
+ MIN_QUALIFYING_SCENARIOS = 20
49
+
50
+ # Success threshold for binary classification
51
+ SUCCESS_THRESHOLD = 0.5
52
+
53
def compute_majority_vote_metrics(
    scenario_trials: dict[str, list[float]],
    success_threshold: float = SUCCESS_THRESHOLD
) -> dict:
    """
    Compute pass@k / majority@k / all@k and a consistency breakdown.

    k is the smallest trial count across all scenarios; every scenario is
    truncated to its first k trials so the metrics are comparable. A trial
    counts as a success when its score >= success_threshold.

    Returns None when there are no scenarios (or no trials); otherwise a dict
    with per-scenario rates, raw counts, overall score mean/std, and a
    per-scenario detail list:
    - pass_at_k: At least 1 trial succeeds
    - majority_at_k: Majority of trials succeed
    - all_at_k / consistent_correct: All trials succeed
    - consistent_wrong: All trials fail
    - inconsistent: Mixed results
    """
    trial_lengths = [len(trials) for trials in scenario_trials.values()]
    if not trial_lengths:
        return None

    k = min(trial_lengths)
    n_scenarios = len(scenario_trials)
    if n_scenarios == 0 or k < 1:
        return None

    counters = {
        "pass": 0, "majority": 0, "all": 0,
        "correct": 0, "wrong": 0, "inconsistent": 0,
    }
    per_scenario = []
    pooled_scores = []

    for scenario_id, raw_trials in scenario_trials.items():
        trials = raw_trials[:k]
        pooled_scores.extend(trials)
        n_success = sum(1 for score in trials if score >= success_threshold)

        if n_success >= 1:
            counters["pass"] += 1
        if n_success > k / 2:  # strict majority (ties on even k do not count)
            counters["majority"] += 1

        if n_success == k:
            counters["all"] += 1
            counters["correct"] += 1
            consistency_type = "correct"
        elif n_success == 0:
            counters["wrong"] += 1
            consistency_type = "wrong"
        else:
            counters["inconsistent"] += 1
            consistency_type = "inconsistent"

        per_scenario.append({
            "scenario": scenario_id,
            "n_success": n_success,
            "n_trials": k,
            "majority_correct": n_success > k / 2,
            "consistency_type": consistency_type,
            "mean_score": np.mean(trials),
            "std_score": np.std(trials) if len(trials) > 1 else 0,
        })

    return {
        "n_scenarios": n_scenarios,
        "n_trials": k,
        "threshold": success_threshold,
        "pass_at_k": counters["pass"] / n_scenarios,
        "majority_at_k": counters["majority"] / n_scenarios,
        "all_at_k": counters["all"] / n_scenarios,
        "consistent_correct": counters["correct"] / n_scenarios,
        "consistent_wrong": counters["wrong"] / n_scenarios,
        "inconsistent": counters["inconsistent"] / n_scenarios,
        "n_pass": counters["pass"],
        "n_majority": counters["majority"],
        "n_all": counters["all"],
        "n_consistent_correct": counters["correct"],
        "n_consistent_wrong": counters["wrong"],
        "n_inconsistent": counters["inconsistent"],
        "overall_mean": np.mean(pooled_scores),
        "overall_std": np.std(pooled_scores),
        "scenario_details": per_scenario,
    }
143
+
144
+
145
# Metrics to extract: (judge-output key, human-readable display label).
# The key is passed to extract_trial_scores_from_judge_outputs; the label
# is used in output filenames and console tables.
METRICS = [
    ("root_cause_entity_f1", "F1"),
    ("root_cause_entity_precision", "Precision"),
    ("root_cause_entity_recall", "Recall"),
]
151
+
152
+
153
def extract_all_data() -> dict[str, tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Extract majority vote data for all agents, for multiple metrics.

    Judge outputs are read once per agent; agents without judge outputs, or
    with fewer than MIN_QUALIFYING_SCENARIOS scenarios having
    MIN_RUNS_PER_SCENARIO+ runs, are skipped. Metrics are then computed per
    METRICS entry over the shared data.

    Returns:
        - dict mapping metric_name -> (summary_df, scenario_df)
    """
    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories:")
    for d in agent_dirs:
        print(f" - {d.name}")

    # Read all judge outputs once
    agent_data = {}
    valid_models = []
    skipped_models = []  # (model_name, human-readable skip reason)

    for agent_dir in tqdm(agent_dirs, desc="Reading agent data"):
        model_name = get_model_name(agent_dir.name)

        print(f"\nReading: {agent_dir.name}")
        scenario_data = read_judge_outputs_from_dir(agent_dir)

        if not scenario_data:
            print(f" SKIPPING {model_name}: No judge outputs found")
            skipped_models.append((model_name, "No data"))
            continue

        # Filter scenarios with minimum runs
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        n_qualifying = len(scenario_data)

        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying} scenarios with {MIN_RUNS_PER_SCENARIO}+ runs")
            skipped_models.append((model_name, f"{n_qualifying} qualifying"))
            continue

        print(f" Processing: {model_name} ({n_qualifying} scenarios)")
        valid_models.append(model_name)
        agent_data[model_name] = scenario_data

    if skipped_models:
        print(f"\n⚠️ Skipped {len(skipped_models)} models:")
        for name, reason in skipped_models:
            print(f" - {name}: {reason}")

    print(f"\n✓ Included {len(valid_models)} models: {valid_models}")

    # Extract for each metric
    results = {}

    for metric_key, metric_label in tqdm(METRICS, desc="Processing metrics"):
        print(f"\n--- Extracting for metric: {metric_label} ({metric_key}) ---")

        summary_records = []
        scenario_records = []

        for model_name, scenario_data in tqdm(agent_data.items(), desc=f" {metric_label}", leave=False):
            # Extract scores for this metric
            scenario_trials = extract_trial_scores_from_judge_outputs(scenario_data, metric_key)

            # Compute majority vote metrics
            metrics = compute_majority_vote_metrics(scenario_trials)

            # None means no usable trials for this model/metric combination.
            if metrics is None:
                continue

            # Add to summary (one row per model per metric).
            summary_records.append({
                "model": model_name,
                "metric": metric_label,
                "n_scenarios": metrics["n_scenarios"],
                "n_trials": metrics["n_trials"],
                "pass_at_k": metrics["pass_at_k"],
                "majority_at_k": metrics["majority_at_k"],
                "all_at_k": metrics["all_at_k"],
                "consistent_correct": metrics["consistent_correct"],
                "consistent_wrong": metrics["consistent_wrong"],
                "inconsistent": metrics["inconsistent"],
                "overall_mean": metrics["overall_mean"],
                "overall_std": metrics["overall_std"],
            })

            # Add per-scenario data
            for detail in metrics["scenario_details"]:
                scenario_records.append({
                    "model": model_name,
                    "metric": metric_label,
                    "scenario": detail["scenario"],
                    "n_success": detail["n_success"],
                    "n_trials": detail["n_trials"],
                    "majority_correct": detail["majority_correct"],
                    "consistency_type": detail["consistency_type"],
                    "mean_score": detail["mean_score"],
                    "std_score": detail["std_score"],
                })

        summary_df = pd.DataFrame(summary_records)
        scenario_df = pd.DataFrame(scenario_records)
        results[metric_label] = (summary_df, scenario_df)

    return results
255
+
256
+
257
def save_data(results: dict[str, tuple[pd.DataFrame, pd.DataFrame]]):
    """Save extracted data to CSV files for each metric.

    Writes one (summary, per-scenario) CSV pair per metric, plus unsuffixed
    F1 copies (without the 'metric' column) for backward compatibility with
    consumers of the original single-metric output.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # FIX: removed the dead all_summaries/all_scenarios accumulators — they
    # were appended to on every iteration but never read.
    for metric_label, (summary_df, scenario_df) in results.items():
        metric_suffix = metric_label.lower()

        summary_path = OUTPUT_DIR / f"majority_vote_data_{metric_suffix}.csv"
        scenario_path = OUTPUT_DIR / f"majority_vote_scenarios_{metric_suffix}.csv"

        summary_df.to_csv(summary_path, index=False)
        scenario_df.to_csv(scenario_path, index=False)

        print(f"\nData saved for {metric_label}:")
        print(f" - {summary_path}")
        print(f" - {scenario_path}")

    # Save combined (default to F1 for backward compatibility)
    if "F1" in results:
        f1_summary, f1_scenario = results["F1"]
        # Save without metric column for backward compat
        f1_summary_compat = f1_summary.drop(columns=["metric"], errors="ignore")
        f1_scenario_compat = f1_scenario.drop(columns=["metric"], errors="ignore")
        f1_summary_compat.to_csv(OUTPUT_DIR / "majority_vote_data.csv", index=False)
        f1_scenario_compat.to_csv(OUTPUT_DIR / "majority_vote_scenarios.csv", index=False)
        print(f"\nBackward-compatible files (F1) saved to:")
        print(f" - {OUTPUT_DIR / 'majority_vote_data.csv'}")
        print(f" - {OUTPUT_DIR / 'majority_vote_scenarios.csv'}")
293
+
294
def print_summary(results: dict[str, tuple[pd.DataFrame, pd.DataFrame]]):
    """Print a console leaderboard of consistency metrics, one table per metric."""
    banner = "=" * 80
    # Columns are stored as fractions; rendered below as percentages.
    rate_columns = [
        "majority_at_k", "pass_at_k", "all_at_k",
        "consistent_correct", "consistent_wrong", "inconsistent",
    ]

    for metric_label, (summary_df, _) in results.items():
        print("\n" + banner)
        print(f"Majority Vote Summary ({metric_label}, threshold={SUCCESS_THRESHOLD})")
        print(banner)

        ranked = summary_df.sort_values("majority_at_k", ascending=False)

        print(f"\n{'Model':<20} {'Maj@k':>8} {'Pass@k':>8} {'All@k':>8} {'Cons✓':>8} {'Cons✗':>8} {'Incons':>8}")
        print("-" * 80)
        for _, row in ranked.iterrows():
            cells = " ".join(f"{row[col] * 100:>7.1f}%" for col in rate_columns)
            print(f"{row['model']:<20} {cells}")
313
+
314
def plot_majority_vs_performance(df: pd.DataFrame):
    """
    Figure: Majority@k vs Performance scatter plot.

    One styled point per model over a gradient-shaded "better toward
    top-right" background. Saves fig_majority_vs_performance.pdf/png to
    OUTPUT_DIR.
    """
    # FIX: the original wrote plt.rcParams.update({PLOT_PARAMETERS}) — a set
    # literal containing a dict, which raises TypeError (dicts are
    # unhashable). Pass the parameter dict directly.
    plt.rcParams.update(PLOT_PARAMETERS)

    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH))

    # Axis limits
    x_min, x_max = 0, 1.0
    y_min, y_max = 0, 100

    # Gradient shading toward top-right (ideal): stacked translucent rectangles.
    for i in range(5):
        alpha = 0.02 + i * 0.02
        x_start = 0.1 + i * 0.15
        y_start = 10 + i * 15
        rect = plt.Rectangle((x_start, y_start), x_max - x_start, y_max - y_start,
                             color='#2ecc71', alpha=alpha, zorder=0)
        ax.add_patch(rect)

    # Arrow pointing to ideal
    ax.annotate('', xy=(0.85, 85), xytext=(0.55, 55),
                arrowprops=dict(arrowstyle='->', color='#27ae60', alpha=0.7, lw=1.5),
                zorder=2)
    ax.text(0.58, 58, 'better', fontsize=MIN_FONT_SIZE, style='italic',
            color='#27ae60', alpha=0.8, rotation=45, zorder=2)

    # Mark ideal corner
    ax.scatter([1.0], [100], marker='*', s=100, c='#27ae60', alpha=0.5, zorder=2)
    ax.text(0.92, 95, 'ideal', fontsize=MIN_FONT_SIZE - 1, color='#27ae60',
            alpha=0.7, ha='right')

    # Scatter points with model-specific colors and markers
    for _, row in df.iterrows():
        style = get_model_style(row["model"])
        ax.scatter(row["overall_mean"], row["majority_at_k"] * 100,
                   c=style['color'], marker=style['marker'],
                   s=80, edgecolors='black', linewidth=0.5, zorder=10)

    # Labels with smart positioning: flip the offset near the right/top edges
    # so labels stay inside the axes.
    for _, row in df.iterrows():
        model = row["model"]
        x_pos = row["overall_mean"]
        y_pos = row["majority_at_k"] * 100

        dx, dy = 0.03, 2
        ha, va = "left", "center"

        if x_pos > 0.7:
            dx = -0.03
            ha = "right"
        if y_pos > 80:
            dy = -3
            va = "top"

        ax.text(x_pos + dx, y_pos + dy, model, fontsize=MIN_FONT_SIZE - 1,
                ha=ha, va=va, zorder=11)

    ax.set_xlabel("Performance (RC Entity F1)")
    ax.set_ylabel("Majority@k (%)")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_majority_vs_performance.pdf")
    fig.savefig(OUTPUT_DIR / "fig_majority_vs_performance.png")
    plt.close(fig)
    print("Saved: fig_majority_vs_performance.pdf/png")
384
+
385
+
386
def plot_pass_vs_majority(df: pd.DataFrame, metric: str = "F1", suffix: str = ""):
    """
    Figure: Scatter plot of Pass@k (x-axis) vs Majority@k (y-axis).

    Points on the diagonal are perfectly consistent (every passing scenario
    also passes by majority); points far below it are inconsistent.

    Args:
        df: DataFrame with pass_at_k and majority_at_k columns
        metric: Name of metric for labeling (F1, Precision, Recall)
        suffix: Suffix for output filename (e.g. "_precision")
    """
    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, SINGLE_COLUMN_WIDTH))

    ax_min, ax_max = 0, 100

    # Diagonal line (Pass@k == Majority@k)
    ax.plot([ax_min, ax_max], [ax_min, ax_max], color='#444444', linestyle='--',
            linewidth=1.5, alpha=0.6, zorder=1)

    # Consistency region labels
    ax.text(8, 92, 'more\nconsistent', fontsize=MIN_FONT_SIZE + 1, color='#333333',
            ha='left', va='top', style='italic')
    ax.text(92, 8, 'less\nconsistent', fontsize=MIN_FONT_SIZE + 1, color='#333333',
            ha='right', va='bottom', style='italic')

    # Collect and plot points
    points = {}
    for _, row in df.iterrows():
        style = get_model_style(row["model"])
        x = row["pass_at_k"] * 100
        y = row["majority_at_k"] * 100
        ax.scatter(x, y, c=style['color'], marker=style['marker'],
                   s=50, edgecolors='black', linewidth=0.5, zorder=10)
        points[row["model"]] = {'x': x, 'y': y}

    line_color = '#444444'
    line_width = 1.2

    # Place labels with manual positioning (hand-tuned per model to avoid
    # overlaps in the published figure).
    for model, p in points.items():
        x, y = p['x'], p['y']

        if 'GPT-OSS-120B' in model:
            # Label to the right, slightly below
            ax.text(x + 3, y - 2, model, fontsize=MIN_FONT_SIZE, ha='left', va='center', zorder=11)

        elif 'Gemini 2.5 Pro' in model:
            # TEAL CIRCLE: label slightly below and to the right
            ax.text(x + 3, y + 2, model, fontsize=MIN_FONT_SIZE, ha='left', va='bottom', zorder=11)

        elif 'o4-mini' in model:
            # YELLOW SQUARE: shorter line goes right then to label
            label_x = x + 12
            label_y = y
            # Horizontal leader line to the right (shorter)
            ax.plot([x, label_x], [y, y], color=line_color, linewidth=line_width, alpha=0.8, zorder=5)
            ax.text(label_x + 1, label_y, model, fontsize=MIN_FONT_SIZE, ha='left', va='center', zorder=11)

        elif 'GPT-5.1' in model:
            # GREEN SQUARE: leader line from left edge, goes left then up
            label_x = 5
            label_y = 25
            start_x = x - 2  # Left edge of the square marker
            # Horizontal line left from left edge midpoint
            ax.plot([start_x, label_x], [y, y], color=line_color, linewidth=line_width, alpha=0.8, zorder=5)
            # Vertical line up to label height
            ax.plot([label_x, label_x], [y, label_y], color=line_color, linewidth=line_width, alpha=0.8, zorder=5)
            ax.text(label_x, label_y + 1, model, fontsize=MIN_FONT_SIZE, ha='left', va='bottom', zorder=11)

        elif 'Claude Opus' in model:
            # Label to the right
            ax.text(x + 5, y, model, fontsize=MIN_FONT_SIZE, ha='left', va='center', zorder=11)

        elif 'Gemini 3 Pro' in model:
            # Label BELOW the circle, offset left
            ax.text(x - 18, y - 6, model, fontsize=MIN_FONT_SIZE, ha='left', va='top', zorder=11)

        elif 'Gemini 3 Flash' in model:
            # NOTE(review): text anchored at x=105 with right-alignment, while
            # xlim is 0..100 and the comment says x=95 — confirm intent.
            ax.text(105, y + 4, model, fontsize=MIN_FONT_SIZE, ha='right', va='bottom', zorder=11)

        elif 'Kimi K2' in model:
            # Label to the right
            ax.text(x + 3, y, model, fontsize=MIN_FONT_SIZE, ha='left', va='center', zorder=11)

        else:
            # Default: label to the right
            ax.text(x + 3, y, model, fontsize=MIN_FONT_SIZE, ha='left', va='center', zorder=11)

    ax.set_xlabel(f"Pass@k (%) [{metric}]")
    ax.set_ylabel(f"Majority@k (%) [{metric}]")
    ax.set_xlim(ax_min, ax_max)
    ax.set_ylim(ax_min, ax_max)
    ax.set_aspect('equal')

    plt.title("Consistency: Pass@k vs. Majority@k")
    plt.tight_layout()
    plt.show()
    # FIX: the original saved to the literal "(unknown).png" — the computed
    # `filename` (which carries the per-metric suffix) was never used, so
    # every metric's figure overwrote the same file.
    filename = f"fig_pass_vs_majority{suffix}"
    fig.savefig(OUTPUT_DIR / f"{filename}.png")
    plt.close(fig)
    print(f"Saved: {filename}.png")
486
+
487
def main():
    """Entry point: extract majority-vote data, persist it, and print summaries."""
    for message in (
        "Extracting majority vote data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
        f"Success threshold: {SUCCESS_THRESHOLD}",
        f"Minimum runs per scenario: {MIN_RUNS_PER_SCENARIO}",
        f"Metrics: {[m[1] for m in METRICS]}",
    ):
        print(message)

    results = extract_all_data()

    # No model qualified for any metric — nothing to save or summarize.
    if not results:
        print("No data extracted!")
        return

    save_data(results)
    print_summary(results)


if __name__ == "__main__":
    main()
507
+
analysis_src/extract_tool_failures.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract tool call failure data for all 'react with code' agents.
4
+
5
+ This script reads rollout JSONL files to identify and categorize tool call failures.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ import sys
11
+ import ast
12
+ from pathlib import Path
13
+ from dataclasses import dataclass, field
14
+ from collections import defaultdict
15
+ import numpy as np
16
+ import pandas as pd
17
+ import matplotlib.pyplot as plt
18
+ import seaborn as sns
19
+ from tqdm import tqdm
20
+
21
+ # Add project root to path
22
+ PROJECT_ROOT = Path(__file__).parent.parent
23
+ sys.path.insert(0, str(PROJECT_ROOT))
24
+
25
+ from src.utils import (
26
+ get_model_name,
27
+ find_react_with_code_dirs,
28
+ get_runs_stats,
29
+ filter_scenarios_with_min_runs,
30
+ find_latest_rollout_file
31
+ )
32
+
33
+ from src.model_styles import (
34
+ get_model_style, get_color_palette, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, _COLORS, PLOT_PARAMETERS
35
+ )
36
+
37
# Paths
LEADERBOARD_DIR = PROJECT_ROOT / "data" / "trajectories"  # per-agent trajectory dumps
RESULTS_JSON_DIR = LEADERBOARD_DIR / "results"  # NOTE(review): currently unused in this module
OUTPUT_DIR = PROJECT_ROOT / "data" / "output" / "tool_failures"  # CSVs and figures are written here

# Minimum runs per scenario required
MIN_RUNS_PER_SCENARIO = 3  # a scenario qualifies only with at least this many trials
MIN_QUALIFYING_SCENARIOS = 20  # agents with fewer qualifying scenarios are skipped entirely
45
+
46
# Failure type patterns.
# NOTE: dict order matters — classify_failure() returns the FIRST category
# whose pattern matches, so the generic 'runtime' bucket (which matches
# bare 'Exception' / 'Error:') must remain last.
FAILURE_PATTERNS = {
    'python_syntax': [
        r'SyntaxError',
        r'IndentationError',
        r'TabError',
    ],
    'python_type': [
        r'TypeError',
        r'AttributeError',
        r'ValueError',
        r'KeyError',
        r'IndexError',
    ],
    'python_name': [
        r'NameError',
        r'UnboundLocalError',
        r'ModuleNotFoundError',
        r'ImportError',
    ],
    'file_not_found': [
        r'FileNotFoundError',
        r'No such file or directory',
        r'ENOENT',
        r'path does not exist',
    ],
    'permission_denied': [
        r'PermissionError',
        r'Permission denied',
        r'EACCES',
    ],
    'json_parse': [
        r'JSONDecodeError',
        r'json\.decoder\.JSONDecodeError',
        r'Expecting value',
        r'Invalid JSON',
    ],
    'timeout': [
        r'TimeoutError',
        r'timeout',
        r'Timed out',
        r'deadline exceeded',
    ],
    'memory': [
        r'MemoryError',
        r'out of memory',
        r'OOM',
        r'Cannot allocate memory',
    ],
    'connection': [
        r'ConnectionError',
        r'ConnectionRefusedError',
        r'Connection refused',
        r'ECONNREFUSED',
    ],
    'shell_command': [
        r'command not found',
        r'No such command',
        r'not recognized as',
    ],
    'assertion': [
        r'AssertionError',
    ],
    'runtime': [
        r'RuntimeError',
        r'Exception',
        r'Error:',
    ],
}


def classify_failure(output: str) -> tuple[str, str]:
    """
    Classify a failure based on the output string.

    Patterns are tried case-insensitively in FAILURE_PATTERNS order, so more
    specific categories shadow the generic 'runtime' bucket.

    Args:
        output: raw tool-call output text to classify.

    Returns:
        (category, specific_error) — specific_error is the exact matched
        text, 'Unknown Python Error' for unmatched tracebacks, or
        'Unknown Error' when nothing matches.
    """
    for category, patterns in FAILURE_PATTERNS.items():
        for pattern in patterns:
            # Search once and reuse the match object (the original version
            # ran the same regex search twice per pattern).
            match = re.search(pattern, output, re.IGNORECASE)
            if match:
                return (category, match.group(0))

    # No known pattern matched, but a traceback still signals a Python error.
    if 'Traceback' in output:
        return ('other_python', 'Unknown Python Error')

    return ('other', 'Unknown Error')
136
+
137
+
138
def extract_tool_calls_from_rollout(rollout_file: Path) -> dict:
    """
    Extract all tool calls and their outcomes from a rollout file.

    The rollout is a JSONL stream; only records with type 'response_item'
    are inspected. A 'function_call' payload registers a pending call under
    its call_id; a later 'function_call_output' payload with the same
    call_id is checked for failure. An output counts as a failure when:
      - it parses as a JSON object whose metadata.exit_code is non-zero, or
      - it is not JSON and contains 'Error'/'error'/'Traceback'
        (heuristic; may over-count benign mentions of those words).

    Returns dict with:
        - total_tool_calls: int
        - failed_tool_calls: int
        - failures: list of failure details
        - tool_call_counts: dict of tool_name -> count
        - tool_failure_counts: dict of tool_name -> failure_count

    Returns None when the file itself cannot be read; individual malformed
    JSONL lines are skipped silently.
    """
    tool_calls = {}  # call_id -> {name, arguments, timestamp}
    total_calls = 0
    failed_calls = 0
    failures = []
    tool_call_counts = defaultdict(int)
    tool_failure_counts = defaultdict(int)

    try:
        with open(rollout_file) as f:
            for line in f:
                try:
                    d = json.loads(line)
                    if d.get('type') != 'response_item':
                        continue

                    payload = d.get('payload', {})
                    payload_type = payload.get('type', '')

                    if payload_type == 'function_call':
                        # Register the pending call so its output can be
                        # attributed back to the right tool later.
                        call_id = payload.get('call_id', '')
                        name = payload.get('name', '')
                        arguments = payload.get('arguments', '')
                        tool_calls[call_id] = {
                            'name': name,
                            'arguments': arguments,
                            'timestamp': d.get('timestamp', ''),
                        }
                        total_calls += 1
                        tool_call_counts[name] += 1

                    elif payload_type == 'function_call_output':
                        call_id = payload.get('call_id', '')
                        output = payload.get('output', '')

                        # Check if this is a failure
                        is_failure = False
                        failure_info = None

                        # Parse the output if it's JSON
                        try:
                            output_data = json.loads(output)
                            if isinstance(output_data, dict):
                                # Assumes a JSON envelope of the form
                                # {"output": ..., "metadata": {"exit_code": ...}}.
                                exit_code = output_data.get('metadata', {}).get('exit_code', 0)
                                output_text = output_data.get('output', '')
                                if exit_code != 0:
                                    is_failure = True
                                    category, error = classify_failure(output_text)
                                    failure_info = {
                                        'exit_code': exit_code,
                                        'category': category,
                                        'error': error,
                                        'output_snippet': output_text[:300] if output_text else '',
                                    }
                        except json.JSONDecodeError:
                            # Not JSON, check for error patterns in raw output.
                            # Both 'Error' and 'error' checks are needed: the
                            # substring test is case-sensitive.
                            if 'Error' in output or 'error' in output or 'Traceback' in output:
                                is_failure = True
                                category, error = classify_failure(output)
                                failure_info = {
                                    'exit_code': None,
                                    'category': category,
                                    'error': error,
                                    'output_snippet': output[:300],
                                }

                        # Outputs whose call_id was never registered are ignored.
                        if is_failure and call_id in tool_calls:
                            failed_calls += 1
                            tool_name = tool_calls[call_id]['name']
                            tool_failure_counts[tool_name] += 1
                            failures.append({
                                'tool_name': tool_name,
                                'arguments': tool_calls[call_id]['arguments'][:200],
                                'timestamp': tool_calls[call_id]['timestamp'],
                                **failure_info,
                            })

                except json.JSONDecodeError:
                    # Malformed JSONL line — skip it and keep scanning.
                    continue
    except Exception as e:
        # File-level failure (missing, unreadable, ...): warn and signal
        # the caller with None so this trial is dropped.
        print(f" Warning: Error reading {rollout_file}: {e}")
        return None

    return {
        'total_tool_calls': total_calls,
        'failed_tool_calls': failed_calls,
        'failures': failures,
        'tool_call_counts': dict(tool_call_counts),
        'tool_failure_counts': dict(tool_failure_counts),
    }
238
+
239
+
240
def read_agent_stats(agent_dir: Path) -> dict[str, list[dict]]:
    """
    Read tool call stats from all scenarios/trials for an agent.

    Only subdirectories named 'Scenario*' are considered; within each,
    every trial directory's latest rollout file is parsed.

    Returns:
        Dict mapping scenario_id -> list of stats (one per trial).
        Scenarios with no usable trials are omitted.
    """
    per_scenario: dict[str, list[dict]] = {}

    for child in agent_dir.iterdir():
        if not (child.is_dir() and child.name.startswith("Scenario")):
            continue

        collected = []
        for trial_path in sorted(child.iterdir()):
            if not trial_path.is_dir():
                continue
            rollout = find_latest_rollout_file(trial_path)
            stats = extract_tool_calls_from_rollout(rollout) if rollout else None
            if stats:
                collected.append(stats)

        if collected:
            per_scenario[child.name] = collected

    return per_scenario
270
+
271
+
272
+
273
+
274
def extract_all_data() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Extract tool failure data for all agents.

    Agents are skipped when they have no rollout data at all, or when fewer
    than MIN_QUALIFYING_SCENARIOS scenarios reach MIN_RUNS_PER_SCENARIO runs.

    Returns:
        - summary_df: Aggregated stats per model
        - detail_df: Per-trial failure stats
        - failures_df: Individual failure details
    """
    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories")

    summary_records = []
    detail_records = []
    failure_records = []

    for agent_dir in tqdm(agent_dirs, desc="Processing agents"):
        model_name = get_model_name(agent_dir.name)

        print(f"\nProcessing: {agent_dir.name}")
        scenario_data = read_agent_stats(agent_dir)

        # min_runs/max_runs are returned but not used here.
        n_scenarios, min_runs, max_runs, n_qualifying = get_runs_stats(scenario_data, MIN_RUNS_PER_SCENARIO)

        if n_scenarios == 0:
            print(f" SKIPPING {model_name}: No rollout data found")
            continue

        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying}/{n_scenarios} scenarios have {MIN_RUNS_PER_SCENARIO}+ runs")
            continue

        # Filter scenarios
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        n_scenarios_filtered = len(scenario_data)

        print(f" Processing: {model_name} ({n_scenarios_filtered} scenarios)")

        # Aggregate across all scenarios and trials
        all_total_calls = []
        all_failed_calls = []
        all_failure_rates = []
        aggregated_tool_counts = defaultdict(int)
        aggregated_failure_counts = defaultdict(int)
        aggregated_category_counts = defaultdict(int)

        for scenario_id, trials in tqdm(scenario_data.items(), desc=f" {model_name} scenarios", leave=False):
            for trial_idx, trial in enumerate(trials):
                total = trial['total_tool_calls']
                failed = trial['failed_tool_calls']

                all_total_calls.append(total)
                all_failed_calls.append(failed)
                # Guard against trials with zero tool calls.
                all_failure_rates.append(failed / total * 100 if total > 0 else 0)

                for tool_name, count in trial['tool_call_counts'].items():
                    aggregated_tool_counts[tool_name] += count

                for tool_name, count in trial['tool_failure_counts'].items():
                    aggregated_failure_counts[tool_name] += count

                # Count failure categories
                for failure in trial['failures']:
                    category = failure.get('category', 'other')
                    aggregated_category_counts[category] += 1

                    # Add to failure records (one row per individual failure)
                    failure_records.append({
                        'model': model_name,
                        'scenario': scenario_id,
                        'trial': trial_idx,
                        'tool_name': failure.get('tool_name', ''),
                        'category': category,
                        'error': failure.get('error', ''),
                        'exit_code': failure.get('exit_code'),
                        'output_snippet': failure.get('output_snippet', '')[:100],
                    })

                # One row per trial for the detail table.
                detail_records.append({
                    'model': model_name,
                    'scenario': scenario_id,
                    'trial': trial_idx,
                    'total_tool_calls': total,
                    'failed_tool_calls': failed,
                    'failure_rate_pct': failed / total * 100 if total > 0 else 0,
                })

        # Compute per-tool failure rates
        tool_failure_rates = {}
        for tool_name, total in aggregated_tool_counts.items():
            failures = aggregated_failure_counts.get(tool_name, 0)
            tool_failure_rates[tool_name] = {
                'total': total,
                'failures': failures,
                'rate': failures / total * 100 if total > 0 else 0
            }

        # One summary row per model; dict-valued columns are kept as
        # Python objects (stringified only on CSV export).
        summary_records.append({
            'model': model_name,
            'n_scenarios': n_scenarios_filtered,
            'n_trials': len(all_total_calls),
            'total_tool_calls': sum(all_total_calls),
            'total_failed_calls': sum(all_failed_calls),
            'avg_tool_calls_per_trial': np.mean(all_total_calls),
            'avg_failed_calls_per_trial': np.mean(all_failed_calls),
            'avg_failure_rate_pct': np.mean(all_failure_rates),
            'std_failure_rate_pct': np.std(all_failure_rates),
            'failure_categories': dict(aggregated_category_counts),
            'tool_failure_rates': tool_failure_rates,
        })

    summary_df = pd.DataFrame(summary_records)
    detail_df = pd.DataFrame(detail_records)
    failures_df = pd.DataFrame(failure_records)

    return summary_df, detail_df, failures_df
390
+
391
+
392
def save_data(summary_df: pd.DataFrame, detail_df: pd.DataFrame, failures_df: pd.DataFrame):
    """Save extracted data to CSV files.

    Writes the three frames into OUTPUT_DIR (created if needed) and
    echoes the destination paths.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # (frame, destination) pairs — kept in one place so the write loop
    # and the confirmation printout stay in sync.
    outputs = (
        (summary_df, OUTPUT_DIR / "tool_failures_summary.csv"),
        (detail_df, OUTPUT_DIR / "tool_failures_detail.csv"),
        (failures_df, OUTPUT_DIR / "tool_failures_individual.csv"),
    )

    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    print(f"\nData saved to:")
    for _, destination in outputs:
        print(f" - {destination}")
408
+
409
+
410
def print_summary(summary_df: pd.DataFrame):
    """Print summary table.

    Models are listed worst-first (highest average failure rate at the
    top), each with its most frequent failure category.
    """
    divider = "=" * 100
    print("\n" + divider)
    print("Tool Call Failure Summary")
    print(divider)

    ordered = summary_df.sort_values("avg_failure_rate_pct", ascending=False)

    print(f"\n{'Model':<20} {'Trials':>8} {'Total Calls':>12} {'Failed':>10} {'Fail Rate':>10} {'Top Category':>20}")
    print("-" * 85)

    for _, row in ordered.iterrows():
        cats = row.get('failure_categories', {})
        top_cat, top_cat_count = '-', 0
        if cats:
            top_cat = max(cats, key=cats.get)
            top_cat_count = cats[top_cat]

        print(f"{row['model']:<20} {row['n_trials']:>8} {row['total_tool_calls']:>12} "
              f"{row['total_failed_calls']:>10} {row['avg_failure_rate_pct']:>9.2f}% "
              f"{top_cat} ({top_cat_count})")
432
+
433
def plot_failure_rate_by_model(summary_df: pd.DataFrame):
    """
    Figure 1: Overall failure rate per model (horizontal bar chart).

    Bars are sorted ascending so the lowest failure rate sits at the
    bottom; std-dev error bars and percentage labels are overlaid.
    Saves fig_failure_rate_by_model.png into OUTPUT_DIR.
    """
    plt.rcParams.update(PLOT_PARAMETERS)

    fig, ax = plt.subplots(figsize=(DOUBLE_COLUMN_WIDTH, 3.0))

    data = summary_df.sort_values("avg_failure_rate_pct", ascending=True)

    colors = get_color_palette(len(data))

    bars = ax.barh(data["model"], data["avg_failure_rate_pct"],
                   color=colors, edgecolor='black', linewidth=0.5)

    # Add error bars for std
    ax.errorbar(data["avg_failure_rate_pct"], range(len(data)),
                xerr=data["std_failure_rate_pct"], fmt='none',
                color='black', capsize=2, linewidth=0.5)

    # Add value labels, placed just past the end of each error bar
    for i, (bar, val, std) in enumerate(zip(bars, data["avg_failure_rate_pct"], data["std_failure_rate_pct"])):
        ax.text(val + std + 0.5, bar.get_y() + bar.get_height()/2,
                f'{val:.1f}%', va='center', ha='left', fontsize=MIN_FONT_SIZE - 1)

    ax.set_xlabel("Average Failure Rate (%)")
    # Leave headroom for the widest error bar plus its label.
    ax.set_xlim(0, data["avg_failure_rate_pct"].max() + data["std_failure_rate_pct"].max() + 5)

    plt.title("Tool Call Failure Rate")

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_failure_rate_by_model.png")
    plt.close(fig)
    print("Saved: fig_failure_rate_by_model.png")
468
+
469
+
470
def parse_dict_column(col_str):
    """
    Parse a dictionary column stored as its string repr (e.g. after a
    round-trip through CSV).

    Args:
        col_str: raw cell value; may already be NaN/None or '{}'.

    Returns:
        The parsed dict/literal, or {} when the cell is missing, empty,
        or not a valid Python literal.
    """
    if pd.isna(col_str) or col_str == '{}':
        return {}
    try:
        return ast.literal_eval(col_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises these for malformed or non-string input;
        # narrow catch replaces the original bare `except:`, which would
        # also have swallowed KeyboardInterrupt/SystemExit.
        return {}
478
+
479
+
480
def plot_failure_categories_stacked(summary_df: pd.DataFrame):
    """
    Figure 2: Failure category breakdown per model (stacked bar).

    Expects summary_df['failure_categories'] to hold dicts
    (category -> count) as produced by extract_all_data(). If the frame
    was re-loaded from CSV, run the column through parse_dict_column first.
    Only the 8 most frequent categories (across all models) are plotted.
    Saves fig_failure_categories_stacked.png into OUTPUT_DIR.
    """
    plt.rcParams.update(PLOT_PARAMETERS)

    fig, ax = plt.subplots(figsize=(DOUBLE_COLUMN_WIDTH, 3.0))

    data = summary_df.copy()

    # Aggregate category totals across models to pick which ones to show.
    # (Leftover debug print of the column removed.)
    all_categories = defaultdict(int)
    for cats in data['failure_categories']:
        for cat, count in cats.items():
            all_categories[cat] += count

    CATEGORY_COLORS = {
        'python_syntax': '#e41a1c',
        'python_type': '#377eb8',
        'python_name': '#4daf4a',
        'file_not_found': '#984ea3',
        'json_parse': '#ff7f00',
        'shell_command': '#a65628',
        'timeout': '#f781bf',
        'memory': '#999999',
        'other_python': '#66c2a5',
        'other': '#8da0cb',
    }

    # Keep the 8 categories with the highest overall counts.
    sorted_cats = sorted(all_categories.keys(), key=lambda x: all_categories[x], reverse=True)[:8]

    # Models with the fewest failures end up at the bottom of the chart.
    data = data.sort_values('total_failed_calls', ascending=True)

    bottom = np.zeros(len(data))

    for cat in sorted_cats:
        values = [row['failure_categories'].get(cat, 0) for _, row in data.iterrows()]
        color = CATEGORY_COLORS.get(cat, '#888888')
        ax.barh(data['model'], values, left=bottom,
                label=cat.replace('_', ' ').title(), color=color,
                edgecolor='white', linewidth=0.3)
        bottom += values

    ax.set_xlabel("Number of Failed Tool Calls")
    ax.legend(loc='lower right', ncol=2, fontsize=MIN_FONT_SIZE - 1,
              framealpha=0.9, bbox_to_anchor=(1.0, 0.0))

    plt.title("Tool Failure Category Distribution")

    plt.tight_layout()
    plt.show()
    fig.savefig(OUTPUT_DIR / "fig_failure_categories_stacked.png")
    plt.close(fig)
    print("Saved: fig_failure_categories_stacked.png")
540
+
541
+
542
def main():
    """CLI entry point: extract, persist, summarize, and plot tool failures."""
    for line in (
        "Extracting tool call failure data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
    ):
        print(line)

    summary_df, detail_df, failures_df = extract_all_data()

    # Guard clause: nothing extracted, nothing to report.
    if len(summary_df) == 0:
        print("No data extracted!")
        return

    save_data(summary_df, detail_df, failures_df)
    print_summary(summary_df)

    plot_failure_categories_stacked(summary_df)


if __name__ == "__main__":
    main()
560
+
analysis_src/model_styles.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Shared model styling configuration for paper analysis figures.
4
+
5
+ Provides consistent colors, markers, and display names across all agents
6
+ (EOG, React with Code, and future agents).
7
+
8
+ Usage:
9
+ from paper_analysis.common.model_styles import get_model_style, MODEL_DISPLAY_NAMES
10
+
11
+ style = get_model_style("GPT-5.1")
12
+ ax.scatter(x, y, c=style['color'], marker=style['marker'], ...)
13
+ """
14
+
15
+ import seaborn as sns
16
+
17
+ # =============================================================================
18
+ # MODEL DISPLAY NAMES
19
+ # Maps various raw names to standardized display names
20
+ # =============================================================================
21
+
22
+ MODEL_DISPLAY_NAMES = {
23
+ # OpenAI / Azure
24
+ "Azure_gpt-5.1-2025-11-13": "GPT-5.1",
25
+ "Azure_gpt-5.1-chat-2025-11-13": "GPT-5.1",
26
+ "Azure_o4-mini": "o4-mini",
27
+ "Azure_gpt-4o": "GPT-4o",
28
+ "openai_gpt-oss-120b": "GPT-OSS-120B",
29
+ "openai_gpt-oss-20b": "GPT-OSS-20B",
30
+ # Google / GCP
31
+ "GCP_gemini-2.5-pro": "Gemini 2.5 Pro",
32
+ "gemini-2.5-pro": "Gemini 2.5 Pro",
33
+ "Gemini-2.5-Pro": "Gemini 2.5 Pro",
34
+ "gcp_gemini-3-pro-preview": "Gemini 3 Pro",
35
+ "gemini-3-pro-preview": "Gemini 3 Pro",
36
+ "Gemini-3-Pro": "Gemini 3 Pro",
37
+ "gemini-3-flash-preview": "Gemini 3 Flash",
38
+ "Gemini-3-Flash": "Gemini 3 Flash",
39
+ "google_gemini-3-flash-preview": "Gemini 3 Flash",
40
+ # Moonshot AI
41
+ "moonshotai_kimi-k2-thinking": "Kimi K2",
42
+ "kimi-k2-thinking": "Kimi K2",
43
+ "Kimi-K2": "Kimi K2",
44
+ # Anthropic / AWS
45
+ "aws_claude-opus-4-5": "Claude Opus 4.5",
46
+ "Claude-Opus-4.5": "Claude Opus 4.5",
47
+ # Mistral AI
48
+ "mistralai_mistral-large-2512": "Mistral Large",
49
+ "Mistral-Large": "Mistral Large",
50
+ # Alibaba / Qwen
51
+ "qwen_qwen3-vl-32b-instruct": "Qwen3-VL-32B",
52
+ # ServiceNow
53
+ "ServiceNow-AI_Apriel-1.6-15b-Thinker": "Apriel-1.6-15B",
54
+ # Minimax
55
+ "minimax_minimax-m2.1": "Minimax M2.1",
56
+ }
57
+
58
+ # =============================================================================
59
+ # MODEL STYLES
60
+ # Defines color and marker for each model (by display name)
61
+ # Colors are colorblind-friendly, markers provide redundant encoding
62
+ # =============================================================================
63
+
64
# Colorblind-friendly palette (based on IBM Design Library / Wong palette)
_COLORS = {
    'blue': '#0072B2',
    'orange': '#E69F00',
    'green': '#009E73',
    'pink': '#CC79A7',
    'light_blue': '#56B4E9',
    'yellow': '#F0E442',
    'red': '#D55E00',
    'gray': '#999999',
    'purple': '#9467BD',
    'brown': '#8C564B',
    'teal': '#17BECF',
}

# Marker styles for redundant encoding
# (models that share a color remain distinguishable via marker shape,
# e.g. in grayscale prints)
_MARKERS = {
    'circle': 'o',
    'square': 's',
    'diamond': 'D',
    'triangle_up': '^',
    'triangle_down': 'v',
    'pentagon': 'p',
    'hexagon': 'h',
    'star': '*',
    'plus': 'P',
    'x': 'X',
}
92
+
93
# Model style definitions (display_name -> {color, marker})
# NOTE(review): some colors are reused across models (green: GPT-5.1/GPT-4o;
# brown: GPT-OSS-20B/Apriel; teal: Gemini 2.5 Pro/Qwen3-VL-32B) — the marker
# shape is what disambiguates them in a shared figure.
MODEL_STYLES = {
    # Google models - shades of blue
    "Gemini 3 Flash": {
        'color': _COLORS['blue'],
        'marker': _MARKERS['circle'],
    },
    "Gemini 3 Pro": {
        'color': _COLORS['light_blue'],
        'marker': _MARKERS['circle'],
    },
    "Gemini 2.5 Pro": {
        'color': _COLORS['teal'],
        'marker': _MARKERS['circle'],
    },

    # OpenAI models - shades of green/orange
    "GPT-5.1": {
        'color': _COLORS['green'],
        'marker': _MARKERS['square'],
    },
    "GPT-4o": {
        'color': _COLORS['green'],
        'marker': _MARKERS['diamond'],
    },
    "o4-mini": {
        'color': _COLORS['yellow'],
        'marker': _MARKERS['square'],
    },
    "GPT-OSS-120B": {
        'color': _COLORS['orange'],
        'marker': _MARKERS['triangle_up'],
    },
    "GPT-OSS-20B": {
        'color': _COLORS['brown'],
        'marker': _MARKERS['triangle_down'],
    },

    # Anthropic models - pink
    "Claude Opus 4.5": {
        'color': _COLORS['pink'],
        'marker': _MARKERS['diamond'],
    },

    # Moonshot AI - red
    "Kimi K2": {
        'color': _COLORS['red'],
        'marker': _MARKERS['pentagon'],
    },

    # Mistral - purple
    "Mistral Large": {
        'color': _COLORS['purple'],
        'marker': _MARKERS['hexagon'],
    },

    # Minimax - gray
    "Minimax M2.1": {
        'color': _COLORS['gray'],
        'marker': _MARKERS['star'],
    },

    # Qwen - teal
    "Qwen3-VL-32B": {
        'color': _COLORS['teal'],
        'marker': _MARKERS['plus'],
    },

    # ServiceNow - brown
    "Apriel-1.6-15B": {
        'color': _COLORS['brown'],
        'marker': _MARKERS['x'],
    },
}

# Default style for unknown models (gray circle)
_DEFAULT_STYLE = {
    'color': _COLORS['gray'],
    'marker': _MARKERS['circle'],
}
173
+
174
+
175
def get_display_name(raw_name: str) -> str:
    """Map a raw model identifier to its standardized display name.

    Unknown names pass through unchanged.
    """
    try:
        return MODEL_DISPLAY_NAMES[raw_name]
    except KeyError:
        return raw_name
178
+
179
+
180
def get_model_style(model_name: str) -> dict:
    """
    Get the style (color, marker) for a model.

    Args:
        model_name: Either raw name or display name

    Returns:
        Dict with 'color' and 'marker' keys; unknown models fall back to
        the shared default style.
    """
    # Prefer an exact hit, then retry after normalizing to a display name.
    for candidate in (model_name, get_display_name(model_name)):
        style = MODEL_STYLES.get(candidate)
        if style is not None:
            return style

    return _DEFAULT_STYLE
201
+
202
+
203
def get_model_color(model_name: str) -> str:
    """Get just the color for a model."""
    style = get_model_style(model_name)
    return style['color']
206
+
207
+
208
def get_model_marker(model_name: str) -> str:
    """Get just the marker for a model."""
    style = get_model_style(model_name)
    return style['marker']
211
+
212
+
213
# For backward compatibility - create a color palette list
def get_color_palette(n_colors: int = 10):
    """Get a colorblind-friendly palette with n colors (at most 10)."""
    palette_order = ('blue', 'orange', 'green', 'pink', 'light_blue',
                     'red', 'purple', 'brown', 'teal', 'yellow')
    selected = palette_order[:n_colors]
    return [_COLORS[name] for name in selected]
219
+
220
SINGLE_COLUMN_WIDTH = 3.25  # inches (ICML)
DOUBLE_COLUMN_WIDTH = 6.75  # inches (ICML)
MIN_FONT_SIZE = 10  # smallest font size used anywhere in a figure

# Shared matplotlib rcParams applied (via plt.rcParams.update) by every
# figure-producing function.
PLOT_PARAMETERS = {
    'font.size': MIN_FONT_SIZE,
    'font.family': 'serif',
    'axes.labelsize': MIN_FONT_SIZE + 1,
    'axes.titlesize': MIN_FONT_SIZE + 2,
    'xtick.labelsize': MIN_FONT_SIZE,
    'ytick.labelsize': MIN_FONT_SIZE,
    'legend.fontsize': MIN_FONT_SIZE,
    'figure.titlesize': MIN_FONT_SIZE + 2,
    'figure.dpi': 150,   # on-screen preview resolution
    'savefig.dpi': 300,  # print-quality output
    'savefig.bbox': 'tight',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.linewidth': 0.8,
    'lines.linewidth': 1.0,
    'patch.linewidth': 0.5,
}
analysis_src/utils.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
# Model display names (short for figures)
# Follows ArtificialAnalysis.ai naming conventions
# Keys are raw model identifiers as they appear in run-directory names
# (often provider-prefixed); several raw keys intentionally map to the
# same display name.
MODEL_DISPLAY_NAMES = {
    # OpenAI / Azure
    "Azure_gpt-5.1-2025-11-13": "GPT-5.1",
    "Azure_gpt-5.1-chat-2025-11-13": "GPT-5.1",
    "Azure_o4-mini": "o4-mini",
    "Azure_gpt-4o": "GPT-4o",
    "openai_gpt-oss-120b": "GPT-OSS-120B",
    "openai_gpt-oss-20b": "GPT-OSS-20B",
    # Google / GCP
    "GCP_gemini-2.5-pro": "Gemini 2.5 Pro",
    "gemini-2.5-pro": "Gemini 2.5 Pro",
    "gcp_gemini-3-pro-preview": "Gemini 3 Pro",
    "gemini-3-pro-preview": "Gemini 3 Pro",
    "gemini-3-flash-preview": "Gemini 3 Flash",
    "google_gemini-3-flash-preview": "Gemini 3 Flash",
    # Moonshot AI
    "moonshotai_kimi-k2-thinking": "Kimi K2",
    "kimi-k2-thinking": "Kimi K2",
    # Anthropic / AWS
    "aws_claude-opus-4-5": "Claude Opus 4.5",
    # Mistral AI
    "mistralai_mistral-large-2512": "Mistral Large",
    # Alibaba / Qwen
    "qwen_qwen3-vl-32b-instruct": "Qwen3-VL-32B",
    # ServiceNow
    "ServiceNow-AI_Apriel-1.6-15b-Thinker": "Apriel-1.6-15B",
}
33
+
34
+
35
def get_model_name(dirname: str) -> str:
    """Extract model name from directory name.

    Strips the agent prefix and the hard-coded commit-hash suffix, then
    maps the remaining raw identifier to its short display name when one
    is registered; otherwise the stripped raw name is returned as-is.
    """
    raw = dirname.replace("react with code_", "").replace("_07ccdb1", "")
    return MODEL_DISPLAY_NAMES.get(raw, raw)
39
+
40
+
41
def find_react_with_code_dirs(leaderboard_dir: Path) -> list[Path]:
    """Find all 'react with code' agent directories (non-backup)."""
    # NOTE(review): a name starting with "react with code_" can never also
    # start with "backup_", so the second test is redundant; it is kept to
    # preserve the original behavior exactly.
    matches = [
        entry
        for entry in leaderboard_dir.iterdir()
        if entry.is_dir()
        and entry.name.startswith("react with code_")
        and not entry.name.startswith("backup_")
    ]
    return sorted(matches)
48
+
49
+
50
def read_judge_outputs_from_dir(agent_dir: Path) -> dict[str, list[dict]]:
    """
    Read all judge_output.json files from an agent directory.

    Expected layout: agent_dir/Scenario*/<trial>/judge_output.json,
    where <trial> is a per-trial subdirectory (1, 2, 3, ...).

    Returns:
        Dict mapping scenario_id -> list of judge outputs (one per trial)
    """
    results: dict[str, list[dict]] = {}

    for scenario_dir in agent_dir.iterdir():
        if not (scenario_dir.is_dir() and scenario_dir.name.startswith("Scenario")):
            continue

        parsed_trials = []
        # Trials are read in sorted order so the list index tracks the trial order.
        for trial_dir in sorted(scenario_dir.iterdir()):
            if not trial_dir.is_dir():
                continue
            judge_file = trial_dir / "judge_output.json"
            if not judge_file.exists():
                continue
            try:
                with open(judge_file) as fh:
                    parsed_trials.append(json.load(fh))
            except Exception as e:
                # Best-effort scan: report a malformed file but keep going.
                print(f" Warning: Error reading {judge_file}: {e}")

        # Scenarios with no readable trials are omitted entirely.
        if parsed_trials:
            results[scenario_dir.name] = parsed_trials

    return results
84
+
85
+
86
def extract_trial_scores_from_judge_outputs(
    scenario_data: dict[str, list[dict]],
    metric: str
) -> dict[str, list[float]]:
    """
    Extract per-trial scores for a given metric from judge outputs.

    A missing or null metric value is coerced to 0.0; scenarios with no
    trials are dropped from the result.

    Args:
        scenario_data: Dict mapping scenario_id -> list of judge outputs
        metric: The metric name to extract

    Returns:
        Dict mapping scenario_id -> list of trial scores
    """
    per_scenario: dict[str, list[float]] = {}

    for scenario_id, trials in scenario_data.items():
        trial_scores = []
        for trial in trials:
            value = trial.get("flat_scores", {}).get(metric)
            # None (missing or explicit null) counts as a zero score.
            trial_scores.append(0.0 if value is None else float(value))

        if trial_scores:
            per_scenario[scenario_id] = trial_scores

    return per_scenario
117
+
118
+
119
def get_runs_stats(scenario_data: dict[str, list], min_runs_required: int) -> tuple[int, int, int, int]:
    """Get run statistics: (n_scenarios, min_runs, max_runs, n_qualifying)."""
    if not scenario_data:
        # Nothing recorded: every statistic is zero.
        return 0, 0, 0, 0

    counts = sorted(len(trials) for trials in scenario_data.values())
    qualifying = len([c for c in counts if c >= min_runs_required])
    # counts is sorted, so its ends give the min and max run counts.
    return len(counts), counts[0], counts[-1], qualifying
127
+
128
+
129
def filter_scenarios_with_min_runs(scenario_data: dict[str, list], min_runs_required: int) -> dict[str, list]:
    """Filter to only include scenarios with >= min_runs_required runs."""
    kept: dict[str, list] = {}
    for scenario_id, trials in scenario_data.items():
        if len(trials) >= min_runs_required:
            kept[scenario_id] = trials
    return kept
136
+
137
+ def find_latest_rollout_file(trial_dir: Path) -> Path:
138
+ """Find the latest rollout file in a trial's sessions directory."""
139
+ sessions_dir = trial_dir / "sessions"
140
+ if not sessions_dir.exists():
141
+ return None
142
+
143
+ rollout_files = []
144
+ for rollout_file in sessions_dir.rglob("rollout-*.jsonl"):
145
+ rollout_files.append(rollout_file)
146
+
147
+ if not rollout_files:
148
+ return None
149
+
150
+ # Sort by filename (which includes timestamp) and return the latest
151
+ # rollout_files.sort(key=lambda f: f.name, reverse=True)
152
+ # return rollout_files[0]
153
+
154
+ return max(rollout_files, key=lambda p: p.stat().st_mtime)
155
+
evaluation.ipynb ADDED
The diff for this file is too large to render. See raw diff