Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Extract consistency (ICC) and performance data for all 'react with code' agents. | |
| This script reads directly from the run directories (not JSON result files) | |
| to ensure all trials are captured. | |
| Output is saved as CSV files (plus an analysis_summary.json) to ITBench-SRE-Agent/ITBench-Trajectories/output/consistency/ under the project root, for plotting. | |
| """ | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from dataclasses import dataclass, field | |
| import numpy as np | |
| import pandas as pd | |
| from tqdm import tqdm | |
# Put the directory two levels above this file at the front of the import
# search path so that the `analysis_src` package can be imported.
_THIS_FILE = Path(__file__)
PROJECT_ROOT = _THIS_FILE.parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
| from analysis_src.consistency import ( | |
| compute_icc, | |
| ConsistencyMetrics, | |
| ) | |
| from analysis_src.utils import ( | |
| get_model_name, | |
| find_react_with_code_dirs, | |
| read_judge_outputs_from_dir, | |
| extract_trial_scores_from_judge_outputs, | |
| get_runs_stats, | |
| filter_scenarios_with_min_runs, | |
| ) | |
# Input and output locations, all under
# PROJECT_ROOT/ITBench-SRE-Agent/ITBench-Trajectories.
_TRAJECTORIES_DIR = PROJECT_ROOT / "ITBench-SRE-Agent" / "ITBench-Trajectories"
LEADERBOARD_DIR = _TRAJECTORIES_DIR / "ReAct-Agent-Trajectories"
RESULTS_JSON_DIR = LEADERBOARD_DIR / "results"
OUTPUT_DIR = _TRAJECTORIES_DIR / "output" / "consistency"

# A scenario is only analysed when it has at least this many runs.
MIN_RUNS_PER_SCENARIO = 3

# A model is only analysed when at least this many of its scenarios reach
# MIN_RUNS_PER_SCENARIO runs.
MIN_QUALIFYING_SCENARIOS = 20

# (metric key, short display label) pairs; the order here is the order in
# which metrics are analysed.
_METRIC_LABELS = (
    ("root_cause_entity_f1", "RC Entity F1"),
    ("root_cause_entity_precision", "RC Entity Prec"),
    ("root_cause_entity_recall", "RC Entity Rec"),
    ("root_cause_proximity_with_fp_f1", "RC Proximity F1"),
    ("propagation_chain", "Prop. Chain"),
    ("fault_localization_component_identification", "Fault Loc."),
)

# Metrics to analyze
METRICS = [metric_key for metric_key, _label in _METRIC_LABELS]

# Short names for display
METRIC_SHORT_NAMES = dict(_METRIC_LABELS)
def extract_all_data() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Collect consistency (ICC) and performance records for every agent directory.

    Agent directories are discovered under LEADERBOARD_DIR and their judge
    outputs are read straight from disk. A model is skipped when
    get_runs_stats reports no scenarios at all, or fewer than
    MIN_QUALIFYING_SCENARIOS qualifying scenarios for MIN_RUNS_PER_SCENARIO.
    For every remaining model the scenarios are filtered with
    filter_scenarios_with_min_runs and each entry of METRICS is analysed.

    Returns:
    - icc_df: one row per model and metric with ICC and related consistency
      fields (a NaN ICC is stored as 0.0, a NaN flakiness ratio as 1.0)
    - perf_df: one row per model and metric with the mean of all trial scores
    - scenario_df: per-scenario mean/std/trials/is_flaky, recorded only for
      root_cause_entity_f1
    """
    candidate_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(candidate_dirs)} 'react with code' agent directories:")
    for candidate in candidate_dirs:
        print(f" - {candidate.name}")

    icc_rows, perf_rows, scenario_rows = [], [], []
    included, excluded = [], []

    for run_dir in tqdm(candidate_dirs, desc="Reading agent data"):
        model = get_model_name(run_dir.name)
        print(f"\nReading: {run_dir.name}")

        judged = read_judge_outputs_from_dir(run_dir)
        # Only the total and qualifying counts are needed; the min/max run
        # counts returned alongside them are ignored.
        total, _fewest_runs, _most_runs, qualifying = get_runs_stats(judged, MIN_RUNS_PER_SCENARIO)

        if total == 0:
            print(f" SKIPPING {model}: No judge outputs found")
            excluded.append((model, "No data", 0))
            continue
        if qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model}: Only {qualifying}/{total} scenarios have {MIN_RUNS_PER_SCENARIO}+ runs")
            excluded.append((model, f"{qualifying}/{total} qualifying", qualifying))
            continue

        # From here on only scenarios with enough runs are considered.
        judged = filter_scenarios_with_min_runs(judged, MIN_RUNS_PER_SCENARIO)
        print(f" Processing: {model} ({len(judged)} scenarios with {MIN_RUNS_PER_SCENARIO}+ runs)")
        included.append(model)

        for metric in tqdm(METRICS, desc=f" {model} metrics", leave=False):
            trials_by_scenario = extract_trial_scores_from_judge_outputs(judged, metric)
            if not trials_by_scenario:
                continue

            label = METRIC_SHORT_NAMES.get(metric, metric)

            # Performance: plain mean over every trial of every scenario.
            flat_scores = []
            for trial_scores in trials_by_scenario.values():
                flat_scores.extend(trial_scores)
            if flat_scores:
                mean_score = np.mean(flat_scores)
            else:
                mean_score = 0.0
            perf_rows.append({
                "model": model,
                "metric": label,
                "metric_raw": metric,
                "performance": mean_score,
            })

            # Consistency: any failure here is reported and the metric is
            # skipped for ICC only; the performance row above is kept.
            try:
                stats = compute_icc(trials_by_scenario)

                icc_value = stats.icc
                if np.isnan(icc_value):
                    icc_value = 0.0
                flakiness = stats.flakiness_ratio
                if np.isnan(flakiness):
                    flakiness = 1.0

                icc_rows.append({
                    "model": model,
                    "metric": label,
                    "metric_raw": metric,
                    "icc": icc_value,
                    "flakiness": flakiness,
                    "within_std": stats.mean_within_std,
                    "agreement_rate": stats.mean_agreement_rate,
                    "n_flaky_scenarios": stats.n_flaky_scenarios,
                    "n_scenarios": stats.n_scenarios,
                })

                # The per-scenario breakdown is kept for one metric only.
                if metric == "root_cause_entity_f1":
                    for scenario_id, detail in stats.scenario_details.items():
                        scenario_rows.append({
                            "model": model,
                            "scenario": scenario_id,
                            "mean": detail["mean"],
                            "std": detail["std"],
                            "trials": detail["trials"],
                            "is_flaky": detail["is_flaky"],
                        })
            except Exception as err:
                print(f" Error computing ICC for {metric}: {err}")

    if excluded:
        print(f"\n⚠️ Skipped {len(excluded)} models:")
        for skipped_name, why, _count in excluded:
            print(f" - {skipped_name}: {why}")
    print(f"\n✓ Included {len(included)} models: {included}")

    icc_df = pd.DataFrame(icc_rows)
    perf_df = pd.DataFrame(perf_rows)
    scenario_df = pd.DataFrame(scenario_rows)
    return icc_df, perf_df, scenario_df
def save_data(icc_df: pd.DataFrame, perf_df: pd.DataFrame, scenario_df: pd.DataFrame):
    """
    Save extracted data to CSV files and write a small JSON summary.

    Creates OUTPUT_DIR if needed, then writes icc_data.csv,
    performance_data.csv, scenario_data.csv (all without the index column)
    and analysis_summary.json, printing each path as it goes.

    Args:
    - icc_df: ICC rows; the summary reads its "model", "metric" and
      "n_scenarios" columns when it has at least one row
    - perf_df: performance rows
    - scenario_df: per-scenario rows

    The summary holds the distinct models and metrics found in icc_df, the
    largest "n_scenarios" value, and MIN_RUNS_PER_SCENARIO. When icc_df has no
    rows the summary falls back to empty lists and 0 instead of raising.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    icc_path = OUTPUT_DIR / "icc_data.csv"
    perf_path = OUTPUT_DIR / "performance_data.csv"
    scenario_path = OUTPUT_DIR / "scenario_data.csv"

    icc_df.to_csv(icc_path, index=False)
    perf_df.to_csv(perf_path, index=False)
    scenario_df.to_csv(scenario_path, index=False)

    print("\nData saved to:")
    print(f" - {icc_path}")
    print(f" - {perf_path}")
    print(f" - {scenario_path}")

    # A DataFrame built from an empty record list has no columns at all, so
    # every column lookup must be guarded, not just the n_scenarios one.
    has_rows = len(icc_df) > 0
    summary = {
        "models": icc_df["model"].unique().tolist() if has_rows else [],
        "metrics": icc_df["metric"].unique().tolist() if has_rows else [],
        "n_scenarios": int(icc_df["n_scenarios"].max()) if has_rows else 0,
        "min_runs_required": MIN_RUNS_PER_SCENARIO,
    }

    summary_path = OUTPUT_DIR / "analysis_summary.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f" - {summary_path}")
def print_summary(icc_df: pd.DataFrame, perf_df: pd.DataFrame):
    """
    Print two console tables restricted to the root_cause_entity_f1 metric.

    The first table lists ICC, flakiness and agreement (both as percentages)
    and within-scenario std per model, highest ICC first. The second lists
    the average performance per model, highest score first.
    """
    headline_metric = "root_cause_entity_f1"
    banner = "=" * 80

    def _section(title):
        # Blank line, rule, title, rule.
        print("\n" + banner)
        print(title)
        print(banner)

    _section("ICC Summary (root_cause_entity_f1)")
    icc_rows = icc_df[icc_df["metric_raw"] == headline_metric].sort_values("icc", ascending=False)
    print(f"\n{'Model':<20} {'ICC':>8} {'Flaky%':>8} {'Std':>8} {'Agree%':>8}")
    print("-" * 56)
    for _, rec in icc_rows.iterrows():
        flaky_pct = rec["flakiness"] * 100
        agree_pct = rec["agreement_rate"] * 100
        print(f"{rec['model']:<20} {rec['icc']:>8.4f} {flaky_pct:>7.1f}% {rec['within_std']:>8.4f} {agree_pct:>7.1f}%")

    _section("Performance Summary (root_cause_entity_f1)")
    perf_rows = perf_df[perf_df["metric_raw"] == headline_metric].sort_values("performance", ascending=False)
    print(f"\n{'Model':<20} {'Avg Score':>12}")
    print("-" * 34)
    for _, rec in perf_rows.iterrows():
        print(f"{rec['model']:<20} {rec['performance']:>12.4f}")
def main():
    """
    Run the extraction end to end.

    Prints the run configuration, extracts the data, and, unless the ICC
    table came back without rows, saves the results and prints the summary
    tables.
    """
    startup_lines = (
        "Extracting consistency data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
        f"Minimum runs per scenario: {MIN_RUNS_PER_SCENARIO}",
    )
    for line in startup_lines:
        print(line)

    icc_df, perf_df, scenario_df = extract_all_data()

    # Nothing to save or summarise when no ICC rows were produced.
    if not len(icc_df):
        print("No data extracted!")
        return

    save_data(icc_df, perf_df, scenario_df)
    print_summary(icc_df, perf_df)


if __name__ == "__main__":
    main()