Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Extract Majority Vote and consistency breakdown data for all 'react with code' agents. | |
| This script computes: | |
| - Pass@k: At least 1 trial succeeds | |
| - Majority@k: Majority of trials succeed | |
| - All@k: All trials succeed | |
| - Consistency breakdown: Consistent Correct, Consistent Wrong, Inconsistent | |
| Output is saved as CSV files to ITBench-SRE-Agent/ITBench-Trajectories/output/consistency/ under the project root (see OUTPUT_DIR). | |
| """ | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from itertools import combinations | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from tqdm import tqdm | |
| # Add project root to path | |
| PROJECT_ROOT = Path(__file__).parent.parent | |
| sys.path.insert(0, str(PROJECT_ROOT)) | |
| from analysis_src.utils import ( | |
| get_model_name, | |
| find_react_with_code_dirs, | |
| read_judge_outputs_from_dir, | |
| extract_trial_scores_from_judge_outputs, | |
| filter_scenarios_with_min_runs, | |
| ) | |
| from analysis_src.model_styles import ( | |
| get_model_style, MIN_FONT_SIZE, SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH, PLOT_PARAMETERS | |
| ) | |
# Where agent trajectories are read from and where CSV/figure output is written.
LEADERBOARD_DIR = PROJECT_ROOT.joinpath(
    "ITBench-SRE-Agent", "ITBench-Trajectories", "ReAct-Agent-Trajectories"
)
OUTPUT_DIR = PROJECT_ROOT.joinpath(
    "ITBench-SRE-Agent", "ITBench-Trajectories", "output", "consistency"
)

# Minimum run count handed to filter_scenarios_with_min_runs for each scenario.
MIN_RUNS_PER_SCENARIO = 2

# A model is skipped when fewer scenarios than this remain after filtering.
MIN_QUALIFYING_SCENARIOS = 20

# A trial counts as a success when its score is >= this value.
SUCCESS_THRESHOLD = 0.5
def compute_majority_vote_metrics(
    scenario_trials: dict[str, list[float]],
    success_threshold: float = SUCCESS_THRESHOLD
) -> dict:
    """
    Summarise how reliably each scenario is solved across repeated trials.

    Every scenario is cut down to its first ``k`` trials, where ``k`` is the
    smallest trial count found in ``scenario_trials``, so that all scenarios
    are judged on the same number of attempts. A trial is a success when its
    score is ``>= success_threshold``.

    Args:
        scenario_trials: Mapping of scenario name to its list of trial scores.
        success_threshold: Minimum score for a trial to count as a success.

    Returns:
        ``None`` when the mapping is empty or some scenario has no trials.
        Otherwise a dict with:
        - n_scenarios, n_trials (``k``), threshold
        - pass_at_k / majority_at_k / all_at_k: fraction of scenarios with at
          least one, more than half, or all ``k`` trials successful
        - consistent_correct / consistent_wrong / inconsistent: fraction of
          scenarios where all trials succeed, all fail, or results are mixed
        - n_pass, n_majority, n_all, n_consistent_correct,
          n_consistent_wrong, n_inconsistent: the matching raw counts
        - overall_mean / overall_std: mean and std over every retained score
        - scenario_details: one dict per scenario (success count, majority
          flag, consistency type, mean and std of its retained scores)
    """
    trial_counts = [len(runs) for runs in scenario_trials.values()]
    if not trial_counts:
        return None

    k = min(trial_counts)
    if k < 1:
        return None

    n_scenarios = len(scenario_trials)
    tally = {"pass": 0, "majority": 0, "correct": 0, "wrong": 0, "inconsistent": 0}
    per_scenario = []
    pooled_scores = []

    for name, runs in scenario_trials.items():
        window = runs[:k]
        pooled_scores.extend(window)

        hits = sum(1 for score in window if score >= success_threshold)
        has_majority = hits > k / 2

        # Classify the scenario: every trial passed, none passed, or a mix.
        if hits == k:
            outcome = "correct"
        elif hits == 0:
            outcome = "wrong"
        else:
            outcome = "inconsistent"
        tally[outcome] += 1

        if hits >= 1:
            tally["pass"] += 1
        if has_majority:
            tally["majority"] += 1

        per_scenario.append({
            "scenario": name,
            "n_success": hits,
            "n_trials": k,
            "majority_correct": has_majority,
            "consistency_type": outcome,
            "mean_score": np.mean(window),
            "std_score": np.std(window) if len(window) > 1 else 0,
        })

    # "All trials succeed" and "consistently correct" are the same count.
    return {
        "n_scenarios": n_scenarios,
        "n_trials": k,
        "threshold": success_threshold,
        "pass_at_k": tally["pass"] / n_scenarios,
        "majority_at_k": tally["majority"] / n_scenarios,
        "all_at_k": tally["correct"] / n_scenarios,
        "consistent_correct": tally["correct"] / n_scenarios,
        "consistent_wrong": tally["wrong"] / n_scenarios,
        "inconsistent": tally["inconsistent"] / n_scenarios,
        "n_pass": tally["pass"],
        "n_majority": tally["majority"],
        "n_all": tally["correct"],
        "n_consistent_correct": tally["correct"],
        "n_consistent_wrong": tally["wrong"],
        "n_inconsistent": tally["inconsistent"],
        "overall_mean": np.mean(pooled_scores),
        "overall_std": np.std(pooled_scores),
        "scenario_details": per_scenario,
    }
# (judge-output score key, display label) pairs. The key is what gets passed to
# extract_trial_scores_from_judge_outputs; the label tags the output rows and,
# lower-cased, becomes the suffix of the per-metric CSV file names.
METRICS = list(
    {
        "root_cause_entity_f1": "F1",
        "root_cause_entity_precision": "Precision",
        "root_cause_entity_recall": "Recall",
    }.items()
)
def extract_all_data() -> dict[str, tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Extract majority vote data for all agents, for multiple metrics.

    Judge outputs are read once per agent directory found under
    LEADERBOARD_DIR. A model is skipped when it has no judge outputs or when
    fewer than MIN_QUALIFYING_SCENARIOS scenarios remain after
    filter_scenarios_with_min_runs. The surviving data is then scored once per
    entry in METRICS.

    Returns:
        dict mapping metric label -> (summary_df, scenario_df), where
        summary_df has one row per model and scenario_df one row per
        (model, scenario). A metric for which no model produced usable scores
        is left out, so the dict is empty when nothing could be extracted and
        callers can test it with ``if not results``.
    """
    summary_fields = (
        "n_scenarios", "n_trials",
        "pass_at_k", "majority_at_k", "all_at_k",
        "consistent_correct", "consistent_wrong", "inconsistent",
        "overall_mean", "overall_std",
    )
    scenario_fields = (
        "scenario", "n_success", "n_trials", "majority_correct",
        "consistency_type", "mean_score", "std_score",
    )

    agent_dirs = find_react_with_code_dirs(LEADERBOARD_DIR)
    print(f"Found {len(agent_dirs)} 'react with code' agent directories:")
    for d in agent_dirs:
        print(f" - {d.name}")

    # Read all judge outputs once; they are reused for every metric below.
    agent_data = {}
    valid_models = []
    skipped_models = []
    for agent_dir in tqdm(agent_dirs, desc="Reading agent data"):
        model_name = get_model_name(agent_dir.name)
        print(f"\nReading: {agent_dir.name}")

        scenario_data = read_judge_outputs_from_dir(agent_dir)
        if not scenario_data:
            print(f" SKIPPING {model_name}: No judge outputs found")
            skipped_models.append((model_name, "No data"))
            continue

        # Filter scenarios with minimum runs
        scenario_data = filter_scenarios_with_min_runs(scenario_data, MIN_RUNS_PER_SCENARIO)
        n_qualifying = len(scenario_data)
        if n_qualifying < MIN_QUALIFYING_SCENARIOS:
            print(f" SKIPPING {model_name}: Only {n_qualifying} scenarios with {MIN_RUNS_PER_SCENARIO}+ runs")
            skipped_models.append((model_name, f"{n_qualifying} qualifying"))
            continue

        print(f" Processing: {model_name} ({n_qualifying} scenarios)")
        valid_models.append(model_name)
        agent_data[model_name] = scenario_data

    if skipped_models:
        print(f"\n⚠️ Skipped {len(skipped_models)} models:")
        for name, reason in skipped_models:
            print(f" - {name}: {reason}")
    print(f"\n✓ Included {len(valid_models)} models: {valid_models}")

    # Nothing qualified: return an empty result instead of a dict of
    # column-less DataFrames, which would break sorting/printing downstream.
    if not agent_data:
        return {}

    # Extract for each metric
    results = {}
    for metric_key, metric_label in tqdm(METRICS, desc="Processing metrics"):
        print(f"\n--- Extracting for metric: {metric_label} ({metric_key}) ---")
        summary_records = []
        scenario_records = []

        for model_name, scenario_data in tqdm(agent_data.items(), desc=f" {metric_label}", leave=False):
            scenario_trials = extract_trial_scores_from_judge_outputs(scenario_data, metric_key)
            metrics = compute_majority_vote_metrics(scenario_trials)
            if metrics is None:
                continue

            tags = {"model": model_name, "metric": metric_label}
            summary_records.append(
                {**tags, **{field: metrics[field] for field in summary_fields}}
            )
            scenario_records.extend(
                {**tags, **{field: detail[field] for field in scenario_fields}}
                for detail in metrics["scenario_details"]
            )

        if not summary_records:
            # A frame built from no records has no columns at all; skip it.
            print(f" SKIPPING metric {metric_label}: no model produced usable scores")
            continue

        results[metric_label] = (
            pd.DataFrame(summary_records),
            pd.DataFrame(scenario_records),
        )
    return results
def save_data(results: dict[str, tuple[pd.DataFrame, pd.DataFrame]]) -> None:
    """
    Save extracted data to CSV files for each metric.

    For every metric label in ``results`` two files are written to OUTPUT_DIR
    (created if missing): ``majority_vote_data_<label>.csv`` for the per-model
    summary and ``majority_vote_scenarios_<label>.csv`` for the per-scenario
    rows, with ``<label>`` lower-cased. When an "F1" entry is present, its two
    frames are additionally written without the ``metric`` column under the
    un-suffixed file names for backward compatibility.

    Args:
        results: Mapping of metric label -> (summary_df, scenario_df).
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for metric_label, (summary_df, scenario_df) in results.items():
        metric_suffix = metric_label.lower()
        summary_path = OUTPUT_DIR / f"majority_vote_data_{metric_suffix}.csv"
        scenario_path = OUTPUT_DIR / f"majority_vote_scenarios_{metric_suffix}.csv"

        summary_df.to_csv(summary_path, index=False)
        scenario_df.to_csv(scenario_path, index=False)

        print(f"\nData saved for {metric_label}:")
        print(f" - {summary_path}")
        print(f" - {scenario_path}")

    # Backward-compatible, un-suffixed files always carry the F1 data.
    if "F1" in results:
        f1_summary, f1_scenario = results["F1"]
        legacy_summary_path = OUTPUT_DIR / "majority_vote_data.csv"
        legacy_scenario_path = OUTPUT_DIR / "majority_vote_scenarios.csv"

        # Older consumers expect no "metric" column; ignore it if already absent.
        f1_summary.drop(columns=["metric"], errors="ignore").to_csv(
            legacy_summary_path, index=False
        )
        f1_scenario.drop(columns=["metric"], errors="ignore").to_csv(
            legacy_scenario_path, index=False
        )

        print("\nBackward-compatible files (F1) saved to:")
        print(f" - {legacy_summary_path}")
        print(f" - {legacy_scenario_path}")
def print_summary(results: dict[str, tuple[pd.DataFrame, pd.DataFrame]]) -> None:
    """
    Print summary table for each metric.

    One table is printed per metric label, with models sorted by descending
    ``majority_at_k`` and every rate shown as a percentage. A metric whose
    summary frame is empty gets a placeholder line instead of a table.

    Args:
        results: Mapping of metric label -> (summary_df, scenario_df); only
            the summary frame is used.
    """
    rate_columns = (
        "majority_at_k", "pass_at_k", "all_at_k",
        "consistent_correct", "consistent_wrong", "inconsistent",
    )

    for metric_label, (summary_df, _) in results.items():
        print("\n" + "="*80)
        print(f"Majority Vote Summary ({metric_label}, threshold={SUCCESS_THRESHOLD})")
        print("="*80)

        # An empty frame may have no columns, so sorting by name would raise.
        if summary_df.empty:
            print("\n(no models to report)")
            continue

        df = summary_df.sort_values("majority_at_k", ascending=False)
        print(f"\n{'Model':<20} {'Maj@k':>8} {'Pass@k':>8} {'All@k':>8} {'Cons✓':>8} {'Cons✗':>8} {'Incons':>8}")
        print("-" * 80)
        for _, row in df.iterrows():
            cells = " ".join(f"{row[col]*100:>7.1f}%" for col in rate_columns)
            print(f"{row['model']:<20} {cells}")
def plot_majority_vs_performance(df: pd.DataFrame) -> None:
    """
    Figure: Majority@k vs Performance scatter plot.

    Draws one marker per row of ``df`` at (overall_mean, majority_at_k * 100),
    styled through get_model_style, on top of a green gradient and an arrow
    pointing toward the top-right "ideal" corner. The figure is written to
    OUTPUT_DIR as ``fig_majority_vs_performance.pdf`` and ``.png`` and then
    shown.

    Args:
        df: Summary frame with ``model``, ``overall_mean`` and
            ``majority_at_k`` columns.
    """
    # rcParams.update needs the mapping itself; wrapping it in braces built a
    # set literal, which fails. Assumes PLOT_PARAMETERS is a dict of rcParams
    # settings -- TODO confirm in analysis_src.model_styles.
    plt.rcParams.update(PLOT_PARAMETERS)
    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, DOUBLE_COLUMN_WIDTH))

    # Axis limits
    x_min, x_max = 0, 1.0
    y_min, y_max = 0, 100

    # Gradient shading toward top-right (ideal): five overlapping rectangles,
    # each starting further up/right and slightly more opaque than the last.
    for i in range(5):
        alpha = 0.02 + i * 0.02
        x_start = 0.1 + i * 0.15
        y_start = 10 + i * 15
        rect = plt.Rectangle((x_start, y_start), x_max - x_start, y_max - y_start,
                             color='#2ecc71', alpha=alpha, zorder=0)
        ax.add_patch(rect)

    # Arrow pointing to ideal
    ax.annotate('', xy=(0.85, 85), xytext=(0.55, 55),
                arrowprops=dict(arrowstyle='->', color='#27ae60', alpha=0.7, lw=1.5),
                zorder=2)
    ax.text(0.58, 58, 'better', fontsize=MIN_FONT_SIZE, style='italic',
            color='#27ae60', alpha=0.8, rotation=45, zorder=2)

    # Mark ideal corner
    ax.scatter([1.0], [100], marker='*', s=100, c='#27ae60', alpha=0.5, zorder=2)
    ax.text(0.92, 95, 'ideal', fontsize=MIN_FONT_SIZE - 1, color='#27ae60',
            alpha=0.7, ha='right')

    for _, row in df.iterrows():
        model = row["model"]
        x_pos = row["overall_mean"]
        y_pos = row["majority_at_k"] * 100

        # Marker with model-specific color and shape.
        style = get_model_style(model)
        ax.scatter(x_pos, y_pos,
                   c=style['color'], marker=style['marker'],
                   s=80, edgecolors='black', linewidth=0.5, zorder=10)

        # Label goes right of the marker by default; it flips to the left near
        # the right edge and drops below near the top so it stays in the axes.
        dx, dy = 0.03, 2
        ha, va = "left", "center"
        if x_pos > 0.7:
            dx = -0.03
            ha = "right"
        if y_pos > 80:
            dy = -3
            va = "top"
        ax.text(x_pos + dx, y_pos + dy, model, fontsize=MIN_FONT_SIZE - 1,
                ha=ha, va=va, zorder=11)

    ax.set_xlabel("Performance (RC Entity F1)")
    ax.set_ylabel("Majority@k (%)")
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    plt.tight_layout()

    # Write the files before showing, so a blocking show() cannot delay or
    # interfere with saving, and make sure the target directory exists.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUTPUT_DIR / "fig_majority_vs_performance.pdf")
    fig.savefig(OUTPUT_DIR / "fig_majority_vs_performance.png")
    plt.show()
    plt.close(fig)
    print("Saved: fig_majority_vs_performance.pdf/png")
def plot_pass_vs_majority(df: pd.DataFrame, metric: str = "F1", suffix: str = "") -> None:
    """
    Figure: Scatter plot of Pass@k (x-axis) vs Majority@k (y-axis).

    Both axes run from 0 to 100 percent with a dashed diagonal for reference.
    Labels are placed by hand-tuned, per-model rules matched on substrings of
    the model name; the first matching rule wins and unmatched models get a
    label just right of the marker. The figure is written to OUTPUT_DIR as
    ``fig_pass_vs_majority<suffix>.png`` and then shown.

    Args:
        df: DataFrame with model, pass_at_k and majority_at_k columns
        metric: Name of metric for labeling (F1, Precision, Recall)
        suffix: Suffix for output filename (e.g., "_precision")
    """
    fig, ax = plt.subplots(figsize=(SINGLE_COLUMN_WIDTH, SINGLE_COLUMN_WIDTH))
    ax_min, ax_max = 0, 100

    # Diagonal line
    ax.plot([ax_min, ax_max], [ax_min, ax_max], color='#444444', linestyle='--',
            linewidth=1.5, alpha=0.6, zorder=1)

    # Consistency region labels
    ax.text(8, 92, 'more\nconsistent', fontsize=MIN_FONT_SIZE + 1, color='#333333',
            ha='left', va='top', style='italic')
    ax.text(92, 8, 'less\nconsistent', fontsize=MIN_FONT_SIZE + 1, color='#333333',
            ha='right', va='bottom', style='italic')

    # Plot the markers and remember each model's position for labeling.
    points = {}
    for _, row in df.iterrows():
        style = get_model_style(row["model"])
        x = row["pass_at_k"] * 100
        y = row["majority_at_k"] * 100
        ax.scatter(x, y, c=style['color'], marker=style['marker'],
                   s=50, edgecolors='black', linewidth=0.5, zorder=10)
        points[row["model"]] = {'x': x, 'y': y}

    label_kw = dict(fontsize=MIN_FONT_SIZE, zorder=11)
    leader_kw = dict(color='#444444', linewidth=1.2, alpha=0.8, zorder=5)

    # Place labels with manual positioning
    for model, p in points.items():
        x, y = p['x'], p['y']
        if 'GPT-OSS-120B' in model:
            # Right of the marker, slightly below.
            ax.text(x + 3, y - 2, model, ha='left', va='center', **label_kw)
        elif 'Gemini 2.5 Pro' in model:
            # Right of the marker, slightly above.
            ax.text(x + 3, y + 2, model, ha='left', va='bottom', **label_kw)
        elif 'o4-mini' in model:
            # Short horizontal leader line to the right, label at its end.
            label_x = x + 12
            ax.plot([x, label_x], [y, y], **leader_kw)
            ax.text(label_x + 1, y, model, ha='left', va='center', **label_kw)
        elif 'GPT-5.1' in model:
            # L-shaped leader: left to x=5, then vertically to y=25, where the
            # label sits.
            label_x = 5
            label_y = 25
            start_x = x - 2  # start just left of the marker
            ax.plot([start_x, label_x], [y, y], **leader_kw)
            ax.plot([label_x, label_x], [y, label_y], **leader_kw)
            ax.text(label_x, label_y + 1, model, ha='left', va='bottom', **label_kw)
        elif 'Claude Opus' in model:
            # Right of the marker, with a wider gap than the default.
            ax.text(x + 5, y, model, ha='left', va='center', **label_kw)
        elif 'Gemini 3 Pro' in model:
            # Below the marker, shifted left.
            ax.text(x - 18, y - 6, model, ha='left', va='top', **label_kw)
        elif 'Gemini 3 Flash' in model:
            # Right-aligned at a fixed x of 105 (past the right axis limit),
            # slightly above the marker.
            ax.text(105, y + 4, model, ha='right', va='bottom', **label_kw)
        else:
            # Default (this also covers 'Kimi K2'): label to the right.
            ax.text(x + 3, y, model, ha='left', va='center', **label_kw)

    ax.set_xlabel(f"Pass@k (%) [{metric}]")
    ax.set_ylabel(f"Majority@k (%) [{metric}]")
    ax.set_xlim(ax_min, ax_max)
    ax.set_ylim(ax_min, ax_max)
    ax.set_aspect('equal')
    ax.set_title("Consistency: Pass@k vs. Majority@k")
    plt.tight_layout()

    # Write the file before showing, so a blocking show() cannot delay or
    # interfere with saving, and make sure the target directory exists.
    filename = f"fig_pass_vs_majority{suffix}"
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUTPUT_DIR / f"{filename}.png")
    plt.show()
    plt.close(fig)
    print(f"Saved: {filename}.png")
def main():
    """Print the run configuration, extract all metrics, then save and summarise them."""
    banner = (
        "Extracting majority vote data for 'react with code' agents...",
        f"Reading from directories: {LEADERBOARD_DIR}",
        f"Output directory: {OUTPUT_DIR}",
        f"Success threshold: {SUCCESS_THRESHOLD}",
        f"Minimum runs per scenario: {MIN_RUNS_PER_SCENARIO}",
        f"Metrics: {[label for _, label in METRICS]}",
    )
    for line in banner:
        print(line)

    results = extract_all_data()
    if results:
        save_data(results)
        print_summary(results)
    else:
        print("No data extracted!")


if __name__ == "__main__":
    main()