Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| # TODO: Needs to be integrated into the itbench_leaderboard module | |
| # This script calculates ICC (Intraclass Correlation Coefficient) and other | |
| # consistency metrics for agent evaluation results. | |
| Consistency Analysis for Agent Leaderboard Results. | |
| Computes ICC (Intraclass Correlation Coefficient) to measure the reliability | |
| and consistency of agent responses across multiple trials per scenario. | |
| ICC answers: "Of all the variance observed, how much is due to actual scenario | |
| difficulty (signal) vs. random model variability (noise/flakiness)?" | |
| Interpretation: | |
| ICC > 0.9: Excellent consistency | |
| ICC 0.75-0.9: Good consistency | |
| ICC 0.5-0.75: Moderate consistency | |
| ICC < 0.5: Poor consistency (high flakiness) | |
| Usage: | |
| python -m itbench_leaderboard.consistency --results-dir leaderboard_results/results | |
| python -m itbench_leaderboard.consistency --results-file path/to/results.json | |
| """ | |
| import argparse | |
| import json | |
| import sys | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
@dataclass
class ConsistencyMetrics:
    """Container for all consistency metrics of one set of scenario trials.

    Instances are created by keyword (see ``compute_icc``); the two trailing
    collection fields default to fresh empty containers per instance.
    """

    # Core ICC metrics
    icc: float  # Intraclass correlation; NaN when it cannot be computed
    flakiness_ratio: float  # 1 - ICC

    # ANOVA components
    msb: float  # Mean Square Between (between-scenario variance)
    msw: float  # Mean Square Within (within-scenario variance)

    # Within-scenario consistency
    mean_within_std: float  # Mean of the per-scenario standard deviations
    mean_agreement_rate: float  # Mean of the per-scenario agreement rates
    repeatability_coefficient: float  # 1.96 * sqrt(2 * MSW)

    # Summary stats
    n_scenarios: int
    n_trials: int  # Trials per scenario used in the computation
    n_flaky_scenarios: int
    # (scenario_id, within-scenario std) pairs for scenarios flagged as flaky
    flaky_scenarios: list = field(default_factory=list)

    # Per-scenario breakdown, keyed by scenario id
    scenario_details: dict = field(default_factory=dict)

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the metrics."""
        return (
            f"ICC: {self.icc:.4f} (flakiness: {self.flakiness_ratio:.4f})\n"
            f"MSB (between): {self.msb:.4f}, MSW (within): {self.msw:.4f}\n"
            f"Mean within-std: {self.mean_within_std:.4f}\n"
            f"Agreement rate: {self.mean_agreement_rate:.4f}\n"
            f"Repeatability coef: {self.repeatability_coefficient:.4f}\n"
            f"Flaky scenarios: {self.n_flaky_scenarios}/{self.n_scenarios}"
        )
def load_results(filepath: Path) -> dict:
    """Load a results JSON file and return its parsed content.

    Args:
        filepath: Path to the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale-derived default encoding when decoding the file.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)
def extract_trial_scores(
    results: dict,
    metric: str = "root_cause_entity_f1",
) -> dict[str, list[float]]:
    """
    Extract per-trial scores for a given metric from results.

    A trial whose metric value is missing or null is counted as 0.0. Null
    containers (``scenarios``, ``runs``, a run entry, or ``scores`` set to
    JSON ``null``) are treated like empty ones instead of raising.

    Args:
        results: Loaded JSON results
        metric: The metric name to extract (default: root_cause_entity_f1)

    Returns:
        Dict mapping scenario_id -> list of trial scores. Scenarios without
        any runs are omitted.
    """
    scenario_trials: dict[str, list[float]] = {}

    # `or {}` / `or []` also covers keys that are present but explicitly null.
    scenarios = results.get("scenarios") or {}
    for scenario_id, scenario_data in scenarios.items():
        runs = (scenario_data or {}).get("runs") or []

        trial_scores = []
        for run in runs:
            scores = (run or {}).get("scores") or {}
            score = scores.get(metric)
            # A missing/null score counts as a zero for that trial.
            trial_scores.append(0.0 if score is None else float(score))

        if trial_scores:
            scenario_trials[scenario_id] = trial_scores

    return scenario_trials
def calculate_agreement_rate(trials: list[float], tolerance: float = 0.1) -> float:
    """
    Measure how often two trials of the same scenario agree.

    Every unordered pair of trials is compared; a pair agrees when the
    absolute difference of its scores does not exceed ``tolerance``.

    Args:
        trials: List of trial scores
        tolerance: Maximum difference to consider as "agreement"

    Returns:
        Fraction of trial pairs that agree (0-1); 1.0 when there are fewer
        than two trials and therefore no pairs to compare.
    """
    from itertools import combinations

    if len(trials) < 2:
        return 1.0

    pair_count = 0
    agreeing = 0
    for first, second in combinations(trials, 2):
        pair_count += 1
        if abs(first - second) <= tolerance:
            agreeing += 1

    return agreeing / pair_count
def compute_icc(
    scenario_trials: dict[str, list[float]],
    flaky_threshold: float = 0.3,
) -> ConsistencyMetrics:
    """
    Compute ICC(1,1) - one-way random effects model - and related metrics.

    The ICC formula:
        ICC = (MSB - MSW) / (MSB + (k-1) * MSW)
    Where:
        MSB = k * Var(scenario_means)         [between-scenario variance]
        MSW = Mean(Var(trials per scenario))  [within-scenario variance]
        k   = number of trials per scenario

    Both variances are sample variances (ddof=1), so the ICC is only defined
    with at least two scenarios and at least two trials per scenario. In any
    other case, and when the denominator is not positive (e.g. all scores
    identical), ``icc`` and ``flakiness_ratio`` are NaN rather than a
    misleading number. Negative ICC estimates are clipped to 0.

    If scenarios have differing numbers of trials, each scenario is truncated
    to the shortest trial count for the means, variances and standard
    deviations. Agreement rates and the ``trials`` entry in
    ``scenario_details`` still use every recorded trial.

    Args:
        scenario_trials: Dict mapping scenario_id -> list of trial scores
        flaky_threshold: A scenario is flagged flaky when its within-scenario
            standard deviation is strictly greater than this value.

    Returns:
        ConsistencyMetrics with ICC and related metrics. When there are no
        scenarios, or some scenario has no trials, an empty result with
        NaN ICC and zero counts is returned.
    """
    scenarios = list(scenario_trials)
    n_scenarios = len(scenarios)

    # The one-way ANOVA below needs a balanced design: use the shortest
    # trial count for every scenario (truncation only, no padding).
    k = min((len(scenario_trials[s]) for s in scenarios), default=0)

    if n_scenarios == 0 or k == 0:
        return ConsistencyMetrics(
            icc=float('nan'),
            flakiness_ratio=float('nan'),
            msb=0.0,
            msw=0.0,
            mean_within_std=0.0,
            mean_agreement_rate=1.0,
            repeatability_coefficient=0.0,
            n_scenarios=0,
            n_trials=0,
            n_flaky_scenarios=0,
        )

    scores = np.array([scenario_trials[s][:k] for s in scenarios], dtype=float)

    # Calculate scenario means
    scenario_means = np.mean(scores, axis=1)

    # Between-scenario variance: MSB = k * Var(scenario means)
    msb = float(k * np.var(scenario_means, ddof=1)) if n_scenarios > 1 else 0.0

    # Within-scenario variance / standard deviation per scenario. With a
    # single trial the sample variance is undefined, so zeros are reported.
    if k > 1:
        within_vars = np.var(scores, axis=1, ddof=1)
        within_stds = np.std(scores, axis=1, ddof=1)
    else:
        within_vars = np.zeros(n_scenarios)
        within_stds = np.zeros(n_scenarios)

    # MSW = average of within-scenario variances
    msw = float(np.mean(within_vars))
    mean_within_std = float(np.mean(within_stds))

    # ICC(1,1). Without at least 2 scenarios and 2 trials one of the two
    # mean squares has zero degrees of freedom, so the ratio is meaningless
    # (previously a single trial per scenario yielded a "perfect" 1.0).
    icc = float('nan')
    if n_scenarios >= 2 and k >= 2:
        denominator = msb + (k - 1) * msw
        if denominator > 0:
            # ICC can be negative, clip to 0
            icc = max(0.0, (msb - msw) / denominator)
    flakiness_ratio = float('nan') if np.isnan(icc) else 1 - icc

    # Agreement rates (computed over all recorded trials of each scenario)
    agreement_rates = [
        calculate_agreement_rate(scenario_trials[s])
        for s in scenarios
    ]
    mean_agreement_rate = float(np.mean(agreement_rates))

    # Repeatability coefficient (95% of repeat differences < RC)
    rc = 1.96 * np.sqrt(2 * msw) if msw > 0 else 0.0

    # Identify flaky scenarios (high within-scenario spread). Convert to
    # built-in bool/float so the details stay JSON-serialisable.
    flaky_flags = [bool(std > flaky_threshold) for std in within_stds]
    flaky_scenarios = [
        (s, float(std))
        for s, std, flagged in zip(scenarios, within_stds, flaky_flags)
        if flagged
    ]

    # Per-scenario details
    scenario_details = {
        s: {
            "trials": scenario_trials[s],
            "mean": float(scenario_means[i]),
            "std": float(within_stds[i]),
            "agreement_rate": agreement_rates[i],
            "is_flaky": flaky_flags[i],
        }
        for i, s in enumerate(scenarios)
    }

    return ConsistencyMetrics(
        icc=float(icc),
        flakiness_ratio=float(flakiness_ratio),
        msb=msb,
        msw=msw,
        mean_within_std=mean_within_std,
        mean_agreement_rate=mean_agreement_rate,
        repeatability_coefficient=float(rc),
        n_scenarios=n_scenarios,
        n_trials=k,
        n_flaky_scenarios=len(flaky_scenarios),
        flaky_scenarios=flaky_scenarios,
        scenario_details=scenario_details,
    )
def analyze_results_file(
    filepath: Path,
    metrics: list[str] | None = None,
) -> dict[str, ConsistencyMetrics]:
    """
    Run the consistency analysis on one results file, once per metric.

    Args:
        filepath: Path to the results JSON file
        metrics: Metric names to analyze. When omitted, a built-in list of
            three common metrics is used.

    Returns:
        Dict mapping metric_name -> ConsistencyMetrics. Metrics for which no
        trial scores could be extracted are left out.
    """
    selected = metrics
    if selected is None:
        selected = [
            "root_cause_entity_f1",
            "root_cause_proximity_with_fp_f1",
            "propagation_chain",
        ]

    payload = load_results(filepath)

    per_metric = {}
    for name in selected:
        trials_by_scenario = extract_trial_scores(payload, name)
        if not trials_by_scenario:
            # Nothing recorded for this metric; skip it entirely.
            continue
        per_metric[name] = compute_icc(trials_by_scenario)

    return per_metric
def compare_models(
    results_dir: Path,
    model_patterns: list[str],
    metric: str = "root_cause_entity_f1",
) -> dict[str, ConsistencyMetrics]:
    """
    Compare ICC across multiple models.

    For each pattern the files matching ``*{pattern}*.json`` in
    ``results_dir`` are collected and the first one in sorted path order is
    analyzed, so the choice is reproducible across runs and platforms.
    A warning is written to stderr when a pattern matches no file or more
    than one file.

    Args:
        results_dir: Directory containing results JSON files
        model_patterns: List of model name patterns to match
        metric: The metric to analyze

    Returns:
        Dict mapping model_name -> ConsistencyMetrics, where model_name is
        the file's "agent_name" value or, if absent, the file stem. Files
        without any trial scores for ``metric`` are omitted.
    """
    comparison = {}

    for pattern in model_patterns:
        # Path.glob yields entries in arbitrary order; sort so that
        # "first match" always refers to the same file.
        matches = sorted(results_dir.glob(f"*{pattern}*.json"))
        if not matches:
            print(f"Warning: No file found for pattern '{pattern}'", file=sys.stderr)
            continue

        filepath = matches[0]
        if len(matches) > 1:
            print(
                f"Warning: pattern '{pattern}' matched {len(matches)} files; "
                f"using '{filepath.name}'",
                file=sys.stderr,
            )
        print(f"Analyzing: {filepath.name}")

        results = load_results(filepath)
        scenario_trials = extract_trial_scores(results, metric)
        if scenario_trials:
            model_name = results.get("agent_name", filepath.stem)
            comparison[model_name] = compute_icc(scenario_trials)

    return comparison
def print_comparison_table(
    comparison: dict[str, ConsistencyMetrics],
    metric: str,
) -> None:
    """Print one table row per model, best ICC first, plus a legend.

    Models whose ICC is NaN are sorted last and shown as "N/A".

    Args:
        comparison: Dict mapping model name -> ConsistencyMetrics
        metric: Name of the metric the comparison was computed for
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"ICC Comparison for metric: {metric}")
    print(f"{banner}\n")

    # Header
    print(f"{'Model':<55} {'ICC':>8} {'Flaky%':>8} {'Std':>8} {'Agree%':>8}")
    print("-" * 91)

    def _sort_key(entry):
        # NaN cannot be ordered; rank it below every real ICC value.
        value = entry[1].icc
        return -1 if np.isnan(value) else value

    for model, metrics in sorted(comparison.items(), key=_sort_key, reverse=True):
        # Truncate model name if too long
        if len(model) > 55:
            display_name = model[:52] + "..."
        else:
            display_name = model

        if np.isnan(metrics.icc):
            icc_str = "N/A"
        else:
            icc_str = f"{metrics.icc:.4f}"

        if np.isnan(metrics.flakiness_ratio):
            flaky_pct = "N/A"
        else:
            flaky_pct = f"{metrics.flakiness_ratio*100:.1f}%"

        cells = [
            f"{display_name:<55}",
            f"{icc_str:>8}",
            f"{flaky_pct:>8}",
            f"{metrics.mean_within_std:>8.4f}",
            f"{metrics.mean_agreement_rate*100:>7.1f}%",
        ]
        print(" ".join(cells))

    print("\nInterpretation:")
    for legend_line in (
        " ICC > 0.9: Excellent consistency",
        " ICC 0.75-0.9: Good consistency",
        " ICC 0.5-0.75: Moderate consistency",
        " ICC < 0.5: Poor consistency (high flakiness)",
    ):
        print(legend_line)
def print_detailed_report(
    model_name: str,
    metrics_analysis: dict[str, ConsistencyMetrics],
) -> None:
    """Print the full metrics summary of one model, metric by metric.

    For each metric the summary is followed by up to ten flaky scenarios,
    ordered from highest to lowest standard deviation, with their trials.

    Args:
        model_name: Name shown in the report heading
        metrics_analysis: Dict mapping metric name -> ConsistencyMetrics
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"Detailed Consistency Report: {model_name}")
    print(f"{rule}\n")

    def _descending_std(entry):
        # entry is a (scenario_id, std) pair
        return -entry[1]

    for metric_name, cm in metrics_analysis.items():
        print(f"\n--- {metric_name} ---")
        print(cm)

        if not cm.flaky_scenarios:
            continue

        print("\nFlaky scenarios (std > 0.3):")
        worst_first = sorted(cm.flaky_scenarios, key=_descending_std)
        for scenario, std in worst_first[:10]:
            details = cm.scenario_details.get(scenario, {})
            trials = details.get("trials", [])
            print(f" {scenario}: std={std:.3f}, trials={trials}")
def _metrics_to_jsonable(cm: "ConsistencyMetrics") -> dict:
    """Summarise *cm* as a JSON-safe dict, mapping NaN ICC values to None."""

    def _nan_to_none(value):
        # json.dump would otherwise emit the non-standard token `NaN`.
        return None if np.isnan(value) else value

    return {
        "icc": _nan_to_none(cm.icc),
        "flakiness_ratio": _nan_to_none(cm.flakiness_ratio),
        "mean_within_std": cm.mean_within_std,
        "mean_agreement_rate": cm.mean_agreement_rate,
        "n_flaky_scenarios": cm.n_flaky_scenarios,
        "n_scenarios": cm.n_scenarios,
    }


def main():
    """Command-line entry point.

    With ``--results-file`` a single file is analyzed and a detailed report
    is printed. Otherwise every ``--models`` pattern is looked up in
    ``--results-dir`` and a comparison table is printed per metric (plus
    detailed reports with ``--detailed``). With ``--output-json`` a summary
    of all computed metrics is written to that file; undefined (NaN) ICC
    values are stored as null in both modes.
    """
    parser = argparse.ArgumentParser(
        description="Calculate ICC and consistency metrics for leaderboard results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--results-dir",
        type=Path,
        default=Path("leaderboard_results/results"),
        help="Directory containing results JSON files",
    )
    parser.add_argument(
        "--results-file",
        type=Path,
        help="Analyze a single results file",
    )
    parser.add_argument(
        "--models",
        nargs="+",
        default=[
            "react with code_Azure_o4-mini",
            "react with code_Azure_gpt-5.1-2025-11-13",
            "react with code_gcp_gemini-3-pro-preview",
            "react with code_GCP_gemini-2.5-pro",
        ],
        help="Model name patterns to compare",
    )
    parser.add_argument(
        "--metric",
        type=str,
        default="root_cause_entity_f1",
        help="Metric to analyze (default: root_cause_entity_f1)",
    )
    parser.add_argument(
        "--all-metrics",
        action="store_true",
        help="Analyze all common metrics",
    )
    parser.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed per-scenario breakdown",
    )
    parser.add_argument(
        "--output-json",
        type=Path,
        help="Save results to JSON file",
    )
    args = parser.parse_args()

    # Determine metrics to analyze
    if args.all_metrics:
        metrics = [
            "root_cause_entity_f1",
            "root_cause_entity_precision",
            "root_cause_entity_recall",
            "root_cause_proximity_with_fp_f1",
            "propagation_chain",
            "fault_localization_component_identification",
        ]
    else:
        metrics = [args.metric]

    results_to_save = {}

    if args.results_file:
        # Single file analysis
        print(f"Analyzing: {args.results_file}")
        analysis = analyze_results_file(args.results_file, metrics)
        results = load_results(args.results_file)
        model_name = results.get("agent_name", args.results_file.stem)
        print_detailed_report(model_name, analysis)
        results_to_save[model_name] = {
            m: _metrics_to_jsonable(cm) for m, cm in analysis.items()
        }
    else:
        # Multi-model comparison
        for metric in metrics:
            comparison = compare_models(args.results_dir, args.models, metric)
            print_comparison_table(comparison, metric)

            # Store results
            for model, cm in comparison.items():
                results_to_save.setdefault(model, {})[metric] = _metrics_to_jsonable(cm)

            if args.detailed:
                for model, cm in comparison.items():
                    print_detailed_report(model, {metric: cm})

    # Save to JSON if requested
    if args.output_json:
        with open(args.output_json, "w", encoding="utf-8") as f:
            json.dump(results_to_save, f, indent=2)
        print(f"\nResults saved to: {args.output_json}")


if __name__ == "__main__":
    main()