"""Generate individual performance graphs from self-play training results.

Produces separate PNG files for each metric in the output/ directory.
Called automatically after every self-play training run.

Output files:
    output/01_training_loss.png
    output/02_reward_analysis.png
    output/03_elo_progression.png
    output/04_win_rate.png
    output/05_detection_fp_rate.png
    output/06_difficulty_progression.png
    output/performance_matrix.csv
"""
| import json |
| import os |
| import sys |
| import numpy as np |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import pandas as pd |
| from pathlib import Path |
|
|
| |
# Directory two levels above this file; used as the default location of the
# results JSON and of the output/ folder.
ROOT_DIR = Path(__file__).resolve().parent.parent

# Put that directory first on the import path (presumably so the project-local
# ``server`` package imported inside compute_fixed_baseline_scores resolves
# when this file is run as a script -- confirm against the repo layout).
sys.path.insert(0, os.fspath(ROOT_DIR))
|
|
|
|
def compute_fixed_baseline_scores():
    """Score the heuristic policy on the fixed 'easy', 'medium' and 'hard' tasks.

    For every task a new ``FirewallEnvironment(seed=303)`` is created and handed
    to ``run_deterministic_grade`` together with ``heuristic_policy``; the
    ``'score'`` entry of the returned result is kept.

    Returns:
        Dict mapping each task name to its score, in the order
        easy, medium, hard.
    """
    # Project-local imports stay inside the function so that importing this
    # module does not require the ``server`` package to be importable.
    from server.firewall_environment import FirewallEnvironment
    from server.graders import run_deterministic_grade
    from server.baseline.heuristic_agent import heuristic_policy

    def _grade(task_name):
        # A fresh, identically seeded environment per task.
        fresh_env = FirewallEnvironment(seed=303)
        outcome = run_deterministic_grade(fresh_env, task_name, heuristic_policy)
        return outcome['score']

    return {task_name: _grade(task_name) for task_name in ('easy', 'medium', 'hard')}
|
|
|
|
def _save_figure(fig, out_dir, filename):
    """Apply a tight layout to *fig*, save it as ``out_dir / filename``, close it.

    The figure is written at 200 dpi with a tight bounding box. Closing happens
    in a ``finally`` clause so the figure is released even if layout or saving
    raises.

    Returns:
        *filename*, so the caller can record what was written.
    """
    try:
        fig.tight_layout()
        fig.savefig(out_dir / filename, dpi=200, bbox_inches='tight')
    finally:
        plt.close(fig)
    return filename


def generate_graphs(input_json: str = None, output_dir: str = None):
    """Generate all individual performance graph files.

    Loads the self-play results JSON, derives per-round metrics, writes
    ``performance_matrix.csv`` and six PNG charts into the output directory,
    then prints an early-vs-late summary table.

    Args:
        input_json: Path to self_play_results.json (default: project root)
        output_dir: Directory to save graphs (default: project root / output)

    Returns:
        List with the file names of the PNGs that were written, or ``None``
        when the input file does not exist or holds no rounds.

    Raises:
        KeyError: If a round entry lacks one of the keys read here ("round",
            "score", "elo", "elo_delta", "difficulty_elo", "passed", or
            "stats" with "det" / "fp" / "eff").
    """
    input_path = Path(input_json) if input_json else ROOT_DIR / "self_play_results.json"
    out_dir = Path(output_dir) if output_dir else ROOT_DIR / "output"

    if not input_path.exists():
        print(f" [GRAPHS] Error: {input_path} not found")
        return None

    out_dir.mkdir(parents=True, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rounds_data = data.get("rounds", [])
    if not rounds_data:
        print(" [GRAPHS] No rounds data found.")
        return None

    # Only the 'medium' baseline is drawn below (reward analysis chart).
    print(" [GRAPHS] Computing fixed baselines...")
    baselines = compute_fixed_baseline_scores()

    # ---- Raw per-round series -------------------------------------------
    rn = [r["round"] for r in rounds_data]
    scores = [r["score"] for r in rounds_data]
    elos = [r["elo"] for r in rounds_data]
    elo_deltas = [r["elo_delta"] for r in rounds_data]
    diff_elos = [r["difficulty_elo"] for r in rounds_data]
    det_rates = [r["stats"]["det"] for r in rounds_data]
    fp_rates = [r["stats"]["fp"] for r in rounds_data]
    eff_rates = [r["stats"]["eff"] for r in rounds_data]

    # ---- Derived series -------------------------------------------------
    abs_loss = [1.0 - s for s in scores]
    # Difficulty Elo mapped linearly from [800, 1600] onto [0, 1] and clipped.
    diff_fracs = [np.clip((de - 800) / 800, 0, 1) for de in diff_elos]
    # Score divided by (1 - 0.3 * difficulty fraction), with the divisor
    # floored at 0.3 and the result capped at 1.0.
    norm_rewards = [
        min(1.0, s / max(0.3, 1.0 - 0.3 * frac))
        for s, frac in zip(scores, diff_fracs)
    ]
    elo_gaps = [e - de for e, de in zip(elos, diff_elos)]

    # Rolling statistics over the last `w` rounds. min_periods=1 lets the first
    # rounds average over however many values exist so far.
    w = 5
    # `or {}` keeps an explicit `"config": null` in the JSON from raising.
    pass_thresh = (data.get("config") or {}).get("pass_threshold", 0.55)
    wins = [1 if r["passed"] else 0 for r in rounds_data]
    win_roll = pd.Series(wins).rolling(window=w, min_periods=1).mean().tolist()
    det_roll = pd.Series(det_rates).rolling(window=w, min_periods=1).mean().tolist()
    fp_roll = pd.Series(fp_rates).rolling(window=w, min_periods=1).mean().tolist()
    loss_roll = pd.Series(abs_loss).rolling(window=w, min_periods=1).mean().tolist()
    score_roll_mean = pd.Series(scores).rolling(window=w, min_periods=1).mean().tolist()
    # A one-sample window has no standard deviation (NaN); plot it as 0.
    score_roll_std = pd.Series(scores).rolling(window=w, min_periods=1).std().fillna(0).tolist()

    # ---- CSV export -----------------------------------------------------
    matrix = pd.DataFrame({
        "Round": rn, "Raw_Score": scores, "Abs_Training_Loss": abs_loss,
        "Diff_Normalized_Reward": norm_rewards,
        "Detection_Rate": det_rates, "FP_Rate": fp_rates, "Efficiency": eff_rates,
        "Agent_Elo": elos, "Elo_Delta": elo_deltas, "Difficulty_Elo": diff_elos,
        "Elo_Gap": elo_gaps, "Win_Rate": win_roll, "Difficulty_Frac": diff_fracs,
    })
    csv_path = out_dir / "performance_matrix.csv"
    matrix.to_csv(csv_path, index=False, float_format="%.6f")

    # ---- Shared plot styling (updates matplotlib's global rcParams) ------
    plt.rcParams.update({
        'figure.facecolor': '#FAFAFA',
        'axes.facecolor': '#FFFFFF',
        'axes.grid': True,
        'grid.alpha': 0.3,
        'font.size': 11,
    })

    saved = []

    # ---- 01: training loss ----------------------------------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rn, abs_loss, color='#E74C3C', linewidth=2, marker='o', markersize=4,
            alpha=0.6, label='Abs. Loss (1 - score)')
    ax.plot(rn, loss_roll, color='#C0392B', linewidth=2.5, linestyle='--',
            label=f'Rolling Mean (w={w})')
    ax.set_xlabel('Training Round', fontweight='bold')
    ax.set_ylabel('Training Loss', fontweight='bold')
    ax.set_title('Training Loss (Absolute Performance Gap)\n'
                 'Loss increases because curriculum difficulty rises, not because agent worsens',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=10)
    # Pin the axis to [0, 1.3 * max] only when that is a real interval; with
    # all-zero (or negative) losses the limits would be identical or inverted,
    # so autoscaling is left in charge instead.
    loss_top = max(abs_loss) * 1.3
    if loss_top > 0:
        ax.set_ylim(0, loss_top)
    saved.append(_save_figure(fig, out_dir, "01_training_loss.png"))

    # ---- 02: reward analysis --------------------------------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rn, scores, color='#2ECC71', linewidth=1.5, alpha=0.4, marker='.',
            label='Raw Score (vs adaptive opponent)')
    ax.plot(rn, norm_rewards, color='#27AE60', linewidth=2.5, marker='o', markersize=4,
            label='Difficulty-Normalized Reward')
    roll_mean = np.array(score_roll_mean)
    roll_std = np.array(score_roll_std)
    ax.fill_between(rn, roll_mean - roll_std, roll_mean + roll_std,
                    color='#2ECC71', alpha=0.15, label=f'Score Std Dev (w={w})')
    ax.axhline(y=baselines['medium'], color='gray', linestyle=':', linewidth=1.5,
               label=f'Fixed Medium Baseline ({baselines["medium"]:.3f})')
    ax.axhline(y=pass_thresh, color='red', linestyle=':', alpha=0.5,
               label=f'Pass Threshold ({pass_thresh})')
    ax.set_xlabel('Training Round', fontweight='bold')
    ax.set_ylabel('Reward / Score', fontweight='bold')
    ax.set_title('Reward Analysis: Raw vs Difficulty-Normalized\n'
                 'Normalized reward UP = agent genuinely improving despite harder tasks',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=9, loc='lower left')
    saved.append(_save_figure(fig, out_dir, "02_reward_analysis.png"))

    # ---- 03: Elo progression --------------------------------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rn, elos, color='#3498DB', linewidth=2.5, marker='o', markersize=4,
            label='Agent Elo')
    ax.plot(rn, diff_elos, color='#E67E22', linewidth=2, marker='s', markersize=3,
            linestyle='--', label='Opponent (Difficulty) Elo')
    # Shade the gap between the curves: red where the agent trails, green otherwise.
    ax.fill_between(rn, elos, diff_elos,
                    where=[e < de for e, de in zip(elos, diff_elos)],
                    color='#E74C3C', alpha=0.1, label='Agent Behind')
    ax.fill_between(rn, elos, diff_elos,
                    where=[e >= de for e, de in zip(elos, diff_elos)],
                    color='#27AE60', alpha=0.1, label='Agent Ahead')
    ax.set_xlabel('Training Round', fontweight='bold')
    ax.set_ylabel('Elo Rating', fontweight='bold')
    ax.set_title('Elo Progression: Agent vs Adaptive Opponent\n'
                 f'Method: Logistic K=32 | Gap: {elo_gaps[0]:+.0f} -> {elo_gaps[-1]:+.0f}',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    saved.append(_save_figure(fig, out_dir, "03_elo_progression.png"))

    # ---- 04: win rate and Elo delta (twin y-axes) -----------------------
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax2 = ax1.twinx()
    ax1.bar(rn, elo_deltas, color='#3498DB', alpha=0.35, label='Elo Delta per Round')
    ax2.plot(rn, win_roll, color='#1ABC9C', linewidth=2.5, marker='o',
             markersize=4, label=f'Win Rate (rolling w={w})')
    ax2.axhline(y=1.0, color='gray', linestyle=':', alpha=0.5)
    ax1.set_xlabel('Training Round', fontweight='bold')
    ax1.set_ylabel('Elo Delta', fontweight='bold', color='#3498DB')
    ax2.set_ylabel('Win Rate', fontweight='bold', color='#1ABC9C')
    ax2.set_ylim(0, 1.15)
    total_pass = sum(wins)
    ax1.set_title('Win Rate & Elo Gain per Round\n'
                  f'Overall: {total_pass}/{len(wins)} passed ({100*total_pass/len(wins):.0f}%)',
                  fontsize=12, fontweight='bold')
    # Merge the legend entries of both axes into one box.
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax1.legend(h1 + h2, l1 + l2, fontsize=9, loc='lower right')
    saved.append(_save_figure(fig, out_dir, "04_win_rate.png"))

    # ---- 05: detection / false-positive / efficiency --------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rn, det_roll, color='#9B59B6', linewidth=2.5, marker='o', markersize=4,
            label=f'Detection Rate (rolling w={w})')
    ax.plot(rn, fp_roll, color='#E74C3C', linewidth=2, marker='s', markersize=3,
            label=f'False Positive Rate (rolling w={w})')
    ax.plot(rn, eff_rates, color='#F39C12', linewidth=1.5, alpha=0.5, marker='.',
            label='Efficiency')
    ax.set_xlabel('Training Round', fontweight='bold')
    ax.set_ylabel('Rate', fontweight='bold')
    ax.set_title('Detection, False Positive & Efficiency over Training\n'
                 'Detection stays high while FP stays near zero',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.set_ylim(-0.02, 1.05)
    saved.append(_save_figure(fig, out_dir, "05_detection_fp_rate.png"))

    # ---- 06: curriculum difficulty --------------------------------------
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rn, diff_fracs, color='#E67E22', linewidth=2.5, marker='s', markersize=4,
            label='Difficulty Fraction')
    ax.fill_between(rn, 0, diff_fracs, color='#E67E22', alpha=0.15)
    ax.axhline(y=0.25, color='green', linestyle=':', alpha=0.5, label='Easy zone')
    ax.axhline(y=0.5, color='orange', linestyle=':', alpha=0.5, label='Medium zone')
    ax.axhline(y=0.75, color='red', linestyle=':', alpha=0.5, label='Hard zone')
    ax.set_xlabel('Training Round', fontweight='bold')
    ax.set_ylabel('Difficulty (0=Easiest, 1=Hardest)', fontweight='bold')
    ax.set_title('Curriculum Difficulty Progression (ADR)\n'
                 f'Started at {diff_fracs[0]:.2f}, ended at {diff_fracs[-1]:.2f}',
                 fontsize=12, fontweight='bold')
    ax.legend(fontsize=9)
    ax.set_ylim(0, 1.05)
    saved.append(_save_figure(fig, out_dir, "06_difficulty_progression.png"))

    # ---- Console report -------------------------------------------------
    print(f" [GRAPHS] Saved {len(saved)} graphs to {out_dir}/")
    for png_name in saved:
        print(f" -> {png_name}")
    print(f" [GRAPHS] Saved CSV -> {csv_path.name}")

    # Compare the mean of the first (up to) 10 rounds with the mean of the
    # last (up to) 10; with fewer than 20 rounds the two windows overlap.
    n = len(rn)
    early_n = min(10, n)
    late_start = max(0, n - 10)
    print(f"\n {'Metric':<35s} {'Early':>10s} {'Late':>10s} {'Trend':>7s}")
    print(f" {'-'*35} {'-'*10} {'-'*10} {'-'*7}")
    for metric, vals in [
        ("Abs. Training Loss", abs_loss),
        ("Raw Score", scores),
        ("Diff-Normalized Reward", norm_rewards),
        ("Detection Rate", det_rates),
        ("FP Rate", fp_rates),
        ("Efficiency", eff_rates),
    ]:
        early = np.mean(vals[:early_n])
        late = np.mean(vals[late_start:])
        # Differences within +/-0.005 are reported as FLAT.
        if late < early - 0.005:
            trend = "DOWN"
        elif late > early + 0.005:
            trend = "UP"
        else:
            trend = "FLAT"
        print(f" {metric:<35s} {early:10.4f} {late:10.4f} {trend:>7s}")

    print(f"\n Agent Elo: {elos[0]:.1f} -> {elos[-1]:.1f} (d={elos[-1]-elos[0]:+.1f})")
    print(f" Opponent Elo: {diff_elos[0]:.1f} -> {diff_elos[-1]:.1f} (d={diff_elos[-1]-diff_elos[0]:+.1f})")

    return saved
|
|
|
|
# Script entry point: when executed directly, build every graph using the
# default input file and output directory.
if __name__ == "__main__":
    generate_graphs()
|
|