Spaces:

vettri06
/

meta_hackathon_finals_d3cdrs

Sleeping

meta_hackathon_finals_d3cdrs / scripts /generate_performance_matrix.py

GOOD CAT

Deploy clean Space snapshot without binary artifacts

ccd6313 25 days ago

13.2 kB

	"""Generate individual performance graphs from self-play training results.

	Produces separate PNG files for each metric in the output/ directory.
	Called automatically after every self-play training run.

	Output files:
	output/01_training_loss.png
	output/02_reward_analysis.png
	output/03_elo_progression.png
	output/04_win_rate.png
	output/05_detection_fp_rate.png
	output/06_difficulty_progression.png
	output/performance_matrix.csv
	"""
	import json
	import os
	import sys
	import numpy as np
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import pandas as pd
	from pathlib import Path

	# Ensure project root is on path.
	# ROOT_DIR is the directory two levels above this file (the parent of the
	# directory that contains this script). It is prepended to sys.path so the
	# `server.*` imports used further down resolve against it first, and it is
	# also the default location for the input JSON and the output/ directory.
	ROOT_DIR = Path(__file__).resolve().parent.parent
	sys.path.insert(0, str(ROOT_DIR))


	def compute_fixed_baseline_scores():
	    """Score the heuristic policy on each fixed task tier.

	    For every tier ('easy', 'medium', 'hard') a fresh ``FirewallEnvironment``
	    built with ``seed=303`` is handed to ``run_deterministic_grade`` together
	    with ``heuristic_policy``, and the ``'score'`` entry of the result is kept.

	    Returns:
	        dict mapping each tier name to the score reported for it.
	    """
	    # Project-local modules are resolved at call time, not at import time.
	    from server.firewall_environment import FirewallEnvironment
	    from server.graders import run_deterministic_grade
	    from server.baseline.heuristic_agent import heuristic_policy

	    def _grade(tier):
	        # One new environment per tier, always with the same seed.
	        graded = run_deterministic_grade(
	            FirewallEnvironment(seed=303), tier, heuristic_policy
	        )
	        return graded['score']

	    return {tier: _grade(tier) for tier in ('easy', 'medium', 'hard')}


	def _rolling_mean(values, window):
	    """Return the trailing rolling mean of *values* as a plain list.

	    ``min_periods=1`` means the first entries are averaged over however many
	    points are available instead of yielding NaN.
	    """
	    return pd.Series(values).rolling(window=window, min_periods=1).mean().tolist()


	def _save_figure(fig, path):
	    """Tighten the layout of *fig*, save it to *path*, close it, return the file name."""
	    fig.tight_layout()
	    fig.savefig(path, dpi=200, bbox_inches='tight')
	    # Close explicitly so figures do not accumulate across the six graphs.
	    plt.close(fig)
	    return path.name


	def generate_graphs(input_json: str = None, output_dir: str = None):
	    """Generate all individual performance graph files.

	    Reads the self-play results JSON, writes ``performance_matrix.csv`` plus
	    six PNG graphs into the output directory, and prints an early-vs-late
	    summary table to stdout.

	    Args:
	        input_json: Path to self_play_results.json (default: project root)
	        output_dir: Directory to save graphs (default: project root / output)

	    Returns:
	        List with the file names of the saved PNG graphs, or ``None`` when the
	        input file does not exist or holds no rounds.

	    Raises:
	        KeyError: If a round entry lacks one of the fields read below
	            ("round", "score", "elo", "elo_delta", "difficulty_elo",
	            "passed", or "det"/"fp"/"eff" under "stats").
	    """
	    input_path = Path(input_json) if input_json else ROOT_DIR / "self_play_results.json"
	    out_dir = Path(output_dir) if output_dir else ROOT_DIR / "output"

	    if not input_path.exists():
	        print(f" [GRAPHS] Error: {input_path} not found")
	        return None

	    out_dir.mkdir(parents=True, exist_ok=True)

	    with open(input_path, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    rounds_data = data.get("rounds", [])
	    if not rounds_data:
	        print(" [GRAPHS] No rounds data found.")
	        return None

	    # ── Fixed baseline ──
	    print(" [GRAPHS] Computing fixed baselines...")
	    baselines = compute_fixed_baseline_scores()

	    # ── Extract data ──
	    rn = [r["round"] for r in rounds_data]
	    scores = [r["score"] for r in rounds_data]
	    elos = [r["elo"] for r in rounds_data]
	    elo_deltas = [r["elo_delta"] for r in rounds_data]
	    diff_elos = [r["difficulty_elo"] for r in rounds_data]
	    det_rates = [r["stats"]["det"] for r in rounds_data]
	    fp_rates = [r["stats"]["fp"] for r in rounds_data]
	    eff_rates = [r["stats"]["eff"] for r in rounds_data]

	    # Derived metrics
	    abs_loss = [1.0 - s for s in scores]
	    # Difficulty Elo mapped linearly from [800, 1600] onto [0, 1], clipped.
	    diff_fracs = [np.clip((de - 800) / 800, 0, 1) for de in diff_elos]
	    # Score divided by a difficulty-dependent ceiling (never below 0.3),
	    # capped at 1.0.
	    norm_rewards = [
	        min(1.0, s / max(0.3, 1.0 - 0.3 * frac))
	        for s, frac in zip(scores, diff_fracs)
	    ]
	    elo_gaps = [e - de for e, de in zip(elos, diff_elos)]

	    w = 5  # rolling window
	    pass_thresh = data.get("config", {}).get("pass_threshold", 0.55)
	    wins = [1 if r["passed"] else 0 for r in rounds_data]
	    win_roll = _rolling_mean(wins, w)
	    det_roll = _rolling_mean(det_rates, w)
	    fp_roll = _rolling_mean(fp_rates, w)
	    loss_roll = _rolling_mean(abs_loss, w)
	    score_roll_mean = np.array(_rolling_mean(scores, w))
	    # The std of a single-point window is NaN; treat it as zero spread.
	    score_roll_std = np.array(
	        pd.Series(scores).rolling(window=w, min_periods=1).std().fillna(0).tolist()
	    )

	    # ── Save CSV ──
	    df = pd.DataFrame({
	        "Round": rn, "Raw_Score": scores, "Abs_Training_Loss": abs_loss,
	        "Diff_Normalized_Reward": norm_rewards,
	        "Detection_Rate": det_rates, "FP_Rate": fp_rates, "Efficiency": eff_rates,
	        "Agent_Elo": elos, "Elo_Delta": elo_deltas, "Difficulty_Elo": diff_elos,
	        "Elo_Gap": elo_gaps, "Win_Rate": win_roll, "Difficulty_Frac": diff_fracs,
	    })
	    csv_path = out_dir / "performance_matrix.csv"
	    df.to_csv(csv_path, index=False, float_format="%.6f")

	    # ── Shared style ──
	    # NOTE: this updates matplotlib's global rcParams, so the style stays in
	    # effect for the rest of the process.
	    plt.rcParams.update({
	        'figure.facecolor': '#FAFAFA',
	        'axes.facecolor': '#FFFFFF',
	        'axes.grid': True,
	        'grid.alpha': 0.3,
	        'font.size': 11,
	    })

	    saved = []

	    # ================================================================
	    # GRAPH 1: Training Loss
	    # ================================================================
	    fig, ax = plt.subplots(figsize=(10, 5))
	    ax.plot(rn, abs_loss, color='#E74C3C', linewidth=2, marker='o', markersize=4,
	            alpha=0.6, label='Abs. Loss (1 - score)')
	    ax.plot(rn, loss_roll, color='#C0392B', linewidth=2.5, linestyle='--',
	            label=f'Rolling Mean (w={w})')
	    ax.set_xlabel('Training Round', fontweight='bold')
	    ax.set_ylabel('Training Loss', fontweight='bold')
	    ax.set_title('Training Loss (Absolute Performance Gap)\n'
	                 'Loss increases because curriculum difficulty rises, not because agent worsens',
	                 fontsize=12, fontweight='bold')
	    ax.legend(fontsize=10)
	    # Guard against a zero-height axis when every loss is 0 (all scores 1.0):
	    # identical limits would otherwise trigger a matplotlib warning.
	    loss_top = max(abs_loss) * 1.3
	    ax.set_ylim(0, loss_top if loss_top > 0 else 1.0)
	    saved.append(_save_figure(fig, out_dir / "01_training_loss.png"))

	    # ================================================================
	    # GRAPH 2: Reward Analysis
	    # ================================================================
	    fig, ax = plt.subplots(figsize=(10, 5))
	    ax.plot(rn, scores, color='#2ECC71', linewidth=1.5, alpha=0.4, marker='.',
	            label='Raw Score (vs adaptive opponent)')
	    ax.plot(rn, norm_rewards, color='#27AE60', linewidth=2.5, marker='o', markersize=4,
	            label='Difficulty-Normalized Reward')
	    ax.fill_between(rn,
	                    score_roll_mean - score_roll_std,
	                    score_roll_mean + score_roll_std,
	                    color='#2ECC71', alpha=0.15, label=f'Score Std Dev (w={w})')
	    ax.axhline(y=baselines['medium'], color='gray', linestyle=':', linewidth=1.5,
	               label=f'Fixed Medium Baseline ({baselines["medium"]:.3f})')
	    ax.axhline(y=pass_thresh, color='red', linestyle=':', alpha=0.5,
	               label=f'Pass Threshold ({pass_thresh})')
	    ax.set_xlabel('Training Round', fontweight='bold')
	    ax.set_ylabel('Reward / Score', fontweight='bold')
	    ax.set_title('Reward Analysis: Raw vs Difficulty-Normalized\n'
	                 'Normalized reward UP = agent genuinely improving despite harder tasks',
	                 fontsize=12, fontweight='bold')
	    ax.legend(fontsize=9, loc='lower left')
	    saved.append(_save_figure(fig, out_dir / "02_reward_analysis.png"))

	    # ================================================================
	    # GRAPH 3: Elo Progression
	    # ================================================================
	    fig, ax = plt.subplots(figsize=(10, 5))
	    ax.plot(rn, elos, color='#3498DB', linewidth=2.5, marker='o', markersize=4,
	            label='Agent Elo')
	    ax.plot(rn, diff_elos, color='#E67E22', linewidth=2, marker='s', markersize=3,
	            linestyle='--', label='Opponent (Difficulty) Elo')
	    ax.fill_between(rn, elos, diff_elos,
	                    where=[e < de for e, de in zip(elos, diff_elos)],
	                    color='#E74C3C', alpha=0.1, label='Agent Behind')
	    ax.fill_between(rn, elos, diff_elos,
	                    where=[e >= de for e, de in zip(elos, diff_elos)],
	                    color='#27AE60', alpha=0.1, label='Agent Ahead')
	    ax.set_xlabel('Training Round', fontweight='bold')
	    ax.set_ylabel('Elo Rating', fontweight='bold')
	    # Plain '|' separator: the previous '\|' was an invalid escape sequence
	    # that put a literal backslash into the title.
	    ax.set_title('Elo Progression: Agent vs Adaptive Opponent\n'
	                 f'Method: Logistic K=32 | Gap: {elo_gaps[0]:+.0f} -> {elo_gaps[-1]:+.0f}',
	                 fontsize=12, fontweight='bold')
	    ax.legend(fontsize=9)
	    saved.append(_save_figure(fig, out_dir / "03_elo_progression.png"))

	    # ================================================================
	    # GRAPH 4: Win Rate & Elo Delta
	    # ================================================================
	    fig, ax1 = plt.subplots(figsize=(10, 5))
	    ax2 = ax1.twinx()  # second y-axis: Elo delta on the left, win rate on the right
	    ax1.bar(rn, elo_deltas, color='#3498DB', alpha=0.35, label='Elo Delta per Round')
	    ax2.plot(rn, win_roll, color='#1ABC9C', linewidth=2.5, marker='o',
	             markersize=4, label=f'Win Rate (rolling w={w})')
	    ax2.axhline(y=1.0, color='gray', linestyle=':', alpha=0.5)
	    ax1.set_xlabel('Training Round', fontweight='bold')
	    ax1.set_ylabel('Elo Delta', fontweight='bold', color='#3498DB')
	    ax2.set_ylabel('Win Rate', fontweight='bold', color='#1ABC9C')
	    ax2.set_ylim(0, 1.15)
	    total_pass = sum(wins)
	    ax1.set_title('Win Rate & Elo Gain per Round\n'
	                  f'Overall: {total_pass}/{len(wins)} passed ({100*total_pass/len(wins):.0f}%)',
	                  fontsize=12, fontweight='bold')
	    # Merge the handles of both axes so a single legend covers bars and line.
	    h1, l1 = ax1.get_legend_handles_labels()
	    h2, l2 = ax2.get_legend_handles_labels()
	    ax1.legend(h1 + h2, l1 + l2, fontsize=9, loc='lower right')
	    saved.append(_save_figure(fig, out_dir / "04_win_rate.png"))

	    # ================================================================
	    # GRAPH 5: Detection & FP Rate
	    # ================================================================
	    fig, ax = plt.subplots(figsize=(10, 5))
	    ax.plot(rn, det_roll, color='#9B59B6', linewidth=2.5, marker='o', markersize=4,
	            label=f'Detection Rate (rolling w={w})')
	    ax.plot(rn, fp_roll, color='#E74C3C', linewidth=2, marker='s', markersize=3,
	            label=f'False Positive Rate (rolling w={w})')
	    ax.plot(rn, eff_rates, color='#F39C12', linewidth=1.5, alpha=0.5, marker='.',
	            label='Efficiency')
	    ax.set_xlabel('Training Round', fontweight='bold')
	    ax.set_ylabel('Rate', fontweight='bold')
	    ax.set_title('Detection, False Positive & Efficiency over Training\n'
	                 'Detection stays high while FP stays near zero',
	                 fontsize=12, fontweight='bold')
	    ax.legend(fontsize=9)
	    ax.set_ylim(-0.02, 1.05)
	    saved.append(_save_figure(fig, out_dir / "05_detection_fp_rate.png"))

	    # ================================================================
	    # GRAPH 6: Difficulty Progression
	    # ================================================================
	    fig, ax = plt.subplots(figsize=(10, 5))
	    ax.plot(rn, diff_fracs, color='#E67E22', linewidth=2.5, marker='s', markersize=4,
	            label='Difficulty Fraction')
	    ax.fill_between(rn, 0, diff_fracs, color='#E67E22', alpha=0.15)
	    ax.axhline(y=0.25, color='green', linestyle=':', alpha=0.5, label='Easy zone')
	    ax.axhline(y=0.5, color='orange', linestyle=':', alpha=0.5, label='Medium zone')
	    ax.axhline(y=0.75, color='red', linestyle=':', alpha=0.5, label='Hard zone')
	    ax.set_xlabel('Training Round', fontweight='bold')
	    ax.set_ylabel('Difficulty (0=Easiest, 1=Hardest)', fontweight='bold')
	    ax.set_title('Curriculum Difficulty Progression (ADR)\n'
	                 f'Started at {diff_fracs[0]:.2f}, ended at {diff_fracs[-1]:.2f}',
	                 fontsize=12, fontweight='bold')
	    ax.legend(fontsize=9)
	    ax.set_ylim(0, 1.05)
	    saved.append(_save_figure(fig, out_dir / "06_difficulty_progression.png"))

	    # ── Print summary ──
	    print(f" [GRAPHS] Saved {len(saved)} graphs to {out_dir}/")
	    for name in saved:
	        print(f" -> {name}")
	    print(f" [GRAPHS] Saved CSV -> {csv_path.name}")

	    # Console summary table: mean of the first (up to) 10 rounds versus the
	    # mean of the last (up to) 10 rounds. With fewer than 20 rounds the two
	    # windows overlap.
	    n = len(rn)
	    early_n = min(10, n)
	    late_start = max(0, n - 10)
	    print(f"\n {'Metric':<35s} {'Early':>10s} {'Late':>10s} {'Trend':>7s}")
	    # Rule line under the header; the '*' repetition operators are required
	    # for this f-string to be valid Python.
	    print(f" {'-'*35} {'-'*10} {'-'*10} {'-'*7}")
	    for name, vals in [
	        ("Abs. Training Loss", abs_loss),
	        ("Raw Score", scores),
	        ("Diff-Normalized Reward", norm_rewards),
	        ("Detection Rate", det_rates),
	        ("FP Rate", fp_rates),
	        ("Efficiency", eff_rates),
	    ]:
	        early = np.mean(vals[:early_n])
	        late = np.mean(vals[late_start:])
	        # Differences within +/-0.005 are reported as FLAT.
	        trend = "DOWN" if late < early - 0.005 else ("UP" if late > early + 0.005 else "FLAT")
	        print(f" {name:<35s} {early:10.4f} {late:10.4f} {trend:>7s}")

	    print(f"\n Agent Elo: {elos[0]:.1f} -> {elos[-1]:.1f} (d={elos[-1]-elos[0]:+.1f})")
	    print(f" Opponent Elo: {diff_elos[0]:.1f} -> {diff_elos[-1]:.1f} (d={diff_elos[-1]-diff_elos[0]:+.1f})")

	    return saved


	# Script entry point: when this file is executed directly, build the graphs
	# with the default input JSON and output directory (both under ROOT_DIR).
	if __name__ == "__main__":
	    generate_graphs()