Spaces:
Sleeping
Sleeping
| """ | |
| Evaluation script — generates before/after comparison plots. | |
| Run after training to produce the graphs for your README and presentation. | |
| Usage: | |
| python scripts/evaluate.py \ | |
| --baseline ./outputs/baseline_results.json \ | |
| --trained ./outputs/grpo_solver/training_log_history.json \ | |
| --output ./outputs/plots/ | |
| """ | |
| import sys | |
| import os | |
| import json | |
| import argparse | |
| from pathlib import Path | |
# Default matplotlib to the non-interactive "Agg" backend unless the caller
# already exported MPLBACKEND. This runs before any matplotlib import (those
# happen lazily inside the plotting functions below).
os.environ.setdefault("MPLBACKEND", "Agg")
# Put the parent of this script's directory at the front of sys.path
# (presumably the project root, so project packages import — TODO confirm).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
| def load_json(path: str) -> dict: | |
| with open(path) as f: | |
| return json.load(f) | |
def smooth(arr, window=10):
    """Return the trailing moving average of ``arr``.

    Element ``i`` of the result is the mean of ``arr[max(0, i - window + 1)]``
    through ``arr[i]``, so the first ``window - 1`` outputs average over a
    shorter prefix instead of being dropped.

    Args:
        arr: Sequence of numbers (must support slicing).
        window: Maximum number of trailing samples to average; must be >= 1.

    Returns:
        A list of floats with the same length as ``arr``.

    Raises:
        ValueError: If ``window`` is smaller than 1. Without this check a
            window of 0 divides by zero and a negative window silently
            yields meaningless values.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    result = []
    for end in range(1, len(arr) + 1):
        start = max(0, end - window)
        result.append(sum(arr[start:end]) / (end - start))
    return result
def plot_reward_curves(history: list, output_dir: str):
    """Plot the 2x2 training-metrics figure and save ``training_curves.png``.

    The layout of ``history`` is decided from its first record:

    * Episode records (first record has an ``"episode"`` key): reads
      ``setter_reward``, ``solver_reward``, optional ``solver_pass_rate``,
      optional ``setter_elo`` / ``solver_elo`` and ``outcome`` per record.
    * Step logs (anything else): keeps only records that have ``"step"`` and
      at least one of ``"reward"`` / ``"reward_pass_rate"``. The Elo and
      outcome panels are replaced by explanatory text.

    Args:
        history: List of per-episode or per-step metric dicts.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        None. A warning is printed and nothing is written when ``history`` is
        empty or not a list, when matplotlib is not installed, or when step
        logs contain no plottable record.
    """
    # Guard before indexing history[0]: an empty list (or a dict that fell
    # through the caller's normalisation) would otherwise raise.
    if not isinstance(history, (list, tuple)) or not history:
        print("⚠ No plottable training metrics found")
        return

    try:
        import matplotlib.pyplot as plt
        import matplotlib.style as style
        style.use('dark_background')
    except ImportError:
        print("⚠ matplotlib not installed. pip install matplotlib")
        return

    first = history[0]
    if "episode" in first:
        x_axis = [r["episode"] for r in history]
        setter_rewards = smooth([r["setter_reward"] for r in history])
        solver_rewards = smooth([r["solver_reward"] for r in history])
        pass_rates = smooth([r.get("solver_pass_rate", 0) for r in history])
        setter_elo = [r["setter_elo"] for r in history] if "setter_elo" in first else None
        solver_elo = [r["solver_elo"] for r in history] if "solver_elo" in first else None
        outcomes_source = history
        x_label = "Episode"
    else:
        log_records = [
            r for r in history
            if "step" in r and ("reward" in r or "reward_pass_rate" in r)
        ]
        if not log_records:
            print("⚠ No plottable training metrics found")
            return
        x_axis = [r["step"] for r in log_records]
        # Step logs carry no setter reward; all-zero values suppress that line.
        setter_rewards = [0.0 for _ in log_records]
        solver_rewards = smooth([r.get("reward", 0.0) for r in log_records])
        pass_rates = smooth([r.get("reward_pass_rate", 0.0) for r in log_records])
        setter_elo = None
        solver_elo = None
        outcomes_source = []
        x_label = "Training Step"

    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    fig.patch.set_facecolor('#0d0d0d')
    fig.suptitle('CodeCourt — Training Metrics', color='white',
                 fontsize=16, fontweight='bold', y=0.98)

    COLORS = {
        'setter': '#ff6b35',
        'solver': '#4ecdc4',
        'invalid': '#888888',
        'grid': '#333333',
        'text': '#cccccc',
    }
    legend_style = dict(facecolor='#2a2a2a', edgecolor='#555555',
                        labelcolor='white', fontsize=9)

    def style_ax(ax, title, xlabel, ylabel):
        """Apply the shared dark theme, title and axis labels to one panel."""
        ax.set_facecolor('#1a1a1a')
        ax.set_title(title, color='white', fontsize=11, pad=8)
        ax.set_xlabel(xlabel, color=COLORS['text'], fontsize=9)
        ax.set_ylabel(ylabel, color=COLORS['text'], fontsize=9)
        ax.tick_params(colors=COLORS['text'])
        ax.grid(True, color=COLORS['grid'], linewidth=0.5, alpha=0.7)
        for spine in ax.spines.values():
            spine.set_color('#444444')

    # 1. Reward curves
    ax = axes[0, 0]
    if any(value != 0.0 for value in setter_rewards):
        ax.plot(x_axis, setter_rewards, color=COLORS['setter'],
                linewidth=1.5, label='Setter Reward')
    ax.plot(x_axis, solver_rewards, color=COLORS['solver'],
            linewidth=1.5, label='Solver Reward')
    ax.axhline(0, color='#555555', linewidth=0.8, linestyle='--')
    ax.legend(**legend_style)
    style_ax(ax, 'Reward Curves (smoothed, window=10)',
             x_label, 'Avg Reward')

    # 2. Solver pass rate over time
    ax = axes[0, 1]
    ax.plot(x_axis, [p * 100 for p in pass_rates],
            color=COLORS['solver'], linewidth=1.5)
    ax.set_ylim(0, 105)
    ax.axhline(50, color='#ffaa00', linewidth=0.8, linestyle='--',
               label='50% baseline')
    ax.legend(**legend_style)
    style_ax(ax, 'Solver Pass Rate (%)', x_label, 'Pass Rate %')

    # 3. Elo ratings
    ax = axes[1, 0]
    if setter_elo is not None and solver_elo is not None:
        ax.plot(x_axis, setter_elo, color=COLORS['setter'],
                linewidth=1.5, label='Setter Elo')
        ax.plot(x_axis, solver_elo, color=COLORS['solver'],
                linewidth=1.5, label='Solver Elo')
        ax.axhline(1000, color='#555555', linewidth=0.8, linestyle='--')
        ax.legend(**legend_style)
        style_ax(ax, 'Elo Rating Progression', x_label, 'Elo Rating')
    else:
        ax.text(0.5, 0.5, 'GRPO run logs reward metrics,\nnot match Elo.',
                ha='center', va='center', color='white', fontsize=11)
        ax.set_axis_off()

    # 4. Outcome distribution (stacked bar, binned)
    ax = axes[1, 1]
    if not outcomes_source:
        ax.text(0.5, 0.5, 'Outcome bins are available for\nlegacy episode runs only.',
                ha='center', va='center', color='white', fontsize=11)
        ax.set_axis_off()
    else:
        # Aim for roughly 20 bars regardless of run length.
        bin_size = max(1, len(outcomes_source) // 20)
        bins = []
        setter_wins_pct = []
        solver_wins_pct = []
        invalid_pct = []
        for i in range(0, len(outcomes_source), bin_size):
            outcomes = [r.get("outcome") for r in outcomes_source[i:i + bin_size]]
            n = len(outcomes)
            bins.append(i)
            setter_wins_pct.append(outcomes.count("setter_wins") / n * 100)
            solver_wins_pct.append(outcomes.count("solver_wins") / n * 100)
            invalid_pct.append(outcomes.count("invalid") / n * 100)

        bar_width = bin_size * 0.8
        ax.bar(bins, setter_wins_pct, width=bar_width,
               color=COLORS['setter'], alpha=0.8, label='Setter Wins')
        ax.bar(bins, solver_wins_pct, width=bar_width,
               bottom=setter_wins_pct, color=COLORS['solver'],
               alpha=0.8, label='Solver Wins')
        # The invalid share was computed but never drawn, which left the
        # stacks short of 100%. Stack it on top of the two win series.
        wins_total = [a + b for a, b in zip(setter_wins_pct, solver_wins_pct)]
        ax.bar(bins, invalid_pct, width=bar_width,
               bottom=wins_total, color=COLORS['invalid'],
               alpha=0.8, label='Invalid')
        ax.set_ylim(0, 105)
        ax.legend(**legend_style)
        style_ax(ax, 'Outcome Distribution Over Time',
                 'Episode', 'Percentage %')

    fig.tight_layout(rect=[0, 0, 1, 0.96])
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, 'training_curves.png')
    fig.savefig(out_path, dpi=150, bbox_inches='tight',
                facecolor=fig.get_facecolor())
    print(f"✓ Saved: {out_path}")
    # Close this figure explicitly rather than whichever one is "current".
    plt.close(fig)
def plot_before_after(baseline: dict, trained_history: list, output_dir: str):
    """Draw the before/after bar chart and save it as ``before_after.png``.

    "After" values are averages over the last quarter of training:

    * Episode records (first record has ``"episode"``): mean
      ``solver_pass_rate``, mean ``solver_reward`` and the percentage of
      ``outcome == "setter_wins"``.
    * Step logs: mean ``reward_pass_rate`` and mean ``reward`` over records
      that have ``"step"`` plus a reward field. The setter win rate is
      reported as 0.0 because these logs do not contain it.

    "Before" values come from ``baseline["summary"]`` when present, otherwise
    from ``baseline`` itself.

    Args:
        baseline: Baseline results dict.
        trained_history: List of per-episode or per-step metric dicts.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        None. A warning is printed and nothing is written when the history is
        empty or not a list, or when matplotlib is not installed.
    """
    # Guard before indexing trained_history[0].
    if not isinstance(trained_history, (list, tuple)) or not trained_history:
        print("⚠ No training history — before/after chart skipped")
        return

    try:
        import matplotlib.pyplot as plt
        import matplotlib.style as style
        style.use('dark_background')
    except ImportError:
        # Report the reason instead of returning silently.
        print("⚠ matplotlib not installed. pip install matplotlib")
        return

    # Compute trained metrics (last 25% of training)
    if "episode" in trained_history[0]:
        n = len(trained_history)
        last_quarter = trained_history[n * 3 // 4:]
        count = max(len(last_quarter), 1)
        trained_pass_rate = sum(
            r.get("solver_pass_rate", 0) for r in last_quarter
        ) / count
        trained_solver_reward = sum(
            r.get("solver_reward", 0) for r in last_quarter
        ) / count
        trained_setter_win_rate = (
            sum(1 for r in last_quarter if r.get("outcome") == "setter_wins")
            / count * 100
        )
    else:
        log_records = [
            r for r in trained_history
            if "step" in r and ("reward" in r or "reward_pass_rate" in r)
        ]
        last_quarter = log_records[len(log_records) * 3 // 4:]
        count = max(len(last_quarter), 1)
        trained_pass_rate = sum(
            r.get("reward_pass_rate", 0) for r in last_quarter
        ) / count
        trained_solver_reward = sum(r.get("reward", 0) for r in last_quarter) / count
        trained_setter_win_rate = 0.0

    baseline_summary = baseline.get("summary", baseline)
    # NOTE(review): these placeholders are charted as if they were measured
    # when the baseline file lacks a metric. They are kept for backward
    # compatibility, but the user is now told when one is used.
    placeholders = {
        "avg_solver_pass_rate": 0.31,
        "avg_solver_reward": -15,
        "setter_win_rate": 0.4,
    }
    missing = [key for key in placeholders if key not in baseline_summary]
    if missing:
        print(f"⚠ Baseline is missing {', '.join(missing)} — using placeholder values")
    baseline_pass = baseline_summary.get(
        "avg_solver_pass_rate", placeholders["avg_solver_pass_rate"])
    baseline_reward = baseline_summary.get(
        "avg_solver_reward", placeholders["avg_solver_reward"])
    baseline_setter_win = baseline_summary.get(
        "setter_win_rate", placeholders["setter_win_rate"])

    fig, axes = plt.subplots(1, 3, figsize=(14, 5))
    fig.patch.set_facecolor('#0d0d0d')
    fig.suptitle('CodeCourt — Before vs After Training',
                 color='white', fontsize=15, fontweight='bold')

    BEFORE = '#ff6b35'
    AFTER = '#4ecdc4'
    BG = '#1a1a1a'

    metrics = [
        ("Solver Pass Rate", baseline_pass * 100, trained_pass_rate * 100, "%"),
        ("Avg Solver Reward", baseline_reward, trained_solver_reward, "pts"),
        ("Setter Win Rate", baseline_setter_win * 100, trained_setter_win_rate, "%"),
    ]

    for ax, (title, before_val, after_val, unit) in zip(axes, metrics):
        ax.set_facecolor(BG)
        bars = ax.bar(['Before\n(Untrained)', 'After\n(Trained)'],
                      [before_val, after_val],
                      color=[BEFORE, AFTER], width=0.5,
                      edgecolor='#333333')

        # Value labels. Scale the gap by the larger bar so it does not
        # collapse to zero when the baseline is 0, and put labels for
        # negative bars below the bar end instead of inside the bar.
        pad = 0.05 * max(abs(before_val), abs(after_val))
        for bar, val in zip(bars, (before_val, after_val)):
            negative = val < 0
            ax.text(bar.get_x() + bar.get_width() / 2,
                    val - pad if negative else val + pad,
                    f"{val:.1f}{unit}",
                    ha='center', va='top' if negative else 'bottom',
                    color='white', fontsize=13, fontweight='bold')

        ax.set_title(title, color='white', fontsize=11, pad=10)
        ax.tick_params(colors='#cccccc')
        ax.set_ylabel(unit, color='#cccccc', fontsize=9)
        ax.grid(True, axis='y', color='#333333', linewidth=0.5)
        for spine in ax.spines.values():
            spine.set_color('#444444')

    fig.tight_layout(rect=[0, 0, 1, 0.93])
    # Create the directory here too, so this function works when called alone.
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, 'before_after.png')
    fig.savefig(out_path, dpi=150, bbox_inches='tight',
                facecolor=fig.get_facecolor())
    print(f"✓ Saved: {out_path}")
    plt.close(fig)
| def build_evaluation_summary(baseline: dict | None, trained_history: list) -> dict: | |
| log_records = [row for row in trained_history if isinstance(row, dict) and "step" in row] | |
| baseline_summary = (baseline or {}).get("summary", baseline or {}) | |
| baseline_pass = baseline_summary.get("avg_solver_pass_rate") | |
| baseline_reward = baseline_summary.get("avg_solver_reward") | |
| if log_records: | |
| final_record = log_records[-1] | |
| trained_pass = final_record.get("reward_pass_rate") | |
| trained_reward = final_record.get("reward") | |
| trained_robustness = final_record.get("reward_robustness") | |
| if trained_pass is None: | |
| pass_values = [row.get("reward_pass_rate") for row in log_records if row.get("reward_pass_rate") is not None] | |
| trained_pass = pass_values[-1] if pass_values else None | |
| if trained_reward is None: | |
| reward_values = [row.get("reward") for row in log_records if row.get("reward") is not None] | |
| trained_reward = reward_values[-1] if reward_values else None | |
| if trained_robustness is None: | |
| robustness_values = [ | |
| row.get("reward_robustness") | |
| for row in log_records | |
| if row.get("reward_robustness") is not None | |
| ] | |
| trained_robustness = robustness_values[-1] if robustness_values else None | |
| setter_win_rate = None | |
| else: | |
| episodes = [row for row in trained_history if isinstance(row, dict) and "episode" in row] | |
| tail = episodes[len(episodes) * 3 // 4:] if episodes else [] | |
| trained_pass = ( | |
| sum(row.get("solver_pass_rate", 0) for row in tail) / max(len(tail), 1) | |
| if tail else None | |
| ) | |
| trained_reward = ( | |
| sum(row.get("solver_reward", 0) for row in tail) / max(len(tail), 1) | |
| if tail else None | |
| ) | |
| trained_robustness = None | |
| setter_win_rate = ( | |
| sum(1 for row in tail if row.get("outcome") == "setter_wins") / max(len(tail), 1) | |
| if tail else None | |
| ) | |
| return { | |
| "baseline_pass_rate": baseline_pass, | |
| "trained_pass_rate": trained_pass, | |
| "pass_rate_delta": (trained_pass - baseline_pass) if baseline_pass is not None and trained_pass is not None else None, | |
| "baseline_reward": baseline_reward, | |
| "trained_reward": trained_reward, | |
| "reward_delta": (trained_reward - baseline_reward) if baseline_reward is not None and trained_reward is not None else None, | |
| "trained_robustness": trained_robustness, | |
| "trained_setter_win_rate": setter_win_rate, | |
| } | |
def generate_reports(baseline_path: Path | None, trained_path: Path, output_dir: Path):
    """Produce all plots plus ``evaluation_summary.json`` for one training run.

    Loads the training history (a top-level list, or the ``"episodes"`` entry
    of a dict when present), draws the training curves, draws the
    before/after chart only when a baseline file exists, then writes and
    returns the summary built by ``build_evaluation_summary``.

    Args:
        baseline_path: Baseline results file, or None to skip the comparison.
        trained_path: Training history JSON file.
        output_dir: Directory that receives the plots and the summary.

    Returns:
        The summary dict that was written to disk.
    """
    loaded = load_json(str(trained_path))
    if isinstance(loaded, list):
        history = loaded
    else:
        history = loaded.get("episodes", loaded)

    os.makedirs(output_dir, exist_ok=True)
    plots_dir = str(output_dir)
    plot_reward_curves(history, plots_dir)

    # The comparison chart needs baseline numbers; without a file only the
    # training curves and the summary are produced.
    baseline = None
    if baseline_path and baseline_path.exists():
        baseline = load_json(str(baseline_path))
        plot_before_after(baseline, history, plots_dir)

    summary = build_evaluation_summary(baseline, history)
    summary_path = output_dir / "evaluation_summary.json"
    with open(summary_path, "w") as handle:
        json.dump(summary, handle, indent=2)
    print(f"✓ Saved: {summary_path}")
    return summary
def main():
    """Command-line entry point: parse options, report inputs, build reports.

    Exits early with a hint when the training history file does not exist.
    A missing baseline file only produces a warning; the remaining reports
    are still generated.
    """
    parser = argparse.ArgumentParser()
    option_defaults = (
        ("--baseline", "./outputs/baseline_results.json"),
        ("--trained", "./outputs/grpo_solver/training_log_history.json"),
        ("--output", "./outputs/plots/"),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, type=str, default=default)
    args = parser.parse_args()

    print("\n CodeCourt Evaluation")
    print("=" * 50)

    # Load data
    if not os.path.exists(args.trained):
        print(f"⚠ No training history at {args.trained}")
        print(" Run: python scripts/train.py --train-samples 54 --max-steps 30")
        return

    # Loaded here only to report the record count; generate_reports reads
    # the file again itself.
    loaded = load_json(args.trained)
    if not isinstance(loaded, list):
        loaded = loaded.get("episodes", loaded)
    print(f"Loaded {len(loaded)} training episodes")

    if os.path.exists(args.baseline):
        baseline_path = Path(args.baseline)
    else:
        baseline_path = None
        print(f"⚠ No baseline at {args.baseline} — before/after chart will be skipped")
        print(" Run: python scripts/baseline.py")

    generate_reports(baseline_path, Path(args.trained), Path(args.output))
    print(f"\n✓ All plots saved to: {args.output}")


if __name__ == "__main__":
    main()