"""Generate training evidence plots from baseline and training artifacts.""" import json import sys from pathlib import Path import matplotlib.pyplot as plt import numpy as np ARTIFACTS_DIR = Path("artifacts") PLOTS_DIR = ARTIFACTS_DIR / "plots" PLOTS_DIR.mkdir(parents=True, exist_ok=True) def load_baseline(task: str) -> dict | None: path = ARTIFACTS_DIR / f"baseline_{task}.json" if not path.exists(): return None with open(path) as f: return json.load(f) def load_training_results() -> list[dict]: results = [] grpo_dir = ARTIFACTS_DIR / "grpo" if not grpo_dir.exists(): return results for subdir in grpo_dir.iterdir(): if subdir.is_dir(): result_file = subdir / "results.json" if result_file.exists(): with open(result_file) as f: data = json.load(f) if data.get("status") != "not_run": results.append(data) return results def plot_reward_curve(baselines: dict[str, dict], training: list[dict]) -> None: fig, ax = plt.subplots(figsize=(10, 6)) if training: episodes = list(range(1, len(training) + 1)) rewards = [r.get("mean_reward", 0) for r in training] ax.plot(episodes, rewards, "o-", color="green", linewidth=2, label="Training Reward") ax.fill_between( episodes, [r - 0.1 for r in rewards], [r + 0.1 for r in rewards], alpha=0.2, color="green", ) else: ax.text( 0.5, 0.5, "No training data available\nRun training to generate reward curves", transform=ax.transAxes, ha="center", va="center", fontsize=14, color="gray", ) ax.set_xlabel("Training Episode / Checkpoint") ax.set_ylabel("Mean Eval Reward") ax.set_title("Training Reward Curve") ax.grid(True, alpha=0.3) if training: ax.legend() fig.savefig(PLOTS_DIR / "reward_curve.png", dpi=300, bbox_inches="tight") plt.close(fig) def plot_grader_score_curve(baselines: dict[str, dict], training: list[dict]) -> None: fig, ax = plt.subplots(figsize=(10, 6)) if training: episodes = list(range(1, len(training) + 1)) scores = [r.get("mean_grader_score", 0) for r in training] ax.plot(episodes, scores, "o-", color="blue", linewidth=2, label="Grader Score") ax.fill_between( episodes, [max(0, s - 0.05) for s in scores], [min(1, s + 0.05) for s in scores], alpha=0.2, color="blue", ) else: ax.text( 0.5, 0.5, "No training data available\nRun training to generate grader score curves", transform=ax.transAxes, ha="center", va="center", fontsize=14, color="gray", ) ax.set_xlabel("Training Episode / Checkpoint") ax.set_ylabel("Mean Grader Score") ax.set_title("Training Grader Score Curve") ax.set_ylim(0, 1) ax.grid(True, alpha=0.3) if training: ax.legend() fig.savefig(PLOTS_DIR / "grader_curve.png", dpi=300, bbox_inches="tight") plt.close(fig) def plot_before_after(baselines: dict[str, dict]) -> None: tasks = ["easy", "medium", "hard"] baseline_scores = [] trained_scores = [] labels = [] for task in tasks: if task in baselines: baseline_scores.append(baselines[task]["summary"]["mean_grader_score"]) trained_scores.append(0.0) # placeholder labels.append(task.capitalize()) if not labels: fig, ax = plt.subplots(figsize=(8, 6)) ax.text( 0.5, 0.5, "No baseline data available", transform=ax.transAxes, ha="center", va="center", fontsize=14, color="gray", ) fig.savefig(PLOTS_DIR / "before_after.png", dpi=300, bbox_inches="tight") plt.close(fig) return x = np.arange(len(labels)) width = 0.35 fig, ax = plt.subplots(figsize=(8, 6)) bars1 = ax.bar(x - width / 2, baseline_scores, width, label="Baseline", color="#e74c3c") bars2 = ax.bar(x + width / 2, trained_scores, width, label="Trained (TBD)", color="#2ecc71") ax.set_ylabel("Mean Grader Score") ax.set_title("Before vs After: Baseline vs Trained Agent") ax.set_xticks(x) ax.set_xticklabels(labels) ax.set_ylim(0, 1) ax.legend() ax.grid(True, alpha=0.3, axis="y") # Add value labels on bars for bar in bars1: height = bar.get_height() ax.annotate( f"{height:.3f}", xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha="center", va="bottom", fontsize=9, ) fig.savefig(PLOTS_DIR / "before_after.png", dpi=300, bbox_inches="tight") plt.close(fig) def plot_per_task_comparison(baselines: dict[str, dict]) -> None: tasks = ["easy", "medium", "hard"] fig, axes = plt.subplots(1, 3, figsize=(15, 5)) for idx, task in enumerate(tasks): ax = axes[idx] if task in baselines: episodes = baselines[task]["episodes"] scores = [e["grader_score"] for e in episodes] rewards = [e["total_reward"] for e in episodes] ax.scatter(scores, rewards, alpha=0.6, s=50) ax.set_xlabel("Grader Score") ax.set_ylabel("Total Reward") ax.set_title(f"{task.capitalize()} Task") ax.grid(True, alpha=0.3) else: ax.text( 0.5, 0.5, "No data", transform=ax.transAxes, ha="center", va="center", fontsize=12, color="gray", ) ax.set_title(f"{task.capitalize()} Task") fig.suptitle("Per-Task Baseline Distribution: Grader Score vs Total Reward") fig.tight_layout() fig.savefig(PLOTS_DIR / "per_task_comparison.png", dpi=300, bbox_inches="tight") plt.close(fig) def main(): print("Loading baseline data...", file=sys.stderr) baselines = {} for task in ["easy", "medium", "hard"]: data = load_baseline(task) if data: baselines[task] = data print(f" {task}: {len(data['episodes'])} episodes", file=sys.stderr) print("Loading training data...", file=sys.stderr) training = load_training_results() print(f" Found {len(training)} training result files", file=sys.stderr) print("Generating plots...", file=sys.stderr) plot_reward_curve(baselines, training) plot_grader_score_curve(baselines, training) plot_before_after(baselines) plot_per_task_comparison(baselines) for png in sorted(PLOTS_DIR.glob("*.png")): print(f" {png.name}: {png.stat().st_size / 1024:.1f} KB", file=sys.stderr) print("Done.", file=sys.stderr) if __name__ == "__main__": main()