#!/usr/bin/env python3
"""Generate training-evidence PNG charts from committed docs (no re-training needed).

The numbers plotted here are hard-coded below; they were transcribed from
(the documents themselves are not opened or parsed at run time):
  docs/MI300X_EVIDENCE.md — SFT loss/token-accuracy log lines
  docs/TRAINING_STORY.md — GRPO per-step mean reward, benchmark table

Outputs:
  assets/training/sft_loss.png
  assets/training/grpo_reward.png
  assets/training/benchmark_resolution.png
  assets/training/benchmark_per_tier.png

Usage:
  pip install matplotlib  # only dependency
  python scripts/generate_training_plots.py
"""

from itertools import accumulate
from pathlib import Path
from typing import List, Sequence

import matplotlib

# The backend must be selected before pyplot is imported; "Agg" renders to
# files only, so the script works on machines without a display.
matplotlib.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402
import matplotlib.ticker as mtick  # noqa: E402

ROOT = Path(__file__).resolve().parent.parent
OUT = ROOT / "assets" / "training"
OUT.mkdir(parents=True, exist_ok=True)

# ── Dark theme matching AtlasOps UI ──────────────────────────────────────────
BG = "#0d1117"
FG = "#c9d1d9"
ACCENT = "#58a6ff"
GREEN = "#57F287"
YELLOW = "#FEE75C"
RED = "#ED4245"
GRID = "#21262d"

plt.rcParams.update({
    "figure.facecolor": BG,
    "axes.facecolor": BG,
    "axes.edgecolor": GRID,
    "axes.labelcolor": FG,
    "text.color": FG,
    "xtick.color": FG,
    "ytick.color": FG,
    "grid.color": GRID,
    "grid.alpha": 0.5,
    "font.size": 11,
    "font.family": "sans-serif",
    "savefig.facecolor": BG,
    "savefig.edgecolor": BG,
})


# ── Shared helpers ───────────────────────────────────────────────────────────
def _save_figure(fig, filename: str) -> None:
    """Lay out *fig*, write it to ``OUT / filename`` at 150 dpi, and close it.

    The figure is closed even when layout or saving raises, so a failed
    write does not leave the figure registered with pyplot.
    """
    target = OUT / filename
    try:
        fig.tight_layout()
        fig.savefig(target, dpi=150)
    finally:
        plt.close(fig)
    print(f" wrote {target}")


def _moving_average(values: Sequence[float], window: int) -> List[float]:
    """Return the trailing moving average of *values*.

    Entry ``i`` averages the last ``window`` values ending at ``i``; the
    first ``window - 1`` entries average over the shorter prefix available.
    """
    averages = []
    for end in range(1, len(values) + 1):
        chunk = values[max(0, end - window):end]
        averages.append(sum(chunk) / len(chunk))
    return averages


def _annotate_bars(ax, bars, values, *, offset: float, fontsize: int) -> None:
    """Write ``"<value>%"`` centred above each bar, *offset* units above its top."""
    for bar, value in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + offset,
            f"{value}%",
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=fontsize,
        )


# ── SFT loss + token accuracy ───────────────────────────────────────────────
SFT_DATA = [
    # (epoch, loss, token_accuracy)
    (0.04, 1.2651, 0.7196),
    (0.08, 0.4114, 0.8998),
    (0.12, 0.1950, 0.9483),
    (0.20, 0.1156, 0.9660),
    (0.32, 0.0845, 0.9742),
    (0.55, 0.0557, 0.9821),
    (0.75, 0.0370, 0.9873),
    (0.99, 0.0272, 0.9915),
]


def plot_sft() -> None:
    """Plot SFT loss (left axis) and token accuracy (right axis) against epoch.

    Writes ``sft_loss.png`` into ``OUT``.
    """
    epochs, losses, accs = (list(column) for column in zip(*SFT_DATA))

    fig, ax1 = plt.subplots(figsize=(8, 4.5))
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Loss", color=RED)
    loss_line, = ax1.plot(epochs, losses, color=RED, marker="o",
                          markersize=5, linewidth=2, label="Loss")
    ax1.tick_params(axis="y", labelcolor=RED)
    ax1.set_ylim(bottom=0)

    # Second y-axis sharing the epoch axis, shown as a percentage.
    ax2 = ax1.twinx()
    ax2.set_ylabel("Token Accuracy", color=GREEN)
    acc_line, = ax2.plot(epochs, accs, color=GREEN, marker="s",
                         markersize=5, linewidth=2, label="Token Accuracy")
    ax2.tick_params(axis="y", labelcolor=GREEN)
    ax2.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
    ax2.set_ylim(0.65, 1.0)

    ax1.set_title("SFT on AMD MI300X · 2,028 trajectories · 254 steps · 14 min",
                  fontsize=12, pad=12)
    # The two lines live on different axes, so pass both handles explicitly
    # to get a single combined legend.
    ax1.legend(handles=[loss_line, acc_line], loc="center right", framealpha=0.3)
    ax1.grid(True, alpha=0.3)

    _save_figure(fig, "sft_loss.png")


# ── GRPO mean reward per step ────────────────────────────────────────────────
GRPO_REWARDS = [
    0.355, 0.243, 0.073, 0.218, 0.191, 0.147, 0.241, 0.251, 0.070, 0.144,
    0.070, 0.070, 0.048, 0.236, 0.188, 0.011, 0.247, 0.159, 0.158, 0.332,
    0.274, 0.297, 0.021, 0.376, 0.304, 0.352, 0.240, 0.140, 0.222, 0.149,
    0.421, 0.214, 0.140, 0.101, 0.201, 0.341, 0.232, 0.153, 0.219, 0.154,
    0.070, 0.402, 0.000, 0.276, 0.070, 0.261, 0.210, 0.116, 0.214, 0.070,
    0.143, 0.210, 0.319, 0.254, 0.230, 0.205, 0.251, 0.286, 0.182, 0.364,
]


def plot_grpo() -> None:
    """Plot per-step GRPO mean reward with moving average, running best and mean.

    Writes ``grpo_reward.png`` into ``OUT``.
    """
    steps = list(range(1, len(GRPO_REWARDS) + 1))

    # Running best-so-far. Seeded from the data itself rather than 0.0 so the
    # curve stays correct if a reward is ever negative.
    best = list(accumulate(GRPO_REWARDS, max))

    # 5-step moving average
    window = 5
    ma = _moving_average(GRPO_REWARDS, window)

    overall_mean = sum(GRPO_REWARDS) / len(GRPO_REWARDS)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    ax.bar(steps, GRPO_REWARDS, color=ACCENT, alpha=0.4, width=0.8,
           label="Per-step mean reward")
    ax.plot(steps, ma, color=YELLOW, linewidth=2,
            label=f"{window}-step moving avg")
    ax.plot(steps, best, color=GREEN, linewidth=1.5, linestyle="--",
            alpha=0.7, label="Best so far")
    ax.axhline(y=overall_mean, color=FG, linewidth=1, linestyle=":",
               alpha=0.5, label=f"Overall mean ({overall_mean:.3f})")

    ax.set_xlabel("GRPO Step")
    ax.set_ylabel("Mean Reward")
    ax.set_title("Online GRPO on AMD MI300X · 60 steps · 4 rollouts · 236 episodes · 9h 34m",
                 fontsize=12, pad=12)
    ax.legend(loc="upper left", framealpha=0.3, fontsize=9)
    # Start slightly below zero so zero-reward steps remain visible.
    ax.set_ylim(bottom=-0.02)
    ax.grid(True, alpha=0.3)

    _save_figure(fig, "grpo_reward.png")


# ── Benchmark resolution comparison ─────────────────────────────────────────
def plot_benchmark_resolution() -> None:
    """Plot resolution rate (bars) and average judge reward (line) per model.

    Writes ``benchmark_resolution.png`` into ``OUT``.
    """
    models = ["Zero-shot\nBaseline", "AtlasOps\nSFT", "AtlasOps\nGRPO"]
    resolution = [54, 68, 82]
    judge_reward = [0.481, 0.601, 0.729]
    colors = [FG, YELLOW, GREEN]

    fig, ax1 = plt.subplots(figsize=(7, 4.5))
    bars = ax1.bar(models, resolution, color=colors, alpha=0.85, width=0.5,
                   edgecolor=GRID)
    _annotate_bars(ax1, bars, resolution, offset=1.5, fontsize=13)

    ax1.set_ylabel("Resolution Rate (%)")
    ax1.set_ylim(0, 100)
    ax1.set_title("Incident Resolution Rate · 28 chaos scenarios",
                  fontsize=12, pad=12)
    ax1.grid(True, axis="y", alpha=0.3)

    # Judge reward is on a different scale, so it gets its own right-hand axis.
    ax2 = ax1.twinx()
    ax2.plot(models, judge_reward, color=RED, marker="D", markersize=8,
             linewidth=2, label="Judge reward")
    ax2.set_ylabel("Avg Judge Reward", color=RED)
    ax2.tick_params(axis="y", labelcolor=RED)
    ax2.set_ylim(0.3, 0.85)
    ax2.legend(loc="upper left", framealpha=0.3, fontsize=9)

    _save_figure(fig, "benchmark_resolution.png")


# ── Benchmark per-tier ───────────────────────────────────────────────────────
def plot_benchmark_per_tier() -> None:
    """Plot baseline vs GRPO resolution rate as grouped bars per scenario tier.

    Writes ``benchmark_per_tier.png`` into ``OUT``.
    """
    tiers = ["Single Fault", "Cascade", "Multi-Fault", "Named Replays"]
    baseline = [63, 40, 40, 30]
    grpo = [88, 78, 76, 72]

    positions = list(range(len(tiers)))
    w = 0.35  # bar width; each pair is centred on its tier's tick

    fig, ax = plt.subplots(figsize=(8, 4.5))
    baseline_bars = ax.bar([p - w / 2 for p in positions], baseline, w,
                           label="Zero-shot Baseline", color=FG, alpha=0.7,
                           edgecolor=GRID)
    grpo_bars = ax.bar([p + w / 2 for p in positions], grpo, w,
                       label="AtlasOps GRPO", color=GREEN, alpha=0.85,
                       edgecolor=GRID)

    for bars, values in ((baseline_bars, baseline), (grpo_bars, grpo)):
        _annotate_bars(ax, bars, values, offset=1, fontsize=10)

    ax.set_ylabel("Resolution Rate (%)")
    ax.set_xticks(positions)
    ax.set_xticklabels(tiers)
    ax.set_ylim(0, 100)
    ax.set_title("Resolution by Scenario Tier · Baseline vs GRPO",
                 fontsize=12, pad=12)
    ax.legend(framealpha=0.3)
    ax.grid(True, axis="y", alpha=0.3)

    _save_figure(fig, "benchmark_per_tier.png")


def main() -> None:
    """Render every chart into ``OUT``."""
    print("Generating training evidence plots...")
    plot_sft()
    plot_grpo()
    plot_benchmark_resolution()
    plot_benchmark_per_tier()
    print("Done.")


if __name__ == "__main__":
    main()