atlas-ops / scripts /generate_training_plots.py
Harikishanth R
AtlasOps: full deploy with reliability fixes + training evidence
4a77231
#!/usr/bin/env python3
"""Generate training-evidence PNG charts from committed docs (no re-training needed).
Plots numeric data hard-coded below (no files are read at runtime); the values are already present in:
docs/MI300X_EVIDENCE.md — SFT loss/token-accuracy log lines
docs/TRAINING_STORY.md — GRPO per-step mean reward, benchmark table
Outputs:
assets/training/sft_loss.png
assets/training/grpo_reward.png
assets/training/benchmark_resolution.png
assets/training/benchmark_per_tier.png
Usage:
pip install matplotlib # only dependency
python scripts/generate_training_plots.py
"""
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Destination directory for every PNG this script writes.
OUT = ROOT.joinpath("assets", "training")
# Created at import time (idempotent) so the plot_* functions can save directly.
OUT.mkdir(parents=True, exist_ok=True)
# ── Dark theme matching AtlasOps UI ──────────────────────────────────────────
# Palette used by the charts in this script.
BG = "#0d1117"      # figure / axes / saved-image background
FG = "#c9d1d9"      # text, axis labels, ticks
ACCENT = "#58a6ff"  # per-step reward bars
GREEN = "#57F287"
YELLOW = "#FEE75C"
RED = "#ED4245"
GRID = "#21262d"    # axes edges and grid lines

# Install the palette as matplotlib defaults so every figure created below
# picks it up without per-figure styling.
plt.rcParams.update(
    {
        # Every surface shares the background colour.
        **dict.fromkeys(
            (
                "figure.facecolor",
                "axes.facecolor",
                "savefig.facecolor",
                "savefig.edgecolor",
            ),
            BG,
        ),
        # Labels, free text and ticks share the foreground colour.
        **dict.fromkeys(
            ("axes.labelcolor", "text.color", "xtick.color", "ytick.color"),
            FG,
        ),
        # Axes edges and grid lines share the muted grid colour.
        **dict.fromkeys(("axes.edgecolor", "grid.color"), GRID),
        "grid.alpha": 0.5,
        "font.size": 11,
        "font.family": "sans-serif",
    }
)
# ── SFT loss + token accuracy ───────────────────────────────────────────────
# SFT curve samples consumed by plot_sft(), one row per plotted point.
# token_accuracy is stored as a fraction (0-1); plot_sft() formats it as a
# percentage on the right-hand axis.
SFT_DATA = [
    # (epoch, loss, token_accuracy)
    (0.04, 1.2651, 0.7196),
    (0.08, 0.4114, 0.8998),
    (0.12, 0.1950, 0.9483),
    (0.20, 0.1156, 0.9660),
    (0.32, 0.0845, 0.9742),
    (0.55, 0.0557, 0.9821),
    (0.75, 0.0370, 0.9873),
    (0.99, 0.0272, 0.9915),
]
def plot_sft():
    """Plot SFT loss and token accuracy against epoch and save ``sft_loss.png``.

    Loss is drawn on the left y-axis and token accuracy on a twinned right
    y-axis formatted as a percentage. The figure is saved under ``OUT`` at
    150 dpi, closed, and the written path is printed.
    """
    # Split the (epoch, loss, token_accuracy) rows into one list per column.
    epochs, losses, accs = ([row[col] for row in SFT_DATA] for col in range(3))

    fig, loss_ax = plt.subplots(figsize=(8, 4.5))

    # Left axis: loss.
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss", color=RED)
    (loss_line,) = loss_ax.plot(
        epochs, losses, color=RED, marker="o", markersize=5, linewidth=2, label="Loss"
    )
    loss_ax.tick_params(axis="y", labelcolor=RED)
    # Applied after plotting so only the lower bound is pinned.
    loss_ax.set_ylim(bottom=0)

    # Right axis: token accuracy, sharing the epoch axis.
    acc_ax = loss_ax.twinx()
    acc_ax.set_ylabel("Token Accuracy", color=GREEN)
    (acc_line,) = acc_ax.plot(
        epochs,
        accs,
        color=GREEN,
        marker="s",
        markersize=5,
        linewidth=2,
        label="Token Accuracy",
    )
    acc_ax.tick_params(axis="y", labelcolor=GREEN)
    acc_ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
    acc_ax.set_ylim(0.65, 1.0)

    loss_ax.set_title(
        "SFT on AMD MI300X · 2,028 trajectories · 254 steps · 14 min",
        fontsize=12,
        pad=12,
    )
    # One combined legend covering the lines from both axes.
    loss_ax.legend(handles=[loss_line, acc_line], loc="center right", framealpha=0.3)
    loss_ax.grid(True, alpha=0.3)

    fig.tight_layout()
    target = OUT / "sft_loss.png"
    fig.savefig(target, dpi=150)
    plt.close(fig)
    print(f" wrote {target}")
# ── GRPO mean reward per step ────────────────────────────────────────────────
# Per-step mean reward values consumed by plot_grpo(), in step order:
# 60 values, where index i is plotted as step i + 1.
GRPO_REWARDS = [
    0.355, 0.243, 0.073, 0.218, 0.191, 0.147, 0.241, 0.251, 0.070, 0.144,
    0.070, 0.070, 0.048, 0.236, 0.188, 0.011, 0.247, 0.159, 0.158, 0.332,
    0.274, 0.297, 0.021, 0.376, 0.304, 0.352, 0.240, 0.140, 0.222, 0.149,
    0.421, 0.214, 0.140, 0.101, 0.201, 0.341, 0.232, 0.153, 0.219, 0.154,
    0.070, 0.402, 0.000, 0.276, 0.070, 0.261, 0.210, 0.116, 0.214, 0.070,
    0.143, 0.210, 0.319, 0.254, 0.230, 0.205, 0.251, 0.286, 0.182, 0.364,
]
def plot_grpo():
    """Plot per-step GRPO mean reward with trend overlays; save ``grpo_reward.png``.

    Draws, over step numbers starting at 1:
      * a bar per entry of ``GRPO_REWARDS``,
      * a trailing moving average (shorter window for the first few steps),
      * the running best-so-far,
      * a horizontal line at the overall mean.

    The figure is saved under ``OUT`` at 150 dpi, closed, and the written
    path is printed.

    Raises:
        ZeroDivisionError: if ``GRPO_REWARDS`` is empty (the overall mean is
            undefined).
    """
    # Function-scope import keeps this block self-contained.
    from itertools import accumulate

    rewards = GRPO_REWARDS
    steps = list(range(1, len(rewards) + 1))

    # Running best-so-far. Seeding from the first reward (not a fixed 0.0)
    # keeps the curve correct even if the rewards so far are all negative.
    best = list(accumulate(rewards, max))

    # Trailing moving average; the window is truncated at the start of the run.
    window = 5
    ma = []
    for end in range(1, len(rewards) + 1):
        recent = rewards[max(0, end - window):end]
        ma.append(sum(recent) / len(recent))

    # Computed once and reused for both the line position and its legend label.
    overall_mean = sum(rewards) / len(rewards)

    fig, ax = plt.subplots(figsize=(10, 4.5))
    ax.bar(steps, rewards, color=ACCENT, alpha=0.4, width=0.8, label="Per-step mean reward")
    ax.plot(steps, ma, color=YELLOW, linewidth=2, label=f"{window}-step moving avg")
    ax.plot(steps, best, color=GREEN, linewidth=1.5, linestyle="--", alpha=0.7, label="Best so far")
    ax.axhline(
        y=overall_mean,
        color=FG,
        linewidth=1,
        linestyle=":",
        alpha=0.5,
        label=f"Overall mean ({overall_mean:.3f})",
    )

    ax.set_xlabel("GRPO Step")
    ax.set_ylabel("Mean Reward")
    ax.set_title(
        "Online GRPO on AMD MI300X · 60 steps · 4 rollouts · 236 episodes · 9h 34m",
        fontsize=12,
        pad=12,
    )
    ax.legend(loc="upper left", framealpha=0.3, fontsize=9)
    # Small negative floor so zero-height bars sit visibly above the axis edge.
    ax.set_ylim(bottom=-0.02)
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(OUT / "grpo_reward.png", dpi=150)
    plt.close(fig)
    print(f" wrote {OUT / 'grpo_reward.png'}")
# ── Benchmark resolution comparison ─────────────────────────────────────────
def plot_benchmark_resolution():
    """Plot resolution rate and judge reward per model; save ``benchmark_resolution.png``.

    Resolution rate is drawn as labelled bars on the left y-axis and the
    average judge reward as a line on a twinned right y-axis. The figure is
    saved under ``OUT`` at 150 dpi, closed, and the written path is printed.
    """
    models = ["Zero-shot\nBaseline", "AtlasOps\nSFT", "AtlasOps\nGRPO"]
    resolution = [54, 68, 82]
    judge_reward = [0.481, 0.601, 0.729]
    colors = [FG, YELLOW, GREEN]

    fig, rate_ax = plt.subplots(figsize=(7, 4.5))

    # Bars: resolution rate, each annotated with its percentage just above.
    bars = rate_ax.bar(
        models, resolution, color=colors, alpha=0.85, width=0.5, edgecolor=GRID
    )
    for bar, val in zip(bars, resolution):
        x_mid = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 1.5
        rate_ax.text(
            x_mid,
            label_y,
            f"{val}%",
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=13,
        )
    rate_ax.set_ylabel("Resolution Rate (%)")
    rate_ax.set_ylim(0, 100)
    rate_ax.set_title(
        "Incident Resolution Rate · 28 chaos scenarios", fontsize=12, pad=12
    )
    rate_ax.grid(True, axis="y", alpha=0.3)

    # Line: average judge reward on its own right-hand scale.
    reward_ax = rate_ax.twinx()
    reward_ax.plot(
        models,
        judge_reward,
        color=RED,
        marker="D",
        markersize=8,
        linewidth=2,
        label="Judge reward",
    )
    reward_ax.set_ylabel("Avg Judge Reward", color=RED)
    reward_ax.tick_params(axis="y", labelcolor=RED)
    reward_ax.set_ylim(0.3, 0.85)
    reward_ax.legend(loc="upper left", framealpha=0.3, fontsize=9)

    fig.tight_layout()
    target = OUT / "benchmark_resolution.png"
    fig.savefig(target, dpi=150)
    plt.close(fig)
    print(f" wrote {target}")
# ── Benchmark per-tier ───────────────────────────────────────────────────────
def plot_benchmark_per_tier():
    """Plot baseline vs GRPO resolution rate per tier; save ``benchmark_per_tier.png``.

    Each tier gets two side-by-side bars (baseline on the left, GRPO on the
    right), each annotated with its percentage. The figure is saved under
    ``OUT`` at 150 dpi, closed, and the written path is printed.
    """
    tiers = ["Single Fault", "Cascade", "Multi-Fault", "Named Replays"]
    baseline = [63, 40, 40, 30]
    grpo = [88, 78, 76, 72]

    bar_width = 0.35
    positions = list(range(len(tiers)))
    half = bar_width / 2

    fig, ax = plt.subplots(figsize=(8, 4.5))

    # Offset each series by half a bar width so the pair straddles its tick.
    baseline_bars = ax.bar(
        [pos - half for pos in positions],
        baseline,
        bar_width,
        label="Zero-shot Baseline",
        color=FG,
        alpha=0.7,
        edgecolor=GRID,
    )
    grpo_bars = ax.bar(
        [pos + half for pos in positions],
        grpo,
        bar_width,
        label="AtlasOps GRPO",
        color=GREEN,
        alpha=0.85,
        edgecolor=GRID,
    )

    # Percentage label just above every bar, baseline series first.
    for bar in [*baseline_bars, *grpo_bars]:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            height + 1,
            f"{int(height)}%",
            ha="center",
            va="bottom",
            fontsize=10,
            fontweight="bold",
        )

    ax.set_ylabel("Resolution Rate (%)")
    ax.set_xticks(positions)
    ax.set_xticklabels(tiers)
    ax.set_ylim(0, 100)
    ax.set_title(
        "Resolution by Scenario Tier · Baseline vs GRPO", fontsize=12, pad=12
    )
    ax.legend(framealpha=0.3)
    ax.grid(True, axis="y", alpha=0.3)

    fig.tight_layout()
    target = OUT / "benchmark_per_tier.png"
    fig.savefig(target, dpi=150)
    plt.close(fig)
    print(f" wrote {target}")
if __name__ == "__main__":
    # Script entry point: render every chart, in the same order as before.
    print("Generating training evidence plots...")
    for render in (
        plot_sft,
        plot_grpo,
        plot_benchmark_resolution,
        plot_benchmark_per_tier,
    ):
        render()
    print("Done.")