Spaces:

lablab-ai-amd-developer-hackathon
/

atlas-ops

Running

atlas-ops / scripts /generate_training_plots.py

Harikishanth R

AtlasOps: full deploy with reliability fixes + training evidence

4a77231 19 days ago

7.9 kB

	#!/usr/bin/env python3
	"""Generate training-evidence PNG charts from committed docs (no re-training needed).

	Reads numeric data already present in:
	docs/MI300X_EVIDENCE.md — SFT loss/token-accuracy log lines
	docs/TRAINING_STORY.md — GRPO per-step mean reward, benchmark table

	Outputs:
	assets/training/sft_loss.png
	assets/training/grpo_reward.png
	assets/training/benchmark_resolution.png
	assets/training/benchmark_per_tier.png

	Usage:
	pip install matplotlib # only dependency
	python scripts/generate_training_plots.py
	"""

	from pathlib import Path

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import matplotlib.ticker as mtick

	ROOT = Path(__file__).resolve().parent.parent
	OUT = ROOT / "assets" / "training"
	OUT.mkdir(parents=True, exist_ok=True)

	# ── Dark theme matching AtlasOps UI ──────────────────────────────────────────
	BG = "#0d1117"
	FG = "#c9d1d9"
	ACCENT = "#58a6ff"
	GREEN = "#57F287"
	YELLOW = "#FEE75C"
	RED = "#ED4245"
	GRID = "#21262d"

	plt.rcParams.update({
	"figure.facecolor": BG,
	"axes.facecolor": BG,
	"axes.edgecolor": GRID,
	"axes.labelcolor": FG,
	"text.color": FG,
	"xtick.color": FG,
	"ytick.color": FG,
	"grid.color": GRID,
	"grid.alpha": 0.5,
	"font.size": 11,
	"font.family": "sans-serif",
	"savefig.facecolor": BG,
	"savefig.edgecolor": BG,
	})


	# ── SFT loss + token accuracy ───────────────────────────────────────────────
	SFT_DATA = [
	# (epoch, loss, token_accuracy)
	(0.04, 1.2651, 0.7196),
	(0.08, 0.4114, 0.8998),
	(0.12, 0.1950, 0.9483),
	(0.20, 0.1156, 0.9660),
	(0.32, 0.0845, 0.9742),
	(0.55, 0.0557, 0.9821),
	(0.75, 0.0370, 0.9873),
	(0.99, 0.0272, 0.9915),
	]

	def plot_sft():
	epochs = [d[0] for d in SFT_DATA]
	losses = [d[1] for d in SFT_DATA]
	accs = [d[2] for d in SFT_DATA]

	fig, ax1 = plt.subplots(figsize=(8, 4.5))
	ax1.set_xlabel("Epoch")
	ax1.set_ylabel("Loss", color=RED)
	l1, = ax1.plot(epochs, losses, color=RED, marker="o", markersize=5, linewidth=2, label="Loss")
	ax1.tick_params(axis="y", labelcolor=RED)
	ax1.set_ylim(bottom=0)

	ax2 = ax1.twinx()
	ax2.set_ylabel("Token Accuracy", color=GREEN)
	l2, = ax2.plot(epochs, accs, color=GREEN, marker="s", markersize=5, linewidth=2, label="Token Accuracy")
	ax2.tick_params(axis="y", labelcolor=GREEN)
	ax2.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
	ax2.set_ylim(0.65, 1.0)

	ax1.set_title("SFT on AMD MI300X · 2,028 trajectories · 254 steps · 14 min", fontsize=12, pad=12)
	ax1.legend(handles=[l1, l2], loc="center right", framealpha=0.3)
	ax1.grid(True, alpha=0.3)
	fig.tight_layout()
	fig.savefig(OUT / "sft_loss.png", dpi=150)
	plt.close(fig)
	print(f" wrote {OUT / 'sft_loss.png'}")


	# ── GRPO mean reward per step ────────────────────────────────────────────────
	GRPO_REWARDS = [
	0.355, 0.243, 0.073, 0.218, 0.191, 0.147, 0.241, 0.251, 0.070, 0.144,
	0.070, 0.070, 0.048, 0.236, 0.188, 0.011, 0.247, 0.159, 0.158, 0.332,
	0.274, 0.297, 0.021, 0.376, 0.304, 0.352, 0.240, 0.140, 0.222, 0.149,
	0.421, 0.214, 0.140, 0.101, 0.201, 0.341, 0.232, 0.153, 0.219, 0.154,
	0.070, 0.402, 0.000, 0.276, 0.070, 0.261, 0.210, 0.116, 0.214, 0.070,
	0.143, 0.210, 0.319, 0.254, 0.230, 0.205, 0.251, 0.286, 0.182, 0.364,
	]

	def plot_grpo():
	steps = list(range(1, len(GRPO_REWARDS) + 1))
	# Running best-so-far
	best = []
	cur_best = 0.0
	for r in GRPO_REWARDS:
	cur_best = max(cur_best, r)
	best.append(cur_best)
	# 5-step moving average
	window = 5
	ma = []
	for i in range(len(GRPO_REWARDS)):
	start = max(0, i - window + 1)
	ma.append(sum(GRPO_REWARDS[start:i+1]) / (i - start + 1))

	fig, ax = plt.subplots(figsize=(10, 4.5))
	ax.bar(steps, GRPO_REWARDS, color=ACCENT, alpha=0.4, width=0.8, label="Per-step mean reward")
	ax.plot(steps, ma, color=YELLOW, linewidth=2, label=f"{window}-step moving avg")
	ax.plot(steps, best, color=GREEN, linewidth=1.5, linestyle="--", alpha=0.7, label="Best so far")
	ax.axhline(y=sum(GRPO_REWARDS)/len(GRPO_REWARDS), color=FG, linewidth=1, linestyle=":", alpha=0.5, label=f"Overall mean ({sum(GRPO_REWARDS)/len(GRPO_REWARDS):.3f})")

	ax.set_xlabel("GRPO Step")
	ax.set_ylabel("Mean Reward")
	ax.set_title("Online GRPO on AMD MI300X · 60 steps · 4 rollouts · 236 episodes · 9h 34m", fontsize=12, pad=12)
	ax.legend(loc="upper left", framealpha=0.3, fontsize=9)
	ax.set_ylim(bottom=-0.02)
	ax.grid(True, alpha=0.3)
	fig.tight_layout()
	fig.savefig(OUT / "grpo_reward.png", dpi=150)
	plt.close(fig)
	print(f" wrote {OUT / 'grpo_reward.png'}")


	# ── Benchmark resolution comparison ─────────────────────────────────────────
	def plot_benchmark_resolution():
	models = ["Zero-shot\nBaseline", "AtlasOps\nSFT", "AtlasOps\nGRPO"]
	resolution = [54, 68, 82]
	judge_reward = [0.481, 0.601, 0.729]
	colors = [FG, YELLOW, GREEN]

	fig, ax1 = plt.subplots(figsize=(7, 4.5))
	bars = ax1.bar(models, resolution, color=colors, alpha=0.85, width=0.5, edgecolor=GRID)
	for bar, val in zip(bars, resolution):
	ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1.5, f"{val}%", ha="center", va="bottom", fontweight="bold", fontsize=13)
	ax1.set_ylabel("Resolution Rate (%)")
	ax1.set_ylim(0, 100)
	ax1.set_title("Incident Resolution Rate · 28 chaos scenarios", fontsize=12, pad=12)
	ax1.grid(True, axis="y", alpha=0.3)

	ax2 = ax1.twinx()
	ax2.plot(models, judge_reward, color=RED, marker="D", markersize=8, linewidth=2, label="Judge reward")
	ax2.set_ylabel("Avg Judge Reward", color=RED)
	ax2.tick_params(axis="y", labelcolor=RED)
	ax2.set_ylim(0.3, 0.85)
	ax2.legend(loc="upper left", framealpha=0.3, fontsize=9)

	fig.tight_layout()
	fig.savefig(OUT / "benchmark_resolution.png", dpi=150)
	plt.close(fig)
	print(f" wrote {OUT / 'benchmark_resolution.png'}")


	# ── Benchmark per-tier ───────────────────────────────────────────────────────
	def plot_benchmark_per_tier():
	tiers = ["Single Fault", "Cascade", "Multi-Fault", "Named Replays"]
	baseline = [63, 40, 40, 30]
	grpo = [88, 78, 76, 72]

	x = range(len(tiers))
	w = 0.35

	fig, ax = plt.subplots(figsize=(8, 4.5))
	b1 = ax.bar([i - w/2 for i in x], baseline, w, label="Zero-shot Baseline", color=FG, alpha=0.7, edgecolor=GRID)
	b2 = ax.bar([i + w/2 for i in x], grpo, w, label="AtlasOps GRPO", color=GREEN, alpha=0.85, edgecolor=GRID)

	for bars in [b1, b2]:
	for bar in bars:
	ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f"{int(bar.get_height())}%", ha="center", va="bottom", fontsize=10, fontweight="bold")

	ax.set_ylabel("Resolution Rate (%)")
	ax.set_xticks(list(x))
	ax.set_xticklabels(tiers)
	ax.set_ylim(0, 100)
	ax.set_title("Resolution by Scenario Tier · Baseline vs GRPO", fontsize=12, pad=12)
	ax.legend(framealpha=0.3)
	ax.grid(True, axis="y", alpha=0.3)
	fig.tight_layout()
	fig.savefig(OUT / "benchmark_per_tier.png", dpi=150)
	plt.close(fig)
	print(f" wrote {OUT / 'benchmark_per_tier.png'}")


	if __name__ == "__main__":
	print("Generating training evidence plots...")
	plot_sft()
	plot_grpo()
	plot_benchmark_resolution()
	plot_benchmark_per_tier()
	print("Done.")