"""
Plot training curves from REAL per-step lines emitted by the running HF Job.
Every datapoint below appears verbatim in the live job log stream
(https://huggingface.co/jobs/testingaccc/69ecfb45d70108f37acdeb50).
Nothing is interpolated, smoothed in, or fabricated.
Once the final metrics.json uploads at end of training, prefer
scripts/plot_from_metrics.py — it reads the full per-step history
straight from the model repo.
"""
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
# Hand-transcribed training log: one (step, reward, accuracy_percent) tuple per
# logged line, in increasing step order.
#   step             - training step index (int)
#   reward           - reward logged for that step (float, can be negative)
#   accuracy_percent - accuracy logged for that step, in percent; every value
#                      here is a multiple of 12.5 (presumably k/8 rollouts —
#                      confirm against the job log)
# The table is NOT contiguous: steps 0-4 are logged individually, then every
# 10th step; step 270 is absent, and there are no entries between 450 and 1200
# or between 1430 and 1710. plot() treats steps <= 500 as "early" and
# steps >= 1700 as "late"; the 1200-1430 segment falls in neither group.
DATA = [
# --- steps 0-4 (every step) ---
(0, -2.50, 50.00), (1, -3.74, 37.50), (2, 5.43, 62.50),
(3, 1.50, 37.50), (4, -2.74, 25.00),
# --- steps 10-450 (every 10 steps; 270 missing) ---
(10, -1.20, 37.50), (20, -2.25, 25.00), (30, -1.27, 37.50),
(40, -2.50, 50.00), (50, 0.56, 50.00), (60, 5.10, 37.50),
(70, -2.94, 25.00), (80, 0.25, 62.50), (90, -7.15, 12.50),
(100, -3.45, 37.50), (110, 0.38, 37.50), (120, -4.83, 12.50),
(130, -1.62, 50.00), (140, -4.38, 37.50), (150, -3.99, 37.50),
(160, -3.34, 25.00), (170, 1.38, 50.00), (180, -8.07, 12.50),
(190, 0.50, 50.00), (200, 2.62, 62.50), (210, -2.99, 25.00),
(220, -2.00, 25.00), (230, -7.99, 12.50), (240, 1.50, 50.00),
(250, 11.63, 62.50), (260, -2.47, 25.00), (280, -1.21, 37.50),
(290, 5.50, 62.50), (300, -2.33, 25.00), (310, -4.12, 37.50),
(320, 5.31, 50.00), (330, -2.45, 50.00), (340, 0.81, 50.00),
(350, 3.23, 62.50), (360, -0.62, 62.50), (370, 2.62, 37.50),
(380, 8.62, 62.50), (390, 8.85, 62.50), (400, 3.55, 50.00),
(410, -3.25, 37.50), (420, -6.20, 25.00), (430, 1.78, 37.50),
(440, -5.33, 25.00), (450, 2.40, 62.50),
# --- steps 1200-1430 (every 10 steps) ---
(1200, 1.82, 37.50), (1210, -1.38, 37.50), (1220, -0.10, 25.00),
(1230, -1.35, 37.50), (1240, -3.06, 25.00), (1250, -3.19, 25.00),
(1260, -7.88, 12.50), (1270, 1.88, 37.50), (1280, 3.92, 50.00),
(1290, -3.24, 25.00), (1300, 6.70, 50.00), (1310, -3.17, 25.00),
(1320, 5.65, 62.50), (1330, 2.50, 62.50), (1340, -6.25, 25.00),
(1350, -6.25, 25.00), (1360, -6.25, 25.00), (1370, 2.25, 62.50),
(1380, -4.88, 25.00), (1390, -6.15, 25.00), (1400, 0.86, 25.00),
(1410, 3.62, 50.00), (1420, -1.38, 37.50), (1430, -6.17, 25.00),
# --- steps 1710-1960 (every 10 steps) ---
(1710, -4.38, 37.50), (1720, -7.75, 12.50), (1730, -0.48, 37.50),
(1740, -1.25, 37.50), (1750, -1.25, 37.50), (1760, -0.35, 37.50),
(1770, 8.77, 62.50), (1780, 2.51, 62.50), (1790, 0.06, 25.00),
(1800, 0.76, 50.00), (1810, 5.68, 62.50), (1820, 1.50, 50.00),
(1830, -7.03, 12.50), (1840, 0.62, 50.00), (1850, -7.79, 12.50),
(1860, 0.48, 50.00), (1870, -2.88, 25.00), (1880, 0.75, 50.00),
(1890, -9.93, 0.00), (1900, -5.12, 25.00), (1910, -2.38, 25.00),
(1920, -1.25, 37.50), (1930, -3.11, 25.00), (1940, 11.91, 62.50),
(1950, -9.92, 0.00), (1960, -1.52, 37.50),
]
def rolling_mean(arr, window):
    """Return the trailing moving average of *arr*.

    Element ``i`` of the result is the mean of the (at most) ``window`` values
    ending at index ``i``. Near the start, where fewer than ``window`` values
    exist, the mean is taken over the values available so far.

    Args:
        arr: Sequence of numbers (treated as one-dimensional).
        window: Maximum number of trailing samples per average; must be >= 1.

    Returns:
        A float ``numpy.ndarray`` with the same length as *arr*. When *arr*
        has fewer than ``window`` elements it is returned unsmoothed, as a
        float array.

    Raises:
        ValueError: If ``window`` is smaller than 1. Without this check a
            zero window divides 0 by 0 (NaN output) and a negative window
            fails with an unrelated IndexError.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window!r}")

    arr = np.asarray(arr, dtype=float)
    if len(arr) < window:
        return arr

    # Prefix sums with a leading 0, so sum(arr[lo:hi]) == cs[hi] - cs[lo].
    cs = np.cumsum(np.insert(arr, 0, 0))
    hi = np.arange(1, len(arr) + 1)      # exclusive end of each window
    lo = np.maximum(hi - window, 0)      # inclusive start, clamped at 0
    return (cs[hi] - cs[lo]) / (hi - lo)
def plot(out_path: str = "training_curves.png"):
    """Render the 2x2 training dashboard from ``DATA`` and save it to disk.

    Panels:
        1. Reward per logged step, with a rolling average.
        2. Accuracy per logged step, with a rolling average and the
           random-baseline line.
        3. Histogram of rewards for early (steps <= 500) versus late
           (steps >= 1700) datapoints, with each group's mean marked.
        4. A monospace text summary of the same statistics.

    Args:
        out_path: Destination image path handed to ``Figure.savefig``.

    Returns:
        None.

    Side effects:
        Switches the global matplotlib style to ``dark_background``, writes
        the image file, and prints a short summary to stdout. The figure is
        closed after saving so repeated calls do not accumulate open figures.

    NOTE: the step ranges quoted in labels and in the summary text ("0-450",
    "1710-1960"), and the phase/hardware/job lines, are fixed text that
    describes the current contents of ``DATA``; they are not derived from the
    masks and must be updated by hand when ``DATA`` changes.
    """
    window = 5         # rolling-average window, counted in logged datapoints
    chance_pct = 33.3  # accuracy level drawn and reported as "random baseline"

    steps = np.array([row[0] for row in DATA])
    rewards = np.array([row[1] for row in DATA])
    accs = np.array([row[2] for row in DATA])

    # Early/late groups are shared by the histogram, the summary panel and
    # the CLI output, so compute them (and their means) once.
    early_mask = steps <= 500
    late_mask = steps >= 1700
    early_r, late_r = rewards[early_mask], rewards[late_mask]
    early_a, late_a = accs[early_mask], accs[late_mask]
    early_r_mean, late_r_mean = early_r.mean(), late_r.mean()
    early_a_mean, late_a_mean = early_a.mean(), late_a.mean()

    plt.style.use("dark_background")
    fig, axes = plt.subplots(2, 2, figsize=(14, 9))
    fig.suptitle(
        f"Conflict Arbitration Agent - GRPO training (real per-step data, n={len(steps)})",
        fontsize=14, fontweight="bold", color="#e6e6f0",
    )

    # 1. Reward over time
    ax = axes[0, 0]
    ax.scatter(steps, rewards, alpha=0.5, c="#8be9d6", s=30, label="per-step reward")
    if len(rewards) >= window:
        ax.plot(steps, rolling_mean(rewards, window), color="#ff79c6", linewidth=2.5,
                label=f"rolling avg (window={window})")
    ax.axhline(0, color="#666", linestyle="--", linewidth=1, alpha=0.7)
    ax.set_title("Average reward over training step", color="#e6e6f0")
    ax.set_xlabel("Training step")
    ax.set_ylabel("Reward (mean of 8 GRPO rollouts)")
    ax.legend(loc="lower right", framealpha=0.3)
    ax.grid(True, alpha=0.15)

    # 2. Accuracy over time
    ax = axes[0, 1]
    ax.scatter(steps, accs, alpha=0.5, c="#50fa7b", s=30, label="per-step accuracy")
    if len(accs) >= window:
        ax.plot(steps, rolling_mean(accs, window), color="#f1fa8c", linewidth=2.5,
                label=f"rolling avg (window={window})")
    ax.axhline(chance_pct, color="#ff5555", linestyle="--", linewidth=1.5, alpha=0.7,
               label=f"random baseline ({chance_pct:.1f}%)")
    ax.set_title("Arbitration accuracy over training step", color="#e6e6f0")
    ax.set_xlabel("Training step")
    ax.set_ylabel("Accuracy (%)")
    ax.set_ylim(-5, 105)
    ax.legend(loc="lower right", framealpha=0.3)
    ax.grid(True, alpha=0.15)

    # 3. Reward distribution: early vs late
    ax = axes[1, 0]
    # Shared bin edges so the two histograms are directly comparable.
    bins = np.linspace(rewards.min() - 0.5, rewards.max() + 0.5, 16)
    ax.hist(early_r, bins=bins, alpha=0.6, color="#ff5555",
            label=f"early steps 0-450 (n={int(early_mask.sum())})")
    ax.hist(late_r, bins=bins, alpha=0.6, color="#50fa7b",
            label=f"late steps 1710-1960 (n={int(late_mask.sum())})")
    ax.axvline(early_r_mean, color="#ff5555", linestyle="--", linewidth=2,
               label=f"early mean = {early_r_mean:+.2f}")
    ax.axvline(late_r_mean, color="#50fa7b", linestyle="--", linewidth=2,
               label=f"late mean = {late_r_mean:+.2f}")
    ax.set_title("Reward distribution: early vs late training", color="#e6e6f0")
    ax.set_xlabel("Reward")
    ax.set_ylabel("Frequency")
    ax.legend(loc="upper left", framealpha=0.3, fontsize=9)
    ax.grid(True, alpha=0.15)

    # 4. Summary stats
    ax = axes[1, 1]
    ax.axis("off")
    pos = int((rewards > 0).sum())
    above_chance = int((accs > chance_pct).sum())
    text = f"""TRAINING SUMMARY (real log data, no interpolation)
{'='*46}
Datapoints logged: {len(steps)}
Step range covered: {steps[0]} -> {steps[-1]}
Curriculum phase: 1 throughout
Hardware: A10G-small via HF Jobs
Job ID: 69ecfb45d70108f37acdeb50
REWARD
Early (steps 0-450): mean {early_r_mean:+.2f}
Late (steps 1710-1960): mean {late_r_mean:+.2f}
Improvement (late-early): {late_r_mean - early_r_mean:+.2f}
Best step: {rewards.max():+.2f} (step {steps[int(np.argmax(rewards))]})
Worst step: {rewards.min():+.2f} (step {steps[int(np.argmin(rewards))]})
Positive-reward steps: {pos}/{len(rewards)} ({100*pos/len(rewards):.0f}%)
ACCURACY
Early (steps 0-450): mean {early_a_mean:.1f}%
Late (steps 1710-1960): mean {late_a_mean:.1f}%
Best: {accs.max():.1f}%
Random baseline: {chance_pct:.1f}%
Above-chance steps: {above_chance}/{len(accs)} ({100*above_chance/len(accs):.0f}%)
NOTES
- High reward variance is expected: 8 stochastic rollouts/step
at temperature 0.9 with sparse, contrastive reward.
- Curriculum did not advance to phase 2; threshold was 70%.
- Final per-step metrics.json will overwrite this on completion.
"""
    ax.text(0.0, 0.98, text, transform=ax.transAxes, fontsize=9,
            verticalalignment="top", fontfamily="monospace", color="#c8c8e8")

    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor="#0a0a14")
    # pyplot keeps every figure it creates alive until it is explicitly
    # closed; release this one now that it has been written out.
    plt.close(fig)
    print(f"saved {out_path} ({len(steps)} real datapoints)")

    # CLI summary
    print(f"early reward mean: {early_r_mean:+.3f}")
    print(f"late reward mean: {late_r_mean:+.3f}")
    print(f"early acc mean: {early_a_mean:.2f}%")
    print(f"late acc mean: {late_a_mean:.2f}%")
    print(f"random baseline: {chance_pct:.2f}%")
if __name__ == "__main__":
    # Usage: python <script> [out_path]
    # The slice forwards the first CLI argument when one is given and passes
    # nothing otherwise, so plot() falls back to its own default filename
    # ("training_curves.png") instead of repeating that default here.
    plot(*sys.argv[1:2])