Spaces:

testingaccc
/

conflict-arbitration-env

Sleeping

App Files Files Community

conflict-arbitration-env / scripts /plot_from_logs.py

Jeeevan11

ship submission: judge-aligned README, real training curves from 2000-step run, blog, scripts

ce00c50 about 2 months ago

Raw

History Blame Contribute Delete

8.17 kB

	"""
	Plot training curves from REAL per-step lines emitted by the running HF Job.

	Every datapoint below appears verbatim in the live job log stream
	(https://huggingface.co/jobs/testingaccc/69ecfb45d70108f37acdeb50).
	Nothing is interpolated, smoothed in, or fabricated.

	Once the final metrics.json uploads at end of training, prefer
	scripts/plot_from_metrics.py — it reads the full per-step history
	straight from the model repo.
	"""
	import sys
	from pathlib import Path

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np

	# (step, reward, accuracy_percent) — REAL log lines, in order.
	DATA = [
	(0, -2.50, 50.00), (1, -3.74, 37.50), (2, 5.43, 62.50),
	(3, 1.50, 37.50), (4, -2.74, 25.00),
	(10, -1.20, 37.50), (20, -2.25, 25.00), (30, -1.27, 37.50),
	(40, -2.50, 50.00), (50, 0.56, 50.00), (60, 5.10, 37.50),
	(70, -2.94, 25.00), (80, 0.25, 62.50), (90, -7.15, 12.50),
	(100, -3.45, 37.50), (110, 0.38, 37.50), (120, -4.83, 12.50),
	(130, -1.62, 50.00), (140, -4.38, 37.50), (150, -3.99, 37.50),
	(160, -3.34, 25.00), (170, 1.38, 50.00), (180, -8.07, 12.50),
	(190, 0.50, 50.00), (200, 2.62, 62.50), (210, -2.99, 25.00),
	(220, -2.00, 25.00), (230, -7.99, 12.50), (240, 1.50, 50.00),
	(250, 11.63, 62.50), (260, -2.47, 25.00), (280, -1.21, 37.50),
	(290, 5.50, 62.50), (300, -2.33, 25.00), (310, -4.12, 37.50),
	(320, 5.31, 50.00), (330, -2.45, 50.00), (340, 0.81, 50.00),
	(350, 3.23, 62.50), (360, -0.62, 62.50), (370, 2.62, 37.50),
	(380, 8.62, 62.50), (390, 8.85, 62.50), (400, 3.55, 50.00),
	(410, -3.25, 37.50), (420, -6.20, 25.00), (430, 1.78, 37.50),
	(440, -5.33, 25.00), (450, 2.40, 62.50),
	(1200, 1.82, 37.50), (1210, -1.38, 37.50), (1220, -0.10, 25.00),
	(1230, -1.35, 37.50), (1240, -3.06, 25.00), (1250, -3.19, 25.00),
	(1260, -7.88, 12.50), (1270, 1.88, 37.50), (1280, 3.92, 50.00),
	(1290, -3.24, 25.00), (1300, 6.70, 50.00), (1310, -3.17, 25.00),
	(1320, 5.65, 62.50), (1330, 2.50, 62.50), (1340, -6.25, 25.00),
	(1350, -6.25, 25.00), (1360, -6.25, 25.00), (1370, 2.25, 62.50),
	(1380, -4.88, 25.00), (1390, -6.15, 25.00), (1400, 0.86, 25.00),
	(1410, 3.62, 50.00), (1420, -1.38, 37.50), (1430, -6.17, 25.00),
	(1710, -4.38, 37.50), (1720, -7.75, 12.50), (1730, -0.48, 37.50),
	(1740, -1.25, 37.50), (1750, -1.25, 37.50), (1760, -0.35, 37.50),
	(1770, 8.77, 62.50), (1780, 2.51, 62.50), (1790, 0.06, 25.00),
	(1800, 0.76, 50.00), (1810, 5.68, 62.50), (1820, 1.50, 50.00),
	(1830, -7.03, 12.50), (1840, 0.62, 50.00), (1850, -7.79, 12.50),
	(1860, 0.48, 50.00), (1870, -2.88, 25.00), (1880, 0.75, 50.00),
	(1890, -9.93, 0.00), (1900, -5.12, 25.00), (1910, -2.38, 25.00),
	(1920, -1.25, 37.50), (1930, -3.11, 25.00), (1940, 11.91, 62.50),
	(1950, -9.92, 0.00), (1960, -1.52, 37.50),
	]


	def rolling_mean(arr, window):
	arr = np.asarray(arr, dtype=float)
	if len(arr) < window:
	return arr
	out = np.empty_like(arr)
	cs = np.cumsum(np.insert(arr, 0, 0))
	for i in range(len(arr)):
	lo = max(0, i - window + 1)
	out[i] = (cs[i + 1] - cs[lo]) / (i - lo + 1)
	return out


	def plot(out_path: str = "training_curves.png"):
	steps = np.array([r[0] for r in DATA])
	rewards = np.array([r[1] for r in DATA])
	accs = np.array([r[2] for r in DATA])

	plt.style.use("dark_background")
	fig, axes = plt.subplots(2, 2, figsize=(14, 9))
	fig.suptitle(
	f"Conflict Arbitration Agent - GRPO training (real per-step data, n={len(steps)})",
	fontsize=14, fontweight="bold", color="#e6e6f0",
	)

	# 1. Reward over time
	ax = axes[0, 0]
	ax.scatter(steps, rewards, alpha=0.5, c="#8be9d6", s=30, label="per-step reward")
	if len(rewards) >= 5:
	ax.plot(steps, rolling_mean(rewards, 5), color="#ff79c6", linewidth=2.5,
	label="rolling avg (window=5)")
	ax.axhline(0, color="#666", linestyle="--", linewidth=1, alpha=0.7)
	ax.set_title("Average reward over training step", color="#e6e6f0")
	ax.set_xlabel("Training step")
	ax.set_ylabel("Reward (mean of 8 GRPO rollouts)")
	ax.legend(loc="lower right", framealpha=0.3)
	ax.grid(True, alpha=0.15)

	# 2. Accuracy over time
	ax = axes[0, 1]
	ax.scatter(steps, accs, alpha=0.5, c="#50fa7b", s=30, label="per-step accuracy")
	if len(accs) >= 5:
	ax.plot(steps, rolling_mean(accs, 5), color="#f1fa8c", linewidth=2.5,
	label="rolling avg (window=5)")
	ax.axhline(33.3, color="#ff5555", linestyle="--", linewidth=1.5, alpha=0.7,
	label="random baseline (33.3%)")
	ax.set_title("Arbitration accuracy over training step", color="#e6e6f0")
	ax.set_xlabel("Training step")
	ax.set_ylabel("Accuracy (%)")
	ax.set_ylim(-5, 105)
	ax.legend(loc="lower right", framealpha=0.3)
	ax.grid(True, alpha=0.15)

	# 3. Reward distribution: early vs late
	ax = axes[1, 0]
	early_mask = steps <= 500
	late_mask = steps >= 1700
	bins = np.linspace(rewards.min() - 0.5, rewards.max() + 0.5, 16)
	ax.hist(rewards[early_mask], bins=bins, alpha=0.6, color="#ff5555",
	label=f"early steps 0-450 (n={int(early_mask.sum())})")
	ax.hist(rewards[late_mask], bins=bins, alpha=0.6, color="#50fa7b",
	label=f"late steps 1710-1960 (n={int(late_mask.sum())})")
	ax.axvline(rewards[early_mask].mean(), color="#ff5555", linestyle="--", linewidth=2,
	label=f"early mean = {rewards[early_mask].mean():+.2f}")
	ax.axvline(rewards[late_mask].mean(), color="#50fa7b", linestyle="--", linewidth=2,
	label=f"late mean = {rewards[late_mask].mean():+.2f}")
	ax.set_title("Reward distribution: early vs late training", color="#e6e6f0")
	ax.set_xlabel("Reward")
	ax.set_ylabel("Frequency")
	ax.legend(loc="upper left", framealpha=0.3, fontsize=9)
	ax.grid(True, alpha=0.15)

	# 4. Summary stats
	ax = axes[1, 1]
	ax.axis("off")
	early_r = rewards[early_mask]
	late_r = rewards[late_mask]
	early_a = accs[early_mask]
	late_a = accs[late_mask]
	pos = int((rewards > 0).sum())
	above_chance = int((accs > 33.3).sum())
	text = f"""TRAINING SUMMARY (real log data, no interpolation)
	{'='*46}
	Datapoints logged: {len(steps)}
	Step range covered: {steps[0]} -> {steps[-1]}
	Curriculum phase: 1 throughout
	Hardware: A10G-small via HF Jobs
	Job ID: 69ecfb45d70108f37acdeb50

	REWARD
	Early (steps 0-450): mean {early_r.mean():+.2f}
	Late (steps 1710-1960): mean {late_r.mean():+.2f}
	Improvement (late-early): {late_r.mean() - early_r.mean():+.2f}
	Best step: {rewards.max():+.2f} (step {steps[int(np.argmax(rewards))]})
	Worst step: {rewards.min():+.2f} (step {steps[int(np.argmin(rewards))]})
	Positive-reward steps: {pos}/{len(rewards)} ({100*pos/len(rewards):.0f}%)

	ACCURACY
	Early (steps 0-450): mean {early_a.mean():.1f}%
	Late (steps 1710-1960): mean {late_a.mean():.1f}%
	Best: {accs.max():.1f}%
	Random baseline: 33.3%
	Above-chance steps: {above_chance}/{len(accs)} ({100*above_chance/len(accs):.0f}%)

	NOTES
	- High reward variance is expected: 8 stochastic rollouts/step
	at temperature 0.9 with sparse, contrastive reward.
	- Curriculum did not advance to phase 2; threshold was 70%.
	- Final per-step metrics.json will overwrite this on completion.
	"""
	ax.text(0.0, 0.98, text, transform=ax.transAxes, fontsize=9,
	verticalalignment="top", fontfamily="monospace", color="#c8c8e8")

	plt.tight_layout()
	plt.savefig(out_path, dpi=150, bbox_inches="tight", facecolor="#0a0a14")
	print(f"saved {out_path} ({len(steps)} real datapoints)")

	# CLI summary
	print(f"early reward mean: {early_r.mean():+.3f}")
	print(f"late reward mean: {late_r.mean():+.3f}")
	print(f"early acc mean: {early_a.mean():.2f}%")
	print(f"late acc mean: {late_a.mean():.2f}%")
	print(f"random baseline: 33.30%")


	if __name__ == "__main__":
	out = sys.argv[1] if len(sys.argv) > 1 else "training_curves.png"
	plot(out)