trial1 / plot_results.py

Upload trained LoRA adapter from Kaggle

35e9f20 verified 2 months ago

17.1 kB

	"""
	plot_results.py — Results Charts for Hackathon Presentation
	============================================================
	Generates publication-ready charts from evaluation JSON data.
	Saves all charts to results/charts/ as both PNG and SVG.

	BASELINE CONSTANTS (FINAL — measured, do not change):
	R1 Llama-3.1-8B zero-shot: easy=0.0100, medium=0.4583, hard=0.0100, avg=0.1594
	R2 Llama-3.1-8B zero-shot: easy=0.3198, medium=0.2443, hard=0.2520, avg=0.2720
	Training model: Qwen/Qwen2.5-1.5B-Instruct (GRPO, 4-bit QLoRA)

	Charts produced:
	1. r1_scores_comparison.png — R1 Llama baseline vs rule-based vs trained bar chart
	2. r2_scores_comparison.png — R2 Llama zero-shot vs trained bar chart (no rule-based)
	3. sprint_rewards.png — Sprint-by-sprint reward for each R2 scenario
	4. improvement_summary.png — Combined before/after delta chart (main slide chart)
	5. training_curve.png — GRPO training loss/reward curve (if trainer_state.json present)

	Usage:
	# After running evaluate_r2.py --baseline-only:
	python plot_results.py --eval results/r2_evaluation.json

	# With training curve (after train_llm.py):
	python plot_results.py --eval results/r2_evaluation.json \\
	--trainer results/trained_model/trainer_state.json

	# Hardcode known scores for presentation (no eval file needed):
	python plot_results.py --demo
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	from pathlib import Path

	# Every chart is written below this directory; it is created as soon as the
	# module is imported so the chart functions can save without further checks.
	CHARTS_DIR = Path("results") / "charts"
	CHARTS_DIR.mkdir(parents=True, exist_ok=True)

	# ── Measured baselines (FINAL) ────────────────────────────────────────────────
	# Llama-3.1-8B zero-shot scores per task, plus the overall "average" entry.
	LLAMA_BASELINE_R1 = dict(
	    easy_sprint=0.0100,
	    medium_sprint=0.4583,
	    hard_sprint=0.0100,
	    average=0.1594,
	)
	LLAMA_BASELINE_R2 = dict(
	    project_easy=0.3198,
	    project_medium=0.2443,
	    project_hard=0.2520,
	    average=0.2720,
	)
	# Model name shown in legends and titles for the GRPO-trained series.
	TRAINING_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

	# ── Colour palette ────────────────────────────────────────────────────────────
	# Series colours
	C_LLAMA = "#6B7280"    # grey  — Llama zero-shot baseline
	C_RULE = "#3B82F6"     # blue  — rule-based baseline
	C_TRAINED = "#10B981"  # green — trained Qwen (post-GRPO)
	# Per-difficulty colours (used by the sprint-reward panels)
	C_EASY = "#60A5FA"
	C_MEDIUM = "#F59E0B"
	C_HARD = "#EF4444"
	# Figure background and text colours
	C_BG = "#F9FAFB"
	C_TEXT = "#111827"


	def _setup_matplotlib():
	    """Switch matplotlib to the Agg backend, apply the chart theme, and return the modules.

	    matplotlib is imported inside the function rather than at module level.

	    Returns:
	        A ``(matplotlib.pyplot, matplotlib.patches)`` tuple.
	    """
	    import matplotlib

	    # Agg renders straight to files, so no display is needed (headless).
	    matplotlib.use("Agg")
	    import matplotlib.patches as patches
	    import matplotlib.pyplot as pyplot

	    # Colours: one background for figure and axes, one colour for all text.
	    theme = {
	        "figure.facecolor": C_BG,
	        "axes.facecolor": C_BG,
	        "axes.edgecolor": "#D1D5DB",
	        "axes.labelcolor": C_TEXT,
	        "text.color": C_TEXT,
	        "xtick.color": C_TEXT,
	        "ytick.color": C_TEXT,
	    }
	    # Grid appearance.
	    theme.update({
	        "grid.color": "#E5E7EB",
	        "grid.linestyle": "--",
	        "grid.alpha": 0.7,
	    })
	    # Typography and output resolution.
	    theme.update({
	        "font.family": "sans-serif",
	        "font.size": 11,
	        "axes.titlesize": 13,
	        "axes.titleweight": "bold",
	        "figure.dpi": 150,
	    })
	    pyplot.rcParams.update(theme)
	    return pyplot, patches


	def save(plt, name: str):
	    """Save the current figure under CHARTS_DIR as PNG and SVG, then close it.

	    Args:
	        plt: The ``matplotlib.pyplot`` module holding the figure to write.
	        name: File stem; ``.png`` and ``.svg`` are appended to it.
	    """
	    plt.tight_layout()
	    # PNG first, then SVG — both from the same laid-out figure.
	    targets = [CHARTS_DIR / f"{name}{suffix}" for suffix in (".png", ".svg")]
	    for target in targets:
	        plt.savefig(target, bbox_inches="tight")
	    plt.close()
	    # Only the PNG path is reported.
	    print(f" [SAVED] {targets[0]}", flush=True)

	# ── Chart 1: R1 scores comparison ────────────────────────────────────────────

	def chart_r1_comparison(eval_data: dict):
	    """Draw the Round 1 grouped bar chart and save it as ``r1_scores_comparison``.

	    Three series are read from ``eval_data``:

	    * ``r1_llama_baseline`` — ``{task: score}``; falls back to LLAMA_BASELINE_R1.
	    * ``r1_rule_based`` — ``{task: {"avg_score": score}}``.
	    * ``r1_llm`` — ``{task: {"avg_score": score}}``; drawn only when at least
	      one score is greater than 0.

	    Missing tasks or scores are plotted as 0. Bars taller than 0.01 get a
	    two-decimal value label above them.

	    Args:
	        eval_data: Evaluation results dict (see the keys above).
	    """
	    plt, _ = _setup_matplotlib()

	    tasks = ["easy_sprint", "medium_sprint", "hard_sprint"]
	    labels = ["Easy Sprint", "Medium Sprint", "Hard Sprint"]

	    llama_results = eval_data.get("r1_llama_baseline", LLAMA_BASELINE_R1)
	    rule_results = eval_data.get("r1_rule_based", {})
	    llm_results = eval_data.get("r1_llm", {})

	    llama_base = [llama_results.get(t, 0) for t in tasks]
	    rule_based = [rule_results.get(t, {}).get("avg_score", 0) for t in tasks]
	    llm_scores = [llm_results.get(t, {}).get("avg_score", 0) for t in tasks]

	    has_llm = any(v > 0 for v in llm_scores)
	    width = 0.28 if has_llm else 0.38

	    # (values, legend label, colour) for every series that will be drawn.
	    series = [
	        (llama_base, "Llama-3.1-8B (zero-shot baseline)", C_LLAMA),
	        (rule_based, "Rule-based", C_RULE),
	    ]
	    if has_llm:
	        series.append((llm_scores, f"{TRAINING_MODEL} (GRPO trained)", C_TRAINED))

	    _, ax = plt.subplots(figsize=(9, 5))
	    positions = range(len(tasks))

	    # Centre the whole group on its tick regardless of how many series there
	    # are. Previously the two-series layout sat at (i - width, i), which left
	    # the group shifted to the left of its x tick label.
	    first_offset = -width * (len(series) - 1) / 2
	    for k, (values, legend_label, colour) in enumerate(series):
	        offset = first_offset + k * width
	        bars = ax.bar([i + offset for i in positions], values, width,
	                      label=legend_label, color=colour, zorder=3)
	        for bar in bars:
	            h = bar.get_height()
	            # Skip labels for bars at or below 0.01.
	            if h > 0.01:
	                ax.text(bar.get_x() + bar.get_width() / 2, h + 0.01,
	                        f"{h:.2f}", ha="center", va="bottom", fontsize=9)

	    ax.set_xticks(list(positions))
	    ax.set_xticklabels(labels)
	    ax.set_ylim(0, 1.15)
	    ax.set_ylabel("Score (0.01 – 0.99)")
	    ax.set_title("Round 1 — Score Comparison")
	    ax.legend(loc="upper right")
	    ax.yaxis.grid(True, zorder=0)
	    ax.set_axisbelow(True)

	    save(plt, "r1_scores_comparison")


	# ── Chart 2: R2 scores comparison ────────────────────────────────────────────

	def chart_r2_comparison(eval_data: dict):
	    """Draw the Round 2 project-score bar chart and save it as ``r2_scores_comparison``.

	    Two series are read from ``eval_data``:

	    * ``r2_llama_baseline`` — ``{task: score}``; falls back to LLAMA_BASELINE_R2.
	    * ``r2_llm`` — ``{task: {"avg_score": score}}``; drawn only when at least
	      one score is greater than 0.

	    Missing tasks or scores are plotted as 0. Bars taller than 0.01 get a
	    two-decimal value label above them.

	    Args:
	        eval_data: Evaluation results dict (see the keys above).
	    """
	    plt, _ = _setup_matplotlib()

	    tasks = ["project_easy", "project_medium", "project_hard"]
	    labels = ["Easy (6 sprints)", "Medium (6 sprints)", "Hard (6 sprints)"]

	    llama_results = eval_data.get("r2_llama_baseline", LLAMA_BASELINE_R2)
	    llm_results = eval_data.get("r2_llm", {})

	    llama_base = [llama_results.get(t, 0) for t in tasks]
	    llm_scores = [llm_results.get(t, {}).get("avg_score", 0) for t in tasks]

	    has_llm = any(v > 0 for v in llm_scores)
	    width = 0.32 if has_llm else 0.5

	    # (values, legend label, colour) for every series that will be drawn.
	    series = [(llama_base, "Llama-3.1-8B (zero-shot)", C_LLAMA)]
	    if has_llm:
	        series.append((llm_scores, f"{TRAINING_MODEL} (GRPO)", C_TRAINED))

	    _, ax = plt.subplots(figsize=(9, 5))
	    positions = range(len(tasks))

	    # Centre the group on its tick. With only the baseline series the bar now
	    # sits exactly on the tick; previously it was drawn at i - width/2 and so
	    # appeared shifted left of its label.
	    first_offset = -width * (len(series) - 1) / 2
	    for k, (values, legend_label, colour) in enumerate(series):
	        offset = first_offset + k * width
	        bars = ax.bar([i + offset for i in positions], values, width,
	                      label=legend_label, color=colour, zorder=3)
	        for bar in bars:
	            h = bar.get_height()
	            # Skip labels for bars at or below 0.01.
	            if h > 0.01:
	                ax.text(bar.get_x() + bar.get_width() / 2, h + 0.01,
	                        f"{h:.2f}", ha="center", va="bottom", fontsize=9)

	    ax.set_xticks(list(positions))
	    ax.set_xticklabels(labels)
	    ax.set_ylim(0, 1.15)
	    ax.set_ylabel("Project Score (delivery × instruction × health)")
	    ax.set_title("Round 2 — Multi-Sprint Project Score")
	    ax.legend(loc="upper right")
	    ax.yaxis.grid(True, zorder=0)
	    ax.set_axisbelow(True)

	    save(plt, "r2_scores_comparison")


	# ── Chart 3: Sprint reward curves ─────────────────────────────────────────────

	def chart_sprint_rewards(eval_data: dict):
	    """Plot the trained model's per-sprint reward, one panel per R2 scenario.

	    For each of ``project_easy`` / ``project_medium`` / ``project_hard`` the
	    ``sprint_rewards`` list of the FIRST entry in
	    ``eval_data["r2_llm"][task]["episodes"]`` is drawn as a line. Only the
	    trained series is plotted; no baseline line is drawn. A panel whose
	    task has no episodes or no sprint rewards is left empty.

	    The figure is saved as ``sprint_rewards``.

	    Args:
	        eval_data: Evaluation results dict containing an optional ``r2_llm`` entry.
	    """
	    plt, _ = _setup_matplotlib()
	    tasks = ["project_easy", "project_medium", "project_hard"]
	    colors = [C_EASY, C_MEDIUM, C_HARD]
	    labels = ["Easy", "Medium", "Hard"]

	    fig, axes = plt.subplots(1, 3, figsize=(13, 4), sharey=True)
	    llm_results = eval_data.get("r2_llm", {})

	    for idx, (ax, task, color, label) in enumerate(zip(axes, tasks, colors, labels)):
	        episodes = llm_results.get(task, {}).get("episodes", [])
	        sprint_rewards = episodes[0].get("sprint_rewards", []) if episodes else []

	        if sprint_rewards:
	            ax.plot(range(1, len(sprint_rewards) + 1), sprint_rewards, "o-",
	                    color=color, label="Qwen GRPO", linewidth=2, markersize=6)
	            # Only request a legend when a labelled line exists; calling
	            # legend() on an empty panel makes matplotlib emit a warning.
	            ax.legend(fontsize=8)

	        ax.set_xlabel("Sprint")
	        ax.set_title(f"{label} Project")
	        # Ticks cover at least sprints 1-6 and extend if an episode is longer.
	        ax.set_xticks(range(1, max(6, len(sprint_rewards)) + 1))
	        ax.set_ylim(0, 2.2)
	        ax.yaxis.grid(True)
	        ax.set_axisbelow(True)
	        # The y axis is shared, so only the left-most panel carries the label.
	        if idx == 0:
	            ax.set_ylabel("Sprint Reward")

	    fig.suptitle("Sprint-by-Sprint Reward — Qwen GRPO Trained", y=1.02, fontsize=13, fontweight="bold")
	    save(plt, "sprint_rewards")


	# ── Chart 4: Improvement summary (main presentation slide) ───────────────────

	def chart_improvement_summary(eval_data: dict):
	    """Draw the combined before/after chart for all six tasks and save it as ``improvement_summary``.

	    "Before" is the Llama zero-shot baseline (``r1_llama_baseline`` /
	    ``r2_llama_baseline``, falling back to the module constants); "after" is
	    the ``avg_score`` of each task under ``r1_llm`` / ``r2_llm``. Missing
	    scores are plotted as 0. An arrow and a ``+delta`` label are added for
	    every task whose trained score exceeds the baseline by more than 0.01.
	    """
	    plt, _ = _setup_matplotlib()

	    # One row per round: tick prefix, baseline key + fallback, trained key,
	    # task ids, and the part of the task id dropped from the tick label.
	    rounds = (
	        ("R1", "r1_llama_baseline", LLAMA_BASELINE_R1, "r1_llm",
	         ["easy_sprint", "medium_sprint", "hard_sprint"], "_sprint"),
	        ("R2", "r2_llama_baseline", LLAMA_BASELINE_R2, "r2_llm",
	         ["project_easy", "project_medium", "project_hard"], "project_"),
	    )

	    tick_labels, before, after = [], [], []
	    for prefix, base_key, fallback, llm_key, task_ids, dropped in rounds:
	        baseline = eval_data.get(base_key, fallback)
	        trained_results = eval_data.get(llm_key, {})
	        for task_id in task_ids:
	            tick_labels.append(f"{prefix}: {task_id.replace(dropped, '').title()}")
	            before.append(baseline.get(task_id, 0))
	            after.append(trained_results.get(task_id, {}).get("avg_score", 0))

	    positions = list(range(len(tick_labels)))
	    bar_w = 0.36
	    half = bar_w / 2
	    _, ax = plt.subplots(figsize=(11, 5))

	    ax.bar([p - half for p in positions], before, bar_w,
	           label="Before: Llama-3.1-8B (zero-shot)", color=C_LLAMA, zorder=3)
	    ax.bar([p + half for p in positions], after, bar_w,
	           label=f"After: {TRAINING_MODEL} (GRPO)", color=C_TRAINED, zorder=3)

	    # Delta arrows and labels — only where the trained score clearly wins.
	    for p, base_score, trained_score in zip(positions, before, after):
	        if not (trained_score > base_score + 0.01):
	            continue
	        ax.annotate("", xy=(p + half, trained_score + 0.02),
	                    xytext=(p - half, base_score + 0.02),
	                    arrowprops=dict(arrowstyle="->", color="#059669", lw=1.5))
	        ax.text(p, max(base_score, trained_score) + 0.06,
	                f"+{trained_score-base_score:.2f}",
	                ha="center", fontsize=8, color="#059669", fontweight="bold")

	    ax.set_xticks(positions)
	    ax.set_xticklabels(tick_labels, rotation=15, ha="right")
	    ax.set_ylim(0, 1.25)
	    ax.set_ylabel("Score")
	    ax.set_title("Reward Improvement: Llama Zero-Shot → Qwen2.5-1.5B GRPO Trained")
	    ax.legend(loc="upper left")
	    ax.yaxis.grid(True, zorder=0)
	    ax.set_axisbelow(True)

	    # Dotted divider between the three R1 tasks and the three R2 tasks,
	    # with a caption centred over each half.
	    ax.axvline(x=2.5, color="#9CA3AF", linestyle=":", linewidth=1.5)
	    for centre, caption in ((1.0, "Round 1"), (4.0, "Round 2")):
	        ax.text(centre, 1.20, caption, ha="center", fontsize=10, color="#6B7280")

	    save(plt, "improvement_summary")


	# ── Chart 5: Training loss/reward curve ───────────────────────────────────────

	def chart_training_curve(trainer_state_path: str):
	    """Plot training loss and mean reward from a ``trainer_state.json`` file.

	    Reads the ``log_history`` list from the JSON file. Entries containing a
	    ``loss`` key feed the left panel and entries containing a ``reward`` key
	    feed the right panel; each point's x value is the entry's ``step``
	    (0 when absent). The figure is saved as ``training_curve``.

	    Nothing is saved — and a ``[SKIP]`` line is printed — when ``log_history``
	    is missing/empty or holds neither ``loss`` nor ``reward`` entries, so an
	    empty two-panel figure is never written.

	    Args:
	        trainer_state_path: Path to the trainer state JSON file.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	    """
	    # Explicit encoding so reading does not depend on the platform default.
	    with open(trainer_state_path, encoding="utf-8") as f:
	        state = json.load(f)

	    log_history = state.get("log_history", [])
	    if not log_history:
	        print(" [SKIP] No log_history in trainer_state.json", flush=True)
	        return

	    loss_points = [(e.get("step", 0), e["loss"]) for e in log_history if "loss" in e]
	    reward_points = [(e.get("step", 0), e["reward"]) for e in log_history if "reward" in e]
	    if not loss_points and not reward_points:
	        print(" [SKIP] No loss or reward entries in trainer_state.json", flush=True)
	        return

	    # matplotlib is only set up once there is something to draw.
	    plt, _ = _setup_matplotlib()
	    fig, axes = plt.subplots(1, 2, figsize=(11, 4))

	    if loss_points:
	        steps, losses = zip(*loss_points)
	        axes[0].plot(steps, losses, color=C_RULE, linewidth=2)
	        axes[0].set_xlabel("Training Step")
	        axes[0].set_ylabel("Loss")
	        axes[0].set_title("GRPO Training Loss (Qwen2.5-1.5B)")
	        axes[0].yaxis.grid(True)

	    if reward_points:
	        rsteps, rvals = zip(*reward_points)
	        axes[1].plot(rsteps, rvals, color=C_TRAINED, linewidth=2)
	        axes[1].set_xlabel("Training Step")
	        axes[1].set_ylabel("Mean Reward")
	        axes[1].set_title("GRPO Mean Reward per Step")
	        axes[1].yaxis.grid(True)

	    fig.suptitle(f"GRPO Training Curves — {TRAINING_MODEL}", fontsize=13, fontweight="bold")
	    save(plt, "training_curve")


	# ── Demo mode (hardcoded real baselines, placeholder trained scores) ──────────

	def demo_mode():
	    """Build an evaluation dict from the measured baselines plus placeholder trained scores.

	    The Llama baselines are the module-level constants. Every trained score
	    is a 0.0 placeholder, to be replaced with real evaluate_r2.py output
	    after on-site training. The R1 rule-based scores are hard-coded here.

	    Returns:
	        A dict with the keys ``r1_llama_baseline``, ``r2_llama_baseline``,
	        ``r1_rule_based``, ``r1_llm``, ``r2_rule_based``, ``r2_llm`` and
	        ``improvement``.
	    """
	    print("[INFO] Demo mode — real Llama baselines, placeholder trained scores", flush=True)
	    print(f"[INFO] Training model: {TRAINING_MODEL}", flush=True)

	    # Placeholder trained scores — update after on-site GRPO training.
	    r1_trained = {
	        "easy_sprint": 0.0,
	        "medium_sprint": 0.0,
	        "hard_sprint": 0.0,
	    }
	    r2_trained = {
	        "project_easy": 0.0,
	        "project_medium": 0.0,
	        "project_hard": 0.0,
	    }

	    r1_llm = {}
	    for task, score in r1_trained.items():
	        r1_llm[task] = {"avg_score": score}

	    r2_llm = {}
	    for task, score in r2_trained.items():
	        r2_llm[task] = {"avg_score": score, "episodes": []}

	    # Per-task delta of the (placeholder) trained score against the R2 baseline.
	    improvement = {}
	    for task in ("project_easy", "project_medium", "project_hard"):
	        baseline = LLAMA_BASELINE_R2[task]
	        trained = r2_trained[task]
	        improvement[task] = {
	            "llama_baseline": baseline,
	            "trained_llm": trained,
	            "delta_vs_llama": round(trained - baseline, 4),
	        }

	    rule_based_r1 = {
	        "easy_sprint": {"avg_score": 0.92},
	        "medium_sprint": {"avg_score": 0.35},
	        "hard_sprint": {"avg_score": 0.01},
	    }

	    return {
	        "r1_llama_baseline": LLAMA_BASELINE_R1,
	        "r2_llama_baseline": LLAMA_BASELINE_R2,
	        "r1_rule_based": rule_based_r1,
	        "r1_llm": r1_llm,
	        "r2_rule_based": {},
	        "r2_llm": r2_llm,
	        "improvement": improvement,
	    }


	# ── CLI ───────────────────────────────────────────────────────────────────────

	def main():
	    """Parse the command line and generate every chart.

	    Data source, in priority order: ``--demo`` (placeholder data from
	    demo_mode), ``--eval PATH`` (JSON file; the Llama baseline keys are
	    back-filled from the module constants when absent), otherwise demo mode.
	    Charts 1-4 are always produced; chart 5 is produced only when
	    ``--trainer`` names an existing file.

	    Exits with status 1 if matplotlib cannot be imported.
	    """
	    import sys

	    parser = argparse.ArgumentParser(description="Generate result charts for presentation")
	    parser.add_argument("--eval", type=str, default=None,
	                        help="Path to r2_evaluation.json from evaluate_r2.py")
	    parser.add_argument("--trainer", type=str, default=None,
	                        help="Path to trainer_state.json from train_llm.py output")
	    parser.add_argument("--demo", action="store_true",
	                        help="Generate charts with real baselines + placeholder trained scores")
	    args = parser.parse_args()

	    # Fail early with an install hint instead of a traceback from a chart function.
	    try:
	        import matplotlib  # noqa: F401 — availability check only
	    except ImportError:
	        print("[ERROR] matplotlib not installed. Run: pip install matplotlib", flush=True)
	        sys.exit(1)

	    if args.demo:
	        eval_data = demo_mode()
	    elif args.eval:
	        # Explicit encoding so reading does not depend on the platform default.
	        with open(args.eval, encoding="utf-8") as f:
	            eval_data = json.load(f)
	        # Back-fill baseline keys if running against old JSON format
	        eval_data.setdefault("r1_llama_baseline", LLAMA_BASELINE_R1)
	        eval_data.setdefault("r2_llama_baseline", LLAMA_BASELINE_R2)
	    else:
	        print("[INFO] No --eval file specified. Using --demo mode.", flush=True)
	        eval_data = demo_mode()

	    print(f"\nGenerating charts → {CHARTS_DIR}/", flush=True)
	    print(f" Baselines: R1 avg={LLAMA_BASELINE_R1['average']:.4f} R2 avg={LLAMA_BASELINE_R2['average']:.4f}", flush=True)
	    print(f" Training model: {TRAINING_MODEL}", flush=True)

	    print(" Chart 1: R1 scores comparison...", flush=True)
	    chart_r1_comparison(eval_data)

	    print(" Chart 2: R2 scores comparison...", flush=True)
	    chart_r2_comparison(eval_data)

	    print(" Chart 3: Sprint reward curves...", flush=True)
	    chart_sprint_rewards(eval_data)

	    print(" Chart 4: Improvement summary...", flush=True)
	    chart_improvement_summary(eval_data)

	    if not args.trainer:
	        print(" Chart 5: Training curve — skipped (no --trainer file provided)", flush=True)
	    elif not Path(args.trainer).exists():
	        # A path was given but is wrong: say so, rather than claiming that
	        # no --trainer argument was provided.
	        print(f" Chart 5: Training curve — skipped (file not found: {args.trainer})", flush=True)
	    else:
	        print(" Chart 5: Training curve...", flush=True)
	        chart_training_curve(args.trainer)

	    print(f"\n✅ All charts saved to {CHARTS_DIR}/", flush=True)
	    print(" Use improvement_summary.png in your HF blog post and pitch slides.", flush=True)


	if __name__ == "__main__":
	    main()