trial1 / plot_results.py
priyaaaaaasharmaaaaa's picture
Upload trained LoRA adapter from Kaggle
35e9f20 verified
Raw
History Blame Contribute Delete
17.1 kB
"""
plot_results.py — Results Charts for Hackathon Presentation
============================================================
Generates publication-ready charts from evaluation JSON data.
Saves all charts to results/charts/ as both PNG and SVG.
BASELINE CONSTANTS (FINAL — measured, do not change):
R1 Llama-3.1-8B zero-shot: easy=0.0100, medium=0.4583, hard=0.0100, avg=0.1594
R2 Llama-3.1-8B zero-shot: easy=0.3198, medium=0.2443, hard=0.2520, avg=0.2720
Training model: Qwen/Qwen2.5-1.5B-Instruct (GRPO, 4-bit QLoRA)
Charts produced:
1. r1_scores_comparison.png — R1 Llama baseline vs trained bar chart
2. r2_scores_comparison.png — R2 Llama zero-shot vs trained bar chart (no rule-based)
3. sprint_rewards.png — Sprint-by-sprint reward for each R2 scenario
4. improvement_summary.png — Combined before/after delta chart (main slide chart)
5. training_curve.png — GRPO training loss/reward curve (if trainer_state.json present)
Usage:
# After running evaluate_r2.py --baseline-only:
python plot_results.py --eval results/r2_evaluation.json
# With training curve (after train_llm.py):
python plot_results.py --eval results/r2_evaluation.json \\
--trainer results/trained_model/trainer_state.json
# Hardcode known scores for presentation (no eval file needed):
python plot_results.py --demo
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
# Directory that receives every generated chart; created at import time if missing.
CHARTS_DIR = Path("results/charts")
CHARTS_DIR.mkdir(parents=True, exist_ok=True)
# ── Measured baselines (FINAL) ────────────────────────────────────────────────
# Round 1 zero-shot Llama-3.1-8B score per task, plus the mean of the three tasks.
LLAMA_BASELINE_R1 = {
"easy_sprint": 0.0100,
"medium_sprint": 0.4583,
"hard_sprint": 0.0100,
"average": 0.1594,
}
# Round 2 zero-shot Llama-3.1-8B score per project scenario, plus their mean.
LLAMA_BASELINE_R2 = {
"project_easy": 0.3198,
"project_medium": 0.2443,
"project_hard": 0.2520,
"average": 0.2720,
}
# Model identifier interpolated into legends, titles and log lines.
TRAINING_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
# ── Colour palette ────────────────────────────────────────────────────────────
C_LLAMA = "#6B7280" # grey — Llama zero-shot baseline
C_RULE = "#3B82F6" # blue — rule-based baseline
C_TRAINED = "#10B981" # green — trained Qwen (post-GRPO)
# Per-scenario line colours used by the sprint-reward panels.
C_EASY = "#60A5FA"
C_MEDIUM = "#F59E0B"
C_HARD = "#EF4444"
# Figure/axes background and the colour of all text, labels and ticks.
C_BG = "#F9FAFB"
C_TEXT = "#111827"
def _setup_matplotlib():
    """Select the headless backend, apply the shared chart theme, return the modules.

    Returns:
        A ``(pyplot, patches)`` tuple holding ``matplotlib.pyplot`` and
        ``matplotlib.patches``.
    """
    import matplotlib

    # Agg renders straight to files, so no display is required.
    matplotlib.use("Agg")  # headless

    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    # Colours for the canvas, axes, text and grid lines.
    colour_theme = {
        "figure.facecolor": C_BG,
        "axes.facecolor": C_BG,
        "axes.edgecolor": "#D1D5DB",
        "axes.labelcolor": C_TEXT,
        "text.color": C_TEXT,
        "xtick.color": C_TEXT,
        "ytick.color": C_TEXT,
        "grid.color": "#E5E7EB",
        "grid.linestyle": "--",
        "grid.alpha": 0.7,
    }
    # Fonts, title styling and output resolution.
    type_theme = {
        "font.family": "sans-serif",
        "font.size": 11,
        "axes.titlesize": 13,
        "axes.titleweight": "bold",
        "figure.dpi": 150,
    }
    plt.rcParams.update({**colour_theme, **type_theme})
    return plt, mpatches
def save(plt, name: str):
    """Write the current matplotlib figure to ``CHARTS_DIR`` as PNG and SVG.

    Args:
        plt: The ``matplotlib.pyplot`` module whose current figure is saved.
        name: Output file stem; ``.png`` and ``.svg`` are appended.
    """
    # Do not rely only on the import-time mkdir: the directory may have been
    # removed since the module was loaded.
    CHARTS_DIR.mkdir(parents=True, exist_ok=True)
    targets = [CHARTS_DIR / f"{name}.{ext}" for ext in ("png", "svg")]
    plt.tight_layout()
    try:
        for target in targets:
            plt.savefig(target, bbox_inches="tight")
    finally:
        # Release the figure even when writing fails, so a failed chart does
        # not leak into the next one.
        plt.close()
    # Report both files; the SVG is written too and was previously unlisted.
    for target in targets:
        print(f" [SAVED] {target}", flush=True)
# ── Chart 1: R1 scores comparison ────────────────────────────────────────────
def chart_r1_comparison(eval_data: dict):
    """Draw the Round 1 grouped bar chart and save it as ``r1_scores_comparison``.

    For each task (easy / medium / hard sprint) the chart shows the Llama
    zero-shot baseline, the rule-based score and, when at least one trained
    score is positive, the trained model's score.

    Args:
        eval_data: Evaluation results. Reads ``r1_llama_baseline`` (task ->
            score, falling back to ``LLAMA_BASELINE_R1``) and ``r1_rule_based``
            / ``r1_llm`` (task -> ``{"avg_score": ...}``). Missing entries
            count as 0.
    """
    plt, _ = _setup_matplotlib()
    tasks = ["easy_sprint", "medium_sprint", "hard_sprint"]
    labels = ["Easy Sprint", "Medium Sprint", "Hard Sprint"]

    baseline = eval_data.get("r1_llama_baseline", LLAMA_BASELINE_R1)
    llama_base = [baseline.get(t, 0) for t in tasks]
    rule_based = [eval_data.get("r1_rule_based", {}).get(t, {}).get("avg_score", 0) for t in tasks]
    llm_scores = [eval_data.get("r1_llm", {}).get(t, {}).get("avg_score", 0) for t in tasks]
    # All-zero trained scores mean "not trained yet": hide that series.
    has_llm = any(v > 0 for v in llm_scores)

    series = [
        ("Llama-3.1-8B (zero-shot baseline)", llama_base, C_LLAMA),
        ("Rule-based", rule_based, C_RULE),
    ]
    if has_llm:
        series.append((f"{TRAINING_MODEL} (GRPO trained)", llm_scores, C_TRAINED))

    positions = range(len(tasks))
    width = 0.28 if has_llm else 0.38
    _, ax = plt.subplots(figsize=(9, 5))
    for k, (legend, values, colour) in enumerate(series):
        # Spread the series symmetrically so each group stays centred on its
        # tick whether two or three series are drawn.
        offset = (k - (len(series) - 1) / 2) * width
        bars = ax.bar([i + offset for i in positions], values, width,
                      label=legend, color=colour, zorder=3)
        for bar in bars:
            height = bar.get_height()
            # Only bars above 0.01 get a value label.
            if height > 0.01:
                ax.text(bar.get_x() + bar.get_width() / 2, height + 0.01,
                        f"{height:.2f}", ha="center", va="bottom", fontsize=9)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.15)
    ax.set_ylabel("Score (0.01 – 0.99)")
    ax.set_title("Round 1 — Score Comparison")
    ax.legend(loc="upper right")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)
    save(plt, "r1_scores_comparison")
# ── Chart 2: R2 scores comparison ────────────────────────────────────────────
def chart_r2_comparison(eval_data: dict):
    """Draw the Round 2 project-score bar chart and save it as ``r2_scores_comparison``.

    For each project scenario the chart shows the Llama zero-shot baseline
    and, when at least one trained score is positive, the trained model's score.

    Args:
        eval_data: Evaluation results. Reads ``r2_llama_baseline`` (task ->
            score, falling back to ``LLAMA_BASELINE_R2``) and ``r2_llm``
            (task -> ``{"avg_score": ...}``). Missing entries count as 0.
    """
    plt, _ = _setup_matplotlib()
    tasks = ["project_easy", "project_medium", "project_hard"]
    labels = ["Easy (6 sprints)", "Medium (6 sprints)", "Hard (6 sprints)"]

    baseline = eval_data.get("r2_llama_baseline", LLAMA_BASELINE_R2)
    llama_base = [baseline.get(t, 0) for t in tasks]
    llm_scores = [eval_data.get("r2_llm", {}).get(t, {}).get("avg_score", 0) for t in tasks]
    # All-zero trained scores mean "not trained yet": hide that series.
    has_llm = any(v > 0 for v in llm_scores)

    series = [("Llama-3.1-8B (zero-shot)", llama_base, C_LLAMA)]
    if has_llm:
        series.append((f"{TRAINING_MODEL} (GRPO)", llm_scores, C_TRAINED))

    positions = range(len(tasks))
    width = 0.32 if has_llm else 0.5
    _, ax = plt.subplots(figsize=(9, 5))
    for k, (legend, values, colour) in enumerate(series):
        # Symmetric offsets: a lone baseline bar sits on its tick, and a pair
        # straddles it.
        offset = (k - (len(series) - 1) / 2) * width
        bars = ax.bar([i + offset for i in positions], values, width,
                      label=legend, color=colour, zorder=3)
        for bar in bars:
            height = bar.get_height()
            # Only bars above 0.01 get a value label.
            if height > 0.01:
                ax.text(bar.get_x() + bar.get_width() / 2, height + 0.01,
                        f"{height:.2f}", ha="center", va="bottom", fontsize=9)

    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.15)
    ax.set_ylabel("Project Score (delivery × instruction × health)")
    ax.set_title("Round 2 — Multi-Sprint Project Score")
    ax.legend(loc="upper right")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)
    save(plt, "r2_scores_comparison")
# ── Chart 3: Sprint reward curves ─────────────────────────────────────────────
def chart_sprint_rewards(eval_data: dict):
    """Plot the trained model's per-sprint reward for each R2 scenario.

    Draws one panel per scenario (easy / medium / hard) and saves the figure
    as ``sprint_rewards``. Only the first episode recorded for a scenario is
    plotted, and no baseline series is drawn. A panel without episode data is
    left empty.

    Args:
        eval_data: Evaluation results. Reads
            ``r2_llm[task]["episodes"][0]["sprint_rewards"]`` for each task.
    """
    plt, _ = _setup_matplotlib()
    scenarios = [
        ("project_easy", C_EASY, "Easy"),
        ("project_medium", C_MEDIUM, "Medium"),
        ("project_hard", C_HARD, "Hard"),
    ]
    fig, axes = plt.subplots(1, 3, figsize=(13, 4), sharey=True)
    for idx, (ax, (task, color, label)) in enumerate(zip(axes, scenarios)):
        episodes = eval_data.get("r2_llm", {}).get(task, {}).get("episodes", [])
        rewards = episodes[0].get("sprint_rewards", []) if episodes else []
        if rewards:
            ax.plot(range(1, len(rewards) + 1), rewards, "o-",
                    color=color, label="Qwen GRPO", linewidth=2, markersize=6)
            # Ask for a legend only when a labelled line exists; matplotlib
            # warns when asked to build a legend with no labelled artists.
            ax.legend(fontsize=8)
        ax.set_xlabel("Sprint")
        ax.set_title(f"{label} Project")
        # Six sprints is the standard length; widen the ticks if an episode
        # recorded more.
        ax.set_xticks(range(1, max(6, len(rewards)) + 1))
        ax.set_ylim(0, 2.2)
        ax.yaxis.grid(True)
        ax.set_axisbelow(True)
        # The y axis is shared, so only the left-most panel carries the label.
        if idx == 0:
            ax.set_ylabel("Sprint Reward")
    fig.suptitle("Sprint-by-Sprint Reward — Qwen GRPO Trained", y=1.02, fontsize=13, fontweight="bold")
    save(plt, "sprint_rewards")
# ── Chart 4: Improvement summary (main presentation slide) ───────────────────
def chart_improvement_summary(eval_data: dict):
    """Draw the main before/after chart and save it as ``improvement_summary``.

    The "before" bars are the Llama zero-shot baselines and the "after" bars
    are the trained model's scores, for the three Round 1 tasks followed by
    the three Round 2 scenarios. A green arrow and ``+delta`` label are added
    wherever the trained score beats the baseline by more than 0.01.

    Args:
        eval_data: Evaluation results. Reads ``r1_llama_baseline`` /
            ``r2_llama_baseline`` (falling back to the module constants) and
            ``r1_llm`` / ``r2_llm`` (task -> ``{"avg_score": ...}``). Missing
            entries count as 0.
    """
    plt, _ = _setup_matplotlib()
    r1_tasks = ["easy_sprint", "medium_sprint", "hard_sprint"]
    r2_tasks = ["project_easy", "project_medium", "project_hard"]
    all_tasks = (
        [f"R1: {t.replace('_sprint', '').title()}" for t in r1_tasks]
        + [f"R2: {t.replace('project_', '').title()}" for t in r2_tasks]
    )

    r1_base = eval_data.get("r1_llama_baseline", LLAMA_BASELINE_R1)
    r2_base = eval_data.get("r2_llama_baseline", LLAMA_BASELINE_R2)
    r1_llm = eval_data.get("r1_llm", {})
    r2_llm = eval_data.get("r2_llm", {})
    llama_base = [r1_base.get(t, 0) for t in r1_tasks] + [r2_base.get(t, 0) for t in r2_tasks]
    trained_scores = (
        [r1_llm.get(t, {}).get("avg_score", 0) for t in r1_tasks]
        + [r2_llm.get(t, {}).get("avg_score", 0) for t in r2_tasks]
    )

    positions = range(len(all_tasks))
    width = 0.36
    _, ax = plt.subplots(figsize=(11, 5))
    ax.bar([i - width / 2 for i in positions], llama_base, width,
           label="Before: Llama-3.1-8B (zero-shot)", color=C_LLAMA, zorder=3)
    ax.bar([i + width / 2 for i in positions], trained_scores, width,
           label=f"After: {TRAINING_MODEL} (GRPO)", color=C_TRAINED, zorder=3)

    # Delta arrows and labels, drawn only for real improvements.
    for i, (base, trained) in enumerate(zip(llama_base, trained_scores)):
        if trained > base + 0.01:
            ax.annotate("", xy=(i + width / 2, trained + 0.02), xytext=(i - width / 2, base + 0.02),
                        arrowprops=dict(arrowstyle="->", color="#059669", lw=1.5))
            ax.text(i, max(base, trained) + 0.06, f"+{trained - base:.2f}",
                    ha="center", fontsize=8, color="#059669", fontweight="bold")

    ax.set_xticks(list(positions))
    ax.set_xticklabels(all_tasks, rotation=15, ha="right")
    ax.set_ylim(0, 1.25)
    ax.set_ylabel("Score")
    ax.set_title("Reward Improvement: Llama Zero-Shot → Qwen2.5-1.5B GRPO Trained")
    ax.legend(loc="upper left")
    ax.yaxis.grid(True, zorder=0)
    ax.set_axisbelow(True)

    # Divider between the R1 and R2 groups, with a caption centred over each.
    ax.axvline(x=len(r1_tasks) - 0.5, color="#9CA3AF", linestyle=":", linewidth=1.5)
    ax.text((len(r1_tasks) - 1) / 2, 1.20, "Round 1", ha="center", fontsize=10, color="#6B7280")
    ax.text(len(r1_tasks) + (len(r2_tasks) - 1) / 2, 1.20, "Round 2", ha="center", fontsize=10, color="#6B7280")
    save(plt, "improvement_summary")
# ── Chart 5: Training loss/reward curve ───────────────────────────────────────
def chart_training_curve(trainer_state_path: str):
    """Plot loss and mean reward against training step and save ``training_curve``.

    Reads the ``log_history`` list from the given JSON file. Entries with a
    ``loss`` key feed the left panel and entries with a ``reward`` key feed
    the right panel. The chart is skipped, with a message, when the file has
    no ``log_history`` or none of its entries carry either key.

    Args:
        trainer_state_path: Path to a ``trainer_state.json`` file.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(trainer_state_path, encoding="utf-8") as f:
        state = json.load(f)
    log_history = state.get("log_history", [])
    if not log_history:
        print(" [SKIP] No log_history in trainer_state.json", flush=True)
        return

    loss_points = [(e.get("step", 0), e["loss"]) for e in log_history if "loss" in e]
    reward_points = [(e.get("step", 0), e["reward"]) for e in log_history if "reward" in e]
    if not loss_points and not reward_points:
        # Nothing to draw, so do not write an empty figure.
        print(" [SKIP] No loss or reward entries in log_history", flush=True)
        return

    plt, _ = _setup_matplotlib()
    fig, (loss_ax, reward_ax) = plt.subplots(1, 2, figsize=(11, 4))
    if loss_points:
        steps, losses = zip(*loss_points)
        loss_ax.plot(steps, losses, color=C_RULE, linewidth=2)
        loss_ax.set_xlabel("Training Step")
        loss_ax.set_ylabel("Loss")
        loss_ax.set_title("GRPO Training Loss (Qwen2.5-1.5B)")
        loss_ax.yaxis.grid(True)
    if reward_points:
        rsteps, rvals = zip(*reward_points)
        reward_ax.plot(rsteps, rvals, color=C_TRAINED, linewidth=2)
        reward_ax.set_xlabel("Training Step")
        reward_ax.set_ylabel("Mean Reward")
        reward_ax.set_title("GRPO Mean Reward per Step")
        reward_ax.yaxis.grid(True)
    fig.suptitle(f"GRPO Training Curves — {TRAINING_MODEL}", fontsize=13, fontweight="bold")
    save(plt, "training_curve")
# ── Demo mode (hardcoded real baselines, placeholder trained scores) ──────────
def demo_mode():
    """Build an evaluation dict from the measured baselines and placeholder scores.

    The Llama baselines are the module constants. Every trained score is a
    0.0 placeholder, to be replaced with real evaluate_r2.py output after
    on-site training.

    Returns:
        A dict with the keys the chart functions read (``r1_llama_baseline``,
        ``r2_llama_baseline``, ``r1_rule_based``, ``r1_llm``, ``r2_rule_based``,
        ``r2_llm``) plus an ``improvement`` summary per R2 scenario. The
        baseline dicts are copies, so callers may mutate the result without
        touching the module constants.
    """
    print("[INFO] Demo mode — real Llama baselines, placeholder trained scores", flush=True)
    print(f"[INFO] Training model: {TRAINING_MODEL}", flush=True)

    # Placeholder trained scores: update after on-site GRPO training.
    placeholder_r1_trained = {
        "easy_sprint": 0.0,
        "medium_sprint": 0.0,
        "hard_sprint": 0.0,
    }
    placeholder_r2_trained = {
        "project_easy": 0.0,
        "project_medium": 0.0,
        "project_hard": 0.0,
    }

    return {
        # Copy the constants so the returned dict never aliases them.
        "r1_llama_baseline": dict(LLAMA_BASELINE_R1),
        "r2_llama_baseline": dict(LLAMA_BASELINE_R2),
        "r1_rule_based": {
            "easy_sprint": {"avg_score": 0.92},
            "medium_sprint": {"avg_score": 0.35},
            "hard_sprint": {"avg_score": 0.01},
        },
        "r1_llm": {
            task: {"avg_score": score} for task, score in placeholder_r1_trained.items()
        },
        "r2_rule_based": {},
        "r2_llm": {
            task: {"avg_score": score, "episodes": []}
            for task, score in placeholder_r2_trained.items()
        },
        "improvement": {
            task: {
                "llama_baseline": LLAMA_BASELINE_R2[task],
                "trained_llm": score,
                "delta_vs_llama": round(score - LLAMA_BASELINE_R2[task], 4),
            }
            for task, score in placeholder_r2_trained.items()
        },
    }
# ── CLI ───────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: load evaluation data and generate every chart.

    Data comes from ``demo_mode()`` when ``--demo`` is given or no ``--eval``
    path is supplied; otherwise from the ``--eval`` JSON file, with missing
    baseline keys back-filled from the module constants. Charts 1-4 are always
    produced. Chart 5 is produced only when ``--trainer`` names an existing file.

    Exits with status 1 when matplotlib is not installed.
    """
    import sys

    parser = argparse.ArgumentParser(description="Generate result charts for presentation")
    parser.add_argument("--eval", type=str, default=None,
                        help="Path to r2_evaluation.json from evaluate_r2.py")
    parser.add_argument("--trainer", type=str, default=None,
                        help="Path to trainer_state.json from train_llm.py output")
    parser.add_argument("--demo", action="store_true",
                        help="Generate charts with real baselines + placeholder trained scores")
    args = parser.parse_args()

    # Probe for matplotlib up front so a missing install gives one clear
    # message instead of a traceback from the first chart.
    try:
        import matplotlib  # noqa: F401
    except ImportError:
        print("[ERROR] matplotlib not installed. Run: pip install matplotlib", flush=True)
        sys.exit(1)

    if args.demo:
        eval_data = demo_mode()
    elif args.eval:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(args.eval, encoding="utf-8") as f:
            eval_data = json.load(f)
        # Back-fill baseline keys if running against old JSON format
        if "r1_llama_baseline" not in eval_data:
            eval_data["r1_llama_baseline"] = LLAMA_BASELINE_R1
        if "r2_llama_baseline" not in eval_data:
            eval_data["r2_llama_baseline"] = LLAMA_BASELINE_R2
    else:
        print("[INFO] No --eval file specified. Using --demo mode.", flush=True)
        eval_data = demo_mode()

    print(f"\nGenerating charts → {CHARTS_DIR}/", flush=True)
    print(f" Baselines: R1 avg={LLAMA_BASELINE_R1['average']:.4f} R2 avg={LLAMA_BASELINE_R2['average']:.4f}", flush=True)
    print(f" Training model: {TRAINING_MODEL}", flush=True)

    print(" Chart 1: R1 scores comparison...", flush=True)
    chart_r1_comparison(eval_data)
    print(" Chart 2: R2 scores comparison...", flush=True)
    chart_r2_comparison(eval_data)
    print(" Chart 3: Sprint reward curves...", flush=True)
    chart_sprint_rewards(eval_data)
    print(" Chart 4: Improvement summary...", flush=True)
    chart_improvement_summary(eval_data)

    if not args.trainer:
        print(" Chart 5: Training curve — skipped (no --trainer file provided)", flush=True)
    elif not Path(args.trainer).exists():
        # A path was given but is wrong: say so instead of claiming that
        # no file was provided.
        print(f" Chart 5: Training curve — skipped (--trainer file not found: {args.trainer})", flush=True)
    else:
        print(" Chart 5: Training curve...", flush=True)
        chart_training_curve(args.trainer)

    print(f"\n✅ All charts saved to {CHARTS_DIR}/", flush=True)
    print(" Use improvement_summary.png in your HF blog post and pitch slides.", flush=True)


if __name__ == "__main__":
    main()