Spaces:

Anvit25
/

meta-signal

Sleeping

App Files Files Community

meta-signal / plot_results.py

Anvit25

feat: add base model baseline to plot

df6e4f9 about 1 month ago

raw

history blame contribute delete

7.52 kB

	"""
	Run: python plot_results.py
	Saves: results/meta-signal-results.png
	"""

	import os
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	import numpy as np

	# Output directory for the rendered figure; a no-op when it already exists.
	os.makedirs("results", exist_ok=True)

	# ── Data ──────────────────────────────────────────────────────────────────────

	# Multi-line x-tick labels for the three tasks shown in panel 2, keyed by task id.
	TASK_LABELS = {
	    "T5": "Task 5\nSignal Recovery\n(30 steps)",
	    "T6": "Task 6\nAndromeda\nStability (75 steps)",
	    "T7": "Task 7\nQ4 Champion\n(100 steps)",
	}

	# ExpertBot score per task. The keys are used verbatim as the x-tick labels of
	# panel 1, so insertion order is also the left-to-right bar order.
	EXPERT_ALL = {
	    "T1\nBudget\nOptimisation": 0.43,
	    "T2\nNoisy Signal\nRecovery": 0.54,
	    "T3\nPrivacy\nFrontier": 0.72,
	    "T4\nAdversarial\nRegulator": 0.60,
	    "T5\nSignal\nRecovery": 0.800,
	    "T6\nAndromeda\nStability": 0.864,
	    "T7\nQ4\nChampion": 0.850,
	}

	# ExpertBot scores for the panel-2 tasks, keyed by the short task id.
	EXPERT_Q4 = {
	    "T5": 0.800,
	    "T6": 0.864,
	    "T7": 0.850,
	}

	# Base model (Llama-3.1-8B, no fine-tuning): 3 seeds per task (from live API)
	EQUAL_SEEDS = {
	    "T5": [0.3956, 0.4511, 0.5900],
	    "T6": [0.5333, 0.5400, 0.4917],
	    "T7": [0.4623, 0.4623, 0.7090],
	}

	# Fine-tuned: 3 seeds per task
	FT_SEEDS = {
	    "T5": [0.800, 0.800, 0.800],
	    "T6": [0.9496, 0.9487, 0.9484],
	    "T7": [0.850, 0.850, 0.850],
	}

	# Per-task mean over the seed scores above.
	EQUAL_AVG = {task: np.mean(scores) for task, scores in EQUAL_SEEDS.items()}
	FT_AVG = {task: np.mean(scores) for task, scores in FT_SEEDS.items()}

	# ── Colors ────────────────────────────────────────────────────────────────────
	C_RANDOM = "#95A5A6"  # base-model bars
	C_EXPERT = "#4A90D9"  # ExpertBot bars
	C_FT = "#E8874A"      # fine-tuned bars
	C_SEED = "#333333"    # individual seed dots
	BG = "#F7F9FC"        # figure and axes background

	# ── Figure ────────────────────────────────────────────────────────────────────
	# One figure with two side-by-side panels sharing the same background colour.
	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6), facecolor=BG)
	fig.subplots_adjust(wspace=0.38)

	# ── Panel 1: ExpertBot baseline — all tasks ───────────────────────────────────
	# One bar per EXPERT_ALL entry. The first `n_core` bars are the "Core Tasks",
	# the remainder the "Q4 Gauntlet"; colours, divider and section captions are
	# all derived from that single split so they stay in step with the data.
	n_core = 4
	c_gauntlet = "#6C5CE7"

	labels1 = list(EXPERT_ALL)
	scores1 = list(EXPERT_ALL.values())
	n_bars = len(labels1)
	colors1 = [C_EXPERT if idx < n_core else c_gauntlet
	           for idx, _ in enumerate(labels1)]

	bars1 = ax1.bar(labels1, scores1, color=colors1, width=0.55,
	                edgecolor="white", linewidth=1.2, zorder=3)

	# Numeric score just above each bar.
	for bar, score in zip(bars1, scores1):
	    ax1.text(bar.get_x() + bar.get_width() / 2,
	             bar.get_height() + 0.015,
	             f"{score:.2f}",
	             ha="center", va="bottom", fontsize=9.5, fontweight="bold",
	             color="#333333")

	# Categorical bars sit at x = 0, 1, 2, ...; each caption is centred over its group.
	ax1.text((n_core - 1) / 2, 0.92, "Core Tasks", ha="center", fontsize=9,
	         color=C_EXPERT, fontweight="semibold")
	if n_bars > n_core:
	    # Dashed divider halfway between the last core bar and the first gauntlet bar.
	    ax1.axvline(x=n_core - 0.5, color="#999999", linestyle="--",
	                linewidth=1.2, zorder=2)
	    ax1.text((n_core + n_bars - 1) / 2, 0.92, "Q4 Gauntlet", ha="center",
	             fontsize=9, color=c_gauntlet, fontweight="semibold")

	ax1.set_ylim(0, 1.0)
	ax1.set_ylabel("Score (0 – 1)", fontsize=11)
	ax1.set_title("ExpertBot Baseline — All 7 Tasks", fontsize=13, fontweight="bold",
	              pad=14)
	ax1.set_facecolor(BG)
	ax1.yaxis.grid(True, linestyle="--", alpha=0.5, zorder=0)
	ax1.set_axisbelow(True)
	ax1.spines[["top", "right"]].set_visible(False)
	ax1.tick_params(axis="x", labelsize=8)

	# ── Panel 2: Base model → ExpertBot → Fine-tuned — Q4 tasks ───────────────────
	# Three grouped bars per task in TASK_LABELS, plus per-seed dots, value labels
	# and a fine-tuned-vs-ExpertBot delta above each group.
	tasks = list(TASK_LABELS)
	x = np.arange(len(tasks))
	width = 0.24

	# (x offset, per-task scores, legend label, colour) — left to right in a group.
	series = [
	    (-width, EQUAL_AVG, "Base model — no fine-tuning (avg 3 seeds)", C_RANDOM),
	    (0, EXPERT_Q4, "ExpertBot (hand-coded expert)", C_EXPERT),
	    (width, FT_AVG, "Fine-tuned Llama-3.1-8B QLoRA (avg 3 seeds)", C_FT),
	]
	bars_r, bars_e, bars_f = [
	    ax2.bar(x + offset, [scores[task] for task in tasks], width,
	            label=label, color=colour,
	            edgecolor="white", linewidth=1.2, zorder=3)
	    for offset, scores, label, colour in series
	]

	# Seed dots on fine-tuned bars
	for xpos, task in zip(x, tasks):
	    for seed_score in FT_SEEDS[task]:
	        ax2.scatter(xpos + width, seed_score,
	                    color=C_SEED, s=22, zorder=5, alpha=0.85)

	# Score labels: bar height printed just above every bar, tinted per series.
	for bars, text_colour in ((bars_r, "#666666"), (bars_e, C_EXPERT), (bars_f, C_FT)):
	    for bar in bars:
	        height = bar.get_height()
	        ax2.text(bar.get_x() + bar.get_width() / 2,
	                 height + 0.010,
	                 f"{height:.3f}",
	                 ha="center", va="bottom", fontsize=7.5, fontweight="bold",
	                 color=text_colour)

	# Delta annotations (fine-tuned vs ExpertBot): green for a gain, red for a loss,
	# placed above whichever of the two bars is taller.
	for xpos, task in zip(x, tasks):
	    delta = FT_AVG[task] - EXPERT_Q4[task]
	    is_gain = delta >= 0
	    prefix = "+" if is_gain else ""
	    ax2.text(xpos + width, max(EXPERT_Q4[task], FT_AVG[task]) + 0.052,
	             f"{prefix}{delta:.3f}",
	             ha="center", fontsize=8.5,
	             color="#27AE60" if is_gain else "#E74C3C",
	             fontweight="bold")

	ax2.set_ylim(0, 1.05)
	ax2.set_ylabel("Score (0 – 1)", fontsize=11)
	ax2.set_title("Reward Improvement: Base Model → ExpertBot → Fine-tuned\nQ4 Gauntlet Tasks (3 seeds each)",
	              fontsize=12, fontweight="bold", pad=14)
	ax2.set_xticks(x)
	ax2.set_xticklabels([TASK_LABELS[task] for task in tasks], fontsize=8.5)
	ax2.set_facecolor(BG)
	ax2.yaxis.grid(True, linestyle="--", alpha=0.5, zorder=0)
	ax2.set_axisbelow(True)
	ax2.spines[["top", "right"]].set_visible(False)

	# Explicit handles: one patch per bar series plus an entry for the seed dots.
	legend_handles = [mpatches.Patch(color=colour, label=label)
	                  for _, _, label, colour in series]
	legend_handles.append(mpatches.Patch(color=C_SEED, label="Individual seed scores"))
	ax2.legend(handles=legend_handles, fontsize=8, loc="lower right", framealpha=0.85)

	# ── Shared caption ─────────────────────────────────────────────────────────────
	# Footer caption centred under both panels. The percentages and the "+8.5 pts"
	# figure are hard-coded text; they agree with EQUAL_SEEDS / FT_SEEDS / EXPERT_Q4
	# as currently defined and must be edited by hand if that data changes.
	# The separators are plain "|": a "\|" here is an invalid escape sequence and
	# would also draw a literal backslash in the rendered caption.
	fig.text(
	    0.5, 0.01,
	    "Fine-tuned: Llama-3.1-8B-Instruct + QLoRA (rank=16), 1 epoch on ~41k expert demos | "
	    "Loss: 0.1080 | Avg improvement over base model: +67% T5, +82% T6, +56% T7 | Task 6 BEATS ExpertBot by +8.5 pts",
	    ha="center", fontsize=8.5, color="#555555",
	)

	# Overall title, nudged just above the axes area.
	fig.suptitle("Meta-Signal: Privacy-Constrained Ad Budget Optimisation",
	             fontsize=15, fontweight="bold", y=1.01, color="#222222")

	# ── Save ───────────────────────────────────────────────────────────────────────
	# bbox_inches="tight" keeps the suptitle and footer inside the saved image;
	# facecolor is passed again so the PNG background matches the figure.
	out = "results/meta-signal-results.png"
	fig.savefig(out, dpi=180, bbox_inches="tight", facecolor=BG)
	print(f"Saved: {out}")