"""Plot the Meta-Signal benchmark results as a two-panel bar chart.

Run:
    python plot_results.py

Saves:
    results/meta-signal-results.png
"""
import os

import matplotlib

# The backend must be selected before pyplot is imported.
matplotlib.use("Agg")

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D

os.makedirs("results", exist_ok=True)

# ── Data ──────────────────────────────────────────────────────────────────────
# ExpertBot score per task; keys double as the (multi-line) x tick labels of panel 1.
EXPERT_ALL = {
    "T1\nBudget\nOptimisation": 0.43,
    "T2\nNoisy Signal\nRecovery": 0.54,
    "T3\nPrivacy\nFrontier": 0.72,
    "T4\nAdversarial\nRegulator": 0.60,
    "T5\nSignal\nRecovery": 0.800,
    "T6\nAndromeda\nStability": 0.864,
    "T7\nQ4\nChampion": 0.850,
}

# The first N_CORE_TASKS bars of panel 1 are drawn as "Core Tasks"; the rest as
# "Q4 Gauntlet". The separator line and group captions are derived from this.
N_CORE_TASKS = 4

# Base model (Llama-3.1-8B, no fine-tuning): 3 seeds per task (from live API)
EQUAL_SEEDS = {
    "T5": [0.3956, 0.4511, 0.5900],
    "T6": [0.5333, 0.5400, 0.4917],
    "T7": [0.4623, 0.4623, 0.7090],
}

# Fine-tuned: 3 seeds per task
FT_SEEDS = {
    "T5": [0.800, 0.800, 0.800],
    "T6": [0.9496, 0.9487, 0.9484],
    "T7": [0.850, 0.850, 0.850],
}

EXPERT_Q4 = {"T5": 0.800, "T6": 0.864, "T7": 0.850}

# Per-task mean over the seeds; these are the bar heights in panel 2.
EQUAL_AVG = {k: np.mean(v) for k, v in EQUAL_SEEDS.items()}
FT_AVG = {k: np.mean(v) for k, v in FT_SEEDS.items()}

TASK_LABELS = {
    "T5": "Task 5\nSignal Recovery\n(30 steps)",
    "T6": "Task 6\nAndromeda\nStability (75 steps)",
    "T7": "Task 7\nQ4 Champion\n(100 steps)",
}

# Series labels, shared by the bar artists and the legend so they cannot drift apart.
LABEL_BASE = "Base model — no fine-tuning (avg 3 seeds)"
LABEL_EXPERT = "ExpertBot (hand-coded expert)"
LABEL_FT = "Fine-tuned Llama-3.1-8B QLoRA (avg 3 seeds)"
LABEL_SEED = "Individual seed scores"

# ── Colors ────────────────────────────────────────────────────────────────────
C_RANDOM = "#95A5A6"    # used for the base-model (no fine-tuning) series
C_EXPERT = "#4A90D9"
C_GAUNTLET = "#6C5CE7"  # Q4 Gauntlet bars and caption in panel 1
C_FT = "#E8874A"
C_SEED = "#333333"
BG = "#F7F9FC"

# Scatter marker area (points^2) and opacity for the per-seed dots; the legend
# proxy reuses both so its glyph matches what is drawn.
SEED_DOT_AREA = 22
SEED_DOT_ALPHA = 0.85

# ── Figure ────────────────────────────────────────────────────────────────────
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6), facecolor=BG)
fig.subplots_adjust(wspace=0.38)

# ── Panel 1: ExpertBot baseline — all tasks ───────────────────────────────────
labels1 = list(EXPERT_ALL.keys())
scores1 = list(EXPERT_ALL.values())
n_tasks1 = len(labels1)
colors1 = [C_EXPERT if i < N_CORE_TASKS else C_GAUNTLET for i in range(n_tasks1)]

bars1 = ax1.bar(labels1, scores1, color=colors1, width=0.55,
                edgecolor="white", linewidth=1.2, zorder=3)

# Value label just above each bar.
for bar, score in zip(bars1, scores1):
    ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.015,
             f"{score:.2f}", ha="center", va="bottom",
             fontsize=9.5, fontweight="bold", color="#333333")

# Categorical bars sit at x = 0, 1, 2, ...; the separator goes halfway between
# the last core bar and the first gauntlet bar, and each group caption is
# centred over its own bars.
ax1.axvline(x=N_CORE_TASKS - 0.5, color="#999999", linestyle="--",
            linewidth=1.2, zorder=2)
ax1.text((N_CORE_TASKS - 1) / 2, 0.92, "Core Tasks", ha="center",
         fontsize=9, color=C_EXPERT, fontweight="semibold")
ax1.text((N_CORE_TASKS + n_tasks1 - 1) / 2, 0.92, "Q4 Gauntlet", ha="center",
         fontsize=9, color=C_GAUNTLET, fontweight="semibold")

ax1.set_ylim(0, 1.0)
ax1.set_ylabel("Score (0 – 1)", fontsize=11)
ax1.set_title("ExpertBot Baseline — All 7 Tasks", fontsize=13,
              fontweight="bold", pad=14)
ax1.set_facecolor(BG)
ax1.yaxis.grid(True, linestyle="--", alpha=0.5, zorder=0)
ax1.set_axisbelow(True)
ax1.spines[["top", "right"]].set_visible(False)
ax1.tick_params(axis="x", labelsize=8)

# ── Panel 2: Base model → ExpertBot → Fine-tuned — Q4 tasks ───────────────────
tasks = list(TASK_LABELS.keys())
x = np.arange(len(tasks))
width = 0.24

bars_r = ax2.bar(x - width, [EQUAL_AVG[t] for t in tasks], width,
                 label=LABEL_BASE, color=C_RANDOM,
                 edgecolor="white", linewidth=1.2, zorder=3)
bars_e = ax2.bar(x, [EXPERT_Q4[t] for t in tasks], width,
                 label=LABEL_EXPERT, color=C_EXPERT,
                 edgecolor="white", linewidth=1.2, zorder=3)
bars_f = ax2.bar(x + width, [FT_AVG[t] for t in tasks], width,
                 label=LABEL_FT, color=C_FT,
                 edgecolor="white", linewidth=1.2, zorder=3)

# Seed dots on fine-tuned bars
for i, t in enumerate(tasks):
    seeds = FT_SEEDS[t]
    ax2.scatter(np.full(len(seeds), x[i] + width), seeds, color=C_SEED,
                s=SEED_DOT_AREA, zorder=5, alpha=SEED_DOT_ALPHA)

# Score labels: one pass per series, each with its own text colour.
for series, text_color in ((bars_r, "#666666"), (bars_e, C_EXPERT), (bars_f, C_FT)):
    for bar in series:
        ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.010,
                 f"{bar.get_height():.3f}", ha="center", va="bottom",
                 fontsize=7.5, fontweight="bold", color=text_color)

# Delta annotations (fine-tuned vs ExpertBot)
for i, t in enumerate(tasks):
    # Round to the displayed precision *before* choosing sign and colour, so a
    # float rounding error can never render as a red "-0.000". Adding 0.0
    # turns a possible -0.0 into +0.0.
    delta = round(float(FT_AVG[t] - EXPERT_Q4[t]), 3) + 0.0
    delta_color = "#27AE60" if delta >= 0 else "#E74C3C"
    ax2.text(x[i] + width, max(EXPERT_Q4[t], FT_AVG[t]) + 0.052,
             f"{delta:+.3f}", ha="center", fontsize=8.5,
             color=delta_color, fontweight="bold")

ax2.set_ylim(0, 1.05)
ax2.set_ylabel("Score (0 – 1)", fontsize=11)
ax2.set_title("Reward Improvement: Base Model → ExpertBot → Fine-tuned\n"
              "Q4 Gauntlet Tasks (3 seeds each)",
              fontsize=12, fontweight="bold", pad=14)
ax2.set_xticks(x)
ax2.set_xticklabels([TASK_LABELS[t] for t in tasks], fontsize=8.5)
ax2.set_facecolor(BG)
ax2.yaxis.grid(True, linestyle="--", alpha=0.5, zorder=0)
ax2.set_axisbelow(True)
ax2.spines[["top", "right"]].set_visible(False)

# Explicit proxy handles: rectangles for the bar series and a circular marker
# for the seed dots, so every legend glyph matches the artist drawn on the axes.
ax2.legend(
    handles=[
        mpatches.Patch(color=C_RANDOM, label=LABEL_BASE),
        mpatches.Patch(color=C_EXPERT, label=LABEL_EXPERT),
        mpatches.Patch(color=C_FT, label=LABEL_FT),
        # scatter's `s` is an area in points^2; Line2D's markersize is a
        # diameter-like length in points, hence the square root.
        Line2D([], [], marker="o", linestyle="None", color=C_SEED,
               markersize=SEED_DOT_AREA ** 0.5, alpha=SEED_DOT_ALPHA,
               label=LABEL_SEED),
    ],
    fontsize=8, loc="lower right", framealpha=0.85,
)

# ── Shared caption ────────────────────────────────────────────────────────────
fig.text(
    0.5, 0.01,
    "Fine-tuned: Llama-3.1-8B-Instruct + QLoRA (rank=16), 1 epoch on ~41k expert demos  |  "
    "Loss: 0.1080  |  Avg improvement over base model: +67% T5, +82% T6, +56% T7  |  Task 6 BEATS ExpertBot by +8.5 pts",
    ha="center", fontsize=8.5, color="#555555",
)

fig.suptitle("Meta-Signal: Privacy-Constrained Ad Budget Optimisation",
             fontsize=15, fontweight="bold", y=1.01, color="#222222")

# ── Save ──────────────────────────────────────────────────────────────────────
out = "results/meta-signal-results.png"
# bbox_inches="tight" keeps the suptitle (y > 1) and the bottom caption inside
# the saved image.
fig.savefig(out, dpi=180, bbox_inches="tight", facecolor=BG)
plt.close(fig)
print(f"Saved: {out}")