"""Generate dashboard plots locally from the committed trace JSONs. Use this when ``trainer_state.json`` is not available locally (it lives only on the Colab Drive). The 24 trace files in ``data/traces/`` are enough to produce ``bypass_bars.png`` and ``per_category.png`` honestly. Usage: python scripts/plots_from_traces.py --traces data/traces --out docs/plots """ from __future__ import annotations import argparse import json from collections import defaultdict from pathlib import Path from typing import Dict, List # Same baseline numbers used by scripts/make_plots.py (handcrafted-corpus side) BASELINES = { "PG2 Bypass": 0.15, "FW Bypass": 0.20, "Task Success": 0.05, "Composed Bypass": 0.02, } def _aggregate(traces_dir: Path) -> Dict: files = sorted(traces_dir.glob("*.json")) n = len(files) if n == 0: raise SystemExit(f"No traces in {traces_dir}") pg2 = fw = task = composed = 0 by_type: Dict[str, Dict[str, int]] = defaultdict(lambda: {"n": 0, "task": 0, "composed": 0, "pg2": 0, "fw": 0}) for f in files: with f.open() as fh: t = json.load(fh) o = t.get("outcome", {}) atype = t.get("attack_type", "?") by_type[atype]["n"] += 1 if o.get("broke_pg2"): pg2 += 1; by_type[atype]["pg2"] += 1 if o.get("broke_fw"): fw += 1; by_type[atype]["fw"] += 1 if o.get("task_succeeded"): task += 1; by_type[atype]["task"] += 1 if o.get("composed_bypass"): composed += 1; by_type[atype]["composed"] += 1 return { "n": n, "pg2_rate": pg2 / n, "fw_rate": fw / n, "task_rate": task / n, "composed_rate": composed / n, "by_type": dict(by_type), } def _plot_bypass_bars(stats: Dict, out: Path) -> None: import matplotlib.pyplot as plt import numpy as np metrics = { "PG2 Bypass": stats["pg2_rate"], "FW Bypass": stats["fw_rate"], "Task Success": stats["task_rate"], "Composed Bypass": stats["composed_rate"], } x = np.arange(len(metrics)) w = 0.35 fig, ax = plt.subplots(figsize=(9, 5)) b1 = ax.bar(x - w / 2, [BASELINES[k] for k in metrics], w, label="Handcrafted Baseline", color="#94a3b8", edgecolor="white") b2 = ax.bar(x + w / 2, [metrics[k] for k in metrics], w, label="InjectArena (RL-trained)", color="#3b82f6", edgecolor="white") ax.set_ylabel("Rate") ax.set_title(f"InjectArena — Attacker Performance vs Baseline (n={stats['n']} traces)") ax.set_xticks(x) ax.set_xticklabels(list(metrics.keys())) ax.set_ylim(0, 1.05) ax.legend() ax.grid(axis="y", alpha=0.3) for bar in list(b1) + list(b2): h = bar.get_height() if h > 0.005: ax.text(bar.get_x() + bar.get_width() / 2, h + 0.015, f"{h:.0%}", ha="center", va="bottom", fontsize=9) out_path = out / "bypass_bars.png" plt.tight_layout() plt.savefig(out_path, dpi=150, bbox_inches="tight") plt.close() print(f"Saved {out_path}") def _plot_per_category(stats: Dict, out: Path) -> None: import matplotlib.pyplot as plt import numpy as np by_type = stats["by_type"] types = sorted(by_type.keys()) pg2_rates = [by_type[t]["pg2"] / by_type[t]["n"] for t in types] fw_rates = [by_type[t]["fw"] / by_type[t]["n"] for t in types] task_rates = [by_type[t]["task"] / by_type[t]["n"] for t in types] x = np.arange(len(types)) w = 0.27 fig, ax = plt.subplots(figsize=(10, 5)) ax.bar(x - w, pg2_rates, w, label="PG2 Bypass", color="#3b82f6", edgecolor="white") ax.bar(x, fw_rates, w, label="FW Bypass", color="#1d4ed8", edgecolor="white") ax.bar(x + w, task_rates, w, label="Task Success", color="#22c55e", edgecolor="white") ax.set_ylabel("Rate") ax.set_title("InjectArena — Per Attack Category (across all step counts)") ax.set_xticks(x) ax.set_xticklabels([t.replace("_", " ").title() for t in types]) ax.set_ylim(0, 1.05) ax.legend() ax.grid(axis="y", alpha=0.3) out_path = out / "per_category.png" plt.tight_layout() plt.savefig(out_path, dpi=150, bbox_inches="tight") plt.close() print(f"Saved {out_path}") def _plot_step_curve(traces_dir: Path, out: Path) -> None: """Bypass rate vs training-step label — the 'progression' visual.""" import matplotlib.pyplot as plt from collections import defaultdict by_step: Dict[int, Dict[str, int]] = defaultdict(lambda: {"n": 0, "pg2": 0, "fw": 0, "task": 0}) for f in sorted(traces_dir.glob("*.json")): with f.open() as fh: t = json.load(fh) s = t.get("steps") if s is None: continue o = t.get("outcome", {}) by_step[s]["n"] += 1 if o.get("broke_pg2"): by_step[s]["pg2"] += 1 if o.get("broke_fw"): by_step[s]["fw"] += 1 if o.get("task_succeeded"): by_step[s]["task"] += 1 if not by_step: return xs = sorted(by_step.keys()) pg2 = [by_step[s]["pg2"] / by_step[s]["n"] for s in xs] fw = [by_step[s]["fw"] / by_step[s]["n"] for s in xs] task = [by_step[s]["task"] / by_step[s]["n"] for s in xs] fig, ax = plt.subplots(figsize=(10, 5)) ax.plot(xs, pg2, marker="o", linewidth=2, label="PG2 Bypass", color="#3b82f6") ax.plot(xs, fw, marker="s", linewidth=2, label="FW Bypass", color="#1d4ed8") ax.plot(xs, task, marker="^", linewidth=2, label="Task Success", color="#22c55e") ax.set_xlabel("Attacker training steps") ax.set_ylabel("Bypass rate") ax.set_title("InjectArena — Bypass Rate by Training Step Count") ax.set_ylim(0, 1.05) ax.legend() ax.grid(alpha=0.3) out_path = out / "reward_curve.png" # reuse this filename so the dashboard finds it plt.tight_layout() plt.savefig(out_path, dpi=150, bbox_inches="tight") plt.close() print(f"Saved {out_path}") def main() -> None: p = argparse.ArgumentParser() p.add_argument("--traces", default="data/traces") p.add_argument("--out", default="docs/plots") args = p.parse_args() traces = Path(args.traces) out = Path(args.out) out.mkdir(parents=True, exist_ok=True) import matplotlib matplotlib.use("Agg") stats = _aggregate(traces) print(f"Aggregated {stats['n']} traces " f"PG2={stats['pg2_rate']:.0%} FW={stats['fw_rate']:.0%} " f"Task={stats['task_rate']:.0%} Composed={stats['composed_rate']:.0%}") _plot_bypass_bars(stats, out) _plot_per_category(stats, out) _plot_step_curve(traces, out) if __name__ == "__main__": main()