# Source page: Inject-Arena / scripts / plots_from_traces.py
# Page header (avatar alt text): Jaswanth1210's picture
# Commit message: feat: wire frontend to real API, ship plots, multi-stage Docker build
# Commit: 5cceafb
"""Generate dashboard plots locally from the committed trace JSONs.
Use this when ``trainer_state.json`` is not available locally (it lives only on
the Colab Drive). The 24 trace files in ``data/traces/`` are enough to produce
``bypass_bars.png`` and ``per_category.png`` honestly. When the traces carry a
``steps`` field, ``reward_curve.png`` is written as well.

Usage:
    python scripts/plots_from_traces.py --traces data/traces --out docs/plots
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List
# Same baseline numbers used by scripts/make_plots.py (handcrafted-corpus side)
# Keys are the metric labels used for the x-axis ticks of bypass_bars.png;
# values are the rates drawn as the "Handcrafted Baseline" bars.
BASELINES: Dict[str, float] = {
    "PG2 Bypass": 0.15,
    "FW Bypass": 0.20,
    "Task Success": 0.05,
    "Composed Bypass": 0.02,
}
def _aggregate(traces_dir: Path) -> Dict:
    """Tally outcome flags over every ``*.json`` trace in *traces_dir*.

    Each trace is read as a JSON object. Its ``outcome`` mapping is checked
    for the truthy flags ``broke_pg2``, ``broke_fw``, ``task_succeeded`` and
    ``composed_bypass``. Its ``attack_type`` selects the per-category bucket;
    a missing, null or empty ``attack_type`` falls into the ``"?"`` bucket.

    Args:
        traces_dir: Directory globbed (non-recursively) for ``*.json`` files.

    Returns:
        A dict with ``n`` (number of trace files), the overall ``pg2_rate``,
        ``fw_rate``, ``task_rate`` and ``composed_rate`` (each count / ``n``),
        and ``by_type``: attack type -> raw counts under the keys ``n``,
        ``task``, ``composed``, ``pg2`` and ``fw``.

    Raises:
        SystemExit: If the directory contains no ``*.json`` files.
    """
    files = sorted(traces_dir.glob("*.json"))
    n = len(files)
    if n == 0:
        raise SystemExit(f"No traces in {traces_dir}")

    # (flag name inside the trace's "outcome", counter key used in the tallies)
    flag_keys = (
        ("broke_pg2", "pg2"),
        ("broke_fw", "fw"),
        ("task_succeeded", "task"),
        ("composed_bypass", "composed"),
    )
    totals = {key: 0 for _, key in flag_keys}
    by_type: Dict[str, Dict[str, int]] = defaultdict(
        lambda: {"n": 0, "task": 0, "composed": 0, "pg2": 0, "fw": 0}
    )

    for path in files:
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with path.open(encoding="utf-8") as fh:
            trace = json.load(fh)
        # `or {}` also covers an explicit `"outcome": null`.
        outcome = trace.get("outcome") or {}
        # A null attack type would become a None key, which later breaks
        # sorting and label formatting of the categories.
        bucket = by_type[trace.get("attack_type") or "?"]
        bucket["n"] += 1
        for flag, key in flag_keys:
            if outcome.get(flag):
                totals[key] += 1
                bucket[key] += 1

    return {
        "n": n,
        "pg2_rate": totals["pg2"] / n,
        "fw_rate": totals["fw"] / n,
        "task_rate": totals["task"] / n,
        "composed_rate": totals["composed"] / n,
        "by_type": dict(by_type),
    }
def _plot_bypass_bars(stats: Dict, out: Path) -> None:
    """Save ``bypass_bars.png``: baseline vs. measured rate for each metric.

    Args:
        stats: Aggregate dict providing ``pg2_rate``, ``fw_rate``,
            ``task_rate``, ``composed_rate`` and ``n``.
        out: Directory the PNG is written into.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    labels = ["PG2 Bypass", "FW Bypass", "Task Success", "Composed Bypass"]
    measured = [
        stats["pg2_rate"],
        stats["fw_rate"],
        stats["task_rate"],
        stats["composed_rate"],
    ]

    positions = np.arange(len(labels))
    bar_width = 0.35
    fig, ax = plt.subplots(figsize=(9, 5))

    # Two bars per metric, placed side by side around each tick position.
    baseline_bars = ax.bar(
        positions - bar_width / 2,
        [BASELINES[label] for label in labels],
        bar_width,
        label="Handcrafted Baseline",
        color="#94a3b8",
        edgecolor="white",
    )
    measured_bars = ax.bar(
        positions + bar_width / 2,
        measured,
        bar_width,
        label="InjectArena (RL-trained)",
        color="#3b82f6",
        edgecolor="white",
    )

    ax.set_ylabel("Rate")
    ax.set_title(f"InjectArena — Attacker Performance vs Baseline (n={stats['n']} traces)")
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.05)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    # Write the percentage above each bar; bars at (almost) zero get no label.
    for bar in [*baseline_bars, *measured_bars]:
        height = bar.get_height()
        if height > 0.005:
            centre = bar.get_x() + bar.get_width() / 2
            ax.text(centre, height + 0.015, f"{height:.0%}",
                    ha="center", va="bottom", fontsize=9)

    destination = out / "bypass_bars.png"
    plt.tight_layout()
    plt.savefig(destination, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {destination}")
def _plot_per_category(stats: Dict, out: Path) -> None:
    """Save ``per_category.png``: PG2 / FW / task rates for each attack type.

    Args:
        stats: Aggregate dict whose ``by_type`` entry maps each attack type to
            counts under ``n``, ``pg2``, ``fw`` and ``task``.
        out: Directory the PNG is written into.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    counts = stats["by_type"]
    categories = sorted(counts)

    # Per-category rate = flag count / number of traces in that category.
    rates = {
        key: [counts[cat][key] / counts[cat]["n"] for cat in categories]
        for key in ("pg2", "fw", "task")
    }

    centres = np.arange(len(categories))
    bar_width = 0.27
    fig, ax = plt.subplots(figsize=(10, 5))

    # Three bars per category: one to the left, one centred, one to the right.
    series = (
        (-bar_width, "pg2", "PG2 Bypass", "#3b82f6"),
        (0, "fw", "FW Bypass", "#1d4ed8"),
        (bar_width, "task", "Task Success", "#22c55e"),
    )
    for shift, key, label, colour in series:
        ax.bar(centres + shift, rates[key], bar_width,
               label=label, color=colour, edgecolor="white")

    ax.set_ylabel("Rate")
    ax.set_title("InjectArena — Per Attack Category (across all step counts)")
    ax.set_xticks(centres)
    ax.set_xticklabels([cat.replace("_", " ").title() for cat in categories])
    ax.set_ylim(0, 1.05)
    ax.legend()
    ax.grid(axis="y", alpha=0.3)

    destination = out / "per_category.png"
    plt.tight_layout()
    plt.savefig(destination, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {destination}")
def _plot_step_curve(traces_dir: Path, out: Path) -> None:
    """Bypass rate vs training-step label — the 'progression' visual.

    Groups the ``*.json`` traces in *traces_dir* by their ``steps`` value and
    plots, per step count, the fraction of traces whose ``outcome`` has a
    truthy ``broke_pg2``, ``broke_fw`` or ``task_succeeded`` flag. Traces
    without a ``steps`` value are ignored.

    Args:
        traces_dir: Directory globbed (non-recursively) for ``*.json`` files.
        out: Directory that receives ``reward_curve.png``.

    Returns:
        None. When no trace carries a ``steps`` value, a notice is printed and
        no file is written.
    """
    by_step: Dict[int, Dict[str, int]] = defaultdict(
        lambda: {"n": 0, "pg2": 0, "fw": 0, "task": 0}
    )
    for path in sorted(traces_dir.glob("*.json")):
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with path.open(encoding="utf-8") as fh:
            trace = json.load(fh)
        step = trace.get("steps")
        if step is None:
            continue
        # `or {}` also covers an explicit `"outcome": null`.
        outcome = trace.get("outcome") or {}
        bucket = by_step[step]
        bucket["n"] += 1
        if outcome.get("broke_pg2"):
            bucket["pg2"] += 1
        if outcome.get("broke_fw"):
            bucket["fw"] += 1
        if outcome.get("task_succeeded"):
            bucket["task"] += 1

    if not by_step:
        # Say so explicitly: an older reward_curve.png may still be on disk.
        print(f"No trace in {traces_dir} has a 'steps' value; skipping reward_curve.png")
        return

    # Imported only once there is something to draw.
    import matplotlib.pyplot as plt

    xs = sorted(by_step)
    pg2 = [by_step[s]["pg2"] / by_step[s]["n"] for s in xs]
    fw = [by_step[s]["fw"] / by_step[s]["n"] for s in xs]
    task = [by_step[s]["task"] / by_step[s]["n"] for s in xs]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(xs, pg2, marker="o", linewidth=2, label="PG2 Bypass", color="#3b82f6")
    ax.plot(xs, fw, marker="s", linewidth=2, label="FW Bypass", color="#1d4ed8")
    ax.plot(xs, task, marker="^", linewidth=2, label="Task Success", color="#22c55e")
    ax.set_xlabel("Attacker training steps")
    ax.set_ylabel("Bypass rate")
    ax.set_title("InjectArena — Bypass Rate by Training Step Count")
    ax.set_ylim(0, 1.05)
    ax.legend()
    ax.grid(alpha=0.3)

    out_path = out / "reward_curve.png"  # reuse this filename so the dashboard finds it
    plt.tight_layout()
    plt.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out_path}")
def main() -> None:
    """CLI entry point: aggregate the traces and write the plots.

    Options:
        --traces: directory holding the trace JSON files (default ``data/traces``).
        --out: directory the PNGs are written to (default ``docs/plots``);
            created if it does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--traces", default="data/traces")
    parser.add_argument("--out", default="docs/plots")
    options = parser.parse_args()

    traces_dir = Path(options.traces)
    plots_dir = Path(options.out)
    plots_dir.mkdir(parents=True, exist_ok=True)

    # Select the Agg backend before any plotting helper imports pyplot.
    import matplotlib
    matplotlib.use("Agg")

    stats = _aggregate(traces_dir)
    summary = (
        f"Aggregated {stats['n']} traces "
        f"PG2={stats['pg2_rate']:.0%} FW={stats['fw_rate']:.0%} "
        f"Task={stats['task_rate']:.0%} Composed={stats['composed_rate']:.0%}"
    )
    print(summary)

    # The first two plots reuse the aggregate; the step curve re-reads the traces.
    for render in (_plot_bypass_bars, _plot_per_category):
        render(stats, plots_dir)
    _plot_step_curve(traces_dir, plots_dir)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()