"""Generate the three infographics for reports/evaluation_report.md. Inputs: - eval/results/results-guarded-scored.summary.json (required) - eval/results/results-raw-scored.summary.json (optional, enables figure 3) Outputs (PNG): - reports/figures/scores_by_axis.png - reports/figures/latency_cost.png - reports/figures/refusal_matrix.png """ from __future__ import annotations import argparse import json from pathlib import Path import matplotlib.pyplot as plt import numpy as np ROOT = Path(__file__).resolve().parent.parent FIG_DIR = ROOT / "reports" / "figures" FIG_DIR.mkdir(parents=True, exist_ok=True) # Public list price as of 2026-01 (cents per 1k tokens). Update if pricing changes. # Llama-3.2-1B is self-hosted on CPU — we report compute-amortized as $0 marginal. # If the OpenAI call fell back to Groq the actual marginal cost is lower than this # (Groq's free tier is $0 within quota); these numbers assume the primary served. COST_PER_1K = { "openai": {"in": 0.200, "out": 0.800}, # gpt-4.1 approx ($/1k tokens) "llama": {"in": 0.000, "out": 0.000}, } AXES = ["hallucination", "content_safety", "bias"] COLOURS = {"openai": "#10a37f", "llama": "#f0b429"} def _load(p: Path): return json.loads(p.read_text(encoding="utf-8")) if p.exists() else None def fig_scores(summary: dict) -> Path: models = list(summary["axis_pct"].keys()) x = np.arange(len(AXES)) w = 0.8 / max(1, len(models)) fig, ax = plt.subplots(figsize=(7, 4.2)) for i, m in enumerate(models): vals = [summary["axis_pct"].get(m, {}).get(a, 0) for a in AXES] bars = ax.bar(x + i*w - 0.4 + w/2, vals, width=w, label=m, color=COLOURS.get(m, "#888")) for b, v in zip(bars, vals): ax.text(b.get_x()+b.get_width()/2, v+1, f"{v:.0f}", ha="center", va="bottom", fontsize=9) ax.set_xticks(x); ax.set_xticklabels([a.replace("_"," ").title() for a in AXES]) ax.set_ylim(0, 105); ax.set_ylabel("Score (% of max)") ax.set_title("Quality by axis — higher is better") ax.legend(loc="lower right"); ax.grid(axis="y", alpha=0.3) p = FIG_DIR / "scores_by_axis.png" fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) return p def fig_latency_cost(summary: dict) -> Path: models = list(summary["latency_ms"].keys()) p50 = [summary["latency_ms"][m].get("p50") or 0 for m in models] p95 = [summary["latency_ms"][m].get("p95") or 0 for m in models] # Cost per turn ≈ mean_in/1k * in + mean_out/1k * out (cents). cost = [] for m in models: t = summary["tokens"].get(m, {}) c = COST_PER_1K.get(m, {"in":0,"out":0}) ci = (t.get("mean_in") or 0) / 1000 * c["in"] co = (t.get("mean_out") or 0) / 1000 * c["out"] cost.append(round(ci + co, 3)) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4)) x = np.arange(len(models)) ax1.bar(x-0.2, p50, 0.4, label="p50", color="#6aa6ff") ax1.bar(x+0.2, p95, 0.4, label="p95", color="#1f78b4") ax1.set_xticks(x); ax1.set_xticklabels(models) ax1.set_ylabel("Latency (ms)"); ax1.set_title("Latency per turn"); ax1.legend(); ax1.grid(axis="y", alpha=0.3) for i, (a, b) in enumerate(zip(p50, p95)): ax1.text(i-0.2, a+10, str(a), ha="center", fontsize=9) ax1.text(i+0.2, b+10, str(b), ha="center", fontsize=9) ax2.bar(x, cost, 0.55, color=[COLOURS.get(m, "#888") for m in models]) ax2.set_xticks(x); ax2.set_xticklabels(models) ax2.set_ylabel("¢ / turn (mean)"); ax2.set_title("Cost per turn — Llama self-hosted = $0 marginal") for i, v in enumerate(cost): ax2.text(i, v + max(cost)*0.02 if cost else 0, f"{v:.3f}¢", ha="center", fontsize=9) ax2.grid(axis="y", alpha=0.3) p = FIG_DIR / "latency_cost.png" fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) return p def fig_refusal_matrix(guarded: dict, raw: dict | None) -> Path: models = list(guarded["refusals"].keys()) fig, ax = plt.subplots(figsize=(7, 3.8)) x = np.arange(len(models)) g_rate = [guarded["refusals"][m]["refusal_rate"] or 0 for m in models] g_block = [guarded["refusals"][m]["block_rate"] or 0 for m in models] if raw: r_rate = [raw["refusals"].get(m, {}).get("refusal_rate", 0) or 0 for m in models] else: r_rate = [0]*len(models) w = 0.27 ax.bar(x-w, r_rate, w, label="refusal (guardrails OFF)", color="#8a93a6") ax.bar(x, g_rate, w, label="refusal (guardrails ON)", color="#6aa6ff") ax.bar(x+w, g_block, w, label="output blocked by filter", color="#f06464") ax.set_xticks(x); ax.set_xticklabels(models); ax.set_ylabel("% of prompts") ax.set_title("Refusal & guardrail block rates"); ax.legend(); ax.grid(axis="y", alpha=0.3) p = FIG_DIR / "refusal_matrix.png" fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) return p def main(): ap = argparse.ArgumentParser() ap.add_argument("--guarded", default="eval/results/results-guarded-scored.summary.json") ap.add_argument("--raw", default="eval/results/results-raw-scored.summary.json") args = ap.parse_args() g = _load(ROOT / args.guarded) r = _load(ROOT / args.raw) if g is None: raise SystemExit(f"missing {args.guarded} — run score.py first") paths = [fig_scores(g), fig_latency_cost(g), fig_refusal_matrix(g, r)] for p in paths: print(f"wrote {p.relative_to(ROOT)}") if __name__ == "__main__": main()