Spaces:
Running
Running
| """Generate the three infographics for reports/evaluation_report.md. | |
| Inputs: | |
| - eval/results/results-guarded-scored.summary.json (required) | |
| - eval/results/results-raw-scored.summary.json (optional, enables figure 3) | |
| Outputs (PNG): | |
| - reports/figures/scores_by_axis.png | |
| - reports/figures/latency_cost.png | |
| - reports/figures/refusal_matrix.png | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| ROOT = Path(__file__).resolve().parent.parent | |
| FIG_DIR = ROOT / "reports" / "figures" | |
| FIG_DIR.mkdir(parents=True, exist_ok=True) | |
| # Public list price as of 2026-01 (cents per 1k tokens). Update if pricing changes. | |
| # Llama-3.2-1B is self-hosted on CPU — we report compute-amortized as $0 marginal. | |
| # If the OpenAI call fell back to Groq the actual marginal cost is lower than this | |
| # (Groq's free tier is $0 within quota); these numbers assume the primary served. | |
| COST_PER_1K = { | |
| "openai": {"in": 0.200, "out": 0.800}, # gpt-4.1 approx ($/1k tokens) | |
| "llama": {"in": 0.000, "out": 0.000}, | |
| } | |
| AXES = ["hallucination", "content_safety", "bias"] | |
| COLOURS = {"openai": "#10a37f", "llama": "#f0b429"} | |
| def _load(p: Path): | |
| return json.loads(p.read_text(encoding="utf-8")) if p.exists() else None | |
| def fig_scores(summary: dict) -> Path: | |
| models = list(summary["axis_pct"].keys()) | |
| x = np.arange(len(AXES)) | |
| w = 0.8 / max(1, len(models)) | |
| fig, ax = plt.subplots(figsize=(7, 4.2)) | |
| for i, m in enumerate(models): | |
| vals = [summary["axis_pct"].get(m, {}).get(a, 0) for a in AXES] | |
| bars = ax.bar(x + i*w - 0.4 + w/2, vals, width=w, label=m, color=COLOURS.get(m, "#888")) | |
| for b, v in zip(bars, vals): | |
| ax.text(b.get_x()+b.get_width()/2, v+1, f"{v:.0f}", ha="center", va="bottom", fontsize=9) | |
| ax.set_xticks(x); ax.set_xticklabels([a.replace("_"," ").title() for a in AXES]) | |
| ax.set_ylim(0, 105); ax.set_ylabel("Score (% of max)") | |
| ax.set_title("Quality by axis — higher is better") | |
| ax.legend(loc="lower right"); ax.grid(axis="y", alpha=0.3) | |
| p = FIG_DIR / "scores_by_axis.png" | |
| fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) | |
| return p | |
| def fig_latency_cost(summary: dict) -> Path: | |
| models = list(summary["latency_ms"].keys()) | |
| p50 = [summary["latency_ms"][m].get("p50") or 0 for m in models] | |
| p95 = [summary["latency_ms"][m].get("p95") or 0 for m in models] | |
| # Cost per turn ≈ mean_in/1k * in + mean_out/1k * out (cents). | |
| cost = [] | |
| for m in models: | |
| t = summary["tokens"].get(m, {}) | |
| c = COST_PER_1K.get(m, {"in":0,"out":0}) | |
| ci = (t.get("mean_in") or 0) / 1000 * c["in"] | |
| co = (t.get("mean_out") or 0) / 1000 * c["out"] | |
| cost.append(round(ci + co, 3)) | |
| fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4)) | |
| x = np.arange(len(models)) | |
| ax1.bar(x-0.2, p50, 0.4, label="p50", color="#6aa6ff") | |
| ax1.bar(x+0.2, p95, 0.4, label="p95", color="#1f78b4") | |
| ax1.set_xticks(x); ax1.set_xticklabels(models) | |
| ax1.set_ylabel("Latency (ms)"); ax1.set_title("Latency per turn"); ax1.legend(); ax1.grid(axis="y", alpha=0.3) | |
| for i, (a, b) in enumerate(zip(p50, p95)): | |
| ax1.text(i-0.2, a+10, str(a), ha="center", fontsize=9) | |
| ax1.text(i+0.2, b+10, str(b), ha="center", fontsize=9) | |
| ax2.bar(x, cost, 0.55, color=[COLOURS.get(m, "#888") for m in models]) | |
| ax2.set_xticks(x); ax2.set_xticklabels(models) | |
| ax2.set_ylabel("¢ / turn (mean)"); ax2.set_title("Cost per turn — Llama self-hosted = $0 marginal") | |
| for i, v in enumerate(cost): | |
| ax2.text(i, v + max(cost)*0.02 if cost else 0, f"{v:.3f}¢", ha="center", fontsize=9) | |
| ax2.grid(axis="y", alpha=0.3) | |
| p = FIG_DIR / "latency_cost.png" | |
| fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) | |
| return p | |
| def fig_refusal_matrix(guarded: dict, raw: dict | None) -> Path: | |
| models = list(guarded["refusals"].keys()) | |
| fig, ax = plt.subplots(figsize=(7, 3.8)) | |
| x = np.arange(len(models)) | |
| g_rate = [guarded["refusals"][m]["refusal_rate"] or 0 for m in models] | |
| g_block = [guarded["refusals"][m]["block_rate"] or 0 for m in models] | |
| if raw: | |
| r_rate = [raw["refusals"].get(m, {}).get("refusal_rate", 0) or 0 for m in models] | |
| else: | |
| r_rate = [0]*len(models) | |
| w = 0.27 | |
| ax.bar(x-w, r_rate, w, label="refusal (guardrails OFF)", color="#8a93a6") | |
| ax.bar(x, g_rate, w, label="refusal (guardrails ON)", color="#6aa6ff") | |
| ax.bar(x+w, g_block, w, label="output blocked by filter", color="#f06464") | |
| ax.set_xticks(x); ax.set_xticklabels(models); ax.set_ylabel("% of prompts") | |
| ax.set_title("Refusal & guardrail block rates"); ax.legend(); ax.grid(axis="y", alpha=0.3) | |
| p = FIG_DIR / "refusal_matrix.png" | |
| fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig) | |
| return p | |
| def main(): | |
| ap = argparse.ArgumentParser() | |
| ap.add_argument("--guarded", default="eval/results/results-guarded-scored.summary.json") | |
| ap.add_argument("--raw", default="eval/results/results-raw-scored.summary.json") | |
| args = ap.parse_args() | |
| g = _load(ROOT / args.guarded) | |
| r = _load(ROOT / args.raw) | |
| if g is None: | |
| raise SystemExit(f"missing {args.guarded} — run score.py first") | |
| paths = [fig_scores(g), fig_latency_cost(g), fig_refusal_matrix(g, r)] | |
| for p in paths: | |
| print(f"wrote {p.relative_to(ROOT)}") | |
| if __name__ == "__main__": | |
| main() | |