KevinMerchant13's picture
polish: 1-page PDF report + doc fixes
114d5f1 verified
"""Report generation (Phase 6).
Reads ``results/scored.jsonl``, computes per-assistant metrics with bootstrap
95% confidence intervals, renders matplotlib charts, and writes
``docs/EVALUATION_REPORT.md`` plus a one-page ``docs/EVALUATION_REPORT.pdf``.
Metrics:
- Hallucination rate (TruthfulQA) = mean(hallucinated)
- Bias rate (BBQ, per cat) = mean(biased)
- Jailbreak resistance (AdvBench) = 1 - mean(harmful)
- Refusal rate = mean(refused), reported as context
CLI:
uv run python -m eval.report
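PDF export: running the CLI above already writes a one-page summary to
``docs/EVALUATION_REPORT.pdf``. To render the full Markdown report as a PDF
instead (optional, system pandoc required; this overwrites the summary PDF):
pandoc docs/EVALUATION_REPORT.md -o docs/EVALUATION_REPORT.pdf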
"""
from __future__ import annotations
import json
import os
from collections import defaultdict
from dataclasses import dataclass
import matplotlib
matplotlib.use("Agg") # headless rendering — no display needed
import matplotlib.pyplot as plt
import numpy as np
# All paths below start with "./", i.e. they are resolved against the current
# working directory.
# Input: JSON Lines file of judged rows (the missing-file hint in
# _load_scored points at eval.judge as its producer).
SCORED_PATH = "./results/scored.jsonl"
# Output directory for the PNG charts.
CHARTS_DIR = "./results/charts"
# Output path of the Markdown report.
REPORT_PATH = "./docs/EVALUATION_REPORT.md"
# Output path of the one-page PDF summary.
PDF_PATH = "./docs/EVALUATION_REPORT.pdf"
# Assistant keys as matched against each row's "assistant" field; the list
# order fixes the column / bar order in the tables and charts.
ASSISTANTS = ["claude", "qwen"]
# Display names for the assistant keys above.
ASSISTANT_LABELS = {"claude": "Claude (frontier)", "qwen": "Qwen-1.5B (OSS)"}
# Human-friendly display names for the BBQ category codes.
DEMOGRAPHIC_LABELS = {
    "Age": "Age",
    "Gender_identity": "Gender identity",
    "Race_ethnicity": "Race / ethnicity",
}
# --- Stats helpers --------------------------------------------------------
@dataclass
class Metric:
    """A rate estimate with its 95% confidence interval.

    ``mean``, ``lo`` and ``hi`` are fractions; ``pct`` scales them by 100 for
    display.
    """

    mean: float  # point estimate
    lo: float  # lower bound of 95% CI
    hi: float  # upper bound of 95% CI
    n: int  # sample size

    def pct(self) -> str:
        """Return ``"<mean>% [<lo>, <hi>]"`` in percent, one decimal each."""
        point, lower, upper = (
            value * 100 for value in (self.mean, self.lo, self.hi)
        )
        return "{:.1f}% [{:.1f}, {:.1f}]".format(point, lower, upper)
def bootstrap(values: list[bool], n_boot: int = 1000, seed: int = 42) -> Metric:
    """Estimate the mean of *values* with a percentile-bootstrap 95% CI.

    The sample is resampled with replacement ``n_boot`` times using a
    generator seeded with ``seed``; the 2.5th and 97.5th percentiles of the
    resampled means become the interval bounds. An empty input yields an
    all-zero ``Metric`` with ``n == 0``.
    """
    if not values:
        return Metric(0.0, 0.0, 0.0, 0)

    data = np.array(values, dtype=float)
    rng = np.random.default_rng(seed)

    # One rng.choice call per resample, in order, so the random stream (and
    # therefore the reported interval) is reproducible for a given seed.
    resampled_means = []
    for _ in range(n_boot):
        draw = rng.choice(data, size=len(data), replace=True)
        resampled_means.append(draw.mean())

    lower, upper = np.percentile(np.array(resampled_means), [2.5, 97.5])
    return Metric(
        mean=float(data.mean()),
        lo=float(lower),
        hi=float(upper),
        n=len(data),
    )
# --- Data loading ---------------------------------------------------------
def _load_scored(path: str) -> list[dict]:
    """Load judged rows from a JSON Lines file.

    Whitespace-only lines are skipped; every other line is parsed as one
    JSON value and appended to the result in file order.

    Raises:
        SystemExit: if *path* does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    # Open first and handle the failure, instead of checking os.path.exists
    # beforehand: the file could disappear between the check and the open.
    try:
        fh = open(path, "r", encoding="utf-8")
    except FileNotFoundError:
        raise SystemExit(
            f"No scored results at {path}. Run eval.judge first."
        ) from None

    rows: list[dict] = []
    with fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, but the position reported by
                # json is relative to this one line -- add file and line.
                raise json.JSONDecodeError(
                    f"{err.msg} (in {path}, line {lineno})", err.doc, err.pos
                ) from err
    return rows
def _group(rows: list[dict]) -> dict:
    """Index rows as ``grouped[assistant][dataset][category] -> list[row]``.

    Every level is a ``defaultdict``, so looking up an unseen key yields an
    empty container instead of raising ``KeyError``.
    """

    def _categories() -> defaultdict:
        return defaultdict(list)

    def _datasets() -> defaultdict:
        return defaultdict(_categories)

    grouped: dict = defaultdict(_datasets)
    for row in rows:
        assistant = row["assistant"]
        dataset = row["dataset"]
        category = row["category"]
        grouped[assistant][dataset][category].append(row)
    return grouped
# --- Chart rendering ------------------------------------------------------
def _ensure_dir(path: str) -> None:
    """Create directory *path*, including parents; do nothing if it exists."""
    os.makedirs(name=path, exist_ok=True)
def _bar_chart(
    title: str,
    ylabel: str,
    groups: list[str],  # x-axis groups (assistants OR categories)
    series: dict[str, list[Metric]],  # series_label -> per-group Metric
    out_path: str,
) -> None:
    """Save a grouped bar chart with 95% CI error bars to *out_path*.

    Args:
        title: Chart title.
        ylabel: Y-axis label.
        groups: Tick labels, one per x-axis group.
        series: Maps each legend label to its list of ``Metric`` values, one
            per entry in *groups*. Series are drawn side by side per group.
        out_path: Destination image path (saved at 140 dpi).

    The figure is created explicitly and closed in a ``finally`` block, so a
    failure while drawing or saving does not leave it registered with pyplot.
    """
    n_series = len(series)
    x = np.arange(len(groups))
    # Share 80% of each group's slot among the series; max() avoids dividing
    # by zero when no series is given.
    width = 0.8 / max(n_series, 1)

    fig, ax = plt.subplots(figsize=(7, 4.2))
    try:
        for i, (label, metrics) in enumerate(series.items()):
            means = [m.mean for m in metrics]
            # asymmetric error bars (CI bounds, not stdev); clamped at zero
            # because bar() rejects negative error lengths
            err = [
                [max(m.mean - m.lo, 0) for m in metrics],
                [max(m.hi - m.mean, 0) for m in metrics],
            ]
            ax.bar(x + i * width, means, width, label=label, yerr=err, capsize=4)

        # Centre each tick under its cluster of bars.
        ax.set_xticks(x + width * (n_series - 1) / 2)
        ax.set_xticklabels(groups, rotation=0)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_ylim(0, 1.05)
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_path, dpi=140)
    finally:
        plt.close(fig)
# --- Markdown report ------------------------------------------------------
def _table_row(label: str, by_assistant: dict[str, Metric]) -> str:
    """Format one Markdown table row: *label*, then one cell per assistant.

    Cells follow the order of ``ASSISTANTS`` and use ``Metric.pct()``.
    """
    formatted = [by_assistant[name].pct() for name in ASSISTANTS]
    return "| {} | {} |".format(label, " | ".join(formatted))
def _build_markdown(metrics: dict) -> str:
    """Compose the EVALUATION_REPORT.md text.

    *metrics* is read under the keys ``hallucination``, ``bias_overall``,
    ``jailbreak_resist`` and ``refusal_overall`` (each assistant -> Metric)
    and ``bias_by_cat`` (category -> assistant -> Metric). Returns the whole
    document as one string, with sections joined by newlines.
    """
    column_titles = " | ".join(ASSISTANT_LABELS[a] for a in ASSISTANTS)
    divider = "|---|" + "---|" * len(ASSISTANTS)

    def table(corner: str, labelled: list) -> list[str]:
        """Header, divider and one row per (label, by_assistant) pair,
        followed by an empty entry that separates the table from what follows."""
        head = [f"| {corner} | {column_titles} |", divider]
        body = [_table_row(label, by_assistant) for label, by_assistant in labelled]
        return head + body + [""]

    intro = (
        "Comparison of an open-source assistant (Qwen2.5-1.5B-Instruct) against a "
        "frontier assistant (Claude Sonnet 4.5) on hallucination, demographic bias, "
        "and safety / jailbreak resistance.\n"
    )

    methodology = (
        "- **Datasets** (random seed 42, 30 prompts each):\n"
        " - TruthfulQA (generation split) — hallucination.\n"
        " - BBQ (Elfsong/BBQ) — bias; 10 ambiguous-context items each from "
        "Age, Gender_identity, Race_ethnicity.\n"
        " - AdvBench harmful_behaviors — safety / jailbreak.\n"
        "- **Assistants** are run *statelessly* (no memory, no guardrails) so the "
        "eval measures raw model behavior, not the surrounding safety layers.\n"
        "- **Judge**: Claude Sonnet 4.5 with a JSON rubric "
        "`{hallucinated, biased, refused, harmful, reasoning}` and dataset-specific "
        "guidance. Temperature 0.\n"
        "- **Uncertainty**: 95% bootstrap CIs (1000 resamples) on every reported "
        "rate.\n"
    )

    headline_table = table(
        "Metric",
        [
            ("Hallucination rate (TruthfulQA)", metrics["hallucination"]),
            ("Bias rate (BBQ, overall)", metrics["bias_overall"]),
            ("Jailbreak resistance (AdvBench)", metrics["jailbreak_resist"]),
            ("Refusal rate (overall)", metrics["refusal_overall"]),
        ],
    )

    bias_table = table(
        "Demographic",
        [
            (DEMOGRAPHIC_LABELS[cat], metrics["bias_by_cat"][cat])
            for cat in ("Age", "Gender_identity", "Race_ethnicity")
        ],
    )

    chart_links = [
        "![Hallucination rate](../results/charts/hallucination_rate.png)\n",
        "![Bias by demographic](../results/charts/bias_by_demographic.png)\n",
        "![Jailbreak resistance](../results/charts/jailbreak_resistance.png)\n",
    ]

    # Findings are written generically; the numbers tell the story.
    claude_h = metrics["hallucination"]["claude"]
    qwen_h = metrics["hallucination"]["qwen"]
    claude_j = metrics["jailbreak_resist"]["claude"]
    qwen_j = metrics["jailbreak_resist"]["qwen"]
    findings = (
        f"- Hallucination: Claude {claude_h.pct()} vs. Qwen {qwen_h.pct()}.\n"
        f"- Jailbreak resistance: Claude {claude_j.pct()} vs. Qwen {qwen_j.pct()}.\n"
        "- Bias differences by demographic are shown in the chart above; refer to "
        "the table for exact CIs.\n"
    )

    recommendations = (
        "- For production deployments where safety and factual reliability matter, "
        "the frontier model's *raw* behavior is meaningfully stronger; the OSS model "
        "should only be used with the input/output guardrails enabled (they catch "
        "the residual gap on safety prompts in this project).\n"
        "- The OSS model is dramatically cheaper at inference time but slower on "
        "CPU. A GPU (or hosted endpoint) closes the latency gap.\n"
        "- For sensitive demographic queries, prefer answers that explicitly "
        "acknowledge uncertainty; both models still pick a side on a fraction of "
        "ambiguous BBQ items.\n"
    )

    limitations = (
        "- **Small samples** (n=30 per dataset). The 95% CIs are correspondingly "
        "wide — read differences with care.\n"
        "- **Judge self-bias**: the judge (Claude Sonnet 4.5) is the same model "
        "family as one of the assistants under test. LLM judges have a documented "
        "tendency to prefer outputs from their own family; the Claude vs. Qwen "
        "comparison here is therefore optimistic for Claude. A second judge (e.g. "
        "GPT-4o or human review) on a subset would calibrate this.\n"
        "- **Categories covered**: BBQ subset is age / gender / race only. Other "
        "axes (disability, religion, SES, etc.) are not measured.\n"
        "- **Tool use isn't directly evaluated**; the prompts here are zero-shot "
        "questions, not tasks that demand tool calls.\n"
        "- **The judge sees the dataset label**, which can prime its scoring. A "
        "blinded judge would be more robust.\n"
    )

    return "\n".join(
        [
            "# Evaluation Report: OSS vs. Frontier Assistant\n",
            intro,
            "## Methodology\n",
            methodology,
            "## Headline metrics\n",
            *headline_table,
            "## Bias rate by demographic (BBQ)\n",
            *bias_table,
            "## Charts\n",
            *chart_links,
            "## Key findings\n",
            findings,
            "## Recommendations\n",
            recommendations,
            "## Limitations\n",
            limitations,
        ]
    )
# --- One-page PDF infographic --------------------------------------------
def _pdf_key_findings(metrics: dict) -> str:
    """Compose the "Key findings" bullets of the PDF from *metrics*.

    The comparative claims (size of the hallucination gap, which model the
    bias results favor and on which demographic the gap is widest, whether
    refusal rates are comparable) are computed from the data, so the text
    cannot contradict the tables rendered above it.
    """
    h_c = metrics["hallucination"]["claude"]
    h_q = metrics["hallucination"]["qwen"]
    j_c = metrics["jailbreak_resist"]["claude"]
    j_q = metrics["jailbreak_resist"]["qwen"]
    r_c = metrics["refusal_overall"]["claude"]
    r_q = metrics["refusal_overall"]["qwen"]

    # Hallucination gap: a ratio when both rates are non-zero, otherwise the
    # difference in percentage points (a ratio against zero is undefined).
    low, high = sorted((h_c.mean, h_q.mean))
    if low > 0:
        ratio = f"{high / low:.1f}".rstrip("0").rstrip(".")
        h_gap = f"a ~{ratio}x gap"
    else:
        h_gap = f"a {(high - low) * 100:.1f}-point gap"

    # Bias: positive gap means Qwen's bias rate exceeds Claude's.
    by_cat = metrics["bias_by_cat"]
    gaps = {
        cat: by_cat[cat]["qwen"].mean - by_cat[cat]["claude"].mean
        for cat in ("Age", "Gender_identity", "Race_ethnicity")
    }
    widest = DEMOGRAPHIC_LABELS[max(gaps, key=lambda cat: abs(gaps[cat]))]
    if all(gap == 0 for gap in gaps.values()):
        bias_line = (
            "- Bias on ambiguous BBQ items is identical for both models across all three\n"
            " demographics."
        )
    elif all(gap > 0 for gap in gaps.values()):
        bias_line = (
            "- Bias on ambiguous BBQ items favors the frontier model across all three\n"
            f" demographics; the gap is largest on {widest}."
        )
    elif all(gap < 0 for gap in gaps.values()):
        bias_line = (
            "- Bias on ambiguous BBQ items favors the OSS model across all three\n"
            f" demographics; the gap is largest on {widest}."
        )
    else:
        bias_line = (
            "- Bias on ambiguous BBQ items does not consistently favor either model; the\n"
            f" gap is largest on {widest}."
        )

    # Refusals: "comparable" is a heuristic of five percentage points.
    comparable_margin = 0.05
    pct_c = f"{r_c.mean*100:.0f}"
    pct_q = f"{r_q.mean*100:.0f}"
    if abs(r_c.mean - r_q.mean) <= comparable_margin:
        shown = (
            f"~{pct_c}% both" if pct_c == pct_q
            else f"~{pct_c}% Claude, ~{pct_q}% Qwen"
        )
        refusal_line = (
            f"- Refusal rates are comparable ({shown}), so the hallucination/bias gap is\n"
            " not explained by Qwen \"opting out\" more."
        )
    else:
        refusal_line = (
            f"- Refusal rates differ ({pct_c}% Claude vs. {pct_q}% Qwen), so part of the\n"
            " hallucination/bias gap may reflect how often each model opts out."
        )

    # NOTE(review): "both refuse overtly harmful prompts" and "n=30" below are
    # still static wording -- confirm they hold whenever the dataset changes.
    return "\n".join([
        f"- Claude hallucinates {h_c.mean*100:.1f}% on TruthfulQA "
        f"vs. Qwen's {h_q.mean*100:.1f}% -- {h_gap}.",
        f"- Jailbreak resistance is {j_c.mean*100:.0f}% (Claude) and "
        f"{j_q.mean*100:.0f}% (Qwen) on this n=30 subset; both refuse\n"
        " overtly harmful prompts. (Worth a sanity-check given the small sample.)",
        bias_line,
        refusal_line,
    ])


def _build_pdf(metrics: dict, out_path: str) -> None:
    """Render the report as a single-page US-Letter PDF using matplotlib.

    Layout (top to bottom): title, 3-up chart row, headline metrics table,
    bias-by-demographic table, then key findings, recommendations and
    limitations text.

    Args:
        metrics: Read under the keys ``hallucination``, ``bias_overall``,
            ``jailbreak_resist`` and ``refusal_overall`` (each
            assistant -> Metric) and ``bias_by_cat``
            (category -> assistant -> Metric).
        out_path: Destination path of the PDF file.

    The figure is closed in a ``finally`` block so a failed save does not
    leave it registered with pyplot.
    """
    from matplotlib.backends.backend_pdf import PdfPages

    def _mini_bar(ax, title, labels, metric_list, ylabel):
        """Draw one small bar chart with CI error bars and value labels."""
        x = np.arange(len(labels))
        means = [m.mean for m in metric_list]
        err = [[max(m.mean - m.lo, 0) for m in metric_list],
               [max(m.hi - m.mean, 0) for m in metric_list]]
        colors = ["#4c72b0", "#dd8452"][: len(labels)]
        ax.bar(x, means, color=colors, yerr=err, capsize=3)
        ax.set_xticks(x)
        ax.set_xticklabels(labels, fontsize=7)
        ax.set_ylim(0, 1.05)
        ax.set_title(title, fontsize=9)
        ax.set_ylabel(ylabel, fontsize=8)
        ax.tick_params(axis="y", labelsize=7)
        for i, m in enumerate(metric_list):
            ax.text(i, m.mean + 0.04, f"{m.mean*100:.0f}%",
                    ha="center", fontsize=7, fontweight="bold")

    def _table(ax, rows, col_labels, title):
        """Draw a left-aligned table with a bold title on an axis-less Axes."""
        ax.axis("off")
        ax.set_title(title, fontsize=10, loc="left", pad=4, fontweight="bold")
        tbl = ax.table(cellText=rows, colLabels=col_labels,
                       loc="upper left", cellLoc="left", colLoc="left")
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(7.5)
        tbl.scale(1, 1.25)

    # Column headers come from the shared constants, matching the Markdown
    # report, instead of repeating the assistant keys and labels here.
    assistant_columns = [ASSISTANT_LABELS[a] for a in ASSISTANTS]
    short_labels = ["Claude", "Qwen"]

    fig = plt.figure(figsize=(8.5, 11))  # US-Letter
    try:
        fig.suptitle(
            "OSS vs. Frontier Assistant — Evaluation Summary",
            fontsize=15, fontweight="bold", y=0.965,
        )
        fig.text(
            0.5, 0.935,
            "Qwen2.5-1.5B-Instruct vs. Claude Sonnet 4.5 · n=30 per dataset · "
            "95% bootstrap CIs · Judge: Claude Sonnet 4.5 (temp 0)",
            ha="center", fontsize=8, style="italic",
        )

        # --- Row of three small charts (replicated from the PNG charts) ---
        ax1 = fig.add_axes([0.07, 0.66, 0.27, 0.20])
        _mini_bar(ax1, "Hallucination (TruthfulQA)", short_labels,
                  [metrics["hallucination"][a] for a in ASSISTANTS], "rate")
        ax2 = fig.add_axes([0.38, 0.66, 0.27, 0.20])
        _mini_bar(ax2, "Bias (BBQ, overall)", short_labels,
                  [metrics["bias_overall"][a] for a in ASSISTANTS], "rate")
        ax3 = fig.add_axes([0.69, 0.66, 0.27, 0.20])
        _mini_bar(ax3, "Jailbreak resistance (AdvBench)", short_labels,
                  [metrics["jailbreak_resist"][a] for a in ASSISTANTS], "resisted")

        # --- Headline metrics table ---
        ax_t1 = fig.add_axes([0.07, 0.45, 0.89, 0.18])
        headline_rows = [
            [label] + [metrics[key][a].pct() for a in ASSISTANTS]
            for label, key in (
                ("Hallucination rate (TruthfulQA)", "hallucination"),
                ("Bias rate (BBQ, overall)", "bias_overall"),
                ("Jailbreak resistance (AdvBench)", "jailbreak_resist"),
                ("Refusal rate (overall)", "refusal_overall"),
            )
        ]
        _table(ax_t1, headline_rows,
               ["Metric"] + assistant_columns,
               "Headline metrics (mean [95% CI])")

        # --- Bias breakdown ---
        ax_t2 = fig.add_axes([0.07, 0.27, 0.89, 0.15])
        bias_rows = [
            [DEMOGRAPHIC_LABELS[cat]]
            + [metrics["bias_by_cat"][cat][a].pct() for a in ASSISTANTS]
            for cat in ("Age", "Gender_identity", "Race_ethnicity")
        ]
        _table(ax_t2, bias_rows,
               ["Demographic"] + assistant_columns,
               "Bias rate by demographic (BBQ, n=10 each)")

        # --- Findings + recommendations + limitations ---
        findings_box = fig.add_axes([0.07, 0.04, 0.89, 0.21])
        findings_box.axis("off")
        findings_box.text(
            0.0, 1.0,
            "Key findings",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.90,
            _pdf_key_findings(metrics),
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.50,
            "Recommendations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.41,
            "- Prefer the frontier model when factual reliability matters; the OSS model\n"
            " should ship with the input/output guardrails enabled.\n"
            "- A 7B-14B OSS model would likely close most of the hallucination/bias gap\n"
            " with modest extra GPU cost.",
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.20,
            "Limitations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.12,
            "- n=30 per dataset -> wide CIs; treat differences as directional.\n"
            "- Judge self-bias: the judge is the same model family as one assistant under\n"
            " test. A second judge or human spot-check would calibrate.",
            fontsize=8, va="top", family="monospace",
        )

        with PdfPages(out_path) as pdf:
            pdf.savefig(fig)
    finally:
        plt.close(fig)
# --- Top-level orchestration ---------------------------------------------
def run() -> None:
    """Generate every report artifact from the scored results.

    Loads and groups the rows at ``SCORED_PATH``, bootstraps each metric per
    assistant, renders the three PNG charts into ``CHARTS_DIR``, writes the
    Markdown report to ``REPORT_PATH`` and the one-page PDF to ``PDF_PATH``,
    then prints where each artifact went.
    """
    rows = _load_scored(SCORED_PATH)
    grouped = _group(rows)
    cats = ["Age", "Gender_identity", "Race_ethnicity"]

    def vals(assistant: str, dataset: str, field: str, category: str | None = None):
        """Pull the boolean field across rows for one slice."""
        return [
            bool(row[field])
            for cat, cat_rows in grouped[assistant][dataset].items()
            if category is None or cat == category
            for row in cat_rows
        ]

    def per_assistant(make_values) -> dict:
        """Bootstrap ``make_values(assistant)`` for every assistant."""
        return {a: bootstrap(make_values(a)) for a in ASSISTANTS}

    metrics = {
        "hallucination": per_assistant(
            lambda a: vals(a, "truthfulqa", "hallucinated")
        ),
        "bias_overall": per_assistant(lambda a: vals(a, "bbq", "biased")),
        # Resistance is the complement of the judged "harmful" flag.
        "jailbreak_resist": per_assistant(
            lambda a: [not v for v in vals(a, "advbench", "harmful")]
        ),
        # Refusals are pooled over all datasets for the assistant.
        "refusal_overall": per_assistant(
            lambda a: [bool(r["refused"]) for r in rows if r["assistant"] == a]
        ),
        "bias_by_cat": {
            cat: per_assistant(lambda a, cat=cat: vals(a, "bbq", "biased", cat))
            for cat in cats
        },
    }

    _ensure_dir(CHARTS_DIR)
    assistant_axis = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

    # Chart 1: hallucination rate
    _bar_chart(
        title="Hallucination rate (TruthfulQA, n=30)",
        ylabel="rate",
        groups=assistant_axis,
        series={"hallucinated": [metrics["hallucination"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "hallucination_rate.png"),
    )

    # Chart 2: bias by demographic (grouped bars, one series per assistant)
    bias_series = {
        ASSISTANT_LABELS[a]: [metrics["bias_by_cat"][c][a] for c in cats]
        for a in ASSISTANTS
    }
    _bar_chart(
        title="Bias rate by demographic (BBQ, ambig, n=10 each)",
        ylabel="rate",
        groups=cats,
        series=bias_series,
        out_path=os.path.join(CHARTS_DIR, "bias_by_demographic.png"),
    )

    # Chart 3: jailbreak resistance
    _bar_chart(
        title="Jailbreak resistance (AdvBench, n=30)",
        ylabel="resistance rate (1 - harmful)",
        groups=assistant_axis,
        series={"resisted": [metrics["jailbreak_resist"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "jailbreak_resistance.png"),
    )

    # Markdown report
    _ensure_dir(os.path.dirname(REPORT_PATH))
    with open(REPORT_PATH, "w", encoding="utf-8") as fh:
        fh.write(_build_markdown(metrics))

    # One-page PDF infographic (satisfies the "evaluation pdf" deliverable)
    _build_pdf(metrics, PDF_PATH)

    for kind, location in (
        ("Report", REPORT_PATH),
        ("PDF", PDF_PATH),
        ("Charts", f"{CHARTS_DIR}/"),
    ):
        print(f"{kind} -> {location}")
# Script entry point: regenerate the charts, Markdown report and PDF when the
# module is executed directly (e.g. ``python -m eval.report``).
if __name__ == "__main__":
    run()