| """Report generation (Phase 6). |
| |
| Reads ``results/scored.jsonl``, computes per-assistant metrics with bootstrap |
| 95% confidence intervals, renders matplotlib charts, and writes |
| ``docs/EVALUATION_REPORT.md``. |
| |
| Metrics: |
| - Hallucination rate (TruthfulQA) = mean(hallucinated) |
| - Bias rate (BBQ, per cat) = mean(biased) |
| - Jailbreak resistance (AdvBench) = 1 - mean(harmful) |
| - Refusal rate = mean(refused), reported as context |
| |
| CLI: |
| uv run python -m eval.report |
| |
| PDF export (optional, system pandoc required): |
| pandoc docs/EVALUATION_REPORT.md -o docs/EVALUATION_REPORT.pdf |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from collections import defaultdict |
| from dataclasses import dataclass |
|
|
| import matplotlib |
|
|
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| import numpy as np |
|
|
# Input written by the judging step, and the locations this module writes to.
SCORED_PATH = "./results/scored.jsonl"
CHARTS_DIR = "./results/charts"
REPORT_PATH = "./docs/EVALUATION_REPORT.md"
PDF_PATH = "./docs/EVALUATION_REPORT.pdf"

# Assistant keys as stored in the ``assistant`` field of scored rows; the
# list order is the column order of the Markdown tables.
ASSISTANTS = ["claude", "qwen"]
ASSISTANT_LABELS = {
    "claude": "Claude (frontier)",
    "qwen": "Qwen-1.5B (OSS)",
}

# BBQ category key -> human-readable label used in the report tables.
DEMOGRAPHIC_LABELS = {
    "Age": "Age",
    "Gender_identity": "Gender identity",
    "Race_ethnicity": "Race / ethnicity",
}
|
|
|
|
| |
|
|
|
|
@dataclass
class Metric:
    """A rate estimate together with the bounds of its confidence interval.

    ``mean``, ``lo`` and ``hi`` are fractions (``pct`` multiplies them by 100
    for display); ``n`` is the number of observations behind the estimate.
    """

    mean: float
    lo: float
    hi: float
    n: int

    def pct(self) -> str:
        """Format the metric as ``"mean% [lo, hi]"`` in percent, one decimal."""
        mean_pct = self.mean * 100
        lo_pct = self.lo * 100
        hi_pct = self.hi * 100
        return f"{mean_pct:.1f}% [{lo_pct:.1f}, {hi_pct:.1f}]"
|
|
|
|
def bootstrap(values: list[bool], n_boot: int = 1000, seed: int = 42) -> Metric:
    """Estimate the mean of ``values`` with a percentile-bootstrap 95% CI.

    Args:
        values: Observations; they are converted to floats before resampling.
        n_boot: Number of resamples drawn with replacement.
        seed: Seed of the NumPy generator, so the same input always yields
            the same interval.

    Returns:
        A ``Metric`` with the sample mean, the 2.5th and 97.5th percentiles
        of the resampled means, and the sample size. Empty input yields an
        all-zero ``Metric`` with ``n == 0``.
    """
    if not values:
        return Metric(0.0, 0.0, 0.0, 0)

    sample = np.array(values, dtype=float)
    size = len(sample)
    rng = np.random.default_rng(seed)

    draws = []
    for _ in range(n_boot):
        resample = rng.choice(sample, size=size, replace=True)
        draws.append(resample.mean())
    boot_means = np.array(draws)

    lower = float(np.percentile(boot_means, 2.5))
    upper = float(np.percentile(boot_means, 97.5))
    return Metric(mean=float(sample.mean()), lo=lower, hi=upper, n=size)
|
|
|
|
| |
|
|
|
|
def _load_scored(path: str) -> list[dict]:
    """Load the judged results from a JSON Lines file.

    Args:
        path: Location of the JSONL file. Blank lines are skipped.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        SystemExit: If ``path`` does not exist.
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the file and the 1-based line number of the offending line.
    """
    # Open first and react to failure instead of checking os.path.exists():
    # the file could disappear between a check and the open.
    try:
        fh = open(path, "r", encoding="utf-8")
    except FileNotFoundError:
        raise SystemExit(
            f"No scored results at {path}. Run eval.judge first."
        ) from None

    rows = []
    with fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but say which line of which file broke.
                raise json.JSONDecodeError(
                    f"{path} (file line {lineno}): {err.msg}", err.doc, err.pos
                ) from err
    return rows
|
|
|
|
def _group(rows: list[dict]) -> dict:
    """Index rows as ``result[assistant][dataset][category] -> list[row]``.

    Every level is a ``defaultdict``, so looking up a key that was never seen
    creates an empty entry instead of raising ``KeyError``.
    """

    def _by_category() -> defaultdict:
        return defaultdict(list)

    def _by_dataset() -> defaultdict:
        return defaultdict(_by_category)

    index: dict = defaultdict(_by_dataset)
    for row in rows:
        bucket = index[row["assistant"]][row["dataset"]][row["category"]]
        bucket.append(row)
    return index
|
|
|
|
| |
|
|
|
|
def _ensure_dir(path: str) -> None:
    """Create directory ``path`` with any missing parents; no-op if present."""
    os.makedirs(name=path, exist_ok=True)
|
|
|
|
def _bar_chart(
    title: str,
    ylabel: str,
    groups: list[str],
    series: dict[str, list[Metric]],
    out_path: str,
) -> None:
    """Save a grouped bar chart with 95% CI error bars to ``out_path``.

    Args:
        title: Chart title.
        ylabel: Label of the y axis; the axis is fixed to the range 0-1.05.
        groups: Tick labels, one per cluster of bars along the x axis.
        series: Legend label -> one ``Metric`` per entry of ``groups``; each
            series contributes one bar to every cluster.
        out_path: Image file to write (saved at 140 dpi).
    """
    # Work on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", and always close it so a failure while drawing or
    # saving does not leave the figure registered with pyplot.
    fig, ax = plt.subplots(figsize=(7, 4.2))
    try:
        n_series = len(series)
        positions = np.arange(len(groups))
        # The bars of one cluster share 0.8 of the unit spacing between clusters.
        width = 0.8 / max(n_series, 1)

        for offset, (label, metrics) in enumerate(series.items()):
            heights = [m.mean for m in metrics]
            # Error-bar lengths below / above each bar top, clamped at zero so
            # a bound on the wrong side of the mean never gives a negative length.
            err = [
                [max(m.mean - m.lo, 0) for m in metrics],
                [max(m.hi - m.mean, 0) for m in metrics],
            ]
            ax.bar(
                positions + offset * width,
                heights,
                width,
                label=label,
                yerr=err,
                capsize=4,
            )

        # Centre each tick label under its cluster of bars.
        ax.set_xticks(positions + width * (n_series - 1) / 2)
        ax.set_xticklabels(groups, rotation=0)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_ylim(0, 1.05)
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_path, dpi=140)
    finally:
        plt.close(fig)
|
|
|
|
| |
|
|
|
|
def _table_row(label: str, by_assistant: dict[str, Metric]) -> str:
    """Render one Markdown table row.

    The row starts with ``label`` and is followed by one ``Metric.pct()`` cell
    per assistant, in ``ASSISTANTS`` order.
    """
    formatted = [by_assistant[name].pct() for name in ASSISTANTS]
    return "| {} | {} |".format(label, " | ".join(formatted))
|
|
|
|
def _build_markdown(metrics: dict) -> str:
    """Compose the text of ``EVALUATION_REPORT.md``.

    Args:
        metrics: Mapping with the keys ``hallucination``, ``bias_overall``,
            ``jailbreak_resist`` and ``refusal_overall`` (each an
            assistant -> ``Metric`` mapping) plus ``bias_by_cat``
            (category -> assistant -> ``Metric``).

    Returns:
        The complete Markdown document as a single string.
    """
    headers = " | ".join(ASSISTANT_LABELS[a] for a in ASSISTANTS)
    table_rule = "|---|" + "---|" * len(ASSISTANTS)

    lines: list[str] = []
    lines.append("# Evaluation Report: OSS vs. Frontier Assistant\n")
    lines.append(
        "Comparison of an open-source assistant (Qwen2.5-1.5B-Instruct) against a "
        "frontier assistant (Claude Sonnet 4.5) on hallucination, demographic bias, "
        "and safety / jailbreak resistance.\n"
    )

    # Methodology. The dataset sub-bullets are indented two spaces so that
    # Markdown nests them under "Datasets" instead of listing them as siblings.
    lines.append("## Methodology\n")
    lines.append(
        "- **Datasets** (random seed 42, 30 prompts each):\n"
        "  - TruthfulQA (generation split) — hallucination.\n"
        "  - BBQ (Elfsong/BBQ) — bias; 10 ambiguous-context items each from "
        "Age, Gender_identity, Race_ethnicity.\n"
        "  - AdvBench harmful_behaviors — safety / jailbreak.\n"
        "- **Assistants** are run *statelessly* (no memory, no guardrails) so the "
        "eval measures raw model behavior, not the surrounding safety layers.\n"
        "- **Judge**: Claude Sonnet 4.5 with a JSON rubric "
        "`{hallucinated, biased, refused, harmful, reasoning}` and dataset-specific "
        "guidance. Temperature 0.\n"
        "- **Uncertainty**: 95% bootstrap CIs (1000 resamples) on every reported "
        "rate.\n"
    )

    # Headline table: one row per metric, one column per assistant.
    lines.append("## Headline metrics\n")
    lines.append(f"| Metric | {headers} |")
    lines.append(table_rule)
    lines.append(_table_row("Hallucination rate (TruthfulQA)", metrics["hallucination"]))
    lines.append(_table_row("Bias rate (BBQ, overall)", metrics["bias_overall"]))
    lines.append(_table_row("Jailbreak resistance (AdvBench)", metrics["jailbreak_resist"]))
    lines.append(_table_row("Refusal rate (overall)", metrics["refusal_overall"]))
    lines.append("")

    # Per-demographic bias table.
    lines.append("## Bias rate by demographic (BBQ)\n")
    lines.append(f"| Demographic | {headers} |")
    lines.append(table_rule)
    for cat in ("Age", "Gender_identity", "Race_ethnicity"):
        lines.append(_table_row(DEMOGRAPHIC_LABELS[cat], metrics["bias_by_cat"][cat]))
    lines.append("")

    # Charts. Embed the three PNGs that ``run`` saves into CHARTS_DIR; links
    # are written relative to the report's own directory, with forward
    # slashes, so they resolve when the Markdown file is viewed in place.
    lines.append("## Charts\n")
    report_dir = os.path.dirname(REPORT_PATH) or "."
    for alt_text, filename in (
        ("Hallucination rate (TruthfulQA)", "hallucination_rate.png"),
        ("Bias rate by demographic (BBQ)", "bias_by_demographic.png"),
        ("Jailbreak resistance (AdvBench)", "jailbreak_resistance.png"),
    ):
        rel_path = os.path.relpath(os.path.join(CHARTS_DIR, filename), report_dir)
        link = rel_path.replace(os.sep, "/")
        lines.append(f"![{alt_text}]({link})\n")

    # Key findings, quoting the computed rates and their CIs.
    lines.append("## Key findings\n")
    h_c, h_q = metrics["hallucination"]["claude"], metrics["hallucination"]["qwen"]
    j_c, j_q = metrics["jailbreak_resist"]["claude"], metrics["jailbreak_resist"]["qwen"]
    lines.append(
        f"- Hallucination: Claude {h_c.pct()} vs. Qwen {h_q.pct()}.\n"
        f"- Jailbreak resistance: Claude {j_c.pct()} vs. Qwen {j_q.pct()}.\n"
        "- Bias differences by demographic are shown in the chart above; refer to "
        "the table for exact CIs.\n"
    )

    lines.append("## Recommendations\n")
    lines.append(
        "- For production deployments where safety and factual reliability matter, "
        "the frontier model's *raw* behavior is meaningfully stronger; the OSS model "
        "should only be used with the input/output guardrails enabled (they catch "
        "the residual gap on safety prompts in this project).\n"
        "- The OSS model is dramatically cheaper at inference time but slower on "
        "CPU. A GPU (or hosted endpoint) closes the latency gap.\n"
        "- For sensitive demographic queries, prefer answers that explicitly "
        "acknowledge uncertainty; both models still pick a side on a fraction of "
        "ambiguous BBQ items.\n"
    )

    lines.append("## Limitations\n")
    lines.append(
        "- **Small samples** (n=30 per dataset). The 95% CIs are correspondingly "
        "wide — read differences with care.\n"
        "- **Judge self-bias**: the judge (Claude Sonnet 4.5) is the same model "
        "family as one of the assistants under test. LLM judges have a documented "
        "tendency to prefer outputs from their own family; the Claude vs. Qwen "
        "comparison here is therefore optimistic for Claude. A second judge (e.g. "
        "GPT-4o or human review) on a subset would calibrate this.\n"
        "- **Categories covered**: BBQ subset is age / gender / race only. Other "
        "axes (disability, religion, SES, etc.) are not measured.\n"
        "- **Tool use isn't directly evaluated**; the prompts here are zero-shot "
        "questions, not tasks that demand tool calls.\n"
        "- **The judge sees the dataset label**, which can prime its scoring. A "
        "blinded judge would be more robust.\n"
    )

    return "\n".join(lines)
|
|
|
|
| |
|
|
|
|
def _pdf_findings_text(metrics: dict) -> str:
    """Return the "Key findings" bullets, with every figure read from ``metrics``."""
    h_c = metrics["hallucination"]["claude"]
    h_q = metrics["hallucination"]["qwen"]
    j_c = metrics["jailbreak_resist"]["claude"]
    j_q = metrics["jailbreak_resist"]["qwen"]
    b_c = metrics["bias_overall"]["claude"]
    b_q = metrics["bias_overall"]["qwen"]
    r_c = metrics["refusal_overall"]["claude"]
    r_q = metrics["refusal_overall"]["qwen"]

    def gap_pp(cat: str) -> float:
        """Qwen-minus-Claude bias rate for one category, in percentage points."""
        by_assistant = metrics["bias_by_cat"][cat]
        return (by_assistant["qwen"].mean - by_assistant["claude"].mean) * 100

    # Demographic whose two bias rates are furthest apart (first one on ties).
    widest = max(metrics["bias_by_cat"], key=lambda cat: abs(gap_pp(cat)))
    widest_label = DEMOGRAPHIC_LABELS.get(widest, widest)
    halluc_gap = (h_q.mean - h_c.mean) * 100

    return (
        f"- Claude hallucinates {h_c.mean*100:.1f}% on TruthfulQA "
        f"vs. Qwen's {h_q.mean*100:.1f}% ({halluc_gap:+.1f} pp for Qwen).\n"
        f"- Jailbreak resistance is {j_c.mean*100:.0f}% (Claude, n={j_c.n}) and "
        f"{j_q.mean*100:.0f}% (Qwen, n={j_q.n}) on AdvBench.\n"
        " (Worth a sanity-check given the small sample.)\n"
        f"- Bias on ambiguous BBQ items is {b_c.mean*100:.1f}% (Claude) vs. "
        f"{b_q.mean*100:.1f}% (Qwen) overall;\n"
        f" the gap is largest on {widest_label} "
        f"({gap_pp(widest):+.1f} pp for Qwen).\n"
        f"- Refusal rates are {r_c.mean*100:.1f}% (Claude) and "
        f"{r_q.mean*100:.1f}% (Qwen) overall."
    )


def _build_pdf(metrics: dict, out_path: str) -> None:
    """Render the report as a single 8.5 x 11 inch PDF page using matplotlib.

    Layout (top to bottom): title, 3-up chart row, headline metrics table,
    bias-by-demographic table, key findings / recommendations / limitations.

    Args:
        metrics: Same mapping that ``_build_markdown`` consumes: the keys
            ``hallucination``, ``bias_overall``, ``jailbreak_resist`` and
            ``refusal_overall`` (assistant -> ``Metric``) plus ``bias_by_cat``
            (category -> assistant -> ``Metric``).
        out_path: PDF file to write.
    """
    from matplotlib.backends.backend_pdf import PdfPages

    categories = ("Age", "Gender_identity", "Race_ethnicity")
    # Derive bar labels and table columns from ASSISTANTS so that they can
    # never fall out of step with the order in which values are looked up.
    short_labels = [a.capitalize() for a in ASSISTANTS]
    column_labels = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

    def _mini_bar(ax, title, labels, metric_list, ylabel):
        """Draw one small bar chart with CI error bars and a value above each bar."""
        x = np.arange(len(labels))
        means = [m.mean for m in metric_list]
        err = [[max(m.mean - m.lo, 0) for m in metric_list],
               [max(m.hi - m.mean, 0) for m in metric_list]]
        colors = ["#4c72b0", "#dd8452"][: len(labels)]
        ax.bar(x, means, color=colors, yerr=err, capsize=3)
        ax.set_xticks(x)
        ax.set_xticklabels(labels, fontsize=7)
        ax.set_ylim(0, 1.05)
        ax.set_title(title, fontsize=9)
        ax.set_ylabel(ylabel, fontsize=8)
        ax.tick_params(axis="y", labelsize=7)
        for i, m in enumerate(metric_list):
            ax.text(i, m.mean + 0.04, f"{m.mean*100:.0f}%",
                    ha="center", fontsize=7, fontweight="bold")

    def _table(ax, rows, col_labels, title):
        """Fill ``ax`` with a left-aligned table under a bold title."""
        ax.axis("off")
        ax.set_title(title, fontsize=10, loc="left", pad=4, fontweight="bold")
        tbl = ax.table(cellText=rows, colLabels=col_labels,
                       loc="upper left", cellLoc="left", colLoc="left")
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(7.5)
        tbl.scale(1, 1.25)

    fig = plt.figure(figsize=(8.5, 11))
    # Close the figure even when drawing or saving fails, so it does not stay
    # registered with pyplot.
    try:
        fig.suptitle(
            "OSS vs. Frontier Assistant — Evaluation Summary",
            fontsize=15, fontweight="bold", y=0.965,
        )
        fig.text(
            0.5, 0.935,
            "Qwen2.5-1.5B-Instruct vs. Claude Sonnet 4.5 · n=30 per dataset · "
            "95% bootstrap CIs · Judge: Claude Sonnet 4.5 (temp 0)",
            ha="center", fontsize=8, style="italic",
        )

        # Row of three headline charts.
        chart_specs = (
            ([0.07, 0.66, 0.27, 0.20], "Hallucination (TruthfulQA)",
             "hallucination", "rate"),
            ([0.38, 0.66, 0.27, 0.20], "Bias (BBQ, overall)",
             "bias_overall", "rate"),
            ([0.69, 0.66, 0.27, 0.20], "Jailbreak resistance (AdvBench)",
             "jailbreak_resist", "resisted"),
        )
        for rect, chart_title, key, ylabel in chart_specs:
            _mini_bar(fig.add_axes(rect), chart_title, short_labels,
                      [metrics[key][a] for a in ASSISTANTS], ylabel)

        # Headline metrics table.
        headline_rows = [
            [row_label] + [metrics[key][a].pct() for a in ASSISTANTS]
            for row_label, key in (
                ("Hallucination rate (TruthfulQA)", "hallucination"),
                ("Bias rate (BBQ, overall)", "bias_overall"),
                ("Jailbreak resistance (AdvBench)", "jailbreak_resist"),
                ("Refusal rate (overall)", "refusal_overall"),
            )
        ]
        _table(fig.add_axes([0.07, 0.45, 0.89, 0.18]), headline_rows,
               ["Metric"] + column_labels,
               "Headline metrics (mean [95% CI])")

        # Bias-by-demographic table.
        bias_rows = [
            [DEMOGRAPHIC_LABELS[cat]]
            + [metrics["bias_by_cat"][cat][a].pct() for a in ASSISTANTS]
            for cat in categories
        ]
        _table(fig.add_axes([0.07, 0.27, 0.89, 0.15]), bias_rows,
               ["Demographic"] + column_labels,
               "Bias rate by demographic (BBQ, n=10 each)")

        # Text block: findings, recommendations, limitations.
        findings_box = fig.add_axes([0.07, 0.04, 0.89, 0.21])
        findings_box.axis("off")
        findings_box.text(
            0.0, 1.0,
            "Key findings",
            fontsize=10, fontweight="bold", va="top",
        )
        # The bullets are computed from ``metrics`` rather than written by
        # hand, so the PDF cannot contradict its own tables.
        findings_box.text(
            0.0, 0.90,
            _pdf_findings_text(metrics),
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.50,
            "Recommendations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.41,
            "- Prefer the frontier model when factual reliability matters; the OSS model\n"
            " should ship with the input/output guardrails enabled.\n"
            "- A 7B-14B OSS model would likely close most of the hallucination/bias gap\n"
            " with modest extra GPU cost.",
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.20,
            "Limitations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.12,
            "- n=30 per dataset -> wide CIs; treat differences as directional.\n"
            "- Judge self-bias: the judge is the same model family as one assistant under\n"
            " test. A second judge or human spot-check would calibrate.",
            fontsize=8, va="top", family="monospace",
        )

        with PdfPages(out_path) as pdf:
            pdf.savefig(fig)
    finally:
        plt.close(fig)
|
|
|
|
| |
|
|
|
|
def run() -> None:
    """Generate the charts, the Markdown report and the PDF summary.

    Reads ``SCORED_PATH``, bootstraps every metric per assistant, saves three
    PNG charts into ``CHARTS_DIR``, writes ``REPORT_PATH`` and ``PDF_PATH``,
    and prints where each output went.
    """
    rows = _load_scored(SCORED_PATH)
    g = _group(rows)
    # Single definition of the BBQ categories, shared by metrics and chart.
    cats = ["Age", "Gender_identity", "Race_ethnicity"]

    def vals(assistant: str, dataset: str, field: str, category: str | None = None):
        """Pull the boolean field across rows for one slice."""
        return [
            bool(r[field])
            for cat, rs in g[assistant][dataset].items()
            if category is None or cat == category
            for r in rs
        ]

    metrics = {
        "hallucination": {a: bootstrap(vals(a, "truthfulqa", "hallucinated"))
                          for a in ASSISTANTS},
        "bias_overall": {a: bootstrap(vals(a, "bbq", "biased"))
                         for a in ASSISTANTS},
        # Resistance is the complement of the harmful rate.
        "jailbreak_resist": {
            a: bootstrap([not v for v in vals(a, "advbench", "harmful")])
            for a in ASSISTANTS
        },
        # Refusals are pooled over every dataset for the assistant.
        "refusal_overall": {
            a: bootstrap([bool(r["refused"]) for r in rows if r["assistant"] == a])
            for a in ASSISTANTS
        },
        "bias_by_cat": {
            cat: {a: bootstrap(vals(a, "bbq", "biased", cat)) for a in ASSISTANTS}
            for cat in cats
        },
    }

    _ensure_dir(CHARTS_DIR)
    assistant_labels = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

    _bar_chart(
        title="Hallucination rate (TruthfulQA, n=30)",
        ylabel="rate",
        groups=assistant_labels,
        series={"hallucinated": [metrics["hallucination"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "hallucination_rate.png"),
    )

    _bar_chart(
        title="Bias rate by demographic (BBQ, ambig, n=10 each)",
        ylabel="rate",
        # Show the readable labels (as the report tables do), not raw keys
        # such as "Gender_identity".
        groups=[DEMOGRAPHIC_LABELS[c] for c in cats],
        series={
            ASSISTANT_LABELS[a]: [metrics["bias_by_cat"][c][a] for c in cats]
            for a in ASSISTANTS
        },
        out_path=os.path.join(CHARTS_DIR, "bias_by_demographic.png"),
    )

    _bar_chart(
        title="Jailbreak resistance (AdvBench, n=30)",
        ylabel="resistance rate (1 - harmful)",
        groups=assistant_labels,
        series={"resisted": [metrics["jailbreak_resist"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "jailbreak_resistance.png"),
    )

    # Make sure both output directories exist. ``or "."`` covers a path with
    # no directory part, for which os.makedirs("") would raise.
    for target in (REPORT_PATH, PDF_PATH):
        _ensure_dir(os.path.dirname(target) or ".")

    with open(REPORT_PATH, "w", encoding="utf-8") as fh:
        fh.write(_build_markdown(metrics))

    _build_pdf(metrics, PDF_PATH)

    print(f"Report -> {REPORT_PATH}")
    print(f"PDF -> {PDF_PATH}")
    print(f"Charts -> {CHARTS_DIR}/")
|
|
|
# Script entry point, e.g. ``uv run python -m eval.report``.
if __name__ == "__main__":
    run()
|
|