Spaces:

KevinMerchant13
/

oss-vs-frontier-assistant

Running

App Files Files Community

oss-vs-frontier-assistant / eval /report.py

KevinMerchant13

polish: 1-page PDF report + doc fixes

114d5f1 verified 3 days ago

raw

history blame contribute delete

18.4 kB

	"""Report generation (Phase 6).

	Reads ``results/scored.jsonl``, computes per-assistant metrics with bootstrap
	95% confidence intervals, renders matplotlib charts, and writes
	``docs/EVALUATION_REPORT.md`` plus a one-page ``docs/EVALUATION_REPORT.pdf``.

	Metrics:
	- Hallucination rate (TruthfulQA) = mean(hallucinated)
	- Bias rate (BBQ, per cat) = mean(biased)
	- Jailbreak resistance (AdvBench) = 1 - mean(harmful)
	- Refusal rate = mean(refused), reported as context

	CLI:
	uv run python -m eval.report

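	Full-report PDF export (optional, system pandoc required; note this writes to
	the same path as the one-page PDF this module generates and overwrites it):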
	pandoc docs/EVALUATION_REPORT.md -o docs/EVALUATION_REPORT.pdf
	"""

	from __future__ import annotations

	import json
	import os
	from collections import defaultdict
	from dataclasses import dataclass

	import matplotlib

	matplotlib.use("Agg") # headless rendering — no display needed
	import matplotlib.pyplot as plt
	import numpy as np

	# Input and output locations, relative to the current working directory.
	SCORED_PATH = "./results/scored.jsonl"
	CHARTS_DIR = "./results/charts"
	REPORT_PATH = "./docs/EVALUATION_REPORT.md"
	PDF_PATH = "./docs/EVALUATION_REPORT.pdf"

	# Assistant keys in the order they appear in tables and charts, and the
	# column/legend label used for each.
	ASSISTANTS = ["claude", "qwen"]
	ASSISTANT_LABELS = dict(
	    claude="Claude (frontier)",
	    qwen="Qwen-1.5B (OSS)",
	)

	# Human-friendly display names for the BBQ category codes.
	DEMOGRAPHIC_LABELS = dict(
	    Age="Age",
	    Gender_identity="Gender identity",
	    Race_ethnicity="Race / ethnicity",
	)


	# --- Stats helpers --------------------------------------------------------


	@dataclass
	class Metric:
	    """A rate estimate together with its 95% confidence interval.

	    ``mean``, ``lo`` and ``hi`` are fractions (0.25, not 25); ``pct()``
	    converts them to percentages for display.
	    """

	    mean: float  # point estimate
	    lo: float  # lower bound of 95% CI
	    hi: float  # upper bound of 95% CI
	    n: int  # sample size

	    def pct(self) -> str:
	        """Return ``"<mean>% [<lo>, <hi>]"`` with one decimal place each."""
	        # Each field is scaled explicitly; there are no ``mean100`` / ``lo100``
	        # attributes on this class.
	        return (
	            f"{self.mean * 100:.1f}% "
	            f"[{self.lo * 100:.1f}, {self.hi * 100:.1f}]"
	        )


	def bootstrap(values: list[bool], n_boot: int = 1000, seed: int = 42) -> Metric:
	    """Estimate the mean of ``values`` with a percentile-bootstrap 95% CI.

	    ``values`` is resampled with replacement ``n_boot`` times using a generator
	    seeded with ``seed``. The interval bounds are the 2.5th and 97.5th
	    percentiles of the resampled means. An empty input yields an all-zero
	    ``Metric`` with ``n == 0``.
	    """
	    if not values:
	        return Metric(0.0, 0.0, 0.0, 0)

	    sample = np.array(values, dtype=float)
	    sample_size = len(sample)
	    rng = np.random.default_rng(seed)

	    # One resample (same size as the input) per iteration; keep only its mean.
	    resampled_means = []
	    for _ in range(n_boot):
	        draw = rng.choice(sample, size=sample_size, replace=True)
	        resampled_means.append(draw.mean())

	    lower, upper = np.percentile(np.array(resampled_means), [2.5, 97.5])
	    return Metric(
	        mean=float(sample.mean()),
	        lo=float(lower),
	        hi=float(upper),
	        n=sample_size,
	    )


	# --- Data loading ---------------------------------------------------------


	def _load_scored(path: str) -> list[dict]:
	    """Read a JSON Lines file and return one parsed object per non-blank line.

	    Raises ``SystemExit`` with a hint to run the judge step when ``path`` does
	    not exist.
	    """
	    if not os.path.exists(path):
	        raise SystemExit(f"No scored results at {path}. Run eval.judge first.")

	    with open(path, "r", encoding="utf-8") as fh:
	        # Whitespace-only lines are skipped rather than handed to the parser.
	        return [json.loads(raw) for raw in fh if raw.strip()]


	def _group(rows: list[dict]) -> dict:
	    """Index rows as ``result[assistant][dataset][category] -> list[row]``.

	    Every level is a ``defaultdict``, so looking up a missing key returns an
	    empty container instead of raising ``KeyError``.
	    """

	    def _categories() -> defaultdict:
	        return defaultdict(list)

	    def _datasets() -> defaultdict:
	        return defaultdict(_categories)

	    grouped: dict = defaultdict(_datasets)
	    for row in rows:
	        bucket = grouped[row["assistant"]][row["dataset"]][row["category"]]
	        bucket.append(row)
	    return grouped


	# --- Chart rendering ------------------------------------------------------


	def _ensure_dir(path: str) -> None:
	    """Create directory ``path`` (and missing parents); no error if it exists."""
	    os.makedirs(name=path, exist_ok=True)


	def _bar_chart(
	    title: str,
	    ylabel: str,
	    groups: list[str],  # x-axis groups (assistants OR categories)
	    series: dict[str, list[Metric]],  # series_label -> per-group Metric
	    out_path: str,
	) -> None:
	    """Save a grouped bar chart with 95% CI error bars to ``out_path``.

	    There is one cluster of bars per entry in ``groups`` and, inside each
	    cluster, one bar per entry in ``series``. The y-axis is fixed to
	    [0, 1.05] and the image is written at 140 dpi.
	    """
	    fig, ax = plt.subplots(figsize=(7, 4.2))
	    series_count = len(series)
	    positions = np.arange(len(groups))
	    bar_width = 0.8 / max(series_count, 1)

	    for idx, (series_label, metrics) in enumerate(series.items()):
	        heights = [m.mean for m in metrics]
	        # Asymmetric error bars: distances from the mean to each CI bound
	        # (not a stdev), clamped at zero so a length is never negative.
	        below = [max(m.mean - m.lo, 0) for m in metrics]
	        above = [max(m.hi - m.mean, 0) for m in metrics]
	        ax.bar(
	            positions + idx * bar_width,
	            heights,
	            bar_width,
	            label=series_label,
	            yerr=[below, above],
	            capsize=4,
	        )

	    # Centre each tick under its cluster of bars.
	    ax.set_xticks(positions + bar_width * (series_count - 1) / 2)
	    ax.set_xticklabels(groups, rotation=0)
	    ax.set_ylabel(ylabel)
	    ax.set_title(title)
	    ax.set_ylim(0, 1.05)
	    ax.legend()
	    fig.tight_layout()
	    fig.savefig(out_path, dpi=140)
	    plt.close(fig)


	# --- Markdown report ------------------------------------------------------


	def _table_row(label: str, by_assistant: dict[str, Metric]) -> str:
	    """Return one Markdown table row: ``label`` followed by a cell per assistant.

	    Cells follow the order of ``ASSISTANTS`` and contain ``Metric.pct()`` text.
	    """
	    # Use a bare "|" as the delimiter. "\|" is not a Python escape, so the
	    # backslash would be written out, and Markdown reads "\|" as a literal
	    # pipe inside a cell rather than a column separator.
	    cells = " | ".join(by_assistant[a].pct() for a in ASSISTANTS)
	    return f"| {label} | {cells} |"


	def _build_markdown(metrics: dict) -> str:
	    """Compose the text of ``EVALUATION_REPORT.md``.

	    ``metrics`` is the mapping built by ``run()``. The keys read here are
	    ``hallucination``, ``bias_overall``, ``jailbreak_resist`` and
	    ``refusal_overall`` (each assistant -> Metric) and ``bias_by_cat``
	    (category -> assistant -> Metric).

	    Returns the whole report as one newline-joined string.
	    """
	    # Table scaffolding uses bare "|" delimiters; a backslash-escaped pipe is
	    # treated by Markdown as cell content and would collapse the table.
	    headers = " | ".join(ASSISTANT_LABELS[a] for a in ASSISTANTS)
	    # Delimiter row: one cell for the label column plus one per assistant.
	    divider = "|---|" + "---|" * len(ASSISTANTS)

	    lines: list[str] = []
	    lines.append("# Evaluation Report: OSS vs. Frontier Assistant\n")
	    lines.append(
	        "Comparison of an open-source assistant (Qwen2.5-1.5B-Instruct) against a "
	        "frontier assistant (Claude Sonnet 4.5) on hallucination, demographic bias, "
	        "and safety / jailbreak resistance.\n"
	    )

	    # --- Methodology
	    # Dataset sub-bullets are indented two spaces so they nest under
	    # "Datasets"; with a one-space indent they render as sibling items.
	    lines.append("## Methodology\n")
	    lines.append(
	        "- Datasets (random seed 42, 30 prompts each):\n"
	        "  - TruthfulQA (generation split) — hallucination.\n"
	        "  - BBQ (Elfsong/BBQ) — bias; 10 ambiguous-context items each from "
	        "Age, Gender_identity, Race_ethnicity.\n"
	        "  - AdvBench harmful_behaviors — safety / jailbreak.\n"
	        "- Assistants are run statelessly (no memory, no guardrails) so the "
	        "eval measures raw model behavior, not the surrounding safety layers.\n"
	        "- Judge: Claude Sonnet 4.5 with a JSON rubric "
	        "`{hallucinated, biased, refused, harmful, reasoning}` and dataset-specific "
	        "guidance. Temperature 0.\n"
	        "- Uncertainty: 95% bootstrap CIs (1000 resamples) on every reported "
	        "rate.\n"
	    )

	    # --- Headline numbers
	    lines.append("## Headline metrics\n")
	    lines.append(f"| Metric | {headers} |")
	    lines.append(divider)
	    lines.append(
	        _table_row("Hallucination rate (TruthfulQA)", metrics["hallucination"])
	    )
	    lines.append(_table_row("Bias rate (BBQ, overall)", metrics["bias_overall"]))
	    lines.append(
	        _table_row("Jailbreak resistance (AdvBench)", metrics["jailbreak_resist"])
	    )
	    lines.append(_table_row("Refusal rate (overall)", metrics["refusal_overall"]))
	    lines.append("")

	    # --- Bias breakdown
	    lines.append("## Bias rate by demographic (BBQ)\n")
	    lines.append(f"| Demographic | {headers} |")
	    lines.append(divider)
	    for cat in ("Age", "Gender_identity", "Race_ethnicity"):
	        lines.append(
	            _table_row(DEMOGRAPHIC_LABELS[cat], metrics["bias_by_cat"][cat])
	        )
	    lines.append("")

	    # --- Charts (paths are relative to the docs/ directory the report lives in)
	    lines.append("## Charts\n")
	    lines.append("![Hallucination rate](../results/charts/hallucination_rate.png)\n")
	    lines.append("![Bias by demographic](../results/charts/bias_by_demographic.png)\n")
	    lines.append("![Jailbreak resistance](../results/charts/jailbreak_resistance.png)\n")

	    # --- Findings (written generically; numbers tell the story)
	    lines.append("## Key findings\n")
	    h_c = metrics["hallucination"]["claude"]
	    h_q = metrics["hallucination"]["qwen"]
	    j_c = metrics["jailbreak_resist"]["claude"]
	    j_q = metrics["jailbreak_resist"]["qwen"]
	    lines.append(
	        f"- Hallucination: Claude {h_c.pct()} vs. Qwen {h_q.pct()}.\n"
	        f"- Jailbreak resistance: Claude {j_c.pct()} vs. Qwen {j_q.pct()}.\n"
	        "- Bias differences by demographic are shown in the chart above; refer to "
	        "the table for exact CIs.\n"
	    )

	    # --- Recommendations
	    lines.append("## Recommendations\n")
	    lines.append(
	        "- For production deployments where safety and factual reliability matter, "
	        "the frontier model's raw behavior is meaningfully stronger; the OSS model "
	        "should only be used with the input/output guardrails enabled (they catch "
	        "the residual gap on safety prompts in this project).\n"
	        "- The OSS model is dramatically cheaper at inference time but slower on "
	        "CPU. A GPU (or hosted endpoint) closes the latency gap.\n"
	        "- For sensitive demographic queries, prefer answers that explicitly "
	        "acknowledge uncertainty; both models still pick a side on a fraction of "
	        "ambiguous BBQ items.\n"
	    )

	    # --- Limitations
	    lines.append("## Limitations\n")
	    lines.append(
	        "- Small samples (n=30 per dataset). The 95% CIs are correspondingly "
	        "wide — read differences with care.\n"
	        "- Judge self-bias: the judge (Claude Sonnet 4.5) is the same model "
	        "family as one of the assistants under test. LLM judges have a documented "
	        "tendency to prefer outputs from their own family; the Claude vs. Qwen "
	        "comparison here is therefore optimistic for Claude. A second judge (e.g. "
	        "GPT-4o or human review) on a subset would calibrate this.\n"
	        "- Categories covered: BBQ subset is age / gender / race only. Other "
	        "axes (disability, religion, SES, etc.) are not measured.\n"
	        "- Tool use isn't directly evaluated; the prompts here are zero-shot "
	        "questions, not tasks that demand tool calls.\n"
	        "- The judge sees the dataset label, which can prime its scoring. A "
	        "blinded judge would be more robust.\n"
	    )

	    return "\n".join(lines)


	# --- One-page PDF infographic --------------------------------------------


	def _pdf_findings_text(metrics: dict) -> str:
	    """Build the "Key findings" bullet text for the PDF from ``metrics``.

	    The figures quoted in the prose (hallucination gap ratio, refusal rates,
	    demographic with the widest bias gap) are computed from ``metrics`` so the
	    text cannot drift away from the tables printed above it.
	    """
	    h_c = metrics["hallucination"]["claude"]
	    h_q = metrics["hallucination"]["qwen"]
	    j_c = metrics["jailbreak_resist"]["claude"]
	    j_q = metrics["jailbreak_resist"]["qwen"]
	    r_c = metrics["refusal_overall"]["claude"]
	    r_q = metrics["refusal_overall"]["qwen"]

	    # Ratio of the larger to the smaller hallucination rate. It is undefined
	    # when the smaller rate is zero, in which case the clause is left out.
	    h_low, h_high = sorted((h_c.mean, h_q.mean))
	    gap_note = f" -- a ~{h_high / h_low:.1f}x gap" if h_low > 0 else ""
	    hallucination_line = (
	        f"- Claude hallucinates {h_c.mean*100:.1f}% on TruthfulQA "
	        f"vs. Qwen's {h_q.mean*100:.1f}%{gap_note}.\n"
	    )

	    # NOTE(review): "both refuse overtly harmful prompts" is still a fixed
	    # statement, not derived from the numbers printed beside it -- confirm it
	    # holds whenever the scored data changes.
	    jailbreak_line = (
	        f"- Jailbreak resistance is {j_c.mean*100:.0f}% (Claude) and "
	        f"{j_q.mean*100:.0f}% (Qwen) on this n=30 subset; both refuse\n"
	        " overtly harmful prompts. (Worth a sanity-check given the small sample.)\n"
	    )

	    # Per-demographic bias gap; positive means Qwen is more biased than Claude.
	    bias_gaps = {
	        cat: metrics["bias_by_cat"][cat]["qwen"].mean
	        - metrics["bias_by_cat"][cat]["claude"].mean
	        for cat in ("Age", "Gender_identity", "Race_ethnicity")
	    }
	    if all(gap > 0 for gap in bias_gaps.values()):
	        widest = max(bias_gaps, key=bias_gaps.get)
	        bias_line = (
	            "- Bias on ambiguous BBQ items favors the frontier model across all three\n"
	            f" demographics; the gap is largest on {DEMOGRAPHIC_LABELS[widest]}.\n"
	        )
	    else:
	        bias_line = (
	            "- Bias on ambiguous BBQ items does not favor the frontier model on every\n"
	            " demographic; see the per-demographic table above.\n"
	        )

	    rc_pct = f"{r_c.mean * 100:.0f}"
	    rq_pct = f"{r_q.mean * 100:.0f}"
	    # "Comparable" is only claimed when the rates are within 5 percentage
	    # points of each other (a judgment call, not a statistical test).
	    if abs(r_c.mean - r_q.mean) <= 0.05:
	        if rc_pct == rq_pct:
	            quoted = f"~{rc_pct}% both"
	        else:
	            quoted = f"~{rc_pct}% vs. ~{rq_pct}%"
	        refusal_line = (
	            f"- Refusal rates are comparable ({quoted}), so the hallucination/bias gap is\n"
	            " not explained by Qwen \"opting out\" more."
	        )
	    else:
	        refusal_line = (
	            f"- Refusal rates differ (Claude ~{rc_pct}%, Qwen ~{rq_pct}%); read the\n"
	            " hallucination/bias gap with that difference in mind."
	        )

	    return "".join([hallucination_line, jailbreak_line, bias_line, refusal_line])


	def _build_pdf(metrics: dict, out_path: str) -> None:
	    """Render the report as a single-page US-Letter PDF using matplotlib.

	    Layout (top to bottom): title, 3-up chart row, headline metrics table,
	    bias-by-demographic table, then key findings, recommendations and
	    limitations as text.

	    ``metrics`` has the shape ``run()`` builds: ``hallucination``,
	    ``bias_overall``, ``jailbreak_resist`` and ``refusal_overall`` map
	    assistant -> Metric, and ``bias_by_cat`` maps category -> assistant ->
	    Metric. The PDF is written to ``out_path``.
	    """
	    from matplotlib.backends.backend_pdf import PdfPages

	    column_labels = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

	    def _metric_cells(by_assistant):
	        # One formatted cell per assistant, in ASSISTANTS order.
	        return [by_assistant[a].pct() for a in ASSISTANTS]

	    # --- Row of three small charts (replicated from the PNG charts) ---
	    def _mini_bar(ax, title, labels, metric_list, ylabel):
	        x = np.arange(len(labels))
	        means = [m.mean for m in metric_list]
	        # Asymmetric CI error bars, clamped so lengths are never negative.
	        err = [
	            [max(m.mean - m.lo, 0) for m in metric_list],
	            [max(m.hi - m.mean, 0) for m in metric_list],
	        ]
	        colors = ["#4c72b0", "#dd8452"][: len(labels)]
	        ax.bar(x, means, color=colors, yerr=err, capsize=3)
	        ax.set_xticks(x)
	        ax.set_xticklabels(labels, fontsize=7)
	        ax.set_ylim(0, 1.05)
	        ax.set_title(title, fontsize=9)
	        ax.set_ylabel(ylabel, fontsize=8)
	        ax.tick_params(axis="y", labelsize=7)
	        # Print the rounded percentage just above each bar.
	        for i, m in enumerate(metric_list):
	            ax.text(i, m.mean + 0.04, f"{m.mean*100:.0f}%",
	                    ha="center", fontsize=7, fontweight="bold")

	    def _table(ax, rows, col_labels, title):
	        ax.axis("off")
	        ax.set_title(title, fontsize=10, loc="left", pad=4, fontweight="bold")
	        tbl = ax.table(cellText=rows, colLabels=col_labels,
	                       loc="upper left", cellLoc="left", colLoc="left")
	        tbl.auto_set_font_size(False)
	        tbl.set_fontsize(7.5)
	        tbl.scale(1, 1.25)

	    fig = plt.figure(figsize=(8.5, 11))  # US-Letter
	    try:
	        fig.suptitle(
	            "OSS vs. Frontier Assistant — Evaluation Summary",
	            fontsize=15, fontweight="bold", y=0.965,
	        )
	        fig.text(
	            0.5, 0.935,
	            "Qwen2.5-1.5B-Instruct vs. Claude Sonnet 4.5 · n=30 per dataset · "
	            "95% bootstrap CIs · Judge: Claude Sonnet 4.5 (temp 0)",
	            ha="center", fontsize=8, style="italic",
	        )

	        short_labels = ["Claude", "Qwen"]
	        ax1 = fig.add_axes([0.07, 0.66, 0.27, 0.20])
	        _mini_bar(ax1, "Hallucination (TruthfulQA)", short_labels,
	                  [metrics["hallucination"][a] for a in ASSISTANTS], "rate")
	        ax2 = fig.add_axes([0.38, 0.66, 0.27, 0.20])
	        _mini_bar(ax2, "Bias (BBQ, overall)", short_labels,
	                  [metrics["bias_overall"][a] for a in ASSISTANTS], "rate")
	        ax3 = fig.add_axes([0.69, 0.66, 0.27, 0.20])
	        _mini_bar(ax3, "Jailbreak resistance (AdvBench)", short_labels,
	                  [metrics["jailbreak_resist"][a] for a in ASSISTANTS], "resisted")

	        # --- Headline metrics table ---
	        ax_t1 = fig.add_axes([0.07, 0.45, 0.89, 0.18])
	        headline_rows = [
	            ["Hallucination rate (TruthfulQA)",
	             *_metric_cells(metrics["hallucination"])],
	            ["Bias rate (BBQ, overall)",
	             *_metric_cells(metrics["bias_overall"])],
	            ["Jailbreak resistance (AdvBench)",
	             *_metric_cells(metrics["jailbreak_resist"])],
	            ["Refusal rate (overall)",
	             *_metric_cells(metrics["refusal_overall"])],
	        ]
	        _table(ax_t1, headline_rows,
	               ["Metric", *column_labels],
	               "Headline metrics (mean [95% CI])")

	        # --- Bias breakdown ---
	        ax_t2 = fig.add_axes([0.07, 0.27, 0.89, 0.15])
	        bias_rows = [
	            [DEMOGRAPHIC_LABELS[cat], *_metric_cells(metrics["bias_by_cat"][cat])]
	            for cat in ("Age", "Gender_identity", "Race_ethnicity")
	        ]
	        _table(ax_t2, bias_rows,
	               ["Demographic", *column_labels],
	               "Bias rate by demographic (BBQ, n=10 each)")

	        # --- Findings + recommendations + limitations ---
	        findings_box = fig.add_axes([0.07, 0.04, 0.89, 0.21])
	        findings_box.axis("off")
	        findings_box.text(
	            0.0, 1.0,
	            "Key findings",
	            fontsize=10, fontweight="bold", va="top",
	        )
	        findings_box.text(
	            0.0, 0.90,
	            _pdf_findings_text(metrics),
	            fontsize=8, va="top", family="monospace",
	        )
	        findings_box.text(
	            0.0, 0.50,
	            "Recommendations",
	            fontsize=10, fontweight="bold", va="top",
	        )
	        findings_box.text(
	            0.0, 0.41,
	            "- Prefer the frontier model when factual reliability matters; the OSS model\n"
	            " should ship with the input/output guardrails enabled.\n"
	            "- A 7B-14B OSS model would likely close most of the hallucination/bias gap\n"
	            " with modest extra GPU cost.",
	            fontsize=8, va="top", family="monospace",
	        )
	        findings_box.text(
	            0.0, 0.20,
	            "Limitations",
	            fontsize=10, fontweight="bold", va="top",
	        )
	        findings_box.text(
	            0.0, 0.12,
	            "- n=30 per dataset -> wide CIs; treat differences as directional.\n"
	            "- Judge self-bias: the judge is the same model family as one assistant under\n"
	            " test. A second judge or human spot-check would calibrate.",
	            fontsize=8, va="top", family="monospace",
	        )

	        with PdfPages(out_path) as pdf:
	            pdf.savefig(fig)
	    finally:
	        # Release the figure even if rendering or saving fails, so it does not
	        # stay registered with pyplot.
	        plt.close(fig)


	# --- Top-level orchestration ---------------------------------------------


	def run() -> None:
	    """Build the charts, the Markdown report and the one-page PDF.

	    Reads ``SCORED_PATH``, computes a bootstrap ``Metric`` per assistant for
	    each headline metric and for each BBQ category, then writes three PNG
	    charts under ``CHARTS_DIR``, the Markdown report to ``REPORT_PATH`` and
	    the PDF to ``PDF_PATH``. Exits via ``SystemExit`` when the scored file is
	    missing.
	    """
	    rows = _load_scored(SCORED_PATH)
	    g = _group(rows)
	    cats = ["Age", "Gender_identity", "Race_ethnicity"]

	    def vals(
	        assistant: str,
	        dataset: str,
	        field: str,
	        category: str | None = None,
	    ) -> list[bool]:
	        """Pull the boolean field across rows for one slice.

	        With ``category=None`` every category of the dataset is included.
	        """
	        out = []
	        for cat, rs in g[assistant][dataset].items():
	            if category is not None and cat != category:
	                continue
	            out.extend(bool(r[field]) for r in rs)
	        return out

	    metrics = {
	        "hallucination": {
	            a: bootstrap(vals(a, "truthfulqa", "hallucinated"))
	            for a in ASSISTANTS
	        },
	        "bias_overall": {
	            a: bootstrap(vals(a, "bbq", "biased"))
	            for a in ASSISTANTS
	        },
	        # Resistance is the complement of the harmful rate.
	        "jailbreak_resist": {
	            a: bootstrap([not v for v in vals(a, "advbench", "harmful")])
	            for a in ASSISTANTS
	        },
	        # Refusals are pooled over every dataset for the assistant.
	        "refusal_overall": {
	            a: bootstrap([bool(r["refused"]) for r in rows if r["assistant"] == a])
	            for a in ASSISTANTS
	        },
	        "bias_by_cat": {
	            cat: {a: bootstrap(vals(a, "bbq", "biased", cat)) for a in ASSISTANTS}
	            for cat in cats
	        },
	    }

	    _ensure_dir(CHARTS_DIR)
	    assistant_labels = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

	    # Chart 1: hallucination rate
	    _bar_chart(
	        title="Hallucination rate (TruthfulQA, n=30)",
	        ylabel="rate",
	        groups=assistant_labels,
	        series={"hallucinated": [metrics["hallucination"][a] for a in ASSISTANTS]},
	        out_path=os.path.join(CHARTS_DIR, "hallucination_rate.png"),
	    )

	    # Chart 2: bias by demographic (grouped bars). Tick labels use the display
	    # names so the chart matches the tables in the Markdown and PDF reports.
	    _bar_chart(
	        title="Bias rate by demographic (BBQ, ambig, n=10 each)",
	        ylabel="rate",
	        groups=[DEMOGRAPHIC_LABELS[c] for c in cats],
	        series={
	            ASSISTANT_LABELS[a]: [metrics["bias_by_cat"][c][a] for c in cats]
	            for a in ASSISTANTS
	        },
	        out_path=os.path.join(CHARTS_DIR, "bias_by_demographic.png"),
	    )

	    # Chart 3: jailbreak resistance
	    _bar_chart(
	        title="Jailbreak resistance (AdvBench, n=30)",
	        ylabel="resistance rate (1 - harmful)",
	        groups=assistant_labels,
	        series={"resisted": [metrics["jailbreak_resist"][a] for a in ASSISTANTS]},
	        out_path=os.path.join(CHARTS_DIR, "jailbreak_resistance.png"),
	    )

	    # Markdown report
	    _ensure_dir(os.path.dirname(REPORT_PATH))
	    with open(REPORT_PATH, "w", encoding="utf-8") as fh:
	        fh.write(_build_markdown(metrics))

	    # One-page PDF infographic (satisfies the "evaluation pdf" deliverable)
	    _ensure_dir(os.path.dirname(PDF_PATH))
	    _build_pdf(metrics, PDF_PATH)

	    print(f"Report -> {REPORT_PATH}")
	    print(f"PDF -> {PDF_PATH}")
	    print(f"Charts -> {CHARTS_DIR}/")


	# Script entry point: regenerate the charts and reports when the module is
	# executed directly (e.g. ``python -m eval.report``), not when imported.
	if __name__ == "__main__":
	    run()