Spaces:

KevinMerchant13
/

oss-vs-frontier-assistant

Sleeping

File size: 18,432 Bytes

"""Report generation (Phase 6).

Reads ``results/scored.jsonl``, computes per-assistant metrics with bootstrap
95% confidence intervals, renders matplotlib charts, and writes
``docs/EVALUATION_REPORT.md`` plus a one-page summary at
``docs/EVALUATION_REPORT.pdf``.

Metrics:
  - Hallucination rate   (TruthfulQA)   = mean(hallucinated)
  - Bias rate            (BBQ, per cat) = mean(biased)
  - Jailbreak resistance (AdvBench)     = 1 - mean(harmful)
  - Refusal rate                        = mean(refused), reported as context

CLI:
    uv run python -m eval.report

Full-report PDF via pandoc (optional, system pandoc required; note that this
overwrites the one-page PDF this module writes to the same path):
    pandoc docs/EVALUATION_REPORT.md -o docs/EVALUATION_REPORT.pdf
"""

from __future__ import annotations

import json
import os
from collections import defaultdict
from dataclasses import dataclass

import matplotlib

matplotlib.use("Agg")  # headless rendering — no display needed
import matplotlib.pyplot as plt
import numpy as np

# Filesystem locations, all relative to the current working directory.
SCORED_PATH = "./results/scored.jsonl"  # input: one JSON object per line
CHARTS_DIR = "./results/charts"  # output: PNG charts
REPORT_PATH = "./docs/EVALUATION_REPORT.md"  # output: Markdown report
PDF_PATH = "./docs/EVALUATION_REPORT.pdf"  # output: one-page PDF summary

# Assistant keys as matched against the ``assistant`` field of scored rows.
# The list order fixes the column / bar order in the tables and charts.
ASSISTANTS = ["claude", "qwen"]
# Display names used for table headers and chart labels.
ASSISTANT_LABELS = {"claude": "Claude (frontier)", "qwen": "Qwen-1.5B (OSS)"}

# Human-friendly display names for the BBQ category codes.
DEMOGRAPHIC_LABELS = {
    "Age": "Age",
    "Gender_identity": "Gender identity",
    "Race_ethnicity": "Race / ethnicity",
}


# --- Stats helpers --------------------------------------------------------


@dataclass
class Metric:
    """A point estimate of a rate together with its 95% confidence interval.

    ``mean``, ``lo`` and ``hi`` are stored as fractions; ``pct`` scales them
    by 100 for display.
    """

    mean: float  # point estimate
    lo: float  # lower edge of the 95% CI
    hi: float  # upper edge of the 95% CI
    n: int  # number of observations behind the estimate

    def pct(self) -> str:
        """Format as ``"<mean>% [<lo>, <hi>]"``, one decimal place each."""
        scaled = tuple(part * 100 for part in (self.mean, self.lo, self.hi))
        return "{:.1f}% [{:.1f}, {:.1f}]".format(*scaled)


def bootstrap(values: list[bool], n_boot: int = 1000, seed: int = 42) -> Metric:
    """Estimate the mean of boolean outcomes with a percentile-bootstrap 95% CI.

    Draws ``n_boot`` resamples (with replacement, same size as ``values``) from
    a generator seeded with ``seed`` and reports the 2.5th / 97.5th percentiles
    of the resample means. An empty input yields an all-zero ``Metric`` (n=0).
    """
    if not values:
        return Metric(0.0, 0.0, 0.0, 0)

    observed = np.array(values, dtype=float)
    generator = np.random.default_rng(seed)

    # Resamples are drawn one after another from the same generator, so the
    # interval is reproducible for a given seed.
    resample_means = []
    for _ in range(n_boot):
        draw = generator.choice(observed, size=len(observed), replace=True)
        resample_means.append(draw.mean())
    distribution = np.array(resample_means)

    lower, upper = (float(np.percentile(distribution, q)) for q in (2.5, 97.5))
    return Metric(mean=float(observed.mean()), lo=lower, hi=upper, n=len(observed))


# --- Data loading ---------------------------------------------------------


def _load_scored(path: str) -> list[dict]:
    """Read the scored JSONL file at ``path`` into a list of parsed objects.

    Whitespace-only lines are skipped. Exits via ``SystemExit`` with a hint
    when the file does not exist.
    """
    if not os.path.exists(path):
        raise SystemExit(f"No scored results at {path}. Run eval.judge first.")
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]


def _group(rows: list[dict]) -> dict:
    """Bucket rows as ``result[assistant][dataset][category] -> list[row]``.

    All three levels are ``defaultdict``s, so indexing a missing key creates
    an empty entry instead of raising ``KeyError``.
    """

    def by_category() -> defaultdict:
        return defaultdict(list)

    def by_dataset() -> defaultdict:
        return defaultdict(by_category)

    buckets: dict = defaultdict(by_dataset)
    for row in rows:
        assistant, dataset, category = row["assistant"], row["dataset"], row["category"]
        buckets[assistant][dataset][category].append(row)
    return buckets


# --- Chart rendering ------------------------------------------------------


def _ensure_dir(path: str) -> None:
    """Create directory ``path`` (and any missing parents) if it is not already there."""
    os.makedirs(path, exist_ok=True)


def _bar_chart(
    title: str,
    ylabel: str,
    groups: list[str],            # x-axis groups (assistants OR categories)
    series: dict[str, list[Metric]],  # series_label -> per-group Metric
    out_path: str,
) -> None:
    """Save a grouped bar chart with 95% CI error bars to ``out_path``.

    Args:
        title: Chart title.
        ylabel: Y-axis label.
        groups: Tick labels, one per x-axis group.
        series: Maps a legend label to one ``Metric`` per entry of ``groups``;
            each series contributes one bar to every group.
        out_path: Image file to write (rendered at 140 dpi).

    The y-axis is fixed to [0, 1.05], so metrics are expected to be fractions.
    Drawing uses an explicit Figure/Axes pair rather than pyplot's implicit
    "current figure", and the figure is closed even when saving fails so a
    failed render does not leave an open figure behind.
    """
    fig, ax = plt.subplots(figsize=(7, 4.2))
    try:
        n_series = len(series)
        x = np.arange(len(groups))
        # Split 80% of each group's unit-wide slot evenly between the series.
        width = 0.8 / max(n_series, 1)
        for i, (label, metrics) in enumerate(series.items()):
            means = [m.mean for m in metrics]
            # asymmetric error bars (CI bounds, not stdev); clamped at 0 so a
            # bound on the wrong side of the mean never gives a negative length
            err = [
                [max(m.mean - m.lo, 0) for m in metrics],
                [max(m.hi - m.mean, 0) for m in metrics],
            ]
            ax.bar(x + i * width, means, width, label=label, yerr=err, capsize=4)
        # Centre each tick under its cluster of bars.
        ax.set_xticks(x + width * (n_series - 1) / 2)
        ax.set_xticklabels(groups, rotation=0)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_ylim(0, 1.05)
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_path, dpi=140)
    finally:
        plt.close(fig)


# --- Markdown report ------------------------------------------------------


def _table_row(label: str, by_assistant: dict[str, Metric]) -> str:
    """Render one Markdown table row: ``label`` then one cell per assistant.

    Cells follow the order of ``ASSISTANTS`` and hold ``Metric.pct()`` text.
    """
    formatted = [by_assistant[name].pct() for name in ASSISTANTS]
    return "| {} | {} |".format(label, " | ".join(formatted))


def _build_markdown(metrics: dict) -> str:
    """Compose the EVALUATION_REPORT.md text.

    Args:
        metrics: Nested dict of ``Metric`` values. Keys read here are
            ``hallucination``, ``bias_overall``, ``jailbreak_resist`` and
            ``refusal_overall`` (each keyed by assistant), plus
            ``bias_by_cat`` (keyed by BBQ category code, then assistant).

    Returns:
        The full Markdown document as a single string.

    NOTE(review): the methodology / limitations prose hard-codes the sample
    sizes (30 per dataset, 10 per BBQ category), the seed and the model names;
    none of it is derived from ``metrics``, so it has to be kept in sync with
    the evaluation setup by hand.
    """
    M = metrics  # alias for brevity
    headers = " | ".join(ASSISTANT_LABELS[a] for a in ASSISTANTS)

    lines: list[str] = []
    lines.append("# Evaluation Report: OSS vs. Frontier Assistant\n")
    lines.append(
        "Comparison of an open-source assistant (Qwen2.5-1.5B-Instruct) against a "
        "frontier assistant (Claude Sonnet 4.5) on hallucination, demographic bias, "
        "and safety / jailbreak resistance.\n"
    )

    # --- Methodology
    lines.append("## Methodology\n")
    lines.append(
        "- **Datasets** (random seed 42, 30 prompts each):\n"
        "  - TruthfulQA (generation split) — hallucination.\n"
        "  - BBQ (Elfsong/BBQ) — bias; 10 ambiguous-context items each from "
        "Age, Gender_identity, Race_ethnicity.\n"
        "  - AdvBench harmful_behaviors — safety / jailbreak.\n"
        "- **Assistants** are run *statelessly* (no memory, no guardrails) so the "
        "eval measures raw model behavior, not the surrounding safety layers.\n"
        "- **Judge**: Claude Sonnet 4.5 with a JSON rubric "
        "`{hallucinated, biased, refused, harmful, reasoning}` and dataset-specific "
        "guidance. Temperature 0.\n"
        "- **Uncertainty**: 95% bootstrap CIs (1000 resamples) on every reported "
        "rate.\n"
    )

    # --- Headline numbers
    lines.append("## Headline metrics\n")
    lines.append(f"| Metric | {headers} |")
    # separator row: one cell for the label column plus one per assistant
    lines.append("|---|" + "---|" * len(ASSISTANTS))
    lines.append(_table_row("Hallucination rate (TruthfulQA)", M["hallucination"]))
    lines.append(_table_row("Bias rate (BBQ, overall)",        M["bias_overall"]))
    lines.append(_table_row("Jailbreak resistance (AdvBench)", M["jailbreak_resist"]))
    lines.append(_table_row("Refusal rate (overall)",          M["refusal_overall"]))
    lines.append("")

    # --- Bias breakdown
    lines.append("## Bias rate by demographic (BBQ)\n")
    lines.append(f"| Demographic | {headers} |")
    lines.append("|---|" + "---|" * len(ASSISTANTS))
    for cat in ("Age", "Gender_identity", "Race_ethnicity"):
        lines.append(_table_row(DEMOGRAPHIC_LABELS[cat], M["bias_by_cat"][cat]))
    lines.append("")

    # --- Charts
    # Image paths are relative to the report's own location (docs/), hence the
    # leading "../" to reach results/charts.
    lines.append("## Charts\n")
    lines.append("![Hallucination rate](../results/charts/hallucination_rate.png)\n")
    lines.append("![Bias by demographic](../results/charts/bias_by_demographic.png)\n")
    lines.append("![Jailbreak resistance](../results/charts/jailbreak_resistance.png)\n")

    # --- Findings (written generically; numbers tell the story)
    lines.append("## Key findings\n")
    h_c, h_q = M["hallucination"]["claude"], M["hallucination"]["qwen"]
    j_c, j_q = M["jailbreak_resist"]["claude"], M["jailbreak_resist"]["qwen"]
    lines.append(
        f"- Hallucination: Claude {h_c.pct()} vs. Qwen {h_q.pct()}.\n"
        f"- Jailbreak resistance: Claude {j_c.pct()} vs. Qwen {j_q.pct()}.\n"
        "- Bias differences by demographic are shown in the chart above; refer to "
        "the table for exact CIs.\n"
    )

    # --- Recommendations
    lines.append("## Recommendations\n")
    lines.append(
        "- For production deployments where safety and factual reliability matter, "
        "the frontier model's *raw* behavior is meaningfully stronger; the OSS model "
        "should only be used with the input/output guardrails enabled (they catch "
        "the residual gap on safety prompts in this project).\n"
        "- The OSS model is dramatically cheaper at inference time but slower on "
        "CPU. A GPU (or hosted endpoint) closes the latency gap.\n"
        "- For sensitive demographic queries, prefer answers that explicitly "
        "acknowledge uncertainty; both models still pick a side on a fraction of "
        "ambiguous BBQ items.\n"
    )

    # --- Limitations
    lines.append("## Limitations\n")
    lines.append(
        "- **Small samples** (n=30 per dataset). The 95% CIs are correspondingly "
        "wide — read differences with care.\n"
        "- **Judge self-bias**: the judge (Claude Sonnet 4.5) is the same model "
        "family as one of the assistants under test. LLM judges have a documented "
        "tendency to prefer outputs from their own family; the Claude vs. Qwen "
        "comparison here is therefore optimistic for Claude. A second judge (e.g. "
        "GPT-4o or human review) on a subset would calibrate this.\n"
        "- **Categories covered**: BBQ subset is age / gender / race only. Other "
        "axes (disability, religion, SES, etc.) are not measured.\n"
        "- **Tool use isn't directly evaluated**; the prompts here are zero-shot "
        "questions, not tasks that demand tool calls.\n"
        "- **The judge sees the dataset label**, which can prime its scoring. A "
        "blinded judge would be more robust.\n"
    )

    # Entries that already end in "\n" therefore get a blank line after them.
    return "\n".join(lines)


# --- One-page PDF infographic --------------------------------------------


def _build_pdf(metrics: dict, out_path: str) -> None:
    """Render the report as a single-page A4-ish PDF using matplotlib.

    Layout (top to bottom): title, 3-up chart row, headline metrics table,
    bias-by-demographic table, key findings + limitations text block.

    Args:
        metrics: Nested dict of ``Metric`` values. Keys read here are
            ``hallucination``, ``bias_overall``, ``jailbreak_resist`` and
            ``refusal_overall`` (each keyed by assistant), plus
            ``bias_by_cat`` (keyed by BBQ category code, then assistant).
        out_path: Destination PDF file.

    Every number in the "Key findings" text is formatted from ``metrics``, so
    the summary cannot drift from the tables when the scored data changes.
    Table columns follow ``ASSISTANTS`` / ``ASSISTANT_LABELS``. The figure is
    closed even if rendering or saving fails.

    NOTE(review): the subtitle and the bias-table title still hard-code the
    sample sizes (n=30 per dataset, n=10 per category).
    """
    from matplotlib.backends.backend_pdf import PdfPages

    categories = ("Age", "Gender_identity", "Race_ethnicity")
    assistant_columns = [ASSISTANT_LABELS[a] for a in ASSISTANTS]

    # --- Helper: one small bar chart (replicated from the PNG charts) ---
    def _mini_bar(ax, title, labels, metric_list, ylabel):
        x = np.arange(len(labels))
        means = [m.mean for m in metric_list]
        # asymmetric CI error bars, clamped so a length is never negative
        err = [[max(m.mean - m.lo, 0) for m in metric_list],
               [max(m.hi - m.mean, 0) for m in metric_list]]
        colors = ["#4c72b0", "#dd8452"][: len(labels)]
        ax.bar(x, means, color=colors, yerr=err, capsize=3)
        ax.set_xticks(x)
        ax.set_xticklabels(labels, fontsize=7)
        ax.set_ylim(0, 1.05)
        ax.set_title(title, fontsize=9)
        ax.set_ylabel(ylabel, fontsize=8)
        ax.tick_params(axis="y", labelsize=7)
        # percentage label just above each bar
        for i, m in enumerate(metric_list):
            ax.text(i, m.mean + 0.04, f"{m.mean*100:.0f}%",
                    ha="center", fontsize=7, fontweight="bold")

    # --- Helper: a borderless axes holding one table ---
    def _table(ax, rows, col_labels, title):
        ax.axis("off")
        ax.set_title(title, fontsize=10, loc="left", pad=4, fontweight="bold")
        tbl = ax.table(cellText=rows, colLabels=col_labels,
                       loc="upper left", cellLoc="left", colLoc="left")
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(7.5)
        tbl.scale(1, 1.25)

    # --- Helper: label followed by each assistant's "mean% [lo, hi]" cell ---
    def _row(label, by_assistant):
        return [label] + [by_assistant[a].pct() for a in ASSISTANTS]

    fig = plt.figure(figsize=(8.5, 11))   # US-Letter
    try:
        fig.suptitle(
            "OSS vs. Frontier Assistant — Evaluation Summary",
            fontsize=15, fontweight="bold", y=0.965,
        )
        fig.text(
            0.5, 0.935,
            "Qwen2.5-1.5B-Instruct vs. Claude Sonnet 4.5 · n=30 per dataset · "
            "95% bootstrap CIs · Judge: Claude Sonnet 4.5 (temp 0)",
            ha="center", fontsize=8, style="italic",
        )

        # --- Row of three small charts ---
        short_labels = ["Claude", "Qwen"]
        ax1 = fig.add_axes([0.07, 0.66, 0.27, 0.20])
        _mini_bar(ax1, "Hallucination (TruthfulQA)", short_labels,
                  [metrics["hallucination"][a] for a in ASSISTANTS], "rate")
        ax2 = fig.add_axes([0.38, 0.66, 0.27, 0.20])
        _mini_bar(ax2, "Bias (BBQ, overall)", short_labels,
                  [metrics["bias_overall"][a] for a in ASSISTANTS], "rate")
        ax3 = fig.add_axes([0.69, 0.66, 0.27, 0.20])
        _mini_bar(ax3, "Jailbreak resistance (AdvBench)", short_labels,
                  [metrics["jailbreak_resist"][a] for a in ASSISTANTS], "resisted")

        # --- Headline metrics table ---
        ax_t1 = fig.add_axes([0.07, 0.45, 0.89, 0.18])
        headline_rows = [
            _row("Hallucination rate (TruthfulQA)", metrics["hallucination"]),
            _row("Bias rate (BBQ, overall)", metrics["bias_overall"]),
            _row("Jailbreak resistance (AdvBench)", metrics["jailbreak_resist"]),
            _row("Refusal rate (overall)", metrics["refusal_overall"]),
        ]
        _table(ax_t1, headline_rows,
               ["Metric"] + assistant_columns,
               "Headline metrics  (mean [95% CI])")

        # --- Bias breakdown ---
        ax_t2 = fig.add_axes([0.07, 0.27, 0.89, 0.15])
        bias_rows = [
            _row(DEMOGRAPHIC_LABELS[cat], metrics["bias_by_cat"][cat])
            for cat in categories
        ]
        _table(ax_t2, bias_rows,
               ["Demographic"] + assistant_columns,
               "Bias rate by demographic (BBQ, n=10 each)")

        # --- Findings + limitations ---
        findings_box = fig.add_axes([0.07, 0.04, 0.89, 0.21])
        findings_box.axis("off")
        findings_box.text(
            0.0, 1.0,
            "Key findings",
            fontsize=10, fontweight="bold", va="top",
        )
        h_c = metrics["hallucination"]["claude"]
        h_q = metrics["hallucination"]["qwen"]
        j_c = metrics["jailbreak_resist"]["claude"]
        j_q = metrics["jailbreak_resist"]["qwen"]
        b_c = metrics["bias_overall"]["claude"]
        b_q = metrics["bias_overall"]["qwen"]
        r_c = metrics["refusal_overall"]["claude"]
        r_q = metrics["refusal_overall"]["qwen"]
        # Only statements backed by the values in ``metrics`` are made here;
        # comparative conclusions are left to the tables and their CIs.
        findings_box.text(
            0.0, 0.90,
            f"- Hallucination (TruthfulQA): Claude {h_c.mean*100:.1f}% vs. "
            f"Qwen {h_q.mean*100:.1f}%.\n"
            f"- Jailbreak resistance (AdvBench): Claude {j_c.mean*100:.0f}% vs. "
            f"Qwen {j_q.mean*100:.0f}%.\n"
            "  (Worth a sanity-check given the small sample.)\n"
            f"- Bias (BBQ, overall): Claude {b_c.mean*100:.1f}% vs. "
            f"Qwen {b_q.mean*100:.1f}%; see the\n"
            "  per-demographic table above for the breakdown.\n"
            f"- Refusal rate (overall): Claude {r_c.mean*100:.0f}% vs. "
            f"Qwen {r_q.mean*100:.0f}%.",
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.50,
            "Recommendations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.41,
            "- Prefer the frontier model when factual reliability matters; the OSS model\n"
            "  should ship with the input/output guardrails enabled.\n"
            "- A 7B-14B OSS model would likely close most of the hallucination/bias gap\n"
            "  with modest extra GPU cost.",
            fontsize=8, va="top", family="monospace",
        )
        findings_box.text(
            0.0, 0.20,
            "Limitations",
            fontsize=10, fontweight="bold", va="top",
        )
        findings_box.text(
            0.0, 0.12,
            "- n=30 per dataset -> wide CIs; treat differences as directional.\n"
            "- Judge self-bias: the judge is the same model family as one assistant under\n"
            "  test. A second judge or human spot-check would calibrate.",
            fontsize=8, va="top", family="monospace",
        )

        with PdfPages(out_path) as pdf:
            pdf.savefig(fig)
    finally:
        plt.close(fig)


# --- Top-level orchestration ---------------------------------------------


def run() -> None:
    """Build every report artifact from the scored results.

    Loads ``SCORED_PATH``, computes each metric with ``bootstrap``, writes the
    three PNG charts into ``CHARTS_DIR``, then the Markdown report and the
    one-page PDF, and finally prints where the outputs were written.

    Raises:
        SystemExit: if the scored results file does not exist.
    """
    rows = _load_scored(SCORED_PATH)
    g = _group(rows)

    # BBQ category codes, defined once so the metrics and the chart agree.
    cats = ["Age", "Gender_identity", "Race_ethnicity"]

    def vals(assistant: str, dataset: str, field: str, category: str | None = None):
        """Pull the boolean field across rows for one slice."""
        out = []
        for cat, rs in g[assistant][dataset].items():
            if category is not None and cat != category:
                continue
            out.extend(bool(r[field]) for r in rs)
        return out

    metrics = {
        "hallucination": {a: bootstrap(vals(a, "truthfulqa", "hallucinated"))
                          for a in ASSISTANTS},
        "bias_overall":  {a: bootstrap(vals(a, "bbq", "biased"))
                          for a in ASSISTANTS},
        # resistance is the complement of the harmful rate
        "jailbreak_resist": {
            a: bootstrap([not v for v in vals(a, "advbench", "harmful")])
            for a in ASSISTANTS
        },
        # refusals are pooled over every dataset, not just AdvBench
        "refusal_overall": {
            a: bootstrap([bool(r["refused"]) for r in rows if r["assistant"] == a])
            for a in ASSISTANTS
        },
        "bias_by_cat": {
            cat: {a: bootstrap(vals(a, "bbq", "biased", cat)) for a in ASSISTANTS}
            for cat in cats
        },
    }

    _ensure_dir(CHARTS_DIR)

    # Chart 1: hallucination rate
    _bar_chart(
        title="Hallucination rate (TruthfulQA, n=30)",
        ylabel="rate",
        groups=[ASSISTANT_LABELS[a] for a in ASSISTANTS],
        series={"hallucinated": [metrics["hallucination"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "hallucination_rate.png"),
    )

    # Chart 2: bias by demographic (grouped bars). The axis shows the
    # human-friendly names rather than the raw BBQ category codes.
    _bar_chart(
        title="Bias rate by demographic (BBQ, ambig, n=10 each)",
        ylabel="rate",
        groups=[DEMOGRAPHIC_LABELS[c] for c in cats],
        series={
            ASSISTANT_LABELS[a]: [metrics["bias_by_cat"][c][a] for c in cats]
            for a in ASSISTANTS
        },
        out_path=os.path.join(CHARTS_DIR, "bias_by_demographic.png"),
    )

    # Chart 3: jailbreak resistance
    _bar_chart(
        title="Jailbreak resistance (AdvBench, n=30)",
        ylabel="resistance rate (1 - harmful)",
        groups=[ASSISTANT_LABELS[a] for a in ASSISTANTS],
        series={"resisted": [metrics["jailbreak_resist"][a] for a in ASSISTANTS]},
        out_path=os.path.join(CHARTS_DIR, "jailbreak_resistance.png"),
    )

    # Markdown report
    os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)
    with open(REPORT_PATH, "w", encoding="utf-8") as fh:
        fh.write(_build_markdown(metrics))

    # One-page PDF infographic (satisfies the "evaluation pdf" deliverable)
    _build_pdf(metrics, PDF_PATH)

    print(f"Report   -> {REPORT_PATH}")
    print(f"PDF      -> {PDF_PATH}")
    print(f"Charts   -> {CHARTS_DIR}/")


# Script entry point: generate the report when the module is executed directly.
if __name__ == "__main__":
    run()