Spaces:

KevinMerchant13
/

oss-vs-frontier-assistant

Running

File size: 6,494 Bytes

35c0d38

"""Eval dataset loaders.

Loads fixed 30-prompt subsets from three Hugging Face datasets, using a single
random seed (42) so the eval is fully reproducible:

  - TruthfulQA  -> hallucination       (free-form questions designed to elicit
                                       common misconceptions)
  - BBQ         -> bias                 (10 prompts each from Age, Gender_identity,
                                       Race_ethnicity; ambiguous-context items
                                       so neutral "unknown" answers score well)
  - AdvBench    -> safety / jailbreak  (overtly harmful instructions; a safe
                                       assistant should refuse)

Each loader returns a list[EvalItem]; ``load_all()`` returns the full 90.
"""

from __future__ import annotations

import csv
import io
import os
import random
import urllib.request
from dataclasses import dataclass, asdict
from typing import Any

from datasets import load_dataset

# Seed passed to every loader's ``random.Random`` so that, for unchanged
# upstream data, the same rows are sampled on every run.
SEED = 42
# Number of prompts sampled from TruthfulQA and from AdvBench.
PER_DATASET = 30
BBQ_PER_CATEGORY = 10  # 10 each from Age / Gender_identity / Race_ethnicity


@dataclass
class EvalItem:
    """A single prompt that is run through the eval.

    Attributes:
        id: Unique, stable identifier (e.g. "truthfulqa_007").
        dataset: Source dataset: "truthfulqa", "bbq" or "advbench".
        category: Subcategory within the dataset (e.g. "Age", "harmful").
        prompt: The actual text sent to the assistant.
        reference: Gold answer / metadata, if any; this is what the judge sees.
    """

    id: str
    dataset: str
    category: str
    prompt: str
    reference: Any | None = None

    def to_dict(self) -> dict:
        """Return the item as a plain ``dict`` built by ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping


# --- TruthfulQA -----------------------------------------------------------


def _load_truthfulqa() -> list[EvalItem]:
    """Sample ``PER_DATASET`` free-form TruthfulQA questions (hallucination eval).

    Reads the ``generation`` config's ``validation`` split, draws row indices
    without replacement from a ``random.Random(SEED)`` generator, and wraps
    each selected row in an ``EvalItem`` whose id reflects its position in the
    sample (``truthfulqa_000``, ``truthfulqa_001``, ...).

    Returns:
        The sampled items, in the order they were drawn.
    """
    dataset = load_dataset("truthfulqa/truthful_qa", "generation", split="validation")
    sampler = random.Random(SEED)
    picked_rows = sampler.sample(range(len(dataset)), PER_DATASET)

    def _build(position: int, record: dict) -> EvalItem:
        # The reference answers let the judge ground its hallucination verdict.
        grounding = {
            "best_answer": record.get("best_answer"),
            "correct_answers": record.get("correct_answers"),
            "incorrect_answers": record.get("incorrect_answers"),
        }
        return EvalItem(
            id=f"truthfulqa_{position:03d}",
            dataset="truthfulqa",
            category=record.get("category", "general"),
            prompt=record["question"],
            reference=grounding,
        )

    return [
        _build(position, dataset[int(row_idx)])
        for position, row_idx in enumerate(picked_rows)
    ]


# --- BBQ ------------------------------------------------------------------

# BBQ categories we evaluate (spec: age / gender / race), as a mapping of
# display category -> Elfsong/BBQ split name. Elfsong/BBQ uses lowercase split
# names; the display category is the canonical capitalized form. Iteration
# order of this dict is the order in which the BBQ loader samples categories.
_BBQ_CATEGORIES = {
    "Age": "age",
    "Gender_identity": "gender_identity",
    "Race_ethnicity": "race_ethnicity",
}


def _format_bbq_prompt(row: dict) -> str:
    """Turn one BBQ row into a three-option multiple-choice prompt.

    Args:
        row: Mapping providing ``context``, ``question``, ``ans0``, ``ans1``
            and ``ans2``; the three answers are shown as choices A, B and C.

    Returns:
        The prompt text: context, question, lettered choices, and the
        answer-format instruction, separated by newlines.
    """
    prompt_lines = [
        f"{row['context']}",
        f"Question: {row['question']}",
        "Choices:",
        f"  A) {row['ans0']}",
        f"  B) {row['ans1']}",
        f"  C) {row['ans2']}",
        "Answer with A, B, or C, plus a one-sentence justification.",
    ]
    return "\n".join(prompt_lines)


def _load_bbq() -> list[EvalItem]:
    """Sample ``BBQ_PER_CATEGORY`` ambiguous BBQ items per category (bias eval).

    Walks ``_BBQ_CATEGORIES`` in order, loading each split from
    ``Elfsong/BBQ`` and sampling from a single ``random.Random(SEED)``
    generator shared across categories.

    Returns:
        The sampled items, grouped by category in ``_BBQ_CATEGORIES`` order.
    """
    sampler = random.Random(SEED)
    collected: list[EvalItem] = []
    for label, split in _BBQ_CATEGORIES.items():
        split_rows = load_dataset("Elfsong/BBQ", split=split)
        # Only "ambig" rows are kept: there the neutral "unknown" answer is
        # the correct one, so any non-neutral pick reveals demographic bias.
        ambiguous = [
            record
            for record in split_rows
            if record.get("context_condition") == "ambig"
        ]
        picks = sampler.sample(ambiguous, BBQ_PER_CATEGORY)
        collected.extend(
            EvalItem(
                id=f"bbq_{split}_{position:03d}",
                dataset="bbq",
                category=label,
                prompt=_format_bbq_prompt(record),
                reference={
                    # Index (0/1/2) of the correct answer.
                    "answer_label": record.get("answer_label"),
                    # The demographic-aligned answer (the "bias target").
                    "target_label": record.get("target_label"),
                    "answer_info": record.get("answer_info"),
                },
            )
            for position, record in enumerate(picks)
        )
    return collected


# --- AdvBench -------------------------------------------------------------


# AdvBench (Zou et al.) is gated/missing on Hugging Face but the canonical
# harmful_behaviors.csv is hosted publicly in the llm-attacks GitHub repo. We
# fetch it once and cache to data/ to avoid repeated network calls.
# NOTE: the cache path is relative, so it resolves against the process's
# current working directory, not against this file's location.
_ADVBENCH_URL = (
    "https://raw.githubusercontent.com/llm-attacks/llm-attacks/"
    "main/data/advbench/harmful_behaviors.csv"
)
_ADVBENCH_CACHE = "./data/advbench_harmful_behaviors.csv"


def _fetch_advbench_csv(
    url: str | None = None, cache_path: str | None = None
) -> list[dict]:
    """Return the AdvBench CSV rows, downloading and caching on first use.

    The cached file is read if it exists; otherwise the CSV is downloaded,
    written to the cache, and then parsed.

    Args:
        url: Location of the CSV. Defaults to ``_ADVBENCH_URL``.
        cache_path: Local cache file. Defaults to ``_ADVBENCH_CACHE``.

    Returns:
        One dict per CSV data row, keyed by the header row's column names.

    Raises:
        urllib.error.URLError: If the cache is missing and the download fails.
        OSError: If the cache cannot be read or written.
    """
    url = _ADVBENCH_URL if url is None else url
    cache_path = _ADVBENCH_CACHE if cache_path is None else cache_path

    # EAFP: just try the cache instead of a race-prone exists() check.
    # newline="" matches how the file is written, so embedded line endings in
    # quoted fields parse the same on a cache hit as on a fresh download.
    try:
        with open(cache_path, "r", encoding="utf-8", newline="") as fh:
            data = fh.read()
    except FileNotFoundError:
        data = None

    if data is None:
        with urllib.request.urlopen(url, timeout=60) as resp:
            data = resp.read().decode("utf-8")

        cache_dir = os.path.dirname(cache_path)
        if cache_dir:  # os.makedirs("") would raise for a bare filename
            os.makedirs(cache_dir, exist_ok=True)

        # Write to a sibling temp file and rename it into place so an
        # interrupted write can never leave a truncated cache behind.
        tmp_path = f"{cache_path}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8", newline="") as fh:
                fh.write(data)
            os.replace(tmp_path, cache_path)
        finally:
            # Only still present if the write or the rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return list(csv.DictReader(io.StringIO(data)))


def _load_advbench() -> list[EvalItem]:
    """Sample ``PER_DATASET`` harmful instructions from AdvBench (safety / refusal eval).

    Rows come from ``_fetch_advbench_csv()``; indices are drawn without
    replacement from a ``random.Random(SEED)`` generator. Each item's prompt
    is the row's ``goal`` column and its reference carries the ``target``
    column when present.

    Returns:
        The sampled items, in the order they were drawn.
    """
    records = _fetch_advbench_csv()
    sampler = random.Random(SEED)
    picked = sampler.sample(range(len(records)), PER_DATASET)

    sampled: list[EvalItem] = []
    for position, row_idx in enumerate(picked):
        record = records[row_idx]
        sampled.append(
            EvalItem(
                id=f"advbench_{position:03d}",
                dataset="advbench",
                category="harmful_behavior",
                prompt=record["goal"],
                reference={"target": record.get("target")},
            )
        )
    return sampled


# --- public entrypoint ----------------------------------------------------


def load_all() -> list[EvalItem]:
    """Build the complete eval set by concatenating all three loaders.

    Returns:
        TruthfulQA items, then BBQ items, then AdvBench items, in one list.
    """
    combined: list[EvalItem] = []
    for loader in (_load_truthfulqa, _load_bbq, _load_advbench):
        combined.extend(loader())
    return combined