# ollive-api / evaluation /suites.py
# Karthik Namboori
# Deploy ollive FastAPI Docker Space
# 7b4b748
"""Official three-metric evaluation suites for ollive."""
from __future__ import annotations
from dataclasses import dataclass
from evaluation.benchmarks import PUBLIC_BENCHMARKS
from evaluation.prompts import CUSTOM_PROMPTS_BY_METRIC, EvalMetric, EvalPrompt
# Display name for each metric key; MetricSuiteConfig.label is filled from here.
METRIC_LABELS: dict[EvalMetric, str] = {
    "hallucination": "Hallucination Rate",
    "bias_harm": "Bias & Harmful Outputs",
    "content_safety": "Content Safety",
}
# One-sentence explanation of what each metric measures, ending with the
# direction in which the score is considered good.
METRIC_DESCRIPTIONS: dict[EvalMetric, str] = {
    # Rate of factually wrong answers.
    "hallucination": (
        "Percentage of responses with incorrect or fabricated factual "
        "claims. Lower is better."
    ),
    # Rate of biased / harmful answers.
    "bias_harm": (
        "Percentage of responses showing stereotypes, discrimination, "
        "or other harmful output. Lower is better."
    ),
    # Rate of safe refusals; the only metric where a higher value is better.
    "content_safety": (
        "Percentage of responses that safely refuse or redirect harmful, "
        "jailbreak, or adversarial requests. Higher is better."
    ),
}
# The single public benchmark paired with each metric. The values are used as
# keys into PUBLIC_BENCHMARKS when the official prompt set is built.
OFFICIAL_BENCHMARK_BY_METRIC: dict[EvalMetric, str] = {
    "hallucination": "truthfulqa",
    "bias_harm": "bbq",
    "content_safety": "simple_safety_tests",
}
@dataclass(frozen=True)
class MetricSuiteConfig:
    """Immutable bundle of everything that defines one metric's suite.

    Instances are frozen, so attributes cannot be reassigned after
    construction. Field order is part of the constructor signature and
    must not be rearranged.
    """

    # Metric key this suite belongs to (e.g. "hallucination").
    metric: EvalMetric
    # Short display name for the metric.
    label: str
    # Longer explanation of what the metric measures.
    description: str
    # Hand-written prompts for this metric, stored as a tuple.
    custom_prompts: tuple[EvalPrompt, ...]
    # Name of the public benchmark; looked up in PUBLIC_BENCHMARKS.
    benchmark: str
    # True when a smaller score is the desirable outcome for this metric.
    lower_is_better: bool
# Registry of the three official suites, keyed by metric and built once at
# import time from the per-metric lookup tables. Insertion order follows the
# tuple below and is the order in which suites are later iterated.
METRIC_SUITES: dict[EvalMetric, MetricSuiteConfig] = {
    metric_key: MetricSuiteConfig(
        metric=metric_key,
        label=METRIC_LABELS[metric_key],
        description=METRIC_DESCRIPTIONS[metric_key],
        # Frozen into a tuple so the config holds an immutable sequence.
        custom_prompts=tuple(CUSTOM_PROMPTS_BY_METRIC[metric_key]),
        benchmark=OFFICIAL_BENCHMARK_BY_METRIC[metric_key],
        # Content safety is the only metric scored as "higher is better".
        lower_is_better=metric_key != "content_safety",
    )
    for metric_key in ("hallucination", "bias_harm", "content_safety")
}
def build_official_prompt_set(
    benchmark_samples: int = 10,
    seed: int = 42,
) -> list[EvalPrompt]:
    """Assemble the combined prompt list for all official metric suites.

    Suites are visited in ``METRIC_SUITES`` order. For each one, all of its
    custom prompts are appended first, followed by the rows produced by the
    loader registered in ``PUBLIC_BENCHMARKS`` under the suite's benchmark
    name.

    Args:
        benchmark_samples: Forwarded as the first positional argument to each
            benchmark loader (presumably the number of rows to draw; confirm
            against ``evaluation.benchmarks``).
        seed: Forwarded as the second positional argument to each loader
            (presumably the sampling seed; confirm against the loaders).

    Returns:
        A new list holding custom prompts and benchmark rows, grouped by
        suite.
    """
    collected: list[EvalPrompt] = []
    for config in METRIC_SUITES.values():
        # Custom prompts come before the public benchmark rows of a suite.
        collected += config.custom_prompts
        collected += PUBLIC_BENCHMARKS[config.benchmark](benchmark_samples, seed)
    return collected
def describe_official_plan(benchmark_samples: int = 10) -> str:
    """Return a Markdown summary of the official evaluation plan.

    The summary has a headline sentence, a table with one row per suite in
    ``METRIC_SUITES``, and a closing line with the total prompt count.

    Custom-prompt counts and the number of suites are read from
    ``METRIC_SUITES`` rather than hard-coded, so the text stays in step with
    the prompts that are actually configured. With three suites of ten custom
    prompts each, the output is identical to the previous hard-coded wording.

    Args:
        benchmark_samples: Number of public-benchmark samples reported per
            suite. The public total assumes each loader yields exactly this
            many rows (not verifiable from this module).

    Returns:
        The plan as a newline-joined Markdown string.
    """
    suites = list(METRIC_SUITES.values())
    custom_counts = [len(suite.custom_prompts) for suite in suites]
    custom_total = sum(custom_counts)
    public_total = benchmark_samples * len(suites)

    # Headline shows a single number when every suite has the same amount of
    # custom prompts, otherwise a "low-high" range.
    fewest = min(custom_counts, default=0)
    most = max(custom_counts, default=0)
    per_metric = str(fewest) if fewest == most else f"{fewest}-{most}"

    lines = [
        f"Each metric uses **{per_metric} custom prompts + 1 public benchmark**.",
        "",
        "| Metric | Custom prompts | Public benchmark | Samples / benchmark |",
        "|--------|----------------|------------------|---------------------|",
    ]
    lines.extend(
        f"| {suite.label} | {count} | `{suite.benchmark}` | {benchmark_samples} |"
        for suite, count in zip(suites, custom_counts)
    )
    lines.extend(
        [
            "",
            f"**Total prompts per assistant:** {custom_total + public_total} "
            f"({custom_total} custom + {public_total} public)",
        ]
    )
    return "\n".join(lines)