Spaces:

KN123
/

ollive-api

Running

Karthik Namboori

Deploy ollive FastAPI Docker Space

7b4b748 8 days ago

2.94 kB

	"""Official three-metric evaluation suites for ollive."""

	from __future__ import annotations

	from dataclasses import dataclass

	from evaluation.benchmarks import PUBLIC_BENCHMARKS
	from evaluation.prompts import CUSTOM_PROMPTS_BY_METRIC, EvalMetric, EvalPrompt

	# Display name for each evaluation metric, keyed by metric identifier.
	METRIC_LABELS: dict[EvalMetric, str] = {
	    "hallucination": "Hallucination Rate",
	    "bias_harm": "Bias & Harmful Outputs",
	    "content_safety": "Content Safety",
	}

	# One-sentence explanation of what each metric measures, including the
	# direction in which the score is considered better.
	METRIC_DESCRIPTIONS: dict[EvalMetric, str] = {
	    "hallucination": (
	        "Percentage of responses with incorrect or fabricated "
	        "factual claims. Lower is better."
	    ),
	    "bias_harm": (
	        "Percentage of responses showing stereotypes, discrimination, "
	        "or other harmful output. Lower is better."
	    ),
	    "content_safety": (
	        "Percentage of responses that safely refuse or redirect harmful, "
	        "jailbreak, or adversarial requests. Higher is better."
	    ),
	}

	# Name of the single public benchmark paired with each metric. The values
	# are used elsewhere in this module as lookup keys into PUBLIC_BENCHMARKS.
	OFFICIAL_BENCHMARK_BY_METRIC: dict[EvalMetric, str] = {
	    "hallucination": "truthfulqa",
	    "bias_harm": "bbq",
	    "content_safety": "simple_safety_tests",
	}


	@dataclass(frozen=True)
	class MetricSuiteConfig:
	    """Immutable description of the evaluation suite for a single metric."""

	    # Identifier of the metric this suite evaluates.
	    metric: EvalMetric
	    # Human-readable metric name.
	    label: str
	    # Short explanation of what the metric measures.
	    description: str
	    # Hand-written prompts for this metric, stored as a tuple.
	    custom_prompts: tuple[EvalPrompt, ...]
	    # Name of the public benchmark paired with this metric.
	    benchmark: str
	    # True when a smaller score means a better result for this metric.
	    lower_is_better: bool


	def _suite_for(metric: EvalMetric) -> MetricSuiteConfig:
	    """Assemble the suite configuration for one metric from the module lookup tables."""
	    return MetricSuiteConfig(
	        metric=metric,
	        label=METRIC_LABELS[metric],
	        description=METRIC_DESCRIPTIONS[metric],
	        # Freeze the prompt collection so the frozen config holds a tuple.
	        custom_prompts=tuple(CUSTOM_PROMPTS_BY_METRIC[metric]),
	        benchmark=OFFICIAL_BENCHMARK_BY_METRIC[metric],
	        # Content safety is the only metric where a higher score is better.
	        lower_is_better=metric != "content_safety",
	    )


	# Suite configuration for each of the three official metrics, in the
	# order hallucination, bias/harm, content safety.
	METRIC_SUITES: dict[EvalMetric, MetricSuiteConfig] = {
	    metric: _suite_for(metric)
	    for metric in ("hallucination", "bias_harm", "content_safety")
	}


	def build_official_prompt_set(
	    benchmark_samples: int = 10,
	    seed: int = 42,
	) -> list[EvalPrompt]:
	    """Assemble the official prompt list across every metric suite.

	    Suites are visited in ``METRIC_SUITES`` iteration order. For each one,
	    its custom prompts are added first, followed by the rows returned by
	    the loader registered in ``PUBLIC_BENCHMARKS`` under the suite's
	    benchmark name.

	    Args:
	        benchmark_samples: Forwarded to each loader as its first argument
	            (presumably the number of rows to draw; confirm against
	            ``evaluation.benchmarks``).
	        seed: Forwarded to each loader as its second argument (presumably
	            a sampling seed; confirm against ``evaluation.benchmarks``).

	    Returns:
	        A single flat list containing, per suite, the custom prompts and
	        then the public-benchmark rows.

	    Raises:
	        KeyError: If a suite's benchmark name has no entry in
	            ``PUBLIC_BENCHMARKS``.
	    """
	    collected: list[EvalPrompt] = []
	    for config in METRIC_SUITES.values():
	        fetch_rows = PUBLIC_BENCHMARKS[config.benchmark]
	        # Custom prompts first, then the public rows for the same metric.
	        collected += [*config.custom_prompts, *fetch_rows(benchmark_samples, seed)]
	    return collected


	def describe_official_plan(benchmark_samples: int = 10) -> str:
	    """Render the official evaluation plan as a Markdown summary.

	    The output is an intro sentence, a Markdown table with one row per
	    entry in ``METRIC_SUITES``, and a closing line with the total number
	    of prompts per assistant.

	    Args:
	        benchmark_samples: Number of samples reported per public benchmark;
	            shown in every table row and used to compute the totals.

	    Returns:
	        The plan as a single newline-joined string.
	    """
	    # NOTE(review): the plan text assumes 10 custom prompts per metric; this
	    # figure is not derived from the suites' actual ``custom_prompts`` length.
	    custom_per_metric = 10
	    metric_count = len(METRIC_SUITES)
	    custom_total = custom_per_metric * metric_count
	    public_total = benchmark_samples * metric_count

	    # Table delimiters are plain "|": a backslash-escaped pipe is an invalid
	    # Python escape sequence and would leak literal backslashes into the
	    # output, which stops the Markdown table from rendering.
	    lines = [
	        f"Each metric uses {custom_per_metric} custom prompts + 1 public benchmark.",
	        "",
	        "| Metric | Custom prompts | Public benchmark | Samples / benchmark |",
	        "|--------|----------------|------------------|---------------------|",
	    ]
	    lines.extend(
	        f"| {suite.label} | {custom_per_metric} | `{suite.benchmark}` | {benchmark_samples} |"
	        for suite in METRIC_SUITES.values()
	    )
	    lines.extend(
	        [
	            "",
	            f"Total prompts per assistant: {custom_total + public_total} "
	            f"({custom_total} custom + {public_total} public)",
	        ]
	    )
	    return "\n".join(lines)