"""Official three-metric evaluation suites for ollive."""

from __future__ import annotations

from dataclasses import dataclass

from evaluation.benchmarks import PUBLIC_BENCHMARKS
from evaluation.prompts import CUSTOM_PROMPTS_BY_METRIC, EvalMetric, EvalPrompt

# Human-readable display name for each metric. Stored as the suite's
# ``label`` and rendered in the "Metric" column of the plan table.
METRIC_LABELS: dict[EvalMetric, str] = {
    "hallucination": "Hallucination Rate",
    "bias_harm": "Bias & Harmful Outputs",
    "content_safety": "Content Safety",
}

# Short explanation of what each metric measures and which direction is good.
# Note the directions differ: the first two are failure rates (lower is
# better), while content safety is a success rate (higher is better).
METRIC_DESCRIPTIONS: dict[EvalMetric, str] = {
    "hallucination": (
        "Percentage of responses with incorrect or fabricated factual claims. "
        "Lower is better."
    ),
    "bias_harm": (
        "Percentage of responses showing stereotypes, discrimination, or other harmful output. "
        "Lower is better."
    ),
    "content_safety": (
        "Percentage of responses that safely refuse or redirect harmful, jailbreak, "
        "or adversarial requests. Higher is better."
    ),
}

# The single public benchmark paired with each metric. Each value is used as
# a key into ``PUBLIC_BENCHMARKS`` to fetch that benchmark's loader.
OFFICIAL_BENCHMARK_BY_METRIC: dict[EvalMetric, str] = {
    "hallucination": "truthfulqa",
    "bias_harm": "bbq",
    "content_safety": "simple_safety_tests",
}


@dataclass(frozen=True)
class MetricSuiteConfig:
    """Immutable configuration describing how a single metric is evaluated.

    Bundles the display text, the custom prompts, and the public benchmark
    key for one metric so callers can iterate over suites uniformly.
    """

    # Identifier of the metric this suite evaluates.
    metric: EvalMetric
    # Human-readable name shown in reports.
    label: str
    # Explanation of what the metric measures.
    description: str
    # Hand-written prompts for this metric, stored as a tuple.
    custom_prompts: tuple[EvalPrompt, ...]
    # Key into ``PUBLIC_BENCHMARKS`` selecting the public benchmark loader.
    benchmark: str
    # True when a lower score is the better outcome for this metric.
    lower_is_better: bool


# One MetricSuiteConfig per metric, keyed by metric name. Each entry is
# assembled from the per-metric lookup tables (labels, descriptions,
# benchmarks) and the custom prompts imported from ``evaluation.prompts``.
# Dict order follows the tuple at the bottom of the comprehension.
METRIC_SUITES: dict[EvalMetric, MetricSuiteConfig] = {
    name: MetricSuiteConfig(
        metric=name,
        label=METRIC_LABELS[name],
        description=METRIC_DESCRIPTIONS[name],
        # Freeze the prompt collection so the frozen config holds a tuple.
        custom_prompts=tuple(CUSTOM_PROMPTS_BY_METRIC[name]),
        benchmark=OFFICIAL_BENCHMARK_BY_METRIC[name],
        # Content safety is the only metric where a higher score is better.
        lower_is_better=(name != "content_safety"),
    )
    for name in ("hallucination", "bias_harm", "content_safety")
}


def build_official_prompt_set(
    benchmark_samples: int = 10,
    seed: int = 42,
) -> list[EvalPrompt]:
    """Assemble the official prompt list across every metric suite.

    Walks ``METRIC_SUITES`` in order and, for each suite, appends its custom
    prompts followed by the rows produced by that suite's public benchmark
    loader (looked up in ``PUBLIC_BENCHMARKS``).

    Args:
        benchmark_samples: Forwarded as the first positional argument to each
            benchmark loader (presumably the number of rows to draw).
        seed: Forwarded as the second positional argument to each benchmark
            loader (presumably the sampling seed).

    Returns:
        A new flat list: custom prompts then benchmark rows, suite by suite.
    """
    collected: list[EvalPrompt] = []

    for config in METRIC_SUITES.values():
        # Custom prompts come first, then the public benchmark rows.
        collected += config.custom_prompts
        load_benchmark = PUBLIC_BENCHMARKS[config.benchmark]
        collected += load_benchmark(benchmark_samples, seed)

    return collected


def describe_official_plan(benchmark_samples: int = 10) -> str:
    """Render the official evaluation plan as a Markdown summary.

    Custom-prompt counts and the number of metrics are derived from
    ``METRIC_SUITES`` instead of being hard-coded, so the table and totals
    stay consistent with the suites actually configured. With three suites of
    ten custom prompts each, the output is unchanged from the hard-coded form.

    Args:
        benchmark_samples: Number of public-benchmark samples reported per
            metric.

    Returns:
        A Markdown string with an intro line, one table row per metric suite,
        and a total-prompts line.
    """
    suites = list(METRIC_SUITES.values())
    custom_counts = [len(suite.custom_prompts) for suite in suites]
    total_custom = sum(custom_counts)
    total_public = benchmark_samples * len(suites)

    # Quote a single per-metric figure only when every suite agrees on it;
    # otherwise the table rows carry the per-metric counts.
    if len(set(custom_counts)) == 1:
        custom_phrase = f"{custom_counts[0]} custom prompts"
    else:
        custom_phrase = "custom prompts"

    lines = [
        f"Each metric uses **{custom_phrase} + 1 public benchmark**.",
        "",
        "| Metric | Custom prompts | Public benchmark | Samples / benchmark |",
        "|--------|----------------|------------------|---------------------|",
    ]
    lines.extend(
        f"| {suite.label} | {count} | `{suite.benchmark}` | {benchmark_samples} |"
        for suite, count in zip(suites, custom_counts)
    )
    lines.extend(
        [
            "",
            f"**Total prompts per assistant:** {total_custom + total_public} "
            f"({total_custom} custom + {total_public} public)",
        ]
    )
    return "\n".join(lines)