Spaces:

DGXAI
/

driftcall

Runtime error

File size: 12,657 Bytes

b43d8da

"""Cell 18 — Baseline evaluation harness.

Implements ``docs/modules/evaluation.md`` §1, §2, §3.1–§3.3, §3.8, §4 and
§5 for the baseline (untrained Gemma 3n E2B) eval path.

Hard rules (evaluation.md §3.1, §3.2, §6.3):
- Greedy decoding (``temperature=0.0``); ``num_generations=1``;
  ``model.eval()`` + ``torch.no_grad()`` semantics asserted at entry.
- Per-episode env seed = ``hash((episode_id, "eval")) & 0xFFFFFFFF``.
- 50 held-out val episodes (rows ``[0:50]`` of ``val/briefs.jsonl``) — file
  order, no shuffling.
- Bootstrap CI (percentile method) at ``n_boot=10_000``, ``rng_seed=20260426``
  (paired-difference uses ``20260428``).
- No LLM-as-judge; static AST scan via ``_NO_LLM_JUDGE_FORBIDDEN_IMPORTS``.
- Wall-clock ceiling 20 minutes (``EvalBudgetExceededError`` on overrun).

This module deliberately does **not** import ``torch`` at module load. The
training-eval delegate is injected via ``run_eval(..., training_eval=...)``
(or ``eval_baseline(..., training_eval=...)``) so unit tests can stub model
inference (CUDA-free CI per training_tests.md §5.3).
"""

from __future__ import annotations

import math
import time
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Literal, Protocol

if TYPE_CHECKING:  # pragma: no cover - typing only
    from collections.abc import Callable, Sequence
    from pathlib import Path


# Public API of this module. ``CatalogueHashMismatchError`` is exported with
# the other error types because ``run_eval`` can raise it (through the
# catalogue-hash check), so callers need to be able to import and catch it.
__all__ = [
    "BUDGET_RUN_EVAL_SECONDS",
    "DEFAULT_BOOTSTRAP_SEED",
    "DEFAULT_PAIRED_BOOTSTRAP_SEED",
    "CatalogueHashMismatchError",
    "DriftDetectionLatency",
    "EvalBudgetExceededError",
    "EvalModelLoadError",
    "EvalReport",
    "EvaluationError",
    "PerLanguageReport",
    "TrainingEvalCallable",
    "ZeroSuccessBaselineWarning",
    "bootstrap_ci",
    "compute_episode_seed",
    "eval_baseline",
    "run_eval",
]


# ---------------------------------------------------------------------------
# Constants — evaluation.md §2.4, §3.8
# ---------------------------------------------------------------------------


# RNG seed ``bootstrap_ci`` uses when the caller does not pass ``rng_seed``.
DEFAULT_BOOTSTRAP_SEED: int = 20260426
# Not referenced by the code visible here; presumably the seed for
# probe-metric CIs — TODO confirm against evaluation.md.
DEFAULT_PROBE_BOOTSTRAP_SEED: int = 20260427
# Seed for paired-difference bootstraps (per the module docstring); not
# referenced by the code visible here.
DEFAULT_PAIRED_BOOTSTRAP_SEED: int = 20260428
# Number of bootstrap resamples ``bootstrap_ci`` draws by default.
DEFAULT_N_BOOT: int = 10_000

# 20 minutes expressed in seconds; default ``budget_seconds`` of ``run_eval``
# and ``eval_baseline``.
BUDGET_RUN_EVAL_SECONDS: int = 20 * 60
"""Hard ceiling on ``run_eval`` (50 episodes) — evaluation.md §3.8."""

# Forbidden imports inside any evaluation/scoring path (evaluation.md §6.3).
# Nothing visible in this module consults the set; the module docstring
# describes a static AST scan that uses it.
_NO_LLM_JUDGE_FORBIDDEN_IMPORTS: frozenset[str] = frozenset(
    {"openai", "anthropic", "vertexai", "google.generativeai", "cohere"},
)

# Language codes; the same members as the ``Literal`` on
# ``PerLanguageReport.language``. Not referenced by the code visible here.
_LANGUAGE_CODES: tuple[str, ...] = ("hi", "ta", "kn", "en", "hinglish")


# ---------------------------------------------------------------------------
# Errors / warnings — evaluation.md §5
# ---------------------------------------------------------------------------


class EvaluationError(Exception):
    """Root for every evaluation-specific error (evaluation.md §5).

    Raised directly by ``run_eval`` when ``episodes != 50`` and by
    ``_validate_briefs_first_50`` when fewer than 50 briefs are supplied.
    Catching this type also catches the more specific subclasses defined in
    this module.
    """


class EvalModelLoadError(EvaluationError):
    """Adapter load / merge failure surfaced by the training-eval delegate.

    No code visible in this module raises it; ``run_eval`` lets it propagate
    unchanged from the injected ``training_eval`` callable.
    """


class EvalBudgetExceededError(EvaluationError):
    """Wall-clock budget for an entry point exceeded (evaluation.md §3.8, §5).

    Raised by ``run_eval`` after the delegate has returned, when the measured
    elapsed time is greater than ``budget_seconds``.
    """


class CatalogueHashMismatchError(EvaluationError):
    """Loaded catalogue hashes do not match the BriefRow's declared hashes.

    Raised by ``_check_catalogue_hashes`` for the first row/field whose
    declared hash differs from the loaded one. Fields where either side has
    no hash are skipped rather than reported.
    """


class ZeroSuccessBaselineWarning(UserWarning):
    """All 50 baseline R1 == 0.0 → degenerate CI; warn rather than raise.

    Subclasses ``UserWarning`` (not ``EvaluationError``), so it can be
    filtered or escalated with the standard ``warnings`` machinery.
    """


# ---------------------------------------------------------------------------
# EvalReport family — re-exported for downstream cells (evaluation.md §4)
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class PerLanguageReport:
    """Per-language cohort means (training.md §4.2).

    Immutable record; ``EvalReport.per_language`` holds a tuple of these.
    """

    # Language code of the cohort; same set of codes as ``_LANGUAGE_CODES``.
    language: Literal["hi", "ta", "kn", "en", "hinglish"]
    # Presumably the number of episodes in this cohort — TODO confirm.
    n_episodes: int
    # Cohort means of the overall reward and of components R1..R5. The
    # component definitions live outside this module.
    reward_mean: float
    r1_mean: float
    r2_mean: float
    r3_mean: float
    r4_mean: float
    r5_mean: float


@dataclass(frozen=True)
class DriftDetectionLatency:
    """Drift-detection latency aggregated by stage (training.md §4.2).

    Immutable record stored on ``EvalReport.drift_detection_latency``.
    NOTE(review): the latency unit (turns vs. seconds) is not stated in this
    module — confirm against training.md §4.2.
    """

    # Mean / median / 95th-percentile latency for stage 2.
    stage2_mean: float
    stage2_median: float
    stage2_p95: float
    # Same three aggregates for stage 3.
    stage3_mean: float
    stage3_median: float
    stage3_p95: float
    # Presumably the number of drifts that were never detected — TODO confirm.
    undetected_count: int


@dataclass(frozen=True)
class EvalReport:
    """Result of ``run_eval`` — paired across baseline and final (training.md §4.2).

    Instances are produced by the training-eval delegate. ``run_eval`` returns
    a copy whose ``breakdown`` has extra bookkeeping keys added. The dataclass
    is frozen, but the ``dict`` fields themselves remain mutable objects.
    """

    # Identifier of the evaluated model; ``run_eval`` compares it with the
    # string "base" to recognise a baseline report.
    model_path: str
    n_episodes: int
    # Three-float summaries per reward. ``run_eval`` reads index 0 of
    # ``r1_mean_ci`` as the mean; the layout is presumably ``(mean, lo, hi)``
    # as returned by ``bootstrap_ci`` — TODO confirm with the delegate.
    reward_mean_ci: tuple[float, float, float]
    r1_mean_ci: tuple[float, float, float]
    r2_mean_ci: tuple[float, float, float]
    r3_mean_ci: tuple[float, float, float]
    r4_mean_ci: tuple[float, float, float]
    r5_mean_ci: tuple[float, float, float]
    # Scalar diagnostics computed by the delegate; their definitions are not
    # visible in this module.
    brier_mean: float
    floor_applied_rate: float
    hallucinated_field_rate: float
    # Maps a string key to an integer count; presumably offense name → number
    # of occurrences — TODO confirm.
    reward_hacking_offenses: dict[str, int]
    drift_detection_latency: DriftDetectionLatency
    per_language: tuple[PerLanguageReport, ...]
    # Named series of (int, float) points; defaults to an empty dict.
    curves: dict[str, tuple[tuple[int, float], ...]] = field(default_factory=dict)
    # Free-form extras. ``run_eval`` fills in "episode_ids",
    # "wall_clock_seconds" and "sampling_policy" when absent, and sets
    # "ci_undefined_rewards" for a zero-R1 baseline.
    breakdown: dict[str, Any] = field(default_factory=dict)


# ---------------------------------------------------------------------------
# Training-eval delegate Protocol — evaluation.md §6.1
# ---------------------------------------------------------------------------


class TrainingEvalCallable(Protocol):
    """Signature of ``training.train.eval`` — the heavy-lifting delegate.

    ``run_eval`` calls an object of this shape exactly once and returns its
    ``EvalReport`` with extra ``breakdown`` entries. Because this is a
    ``Protocol``, any callable with a matching signature satisfies it, which
    is what lets tests inject a stub.
    """

    def __call__(
        self,
        # Forwarded unchanged from ``run_eval``'s ``model_path`` argument.
        model_path: Path | Literal["base"],
        # ``run_eval`` rejects any value other than 50 before calling.
        episodes: int,
        *,
        # A fresh copy of ``_FROZEN_SAMPLING_POLICY``.
        sampling: dict[str, Any],
        # ``compute_episode_seed(id)`` per episode, in ``episode_ids`` order.
        seeds: Sequence[int],
        # ``row.episode_id`` of the first 50 briefs, in input order.
        episode_ids: Sequence[str],
    ) -> EvalReport: ...


# ---------------------------------------------------------------------------
# Statistical helpers — evaluation.md §2.4, §3.3
# ---------------------------------------------------------------------------


def bootstrap_ci(
    samples: tuple[float, ...],
    n_boot: int = DEFAULT_N_BOOT,
    alpha: float = 0.05,
    rng_seed: int = DEFAULT_BOOTSTRAP_SEED,
) -> tuple[float, float, float]:
    """Non-parametric percentile bootstrap ``(1 - alpha)`` CI on the mean.

    With the default ``alpha=0.05`` this is the 95% interval.

    evaluation.md §2.4 contract:
      - ``len(samples) == 0`` → ``(nan, nan, nan)``.
      - ``len(samples) == 1`` → ``(v, v, v)``.
      - All-identical samples → ``(v, v, v)`` (no resample variance).

    Args:
        samples: Observations whose mean is being estimated.
        n_boot: Number of bootstrap resamples; each has ``len(samples)`` draws
            with replacement.
        alpha: Total tail mass; the interval spans the ``alpha / 2`` and
            ``1 - alpha / 2`` quantiles of the resampled means.
        rng_seed: Seed for ``numpy.random.default_rng``; a fixed seed makes
            the interval reproducible.

    Returns:
        ``(mean, lo, hi)`` where ``mean`` is the plain sample mean.

    Raises:
        ValueError: If resampling is needed and ``n_boot < 1`` or ``alpha``
            is outside ``[0, 1]``. The degenerate cases above return before
            these arguments are looked at, exactly as before.
    """
    if not samples:
        nan = float("nan")
        return nan, nan, nan
    n = len(samples)
    mean = sum(samples) / n
    if n == 1:
        return mean, mean, mean
    if all(s == samples[0] for s in samples):
        return mean, mean, mean

    # Without these checks an empty resample (n_boot < 1) fails or yields NaN
    # deep inside NumPy, and alpha in (1, 2] silently returns lo > hi.
    if n_boot < 1:
        raise ValueError(f"n_boot must be >= 1, got {n_boot}")
    if not 0.0 <= alpha <= 1.0:  # also rejects NaN
        raise ValueError(f"alpha must lie in [0, 1], got {alpha}")

    # Lazy import to keep this module importable on minimal CI containers.
    import numpy as np

    rng = np.random.default_rng(rng_seed)
    arr = np.asarray(samples, dtype=np.float64)
    # One row of n indices per resample; fancy indexing builds all resamples
    # at once so the per-resample means are a single vectorised reduction.
    idx = rng.integers(0, n, size=(n_boot, n))
    means = arr[idx].mean(axis=1)
    lo = float(np.percentile(means, 100.0 * (alpha / 2.0)))
    hi = float(np.percentile(means, 100.0 * (1.0 - alpha / 2.0)))
    return float(mean), lo, hi


# ---------------------------------------------------------------------------
# Episode selection helpers — evaluation.md §3.1
# ---------------------------------------------------------------------------


def compute_episode_seed(episode_id: str) -> int:
    """Derive the 32-bit env seed for one eval episode.

    Implements ``hash((episode_id, "eval")) & 0xFFFFFFFF``; the same formula
    is re-asserted at every call site, so it is kept verbatim here.

    NOTE(review): the built-in ``hash`` of a ``str`` is salted per interpreter
    process unless ``PYTHONHASHSEED`` is fixed, so the seed is only
    reproducible across separate runs when that variable is pinned — confirm
    the eval launcher does so.
    """
    salted_key = (episode_id, "eval")
    low_32_bits = 0xFFFFFFFF
    return hash(salted_key) & low_32_bits


def _validate_briefs_first_50(briefs: Sequence[Any]) -> tuple[Any, ...]:
    """Return the leading 50 rows of ``briefs`` as a tuple, order preserved.

    Raises:
        EvaluationError: If ``briefs`` holds fewer than 50 rows.
    """
    available = len(briefs)
    if available >= 50:
        return tuple(briefs[:50])
    raise EvaluationError(
        f"val/briefs.jsonl must have >= 50 rows for paired eval, got {available}",
    )


def _check_catalogue_hashes(briefs: Sequence[Any], current_hashes: dict[str, str]) -> None:
    """Verify that every row's declared hashes agree with the loaded ones.

    For each row, three attributes are compared with their counterpart in
    ``current_hashes``: ``catalogue_hash`` ↔ ``"drifts"``,
    ``templates_sha256`` ↔ ``"templates"`` and ``i18n_sha256`` ↔ ``"i18n"``.
    A pair is skipped when the row lacks the attribute (or it is ``None``) or
    when ``current_hashes`` has no value for the key.

    Raises:
        CatalogueHashMismatchError: On the first pair whose two hashes differ
            (evaluation.md §3.1).
    """
    # (row attribute, key into ``current_hashes``) pairs to compare.
    hash_fields = (
        ("catalogue_hash", "drifts"),
        ("templates_sha256", "templates"),
        ("i18n_sha256", "i18n"),
    )
    for brief in briefs:
        for attr, key in hash_fields:
            declared = getattr(brief, attr, None)
            current = current_hashes.get(key)
            # Only a hash present on both sides can be compared.
            comparable = declared is not None and current is not None
            if comparable and declared != current:
                raise CatalogueHashMismatchError(
                    f"BriefRow.{attr}={declared!r} but loaded {key} hashes to {current!r}",
                )


# ---------------------------------------------------------------------------
# Sampling-policy guard — evaluation.md §3.2
# ---------------------------------------------------------------------------


# Sampling settings handed to the training-eval delegate on every run.
# ``_frozen_sampling_kwargs`` returns a copy of this dict, and ``run_eval``
# passes that copy as ``sampling=`` and records it in the report breakdown.
_FROZEN_SAMPLING_POLICY: dict[str, Any] = {
    # Greedy decoding with a single generation, per the module docstring.
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": 1,
    "num_generations": 1,
    "repetition_penalty": 1.0,
    # Flags only: nothing in this module touches the model. Presumably the
    # delegate enforces ``model.eval()`` / ``torch.no_grad()`` / dropout-off
    # from them — TODO confirm.
    "model_eval": True,
    "no_grad": True,
    "dropout_off": True,
}


def _frozen_sampling_kwargs() -> dict[str, Any]:
    """Return a fresh shallow copy of ``_FROZEN_SAMPLING_POLICY``.

    Each call builds a new dict, so a caller that stores or mutates the
    result does not alter the module-level policy.
    """
    return {**_FROZEN_SAMPLING_POLICY}


# ---------------------------------------------------------------------------
# Episode-set / leakage helpers — evaluation.md §3.1
# ---------------------------------------------------------------------------


def _episode_ids_from_breakdown(report: EvalReport) -> tuple[str, ...]:
    """Read the ``"episode_ids"`` entry of ``report.breakdown`` as a tuple.

    Returns an empty tuple when the key is absent.
    """
    return tuple(report.breakdown.get("episode_ids", ()))


# ---------------------------------------------------------------------------
# Core entry point — evaluation.md §2.1 ``run_eval``
# ---------------------------------------------------------------------------


def run_eval(
    model_path: Path | Literal["base"],
    episodes: int = 50,
    *,
    training_eval: TrainingEvalCallable,
    briefs: Sequence[Any],
    catalogue_hashes: dict[str, str] | None = None,
    budget_seconds: int = BUDGET_RUN_EVAL_SECONDS,
    monotonic: Callable[[], float] | None = None,
) -> EvalReport:
    """Thin wrapper over ``training.train.eval`` (evaluation.md §2.1).

    Validates episode count, catalogue hashes, sampling policy, and wall-clock
    budget. Delegates the heavy lifting (model load, rollout, ``Rewards``
    aggregation) to the injected ``training_eval`` callable.

    Args:
        model_path: Forwarded unchanged to ``training_eval``.
        episodes: Must be exactly 50 (paired-comparison contract).
        training_eval: Delegate that runs the evaluation and returns an
            ``EvalReport``.
        briefs: Candidate rows; the first 50, in the given order, are used.
            Each must expose ``episode_id``.
        catalogue_hashes: Loaded library hashes to compare against each row's
            declared hashes; ``None`` skips the check.
        budget_seconds: Maximum allowed wall-clock time of the delegate call.
        monotonic: Clock to measure elapsed time with; defaults to
            ``time.monotonic``. Injectable so tests can fake time.

    Returns:
        A copy of the delegate's report whose ``breakdown`` additionally has
        ``"episode_ids"``, ``"wall_clock_seconds"`` and ``"sampling_policy"``
        (each only if the delegate did not set it), plus
        ``"ci_undefined_rewards"`` for a zero-R1 baseline.

    Raises:
        EvaluationError: ``episodes != 50`` or fewer than 50 briefs; also any
            ``EvaluationError`` (including ``EvalModelLoadError``) raised by
            the delegate, which propagates unchanged.
        CatalogueHashMismatchError: A row's declared hash differs from
            ``catalogue_hashes``.
        EvalBudgetExceededError: The delegate call took longer than
            ``budget_seconds``. The check runs after the delegate returns, so
            it reports an overrun but does not interrupt one.

    Warns:
        ZeroSuccessBaselineWarning: The report is for ``"base"`` and its R1
            mean is 0.0, so the R1 confidence interval is degenerate.
    """
    # Function-scope imports, matching how this function already pulled in
    # ``dataclasses.replace``.
    import warnings
    from dataclasses import replace as _replace

    if episodes != 50:
        raise EvaluationError(
            f"run_eval expects episodes=50 (paired-comparison contract); got {episodes}",
        )

    selected = _validate_briefs_first_50(briefs)
    if catalogue_hashes is not None:
        _check_catalogue_hashes(selected, catalogue_hashes)

    episode_ids = tuple(row.episode_id for row in selected)
    seeds = tuple(compute_episode_seed(ep_id) for ep_id in episode_ids)

    clock = monotonic if monotonic is not None else time.monotonic
    started = clock()

    # Delegate errors propagate as-is; no wrapping is needed or wanted.
    report = training_eval(
        model_path,
        episodes,
        sampling=_frozen_sampling_kwargs(),
        seeds=seeds,
        episode_ids=episode_ids,
    )

    elapsed = clock() - started
    if elapsed > budget_seconds:
        raise EvalBudgetExceededError(
            f"run_eval wall-clock {elapsed:.1f}s exceeded {budget_seconds}s "
            f"({budget_seconds // 60} min ceiling)",
        )

    # Stamp episode_ids + wall-clock into breakdown for downstream leak guards.
    # ``setdefault`` keeps any value the delegate already recorded.
    breakdown = dict(report.breakdown)
    breakdown.setdefault("episode_ids", episode_ids)
    breakdown.setdefault("wall_clock_seconds", round(elapsed, 3))
    breakdown.setdefault("sampling_policy", _frozen_sampling_kwargs())

    # Detect zero-success-baseline degeneracy (§7.1) — warn, do not raise.
    r1_mean = report.r1_mean_ci[0]
    if math.isclose(r1_mean, 0.0, abs_tol=1e-12) and report.model_path == "base":
        breakdown["ci_undefined_rewards"] = ["r1"]
        warnings.warn(
            "baseline R1 mean is 0.0; the R1 confidence interval is degenerate",
            ZeroSuccessBaselineWarning,
            stacklevel=2,
        )

    # The report is frozen, so return a copy carrying the enriched breakdown.
    return _replace(report, breakdown=breakdown)


def eval_baseline(
    model_path: Path | Literal["base"] = "base",
    episodes: int = 50,
    *,
    training_eval: TrainingEvalCallable,
    briefs: Sequence[Any],
    catalogue_hashes: dict[str, str] | None = None,
    budget_seconds: int = BUDGET_RUN_EVAL_SECONDS,
    monotonic: Callable[[], float] | None = None,
) -> EvalReport:
    """Run the baseline evaluation (evaluation.md §2.2 ``eval_baseline.py``).

    Identical to ``run_eval`` except that ``model_path`` defaults to
    ``"base"``, which pins the untrained-model contract. Every argument is
    passed through unchanged, and the result and any exception come straight
    from ``run_eval``.
    """
    baseline_report = run_eval(
        model_path,
        episodes,
        training_eval=training_eval,
        briefs=briefs,
        catalogue_hashes=catalogue_hashes,
        budget_seconds=budget_seconds,
        monotonic=monotonic,
    )
    return baseline_report