"""Cell 18 — Baseline evaluation harness. Implements ``docs/modules/evaluation.md`` §1, §2, §3.1–§3.3, §3.8, §4 and §5 for the baseline (untrained Gemma 3n E2B) eval path. Hard rules (evaluation.md §3.1, §3.2, §6.3): - Greedy decoding (``temperature=0.0``); ``num_generations=1``; ``model.eval()`` + ``torch.no_grad()`` semantics asserted at entry. - Per-episode env seed = ``hash((episode_id, "eval")) & 0xFFFFFFFF``. - 50 held-out val episodes (rows ``[0:50]`` of ``val/briefs.jsonl``) — file order, no shuffling. - Bootstrap CI (percentile method) at ``n_boot=10_000``, ``rng_seed=20260426`` (paired-difference uses ``20260428``). - No LLM-as-judge; static AST scan via ``_NO_LLM_JUDGE_FORBIDDEN_IMPORTS``. - Wall-clock ceiling 20 minutes (``EvalBudgetExceededError`` on overrun). This module deliberately does **not** import ``torch`` at module load. The training-eval delegate is injected via ``run_eval_baseline(..., training_eval=...)`` so unit tests can stub model inference (CUDA-free CI per training_tests.md §5.3). """ from __future__ import annotations import math import time from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Literal, Protocol if TYPE_CHECKING: # pragma: no cover - typing only from collections.abc import Callable, Sequence from pathlib import Path __all__ = [ "BUDGET_RUN_EVAL_SECONDS", "DEFAULT_BOOTSTRAP_SEED", "DEFAULT_PAIRED_BOOTSTRAP_SEED", "DriftDetectionLatency", "EvalBudgetExceededError", "EvalModelLoadError", "EvalReport", "EvaluationError", "PerLanguageReport", "TrainingEvalCallable", "ZeroSuccessBaselineWarning", "bootstrap_ci", "compute_episode_seed", "eval_baseline", "run_eval", ] # --------------------------------------------------------------------------- # Constants — evaluation.md §2.4, §3.8 # --------------------------------------------------------------------------- DEFAULT_BOOTSTRAP_SEED: int = 20260426 DEFAULT_PROBE_BOOTSTRAP_SEED: int = 20260427 DEFAULT_PAIRED_BOOTSTRAP_SEED: int = 20260428 DEFAULT_N_BOOT: int = 10_000 BUDGET_RUN_EVAL_SECONDS: int = 20 * 60 """Hard ceiling on ``run_eval`` (50 episodes) — evaluation.md §3.8.""" # Forbidden imports inside any evaluation/scoring path (evaluation.md §6.3). _NO_LLM_JUDGE_FORBIDDEN_IMPORTS: frozenset[str] = frozenset( {"openai", "anthropic", "vertexai", "google.generativeai", "cohere"}, ) _LANGUAGE_CODES: tuple[str, ...] = ("hi", "ta", "kn", "en", "hinglish") # --------------------------------------------------------------------------- # Errors / warnings — evaluation.md §5 # --------------------------------------------------------------------------- class EvaluationError(Exception): """Root for every evaluation-specific error (evaluation.md §5).""" class EvalModelLoadError(EvaluationError): """Adapter load / merge failure surfaced by the training-eval delegate.""" class EvalBudgetExceededError(EvaluationError): """Wall-clock budget for an entry point exceeded (evaluation.md §3.8, §5).""" class CatalogueHashMismatchError(EvaluationError): """Loaded catalogue hashes do not match the BriefRow's declared hashes.""" class ZeroSuccessBaselineWarning(UserWarning): """All 50 baseline R1 == 0.0 → degenerate CI; warn rather than raise.""" # --------------------------------------------------------------------------- # EvalReport family — re-exported for downstream cells (evaluation.md §4) # --------------------------------------------------------------------------- @dataclass(frozen=True) class PerLanguageReport: """Per-language cohort means (training.md §4.2).""" language: Literal["hi", "ta", "kn", "en", "hinglish"] n_episodes: int reward_mean: float r1_mean: float r2_mean: float r3_mean: float r4_mean: float r5_mean: float @dataclass(frozen=True) class DriftDetectionLatency: """Drift-detection latency aggregated by stage (training.md §4.2).""" stage2_mean: float stage2_median: float stage2_p95: float stage3_mean: float stage3_median: float stage3_p95: float undetected_count: int @dataclass(frozen=True) class EvalReport: """Result of ``run_eval`` — paired across baseline and final (training.md §4.2).""" model_path: str n_episodes: int reward_mean_ci: tuple[float, float, float] r1_mean_ci: tuple[float, float, float] r2_mean_ci: tuple[float, float, float] r3_mean_ci: tuple[float, float, float] r4_mean_ci: tuple[float, float, float] r5_mean_ci: tuple[float, float, float] brier_mean: float floor_applied_rate: float hallucinated_field_rate: float reward_hacking_offenses: dict[str, int] drift_detection_latency: DriftDetectionLatency per_language: tuple[PerLanguageReport, ...] curves: dict[str, tuple[tuple[int, float], ...]] = field(default_factory=dict) breakdown: dict[str, Any] = field(default_factory=dict) # --------------------------------------------------------------------------- # Training-eval delegate Protocol — evaluation.md §6.1 # --------------------------------------------------------------------------- class TrainingEvalCallable(Protocol): """Signature of ``training.train.eval`` — the heavy-lifting delegate.""" def __call__( self, model_path: Path | Literal["base"], episodes: int, *, sampling: dict[str, Any], seeds: Sequence[int], episode_ids: Sequence[str], ) -> EvalReport: ... # --------------------------------------------------------------------------- # Statistical helpers — evaluation.md §2.4, §3.3 # --------------------------------------------------------------------------- def bootstrap_ci( samples: tuple[float, ...], n_boot: int = DEFAULT_N_BOOT, alpha: float = 0.05, rng_seed: int = DEFAULT_BOOTSTRAP_SEED, ) -> tuple[float, float, float]: """Non-parametric percentile bootstrap 95% CI on the mean. evaluation.md §2.4 contract: - ``len(samples) == 0`` → ``(nan, nan, nan)``. - ``len(samples) == 1`` → ``(v, v, v)``. - All-identical samples → ``(v, v, v)`` (no resample variance). """ if not samples: nan = float("nan") return nan, nan, nan n = len(samples) mean = sum(samples) / n if n == 1: return mean, mean, mean if all(s == samples[0] for s in samples): return mean, mean, mean # Lazy import to keep this module importable on minimal CI containers. import numpy as np rng = np.random.default_rng(rng_seed) arr = np.asarray(samples, dtype=np.float64) idx = rng.integers(0, n, size=(n_boot, n)) means = arr[idx].mean(axis=1) lo = float(np.percentile(means, 100.0 * (alpha / 2.0))) hi = float(np.percentile(means, 100.0 * (1.0 - alpha / 2.0))) return float(mean), lo, hi # --------------------------------------------------------------------------- # Episode selection helpers — evaluation.md §3.1 # --------------------------------------------------------------------------- def compute_episode_seed(episode_id: str) -> int: """``hash((episode_id, "eval")) & 0xFFFFFFFF`` — re-asserted at every call site.""" return hash((episode_id, "eval")) & 0xFFFFFFFF def _validate_briefs_first_50(briefs: Sequence[Any]) -> tuple[Any, ...]: """Take the first 50 BriefRows in file order; raise on too few.""" if len(briefs) < 50: raise EvaluationError( f"val/briefs.jsonl must have >= 50 rows for paired eval, got {len(briefs)}", ) return tuple(briefs[:50]) def _check_catalogue_hashes(briefs: Sequence[Any], current_hashes: dict[str, str]) -> None: """Compare each BriefRow's declared hash against the loaded library hashes. evaluation.md §3.1: any mismatch → ``CatalogueHashMismatchError``. """ for row in briefs: for attr, key in ( ("catalogue_hash", "drifts"), ("templates_sha256", "templates"), ("i18n_sha256", "i18n"), ): declared = getattr(row, attr, None) current = current_hashes.get(key) if declared is None or current is None: continue if declared != current: raise CatalogueHashMismatchError( f"BriefRow.{attr}={declared!r} but loaded {key} hashes to {current!r}", ) # --------------------------------------------------------------------------- # Sampling-policy guard — evaluation.md §3.2 # --------------------------------------------------------------------------- _FROZEN_SAMPLING_POLICY: dict[str, Any] = { "temperature": 0.0, "top_p": 1.0, "top_k": 1, "num_generations": 1, "repetition_penalty": 1.0, "model_eval": True, "no_grad": True, "dropout_off": True, } def _frozen_sampling_kwargs() -> dict[str, Any]: return dict(_FROZEN_SAMPLING_POLICY) # --------------------------------------------------------------------------- # Episode-set / leakage helpers — evaluation.md §3.1 # --------------------------------------------------------------------------- def _episode_ids_from_breakdown(report: EvalReport) -> tuple[str, ...]: ids = report.breakdown.get("episode_ids", ()) return tuple(ids) # --------------------------------------------------------------------------- # Core entry point — evaluation.md §2.1 ``run_eval`` # --------------------------------------------------------------------------- def run_eval( model_path: Path | Literal["base"], episodes: int = 50, *, training_eval: TrainingEvalCallable, briefs: Sequence[Any], catalogue_hashes: dict[str, str] | None = None, budget_seconds: int = BUDGET_RUN_EVAL_SECONDS, monotonic: Callable[[], float] | None = None, ) -> EvalReport: """Thin wrapper over ``training.train.eval`` (evaluation.md §2.1). Validates episode count, catalogue hashes, sampling policy, and wall-clock budget. Delegates the heavy lifting (model load, rollout, ``Rewards`` aggregation) to the injected ``training_eval`` callable. """ if episodes != 50: raise EvaluationError( f"run_eval expects episodes=50 (paired-comparison contract); got {episodes}", ) selected = _validate_briefs_first_50(briefs) if catalogue_hashes is not None: _check_catalogue_hashes(selected, catalogue_hashes) episode_ids = tuple(row.episode_id for row in selected) seeds = tuple(compute_episode_seed(ep_id) for ep_id in episode_ids) clock = monotonic if monotonic is not None else time.monotonic started = clock() try: report = training_eval( model_path, episodes, sampling=_frozen_sampling_kwargs(), seeds=seeds, episode_ids=episode_ids, ) except EvalModelLoadError: raise except EvaluationError: raise elapsed = clock() - started if elapsed > budget_seconds: raise EvalBudgetExceededError( f"run_eval wall-clock {elapsed:.1f}s exceeded {budget_seconds}s " f"({budget_seconds // 60} min ceiling)", ) # Stamp episode_ids + wall-clock into breakdown for downstream leak guards. breakdown = dict(report.breakdown) breakdown.setdefault("episode_ids", episode_ids) breakdown.setdefault("wall_clock_seconds", round(elapsed, 3)) breakdown.setdefault("sampling_policy", _frozen_sampling_kwargs()) # Detect zero-success-baseline degeneracy (§7.1) — warn, do not raise. r1_mean = report.r1_mean_ci[0] if math.isclose(r1_mean, 0.0, abs_tol=1e-12) and report.model_path == "base": breakdown["ci_undefined_rewards"] = ["r1"] from dataclasses import replace as _replace return _replace(report, breakdown=breakdown) def eval_baseline( model_path: Path | Literal["base"] = "base", episodes: int = 50, *, training_eval: TrainingEvalCallable, briefs: Sequence[Any], catalogue_hashes: dict[str, str] | None = None, budget_seconds: int = BUDGET_RUN_EVAL_SECONDS, monotonic: Callable[[], float] | None = None, ) -> EvalReport: """Baseline-eval entry point (evaluation.md §2.2 ``eval_baseline.py``). Defaults ``model_path='base'`` to lock in the untrained-model contract. """ return run_eval( model_path, episodes, training_eval=training_eval, briefs=briefs, catalogue_hashes=catalogue_hashes, budget_seconds=budget_seconds, monotonic=monotonic, )