"""Core abstractions for external benchmark adapters.""" from __future__ import annotations import dataclasses import logging import time from abc import ABC, abstractmethod from typing import Any, Dict, Optional from bench.external.constants import ZERO_FLOAT logger = logging.getLogger(__name__) @dataclasses.dataclass class BenchmarkResult: """Result from running a single external benchmark. Parameters ---------- benchmark_name : str Machine-readable benchmark identifier. scores : dict Metric name to float value mapping. primary_metric : str Key into *scores* for the single headline number. metadata : dict Arbitrary extra info (dataset version, sample count, etc.). raw_outputs : list Per-sample outputs for debugging / qualitative review. elapsed_seconds : float Wall-clock time for the benchmark run. error : str or None If the run failed, a description of the error. """ benchmark_name: str scores: Dict[str, float] = dataclasses.field(default_factory=dict) primary_metric: str = "" metadata: Dict[str, Any] = dataclasses.field(default_factory=dict) raw_outputs: list = dataclasses.field(default_factory=list) elapsed_seconds: float = ZERO_FLOAT error: Optional[str] = None @property def primary_score(self) -> Optional[float]: """Return the primary metric value, or ``None`` on error.""" if self.error is not None: return None return self.scores.get(self.primary_metric) class BenchmarkAdapter(ABC): """Abstract base class for external benchmark integrations.""" @property @abstractmethod def name(self) -> str: """Machine-readable benchmark name.""" @property @abstractmethod def display_name(self) -> str: """Human-readable benchmark name.""" @abstractmethod def run(self, model_handle: Any) -> BenchmarkResult: """Execute the benchmark and return results. Parameters ---------- model_handle : ModelHandle Unified model interface for generation. Returns ------- BenchmarkResult """ def run_safe(self, model_handle: Any) -> BenchmarkResult: """Execute the benchmark, catching any exception. Returns a ``BenchmarkResult`` with the *error* field populated on failure so that the overall pipeline never crashes. """ start = time.monotonic() try: result = self.run(model_handle) result.elapsed_seconds = time.monotonic() - start return result except Exception as exc: # noqa: BLE001 elapsed = time.monotonic() - start logger.exception("Benchmark %s failed", self.name) return BenchmarkResult( benchmark_name=self.name, error=str(exc), elapsed_seconds=elapsed, )