| | """Core abstractions for external benchmark adapters.""" |
| |
|
| | from __future__ import annotations |
| |
|
| | import dataclasses |
| | import logging |
| | import time |
| | from abc import ABC, abstractmethod |
| | from typing import Any, Dict, Optional |
| |
|
| | from bench.external.constants import ZERO_FLOAT |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
@dataclasses.dataclass
class BenchmarkResult:
    """Outcome of one external benchmark run.

    Parameters
    ----------
    benchmark_name : str
        Machine-readable identifier of the benchmark.
    scores : dict
        Mapping from metric name to its float value.
    primary_metric : str
        Key into *scores* selecting the single headline number.
    metadata : dict
        Free-form extra information (dataset version, sample count, etc.).
    raw_outputs : list
        Per-sample outputs kept for debugging / qualitative review.
    elapsed_seconds : float
        Wall-clock duration of the benchmark run.
    error : str or None
        Description of the failure if the run failed, otherwise ``None``.
    """

    benchmark_name: str
    # Mutable containers use ``default_factory`` so instances never share state.
    scores: Dict[str, float] = dataclasses.field(default_factory=dict)
    primary_metric: str = ""
    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    raw_outputs: list = dataclasses.field(default_factory=list)
    elapsed_seconds: float = ZERO_FLOAT
    error: Optional[str] = None

    @property
    def primary_score(self) -> Optional[float]:
        """Headline metric value.

        Returns
        -------
        float or None
            ``scores[primary_metric]`` when the run succeeded; ``None`` when
            *error* is set or the primary metric is missing from *scores*.
        """
        failed = self.error is not None
        return None if failed else self.scores.get(self.primary_metric)
| |
|
| |
|
class BenchmarkAdapter(ABC):
    """Abstract base class for external benchmark integrations.

    Subclasses supply ``name``, ``display_name`` and ``run``.  Callers that
    must not be interrupted by a failing benchmark use ``run_safe``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Machine-readable benchmark name."""

    @property
    @abstractmethod
    def display_name(self) -> str:
        """Human-readable benchmark name."""

    @abstractmethod
    def run(self, model_handle: Any) -> BenchmarkResult:
        """Execute the benchmark and return results.

        Parameters
        ----------
        model_handle : ModelHandle
            Unified model interface for generation.

        Returns
        -------
        BenchmarkResult
        """

    @staticmethod
    def _describe_error(exc: BaseException) -> str:
        """Return a non-empty, human-readable description of *exc*.

        ``str(exc)`` is empty for exceptions raised without a message
        (e.g. ``AssertionError()``); in that case fall back to ``repr(exc)``
        so the description still names the exception type and is truthy.
        """
        return str(exc) or repr(exc)

    def run_safe(self, model_handle: Any) -> BenchmarkResult:
        """Execute the benchmark, catching any exception.

        Parameters
        ----------
        model_handle : ModelHandle
            Passed through unchanged to ``run``.

        Returns
        -------
        BenchmarkResult
            The result of ``run`` with *elapsed_seconds* overwritten by the
            measured wall-clock time, or, on failure, a result whose *error*
            field holds a non-empty description of the exception so that the
            overall pipeline never crashes.
        """
        start = time.monotonic()
        try:
            result = self.run(model_handle)
            # Kept inside the ``try`` so that a ``run`` returning something
            # without this attribute (e.g. ``None``) is reported as a failure
            # rather than propagating out of ``run_safe``.
            result.elapsed_seconds = time.monotonic() - start
            return result
        except Exception as exc:
            elapsed = time.monotonic() - start
            logger.exception("Benchmark %s failed", self.name)
            return BenchmarkResult(
                benchmark_name=self.name,
                error=self._describe_error(exc),
                elapsed_seconds=elapsed,
            )
| |
|