jtowarek's picture
Upload folder using huggingface_hub
ba4ecd0 verified
"""Core abstractions for external benchmark adapters."""
from __future__ import annotations
import dataclasses
import logging
import time
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional
from bench.external.constants import ZERO_FLOAT
logger = logging.getLogger(__name__)
@dataclasses.dataclass
class BenchmarkResult:
    """Outcome of one external benchmark run.

    Parameters
    ----------
    benchmark_name : str
        Machine-readable identifier of the benchmark.
    scores : dict
        Maps each metric name to its float value.
    primary_metric : str
        Key in *scores* that selects the single headline number.
    metadata : dict
        Free-form extra info (dataset version, sample count, etc.).
    raw_outputs : list
        Per-sample outputs kept for debugging / qualitative review.
    elapsed_seconds : float
        Wall-clock time taken by the benchmark run.
    error : str or None
        Description of the failure if the run failed, otherwise ``None``.
    """

    benchmark_name: str
    scores: Dict[str, float] = dataclasses.field(default_factory=dict)
    primary_metric: str = ""
    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    raw_outputs: list = dataclasses.field(default_factory=list)
    elapsed_seconds: float = ZERO_FLOAT
    error: Optional[str] = None

    @property
    def primary_score(self) -> Optional[float]:
        """Headline score looked up via ``primary_metric``.

        Yields ``None`` when ``error`` is set, and also when
        ``primary_metric`` is not a key of ``scores``.
        """
        run_failed = self.error is not None
        return None if run_failed else self.scores.get(self.primary_metric)
class BenchmarkAdapter(ABC):
    """Abstract base class for external benchmark integrations.

    Subclasses must provide ``name``, ``display_name`` and ``run``.
    ``run_safe`` wraps ``run`` so that a failing benchmark is reported as a
    result object instead of propagating an exception.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Machine-readable benchmark name."""

    @property
    @abstractmethod
    def display_name(self) -> str:
        """Human-readable benchmark name."""

    @abstractmethod
    def run(self, model_handle: Any) -> BenchmarkResult:
        """Execute the benchmark and return results.

        Parameters
        ----------
        model_handle : ModelHandle
            Unified model interface for generation.

        Returns
        -------
        BenchmarkResult
        """

    def run_safe(self, model_handle: Any) -> BenchmarkResult:
        """Execute the benchmark, catching any exception.

        Parameters
        ----------
        model_handle : ModelHandle
            Passed through unchanged to ``run``.

        Returns
        -------
        BenchmarkResult
            On success, the object returned by ``run`` with its
            ``elapsed_seconds`` overwritten by the time measured here.
            On failure, a new result carrying only ``benchmark_name``,
            ``elapsed_seconds`` and a non-empty ``error`` description, so
            that the overall pipeline never crashes.
        """
        start = time.monotonic()
        try:
            result = self.run(model_handle)
            # Kept inside the ``try`` on purpose: if ``run`` returns something
            # that cannot take the attribute (e.g. ``None``), that is also
            # reported as a failed run rather than raised.
            result.elapsed_seconds = time.monotonic() - start
            return result
        except Exception as exc:  # noqa: BLE001
            elapsed = time.monotonic() - start
            logger.exception("Benchmark %s failed", self.name)
            # ``str(exc)`` is "" for exceptions raised without a message
            # (bare ``assert``, ``raise ValueError()``); fall back to the repr
            # so a failed run never carries an empty error description.
            description = str(exc) or repr(exc)
            return BenchmarkResult(
                benchmark_name=self.name,
                error=description,
                elapsed_seconds=elapsed,
            )