Spaces:

openenv-community
/

KantBench-Dashboard

Running

App Files Files Community

KantBench-Dashboard / bench /external /_base.py

jtowarek

Upload folder using huggingface_hub

ba4ecd0 verified 3 days ago

raw

history blame contribute delete

2.98 kB

	"""Core abstractions for external benchmark adapters."""

	from __future__ import annotations

	import dataclasses
	import logging
	import time
	from abc import ABC, abstractmethod
	from typing import Any, Dict, Optional

	from bench.external.constants import ZERO_FLOAT

	# Module-level logger; run_safe uses it to record benchmark failures.
	logger = logging.getLogger(__name__)


	@dataclasses.dataclass
	class BenchmarkResult:
	    """Outcome of one external benchmark run.

	    Attributes
	    ----------
	    benchmark_name : str
	        Machine-readable identifier of the benchmark.
	    scores : dict
	        Maps each metric name to its float value.
	    primary_metric : str
	        Key in ``scores`` that selects the single headline number.
	    metadata : dict
	        Free-form extra information (dataset version, sample count, etc.).
	    raw_outputs : list
	        Per-sample outputs kept for debugging / qualitative review.
	    elapsed_seconds : float
	        Wall-clock duration of the run; defaults to ``ZERO_FLOAT``.
	    error : str or None
	        Description of the failure when the run did not succeed.
	    """

	    benchmark_name: str
	    scores: Dict[str, float] = dataclasses.field(default_factory=dict)
	    primary_metric: str = ""
	    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
	    raw_outputs: list = dataclasses.field(default_factory=list)
	    elapsed_seconds: float = ZERO_FLOAT
	    error: Optional[str] = None

	    @property
	    def primary_score(self) -> Optional[float]:
	        """Headline score looked up in ``scores`` via ``primary_metric``.

	        Yields ``None`` when ``error`` is set, and also when
	        ``primary_metric`` is not a key of ``scores``.
	        """
	        failed = self.error is not None
	        return None if failed else self.scores.get(self.primary_metric)


	class BenchmarkAdapter(ABC):
	    """Abstract base class for external benchmark integrations.

	    Subclasses supply ``name``, ``display_name`` and ``run``. ``run_safe``
	    wraps ``run`` so that an exception is reported through the returned
	    ``BenchmarkResult`` instead of propagating to the caller.
	    """

	    @property
	    @abstractmethod
	    def name(self) -> str:
	        """Machine-readable benchmark name."""

	    @property
	    @abstractmethod
	    def display_name(self) -> str:
	        """Human-readable benchmark name."""

	    @abstractmethod
	    def run(self, model_handle: Any) -> BenchmarkResult:
	        """Execute the benchmark and return results.

	        Parameters
	        ----------
	        model_handle : ModelHandle
	            Unified model interface for generation.

	        Returns
	        -------
	        BenchmarkResult
	        """

	    def run_safe(self, model_handle: Any) -> BenchmarkResult:
	        """Execute the benchmark, catching any exception.

	        Parameters
	        ----------
	        model_handle : ModelHandle
	            Passed through unchanged to ``run``.

	        Returns
	        -------
	        BenchmarkResult
	            On success, the object returned by ``run`` with
	            ``elapsed_seconds`` overwritten by the time measured here.
	            On failure, a new result carrying only ``benchmark_name``,
	            ``elapsed_seconds`` and a non-empty ``error`` description, so
	            that the overall pipeline never crashes.
	        """
	        start = time.monotonic()
	        try:
	            result = self.run(model_handle)
	            # Kept inside the ``try`` on purpose: if ``run`` returns an
	            # object that cannot take the attribute, that is reported as a
	            # benchmark failure as well.
	            result.elapsed_seconds = time.monotonic() - start
	            return result
	        except Exception as exc:  # noqa: BLE001
	            elapsed = time.monotonic() - start
	            # logger.exception records the traceback alongside the message.
	            logger.exception("Benchmark %s failed", self.name)
	            return BenchmarkResult(
	                benchmark_name=self.name,
	                # str(exc) is "" for exceptions raised without a message
	                # (e.g. a bare ``raise NotImplementedError``); fall back to
	                # repr so a failed run never carries an empty, falsy error.
	                error=str(exc) or repr(exc),
	                elapsed_seconds=elapsed,
	            )