| |
| |
| |
| |
| |
|
|
| """Inspect AI harness integration for OpenEnv. |
| |
| Requires the ``inspect-ai`` package: ``pip install 'inspect-ai>=0.3.0'`` |
| """ |
|
|
| from __future__ import annotations |
|
|
| from typing import Any, Dict, Optional |
|
|
| from openenv.core.evals.base import EvalHarness |
|
|
|
|
class InspectAIHarness(EvalHarness):
    """Evaluation harness wrapping Inspect AI's ``eval()`` function.

    All ``inspect_ai`` imports are deferred to :meth:`run` so this class is
    importable without inspect-ai installed. An ``ImportError`` with a clear
    message is raised at call time if the dependency is missing.

    Args:
        log_dir: Directory for evaluation log output. Defaults to None, in
            which case no ``log_dir`` argument is forwarded to ``eval()``.

    ``eval_parameters`` keys accepted by :meth:`run`:

    * ``model`` (str, *required*): Model string, e.g. ``"openai/gpt-4o"``.
    * ``task`` (str | None, defaults to the ``dataset`` argument): Task file
      path or task string. A missing key and an explicit ``None`` both fall
      back to ``dataset``.
    * ``task_args`` (dict, default ``{}``): Arguments to pass to the task.
    * ``max_samples`` (int | None, default None): Limit samples per task.
    * ``temperature`` (float | None, default None): Model generation
      temperature.
    * ``max_tokens`` (int | None, default None): Max generation tokens.
    * ``epochs`` (int | None, default None): Number of evaluation epochs.
    * ``solver`` (list | None, default None): Solver pipeline override.
    * ``scorer`` (list | None, default None): Scorer override.
    * ``model_args`` (dict, default ``{}``): Provider-specific model kwargs.

    Empty dicts and ``None`` values are not forwarded to ``eval()``.

    NOTE(review): Inspect AI appears to document ``max_samples`` as a
    concurrency cap (with ``limit`` being the sample-count limit), and
    ``scorer`` may not be a named ``eval()`` parameter -- confirm both
    against the installed inspect-ai version.
    """

    # Dict-valued options: forwarded only when non-empty.
    _DICT_KEYS = ("task_args", "model_args")

    # Remaining options: forwarded verbatim whenever they are not None.
    _OPTIONAL_KEYS = (
        "max_samples",
        "temperature",
        "max_tokens",
        "epochs",
        "solver",
        "scorer",
    )

    def __init__(
        self,
        *,
        log_dir: Optional[str] = None,
    ):
        self.log_dir = log_dir

    def run(
        self,
        harness_version: str,
        library_versions: Dict[str, str],
        dataset: str,
        eval_parameters: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Run an Inspect AI evaluation.

        Args:
            harness_version: Version of inspect-ai being used. Not read by
                this implementation.
            library_versions: Versions of supporting libraries. Not read by
                this implementation.
            dataset: Default task string (used when ``task`` is missing from
                *eval_parameters* or is ``None``).
            eval_parameters: See class docstring for accepted keys.

        Returns:
            Dictionary mapping metric names to scores, taken from the first
            log returned by ``eval()``.

        Raises:
            ImportError: If ``inspect-ai`` is not installed.
            ValueError: If ``model`` is missing from *eval_parameters*.
            RuntimeError: If no logs are returned or the evaluation fails
                (log status is not "success").
        """
        try:
            from inspect_ai import eval as inspect_eval
        except ImportError as err:
            # Chain the original error so a broken transitive import inside
            # inspect-ai is still visible in the traceback.
            raise ImportError(
                "inspect-ai is required for InspectAIHarness. "
                "Install it with: pip install 'inspect-ai>=0.3.0'"
            ) from err

        model = eval_parameters.get("model")
        if model is None:
            raise ValueError(
                "eval_parameters must include 'model' "
                "(e.g. 'openai/gpt-4o', 'hf/meta-llama/...')."
            )

        # An explicit ``"task": None`` must behave like an absent key;
        # ``dict.get(key, default)`` alone would pass None through.
        task = eval_parameters.get("task")
        if task is None:
            task = dataset

        eval_kwargs = self._build_eval_kwargs(eval_parameters)
        logs = inspect_eval(task, model=model, **eval_kwargs)

        if not logs:
            raise RuntimeError(
                "Inspect AI evaluation returned no logs. "
                "Check that the task and model arguments are valid."
            )

        # Only the first log is inspected; a single task is submitted above.
        log = logs[0]
        if log.status != "success":
            raise RuntimeError(
                f"Inspect AI evaluation failed with status: {log.status}"
            )

        return self._extract_scores(log)

    def _build_eval_kwargs(self, eval_parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Collect the optional keyword arguments forwarded to ``eval()``.

        Args:
            eval_parameters: Caller-supplied parameter mapping.

        Returns:
            A new dict containing only the options that were actually set:
            non-empty ``task_args`` / ``model_args``, every non-None optional
            key, and ``log_dir`` when the harness was constructed with one.
        """
        eval_kwargs: Dict[str, Any] = {}

        for key in self._DICT_KEYS:
            value = eval_parameters.get(key, {})
            if value:
                eval_kwargs[key] = value

        for key in self._OPTIONAL_KEYS:
            value = eval_parameters.get(key)
            if value is not None:
                eval_kwargs[key] = value

        if self.log_dir is not None:
            eval_kwargs["log_dir"] = self.log_dir

        return eval_kwargs

    def _extract_scores(self, log: Any) -> Dict[str, Any]:
        """Parse an EvalLog's results into a flat score dictionary.

        Iterates over ``log.results.scores`` and flattens each entry's
        ``metrics`` mapping into a single output dict keyed by metric name.

        Args:
            log: An ``inspect_ai`` ``EvalLog`` object.

        Returns:
            Dictionary mapping metric names to their values. Empty when
            ``log.results`` is None.
        """
        scores: Dict[str, Any] = {}
        if log.results is None:
            return scores

        # NOTE(review): metric names are not namespaced by scorer, so when
        # two scorers report the same metric name the later one overwrites
        # the earlier -- confirm this is acceptable for multi-scorer tasks.
        for eval_score in log.results.scores:
            for metric_name, metric in eval_score.metrics.items():
                scores[metric_name] = metric.value

        return scores
|
|