Spaces:
Runtime error
Runtime error
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """Inspect AI harness integration for OpenEnv. | |
| Requires the ``inspect-ai`` package: ``pip install 'inspect-ai>=0.3.0'`` | |
| """ | |
| from __future__ import annotations | |
| from typing import Any, Dict, Optional | |
| from openenv.core.evals.base import EvalHarness | |
class InspectAIHarness(EvalHarness):
    """Evaluation harness wrapping Inspect AI's ``eval()`` function.

    All ``inspect_ai`` imports are deferred to :meth:`run` so this class is
    importable without inspect-ai installed. An ``ImportError`` with a clear
    message is raised at call time if the dependency is missing.

    Args:
        log_dir: Directory for evaluation log output. Defaults to None
            (Inspect AI writes logs to its default location).

    ``eval_parameters`` keys accepted by :meth:`run`:

    =============== =========== ================ ===================================
    Key             Type        Default          Purpose
    =============== =========== ================ ===================================
    ``model``       str         *required*       Model string, e.g. "openai/gpt-4o"
    ``task``        str|None    ``dataset`` arg  Task file path or task string
    ``task_args``   dict        ``{}``           Arguments to pass to the task
    ``max_samples`` int|None    None             Limit samples per task
    ``temperature`` float|None  None             Model generation temperature
    ``max_tokens``  int|None    None             Max generation tokens
    ``epochs``      int|None    None             Number of evaluation epochs
    ``solver``      list|None   None             Solver pipeline override
    ``scorer``      list|None   None             Scorer override
    ``model_args``  dict        ``{}``           Provider-specific model kwargs
    =============== =========== ================ ===================================
    """

    def __init__(
        self,
        *,
        log_dir: Optional[str] = None,
    ):
        # Where Inspect AI writes its eval logs; None keeps its default location.
        self.log_dir = log_dir

    def run(
        self,
        harness_version: str,
        library_versions: Dict[str, str],
        dataset: str,
        eval_parameters: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Run an Inspect AI evaluation.

        Args:
            harness_version: Version of inspect-ai being used.
            library_versions: Versions of supporting libraries.
            dataset: Default task string (used when ``task`` is not specified
                in *eval_parameters*).
            eval_parameters: See class docstring for accepted keys.

        Returns:
            Dictionary mapping metric names to scores.

        Raises:
            ImportError: If ``inspect-ai`` is not installed.
            ValueError: If ``model`` is missing from *eval_parameters*.
            RuntimeError: If the evaluation fails (log status is not "success").
        """
        try:
            from inspect_ai import eval as inspect_eval
        except ImportError as err:
            # Chain the original error so the underlying import failure
            # (e.g. a broken transitive dependency) stays in the traceback.
            raise ImportError(
                "inspect-ai is required for InspectAIHarness. "
                "Install it with: pip install 'inspect-ai>=0.3.0'"
            ) from err

        # Extract required model parameter
        model = eval_parameters.get("model")
        if model is None:
            raise ValueError(
                "eval_parameters must include 'model' "
                "(e.g. 'openai/gpt-4o', 'hf/meta-llama/...')."
            )

        # Task: explicit parameter, falling back to *dataset* when the key is
        # absent OR explicitly None (the documented default for ``task``).
        # A plain .get("task", dataset) would forward None to inspect_eval.
        task = eval_parameters.get("task")
        if task is None:
            task = dataset

        # Build eval kwargs, forwarding only options the caller actually set.
        eval_kwargs: Dict[str, Any] = {}

        # Dict-valued options: forward only when non-empty.
        for key in ("task_args", "model_args"):
            mapping = eval_parameters.get(key, {})
            if mapping:
                eval_kwargs[key] = mapping

        # Scalar and override options: forward only when explicitly set.
        for key in (
            "max_samples",
            "temperature",
            "max_tokens",
            "epochs",
            "solver",
            "scorer",
        ):
            value = eval_parameters.get(key)
            if value is not None:
                eval_kwargs[key] = value

        if self.log_dir is not None:
            eval_kwargs["log_dir"] = self.log_dir

        # Run evaluation
        logs = inspect_eval(task, model=model, **eval_kwargs)

        # Extract results from the first log
        if not logs:
            raise RuntimeError(
                "Inspect AI evaluation returned no logs. "
                "Check that the task and model arguments are valid."
            )
        log = logs[0]
        if log.status != "success":
            raise RuntimeError(
                f"Inspect AI evaluation failed with status: {log.status}"
            )
        return self._extract_scores(log)

    def _extract_scores(self, log: Any) -> Dict[str, Any]:
        """Parse an EvalLog's results into a flat score dictionary.

        Iterates over ``log.results.scores`` (a list of ``EvalScore``),
        flattening each scorer's ``metrics`` dict into a single output dict.
        NOTE: if two scorers emit the same metric name, the later one wins.

        Args:
            log: An ``inspect_ai`` ``EvalLog`` object.

        Returns:
            Dictionary mapping metric names to their values.
        """
        scores: Dict[str, Any] = {}
        if log.results is None:
            return scores
        for eval_score in log.results.scores:
            for metric_name, metric in eval_score.metrics.items():
                scores[metric_name] = metric.value
        return scores