File size: 6,144 Bytes
25bcc11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Inspect AI harness integration for OpenEnv.

Requires the ``inspect-ai`` package: ``pip install 'inspect-ai>=0.3.0'``
"""

from __future__ import annotations

from typing import Any, Dict, Optional

from openenv.core.evals.base import EvalHarness


class InspectAIHarness(EvalHarness):
    """Evaluation harness wrapping Inspect AI's ``eval()`` function.

    All ``inspect_ai`` imports are deferred to :meth:`run` so this class is
    importable without inspect-ai installed.  An ``ImportError`` with a clear
    message is raised at call time if the dependency is missing.

    Args:
        log_dir: Directory for evaluation log output. Defaults to None
            (Inspect AI writes logs to its default location).

    ``eval_parameters`` keys accepted by :meth:`run`:

    +--------------------------+----------+-----------------+-----------------------------------+
    | Key                      | Type     | Default         | Purpose                           |
    +==========================+==========+=================+===================================+
    | ``model``                | str      | *required*      | Model string, e.g. "openai/gpt-4o"|
    | ``task``                 | str|None | ``dataset`` arg | Task file path or task string     |
    | ``task_args``            | dict     | ``{}``          | Arguments to pass to the task     |
    | ``max_samples``          | int|None | None            | Limit samples per task            |
    | ``temperature``          | float|None| None           | Model generation temperature      |
    | ``max_tokens``           | int|None | None            | Max generation tokens             |
    | ``epochs``               | int|None | None            | Number of evaluation epochs       |
    | ``solver``               | list|None| None            | Solver pipeline override          |
    | ``scorer``               | list|None| None            | Scorer override                   |
    | ``model_args``           | dict     | ``{}``          | Provider-specific model kwargs    |
    +--------------------------+----------+-----------------+-----------------------------------+
    """

    def __init__(
        self,
        *,
        log_dir: Optional[str] = None,
    ):
        # Stored only; forwarded to inspect_ai.eval() as ``log_dir`` in run().
        self.log_dir = log_dir

    def run(
        self,
        harness_version: str,
        library_versions: Dict[str, str],
        dataset: str,
        eval_parameters: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Run an Inspect AI evaluation.

        Args:
            harness_version: Version of inspect-ai being used.
            library_versions: Versions of supporting libraries.
            dataset: Default task string (used when ``task`` is not specified
                in *eval_parameters*, or is explicitly ``None``).
            eval_parameters: See class docstring for accepted keys.

        Returns:
            Dictionary mapping metric names to scores.

        Raises:
            ImportError: If ``inspect-ai`` is not installed.
            ValueError: If ``model`` is missing from *eval_parameters*.
            RuntimeError: If the evaluation fails (log status is not "success").
        """
        try:
            from inspect_ai import eval as inspect_eval
        except ImportError as exc:
            # Chain the original error so the real import failure (e.g. a
            # broken transitive dependency) is visible in the traceback.
            raise ImportError(
                "inspect-ai is required for InspectAIHarness. "
                "Install it with: pip install 'inspect-ai>=0.3.0'"
            ) from exc

        # Extract required model parameter
        model = eval_parameters.get("model")
        if model is None:
            raise ValueError(
                "eval_parameters must include 'model' "
                "(e.g. 'openai/gpt-4o', 'hf/meta-llama/...')."
            )

        # Task: explicit parameter, or fall back to dataset.  The documented
        # type is str|None with default ``dataset``, so an explicit
        # ``task=None`` must also fall back (``.get(key, default)`` alone
        # would return None in that case).
        task = eval_parameters.get("task")
        if task is None:
            task = dataset

        # Build eval kwargs, forwarding only keys the caller actually set so
        # inspect_ai's own defaults apply otherwise.
        eval_kwargs: Dict[str, Any] = {}

        task_args = eval_parameters.get("task_args", {})
        if task_args:
            eval_kwargs["task_args"] = task_args

        model_args = eval_parameters.get("model_args", {})
        if model_args:
            eval_kwargs["model_args"] = model_args

        # All remaining optional parameters share the same pass-through rule:
        # forward if present and not None.
        for key in (
            "max_samples",
            "temperature",
            "max_tokens",
            "epochs",
            "solver",
            "scorer",
        ):
            value = eval_parameters.get(key)
            if value is not None:
                eval_kwargs[key] = value

        if self.log_dir is not None:
            eval_kwargs["log_dir"] = self.log_dir

        # Run evaluation
        logs = inspect_eval(task, model=model, **eval_kwargs)

        # Extract results from the first log
        if not logs:
            raise RuntimeError(
                "Inspect AI evaluation returned no logs. "
                "Check that the task and model arguments are valid."
            )
        log = logs[0]
        if log.status != "success":
            raise RuntimeError(
                f"Inspect AI evaluation failed with status: {log.status}"
            )

        return self._extract_scores(log)

    def _extract_scores(self, log: Any) -> Dict[str, Any]:
        """Parse an EvalLog's results into a flat score dictionary.

        Iterates over ``log.results.scores`` (a list of ``EvalScore``),
        flattening each scorer's ``metrics`` dict into a single output dict.

        NOTE(review): if two scorers emit a metric with the same name, the
        later one overwrites the earlier — presumably acceptable for current
        tasks; confirm if multi-scorer evals are expected.

        Args:
            log: An ``inspect_ai`` ``EvalLog`` object.

        Returns:
            Dictionary mapping metric names to their values. Empty when the
            log carries no results.
        """
        scores: Dict[str, Any] = {}
        if log.results is None:
            return scores

        for eval_score in log.results.scores:
            for metric_name, metric in eval_score.metrics.items():
                scores[metric_name] = metric.value

        return scores