File size: 6,144 Bytes
25bcc11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Inspect AI harness integration for OpenEnv.

Requires the ``inspect-ai`` package: ``pip install 'inspect-ai>=0.3.0'``
"""

from __future__ import annotations

from typing import Any, Dict, Optional

from openenv.core.evals.base import EvalHarness


class InspectAIHarness(EvalHarness):
    """Evaluation harness wrapping Inspect AI's ``eval()`` function.

    All ``inspect_ai`` imports are deferred to :meth:`run` so this class is
    importable without inspect-ai installed.  An ``ImportError`` with a clear
    message is raised at call time if the dependency is missing.

    Args:
        log_dir: Directory for evaluation log output. Defaults to None
            (Inspect AI writes logs to its default location).

    ``eval_parameters`` keys accepted by :meth:`run`:

    +--------------------------+----------+-----------------+-----------------------------------+
    | Key                      | Type     | Default         | Purpose                           |
    +==========================+==========+=================+===================================+
    | ``model``                | str      | *required*      | Model string, e.g. "openai/gpt-4o"|
    | ``task``                 | str|None | ``dataset`` arg | Task file path or task string     |
    | ``task_args``            | dict     | ``{}``          | Arguments to pass to the task     |
    | ``max_samples``          | int|None | None            | Limit samples per task            |
    | ``temperature``          | float|None| None           | Model generation temperature      |
    | ``max_tokens``           | int|None | None            | Max generation tokens             |
    | ``epochs``               | int|None | None            | Number of evaluation epochs       |
    | ``solver``               | list|None| None            | Solver pipeline override          |
    | ``scorer``               | list|None| None            | Scorer override                   |
    | ``model_args``           | dict     | ``{}``          | Provider-specific model kwargs    |
    +--------------------------+----------+-----------------+-----------------------------------+
    """

    def __init__(
        self,
        *,
        log_dir: Optional[str] = None,
    ):
        # Stored only; forwarded to inspect_ai.eval() as ``log_dir`` in run().
        self.log_dir = log_dir

    def run(
        self,
        harness_version: str,
        library_versions: Dict[str, str],
        dataset: str,
        eval_parameters: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Run an Inspect AI evaluation.

        Args:
            harness_version: Version of inspect-ai being used.
            library_versions: Versions of supporting libraries.
            dataset: Default task string (used when ``task`` is not specified
                in *eval_parameters*, or is explicitly ``None``).
            eval_parameters: See class docstring for accepted keys.

        Returns:
            Dictionary mapping metric names to scores.

        Raises:
            ImportError: If ``inspect-ai`` is not installed.
            ValueError: If ``model`` is missing from *eval_parameters*.
            RuntimeError: If the evaluation fails (log status is not "success").
        """
        try:
            from inspect_ai import eval as inspect_eval
        except ImportError as exc:
            # Chain the original error so the real import failure (e.g. a
            # broken transitive dependency) is visible in the traceback.
            raise ImportError(
                "inspect-ai is required for InspectAIHarness. "
                "Install it with: pip install 'inspect-ai>=0.3.0'"
            ) from exc

        # Extract required model parameter
        model = eval_parameters.get("model")
        if model is None:
            raise ValueError(
                "eval_parameters must include 'model' "
                "(e.g. 'openai/gpt-4o', 'hf/meta-llama/...')."
            )

        # Task: explicit parameter, or fall back to dataset.  The documented
        # type is str|None with default ``dataset``, so an explicit
        # ``task=None`` must also fall back (``.get(key, default)`` alone
        # would return None in that case).
        task = eval_parameters.get("task")
        if task is None:
            task = dataset

        # Build eval kwargs, forwarding only keys the caller actually set so
        # inspect_ai's own defaults apply otherwise.
        eval_kwargs: Dict[str, Any] = {}

        task_args = eval_parameters.get("task_args", {})
        if task_args:
            eval_kwargs["task_args"] = task_args

        model_args = eval_parameters.get("model_args", {})
        if model_args:
            eval_kwargs["model_args"] = model_args

        # All remaining optional parameters share the same pass-through rule:
        # forward if present and not None.
        for key in (
            "max_samples",
            "temperature",
            "max_tokens",
            "epochs",
            "solver",
            "scorer",
        ):
            value = eval_parameters.get(key)
            if value is not None:
                eval_kwargs[key] = value

        if self.log_dir is not None:
            eval_kwargs["log_dir"] = self.log_dir

        # Run evaluation
        logs = inspect_eval(task, model=model, **eval_kwargs)

        # Extract results from the first log
        if not logs:
            raise RuntimeError(
                "Inspect AI evaluation returned no logs. "
                "Check that the task and model arguments are valid."
            )
        log = logs[0]
        if log.status != "success":
            raise RuntimeError(
                f"Inspect AI evaluation failed with status: {log.status}"
            )

        return self._extract_scores(log)

    def _extract_scores(self, log: Any) -> Dict[str, Any]:
        """Parse an EvalLog's results into a flat score dictionary.

        Iterates over ``log.results.scores`` (a list of ``EvalScore``),
        flattening each scorer's ``metrics`` dict into a single output dict.

        NOTE(review): if two scorers emit a metric with the same name, the
        later one overwrites the earlier — presumably acceptable for current
        tasks; confirm if multi-scorer evals are expected.

        Args:
            log: An ``inspect_ai`` ``EvalLog`` object.

        Returns:
            Dictionary mapping metric names to their values. Empty when the
            log carries no results.
        """
        scores: Dict[str, Any] = {}
        if log.results is None:
            return scores

        for eval_score in log.results.scores:
            for metric_name, metric in eval_score.metrics.items():
                scores[metric_name] = metric.value

        return scores