Spaces:
Runtime error
Runtime error
File size: 6,144 Bytes
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Inspect AI harness integration for OpenEnv.
Requires the ``inspect-ai`` package: ``pip install 'inspect-ai>=0.3.0'``
"""
from __future__ import annotations
from typing import Any, Dict, Optional
from openenv.core.evals.base import EvalHarness
class InspectAIHarness(EvalHarness):
    """Evaluation harness wrapping Inspect AI's ``eval()`` function.

    All ``inspect_ai`` imports are deferred to :meth:`run` so this class is
    importable without inspect-ai installed. An ``ImportError`` with a clear
    message is raised at call time if the dependency is missing.

    Args:
        log_dir: Directory for evaluation log output. Defaults to None, in
            which case no ``log_dir`` argument is forwarded to Inspect AI.

    ``eval_parameters`` keys accepted by :meth:`run`:

    +--------------------------+------------+-----------------+-------------------------------------+
    | Key                      | Type       | Default         | Purpose                             |
    +==========================+============+=================+=====================================+
    | ``model``                | str        | *required*      | Model string, e.g. "openai/gpt-4o"  |
    | ``task``                 | str|None   | ``dataset`` arg | Task file path or task string       |
    | ``task_args``            | dict       | ``{}``          | Arguments to pass to the task       |
    | ``max_samples``          | int|None   | None            | Limit samples per task              |
    | ``temperature``          | float|None | None            | Model generation temperature        |
    | ``max_tokens``           | int|None   | None            | Max generation tokens               |
    | ``epochs``               | int|None   | None            | Number of evaluation epochs         |
    | ``solver``               | list|None  | None            | Solver pipeline override            |
    | ``scorer``               | list|None  | None            | Scorer override                     |
    | ``model_args``           | dict       | ``{}``          | Provider-specific model kwargs      |
    +--------------------------+------------+-----------------+-------------------------------------+

    Keys whose value is ``None`` (or, for the two dict keys, empty) are not
    forwarded to Inspect AI at all, so its own defaults apply.
    """

    def __init__(
        self,
        *,
        log_dir: Optional[str] = None,
    ):
        self.log_dir = log_dir

    def run(
        self,
        harness_version: str,
        library_versions: Dict[str, str],
        dataset: str,
        eval_parameters: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Run an Inspect AI evaluation.

        Args:
            harness_version: Version of inspect-ai being used. Not read by
                this method.
            library_versions: Versions of supporting libraries. Not read by
                this method.
            dataset: Default task string (used when ``task`` is missing from,
                or ``None`` in, *eval_parameters*).
            eval_parameters: See class docstring for accepted keys.

        Returns:
            Dictionary mapping metric names to scores, taken from the first
            log returned by Inspect AI.

        Raises:
            ImportError: If ``inspect-ai`` is not installed.
            ValueError: If ``model`` is missing from *eval_parameters*.
            RuntimeError: If the evaluation returns no logs, or the first
                log's status is not "success".
        """
        try:
            from inspect_ai import eval as inspect_eval
        except ImportError as err:
            # Chain the original error so the underlying import failure
            # (e.g. a broken transitive dependency) stays visible.
            raise ImportError(
                "inspect-ai is required for InspectAIHarness. "
                "Install it with: pip install 'inspect-ai>=0.3.0'"
            ) from err

        # Extract required model parameter
        model = eval_parameters.get("model")
        if model is None:
            raise ValueError(
                "eval_parameters must include 'model' "
                "(e.g. 'openai/gpt-4o', 'hf/meta-llama/...')."
            )

        # Task: explicit parameter or fall back to dataset. An explicit
        # ``"task": None`` is treated the same as an absent key, matching the
        # documented ``str|None`` type with a ``dataset`` default.
        task = eval_parameters.get("task")
        if task is None:
            task = dataset

        # Build eval kwargs; only forward options the caller actually set.
        eval_kwargs: Dict[str, Any] = {}

        task_args = eval_parameters.get("task_args", {})
        if task_args:
            eval_kwargs["task_args"] = task_args

        model_args = eval_parameters.get("model_args", {})
        if model_args:
            eval_kwargs["model_args"] = model_args

        # NOTE(review): Inspect AI's eval() appears to treat ``max_samples``
        # as a cap on samples run in parallel and ``limit`` as the cap on the
        # number of samples evaluated -- confirm that the documented purpose
        # ("Limit samples per task") matches the intended key.
        for key in ("max_samples", "temperature", "max_tokens", "epochs"):
            value = eval_parameters.get(key)
            if value is not None:
                eval_kwargs[key] = value

        if eval_parameters.get("solver") is not None:
            eval_kwargs["solver"] = eval_parameters["solver"]

        if eval_parameters.get("scorer") is not None:
            eval_kwargs["scorer"] = eval_parameters["scorer"]

        if self.log_dir is not None:
            eval_kwargs["log_dir"] = self.log_dir

        # Run evaluation
        logs = inspect_eval(task, model=model, **eval_kwargs)

        # Extract results from the first log
        if not logs:
            raise RuntimeError(
                "Inspect AI evaluation returned no logs. "
                "Check that the task and model arguments are valid."
            )

        log = logs[0]
        if log.status != "success":
            raise RuntimeError(
                f"Inspect AI evaluation failed with status: {log.status}"
            )

        return self._extract_scores(log)

    def _extract_scores(self, log: Any) -> Dict[str, Any]:
        """Parse an EvalLog's results into a flat score dictionary.

        Iterates over ``log.results.scores`` (a list of ``EvalScore``),
        flattening each scorer's ``metrics`` dict into a single output dict.
        Metric names are not namespaced by scorer, so if two scorers report a
        metric with the same name the later one overwrites the earlier.

        Args:
            log: An ``inspect_ai`` ``EvalLog`` object.

        Returns:
            Dictionary mapping metric names to their values. Empty when
            ``log.results`` is ``None``.
        """
        scores: Dict[str, Any] = {}
        if log.results is None:
            return scores

        for eval_score in log.results.scores:
            for metric_name, metric in eval_score.metrics.items():
                scores[metric_name] = metric.value
        return scores
|