"""
Inspect AI task definition that runs the existing agent and reuses the rubric scorer.
"""

from __future__ import annotations

import asyncio
import json
import sys
from pathlib import Path
from typing import Any, Sequence

from inspect_ai import Task, task
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.scorer import Score, Target, mean, scorer
from inspect_ai.solver._task_state import TaskState

import litellm

# Make the project root importable so the eval.* modules resolve regardless of
# where this file is run from.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from eval.rubric_eval import RubricData, evaluate_with_rubrics  # noqa: E402
from eval.solvers import get_solver  # noqa: E402


def _record_to_sample(record: dict[str, Any]) -> Sample:
    rubric_payload = json.loads(record["rubric"])
    rubrics = rubric_payload.get("rubrics", [])
    metadata = {
        "question": record["question"],
        "discussion_title": record.get("discussion_title"),
        "discussion_url": record.get("discussion_url"),
        "rubric_title": rubric_payload.get("title"),
        "rubric_description": rubric_payload.get("description"),
        "rubrics": rubrics,
    }
    return Sample(
        input=record["question"],
        target=record["solution"],
        id=record.get("discussion_topic_id"),
        metadata=metadata,
    )
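
# For reference, _record_to_sample expects records shaped roughly like the
# following (field names taken from the accesses above; values illustrative):
#
#   {
#       "question": "...",
#       "solution": "...",
#       "rubric": '{"title": "...", "description": "...", "rubrics": [...]}',
#       "discussion_topic_id": 123,
#       "discussion_title": "...",
#       "discussion_url": "https://...",
#   }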


def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
    return hf_dataset(
        dataset_name, sample_fields=_record_to_sample, split=split, limit=limit
    )


def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
    raw_rubrics = metadata.get("rubrics", [])
    return [RubricData(**rubric) for rubric in raw_rubrics]
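
# Each rubric dict is expanded into a RubricData via keyword arguments; the
# exact field names are defined by eval.rubric_eval.RubricData.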


@scorer(metrics=[mean()], name="rubric_scorer")
def rubric_scorer(judge_model: str = "gpt-5-mini"):
    async def score(state: TaskState, target: Target) -> Score:
        # Prefer the assembled completion text; fall back to the message body.
        response_text = state.output.completion or state.output.message.text
        question = state.metadata.get("question", state.input_text)
        rubrics = _metadata_to_rubrics(state.metadata)
        # evaluate_with_rubrics is synchronous, so run it in a worker thread to
        # avoid blocking the event loop while other samples are being scored.
        evaluation = await asyncio.to_thread(
            evaluate_with_rubrics,
            question,
            response_text,
            rubrics,
            judge_model,
        )
        score_metadata = {
            "raw_score": evaluation.raw_score,
            "criterion_checks": [
                check.model_dump() for check in evaluation.criterion_checks
            ],
            "discussion_title": state.metadata.get("discussion_title"),
            "discussion_url": state.metadata.get("discussion_url"),
            "reference_answer": target.text,
        }
        return Score(
            value=evaluation.normalized_score,
            answer=response_text,
            explanation=f"Normalized score {evaluation.normalized_score:.3f}",
            metadata=score_metadata,
        )

    return score


@task(name="hf-benchmark-with-rubrics")
def hf_benchmark_with_rubrics(
    solver_name: str = "hf_agent",
    solver_kwargs: dict[str, Any] | None = None,
    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
    limit: int | None = None,
    judge_model: str = "gpt-5-mini",
) -> Task:
    # Avoid a shared mutable default argument for solver_kwargs.
    if solver_kwargs is None:
        solver_kwargs = {
            "max_iterations": 10,
            "config_path": "agent/config_mcp_example.json",
        }
    # Drop provider-unsupported params instead of raising inside litellm.
    litellm.drop_params = True
    if "@" not in dataset_name:
        raise ValueError("Dataset name must be in the format 'author/dataset@split'")
    dataset_name, dataset_split = dataset_name.split("@", 1)
    dataset = _load_dataset(dataset_name, dataset_split, limit=limit)
    return Task(
        dataset=dataset,
        solver=get_solver(solver_name, **solver_kwargs),
        scorer=rubric_scorer(judge_model=judge_model),
        metadata={
            "dataset_name": dataset_name,
            "dataset_split": dataset_split,
            "solver_name": solver_name,
            "judge_model": judge_model,
        },
    )
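

# Local smoke test: a minimal sketch that assumes inspect_ai's Python eval()
# entry point; the model name here is illustrative, not a project default.
if __name__ == "__main__":
    from inspect_ai import eval as inspect_eval

    inspect_eval(hf_benchmark_with_rubrics(limit=2), model="openai/gpt-4o-mini")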