| | """
|
| | Inspect AI task definition that runs the existing agent and reuses the rubric scorer.
|
| | """
|
| |
|
| | from __future__ import annotations
|
| |
|
| | import asyncio
|
| | import json
|
| | import sys
|
| | from pathlib import Path
|
| | from typing import Any, Sequence
|
| |
|
| | from inspect_ai import Task, task
|
| | from inspect_ai.dataset import Sample, hf_dataset
|
| | from inspect_ai.scorer import Score, Target, mean, scorer
|
| | from inspect_ai.solver._task_state import TaskState
|
| | import litellm
|
| |
|
# Make the project root importable so the local `eval` package resolves
# regardless of the working directory this file is launched from.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
|
| |
|
| | from eval.rubric_eval import RubricData, evaluate_with_rubrics
|
| | from eval.solvers import get_solver
|
| |
|
| |
|
def _record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one raw dataset record into an Inspect ``Sample``.

    The record's ``rubric`` field is a JSON string; its parsed title,
    description, and rubric list are carried along in sample metadata so
    the scorer can reconstruct them later.
    """
    payload = json.loads(record["rubric"])

    return Sample(
        input=record["question"],
        target=record["solution"],
        id=record.get("discussion_topic_id"),
        metadata={
            "question": record["question"],
            "discussion_title": record.get("discussion_title"),
            "discussion_url": record.get("discussion_url"),
            "rubric_title": payload.get("title"),
            "rubric_description": payload.get("description"),
            "rubrics": payload.get("rubrics", []),
        },
    )
|
| |
|
| |
|
def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
    """Load a Hugging Face dataset split, mapping records via ``_record_to_sample``."""
    return hf_dataset(
        dataset_name,
        sample_fields=_record_to_sample,
        split=split,
        limit=limit,
    )
|
| |
|
| |
|
def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
    """Rebuild ``RubricData`` objects from the raw dicts stored in sample metadata."""
    return [RubricData(**raw) for raw in metadata.get("rubrics", [])]
|
| |
|
| |
|
@scorer(metrics=[mean()], name="rubric_scorer")
def rubric_scorer(judge_model: str = "gpt-5-mini"):
    """Build a scorer that grades model output against per-sample rubrics.

    The judge model evaluates the completion against the rubric criteria
    stored in the sample metadata; the normalized score becomes the Score
    value and the per-criterion details are kept in score metadata.
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Prefer the aggregated completion; fall back to the raw message text.
        answer = state.output.completion or state.output.message.text
        question = state.metadata.get("question", state.input_text)

        # evaluate_with_rubrics is synchronous, so run it off the event loop.
        result = await asyncio.to_thread(
            evaluate_with_rubrics,
            question,
            answer,
            _metadata_to_rubrics(state.metadata),
            judge_model,
        )

        details = {
            "raw_score": result.raw_score,
            "criterion_checks": [
                check.model_dump() for check in result.criterion_checks
            ],
            "discussion_title": state.metadata.get("discussion_title"),
            "discussion_url": state.metadata.get("discussion_url"),
            "reference_answer": target.text,
        }

        return Score(
            value=result.normalized_score,
            answer=answer,
            explanation=f"Normalized score {result.normalized_score:.3f}",
            metadata=details,
        )

    return score
|
| |
|
| |
|
@task(name="hf-benchmark-with-rubrics")
def hf_benchmark_with_rubrics(
    solver_name: str = "hf_agent",
    solver_kwargs: dict[str, Any] | None = None,
    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
    limit: int | None = None,
    judge_model: str = "gpt-5-mini",
) -> Task:
    """Inspect task: run the configured solver over the HF dataset and score with rubrics.

    Args:
        solver_name: Name of the solver registered with ``get_solver``.
        solver_kwargs: Extra keyword arguments forwarded to the solver; when
            ``None``, the defaults below are used.
        dataset_name: Dataset spec in the form ``'author/dataset@split'``.
        limit: Optional cap on the number of samples loaded.
        judge_model: Model used by the rubric judge.

    Raises:
        ValueError: If ``dataset_name`` does not contain an ``'@split'`` suffix.
    """
    # Build the default kwargs per call rather than using a mutable default
    # argument, which would be shared (and mutable) across invocations.
    if solver_kwargs is None:
        solver_kwargs = {
            "max_iterations": 10,
            "config_path": "agent/config_mcp_example.json",
        }

    litellm.drop_params = True

    if "@" not in dataset_name:
        raise ValueError("Dataset name must be in the format 'author/dataset@split'")
    # Split on the LAST '@' so a name containing '@' elsewhere still parses
    # (plain split("@") would raise an opaque unpacking ValueError).
    dataset_name, _, dataset_split = dataset_name.rpartition("@")
    dataset = _load_dataset(dataset_name, dataset_split, limit=limit)

    return Task(
        dataset=dataset,
        solver=get_solver(solver_name, **solver_kwargs),
        scorer=rubric_scorer(judge_model=judge_model),
        metadata={
            "dataset_name": dataset_name,
            "dataset_split": dataset_split,
            "solver_name": solver_name,
            "judge_model": judge_model,
        },
    )
|
| |
|