| | """ |
| | Inspect AI task definition that runs the existing agent and reuses the rubric scorer. |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import asyncio |
| | import json |
| | import sys |
| | from pathlib import Path |
| | from typing import Any, Sequence |
| |
|
| | from inspect_ai import Task, task |
| | from inspect_ai.dataset import Sample, hf_dataset |
| | from inspect_ai.scorer import Score, Target, mean, scorer |
| | from inspect_ai.solver._task_state import TaskState |
| | import litellm |
| |

# Make the project root importable so the local ``eval`` package resolves
# regardless of the working directory Inspect is launched from.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from eval.rubric_eval import RubricData, evaluate_with_rubrics
from eval.solvers import get_solver


def _record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert a Hugging Face dataset record into an Inspect ``Sample``."""
    rubric_payload = json.loads(record["rubric"])
    rubrics = rubric_payload.get("rubrics", [])

    metadata = {
        "question": record["question"],
        "discussion_title": record.get("discussion_title"),
        "discussion_url": record.get("discussion_url"),
        "rubric_title": rubric_payload.get("title"),
        "rubric_description": rubric_payload.get("description"),
        "rubrics": rubrics,
    }

    return Sample(
        input=record["question"],
        target=record["solution"],
        id=record.get("discussion_topic_id"),
        metadata=metadata,
    )
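
# The conversion above assumes records shaped roughly as follows (inferred from the
# field accesses in _record_to_sample; the dataset's actual schema is not confirmed
# here). Note that "rubric" is a JSON string rather than a nested object:
#   {
#       "question": "...",
#       "solution": "...",
#       "discussion_topic_id": 123,
#       "discussion_title": "...",
#       "discussion_url": "https://...",
#       "rubric": '{"title": "...", "description": "...", "rubrics": [...]}',
#   }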


def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
    return hf_dataset(
        dataset_name, sample_fields=_record_to_sample, split=split, limit=limit
    )


def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
    """Rebuild ``RubricData`` objects from the dicts stored in sample metadata."""
    raw_rubrics = metadata.get("rubrics", [])
    return [RubricData(**rubric) for rubric in raw_rubrics]


@scorer(metrics=[mean()], name="rubric_scorer")
def rubric_scorer(judge_model: str = "gpt-5-mini"):
    """Score completions against the per-sample rubrics using a judge model."""

    async def score(state: TaskState, target: Target) -> Score:
        response_text = state.output.completion or state.output.message.text
        question = state.metadata.get("question", state.input_text)
        rubrics = _metadata_to_rubrics(state.metadata)

        # Run the blocking rubric evaluation in a worker thread so it does not
        # stall the event loop.
        evaluation = await asyncio.to_thread(
            evaluate_with_rubrics,
            question,
            response_text,
            rubrics,
            judge_model,
        )

        score_metadata = {
            "raw_score": evaluation.raw_score,
            "criterion_checks": [
                check.model_dump() for check in evaluation.criterion_checks
            ],
            "discussion_title": state.metadata.get("discussion_title"),
            "discussion_url": state.metadata.get("discussion_url"),
            "reference_answer": target.text,
        }

        return Score(
            value=evaluation.normalized_score,
            answer=response_text,
            explanation=f"Normalized score {evaluation.normalized_score:.3f}",
            metadata=score_metadata,
        )

    return score
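
# The scorer relies on evaluate_with_rubrics returning an object exposing
# raw_score, normalized_score, and criterion_checks (pydantic models, given the
# .model_dump() calls). That interface is inferred from usage here rather than
# from eval.rubric_eval itself.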


@task(name="hf-benchmark-with-rubrics")
def hf_benchmark_with_rubrics(
    solver_name: str = "hf_agent",
    solver_kwargs: dict[str, Any] | None = None,
    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
    limit: int | None = None,
    judge_model: str = "gpt-5-mini",
) -> Task:
    # Avoid a mutable default argument for the solver configuration.
    if solver_kwargs is None:
        solver_kwargs = {
            "max_iterations": 10,
            "config_path": "agent/config_mcp_example.json",
        }

    litellm.drop_params = True
    if "@" not in dataset_name:
        raise ValueError("Dataset name must be in the format 'author/dataset@split'")
    dataset_name, dataset_split = dataset_name.split("@")
    dataset = _load_dataset(dataset_name, dataset_split, limit=limit)

    return Task(
        dataset=dataset,
        solver=get_solver(solver_name, **solver_kwargs),
        scorer=rubric_scorer(judge_model=judge_model),
        metadata={
            "dataset_name": dataset_name,
            "dataset_split": dataset_split,
            "solver_name": solver_name,
            "judge_model": judge_model,
        },
    )
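
# Example invocation (a sketch; the module path eval/inspect_task.py is an assumption
# about where this file lives, and the Inspect CLI must be installed):
#
#   inspect eval eval/inspect_task.py@hf-benchmark-with-rubrics \
#       -T solver_name=hf_agent -T limit=5 -T judge_model=gpt-5-mini
#
# -T passes task arguments, so any of the parameters above can be overridden from
# the command line.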