| """ |
| Inspect AI task definition that runs the existing agent and reuses the rubric scorer. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import asyncio |
| import json |
| import sys |
| from pathlib import Path |
| from typing import Any, Sequence |
|
|
| from inspect_ai import Task, task |
| from inspect_ai.dataset import Sample, hf_dataset |
| from inspect_ai.scorer import Score, Target, mean, scorer |
| from inspect_ai.solver._task_state import TaskState |
| import litellm |
|
|
# Repository root: the directory one level above the folder containing this
# file. It is placed at the front of sys.path (once) so that project-local
# packages such as `eval`, imported right after this, can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
| from eval.rubric_eval import RubricData, evaluate_with_rubrics |
| from eval.solvers import get_solver |
|
|
|
|
def _record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one raw dataset record into an Inspect ``Sample``.

    The record's ``"rubric"`` field is a JSON string. It is decoded here and
    its title, description and ``"rubrics"`` list are copied into the sample
    metadata together with the question and the optional discussion
    title/URL.

    Args:
        record: A dataset row. The ``"question"``, ``"solution"`` and
            ``"rubric"`` keys are required; ``"discussion_title"``,
            ``"discussion_url"`` and ``"discussion_topic_id"`` are optional.

    Returns:
        A ``Sample`` whose input is the question, whose target is the
        reference solution, and whose id is ``discussion_topic_id``
        (``None`` when the record has no such key).

    Raises:
        KeyError: If a required key is missing from ``record``.
        json.JSONDecodeError: If ``record["rubric"]`` is not valid JSON.
    """
    payload = json.loads(record["rubric"])
    criteria = payload.get("rubrics", [])
    question = record["question"]

    sample_metadata = dict(
        question=question,
        discussion_title=record.get("discussion_title"),
        discussion_url=record.get("discussion_url"),
        rubric_title=payload.get("title"),
        rubric_description=payload.get("description"),
        rubrics=criteria,
    )

    return Sample(
        input=question,
        target=record["solution"],
        id=record.get("discussion_topic_id"),
        metadata=sample_metadata,
    )
|
|
|
|
def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
    """Load a Hugging Face dataset split as Inspect samples.

    Args:
        dataset_name: Dataset identifier passed straight to ``hf_dataset``.
        split: Name of the split to load.
        limit: Maximum number of records to load, or ``None`` to pass no
            limit to the loader.

    Returns:
        The dataset produced by ``hf_dataset``, with every record converted
        through ``_record_to_sample``.
    """
    loader_options = {
        "sample_fields": _record_to_sample,
        "split": split,
        "limit": limit,
    }
    return hf_dataset(dataset_name, **loader_options)
|
|
|
|
| def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]: |
| raw_rubrics = metadata.get("rubrics", []) |
| return [RubricData(**rubric) for rubric in raw_rubrics] |
|
|
|
|
@scorer(metrics=[mean()], name="rubric_scorer")
def rubric_scorer(judge_model: str = "gpt-5-mini"):
    """Create a scorer that grades a response against the sample's rubrics.

    Args:
        judge_model: Model identifier forwarded to ``evaluate_with_rubrics``.

    Returns:
        An async ``score(state, target)`` callable. Its ``Score`` carries the
        evaluation's normalized score as the value, the graded response text
        as the answer, and the raw score, per-criterion checks, discussion
        title/URL and reference answer as metadata.
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Use the completion text; fall back to the output message's text
        # when the completion is empty.
        model_output = state.output
        answer = model_output.completion
        if not answer:
            answer = model_output.message.text

        prompt = state.metadata.get("question", state.input_text)
        criteria = _metadata_to_rubrics(state.metadata)

        # The rubric evaluation is called through a worker thread
        # (presumably it is a blocking call) so the event loop stays free.
        result = await asyncio.to_thread(
            evaluate_with_rubrics, prompt, answer, criteria, judge_model
        )

        raw = result.raw_score
        checks = []
        for check in result.criterion_checks:
            checks.append(check.model_dump())

        details = {
            "raw_score": raw,
            "criterion_checks": checks,
            "discussion_title": state.metadata.get("discussion_title"),
            "discussion_url": state.metadata.get("discussion_url"),
            "reference_answer": target.text,
        }

        normalized = result.normalized_score
        return Score(
            value=normalized,
            answer=answer,
            explanation=f"Normalized score {result.normalized_score:.3f}",
            metadata=details,
        )

    return score
|
|
|
|
@task(name="hf-benchmark-with-rubrics")
def hf_benchmark_with_rubrics(
    solver_name: str = "hf_agent",
    # NOTE: this dict default is shared across calls. It is only ever
    # `**`-unpacked below (which copies it) and never mutated, so it is kept
    # as-is to leave the declared task interface unchanged.
    solver_kwargs: dict[str, Any] = {
        "max_iterations": 10,
        "config_path": "agent/config_mcp_example.json",
    },
    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
    limit: int | None = None,
    judge_model: str = "gpt-5-mini",
) -> Task:
    """Build the rubric-scored benchmark task.

    Args:
        solver_name: Name of the solver to look up with ``get_solver``.
        solver_kwargs: Keyword arguments forwarded to ``get_solver``.
        dataset_name: Dataset to load, written as ``"author/dataset@split"``.
            Exactly one ``"@"`` is required, with non-empty text on both
            sides.
        limit: Maximum number of samples to load, or ``None`` to pass no
            limit to the loader.
        judge_model: Model identifier handed to ``rubric_scorer``.

    Returns:
        A ``Task`` combining the loaded dataset, the selected solver and the
        rubric scorer. Its metadata records the dataset name, split, solver
        name and judge model.

    Raises:
        ValueError: If ``dataset_name`` is not of the form
            ``"author/dataset@split"``.
    """
    # Process-wide LiteLLM switch: per the LiteLLM docs, parameters a provider
    # does not support are dropped instead of raising an error.
    litellm.drop_params = True

    # Parse "<repo>@<split>". A bare `split("@")` would raise an opaque
    # unpacking error on extra "@" characters and would accept an empty repo
    # or split; every malformed shape is rejected here with one clear message.
    repo_id, separator, dataset_split = dataset_name.partition("@")
    if not (separator and repo_id and dataset_split) or "@" in dataset_split:
        raise ValueError(
            "Dataset name must be in the format 'author/dataset@split' "
            f"(got {dataset_name!r})"
        )

    dataset = _load_dataset(repo_id, dataset_split, limit=limit)

    return Task(
        dataset=dataset,
        solver=get_solver(solver_name, **solver_kwargs),
        scorer=rubric_scorer(judge_model=judge_model),
        metadata={
            "dataset_name": repo_id,
            "dataset_split": dataset_split,
            "solver_name": solver_name,
            "judge_model": judge_model,
        },
    )
|
|