| """ | |
| Inspect AI task definition that runs the existing agent and reuses the rubric scorer. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Sequence | |
| from inspect_ai import Task, task | |
| from inspect_ai.dataset import Sample, hf_dataset | |
| from inspect_ai.scorer import Score, Target, mean, scorer | |
from inspect_ai.solver import TaskState  # public re-export; avoid the private _task_state module
import litellm

# Make the repository root importable so the local eval.* modules resolve
# even when this file is loaded from elsewhere (e.g. via the Inspect CLI).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from eval.rubric_eval import RubricData, evaluate_with_rubrics  # noqa: E402
from eval.solvers import get_solver  # noqa: E402

def _record_to_sample(record: dict[str, Any]) -> Sample:
    """Convert one dataset record into an Inspect Sample, carrying the rubric along as metadata."""
    rubric_payload = json.loads(record["rubric"])
    rubrics = rubric_payload.get("rubrics", [])
    metadata = {
        "question": record["question"],
        "discussion_title": record.get("discussion_title"),
        "discussion_url": record.get("discussion_url"),
        "rubric_title": rubric_payload.get("title"),
        "rubric_description": rubric_payload.get("description"),
        "rubrics": rubrics,
    }
    return Sample(
        input=record["question"],
        target=record["solution"],
        id=record.get("discussion_topic_id"),
        metadata=metadata,
    )
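
# Illustrative shape of record["rubric"] (a JSON string), inferred from the keys
# read above; the per-criterion fields are whatever eval.rubric_eval.RubricData
# declares, so treat this as a sketch rather than the canonical schema:
#
#   {
#     "title": "...",
#     "description": "...",
#     "rubrics": [ { <RubricData fields> }, ... ]
#   }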

def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
    return hf_dataset(
        dataset_name, sample_fields=_record_to_sample, split=split, limit=limit
    )

def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
    raw_rubrics = metadata.get("rubrics", [])
    return [RubricData(**rubric) for rubric in raw_rubrics]

@scorer(metrics=[mean()])
def rubric_scorer(judge_model: str = "gpt-5-mini"):
    async def score(state: TaskState, target: Target) -> Score:
        response_text = state.output.completion or state.output.message.text
        question = state.metadata.get("question", state.input_text)
        rubrics = _metadata_to_rubrics(state.metadata)
        # The judge call is synchronous, so run it in a worker thread to avoid
        # blocking the event loop while Inspect scores samples concurrently.
        evaluation = await asyncio.to_thread(
            evaluate_with_rubrics,
            question,
            response_text,
            rubrics,
            judge_model,
        )
        score_metadata = {
            "raw_score": evaluation.raw_score,
            "criterion_checks": [
                check.model_dump() for check in evaluation.criterion_checks
            ],
            "discussion_title": state.metadata.get("discussion_title"),
            "discussion_url": state.metadata.get("discussion_url"),
            "reference_answer": target.text,
        }
        return Score(
            value=evaluation.normalized_score,
            answer=response_text,
            explanation=f"Normalized score {evaluation.normalized_score:.3f}",
            metadata=score_metadata,
        )

    return score
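
# Quick manual check of the judge outside of Inspect (a sketch: assumes valid
# RubricData criterion fields and credentials for the judge model; the values
# below are illustrative, not part of this module):
#
#   rubrics = [RubricData(...)]  # fill in real criterion fields
#   result = evaluate_with_rubrics("What is X?", "candidate answer", rubrics, "gpt-5-mini")
#   print(result.raw_score, result.normalized_score, result.criterion_checks)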

@task
def hf_benchmark_with_rubrics(
    solver_name: str = "hf_agent",
    solver_kwargs: dict[str, Any] | None = None,
    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
    limit: int | None = None,
    judge_model: str = "gpt-5-mini",
) -> Task:
    # Avoid a mutable default argument; fall back to the agent defaults here.
    if solver_kwargs is None:
        solver_kwargs = {
            "max_iterations": 10,
            "config_path": "agent/config_mcp_example.json",
        }
    # Drop parameters a provider does not support instead of raising.
    litellm.drop_params = True
    if "@" not in dataset_name:
        raise ValueError("Dataset name must be in the format 'author/dataset@split'")
    dataset_name, dataset_split = dataset_name.rsplit("@", 1)
    dataset = _load_dataset(dataset_name, dataset_split, limit=limit)
    return Task(
        dataset=dataset,
        solver=get_solver(solver_name, **solver_kwargs),
        scorer=rubric_scorer(judge_model=judge_model),
        metadata={
            "dataset_name": dataset_name,
            "dataset_split": dataset_split,
            "solver_name": solver_name,
            "judge_model": judge_model,
        },
    )
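
# Usage sketch (the file path is an assumption; point Inspect at wherever this
# module actually lives). Task arguments are forwarded with -T:
#
#   inspect eval eval/inspect_task.py@hf_benchmark_with_rubrics -T limit=5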