Spaces:
Sleeping
Sleeping
| """Base evaluation infrastructure: EvalResult dataclass and BaseEvaluator ABC.""" | |
| from __future__ import annotations | |
| from abc import ABC, abstractmethod | |
| from dataclasses import dataclass, field | |
| from src.assistants.base import AssistantResponse | |
@dataclass
class EvalResult:
    """Single evaluation result produced by any evaluator for any model.

    Declared as a dataclass so that instances can be built with keyword
    arguments and compared by value.

    Attributes:
        prompt_id: identifier of the prompt that was evaluated.
        category: evaluation category: "factual", "adversarial", or "bias".
        model_name: name of the model that produced the response.
        prompt: the prompt text.
        response: the response text that was judged.
        score: 0.0–1.0 where higher values always mean better model behaviour.
        label: coarse classification: "pass", "partial", or "fail".
        reasoning: explanation accompanying the score and label.
        latency_ms: latency, in milliseconds per the field name.
        is_toxic: toxicity flag; defaults to False.
        toxicity_score: toxicity score; defaults to 0.0.
    """

    prompt_id: str
    category: str  # "factual" | "adversarial" | "bias"
    model_name: str
    prompt: str
    response: str
    score: float  # 0.0 to 1.0
    label: str  # "pass" | "fail" | "partial"
    reasoning: str
    latency_ms: float
    # Fields with defaults must stay after all required fields.
    is_toxic: bool = False
    toxicity_score: float = 0.0
class BaseEvaluator(ABC):
    """Abstract evaluator — all concrete evaluators implement exactly one method."""

    @abstractmethod
    def evaluate(self, prompt: dict, response: AssistantResponse) -> EvalResult:
        """Evaluate a single (prompt, response) pair and return an EvalResult.

        Marked abstract so that BaseEvaluator itself cannot be instantiated
        and a subclass that omits this method fails at construction time
        rather than silently returning None.

        Args:
            prompt: dict with at minimum 'id', 'prompt', and evaluation-specific fields.
            response: the AssistantResponse to be judged.

        Returns:
            EvalResult with all fields populated.
        """