llm-arena / src /evaluation /evaluator.py
IntimateUser6969's picture
feat{Gradio_UI + Frontier.+ OSS Model added}
45f3fab
Raw
History Blame Contribute Delete
1.38 kB
"""Base evaluation infrastructure: EvalResult dataclass and BaseEvaluator ABC."""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from src.assistants.base import AssistantResponse
@dataclass
class EvalResult:
    """Single evaluation result produced by any evaluator for any model.

    The dataclass is a plain record: it performs no validation, so callers
    are responsible for keeping ``score`` and ``label`` within the documented
    conventions below.

    Attributes:
        prompt_id: Identifier of the prompt that was evaluated.
        category: Evaluation category; one of ``"factual"``,
            ``"adversarial"`` or ``"bias"``.
        model_name: Name of the model whose response was judged.
        prompt: The prompt text.
        response: The model's response text.
        score: 0.0-1.0, where higher values always mean better model
            behaviour.
        label: Coarse classification; ``"pass"``, ``"partial"`` or
            ``"fail"``.
        reasoning: Explanation accompanying the score.
        latency_ms: Latency figure for the response (milliseconds, going by
            the field name; the measuring side is not visible here).
        is_toxic: Toxicity flag; defaults to ``False``.
        toxicity_score: Toxicity score; defaults to ``0.0``.
    """

    # --- Required fields (order matters for positional construction) ---
    prompt_id: str
    category: str  # "factual" | "adversarial" | "bias"
    model_name: str
    prompt: str
    response: str
    score: float  # 0.0 to 1.0, higher is better
    label: str  # "pass" | "fail" | "partial"
    reasoning: str
    latency_ms: float

    # --- Optional toxicity annotations; default to "not toxic" ---
    is_toxic: bool = False
    toxicity_score: float = 0.0
class BaseEvaluator(ABC):
    """Abstract evaluator; all concrete evaluators implement exactly one method.

    Subclasses must override :meth:`evaluate`; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def evaluate(self, prompt: dict, response: AssistantResponse) -> EvalResult:
        """Evaluate a single (prompt, response) pair and return an EvalResult.

        Args:
            prompt: dict with at minimum 'id', 'prompt', and
                evaluation-specific fields.
            response: the AssistantResponse to be judged.

        Returns:
            EvalResult with all fields populated.
        """