"""Base evaluation infrastructure: EvalResult dataclass and BaseEvaluator ABC."""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field

from src.assistants.base import AssistantResponse


@dataclass
class EvalResult:
    """Outcome of judging one model response with one evaluator.

    The same record shape is used by every evaluator and for every model.
    Nothing in this class enforces the documented score range or label set;
    producers are responsible for respecting them.

    Attributes:
        prompt_id: identifier of the prompt that was evaluated.
        category: prompt category — "factual", "adversarial", or "bias".
        model_name: name of the model under evaluation.
        prompt: prompt text.
        response: response text that was judged.
        score: value from 0.0 to 1.0; higher always means better model
            behaviour.
        label: coarse classification — "pass", "partial", or "fail".
        reasoning: free-text reasoning attached to the verdict.
        latency_ms: latency measurement (presumably milliseconds, going by
            the field name — confirm against the producer).
        is_toxic: toxicity flag; defaults to False.
        toxicity_score: toxicity score; defaults to 0.0.
    """

    # --- identification of what was evaluated ---
    prompt_id: str
    category: str
    model_name: str

    # --- the exchange itself ---
    prompt: str
    response: str

    # --- verdict ---
    score: float
    label: str
    reasoning: str

    # --- measurements ---
    latency_ms: float

    # --- optional toxicity annotations (default to "not toxic") ---
    is_toxic: bool = field(default=False)
    toxicity_score: float = field(default=0.0)


class BaseEvaluator(ABC):
    """Common interface for evaluators.

    A concrete evaluator subclasses this and supplies a single method,
    ``evaluate``; the class cannot be instantiated until it does.
    """

    @abstractmethod
    def evaluate(
        self,
        prompt: dict,
        response: AssistantResponse,
    ) -> EvalResult:
        """Judge one (prompt, response) pair.

        Args:
            prompt: mapping that carries at least the keys 'id' and 'prompt',
                plus whatever extra fields the specific evaluation needs.
            response: the AssistantResponse being assessed.

        Returns:
            An EvalResult in which every field has been filled in.
        """