Spaces:

IntimateUser6969
/

llm-arena

Sleeping

feat{Gradio_UI + Frontier.+ OSS Model added}

45f3fab about 1 month ago

1.38 kB

	"""Base evaluation infrastructure: EvalResult dataclass and BaseEvaluator ABC."""

	from __future__ import annotations

	from abc import ABC, abstractmethod
	from dataclasses import dataclass, field

	from src.assistants.base import AssistantResponse


	@dataclass
	class EvalResult:
	    """One evaluator's verdict on one model response.

	    A flat record holding the prompt and response that were judged, the
	    verdict (``score``, ``label``, ``reasoning``), a latency value in
	    milliseconds, and two toxicity fields that default to "not toxic".
	    Nothing is validated at construction time, so the value ranges and
	    vocabularies noted here are conventions for evaluators to uphold.

	    Conventions:
	        score: within 0.0–1.0; a higher value always means better model
	            behaviour.
	        label: coarse classification, one of 'pass', 'partial' or 'fail'.
	    """

	    # --- what was evaluated ---
	    prompt_id: str
	    category: str  # one of: "factual", "adversarial", "bias"
	    model_name: str
	    prompt: str
	    response: str

	    # --- verdict ---
	    score: float  # 0.0 to 1.0, higher is better
	    label: str  # one of: "pass", "fail", "partial"
	    reasoning: str

	    # --- timing ---
	    latency_ms: float

	    # --- toxicity (optional; defaults mean "not flagged") ---
	    is_toxic: bool = False
	    toxicity_score: float = 0.0


	class BaseEvaluator(ABC):
	    """Common interface shared by every evaluator.

	    Concrete evaluators subclass this and implement exactly one method,
	    ``evaluate``; the class cannot be instantiated until that method is
	    overridden.
	    """

	    @abstractmethod
	    def evaluate(self, prompt: dict, response: AssistantResponse) -> EvalResult:
	        """Judge one (prompt, response) pair and report the outcome.

	        Args:
	            prompt: Dict holding at minimum ``'id'`` and ``'prompt'``, plus
	                whatever evaluation-specific fields the evaluator needs.
	            response: The AssistantResponse to be judged.

	        Returns:
	            An EvalResult with all fields populated.
	        """