Spaces:
Sleeping
Sleeping
| from dataclasses import dataclass, field | |
| from typing import Any, Literal, overload | |
# A single chat message, stored as a plain dict.
# Per the original note, the keys are "role" and "content".
Message = dict[str, Any]

# An ordered conversation: a list of Message dicts.
MessageList = list[Message]
@dataclass
class SamplerResponse:
    """
    Response from a sampler.

    Declared as a dataclass so that instances can be constructed directly
    from the three fields below (positionally in declaration order, or by
    keyword). All three fields are required.
    """

    # Text of the response.
    response_text: str
    # NOTE(review): going by the field name, presumably the message list that
    # was actually sent to the model -- confirm against sampler implementations.
    actual_queried_message_list: MessageList
    # Free-form extra data attached to the response.
    response_metadata: dict[str, Any]
class SamplerBase:
    """
    Abstract interface for a sampling model.

    A sampler is either the model under evaluation or a model used as part
    of the grading process. Concrete samplers override ``__call__``.
    """

    def __call__(self, message_list: MessageList) -> SamplerResponse:
        # The base class has no behaviour of its own; subclasses must
        # provide the implementation.
        raise NotImplementedError
@dataclass
class EvalResult:
    """
    Result of running an evaluation (usually consisting of many samples).

    Declared as a dataclass so that the fields below become constructor
    parameters. None of them has a default, so all five must be supplied.
    """

    # Top-line metric.
    score: float | None
    # Other metrics, keyed by name.
    metrics: dict[str, float] | None
    # Strings of valid HTML.
    htmls: list[str]
    # Sampled conversations.
    convos: list[MessageList]
    # Extra data such as rubric scores or "sollen" (term kept verbatim from
    # the original comment; NOTE(review): meaning unclear -- confirm).
    metadata: dict[str, Any] | None
@dataclass
class SingleEvalResult:
    """
    Result of evaluating a single sample.

    Declared as a dataclass: ``score`` is the only required field, and every
    other field has a default. The ``field(default_factory=dict)`` default
    on ``metrics`` only takes effect under ``@dataclass``; without the
    decorator the attribute would be a raw ``Field`` object rather than a
    dict.
    """

    score: float | None
    # default_factory gives every instance its own fresh dict rather than a
    # single dict shared across instances.
    metrics: dict[str, float] = field(default_factory=dict)
    html: str | None = None
    # Sampled conversation.
    convo: MessageList | None = None
    # Extra data such as rubric scores or "sollen" (term kept verbatim from
    # the original comment; NOTE(review): meaning unclear -- confirm).
    example_level_metadata: dict[str, Any] | None = None
class Eval:
    """
    Abstract interface for an evaluation.

    An evaluation is called with a sampler and returns an ``EvalResult``.
    Concrete evaluations override ``__call__``.
    """

    def __call__(
        self,
        sampler: SamplerBase,
    ) -> EvalResult:
        # Placeholder only: each concrete evaluation supplies its own logic.
        raise NotImplementedError