File size: 1,416 Bytes
from typing import Any, Optional
from pydantic import BaseModel
class Benchmark(BaseModel):
    """Identifying details of a benchmark.

    Every field is required; none declares a default.
    """

    # Benchmark name.
    name: str
    # Repository string for the benchmark (format not specified here).
    repo: str
    # Number of tasks in the benchmark.
    num_tasks: int
    # Link associated with the benchmark.
    url: str
class Harness(BaseModel):
    """Identifying details of a harness.

    Every field is required; none declares a default.
    """

    # Harness name.
    name: str
    # Skill names as plain strings.
    skills: list[str]
    # Presumably "is open-source software" -- TODO confirm with the producer.
    is_oss: bool
    # Link associated with the harness.
    url: str
class Model(BaseModel):
    """Identifying details of a model.

    ``repo`` is the only optional field (defaults to ``None``); all others
    must be supplied.
    """

    # Model name.
    name: str
    # Repository string for the model; may be omitted.
    repo: str | None = None
    # Presumably "is open-source software" -- TODO confirm with the producer.
    is_oss: bool
    # Parameter count as an integer.
    num_params: int
    # Precision label as a free-form string (allowed values not defined here).
    precision: str
    # Link associated with the model.
    url: str
class Environment(BaseModel):
    """Identifying details of an environment.

    ``name`` and ``url`` are required; ``config`` may be omitted and then
    defaults to ``None``.
    """

    # Environment name.
    name: str
    # Arbitrary string-keyed configuration mapping; schema is not defined here.
    config: dict[str, Any] | None = None
    # Link associated with the environment.
    url: str
class Metrics(BaseModel):
    """Numeric metrics for a result.

    Only ``score`` is required. Every other field is optional and defaults
    to ``None`` when not provided.

    NOTE(review): the duration fields and the ``mean_*`` token/time fields
    are declared as ``int``; confirm that producers never emit fractional
    values for them.
    """

    # The one mandatory metric.
    score: float

    # --- Counts ---
    n_tasks: int | None = None
    n_errors: int | None = None

    # --- Token totals ---
    n_input_tokens: int | None = None
    n_cache_tokens: int | None = None
    n_output_tokens: int | None = None
    n_total_tokens: int | None = None

    # --- Durations (seconds, per the field names) and cost (USD) ---
    total_time_seconds: int | None = None
    agent_time_seconds: int | None = None
    cost_usd: float | None = None

    # --- Per-task means ---
    mean_input_tokens_per_task: int | None = None
    mean_cache_tokens_per_task: int | None = None
    mean_output_tokens_per_task: int | None = None
    mean_tokens_per_task: int | None = None
    mean_cost_usd_per_task: float | None = None
    mean_total_time_seconds_per_task: int | None = None
    mean_agent_time_seconds_per_task: int | None = None
class Result(BaseModel):
    """One result record: a benchmark, harness, model, environment and metrics.

    All five nested models are required.
    """

    # What was run.
    benchmark: Benchmark
    # What it was run with.
    harness: Harness
    model: Model
    environment: Environment
    # The recorded metrics.
    metrics: Metrics
|