File size: 1,416 Bytes
fa0576d
 
 
 
 
067ad94
fa0576d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be7275a
fa0576d
be7275a
 
 
 
 
 
067ad94
 
be7275a
 
 
 
 
 
067ad94
 
fa0576d
 
 
067ad94
fa0576d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from typing import Any, Optional

from pydantic import BaseModel


class Benchmark(BaseModel):
    """Descriptor of the benchmark that a ``Result`` refers to.

    All four fields are required; none declares a default.
    """

    name: str
    # Repository identifier. The exact format (URL vs. repo id) is not
    # visible from this file -- confirm against the producers of this data.
    repo: str
    # Presumably the number of tasks the benchmark contains -- confirm.
    num_tasks: int
    # Typed as a plain ``str``: no URL validation is applied by this model.
    url: str


class Harness(BaseModel):
    """Descriptor of the harness that a ``Result`` refers to.

    All four fields are required; none declares a default.
    """

    name: str
    # List of plain strings; what counts as a "skill" is defined elsewhere.
    skills: list[str]
    # Presumably "is open-source software" -- confirm the intended meaning.
    is_oss: bool
    # Typed as a plain ``str``: no URL validation is applied by this model.
    url: str


class Model(BaseModel):
    """Descriptor of the model that a ``Result`` refers to.

    ``repo`` is the only field with a default (``None``); every other field
    is required.
    """

    name: str
    # May be omitted or ``None``. Presumably unset for models without a
    # public repository -- confirm.
    repo: str | None = None
    # Presumably "is open-source" -- confirm the intended meaning.
    is_oss: bool
    # NOTE(review): unit is not shown here (raw count vs. millions/billions)
    # -- confirm before comparing values across entries.
    num_params: int
    # Free-form string; presumably a numeric-precision label -- confirm the
    # accepted values with the data producers.
    precision: str
    # Typed as a plain ``str``: no URL validation is applied by this model.
    url: str


class Environment(BaseModel):
    """Descriptor of the environment that a ``Result`` refers to.

    ``name`` and ``url`` are required. ``config`` is optional and defaults to
    ``None``; when present it is a string-keyed mapping whose values are not
    constrained by this model.
    """

    name: str
    config: dict[str, Any] | None = None
    # Typed as a plain ``str``: no URL validation is applied by this model.
    url: str


class Metrics(BaseModel):
    """Numeric outcome of a run, as stored in ``Result.metrics``.

    Only ``score`` is required. Every other field is optional and defaults to
    ``None``.

    NOTE(review): the ``mean_*`` and ``*_time_seconds`` fields are typed
    ``int`` (except the cost fields, which are ``float``). If producers emit
    fractional means or durations, how they are handled depends on the
    pydantic version in use -- confirm whether ``float`` was intended.
    """

    # The single mandatory metric.
    score: float

    # Run-level counts.
    n_tasks: int | None = None
    n_errors: int | None = None

    # Run-level token totals.
    n_input_tokens: int | None = None
    n_cache_tokens: int | None = None
    n_output_tokens: int | None = None
    n_total_tokens: int | None = None

    # Run-level durations (seconds, per the field names) and cost.
    total_time_seconds: int | None = None
    agent_time_seconds: int | None = None
    cost_usd: float | None = None

    # Per-task means of the quantities above.
    mean_input_tokens_per_task: int | None = None
    mean_cache_tokens_per_task: int | None = None
    mean_output_tokens_per_task: int | None = None
    mean_tokens_per_task: int | None = None
    mean_cost_usd_per_task: float | None = None
    mean_total_time_seconds_per_task: int | None = None
    mean_agent_time_seconds_per_task: int | None = None


class Result(BaseModel):
    """One result record, composed of five nested models.

    All five fields are required; none declares a default.
    """

    benchmark: Benchmark
    harness: Harness
    model: Model
    environment: Environment
    metrics: Metrics