File size: 3,537 Bytes
1c6306b
 
 
 
 
6da8289
1c6306b
 
 
 
 
 
 
c731123
1c6306b
 
 
 
 
 
 
6da8289
1c6306b
 
87993b5
 
 
6da8289
87993b5
6da8289
87993b5
 
 
6da8289
 
 
 
 
c731123
6da8289
87993b5
 
c731123
87993b5
1c6306b
 
 
 
 
 
 
 
6da8289
87993b5
1c6306b
 
6da8289
1c6306b
 
 
 
 
6da8289
 
 
1c6306b
 
 
6da8289
 
 
1c6306b
 
 
 
 
 
6da8289
 
 
 
1c6306b
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from dataclasses import dataclass, make_dataclass
from enum import Enum

from src.about import Tasks


def fields(raw_class):
    """Collect the values of a class's plain (non-dunder) attributes.

    Walks the class ``__dict__`` and keeps every value whose attribute
    name neither starts nor ends with a double underscore, so machinery
    such as ``__doc__``/``__module__`` is skipped.
    """

    def _is_plain(attr_name):
        # Drop dunder-ish names on either end.
        return not (attr_name.startswith("__") or attr_name.endswith("__"))

    return [value for attr_name, value in vars(raw_class).items() if _is_plain(attr_name)]


# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass(frozen=True)
class ColumnContent:
    """Immutable descriptor for one leaderboard column."""

    name: str  # user-facing column label
    type: str  # display type tag (e.g. "markdown", "str", "number")
    displayed_by_default: bool  # shown without the user toggling it on
    hidden: bool = False  # excluded from the selectable column list (see COLS)
    never_hidden: bool = False  # user cannot toggle this column off


## Leaderboard columns
# Each entry is an (attribute_name, type, default ColumnContent) triple — the
# three-element field spec accepted by dataclasses.make_dataclass below.
auto_eval_column_dict = []
# Main columns (displayed by default, in order)
auto_eval_column_dict.append(("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)))
auto_eval_column_dict.append(("organization", ColumnContent, ColumnContent("Organization", "str", True)))
auto_eval_column_dict.append(
    ("model_type_symbol", ColumnContent, ColumnContent("Model Type", "markdown", True, never_hidden=True))
)
# Cost/Efficiency metrics
auto_eval_column_dict.append(("agent_steps", ColumnContent, ColumnContent("Agent Steps", "number", True)))
auto_eval_column_dict.append(("cost_usd", ColumnContent, ColumnContent("Cost (USD)", "number", True)))
# Scores: one column per task defined in src.about.Tasks
for idx, task in enumerate(Tasks):
    # Only show overall ANLS (first task) by default
    displayed_by_default = idx == 0
    auto_eval_column_dict.append(
        (task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default))
    )
# Model information needed for filtering.
# NOTE(review): the original comment said these "will be hidden in display",
# but model_type is declared with hidden=False — confirm intended visibility.
auto_eval_column_dict.append(("model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=False)))
auto_eval_column_dict.append(("submission_date", ColumnContent, ColumnContent("Submission Date", "str", False)))
auto_eval_column_dict.append(("link", ColumnContent, ColumnContent("Link", "str", False, hidden=True)))

# We use make_dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptors for the submission-queue table.

    The attributes below are deliberately left unannotated, so
    ``@dataclass`` does not turn them into init fields; they remain plain
    class attributes harvested by the module-level ``fields()`` helper.
    """

    model = ColumnContent("model", "markdown", True)
    model_type = ColumnContent("model_type", "str", True)
    organization = ColumnContent("organization", "str", True)
    status = ColumnContent("status", "str", True)


## All the model information that we might need
@dataclass
class ModelDetails:
    """Display metadata for a model type (used as ModelType enum values)."""

    name: str  # machine-readable type name, e.g. "api", "open-weight"
    display_name: str = ""
    symbol: str = ""  # emoji fallback
    icon_filename: str = ""  # SVG icon file name; may be empty (see Unknown)
    color: str = ""  # accent color as a hex string, e.g. "#FF9F36"


class ModelType(Enum):
    """Closed set of model provenance types shown on the leaderboard.

    Each member's value is a ModelDetails record carrying the
    machine-readable name, emoji fallback symbol, icon file and accent
    color used for display.
    """

    API = ModelDetails(name="api", symbol="☁️", icon_filename="snow_cloud2.svg", color="#FF9F36")
    OpenWeight = ModelDetails(name="open-weight", symbol="🔓", icon_filename="snow_code.svg", color="#75CDD7")
    Unknown = ModelDetails(name="", symbol="?", icon_filename="", color="")

    def to_str(self, separator=" "):
        """Render as ``<symbol><separator><name>``, e.g. ``"☁️ api"``."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Parse a display/user string back into a ModelType.

        Matches on the lowercase type name or on the display symbol;
        anything unrecognized maps to ModelType.Unknown.
        """
        text = type.lower()
        # Bug fix: API's symbol is "☁️" (see the member above), but the
        # original only matched the unrelated "🔌" emoji, so a symbol-only
        # string produced by to_str() could not round-trip. Keep "🔌" too
        # for backward compatibility with any stored strings.
        if "api" in text or "☁️" in type or "🔌" in type:
            return ModelType.API
        if "open-weight" in text or "open weight" in text or "🔓" in type:
            return ModelType.OpenWeight
        return ModelType.Unknown


# Column selection
# User-facing names of every AutoEvalColumn entry not flagged hidden.
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

# Names and declared types of the submission-queue columns, in order.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# One user-facing column name per benchmark task in src.about.Tasks.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]