File size: 5,583 Bytes
5c49242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""GRM Evaluation Suite scoring logic.



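GRM Score = mean(Roleplay_score, Actions_score, General_score)
(a category with no available score is left out of the mean)

Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
(taken over the category's GRM benchmarks that have a score)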

"""

from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category
from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES

# A single benchmark score for a model; None stands for "no score recorded".
ScoreValue = float | None


def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue:
    """Look up the score recorded for ``benchmark`` in ``model_scores``.

    Args:
        model_scores: Mapping of benchmark id to score (or ``None``).
        benchmark: Benchmark definition; only its ``"id"`` entry is read.

    Returns:
        The stored value, or ``None`` when the id has no entry.
    """
    benchmark_id = benchmark["id"]
    return model_scores.get(benchmark_id)


def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]:
    """Collect the benchmarks of ``category`` that count towards the GRM score.

    Args:
        category: Category key passed to ``get_benchmarks_by_category``.
        benchmark_ids: Optional whitelist of benchmark ids; ``None`` disables
            the id filter.

    Returns:
        The benchmark dicts flagged ``included_in_grm`` (and, when a whitelist
        is given, whose ``"id"`` is in it), in their original order.
    """
    selected = []
    for entry in get_benchmarks_by_category(category):
        if not entry["included_in_grm"]:
            continue
        if benchmark_ids is not None and entry["id"] not in benchmark_ids:
            continue
        selected.append(entry)
    return selected


def _compute_category_score_raw(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the unrounded weighted average score of ``category``.

    Benchmarks without a score are skipped entirely, so they contribute
    neither to the weighted sum nor to the total weight.

    Args:
        model_scores: Mapping of benchmark id to score (or ``None``).
        category: Category key to score.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        ``sum(score * calc_weight) / sum(calc_weight)`` over the scored
        benchmarks, or ``None`` when the accumulated weight is zero.
    """
    weighted_sum = 0.0
    weight_sum = 0.0
    for entry in _category_benchmarks(category, benchmark_ids):
        value = _score_for(model_scores, entry)
        if value is None:
            continue
        weight_sum += entry["calc_weight"]
        weighted_sum += value * entry["calc_weight"]

    # A zero total weight means there is nothing to average.
    return None if weight_sum == 0 else weighted_sum / weight_sum


def compute_category_score(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the weighted category average, rounded to one decimal place.

    Args:
        model_scores: Mapping of benchmark id to score (or ``None``).
        category: Category key to score.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        The rounded weighted average, or ``None`` when the category has no
        usable score.
    """
    unrounded = _compute_category_score_raw(model_scores, category, benchmark_ids)
    if unrounded is None:
        return None
    return round(unrounded, 1)


def compute_category_components(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | int | None]:
    """Break a category down into its score, per-weight averages and coverage.

    Args:
        model_scores: Mapping of benchmark id to score (or ``None``).
        category: Category key to analyse.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        A dict with these keys:

        * ``"score"``: the rounded weighted category score.
        * ``"core_avg"``: plain mean of the scored benchmarks whose
          ``calc_weight`` equals ``1.0``.
        * ``"supplementary_avg"``: plain mean of the scored benchmarks whose
          ``calc_weight`` equals ``0.5``.
        * ``"missing"``: number of considered benchmarks without a score.
        * ``"benchmarks"``: number of considered benchmarks.

        Averages are rounded to one decimal place and are ``None`` when no
        benchmark of that weight has a score.
    """
    entries = _category_benchmarks(category, benchmark_ids)
    # Pair every benchmark with its (possibly missing) score once up front.
    scored = [(entry, _score_for(model_scores, entry)) for entry in entries]

    def mean_at_weight(target: float) -> float | None:
        values = [value for entry, value in scored if entry["calc_weight"] == target and value is not None]
        if values:
            return round(sum(values) / len(values), 1)
        return None

    unscored = sum(1 for _, value in scored if value is None)
    return {
        "score": compute_category_score(model_scores, category, benchmark_ids),
        "core_avg": mean_at_weight(1.0),
        "supplementary_avg": mean_at_weight(0.5),
        "missing": unscored,
        "benchmarks": len(entries),
    }


def compute_grm_score(
    model_scores: dict[str, ScoreValue],
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | None]:
    """Compute the overall GRM Score together with the three category scores.

    Every category in ``CATEGORIES`` is scored with
    ``_compute_category_score_raw``.  The GRM Score is the plain mean of the
    unrounded category scores that are available; a category without a score
    is left out of the mean rather than counted as zero.

    Args:
        model_scores: Mapping of benchmark id to score (or ``None``).
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        A dict with the keys ``"GRM Score"``, ``"Roleplay (33%)"``,
        ``"Actions (33%)"`` and ``"General (33%)"``.  Values are rounded to
        one decimal place, or ``None`` when nothing was available.
    """
    unrounded = {}
    for category in CATEGORIES:
        unrounded[category] = _compute_category_score_raw(model_scores, category, benchmark_ids)

    rounded = {}
    for category, value in unrounded.items():
        rounded[category] = None if value is None else round(value, 1)

    # Average the unrounded values so per-category rounding does not leak into the total.
    present = [value for value in unrounded.values() if value is not None]
    if present:
        overall = round(sum(present) / len(present), 1)
    else:
        overall = None

    return {
        "GRM Score": overall,
        "Roleplay (33%)": rounded["ROLEPLAY"],
        "Actions (33%)": rounded["ACTIONS"],
        "General (33%)": rounded["GENERAL"],
    }


def build_leaderboard(
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    parameter_range: tuple[float, float] | None = None,
) -> list[dict]:
    """Score every model in ``MODEL_SCORES`` and return ranked leaderboard rows.

    Args:
        include_closed: When false, models whose metadata does not mark them
            as ``open_weights`` are dropped.
        benchmark_ids: Optional whitelist of benchmark ids used for scoring.
            Only when this is ``None`` is a model's GRM Score replaced by its
            entry in ``CSV_GRM_SCORES`` (if it has one).
        parameter_range: Optional inclusive ``(minimum, maximum)`` bound on
            the model's ``parameter_b`` metadata.  Models whose
            ``parameter_b`` is not an int or float are never filtered out.

    Returns:
        One dict per remaining model holding the ``compute_grm_score`` result
        plus ``"Model"``, ``"Family"``, ``"Size"``, ``"Parameter B"``,
        ``"Open Weights"`` and a 1-based ``"Rank"``, ordered by descending
        GRM Score with unscored models sorted as ``-1``.
    """
    leaderboard = []
    for model_name, model_scores in MODEL_SCORES.items():
        metadata = MODEL_METADATA.get(model_name, {})
        is_open = metadata.get("open_weights", False)
        if not (include_closed or is_open):
            continue

        parameter_b = metadata.get("parameter_b")
        if parameter_range is not None and isinstance(parameter_b, (int, float)):
            lower, upper = parameter_range
            if parameter_b < lower or parameter_b > upper:
                continue

        row = compute_grm_score(model_scores, benchmark_ids)
        # The CSV-provided score only applies to the unfiltered benchmark set.
        if benchmark_ids is None and model_name in CSV_GRM_SCORES:
            row["GRM Score"] = CSV_GRM_SCORES[model_name]
        row.update(
            {
                "Model": model_name,
                "Family": metadata.get("family"),
                "Size": metadata.get("size"),
                "Parameter B": parameter_b,
                "Open Weights": is_open,
            }
        )
        leaderboard.append(row)

    def ranking_key(row: dict):
        # Rows without a GRM Score sort as -1.
        grm = row["GRM Score"]
        return -1 if grm is None else grm

    ranked = sorted(leaderboard, key=ranking_key, reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["Rank"] = position
    return ranked


def get_score(model_name: str, benchmark_id: str) -> ScoreValue:
    """Return the score stored for ``model_name`` on ``benchmark_id``.

    Returns ``None`` when either the model or the benchmark has no entry in
    ``MODEL_SCORES``.
    """
    per_model = MODEL_SCORES.get(model_name, {})
    return per_model.get(benchmark_id)


def official_benchmark_ids() -> set[str]:
    """Return the ids of all benchmarks flagged ``included_in_grm`` in ``BENCHMARKS``."""
    included = set()
    for entry in BENCHMARKS:
        if entry["included_in_grm"]:
            included.add(entry["id"])
    return included


def category_label(category: str) -> str:
    """Return the display name for ``category``.

    Falls back to the title-cased category key when ``CATEGORY_DISPLAY`` has
    no entry for it.
    """
    fallback = category.title()
    return CATEGORY_DISPLAY.get(category, fallback)