Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 5,583 Bytes
"""GRM Evaluation Suite scoring logic.
GRM Score = mean(Roleplay_score, Actions_score, General_score)
Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
"""
from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category
from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES
# A single benchmark score; None marks a score that is not available
# (lookups below treat None as "missing" and exclude it from averages).
ScoreValue = float | None
def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue:
return model_scores.get(benchmark["id"])
def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]:
    """Collect the benchmarks of ``category`` that are flagged ``included_in_grm``.

    When ``benchmark_ids`` is given, only benchmarks whose ``"id"`` is in that
    set are kept; ``None`` applies no id filter. Source order is preserved.
    """
    selected = []
    for entry in get_benchmarks_by_category(category):
        if not entry["included_in_grm"]:
            continue
        if benchmark_ids is not None and entry["id"] not in benchmark_ids:
            continue
        selected.append(entry)
    return selected
def _compute_category_score_raw(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the unrounded weighted mean of the category's available scores.

    Each benchmark contributes ``score * calc_weight``; benchmarks whose score
    is ``None`` are left out of both the numerator and the weight total.
    Returns ``None`` when the accumulated weight is zero.
    """
    weight_sum = 0.0
    weighted_sum = 0.0
    for entry in _category_benchmarks(category, benchmark_ids):
        value = _score_for(model_scores, entry)
        if value is None:
            # Missing scores must not dilute the average.
            continue
        weight = entry["calc_weight"]
        weight_sum += weight
        weighted_sum += value * weight
    if weight_sum == 0:
        return None
    return weighted_sum / weight_sum
def compute_category_score(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the weighted category average (0-100 scale) rounded to one decimal.

    Yields ``None`` when the category has no usable scores.
    """
    unrounded = _compute_category_score_raw(model_scores, category, benchmark_ids)
    if unrounded is None:
        return None
    return round(unrounded, 1)
def compute_category_components(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | int | None]:
    """Summarise a category: weighted score, per-weight averages and counts.

    The returned dict holds:
      * ``"score"`` - the rounded weighted category score,
      * ``"core_avg"`` - plain mean of available scores with ``calc_weight == 1.0``,
      * ``"supplementary_avg"`` - plain mean of available scores with ``calc_weight == 0.5``,
      * ``"missing"`` - number of selected benchmarks whose score is ``None``,
      * ``"benchmarks"`` - number of selected benchmarks.
    """
    entries = _category_benchmarks(category, benchmark_ids)

    def average_for(weight: float) -> float | None:
        # Unweighted mean over the benchmarks carrying exactly this weight.
        present = []
        for entry in entries:
            if entry["calc_weight"] != weight:
                continue
            value = _score_for(model_scores, entry)
            if value is not None:
                present.append(value)
        if present:
            return round(sum(present) / len(present), 1)
        return None

    missing_count = 0
    for entry in entries:
        if _score_for(model_scores, entry) is None:
            missing_count += 1

    return {
        "score": compute_category_score(model_scores, category, benchmark_ids),
        "core_avg": average_for(1.0),
        "supplementary_avg": average_for(0.5),
        "missing": missing_count,
        "benchmarks": len(entries),
    }
def compute_grm_score(
    model_scores: dict[str, ScoreValue],
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | None]:
    """Return the overall GRM Score together with the three category scores.

    The GRM Score is the mean of the unrounded category scores that are not
    ``None``, rounded to one decimal; it is ``None`` when no category has a
    score. Category scores are reported rounded to one decimal.
    """
    raw_by_category = {}
    for category in CATEGORIES:
        raw_by_category[category] = _compute_category_score_raw(model_scores, category, benchmark_ids)

    def rounded(category: str) -> float | None:
        value = raw_by_category[category]
        return None if value is None else round(value, 1)

    # Average the unrounded values so rounding happens only once.
    present = [value for value in raw_by_category.values() if value is not None]
    if present:
        overall = round(sum(present) / len(present), 1)
    else:
        overall = None

    return {
        "GRM Score": overall,
        "Roleplay (33%)": rounded("ROLEPLAY"),
        "Actions (33%)": rounded("ACTIONS"),
        "General (33%)": rounded("GENERAL"),
    }
def build_leaderboard(
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    parameter_range: tuple[float, float] | None = None,
) -> list[dict]:
    """Score every model in ``MODEL_SCORES`` and return ranked leaderboard rows.

    Filtering:
      * with ``include_closed=False`` models without ``open_weights`` metadata
        set are dropped;
      * with ``parameter_range=(low, high)`` models whose numeric
        ``parameter_b`` lies outside the inclusive range are dropped; models
        without a numeric ``parameter_b`` are kept.

    When no ``benchmark_ids`` filter is given, a value from ``CSV_GRM_SCORES``
    replaces the computed GRM Score for models listed there. Rows are ordered
    by GRM Score descending (``None`` sorts as -1) and numbered from 1 in
    ``"Rank"``.
    """

    def sort_key(row: dict) -> float:
        score = row["GRM Score"]
        return -1 if score is None else score

    entries = []
    for model_name, model_scores in MODEL_SCORES.items():
        metadata = MODEL_METADATA.get(model_name, {})
        is_open = metadata.get("open_weights", False)
        if not (include_closed or is_open):
            continue

        parameter_b = metadata.get("parameter_b")
        if parameter_range is not None and isinstance(parameter_b, int | float):
            lower, upper = parameter_range
            out_of_range = parameter_b < lower or parameter_b > upper
            if out_of_range:
                continue

        row = compute_grm_score(model_scores, benchmark_ids)
        if benchmark_ids is None and model_name in CSV_GRM_SCORES:
            row["GRM Score"] = CSV_GRM_SCORES[model_name]
        row.update(
            {
                "Model": model_name,
                "Family": metadata.get("family"),
                "Size": metadata.get("size"),
                "Parameter B": parameter_b,
                "Open Weights": is_open,
            }
        )
        entries.append(row)

    ranked = sorted(entries, key=sort_key, reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["Rank"] = position
    return ranked
def get_score(model_name: str, benchmark_id: str) -> ScoreValue:
    """Return one model's score for one benchmark; ``None`` if either is unknown."""
    scores_for_model = MODEL_SCORES.get(model_name, {})
    return scores_for_model.get(benchmark_id)
def official_benchmark_ids() -> set[str]:
    """Return the ids of all benchmarks flagged ``included_in_grm``."""
    ids: set[str] = set()
    for benchmark in BENCHMARKS:
        if benchmark["included_in_grm"]:
            ids.add(benchmark["id"])
    return ids
def category_label(category: str) -> str:
    """Return the display label for ``category``, falling back to its title-cased name."""
    fallback = category.title()
    return CATEGORY_DISPLAY.get(category, fallback)
|