GRM / scoring.py
mbagdasarova-nvidia's picture
Upload 9 files
5c49242 verified
"""GRM Evaluation Suite scoring logic.
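GRM Score = mean(Roleplay_score, Actions_score, General_score), averaged over
the categories that have a score; a category with no scored benchmarks is skipped.
Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i), taken over
the category's GRM benchmarks for which the model has a score.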
"""
from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category
from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES
# Score of one model on one benchmark; None when no score is recorded for it.
ScoreValue = float | None
def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue:
    """Look up the score stored under the benchmark's ``"id"``.

    Returns ``None`` when ``model_scores`` has no entry for that id.
    """
    benchmark_id = benchmark["id"]
    return model_scores.get(benchmark_id)
def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]:
    """Return the category's benchmarks that count towards the GRM score.

    Only benchmarks whose ``"included_in_grm"`` flag is truthy are kept. When
    ``benchmark_ids`` is given, the result is further restricted to benchmarks
    whose ``"id"`` is in that set; ``None`` means no id restriction.
    """
    selected: list[dict] = []
    for entry in get_benchmarks_by_category(category):
        if not entry["included_in_grm"]:
            continue
        if benchmark_ids is not None and entry["id"] not in benchmark_ids:
            continue
        selected.append(entry)
    return selected
def _compute_category_score_raw(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the unrounded weighted average score for ``category``.

    Each benchmark selected by ``_category_benchmarks`` contributes its score
    multiplied by its ``"calc_weight"``. Benchmarks for which the model has no
    score are left out of both the numerator and the denominator. Returns
    ``None`` when the accumulated weight is zero (e.g. nothing was scored).
    """
    weight_sum = 0.0
    weighted_sum = 0.0
    for entry in _category_benchmarks(category, benchmark_ids):
        value = _score_for(model_scores, entry)
        if value is None:
            continue
        weight = entry["calc_weight"]
        weight_sum += weight
        weighted_sum += value * weight
    return weighted_sum / weight_sum if weight_sum != 0 else None
def compute_category_score(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the weighted category average rounded to one decimal place.

    Returns ``None`` when the raw category score cannot be computed.
    """
    unrounded = _compute_category_score_raw(model_scores, category, benchmark_ids)
    if unrounded is None:
        return None
    return round(unrounded, 1)
def compute_category_components(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | int | None]:
    """Break a category down into its score, per-weight averages and counts.

    The returned dict has these keys:
      * ``"score"`` - the rounded weighted category score;
      * ``"core_avg"`` - plain mean of available scores with ``calc_weight == 1.0``;
      * ``"supplementary_avg"`` - plain mean of available scores with
        ``calc_weight == 0.5``;
      * ``"missing"`` - number of selected benchmarks without a score;
      * ``"benchmarks"`` - number of selected benchmarks.
    Averages are rounded to one decimal place and are ``None`` when no score
    is available for that weight.
    """
    entries = _category_benchmarks(category, benchmark_ids)
    # Pair every benchmark's weight with the model's (possibly missing) score once.
    weighted_scores = [(entry["calc_weight"], _score_for(model_scores, entry)) for entry in entries]

    def mean_at_weight(target: float) -> float | None:
        values = [value for weight, value in weighted_scores if weight == target and value is not None]
        return round(sum(values) / len(values), 1) if values else None

    missing_count = sum(value is None for _, value in weighted_scores)
    return {
        "score": compute_category_score(model_scores, category, benchmark_ids),
        "core_avg": mean_at_weight(1.0),
        "supplementary_avg": mean_at_weight(0.5),
        "missing": missing_count,
        "benchmarks": len(entries),
    }
def compute_grm_score(
    model_scores: dict[str, ScoreValue],
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | None]:
    """Return the GRM Score together with the three per-category scores.

    The GRM Score is the mean of the unrounded category scores that are
    available, rounded to one decimal place; it is ``None`` when no category
    has a score. Category scores are each rounded to one decimal place.
    """
    raw_by_category: dict[str, float | None] = {}
    for category in CATEGORIES:
        raw_by_category[category] = _compute_category_score_raw(model_scores, category, benchmark_ids)

    def rounded(category: str) -> float | None:
        value = raw_by_category[category]
        return None if value is None else round(value, 1)

    # The overall score averages the unrounded values to avoid compounding rounding.
    present = [value for value in raw_by_category.values() if value is not None]
    overall = round(sum(present) / len(present), 1) if present else None
    return {
        "GRM Score": overall,
        "Roleplay (33%)": rounded("ROLEPLAY"),
        "Actions (33%)": rounded("ACTIONS"),
        "General (33%)": rounded("GENERAL"),
    }
def build_leaderboard(
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    parameter_range: tuple[float, float] | None = None,
) -> list[dict]:
    """Score every model and return leaderboard rows, best GRM Score first.

    Args:
        include_closed: When false, models whose metadata does not mark them as
            open-weights are dropped.
        benchmark_ids: Optional restriction of the benchmarks used for scoring.
            When ``None``, a model's GRM Score is replaced by its entry in
            ``CSV_GRM_SCORES`` if one exists.
        parameter_range: Optional inclusive ``(minimum, maximum)`` bound on the
            ``"parameter_b"`` metadata value. Models whose ``"parameter_b"`` is
            not an int or float are not filtered by this bound.

    Returns:
        A list of row dicts sorted by ``"GRM Score"`` in descending order (rows
        without a score sort as ``-1``), each carrying a 1-based ``"Rank"``.
    """

    def sort_value(row: dict) -> float:
        score = row["GRM Score"]
        return -1 if score is None else score

    collected = []
    for model_name, model_scores in MODEL_SCORES.items():
        metadata = MODEL_METADATA.get(model_name, {})
        open_weights = metadata.get("open_weights", False)
        if not (include_closed or open_weights):
            continue

        parameter_b = metadata.get("parameter_b")
        if parameter_range is not None and isinstance(parameter_b, (int, float)):
            lower, upper = parameter_range
            outside = parameter_b < lower or parameter_b > upper
            if outside:
                continue

        row = compute_grm_score(model_scores, benchmark_ids)
        # The precomputed CSV value only applies to the unrestricted benchmark set.
        if benchmark_ids is None and model_name in CSV_GRM_SCORES:
            row["GRM Score"] = CSV_GRM_SCORES[model_name]
        row.update(
            {
                "Model": model_name,
                "Family": metadata.get("family"),
                "Size": metadata.get("size"),
                "Parameter B": parameter_b,
                "Open Weights": open_weights,
            }
        )
        collected.append(row)

    ranked = sorted(collected, key=sort_value, reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["Rank"] = position
    return ranked
def get_score(model_name: str, benchmark_id: str) -> ScoreValue:
    """Return the stored score of ``model_name`` on ``benchmark_id``.

    Returns ``None`` when the model is unknown or has no entry for the benchmark.
    """
    per_model = MODEL_SCORES.get(model_name, {})
    return per_model.get(benchmark_id)
def official_benchmark_ids() -> set[str]:
    """Return the ids of all benchmarks whose ``"included_in_grm"`` flag is truthy."""
    included: set[str] = set()
    for entry in BENCHMARKS:
        if entry["included_in_grm"]:
            included.add(entry["id"])
    return included
def category_label(category: str) -> str:
    """Return the display label for ``category``.

    Uses the entry in ``CATEGORY_DISPLAY`` when present, otherwise the
    title-cased category name.
    """
    fallback = category.title()
    return CATEGORY_DISPLAY.get(category, fallback)