GRM / scoring.py
mbagdasarova-nvidia's picture
Add GRM evaluation suite with scoring logic and benchmark registry
2a44234
raw
history blame
2.11 kB
"""
GRM Evaluation Suite — Scoring Logic

GRM Score = mean(Roleplay_score, Actions_score, General_score),
taken over the categories that have at least one scored benchmark.

Each category score is a weighted average of its scored benchmarks
(benchmarks without a score are skipped):

    category_score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
"""
from benchmarks import BENCHMARKS, CATEGORIES, get_benchmarks_by_category
from scores import MODEL_SCORES
def compute_category_score(
    model_scores: dict[str, float | None],
    category: str,
) -> float | None:
    """Compute one category's weighted mean over the benchmarks that have a score.

    Args:
        model_scores: Mapping from benchmark name to the model's score. A
            missing key or a ``None`` value means "no result", and that
            benchmark is left out of the average.
        category: Category identifier forwarded to
            ``get_benchmarks_by_category``.

    Returns:
        ``sum(score * calc_weight) / sum(calc_weight)`` over the scored
        benchmarks, or ``None`` when the accumulated weight is zero (for
        example, when no benchmark in the category has a score).
    """
    weight_sum = 0.0
    weighted_sum = 0.0
    for bench in get_benchmarks_by_category(category):
        value = model_scores.get(bench["name"])
        if value is None:
            # Unscored benchmarks contribute neither value nor weight.
            continue
        weight = bench["calc_weight"]
        weight_sum += weight
        weighted_sum += value * weight
    # A zero denominator means there is nothing to average.
    return weighted_sum / weight_sum if weight_sum != 0 else None
def compute_grm_score(model_scores: dict[str, float | None]) -> dict:
    """Build the GRM Score and per-category scores for a single model.

    Each category in ``CATEGORIES`` is scored with
    ``compute_category_score``, multiplied by 100 and rounded to one
    decimal place. The GRM Score is the plain mean of the rounded category
    values that are not ``None``, itself rounded to one decimal place.

    Args:
        model_scores: Mapping from benchmark name to the model's score
            (``None`` or a missing key means no result).

    Returns:
        A dict with the keys ``"GRM Score"``, ``"Roleplay (33%)"``,
        ``"Actions (33%)"`` and ``"General (33%)"``. Any value is ``None``
        when there is no data for it.

    Raises:
        KeyError: If ``CATEGORIES`` does not contain ``"ROLEPLAY"``,
            ``"ACTIONS"`` and ``"GENERAL"``.
    """

    def _to_percent(fraction):
        # Scale a category score by 100 for display; keep "no data" as None.
        return None if fraction is None else round(fraction * 100, 1)

    per_category = {
        name: _to_percent(compute_category_score(model_scores, name))
        for name in CATEGORIES
    }

    # Only categories that produced a score take part in the overall mean.
    present = [pct for pct in per_category.values() if pct is not None]
    overall = None
    if present:
        overall = round(sum(present) / len(present), 1)

    return {
        "GRM Score": overall,
        "Roleplay (33%)": per_category["ROLEPLAY"],
        "Actions (33%)": per_category["ACTIONS"],
        "General (33%)": per_category["GENERAL"],
    }
def build_leaderboard() -> list[dict]:
    """Score every model in ``MODEL_SCORES`` and return ranked leaderboard rows.

    Returns:
        One dict per model: the output of ``compute_grm_score`` plus a
        ``"Model"`` name and a 1-based ``"Rank"``. Rows are ordered by
        ``"GRM Score"`` from highest to lowest. A model with no GRM Score is
        sorted as if it scored -1. The sort is stable, so tied models keep
        their ``MODEL_SCORES`` order and still get consecutive ranks.
    """

    def _sort_key(entry):
        score = entry["GRM Score"]
        # Models without any score sort as -1.
        return -1 if score is None else score

    entries = [
        compute_grm_score(scores) | {"Model": name}
        for name, scores in MODEL_SCORES.items()
    ]
    ranked = sorted(entries, key=_sort_key, reverse=True)
    for position, entry in enumerate(ranked, start=1):
        entry["Rank"] = position
    return ranked