""" GRM Evaluation Suite — Scoring Logic GRM Score = mean(Roleplay_score, Actions_score, General_score) Each category score is a weighted average of its benchmarks: category_score = sum(score_i * calc_weight_i) / sum(calc_weight_i) """ from benchmarks import BENCHMARKS, CATEGORIES, get_benchmarks_by_category from scores import MODEL_SCORES def compute_category_score( model_scores: dict[str, float | None], category: str, ) -> float | None: """Weighted average of available benchmark scores in a category.""" benchmarks = get_benchmarks_by_category(category) total_weight = 0.0 total_value = 0.0 for b in benchmarks: score = model_scores.get(b["name"]) if score is not None: total_weight += b["calc_weight"] total_value += score * b["calc_weight"] if total_weight == 0: return None return total_value / total_weight def compute_grm_score(model_scores: dict[str, float | None]) -> dict: """Return GRM Score and per-category scores for a single model. All returned values are on a 0–100 scale, or None if no data. """ cat_scores = {} for cat in CATEGORIES: raw = compute_category_score(model_scores, cat) cat_scores[cat] = round(raw * 100, 1) if raw is not None else None available = [v for v in cat_scores.values() if v is not None] grm = round(sum(available) / len(available), 1) if available else None return { "GRM Score": grm, "Roleplay (33%)": cat_scores["ROLEPLAY"], "Actions (33%)": cat_scores["ACTIONS"], "General (33%)": cat_scores["GENERAL"], } def build_leaderboard() -> list[dict]: """Compute scores for all models and return sorted rows.""" rows = [] for model_name, model_scores in MODEL_SCORES.items(): result = compute_grm_score(model_scores) result["Model"] = model_name rows.append(result) rows.sort(key=lambda r: r["GRM Score"] if r["GRM Score"] is not None else -1, reverse=True) for i, row in enumerate(rows, start=1): row["Rank"] = i return rows