Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
"""
GRM Evaluation Suite — Scoring Logic

GRM Score = mean(Roleplay_score, Actions_score, General_score), taken over
the categories that have a score; a category with no data is left out of
the mean.

Each category score is a weighted average of its benchmarks, counting only
the benchmarks for which the model has a score:

    category_score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
"""
| from benchmarks import BENCHMARKS, CATEGORIES, get_benchmarks_by_category | |
| from scores import MODEL_SCORES | |
def compute_category_score(
    model_scores: dict[str, float | None],
    category: str,
) -> float | None:
    """Return the weight-normalised mean score for one category.

    The benchmarks of ``category`` are obtained from
    ``get_benchmarks_by_category``; each one is read through its ``"name"``
    and ``"calc_weight"`` keys. A benchmark whose name is missing from
    ``model_scores``, or mapped to ``None``, contributes to neither the
    numerator nor the denominator.

    Returns ``None`` when the contributing weights add up to zero, which
    includes the case where no benchmark in the category has a score.
    """
    weight_sum = 0.0
    weighted_sum = 0.0
    for bench in get_benchmarks_by_category(category):
        value = model_scores.get(bench["name"])
        if value is None:
            # No result for this benchmark: leave it out of the average.
            continue
        weight = bench["calc_weight"]
        weight_sum += weight
        weighted_sum += value * weight

    # Guard the division: nothing was scored (or the weights cancel to zero).
    return weighted_sum / weight_sum if weight_sum != 0 else None
def compute_grm_score(model_scores: dict[str, float | None]) -> dict:
    """Build the GRM Score and the per-category scores of one model.

    Every category in ``CATEGORIES`` is scored with
    ``compute_category_score``; the result is multiplied by 100 and rounded
    to one decimal, or kept as ``None`` when the category has no data.

    The GRM Score is the plain mean of the (already rounded) category
    scores that are not ``None``, itself rounded to one decimal. It is
    ``None`` when no category has a score.
    """
    per_category = {}
    for name in CATEGORIES:
        fraction = compute_category_score(model_scores, name)
        if fraction is None:
            per_category[name] = None
        else:
            per_category[name] = round(fraction * 100, 1)

    # Only categories that actually have data take part in the overall mean.
    present = [pct for pct in per_category.values() if pct is not None]
    if present:
        overall = round(sum(present) / len(present), 1)
    else:
        overall = None

    summary = {"GRM Score": overall}
    summary["Roleplay (33%)"] = per_category["ROLEPLAY"]
    summary["Actions (33%)"] = per_category["ACTIONS"]
    summary["General (33%)"] = per_category["GENERAL"]
    return summary
def build_leaderboard() -> list[dict]:
    """Score every model in ``MODEL_SCORES`` and return the ranked rows.

    Each row is the dict produced by ``compute_grm_score`` extended with a
    ``"Model"`` entry and, after sorting, a 1-based ``"Rank"`` entry. Rows
    are ordered by ``"GRM Score"`` from highest to lowest; a row whose GRM
    Score is ``None`` is sorted as if its score were -1.
    """

    def _ranking_value(row):
        # Rows without a GRM Score sort as -1.
        grm = row["GRM Score"]
        return -1 if grm is None else grm

    unsorted_rows = [
        {**compute_grm_score(scores), "Model": name}
        for name, scores in MODEL_SCORES.items()
    ]
    ranked = sorted(unsorted_rows, key=_ranking_value, reverse=True)

    position = 0
    for row in ranked:
        position += 1
        row["Rank"] = position
    return ranked