Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """GRM Evaluation Suite scoring logic. | |
| GRM Score = mean(Roleplay_score, Actions_score, General_score) | |
| Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i) | |
| """ | |
| from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category | |
| from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES | |
# A single benchmark score; None marks a benchmark with no recorded result for the model.
ScoreValue = float | None
| def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue: | |
| return model_scores.get(benchmark["id"]) | |
def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]:
    """List the benchmarks of a category that count toward the GRM score.

    Args:
        category: Category key passed to ``get_benchmarks_by_category``.
        benchmark_ids: Optional whitelist of benchmark ids. ``None`` disables
            the whitelist; an empty set filters everything out.

    Returns:
        Benchmark records whose ``"included_in_grm"`` entry is truthy and,
        when a whitelist is given, whose ``"id"`` is in it. Source order is
        preserved.
    """
    selected = []
    for entry in get_benchmarks_by_category(category):
        if not entry["included_in_grm"]:
            continue
        if benchmark_ids is not None and entry["id"] not in benchmark_ids:
            continue
        selected.append(entry)
    return selected
def _compute_category_score_raw(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Compute the unrounded weighted average score for one category.

    Each benchmark with a score contributes ``score * calc_weight``; the
    total is divided by the sum of the contributing weights. Benchmarks
    without a score are left out of both sums.

    Args:
        model_scores: Mapping from benchmark id to the model's score.
        category: Category key to evaluate.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        The weighted mean, or ``None`` when the contributing weights sum to
        zero (for example, when no benchmark has a score).
    """
    weight_sum = 0.0
    weighted_sum = 0.0
    for entry in _category_benchmarks(category, benchmark_ids):
        value = _score_for(model_scores, entry)
        if value is None:
            continue
        weight = entry["calc_weight"]
        weight_sum += weight
        weighted_sum += value * weight
    # A zero weight total means there is nothing to average over.
    return weighted_sum / weight_sum if weight_sum != 0 else None
def compute_category_score(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the weighted category average rounded to one decimal place.

    Args:
        model_scores: Mapping from benchmark id to the model's score.
        category: Category key to evaluate.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        The rounded weighted average, or ``None`` when the category has no
        usable scores.
    """
    unrounded = _compute_category_score_raw(model_scores, category, benchmark_ids)
    if unrounded is None:
        return None
    return round(unrounded, 1)
def compute_category_components(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | int | None]:
    """Break a category down into its score, sub-averages and coverage counts.

    Args:
        model_scores: Mapping from benchmark id to the model's score.
        category: Category key to evaluate.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        A dict with these keys:
        ``"score"``: rounded weighted category score (or ``None``);
        ``"core_avg"``: plain mean of available scores among benchmarks
        whose ``calc_weight`` equals 1.0;
        ``"supplementary_avg"``: the same for ``calc_weight`` equal to 0.5;
        ``"missing"``: number of selected benchmarks without a score;
        ``"benchmarks"``: number of selected benchmarks.
    """
    entries = _category_benchmarks(category, benchmark_ids)

    def average_for(weight: float) -> float | None:
        # Unweighted mean over the benchmarks carrying exactly this weight.
        present = []
        for entry in entries:
            if entry["calc_weight"] != weight:
                continue
            value = _score_for(model_scores, entry)
            if value is not None:
                present.append(value)
        if present:
            return round(sum(present) / len(present), 1)
        return None

    missing_count = 0
    for entry in entries:
        if _score_for(model_scores, entry) is None:
            missing_count += 1

    overall = compute_category_score(model_scores, category, benchmark_ids)
    core_average = average_for(1.0)
    supplementary_average = average_for(0.5)
    return {
        "score": overall,
        "core_avg": core_average,
        "supplementary_avg": supplementary_average,
        "missing": missing_count,
        "benchmarks": len(entries),
    }
def compute_grm_score(
    model_scores: dict[str, ScoreValue],
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | None]:
    """Compute the overall GRM score together with the per-category scores.

    The GRM score is the plain mean of the *unrounded* category scores that
    are available, rounded to one decimal only at the end. Categories with
    no score are left out of the mean.

    Args:
        model_scores: Mapping from benchmark id to the model's score.
        benchmark_ids: Optional whitelist of benchmark ids to consider.

    Returns:
        A dict with ``"GRM Score"`` plus the rounded ROLEPLAY, ACTIONS and
        GENERAL category scores under their display labels. Any value may
        be ``None`` when no score could be computed.

    Raises:
        KeyError: If ``CATEGORIES`` lacks ROLEPLAY, ACTIONS or GENERAL.
    """
    raw_by_category = {}
    rounded_by_category = {}
    for name in CATEGORIES:
        raw = _compute_category_score_raw(model_scores, name, benchmark_ids)
        raw_by_category[name] = raw
        rounded_by_category[name] = None if raw is None else round(raw, 1)

    # Average the unrounded values so rounding error is not compounded.
    present = [raw for raw in raw_by_category.values() if raw is not None]
    if present:
        overall = round(sum(present) / len(present), 1)
    else:
        overall = None

    return {
        "GRM Score": overall,
        "Roleplay (33%)": rounded_by_category["ROLEPLAY"],
        "Actions (33%)": rounded_by_category["ACTIONS"],
        "General (33%)": rounded_by_category["GENERAL"],
    }
def build_leaderboard(
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    parameter_range: tuple[float, float] | None = None,
) -> list[dict]:
    """Score every model in ``MODEL_SCORES`` and return ranked leaderboard rows.

    Args:
        include_closed: When false, models whose metadata does not mark them
            as ``open_weights`` are dropped.
        benchmark_ids: Optional whitelist of benchmark ids used for scoring.
        parameter_range: Optional inclusive ``(minimum, maximum)`` bound on
            the model's ``parameter_b`` metadata. Models whose
            ``parameter_b`` is missing or non-numeric are not filtered.

    Returns:
        Row dicts sorted by ``"GRM Score"`` in descending order, with rows
        lacking a score ordered as if they scored -1. Each row holds the
        scores from ``compute_grm_score`` plus ``"Model"``, ``"Family"``,
        ``"Size"``, ``"Parameter B"``, ``"Open Weights"`` and a 1-based
        ``"Rank"``.
    """

    def _sort_key(row: dict) -> float:
        # Rows without a score use -1 as their sort value.
        score = row["GRM Score"]
        return -1 if score is None else score

    leaderboard = []
    for model_name, model_scores in MODEL_SCORES.items():
        info = MODEL_METADATA.get(model_name, {})
        is_open = info.get("open_weights", False)
        if not (include_closed or is_open):
            continue

        size_b = info.get("parameter_b")
        if parameter_range is not None and isinstance(size_b, (int, float)):
            low, high = parameter_range
            if size_b < low or size_b > high:
                continue

        row = compute_grm_score(model_scores, benchmark_ids)
        # With no benchmark whitelist, a score listed in CSV_GRM_SCORES
        # replaces the computed GRM score.
        if benchmark_ids is None and model_name in CSV_GRM_SCORES:
            row["GRM Score"] = CSV_GRM_SCORES[model_name]
        row.update(
            {
                "Model": model_name,
                "Family": info.get("family"),
                "Size": info.get("size"),
                "Parameter B": size_b,
                "Open Weights": is_open,
            }
        )
        leaderboard.append(row)

    leaderboard.sort(key=_sort_key, reverse=True)
    for position, row in enumerate(leaderboard, start=1):
        row["Rank"] = position
    return leaderboard
def get_score(model_name: str, benchmark_id: str) -> ScoreValue:
    """Fetch one model's score for one benchmark from ``MODEL_SCORES``.

    Returns:
        The stored score, or ``None`` when either the model or the benchmark
        id has no entry.
    """
    per_model = MODEL_SCORES.get(model_name, {})
    return per_model.get(benchmark_id)
def official_benchmark_ids() -> set[str]:
    """Collect the ids of all benchmarks flagged ``included_in_grm``."""
    official = set()
    for entry in BENCHMARKS:
        if entry["included_in_grm"]:
            official.add(entry["id"])
    return official
def category_label(category: str) -> str:
    """Return the display label for a category key.

    Falls back to the title-cased key when ``CATEGORY_DISPLAY`` has no entry
    for it.
    """
    fallback = category.title()
    return CATEGORY_DISPLAY.get(category, fallback)