"""GRM Evaluation Suite scoring logic. GRM Score = mean(Roleplay_score, Actions_score, General_score) Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i) """ from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES ScoreValue = float | None def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue: return model_scores.get(benchmark["id"]) def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]: benchmarks = [benchmark for benchmark in get_benchmarks_by_category(category) if benchmark["included_in_grm"]] if benchmark_ids is not None: benchmarks = [benchmark for benchmark in benchmarks if benchmark["id"] in benchmark_ids] return benchmarks def _compute_category_score_raw( model_scores: dict[str, ScoreValue], category: str, benchmark_ids: set[str] | None = None, ) -> float | None: benchmarks = _category_benchmarks(category, benchmark_ids) total_weight = 0.0 total_value = 0.0 for benchmark in benchmarks: score = _score_for(model_scores, benchmark) if score is not None: total_weight += benchmark["calc_weight"] total_value += score * benchmark["calc_weight"] if total_weight == 0: return None return total_value / total_weight def compute_category_score( model_scores: dict[str, ScoreValue], category: str, benchmark_ids: set[str] | None = None, ) -> float | None: """Return weighted category average on a 0-100 scale.""" raw = _compute_category_score_raw(model_scores, category, benchmark_ids) return round(raw, 1) if raw is not None else None def compute_category_components( model_scores: dict[str, ScoreValue], category: str, benchmark_ids: set[str] | None = None, ) -> dict[str, float | int | None]: """Return category score plus core/supplementary averages and missing counts.""" benchmarks = _category_benchmarks(category, benchmark_ids) def average_for(weight: float) -> float | None: scores = [_score_for(model_scores, benchmark) for benchmark in benchmarks if benchmark["calc_weight"] == weight] available = [score for score in scores if score is not None] if not available: return None return round(sum(available) / len(available), 1) missing = sum(1 for benchmark in benchmarks if _score_for(model_scores, benchmark) is None) return { "score": compute_category_score(model_scores, category, benchmark_ids), "core_avg": average_for(1.0), "supplementary_avg": average_for(0.5), "missing": missing, "benchmarks": len(benchmarks), } def compute_grm_score( model_scores: dict[str, ScoreValue], benchmark_ids: set[str] | None = None, ) -> dict[str, float | None]: """Return GRM Score and per-category scores on a 0-100 scale.""" raw_category_scores = { category: _compute_category_score_raw(model_scores, category, benchmark_ids) for category in CATEGORIES } category_scores = { category: round(score, 1) if score is not None else None for category, score in raw_category_scores.items() } available = [score for score in raw_category_scores.values() if score is not None] grm = round(sum(available) / len(available), 1) if available else None return { "GRM Score": grm, "Roleplay (33%)": category_scores["ROLEPLAY"], "Actions (33%)": category_scores["ACTIONS"], "General (33%)": category_scores["GENERAL"], } def build_leaderboard( include_closed: bool = True, benchmark_ids: set[str] | None = None, parameter_range: tuple[float, float] | None = None, ) -> list[dict]: """Compute scores for all models and return sorted leaderboard rows.""" rows = [] for model_name, model_scores in MODEL_SCORES.items(): metadata = MODEL_METADATA.get(model_name, {}) if not include_closed and not metadata.get("open_weights", False): continue parameter_b = metadata.get("parameter_b") if parameter_range is not None and isinstance(parameter_b, int | float): minimum, maximum = parameter_range if parameter_b < minimum or parameter_b > maximum: continue result = compute_grm_score(model_scores, benchmark_ids) if benchmark_ids is None and model_name in CSV_GRM_SCORES: result["GRM Score"] = CSV_GRM_SCORES[model_name] result["Model"] = model_name result["Family"] = metadata.get("family") result["Size"] = metadata.get("size") result["Parameter B"] = parameter_b result["Open Weights"] = metadata.get("open_weights", False) rows.append(result) rows.sort(key=lambda row: row["GRM Score"] if row["GRM Score"] is not None else -1, reverse=True) for index, row in enumerate(rows, start=1): row["Rank"] = index return rows def get_score(model_name: str, benchmark_id: str) -> ScoreValue: return MODEL_SCORES.get(model_name, {}).get(benchmark_id) def official_benchmark_ids() -> set[str]: return {benchmark["id"] for benchmark in BENCHMARKS if benchmark["included_in_grm"]} def category_label(category: str) -> str: return CATEGORY_DISPLAY.get(category, category.title())