"""Table view helpers for the Streamlit GRM leaderboard."""

from collections.abc import Iterable

import pandas as pd

from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, GRM_BENCH_DIMENSIONS
from scores import MODEL_METADATA, MODEL_SCORES
from scoring import build_leaderboard, compute_category_components, compute_grm_score, get_score

# Names of the views understood by ``build_score_explorer_frame``.
VIEW_SUMMARY = "Summary"
VIEW_CATEGORY = "Category"
VIEW_MATRIX = "Benchmark matrix"


def format_score(value: float | None) -> str:
    """Format a score with one decimal place, or ``"TBD"`` when it is missing.

    Both ``None`` and NaN count as missing, matching how ``_with_tbd`` renders
    missing cells in the table frames.
    """
    if value is None or pd.isna(value):
        return "TBD"
    return f"{value:.1f}"


def _with_tbd(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a string-typed copy of *frame* with missing cells shown as ``"TBD"``."""
    # Cast to object first so the "TBD" placeholder can live in numeric columns.
    return frame.astype("object").where(pd.notna(frame), "TBD").astype(str)


def category_options() -> list[str]:
    """Return the category display labels, preceded by an ``"All"`` entry."""
    return ["All", *[CATEGORY_DISPLAY[category] for category in CATEGORIES]]


def category_from_label(label: str) -> str | None:
    """Map a display label back to its category key.

    Returns ``None`` when *label* matches no entry of ``CATEGORY_DISPLAY``.
    """
    for category, display in CATEGORY_DISPLAY.items():
        if label == display:
            return category
    return None


def available_domains() -> list[str]:
    """Return the sorted, de-duplicated ``domain`` values across ``BENCHMARKS``."""
    return sorted({benchmark["domain"] for benchmark in BENCHMARKS})


def available_priorities() -> list[str]:
    """Return the sorted, de-duplicated ``priority`` values across ``BENCHMARKS``."""
    return sorted({benchmark["priority"] for benchmark in BENCHMARKS})


def available_sources() -> list[str]:
    """Return the sorted, de-duplicated ``source`` values across ``BENCHMARKS``."""
    return sorted({benchmark["source"] for benchmark in BENCHMARKS})


def parameter_bounds() -> tuple[float, float]:
    """Return ``(0.0, upper)`` bounds for the model parameter-count filter.

    ``upper`` is the largest numeric ``parameter_b`` found in
    ``MODEL_METADATA``, or ``120.0`` when no model declares one.
    """
    values = [
        metadata["parameter_b"]
        for metadata in MODEL_METADATA.values()
        if isinstance(metadata.get("parameter_b"), int | float)
    ]
    # Cast so both bounds are floats, as the annotation promises, even when
    # the metadata stores integer parameter counts.
    upper = float(max(values)) if values else 120.0
    return (0.0, upper)


def _matches_search(benchmark: dict, search: str) -> bool:
    """Return True when *search* occurs, case-insensitively, in the benchmark text.

    The searched text is built from the ``name``, ``description``, ``summary``,
    ``domain``, ``source`` and ``priority`` fields; an empty *search* matches
    every benchmark.
    """
    if not search:
        return True
    target = " ".join(
        str(benchmark.get(key, ""))
        for key in ["name", "description", "summary", "domain", "source", "priority"]
    ).lower()
    return search.lower() in target


def filter_benchmarks(
    category: str | None = None,
    priorities: Iterable[str] | None = None,
    sources: Iterable[str] | None = None,
    domains: Iterable[str] | None = None,
    search: str = "",
    include_non_scored: bool = False,
) -> list[dict]:
    """Return the entries of ``BENCHMARKS`` that pass every active filter.

    Args:
        category: Category key to keep; a falsy value disables the filter.
        priorities: Allowed ``priority`` values; empty or ``None`` disables it.
        sources: Allowed ``source`` values; empty or ``None`` disables it.
        domains: Allowed ``domain`` values; empty or ``None`` disables it.
        search: Free-text filter, see ``_matches_search``.
        include_non_scored: When false, benchmarks whose ``included_in_grm``
            flag is falsy are dropped.

    Returns:
        The matching benchmark dicts, in ``BENCHMARKS`` order.
    """
    # Materialize once so membership tests are O(1) and one-shot iterables
    # are not consumed repeatedly.
    priority_set = set(priorities or [])
    source_set = set(sources or [])
    domain_set = set(domains or [])

    benchmarks = []
    for benchmark in BENCHMARKS:
        if category and benchmark["category"] != category:
            continue
        if priority_set and benchmark["priority"] not in priority_set:
            continue
        if source_set and benchmark["source"] not in source_set:
            continue
        if domain_set and benchmark["domain"] not in domain_set:
            continue
        if not include_non_scored and not benchmark["included_in_grm"]:
            continue
        if not _matches_search(benchmark, search):
            continue
        benchmarks.append(benchmark)
    return benchmarks


def _format_leaderboard_rows(rows: list[dict], use_filtered_label: bool = False) -> pd.DataFrame:
    """Project leaderboard rows onto the summary-table columns.

    The score column is titled ``"Filtered GRM Score"`` when
    *use_filtered_label* is true and ``"GRM Score"`` otherwise.
    """
    label = "Filtered GRM Score" if use_filtered_label else "GRM Score"
    records = [
        {
            "Rank": row["Rank"],
            "Model": row["Model"],
            label: row["GRM Score"],
            "Roleplay": row["Roleplay (33%)"],
            "Actions": row["Actions (33%)"],
            "General": row["General (33%)"],
            "Family": row.get("Family"),
            "Size": row.get("Size"),
        }
        for row in rows
    ]
    return pd.DataFrame.from_records(records)


def build_summary_frame(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the display-ready summary leaderboard table.

    Both arguments are forwarded unchanged to ``build_leaderboard``.
    """
    rows = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    return _with_tbd(_format_leaderboard_rows(rows))


def build_category_frame(
    category: str,
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    filtered_score: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the per-category breakdown table for every leaderboard model.

    Args:
        category: Category key; must be a key of ``CATEGORY_DISPLAY``.
        include_closed: Forwarded to ``build_leaderboard``.
        benchmark_ids: Always forwarded to ``compute_category_components``;
            forwarded to ``build_leaderboard`` only when *filtered_score* is
            true.
        filtered_score: Selects the filtered leaderboard and the
            ``"Filtered GRM Score"`` column title.
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        A string-typed frame with missing values rendered as ``"TBD"``.
    """
    rows = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=benchmark_ids if filtered_score else None,
        parameter_range=parameter_range,
    )
    category_name = CATEGORY_DISPLAY[category]
    score_label = "Filtered GRM Score" if filtered_score else "GRM Score"

    records = []
    for row in rows:
        # Tolerate a leaderboard model without a score entry, matching the
        # ``MODEL_SCORES.get(..., {})`` lookups used by the other helpers.
        model_scores = MODEL_SCORES.get(row["Model"], {})
        components = compute_category_components(model_scores, category, benchmark_ids)
        records.append(
            {
                "Rank": row["Rank"],
                "Model": row["Model"],
                score_label: row["GRM Score"],
                f"{category_name} Score": components["score"],
                "Core Avg": components["core_avg"],
                "Supplementary Avg": components["supplementary_avg"],
                "Missing": f"{components['missing']} / {components['benchmarks']}",
                "Family": row.get("Family"),
                "Size": row.get("Size"),
            }
        )
    return _with_tbd(pd.DataFrame.from_records(records))


def build_benchmark_matrix_frame(
    benchmarks: list[dict],
    include_closed: bool = True,
    recalculate_visible: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the model-by-benchmark score matrix.

    Each row carries the leaderboard columns followed by one column per entry
    of *benchmarks*, titled with the benchmark ``name``. When
    *recalculate_visible* is true the ids of *benchmarks* are passed to
    ``build_leaderboard`` and the score column is titled
    ``"Filtered GRM Score"``.
    """
    benchmark_ids = {benchmark["id"] for benchmark in benchmarks}
    rows = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=benchmark_ids if recalculate_visible else None,
        parameter_range=parameter_range,
    )
    score_label = "Filtered GRM Score" if recalculate_visible else "GRM Score"

    records = []
    for row in rows:
        record = {
            "Rank": row["Rank"],
            "Model": row["Model"],
            score_label: row["GRM Score"],
            "Roleplay": row["Roleplay (33%)"],
            "Actions": row["Actions (33%)"],
            "General": row["General (33%)"],
        }
        for benchmark in benchmarks:
            record[benchmark["name"]] = get_score(row["Model"], benchmark["id"])
        records.append(record)
    return _with_tbd(pd.DataFrame.from_records(records))


def build_score_explorer_frame(
    view: str,
    category_label: str,
    benchmarks: list[dict],
    include_closed: bool,
    recalculate_visible: bool,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Dispatch to the table builder for the selected score-explorer view.

    Args:
        view: ``VIEW_SUMMARY`` or ``VIEW_CATEGORY``; any other value selects
            the benchmark matrix.
        category_label: Display label of the category, used by the category
            view. A label matching no display name falls back to
            ``"ROLEPLAY"``.
        benchmarks: Currently visible benchmark dicts.
        include_closed: Forwarded to the underlying builders.
        recalculate_visible: When true, scores are recomputed from the
            visible benchmarks only.
        parameter_range: Forwarded to the underlying builders.
    """
    selected_category = category_from_label(category_label)
    benchmark_ids = {benchmark["id"] for benchmark in benchmarks}

    if view == VIEW_SUMMARY:
        # With no visible benchmarks there is nothing to recalculate from, so
        # fall through to the unfiltered summary.
        if recalculate_visible and benchmark_ids:
            rows = build_leaderboard(
                include_closed=include_closed,
                benchmark_ids=benchmark_ids,
                parameter_range=parameter_range,
            )
            return _with_tbd(_format_leaderboard_rows(rows, use_filtered_label=True))
        return build_summary_frame(include_closed=include_closed, parameter_range=parameter_range)

    if view == VIEW_CATEGORY:
        category = selected_category or "ROLEPLAY"
        return build_category_frame(
            category,
            include_closed=include_closed,
            benchmark_ids=benchmark_ids,
            filtered_score=recalculate_visible,
            parameter_range=parameter_range,
        )

    return build_benchmark_matrix_frame(
        benchmarks,
        include_closed=include_closed,
        recalculate_visible=recalculate_visible,
        parameter_range=parameter_range,
    )


def build_benchmark_registry_frame(benchmarks: list[dict]) -> pd.DataFrame:
    """Build the benchmark registry table, one row per entry of *benchmarks*."""
    records = [
        {
            "Benchmark": benchmark["name"],
            "Category": CATEGORY_DISPLAY[benchmark["category"]],
            "Domain": benchmark["domain"],
            "Source": benchmark["source"],
            "Weight": benchmark["calc_weight"],
            "Included in GRM": "Yes" if benchmark["included_in_grm"] else "No",
            "Description": benchmark["description"],
            "Summary": benchmark["summary"],
            "Paper / Repo": benchmark.get("paper") or "",
        }
        for benchmark in benchmarks
    ]
    return _with_tbd(pd.DataFrame.from_records(records))


def build_grm_dimensions_frame(show_non_scored: bool = True) -> pd.DataFrame:
    """Build the table of ``GRM_BENCH_DIMENSIONS`` entries.

    When *show_non_scored* is false, only dimensions whose ``included_in_grm``
    flag is truthy are listed.
    """
    # A list (rather than a generator) is the documented sequence input for
    # ``DataFrame.from_records``.
    records = [
        {
            "Dimension": dimension["dimension"],
            "Phase": dimension["phase"],
            "Included in GRM": "Yes" if dimension["included_in_grm"] else "No",
            "Notes": dimension["notes"],
        }
        for dimension in GRM_BENCH_DIMENSIONS
        if show_non_scored or dimension["included_in_grm"]
    ]
    return _with_tbd(pd.DataFrame.from_records(records))


def build_model_detail_frame(model_name: str) -> pd.DataFrame:
    """Build the per-category score breakdown for a single model.

    A model without an entry in ``MODEL_SCORES`` is treated as having no
    scores.
    """
    model_scores = MODEL_SCORES.get(model_name, {})
    records = []
    for category in CATEGORIES:
        components = compute_category_components(model_scores, category)
        records.append(
            {
                "Category": CATEGORY_DISPLAY[category],
                "Score": components["score"],
                "Core Avg": components["core_avg"],
                "Supplementary Avg": components["supplementary_avg"],
                "Missing": f"{components['missing']} / {components['benchmarks']}",
            }
        )
    return _with_tbd(pd.DataFrame.from_records(records))


def build_model_benchmark_scores(model_name: str, limit: int = 6, strongest: bool = True) -> pd.DataFrame:
    """Build a table of a model's best (or worst) scored benchmarks.

    Args:
        model_name: Key into ``MODEL_SCORES``; unknown models yield an empty
            table.
        limit: Maximum number of rows to return.
        strongest: Sort descending (highest scores first) when true,
            ascending otherwise.
    """
    model_scores = MODEL_SCORES.get(model_name, {})
    scored = [
        {
            "Benchmark": benchmark["name"],
            "Category": CATEGORY_DISPLAY[benchmark["category"]],
            "Domain": benchmark["domain"],
            "Score": model_scores.get(benchmark["id"]),
        }
        for benchmark in BENCHMARKS
        if model_scores.get(benchmark["id"]) is not None
    ]
    scored.sort(key=lambda item: item["Score"], reverse=strongest)
    return _with_tbd(pd.DataFrame.from_records(scored[:limit]))


def model_options(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> list[str]:
    """Return the model names from ``build_leaderboard``, in leaderboard order."""
    rows = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    return [row["Model"] for row in rows]


def benchmark_options(benchmarks: list[dict]) -> list[str]:
    """Return the ``name`` of every benchmark in *benchmarks*, in order."""
    return [benchmark["name"] for benchmark in benchmarks]


def find_benchmark_by_name(name: str) -> dict | None:
    """Return the first entry of ``BENCHMARKS`` named *name*, or ``None``."""
    for benchmark in BENCHMARKS:
        if benchmark["name"] == name:
            return benchmark
    return None


def score_stats(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> dict[str, str]:
    """Return headline statistics for the leaderboard as display strings.

    "Top model" and "Best open-source model" are taken from the first
    matching row returned by ``build_leaderboard``; ``"-"`` is used when
    there is none. "Active benchmarks" counts benchmark ids that have a
    non-``None`` score for at least one model in ``MODEL_SCORES``.
    """
    rows = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    open_rows = [row for row in rows if MODEL_METADATA.get(row["Model"], {}).get("open_weights")]
    active_benchmarks = {
        benchmark_id
        for scores in MODEL_SCORES.values()
        for benchmark_id, score in scores.items()
        if score is not None
    }
    return {
        "Top model": rows[0]["Model"] if rows else "-",
        "Best open-source model": open_rows[0]["Model"] if open_rows else "-",
        "Models": str(len(rows)),
        "Active benchmarks": str(len(active_benchmarks)),
        "Latest data source": "GRM Eval - Benchmarks PRD.pdf",
    }


def official_score_for_model(model_name: str) -> dict[str, float | None]:
    """Return ``compute_grm_score`` for the model's scores.

    A model without an entry in ``MODEL_SCORES`` is scored from an empty
    mapping.
    """
    return compute_grm_score(MODEL_SCORES.get(model_name, {}))