Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Table view helpers for the Streamlit GRM leaderboard.""" | |
| from collections.abc import Iterable | |
| import pandas as pd | |
| from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, GRM_BENCH_DIMENSIONS | |
| from scores import MODEL_METADATA, MODEL_SCORES | |
| from scoring import build_leaderboard, compute_category_components, compute_grm_score, get_score | |
# Labels identifying the score-explorer views. build_score_explorer_frame
# compares its `view` argument against VIEW_SUMMARY and VIEW_CATEGORY and
# treats any other value as the benchmark-matrix view.
VIEW_SUMMARY: str = "Summary"
VIEW_CATEGORY: str = "Category"
VIEW_MATRIX: str = "Benchmark matrix"
| def format_score(value: float | None) -> str: | |
| return f"{value:.1f}" if value is not None else "TBD" | |
def _with_tbd(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a string-typed copy of ``frame`` with missing cells shown as "TBD".

    Args:
        frame: Any DataFrame; it is not modified.

    Returns:
        A new DataFrame in which every cell that ``pd.notna`` reports as
        missing is replaced by "TBD" and every other cell is converted
        with ``str``.
    """
    # Cast to object first so a text placeholder can live in numeric columns.
    as_objects = frame.astype("object")
    present = pd.notna(frame)
    filled = as_objects.where(present, "TBD")
    return filled.astype(str)
def category_options() -> list[str]:
    """Return the category choices: "All" followed by each display label.

    Labels are looked up in CATEGORY_DISPLAY in the order given by
    CATEGORIES.
    """
    options = ["All"]
    options.extend(CATEGORY_DISPLAY[category] for category in CATEGORIES)
    return options
def category_from_label(label: str) -> str | None:
    """Map a display label back to its category key.

    Args:
        label: A display label as found among the CATEGORY_DISPLAY values.

    Returns:
        The first CATEGORY_DISPLAY key whose value equals ``label``, or
        None when no value matches.
    """
    return next(
        (key for key, shown in CATEGORY_DISPLAY.items() if label == shown),
        None,
    )
def available_domains() -> list[str]:
    """Return the distinct "domain" values across BENCHMARKS, sorted."""
    unique_domains = set()
    for entry in BENCHMARKS:
        unique_domains.add(entry["domain"])
    return sorted(unique_domains)
def available_priorities() -> list[str]:
    """Return the distinct "priority" values across BENCHMARKS, sorted."""
    unique_priorities = set()
    for entry in BENCHMARKS:
        unique_priorities.add(entry["priority"])
    return sorted(unique_priorities)
def available_sources() -> list[str]:
    """Return the distinct "source" values across BENCHMARKS, sorted."""
    unique_sources = set()
    for entry in BENCHMARKS:
        unique_sources.add(entry["source"])
    return sorted(unique_sources)
def parameter_bounds() -> tuple[float, float]:
    """Return the lower and upper bound of the models' ``parameter_b`` values.

    Only MODEL_METADATA entries whose ``parameter_b`` is an int or float
    are considered. (Presumably ``parameter_b`` is a size in billions of
    parameters; confirm against the metadata source.)

    Returns:
        ``(0.0, upper)`` where ``upper`` is the largest numeric
        ``parameter_b``, or 120.0 when no entry has a numeric one. Both
        elements are always floats, matching the annotation, even when the
        metadata stores integer sizes.
    """
    sizes = [
        # Coerce so the tuple never mixes a float lower bound with an int
        # upper bound.
        float(metadata["parameter_b"])
        for metadata in MODEL_METADATA.values()
        if isinstance(metadata.get("parameter_b"), (int, float))
    ]
    return (0.0, max(sizes, default=120.0))
def _matches_search(benchmark: dict, search: str) -> bool:
    """Report whether a benchmark's descriptive text contains the search term.

    The match is a case-insensitive substring test over the benchmark's
    "name", "description", "summary", "domain", "source" and "priority"
    values joined by single spaces.

    Args:
        benchmark: Benchmark record; any of the searched keys may be absent.
        search: Free-text search term. Surrounding whitespace is ignored,
            and an empty or whitespace-only term matches everything.

    Returns:
        True when the term is empty or occurs in the searched text.
    """
    needle = search.strip().lower() if search else ""
    if not needle:
        return True

    searched_keys = ("name", "description", "summary", "domain", "source", "priority")
    pieces = []
    for key in searched_keys:
        value = benchmark.get(key)
        # Treat None like a missing key so it is not searched as the text "None".
        pieces.append("" if value is None else str(value))
    haystack = " ".join(pieces).lower()
    return needle in haystack
def filter_benchmarks(
    category: str | None = None,
    priorities: Iterable[str] | None = None,
    sources: Iterable[str] | None = None,
    domains: Iterable[str] | None = None,
    search: str = "",
    include_non_scored: bool = False,
) -> list[dict]:
    """Return the BENCHMARKS entries that pass every active filter.

    Args:
        category: When truthy, keep only benchmarks with this "category".
        priorities: When non-empty, keep only benchmarks whose "priority"
            is in this collection.
        sources: When non-empty, keep only benchmarks whose "source" is in
            this collection.
        domains: When non-empty, keep only benchmarks whose "domain" is in
            this collection.
        search: Free-text term passed to ``_matches_search``.
        include_non_scored: When False, drop benchmarks whose
            "included_in_grm" value is falsy.

    Returns:
        Matching benchmark records in BENCHMARKS order.
    """
    wanted_priorities = set(priorities or [])
    wanted_sources = set(sources or [])
    wanted_domains = set(domains or [])

    def _keep(entry: dict) -> bool:
        # An empty/None filter is inactive; checks run in a fixed order and
        # stop at the first failure.
        if category and entry["category"] != category:
            return False
        if wanted_priorities and entry["priority"] not in wanted_priorities:
            return False
        if wanted_sources and entry["source"] not in wanted_sources:
            return False
        if wanted_domains and entry["domain"] not in wanted_domains:
            return False
        if not include_non_scored and not entry["included_in_grm"]:
            return False
        return bool(_matches_search(entry, search))

    return [entry for entry in BENCHMARKS if _keep(entry)]
def _format_leaderboard_rows(rows: list[dict], use_filtered_label: bool = False) -> pd.DataFrame:
    """Project leaderboard rows onto the columns shown in the summary table.

    Args:
        rows: Row dicts carrying "Rank", "Model", "GRM Score",
            "Roleplay (33%)", "Actions (33%)" and "General (33%)";
            "Family" and "Size" are optional.
        use_filtered_label: When True the score column is named
            "Filtered GRM Score" instead of "GRM Score".

    Returns:
        A DataFrame with one record per input row.
    """
    if use_filtered_label:
        score_column = "Filtered GRM Score"
    else:
        score_column = "GRM Score"

    return pd.DataFrame.from_records(
        [
            {
                "Rank": entry["Rank"],
                "Model": entry["Model"],
                score_column: entry["GRM Score"],
                # The weight suffix is dropped from the category column names.
                "Roleplay": entry["Roleplay (33%)"],
                "Actions": entry["Actions (33%)"],
                "General": entry["General (33%)"],
                "Family": entry.get("Family"),
                "Size": entry.get("Size"),
            }
            for entry in rows
        ]
    )
def build_summary_frame(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the display-ready summary leaderboard table.

    Args:
        include_closed: Forwarded to ``build_leaderboard``.
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        The leaderboard rows formatted by ``_format_leaderboard_rows`` and
        passed through ``_with_tbd``.
    """
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        parameter_range=parameter_range,
    )
    table = _format_leaderboard_rows(leaderboard)
    return _with_tbd(table)
def build_category_frame(
    category: str,
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    filtered_score: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the per-category breakdown table for every leaderboard model.

    Args:
        category: Category key; must be present in CATEGORY_DISPLAY.
        include_closed: Forwarded to ``build_leaderboard``.
        benchmark_ids: Always forwarded to ``compute_category_components``;
            forwarded to ``build_leaderboard`` only when ``filtered_score``
            is True.
        filtered_score: When True the score column is named
            "Filtered GRM Score" instead of "GRM Score".
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with one row per
        leaderboard entry.
    """
    leaderboard_ids = benchmark_ids if filtered_score else None
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=leaderboard_ids,
        parameter_range=parameter_range,
    )
    category_column = f"{CATEGORY_DISPLAY[category]} Score"
    score_column = "Filtered GRM Score" if filtered_score else "GRM Score"

    def _record(entry: dict) -> dict:
        parts = compute_category_components(
            MODEL_SCORES[entry["Model"]], category, benchmark_ids
        )
        return {
            "Rank": entry["Rank"],
            "Model": entry["Model"],
            score_column: entry["GRM Score"],
            category_column: parts["score"],
            "Core Avg": parts["core_avg"],
            "Supplementary Avg": parts["supplementary_avg"],
            "Missing": f"{parts['missing']} / {parts['benchmarks']}",
            "Family": entry.get("Family"),
            "Size": entry.get("Size"),
        }

    table = pd.DataFrame.from_records([_record(entry) for entry in leaderboard])
    return _with_tbd(table)
def build_benchmark_matrix_frame(
    benchmarks: list[dict],
    include_closed: bool = True,
    recalculate_visible: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the model-by-benchmark score matrix.

    Args:
        benchmarks: Benchmark records; each contributes one column named
            after its "name" and looked up by its "id".
        include_closed: Forwarded to ``build_leaderboard``.
        recalculate_visible: When True the ids of ``benchmarks`` are passed
            to ``build_leaderboard`` and the score column is named
            "Filtered GRM Score"; otherwise no ids are passed and the
            column is "GRM Score".
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with one row per
        leaderboard entry.
    """
    visible_ids = {entry["id"] for entry in benchmarks}
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=visible_ids if recalculate_visible else None,
        parameter_range=parameter_range,
    )
    score_column = "Filtered GRM Score" if recalculate_visible else "GRM Score"

    records = []
    for entry in leaderboard:
        record = {
            "Rank": entry["Rank"],
            "Model": entry["Model"],
            score_column: entry["GRM Score"],
            "Roleplay": entry["Roleplay (33%)"],
            "Actions": entry["Actions (33%)"],
            "General": entry["General (33%)"],
        }
        # One extra column per benchmark, in the order the benchmarks were given.
        record.update(
            {
                benchmark["name"]: get_score(entry["Model"], benchmark["id"])
                for benchmark in benchmarks
            }
        )
        records.append(record)

    return _with_tbd(pd.DataFrame.from_records(records))
def build_score_explorer_frame(
    view: str,
    category_label: str,
    benchmarks: list[dict],
    include_closed: bool,
    recalculate_visible: bool,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the table for the selected score-explorer view.

    Args:
        view: VIEW_SUMMARY or VIEW_CATEGORY; any other value selects the
            benchmark matrix.
        category_label: Display label of the chosen category; used only by
            the category view, which falls back to "ROLEPLAY" when the
            label maps to no category.
        benchmarks: Currently visible benchmark records.
        include_closed: Forwarded to the underlying builders.
        recalculate_visible: Whether scores should be recomputed over the
            visible benchmarks only.
        parameter_range: Forwarded to the underlying builders.

    Returns:
        The display-ready DataFrame for the requested view.
    """
    resolved_category = category_from_label(category_label)
    visible_ids = {entry["id"] for entry in benchmarks}
    shared = {"include_closed": include_closed, "parameter_range": parameter_range}

    if view == VIEW_SUMMARY:
        # With no visible benchmarks, show the unfiltered summary instead.
        if not (recalculate_visible and visible_ids):
            return build_summary_frame(**shared)
        leaderboard = build_leaderboard(benchmark_ids=visible_ids, **shared)
        return _with_tbd(
            _format_leaderboard_rows(leaderboard, use_filtered_label=True)
        )

    if view == VIEW_CATEGORY:
        return build_category_frame(
            resolved_category or "ROLEPLAY",
            benchmark_ids=visible_ids,
            filtered_score=recalculate_visible,
            **shared,
        )

    return build_benchmark_matrix_frame(
        benchmarks,
        recalculate_visible=recalculate_visible,
        **shared,
    )
def build_benchmark_registry_frame(benchmarks: list[dict]) -> pd.DataFrame:
    """Build the benchmark registry table from benchmark records.

    Args:
        benchmarks: Records carrying "name", "category", "domain",
            "source", "calc_weight", "included_in_grm", "description" and
            "summary"; "paper" is optional.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with one row per
        benchmark.
    """

    def _record(entry: dict) -> dict:
        included = "Yes" if entry["included_in_grm"] else "No"
        return {
            "Benchmark": entry["name"],
            "Category": CATEGORY_DISPLAY[entry["category"]],
            "Domain": entry["domain"],
            "Source": entry["source"],
            "Weight": entry["calc_weight"],
            "Included in GRM": included,
            "Description": entry["description"],
            "Summary": entry["summary"],
            # A missing or falsy link is shown as an empty cell.
            "Paper / Repo": entry.get("paper") or "",
        }

    table = pd.DataFrame.from_records([_record(entry) for entry in benchmarks])
    return _with_tbd(table)
def build_grm_dimensions_frame(show_non_scored: bool = True) -> pd.DataFrame:
    """Build the table of GRM benchmark dimensions.

    Args:
        show_non_scored: When False, only entries of GRM_BENCH_DIMENSIONS
            whose "included_in_grm" value is truthy are listed.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with the columns
        "Dimension", "Phase", "Included in GRM" and "Notes".
    """

    def _record(entry: dict) -> dict:
        return {
            "Dimension": entry["dimension"],
            "Phase": entry["phase"],
            "Included in GRM": "Yes" if entry["included_in_grm"] else "No",
            "Notes": entry["notes"],
        }

    selected = []
    for entry in GRM_BENCH_DIMENSIONS:
        if show_non_scored or entry["included_in_grm"]:
            selected.append(entry)

    # Records are handed to pandas lazily, as an iterator.
    table = pd.DataFrame.from_records(map(_record, selected))
    return _with_tbd(table)
def build_model_detail_frame(model_name: str) -> pd.DataFrame:
    """Build the per-category score breakdown for a single model.

    Args:
        model_name: Key into MODEL_SCORES; an unknown name is treated as a
            model with no scores.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with one row per
        entry of CATEGORIES.
    """
    scores_for_model = MODEL_SCORES.get(model_name, {})

    def _record(category: str) -> dict:
        parts = compute_category_components(scores_for_model, category)
        return {
            "Category": CATEGORY_DISPLAY[category],
            "Score": parts["score"],
            "Core Avg": parts["core_avg"],
            "Supplementary Avg": parts["supplementary_avg"],
            "Missing": f"{parts['missing']} / {parts['benchmarks']}",
        }

    table = pd.DataFrame.from_records([_record(category) for category in CATEGORIES])
    return _with_tbd(table)
def build_model_benchmark_scores(model_name: str, limit: int = 6, strongest: bool = True) -> pd.DataFrame:
    """List a model's highest- or lowest-scoring benchmarks.

    Args:
        model_name: Key into MODEL_SCORES; an unknown name yields an empty
            table.
        limit: Number of leading rows kept after sorting.
        strongest: When True rows are sorted by score descending, otherwise
            ascending.

    Returns:
        A display-ready DataFrame (via ``_with_tbd``) with the columns
        "Benchmark", "Category", "Domain" and "Score". Benchmarks whose
        score is None are left out.
    """
    scores_for_model = MODEL_SCORES.get(model_name, {})

    rows = []
    for entry in BENCHMARKS:
        value = scores_for_model.get(entry["id"])
        if value is None:
            continue
        rows.append(
            {
                "Benchmark": entry["name"],
                "Category": CATEGORY_DISPLAY[entry["category"]],
                "Domain": entry["domain"],
                "Score": value,
            }
        )

    ranked = sorted(rows, key=lambda item: item["Score"], reverse=strongest)
    return _with_tbd(pd.DataFrame.from_records(ranked[:limit]))
def model_options(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> list[str]:
    """Return the model names on the leaderboard, in leaderboard order.

    Args:
        include_closed: Forwarded to ``build_leaderboard``.
        parameter_range: Forwarded to ``build_leaderboard``.
    """
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        parameter_range=parameter_range,
    )
    names = [entry["Model"] for entry in leaderboard]
    return names
def benchmark_options(benchmarks: list[dict]) -> list[str]:
    """Return the "name" of each benchmark record, preserving input order.

    Args:
        benchmarks: Benchmark records; each must carry a "name" key.
    """
    names = [entry["name"] for entry in benchmarks]
    return names
def find_benchmark_by_name(name: str) -> dict | None:
    """Look up a benchmark record by its exact "name".

    Args:
        name: Benchmark name to search for.

    Returns:
        The first BENCHMARKS entry whose "name" equals ``name``, or None
        when there is no such entry.
    """
    return next(
        (entry for entry in BENCHMARKS if entry["name"] == name),
        None,
    )
def score_stats(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> dict[str, str]:
    """Compute the headline statistics for the leaderboard.

    Args:
        include_closed: Forwarded to ``build_leaderboard``.
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        A mapping of statistic label to display string. "-" stands in for
        a model name when the relevant list is empty.
    """
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        parameter_range=parameter_range,
    )
    ranked_models = [entry["Model"] for entry in leaderboard]
    open_models = [
        name
        for name in ranked_models
        if MODEL_METADATA.get(name, {}).get("open_weights")
    ]

    # A benchmark counts as active when any model in MODEL_SCORES has a
    # non-None score for it.
    scored_ids = set()
    for per_model in MODEL_SCORES.values():
        scored_ids.update(
            benchmark_id
            for benchmark_id, value in per_model.items()
            if value is not None
        )

    top_model = ranked_models[0] if ranked_models else "-"
    best_open = open_models[0] if open_models else "-"
    return {
        "Top model": top_model,
        "Best open-source model": best_open,
        "Models": str(len(ranked_models)),
        "Active benchmarks": str(len(scored_ids)),
        "Latest data source": "GRM Eval - Benchmarks PRD.pdf",
    }
def official_score_for_model(model_name: str) -> dict[str, float | None]:
    """Return ``compute_grm_score`` applied to the model's stored scores.

    Args:
        model_name: Key into MODEL_SCORES; an unknown name is scored as an
            empty score mapping.
    """
    scores_for_model = MODEL_SCORES.get(model_name, {})
    return compute_grm_score(scores_for_model)