GRM / data_views.py
mbagdasarova-nvidia's picture
Upload 9 files
5c49242 verified
"""Table view helpers for the Streamlit GRM leaderboard."""
from collections.abc import Iterable
import pandas as pd
from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, GRM_BENCH_DIMENSIONS
from scores import MODEL_METADATA, MODEL_SCORES
from scoring import build_leaderboard, compute_category_components, compute_grm_score, get_score
# Labels identifying the score-explorer view modes. build_score_explorer_frame
# matches VIEW_SUMMARY and VIEW_CATEGORY explicitly; any other view value falls
# through to the benchmark-matrix table.
VIEW_SUMMARY = "Summary"
VIEW_CATEGORY = "Category"
VIEW_MATRIX = "Benchmark matrix"
def format_score(value: float | None) -> str:
return f"{value:.1f}" if value is not None else "TBD"
def _with_tbd(frame: pd.DataFrame) -> pd.DataFrame:
return frame.astype("object").where(pd.notna(frame), "TBD").astype(str)
def category_options() -> list[str]:
    """Return the category picker labels: ``"All"`` followed by each display name.

    Display names follow the order of ``CATEGORIES``.
    """
    display_names = [CATEGORY_DISPLAY[key] for key in CATEGORIES]
    return ["All"] + display_names
def category_from_label(label: str) -> str | None:
    """Map a display label back to its category key.

    Returns the first key in ``CATEGORY_DISPLAY`` whose display name equals
    *label*, or ``None`` when no display name matches (e.g. ``"All"``).
    """
    return next(
        (key for key, shown in CATEGORY_DISPLAY.items() if label == shown),
        None,
    )
def available_domains() -> list[str]:
    """Return the distinct ``"domain"`` values of all benchmarks, sorted."""
    distinct = set()
    for entry in BENCHMARKS:
        distinct.add(entry["domain"])
    return sorted(distinct)
def available_priorities() -> list[str]:
    """Return the distinct ``"priority"`` values of all benchmarks, sorted."""
    distinct = set()
    for entry in BENCHMARKS:
        distinct.add(entry["priority"])
    return sorted(distinct)
def available_sources() -> list[str]:
    """Return the distinct ``"source"`` values of all benchmarks, sorted."""
    distinct = set()
    for entry in BENCHMARKS:
        distinct.add(entry["source"])
    return sorted(distinct)
def parameter_bounds() -> tuple[float, float]:
    """Return ``(0.0, upper)`` bounds for the model parameter-count filter.

    ``upper`` is the largest numeric ``"parameter_b"`` found in
    ``MODEL_METADATA``; entries without a numeric value are ignored. When no
    entry has one, the upper bound falls back to ``120.0``.

    Both bounds are always floats, matching the return annotation. Without the
    coercion an integer ``parameter_b`` would yield a mixed ``(float, int)``
    pair.
    NOTE(review): presumably these bounds feed a Streamlit slider, which
    requires min/max of the same numeric type -- confirm against the caller.
    """
    sizes = [
        float(metadata["parameter_b"])
        for metadata in MODEL_METADATA.values()
        if isinstance(metadata.get("parameter_b"), int | float)
    ]
    return (0.0, max(sizes, default=120.0))
def _matches_search(benchmark: dict, search: str) -> bool:
if not search:
return True
target = " ".join(
str(benchmark.get(key, ""))
for key in ["name", "description", "summary", "domain", "source", "priority"]
).lower()
return search.lower() in target
def filter_benchmarks(
    category: str | None = None,
    priorities: Iterable[str] | None = None,
    sources: Iterable[str] | None = None,
    domains: Iterable[str] | None = None,
    search: str = "",
    include_non_scored: bool = False,
) -> list[dict]:
    """Return the entries of ``BENCHMARKS`` that pass every active filter.

    A filter left empty or ``None`` is inactive. Benchmarks whose
    ``"included_in_grm"`` flag is falsy are dropped unless
    *include_non_scored* is true. The original ordering of ``BENCHMARKS`` is
    preserved.
    """
    wanted_priorities = set(priorities or [])
    wanted_sources = set(sources or [])
    wanted_domains = set(domains or [])

    def keep(entry: dict) -> bool:
        # Checks run in a fixed order and short-circuit on the first failure.
        return bool(
            (not category or entry["category"] == category)
            and (not wanted_priorities or entry["priority"] in wanted_priorities)
            and (not wanted_sources or entry["source"] in wanted_sources)
            and (not wanted_domains or entry["domain"] in wanted_domains)
            and (include_non_scored or entry["included_in_grm"])
            and _matches_search(entry, search)
        )

    return [entry for entry in BENCHMARKS if keep(entry)]
def _format_leaderboard_rows(rows: list[dict], use_filtered_label: bool = False) -> pd.DataFrame:
label = "Filtered GRM Score" if use_filtered_label else "GRM Score"
records = []
for row in rows:
records.append(
{
"Rank": row["Rank"],
"Model": row["Model"],
label: row["GRM Score"],
"Roleplay": row["Roleplay (33%)"],
"Actions": row["Actions (33%)"],
"General": row["General (33%)"],
"Family": row.get("Family"),
"Size": row.get("Size"),
}
)
return pd.DataFrame.from_records(records)
def build_summary_frame(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the summary leaderboard table as an all-string DataFrame.

    Both arguments are forwarded unchanged to ``build_leaderboard``; missing
    values in the result are shown as ``"TBD"``.
    """
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        parameter_range=parameter_range,
    )
    table = _format_leaderboard_rows(leaderboard)
    return _with_tbd(table)
def build_category_frame(
    category: str,
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    filtered_score: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the per-category leaderboard table as an all-string DataFrame.

    Args:
        category: Category key; must be present in ``CATEGORY_DISPLAY``.
        include_closed: Forwarded to ``build_leaderboard``.
        benchmark_ids: Benchmark ids passed to ``compute_category_components``
            for every row. They are passed to ``build_leaderboard`` only when
            *filtered_score* is true.
        filtered_score: When true, the leaderboard is built from
            *benchmark_ids* and the score column is titled
            ``"Filtered GRM Score"`` instead of ``"GRM Score"``.
        parameter_range: Forwarded to ``build_leaderboard``.

    Returns:
        One row per leaderboard entry with rank, model, overall score, the
        category's score / core average / supplementary average, a
        ``"missing / total"`` benchmark count, family and size. Missing values
        are shown as ``"TBD"``.

    Raises:
        KeyError: If *category* is not a key of ``CATEGORY_DISPLAY``.
    """
    rows = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=benchmark_ids if filtered_score else None,
        parameter_range=parameter_range,
    )
    category_name = CATEGORY_DISPLAY[category]
    # The label is the same for every row, so compute it once.
    score_label = "Filtered GRM Score" if filtered_score else "GRM Score"

    records = []
    for row in rows:
        # Fall back to an empty score mapping, as the other model-level helpers
        # in this module do, so a leaderboard model without an entry in
        # MODEL_SCORES does not raise KeyError for the whole table.
        model_scores = MODEL_SCORES.get(row["Model"], {})
        components = compute_category_components(model_scores, category, benchmark_ids)
        records.append(
            {
                "Rank": row["Rank"],
                "Model": row["Model"],
                score_label: row["GRM Score"],
                f"{category_name} Score": components["score"],
                "Core Avg": components["core_avg"],
                "Supplementary Avg": components["supplementary_avg"],
                "Missing": f"{components['missing']} / {components['benchmarks']}",
                "Family": row.get("Family"),
                "Size": row.get("Size"),
            }
        )
    return _with_tbd(pd.DataFrame.from_records(records))
def build_benchmark_matrix_frame(
    benchmarks: list[dict],
    include_closed: bool = True,
    recalculate_visible: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build a model-by-benchmark score matrix as an all-string DataFrame.

    Each row carries the leaderboard columns (rank, model, overall and category
    scores) followed by one column per entry of *benchmarks*, titled with the
    benchmark's name and filled via ``get_score``. When *recalculate_visible*
    is true the leaderboard is built from the ids of *benchmarks* and the score
    column is titled ``"Filtered GRM Score"``.
    """
    visible_ids = {entry["id"] for entry in benchmarks}
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=visible_ids if recalculate_visible else None,
        parameter_range=parameter_range,
    )
    score_label = "Filtered GRM Score" if recalculate_visible else "GRM Score"

    table = []
    for entry in leaderboard:
        fixed_columns = {
            "Rank": entry["Rank"],
            "Model": entry["Model"],
            score_label: entry["GRM Score"],
            "Roleplay": entry["Roleplay (33%)"],
            "Actions": entry["Actions (33%)"],
            "General": entry["General (33%)"],
        }
        per_benchmark = {
            bench["name"]: get_score(entry["Model"], bench["id"])
            for bench in benchmarks
        }
        # Benchmark columns come after the fixed ones.
        table.append({**fixed_columns, **per_benchmark})
    return _with_tbd(pd.DataFrame.from_records(table))
def build_score_explorer_frame(
    view: str,
    category_label: str,
    benchmarks: list[dict],
    include_closed: bool,
    recalculate_visible: bool,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Dispatch to the table builder that matches the selected *view*.

    * ``VIEW_SUMMARY``: the summary table; it is recomputed from the visible
      benchmark ids only when *recalculate_visible* is true and *benchmarks*
      is non-empty.
    * ``VIEW_CATEGORY``: the category table for the category matching
      *category_label*, falling back to ``"ROLEPLAY"`` when the label does not
      resolve to a category.
    * any other value: the benchmark matrix.
    """
    visible_ids = {entry["id"] for entry in benchmarks}
    shared = {"include_closed": include_closed, "parameter_range": parameter_range}

    if view == VIEW_SUMMARY:
        if not (recalculate_visible and visible_ids):
            return build_summary_frame(**shared)
        leaderboard = build_leaderboard(benchmark_ids=visible_ids, **shared)
        table = _format_leaderboard_rows(leaderboard, use_filtered_label=True)
        return _with_tbd(table)

    if view == VIEW_CATEGORY:
        category = category_from_label(category_label) or "ROLEPLAY"
        return build_category_frame(
            category,
            benchmark_ids=visible_ids,
            filtered_score=recalculate_visible,
            **shared,
        )

    return build_benchmark_matrix_frame(
        benchmarks,
        recalculate_visible=recalculate_visible,
        **shared,
    )
def build_benchmark_registry_frame(benchmarks: list[dict]) -> pd.DataFrame:
    """Build the benchmark registry table as an all-string DataFrame.

    One row per benchmark with its name, category display name, domain,
    source, weight, GRM inclusion flag (``"Yes"``/``"No"``), description,
    summary and paper/repo link (empty string when absent or falsy).
    """

    def describe(entry: dict) -> dict:
        return {
            "Benchmark": entry["name"],
            "Category": CATEGORY_DISPLAY[entry["category"]],
            "Domain": entry["domain"],
            "Source": entry["source"],
            "Weight": entry["calc_weight"],
            "Included in GRM": "Yes" if entry["included_in_grm"] else "No",
            "Description": entry["description"],
            "Summary": entry["summary"],
            "Paper / Repo": entry.get("paper") or "",
        }

    table = [describe(entry) for entry in benchmarks]
    return _with_tbd(pd.DataFrame.from_records(table))
def build_grm_dimensions_frame(show_non_scored: bool = True) -> pd.DataFrame:
    """Build the GRM dimensions table as an all-string DataFrame.

    Dimensions whose ``"included_in_grm"`` flag is falsy are listed only when
    *show_non_scored* is true.
    """
    table = []
    for entry in GRM_BENCH_DIMENSIONS:
        if not (show_non_scored or entry["included_in_grm"]):
            continue
        table.append(
            {
                "Dimension": entry["dimension"],
                "Phase": entry["phase"],
                "Included in GRM": "Yes" if entry["included_in_grm"] else "No",
                "Notes": entry["notes"],
            }
        )
    return _with_tbd(pd.DataFrame.from_records(table))
def build_model_detail_frame(model_name: str) -> pd.DataFrame:
    """Build a per-category score breakdown for one model.

    A model that is absent from ``MODEL_SCORES`` is treated as having no
    scores. The result has one all-string row per entry of ``CATEGORIES``.
    """
    per_benchmark = MODEL_SCORES.get(model_name, {})

    def summarize(category: str) -> dict:
        parts = compute_category_components(per_benchmark, category)
        return {
            "Category": CATEGORY_DISPLAY[category],
            "Score": parts["score"],
            "Core Avg": parts["core_avg"],
            "Supplementary Avg": parts["supplementary_avg"],
            "Missing": f"{parts['missing']} / {parts['benchmarks']}",
        }

    table = [summarize(category) for category in CATEGORIES]
    return _with_tbd(pd.DataFrame.from_records(table))
def build_model_benchmark_scores(model_name: str, limit: int = 6, strongest: bool = True) -> pd.DataFrame:
    """List a model's highest- or lowest-scoring benchmarks.

    Only benchmarks with a non-``None`` score for the model are considered.
    They are sorted by score, descending when *strongest* is true and ascending
    otherwise, and the first *limit* rows are returned as an all-string
    DataFrame.
    """
    per_benchmark = MODEL_SCORES.get(model_name, {})

    table = []
    for entry in BENCHMARKS:
        value = per_benchmark.get(entry["id"])
        if value is None:
            continue
        table.append(
            {
                "Benchmark": entry["name"],
                "Category": CATEGORY_DISPLAY[entry["category"]],
                "Domain": entry["domain"],
                "Score": value,
            }
        )

    ranked = sorted(table, key=lambda item: item["Score"], reverse=strongest)
    return _with_tbd(pd.DataFrame.from_records(ranked[:limit]))
def model_options(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> list[str]:
    """Return the model names on the leaderboard, in leaderboard order.

    Both arguments are forwarded unchanged to ``build_leaderboard``.
    """
    names = []
    for entry in build_leaderboard(include_closed=include_closed, parameter_range=parameter_range):
        names.append(entry["Model"])
    return names
def benchmark_options(benchmarks: list[dict]) -> list[str]:
    """Return the ``"name"`` of each benchmark, preserving input order."""
    names = []
    for entry in benchmarks:
        names.append(entry["name"])
    return names
def find_benchmark_by_name(name: str) -> dict | None:
    """Return the first entry of ``BENCHMARKS`` named *name*, or ``None``."""
    return next(
        (entry for entry in BENCHMARKS if entry["name"] == name),
        None,
    )
def score_stats(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> dict[str, str]:
    """Return headline statistics for the leaderboard as display strings.

    The top model and best open-source model are the first leaderboard row and
    the first row whose metadata has a truthy ``"open_weights"`` value,
    respectively (``"-"`` when there is none). ``"Active benchmarks"`` counts
    the distinct benchmark ids that have a non-``None`` score for any model in
    ``MODEL_SCORES``; this count does not depend on the arguments.
    """
    leaderboard = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)

    open_models = [
        entry["Model"]
        for entry in leaderboard
        if MODEL_METADATA.get(entry["Model"], {}).get("open_weights")
    ]

    scored_ids = set()
    for per_model in MODEL_SCORES.values():
        scored_ids.update(
            benchmark_id
            for benchmark_id, value in per_model.items()
            if value is not None
        )

    top_model = leaderboard[0]["Model"] if leaderboard else "-"
    best_open = open_models[0] if open_models else "-"
    return {
        "Top model": top_model,
        "Best open-source model": best_open,
        "Models": str(len(leaderboard)),
        "Active benchmarks": str(len(scored_ids)),
        "Latest data source": "GRM Eval - Benchmarks PRD.pdf",
    }
def official_score_for_model(model_name: str) -> dict[str, float | None]:
    """Return ``compute_grm_score`` for the model's recorded scores.

    A model that is absent from ``MODEL_SCORES`` is scored from an empty
    mapping.
    """
    recorded = MODEL_SCORES.get(model_name, {})
    return compute_grm_score(recorded)