# GRM / streamlit_app.py
# mbagdasarova-nvidia's picture
# Upload 9 files
# 5c49242 verified
"""Streamlit implementation of the GRM Leaderboard Space."""
from textwrap import dedent
import streamlit as st
from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH
from data_views import (
VIEW_CATEGORY,
VIEW_MATRIX,
VIEW_SUMMARY,
available_domains,
available_sources,
benchmark_options,
build_benchmark_registry_frame,
build_model_benchmark_scores,
build_model_detail_frame,
build_score_explorer_frame,
category_from_label,
category_options,
filter_benchmarks,
find_benchmark_by_name,
model_options,
parameter_bounds,
score_stats,
)
from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css
# Page-level configuration: browser tab title and a wide layout.
st.set_page_config(
page_title="GRM Score - Game Ready Leaderboard",
layout="wide",
)
# Two-column split (0.72 / 0.28); only the second, narrower column is used,
# and it hosts the theme switch.
theme_columns = st.columns([0.72, 0.28])
theme_mode = theme_columns[1].radio(
"Theme",
["Night", "Day"],
horizontal=True,
key="grm_theme_mode",
)
# True only while the "Day" option is selected; read by themed_dataframe().
light_mode = theme_mode == "Day"
# Inject the base stylesheet, then the palette chosen for the current theme,
# then the page header. All three are passed through as raw HTML.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
st.markdown(palette_css(light_mode), unsafe_allow_html=True)
st.markdown(HEADER_HTML, unsafe_allow_html=True)
def themed_dataframe(data):
    """Return ``data`` restyled for the light ("Day") theme when possible.

    When the module-level ``light_mode`` flag is false, or ``data`` has no
    ``style`` attribute, the input is handed back untouched. Otherwise the
    object obtained from ``data.style`` is returned with light cell colours
    and a matching ``th`` header rule applied.
    """
    if light_mode and hasattr(data, "style"):
        cell_colours = {
            "background-color": "#ffffff",
            "color": "#15181b",
            "border-color": "#d9dee3",
        }
        header_rule = {
            "selector": "th",
            "props": [
                ("background-color", "#eef1f4"),
                ("color", "#15181b"),
                ("border-color", "#d9dee3"),
            ],
        }
        styled_cells = data.style.set_properties(**cell_colours)
        return styled_cells.set_table_styles([header_rule])
    return data
def render_dataframe(data, **kwargs) -> None:
    """Show ``data`` with ``st.dataframe`` after theming it.

    ``data`` is first passed through ``themed_dataframe``; every keyword
    argument is forwarded to ``st.dataframe`` unchanged.
    """
    prepared = themed_dataframe(data)
    st.dataframe(prepared, **kwargs)
def format_sample_code(code: str) -> str:
    """Dedent ``code``, trim it, and halve each line's leading whitespace.

    The text is passed through ``textwrap.dedent`` and stripped of
    surrounding whitespace. Each remaining line then has its leading
    whitespace replaced by ``n // 2`` single spaces, where ``n`` is the
    number of leading whitespace characters. Lines that contain only
    whitespace become empty strings.
    """

    def _halve_indent(raw_line: str) -> str:
        # Rebuild one line with half (floor) of its original indentation.
        text = raw_line.lstrip()
        if not text:
            return ""
        indent_width = (len(raw_line) - len(text)) // 2
        return f"{' ' * indent_width}{text}"

    cleaned = dedent(code).strip()
    return "\n".join(_halve_indent(raw_line) for raw_line in cleaned.splitlines())
def render_about_grm() -> None:
    """Render the "About GRM Score" section.

    Writes the section heading, then every entry of ``OVERVIEW_BLOCKS`` as
    markdown, then every entry of ``FORMULAS`` wrapped in a
    ``<p class="formula-line">`` element rendered as raw HTML.
    """
    st.markdown("## About GRM Score")
    for overview_text in OVERVIEW_BLOCKS:
        st.markdown(overview_text)
    for expression in FORMULAS:
        formula_html = f'<p class="formula-line">{expression}</p>'
        st.markdown(formula_html, unsafe_allow_html=True)
def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render the headline statistics as a row of five metric widgets.

    Args:
        include_closed: Forwarded to ``score_stats``.
        parameter_range: Forwarded to ``score_stats``.

    Raises:
        ValueError: From ``zip(..., strict=True)`` when ``score_stats`` does
            not yield exactly five label/value pairs.
    """
    headline_figures = score_stats(
        include_closed=include_closed,
        parameter_range=parameter_range,
    )
    metric_slots = st.columns(5)
    for slot, entry in zip(metric_slots, headline_figures.items(), strict=True):
        title, figure = entry
        slot.metric(title, figure)
def render_score_controls() -> dict:
    """Render the leaderboard filter widgets and return their current values.

    Two rows of widgets are drawn: model visibility, score view, category
    and search on the first; source, domain, parameter-class range and the
    "Filtered score" toggle on the second.

    Returns:
        A dict with the keys ``include_closed`` (True when "All models" is
        selected), ``view``, ``category_label``, ``search``, ``sources``,
        ``domains``, ``parameter_range`` and ``recalculate_visible``.
    """
    top = st.columns([1.2, 1.2, 1.2, 1.4])
    model_visibility = top[0].radio(
        "Model visibility",
        ["All models", "Open-source only"],
        index=1,
        horizontal=True,
        key="leaderboard_model_visibility",
    )
    view = top[1].radio(
        "Score view",
        [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX],
        index=2,
        horizontal=True,
        key="leaderboard_score_view",
    )
    category_label = top[2].selectbox("Category", category_options(), key="leaderboard_category")
    search = top[3].text_input("Search", placeholder="Model or benchmark", key="leaderboard_search")

    filters = st.columns([1.5, 1.8, 1.8, 1.2])
    sources = filters[0].multiselect("Source", available_sources(), placeholder="All sources", key="leaderboard_source")
    domains = filters[1].multiselect("Domain", available_domains(), placeholder="All domains", key="leaderboard_domain")

    minimum_size, maximum_size = parameter_bounds()
    # The slider rejects a default outside [min_value, max_value]. Clamp both
    # ends of the preferred 0.0-4.0 default into the reported bounds, so a
    # minimum above 0.0 no longer makes the widget raise.
    default_low = max(0.0, minimum_size)
    default_high = max(default_low, min(4.0, maximum_size))
    parameter_range = filters[2].slider(
        "Model parameter class (B)",
        min_value=minimum_size,
        max_value=maximum_size,
        value=(default_low, default_high),
        step=0.5,
        format="%.1fB",
        key="leaderboard_parameter_range_v2",
        help="Use 0.0B for <0.5B class. The top end includes 120B+ models.",
    )
    recalculate_visible = filters[3].checkbox("Filtered score", value=False, key="leaderboard_filtered_score")

    return {
        "include_closed": model_visibility == "All models",
        "view": view,
        "category_label": category_label,
        "search": search,
        "sources": sources,
        "domains": domains,
        "parameter_range": parameter_range,
        "recalculate_visible": recalculate_visible,
    }
def render_score_explorer() -> None:
    """Render the Leaderboard tab.

    Draws the heading and caption, the filter controls, the headline stats,
    the score table built from the active filters, and finally a two-column
    area with the model detail on the left and the benchmark detail on the
    right. In the matrix view a caption reports how many benchmark columns
    the filters left visible.
    """
    st.markdown("## Leaderboard")
    st.caption(
        "Current score values are static PRD-backed values with TBD entries shown as missing. "
        "Filtered score recalculates an exploratory score from visible benchmarks only."
    )

    controls = render_score_controls()
    include_closed = controls["include_closed"]
    parameter_range = controls["parameter_range"]
    active_view = controls["view"]

    selected_category = category_from_label(controls["category_label"])
    benchmarks = filter_benchmarks(
        category=selected_category,
        sources=controls["sources"],
        domains=controls["domains"],
        search=controls["search"],
    )

    render_stats(include_closed=include_closed, parameter_range=parameter_range)

    score_table = build_score_explorer_frame(
        view=active_view,
        category_label=controls["category_label"],
        benchmarks=benchmarks,
        include_closed=include_closed,
        recalculate_visible=controls["recalculate_visible"],
        parameter_range=parameter_range,
    )
    render_dataframe(score_table, width="stretch", hide_index=True)

    if active_view == VIEW_MATRIX:
        st.caption(f"Showing {len(benchmarks)} benchmark columns from the active filters.")

    model_column, benchmark_column = st.columns(2)
    with model_column:
        render_model_detail(include_closed, parameter_range)
    with benchmark_column:
        render_benchmark_detail(benchmarks, key="leaderboard_benchmark_detail")
def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render the per-model detail picker and its tables.

    Nothing is drawn when ``model_options`` returns no models for the given
    filters. Otherwise a select box chooses a model, its detail frame is
    shown, and two tabs list its benchmark scores with ``strongest=True``
    and ``strongest=False`` respectively.

    Args:
        include_closed: Forwarded to ``model_options``.
        parameter_range: Forwarded to ``model_options``.
    """
    models = model_options(include_closed=include_closed, parameter_range=parameter_range)
    if not models:
        return

    selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail")
    table_options = {"width": "stretch", "hide_index": True}
    render_dataframe(build_model_detail_frame(selected_model), **table_options)

    strongest_tab, weakest_tab = st.tabs(["Strongest", "Weakest"])
    for tab, want_strongest in ((strongest_tab, True), (weakest_tab, False)):
        with tab:
            render_dataframe(
                build_model_benchmark_scores(selected_model, strongest=want_strongest),
                **table_options,
            )
def render_benchmark_detail(benchmarks: list[dict], key: str) -> None:
    """Render a benchmark picker and the detail panel for the chosen entry.

    Args:
        benchmarks: Benchmarks used to build the select-box options. When
            they produce no options, the options built from
            ``filter_benchmarks()`` called with no arguments are used.
        key: Streamlit widget key for the select box.

    Nothing is drawn when there are no options at all, or when the selected
    name cannot be resolved by ``find_benchmark_by_name``.
    """
    options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks())
    if not options:
        return

    selected_name = st.selectbox("Benchmark detail", options, key=key)
    benchmark = find_benchmark_by_name(selected_name)
    if benchmark is None:
        return

    # The separator is written as the ASCII entity &middot; so it cannot be
    # corrupted by a wrong file encoding (it previously appeared as the
    # mojibake character produced by mis-decoding the UTF-8 middle dot).
    panel_html = (
        "\n"
        '<div class="detail-panel">\n'
        f'<div class="detail-kicker">{benchmark["source"]} &middot; weight {benchmark["calc_weight"]}</div>\n'
        f'<h3>{benchmark["name"]}</h3>\n'
        f'<p>{benchmark["description"]}</p>\n'
        f'<p>{benchmark["summary"]}</p>\n'
        "</div>\n"
    )
    st.markdown(panel_html, unsafe_allow_html=True)

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    if benchmark.get("detection_scope"):
        st.markdown("#### Detection Scope")
        render_dataframe(
            benchmark["detection_scope"],
            width="stretch",
            hide_index=True,
        )

    with st.expander("Methodology", expanded=False):
        st.write(benchmark["methodology"])
        # Both messages are complete sentences; the positive one used to end
        # with a dangling colon.
        if benchmark["included_in_grm"]:
            st.write("Included in official GRM Score.")
        else:
            st.write("Not included in official GRM Score.")
def render_benchmark_summary_card(benchmark: dict) -> None:
    """Render a summary card for one benchmark.

    The card shows the category display name, source and weight in the
    kicker line, followed by the name, description and summary. A paper
    link is added when the benchmark has a ``paper`` entry, and an expander
    holds the methodology plus the detection-scope table when present.

    Args:
        benchmark: Benchmark record; the keys ``category``, ``source``,
            ``calc_weight``, ``name``, ``description``, ``summary`` and
            ``methodology`` are read directly, ``paper`` and
            ``detection_scope`` are optional.
    """
    category_name = CATEGORY_DISPLAY[benchmark["category"]]
    # Category, source and weight are separated by &middot;. The category
    # and source used to run together with no separator, and the remaining
    # separator was a mojibake character; the ASCII entity avoids both.
    card_html = (
        "\n"
        '<div class="detail-panel">\n'
        f'<div class="detail-kicker">{category_name} &middot; {benchmark["source"]}'
        f' &middot; weight {benchmark["calc_weight"]}</div>\n'
        f'<h3>{benchmark["name"]}</h3>\n'
        f'<p>{benchmark["description"]}</p>\n'
        f'<p>{benchmark["summary"]}</p>\n'
        "</div>\n"
    )
    st.markdown(card_html, unsafe_allow_html=True)

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    with st.expander(f"{benchmark['name']} methodology and scope", expanded=False):
        st.write(benchmark["methodology"])
        if benchmark.get("detection_scope"):
            render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True)
def render_grm_methodology() -> None:
    """Render the GRM-Bench methodology section.

    After the heading and introduction, every benchmark returned by
    ``filter_benchmarks`` for the GRM-Bench source (non-scored ones
    included) gets a subsection with its summary, test methodology and,
    when present, detection-scope table. The benchmark whose id is
    ``grm_coherence`` additionally lists its samples, one expander each,
    showing the sample metadata and its reformatted code.
    """
    st.markdown("## GRM-Bench Methodology")
    st.markdown(
        "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, "
        "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below "
        "preserve the authored benchmark methodology, failure modes, and representative examples."
    )

    for entry in filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True):
        heading = entry["name"].replace("GRM - ", "")
        st.markdown(f"### {heading}")
        st.write(entry["summary"])

        st.markdown("#### Test Methodology")
        st.write(entry["methodology"])

        if entry.get("detection_scope"):
            st.markdown("#### Detection Scope")
            render_dataframe(entry["detection_scope"], width="stretch", hide_index=True)

        # Only the coherence benchmark carries representative samples.
        if entry["id"] != "grm_coherence":
            continue

        st.markdown("#### Representative Samples")
        for sample in entry.get("samples", []):
            with st.expander(sample["id"], expanded=False):
                for field_name, field_value in sample["metadata"]:
                    st.markdown(f"**{field_name}:** {field_value}")
                pretty_code = format_sample_code(sample["code"])
                st.code(pretty_code, language="json", wrap_lines=True)
def render_benchmark_library() -> None:
    """Render the Benchmark Library tab.

    Draws the heading and caption, a row of library filters (category,
    source, domain, non-scored toggle) plus a search box, the benchmark
    registry table for the filtered benchmarks, and then the GRM-Bench
    methodology section.
    """
    st.markdown("## Benchmark Library")
    st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.")

    category_slot, source_slot, domain_slot, toggle_slot = st.columns([1.2, 1.4, 1.8, 1.2])
    category_label = category_slot.selectbox("Library category", category_options(), key="library_category")
    sources = source_slot.multiselect("Library source", available_sources(), placeholder="All sources", key="library_source")
    domains = domain_slot.multiselect("Library domain", available_domains(), placeholder="All domains", key="library_domain")
    include_non_scored = toggle_slot.checkbox("Show non-scored", value=True, key="library_show_non_scored")
    search = st.text_input("Benchmark library search", placeholder="Benchmark, domain, description", key="library_search")

    benchmarks = filter_benchmarks(
        category=category_from_label(category_label),
        sources=sources,
        domains=domains,
        search=search,
        include_non_scored=include_non_scored,
    )

    registry_frame = build_benchmark_registry_frame(benchmarks)
    registry_columns = {
        "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"),
        "Summary": st.column_config.TextColumn("Summary", width="large"),
        "Description": st.column_config.TextColumn("Description", width="large"),
    }
    render_dataframe(
        registry_frame,
        width="stretch",
        hide_index=True,
        column_config=registry_columns,
    )

    render_grm_methodology()
# Page body: the About section first, then the two main tabs.
render_about_grm()

leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"])
# Pair each tab container with the function that fills it, in display order.
for _tab, _render_tab in (
    (leaderboard_tab, render_score_explorer),
    (library_tab, render_benchmark_library),
):
    with _tab:
        _render_tab()