"""Streamlit implementation of the GRM Leaderboard Space.""" from textwrap import dedent import streamlit as st from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH from data_views import ( VIEW_CATEGORY, VIEW_MATRIX, VIEW_SUMMARY, available_domains, available_sources, benchmark_options, build_benchmark_registry_frame, build_model_benchmark_scores, build_model_detail_frame, build_score_explorer_frame, category_from_label, category_options, filter_benchmarks, find_benchmark_by_name, model_options, parameter_bounds, score_stats, ) from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css st.set_page_config( page_title="GRM Score - Game Ready Leaderboard", layout="wide", ) theme_columns = st.columns([0.72, 0.28]) theme_mode = theme_columns[1].radio( "Theme", ["Night", "Day"], horizontal=True, key="grm_theme_mode", ) light_mode = theme_mode == "Day" st.markdown(CUSTOM_CSS, unsafe_allow_html=True) st.markdown(palette_css(light_mode), unsafe_allow_html=True) st.markdown(HEADER_HTML, unsafe_allow_html=True) def themed_dataframe(data): if not light_mode or not hasattr(data, "style"): return data return data.style.set_properties( **{ "background-color": "#ffffff", "color": "#15181b", "border-color": "#d9dee3", } ).set_table_styles( [ { "selector": "th", "props": [ ("background-color", "#eef1f4"), ("color", "#15181b"), ("border-color", "#d9dee3"), ], } ] ) def render_dataframe(data, **kwargs) -> None: st.dataframe(themed_dataframe(data), **kwargs) def format_sample_code(code: str) -> str: lines = dedent(code).strip().splitlines() formatted_lines = [] for line in lines: stripped = line.lstrip() leading_spaces = len(line) - len(stripped) normalized_indent = " " * (leading_spaces // 2) formatted_lines.append(f"{normalized_indent}{stripped}" if stripped else "") return "\n".join(formatted_lines) def render_about_grm() -> None: st.markdown("## About GRM Score") for block in OVERVIEW_BLOCKS: st.markdown(block) for formula in FORMULAS: st.markdown(f'

{formula}

', unsafe_allow_html=True) def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None: stats = score_stats(include_closed=include_closed, parameter_range=parameter_range) columns = st.columns(5) for column, (label, value) in zip(columns, stats.items(), strict=True): column.metric(label, value) def render_score_controls() -> dict: top = st.columns([1.2, 1.2, 1.2, 1.4]) model_visibility = top[0].radio( "Model visibility", ["All models", "Open-source only"], index=1, horizontal=True, key="leaderboard_model_visibility", ) view = top[1].radio( "Score view", [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX], index=2, horizontal=True, key="leaderboard_score_view", ) category_label = top[2].selectbox("Category", category_options(), key="leaderboard_category") search = top[3].text_input("Search", placeholder="Model or benchmark", key="leaderboard_search") filters = st.columns([1.5, 1.8, 1.8, 1.2]) sources = filters[0].multiselect("Source", available_sources(), placeholder="All sources", key="leaderboard_source") domains = filters[1].multiselect("Domain", available_domains(), placeholder="All domains", key="leaderboard_domain") minimum_size, maximum_size = parameter_bounds() parameter_range = filters[2].slider( "Model parameter class (B)", min_value=minimum_size, max_value=maximum_size, value=(0.0, min(4.0, maximum_size)), step=0.5, format="%.1fB", key="leaderboard_parameter_range_v2", help="Use 0.0B for <0.5B class. The top end includes 120B+ models.", ) recalculate_visible = filters[3].checkbox("Filtered score", value=False, key="leaderboard_filtered_score") return { "include_closed": model_visibility == "All models", "view": view, "category_label": category_label, "search": search, "sources": sources, "domains": domains, "parameter_range": parameter_range, "recalculate_visible": recalculate_visible, } def render_score_explorer() -> None: st.markdown("## Leaderboard") st.caption( "Current score values are static PRD-backed values with TBD entries shown as missing. " "Filtered score recalculates an exploratory score from visible benchmarks only." ) controls = render_score_controls() selected_category = category_from_label(controls["category_label"]) benchmarks = filter_benchmarks( category=selected_category, sources=controls["sources"], domains=controls["domains"], search=controls["search"], ) render_stats( include_closed=controls["include_closed"], parameter_range=controls["parameter_range"], ) frame = build_score_explorer_frame( view=controls["view"], category_label=controls["category_label"], benchmarks=benchmarks, include_closed=controls["include_closed"], recalculate_visible=controls["recalculate_visible"], parameter_range=controls["parameter_range"], ) render_dataframe(frame, width="stretch", hide_index=True) if controls["view"] == VIEW_MATRIX: st.caption(f"Showing {len(benchmarks)} benchmark columns from the active filters.") detail_cols = st.columns(2) with detail_cols[0]: render_model_detail(controls["include_closed"], controls["parameter_range"]) with detail_cols[1]: render_benchmark_detail(benchmarks, key="leaderboard_benchmark_detail") def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None: models = model_options(include_closed=include_closed, parameter_range=parameter_range) if not models: return selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail") render_dataframe(build_model_detail_frame(selected_model), width="stretch", hide_index=True) tabs = st.tabs(["Strongest", "Weakest"]) with tabs[0]: render_dataframe( build_model_benchmark_scores(selected_model, strongest=True), width="stretch", hide_index=True, ) with tabs[1]: render_dataframe( build_model_benchmark_scores(selected_model, strongest=False), width="stretch", hide_index=True, ) def render_benchmark_detail(benchmarks: list[dict], key: str) -> None: options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks()) if not options: return selected_name = st.selectbox("Benchmark detail", options, key=key) benchmark = find_benchmark_by_name(selected_name) if benchmark is None: return st.markdown( f"""
{benchmark['source']} · weight {benchmark['calc_weight']}

{benchmark['name']}

{benchmark['description']}

{benchmark['summary']}

""", unsafe_allow_html=True, ) if benchmark.get("paper"): st.markdown(f"[Paper / Source]({benchmark['paper']})") if benchmark.get("detection_scope"): st.markdown("#### Detection Scope") render_dataframe( benchmark["detection_scope"], width="stretch", hide_index=True, ) with st.expander("Methodology", expanded=False): st.write(benchmark["methodology"]) st.write("Included in official GRM Score:" if benchmark["included_in_grm"] else "Not included in official GRM Score.") def render_benchmark_summary_card(benchmark: dict) -> None: st.markdown( f"""
{CATEGORY_DISPLAY[benchmark['category']]} · {benchmark['source']} · weight {benchmark['calc_weight']}

{benchmark['name']}

{benchmark['description']}

{benchmark['summary']}

""", unsafe_allow_html=True, ) if benchmark.get("paper"): st.markdown(f"[Paper / Source]({benchmark['paper']})") with st.expander(f"{benchmark['name']} methodology and scope", expanded=False): st.write(benchmark["methodology"]) if benchmark.get("detection_scope"): render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True) def render_grm_methodology() -> None: st.markdown("## GRM-Bench Methodology") st.markdown( "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, " "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below " "preserve the authored benchmark methodology, failure modes, and representative examples." ) grm_benchmarks = filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True) for benchmark in grm_benchmarks: st.markdown(f"### {benchmark['name'].replace('GRM - ', '')}") st.write(benchmark["summary"]) st.markdown("#### Test Methodology") st.write(benchmark["methodology"]) if benchmark.get("detection_scope"): st.markdown("#### Detection Scope") render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True) if benchmark["id"] == "grm_coherence": st.markdown("#### Representative Samples") for sample in benchmark.get("samples", []): with st.expander(sample["id"], expanded=False): for label, value in sample["metadata"]: st.markdown(f"**{label}:** {value}") st.code(format_sample_code(sample["code"]), language="json", wrap_lines=True) def render_benchmark_library() -> None: st.markdown("## Benchmark Library") st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.") filter_cols = st.columns([1.2, 1.4, 1.8, 1.2]) category_label = filter_cols[0].selectbox("Library category", category_options(), key="library_category") sources = filter_cols[1].multiselect("Library source", available_sources(), placeholder="All sources", key="library_source") domains = filter_cols[2].multiselect("Library domain", available_domains(), placeholder="All domains", key="library_domain") include_non_scored = filter_cols[3].checkbox("Show non-scored", value=True, key="library_show_non_scored") search = st.text_input("Benchmark library search", placeholder="Benchmark, domain, description", key="library_search") benchmarks = filter_benchmarks( category=category_from_label(category_label), sources=sources, domains=domains, search=search, include_non_scored=include_non_scored, ) render_dataframe( build_benchmark_registry_frame(benchmarks), width="stretch", hide_index=True, column_config={ "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"), "Summary": st.column_config.TextColumn("Summary", width="large"), "Description": st.column_config.TextColumn("Description", width="large"), }, ) render_grm_methodology() render_about_grm() leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"]) with leaderboard_tab: render_score_explorer() with library_tab: render_benchmark_library()