"""Streamlit implementation of the GRM Leaderboard Space.""" from textwrap import dedent import streamlit as st from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH from data_views import ( VIEW_CATEGORY, VIEW_MATRIX, VIEW_SUMMARY, available_domains, available_sources, benchmark_options, build_benchmark_registry_frame, build_model_benchmark_scores, build_model_detail_frame, build_score_explorer_frame, category_from_label, category_options, filter_benchmarks, find_benchmark_by_name, model_options, parameter_bounds, score_stats, ) from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css st.set_page_config( page_title="GRM Score - Game Ready Leaderboard", layout="wide", ) theme_columns = st.columns([0.72, 0.28]) theme_mode = theme_columns[1].radio( "Theme", ["Night", "Day"], horizontal=True, key="grm_theme_mode", ) light_mode = theme_mode == "Day" st.markdown(CUSTOM_CSS, unsafe_allow_html=True) st.markdown(palette_css(light_mode), unsafe_allow_html=True) st.markdown(HEADER_HTML, unsafe_allow_html=True) def themed_dataframe(data): if not light_mode or not hasattr(data, "style"): return data return data.style.set_properties( **{ "background-color": "#ffffff", "color": "#15181b", "border-color": "#d9dee3", } ).set_table_styles( [ { "selector": "th", "props": [ ("background-color", "#eef1f4"), ("color", "#15181b"), ("border-color", "#d9dee3"), ], } ] ) def render_dataframe(data, **kwargs) -> None: st.dataframe(themed_dataframe(data), **kwargs) def format_sample_code(code: str) -> str: lines = dedent(code).strip().splitlines() formatted_lines = [] for line in lines: stripped = line.lstrip() leading_spaces = len(line) - len(stripped) normalized_indent = " " * (leading_spaces // 2) formatted_lines.append(f"{normalized_indent}{stripped}" if stripped else "") return "\n".join(formatted_lines) def render_about_grm() -> None: st.markdown("## About GRM Score") for block in OVERVIEW_BLOCKS: st.markdown(block) for formula in FORMULAS: st.markdown(f'
{formula}
', unsafe_allow_html=True) def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None: stats = score_stats(include_closed=include_closed, parameter_range=parameter_range) columns = st.columns(5) for column, (label, value) in zip(columns, stats.items(), strict=True): column.metric(label, value) def render_score_controls() -> dict: top = st.columns([1.2, 1.2, 1.2, 1.4]) model_visibility = top[0].radio( "Model visibility", ["All models", "Open-source only"], index=1, horizontal=True, key="leaderboard_model_visibility", ) view = top[1].radio( "Score view", [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX], index=2, horizontal=True, key="leaderboard_score_view", ) category_label = top[2].selectbox("Category", category_options(), key="leaderboard_category") search = top[3].text_input("Search", placeholder="Model or benchmark", key="leaderboard_search") filters = st.columns([1.5, 1.8, 1.8, 1.2]) sources = filters[0].multiselect("Source", available_sources(), placeholder="All sources", key="leaderboard_source") domains = filters[1].multiselect("Domain", available_domains(), placeholder="All domains", key="leaderboard_domain") minimum_size, maximum_size = parameter_bounds() parameter_range = filters[2].slider( "Model parameter class (B)", min_value=minimum_size, max_value=maximum_size, value=(0.0, min(4.0, maximum_size)), step=0.5, format="%.1fB", key="leaderboard_parameter_range_v2", help="Use 0.0B for <0.5B class. The top end includes 120B+ models.", ) recalculate_visible = filters[3].checkbox("Filtered score", value=False, key="leaderboard_filtered_score") return { "include_closed": model_visibility == "All models", "view": view, "category_label": category_label, "search": search, "sources": sources, "domains": domains, "parameter_range": parameter_range, "recalculate_visible": recalculate_visible, } def render_score_explorer() -> None: st.markdown("## Leaderboard") st.caption( "Current score values are static PRD-backed values with TBD entries shown as missing. " "Filtered score recalculates an exploratory score from visible benchmarks only." ) controls = render_score_controls() selected_category = category_from_label(controls["category_label"]) benchmarks = filter_benchmarks( category=selected_category, sources=controls["sources"], domains=controls["domains"], search=controls["search"], ) render_stats( include_closed=controls["include_closed"], parameter_range=controls["parameter_range"], ) frame = build_score_explorer_frame( view=controls["view"], category_label=controls["category_label"], benchmarks=benchmarks, include_closed=controls["include_closed"], recalculate_visible=controls["recalculate_visible"], parameter_range=controls["parameter_range"], ) render_dataframe(frame, width="stretch", hide_index=True) if controls["view"] == VIEW_MATRIX: st.caption(f"Showing {len(benchmarks)} benchmark columns from the active filters.") detail_cols = st.columns(2) with detail_cols[0]: render_model_detail(controls["include_closed"], controls["parameter_range"]) with detail_cols[1]: render_benchmark_detail(benchmarks, key="leaderboard_benchmark_detail") def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None: models = model_options(include_closed=include_closed, parameter_range=parameter_range) if not models: return selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail") render_dataframe(build_model_detail_frame(selected_model), width="stretch", hide_index=True) tabs = st.tabs(["Strongest", "Weakest"]) with tabs[0]: render_dataframe( build_model_benchmark_scores(selected_model, strongest=True), width="stretch", hide_index=True, ) with tabs[1]: render_dataframe( build_model_benchmark_scores(selected_model, strongest=False), width="stretch", hide_index=True, ) def render_benchmark_detail(benchmarks: list[dict], key: str) -> None: options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks()) if not options: return selected_name = st.selectbox("Benchmark detail", options, key=key) benchmark = find_benchmark_by_name(selected_name) if benchmark is None: return st.markdown( f"""{benchmark['description']}
{benchmark['summary']}
{benchmark['description']}
{benchmark['summary']}