Spaces:

nvidia
/

GRM

Running on CPU Upgrade

File size: 12,271 Bytes

5c49242

"""Streamlit implementation of the GRM Leaderboard Space."""

from textwrap import dedent

import streamlit as st

from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH
from data_views import (
    VIEW_CATEGORY,
    VIEW_MATRIX,
    VIEW_SUMMARY,
    available_domains,
    available_sources,
    benchmark_options,
    build_benchmark_registry_frame,
    build_model_benchmark_scores,
    build_model_detail_frame,
    build_score_explorer_frame,
    category_from_label,
    category_options,
    filter_benchmarks,
    find_benchmark_by_name,
    model_options,
    parameter_bounds,
    score_stats,
)
from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css


# Page-level configuration: browser tab title and wide layout.
st.set_page_config(
    page_title="GRM Score - Game Ready Leaderboard",
    layout="wide",
)

# Theme switch. Two columns are created with a 0.72 / 0.28 width ratio and the
# radio is placed in the second (narrower) one; the first is left empty.
theme_columns = st.columns([0.72, 0.28])
theme_mode = theme_columns[1].radio(
    "Theme",
    ["Night", "Day"],
    horizontal=True,
    key="grm_theme_mode",
)
# Module-level flag; themed_dataframe() reads it to decide whether to restyle
# tables for the Day theme.
light_mode = theme_mode == "Day"

# Inject the shared stylesheet, then the palette for the selected theme, then
# the page header markup. All three are raw HTML/CSS from ui_theme.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
st.markdown(palette_css(light_mode), unsafe_allow_html=True)
st.markdown(HEADER_HTML, unsafe_allow_html=True)


def themed_dataframe(data):
    """Return ``data`` restyled for the Day theme, or unchanged otherwise.

    Styling is applied only when the module-level ``light_mode`` flag is set
    and ``data`` exposes a ``style`` attribute; in every other case the
    argument is handed back untouched.
    """
    if light_mode and hasattr(data, "style"):
        # Light surface colours for body cells.
        cell_properties = {
            "background-color": "#ffffff",
            "color": "#15181b",
            "border-color": "#d9dee3",
        }
        # Slightly darker background for header cells.
        header_rule = {
            "selector": "th",
            "props": [
                ("background-color", "#eef1f4"),
                ("color", "#15181b"),
                ("border-color", "#d9dee3"),
            ],
        }
        styled = data.style.set_properties(**cell_properties)
        return styled.set_table_styles([header_rule])

    return data


def render_dataframe(data, **kwargs) -> None:
    """Display ``data`` with ``st.dataframe`` after applying theme styling.

    Any keyword arguments are forwarded to ``st.dataframe`` unchanged.
    """
    prepared = themed_dataframe(data)
    st.dataframe(prepared, **kwargs)


def format_sample_code(code: str) -> str:
    """Dedent and strip ``code``, then halve each line's leading indentation.

    The leading whitespace of every line is replaced by half as many spaces
    (integer division). Lines that are empty after removing leading
    whitespace become empty strings.
    """

    def _halve_indent(raw_line: str) -> str:
        body = raw_line.lstrip()
        if not body:
            return ""
        indent_width = len(raw_line) - len(body)
        return " " * (indent_width // 2) + body

    cleaned = dedent(code).strip()
    return "\n".join(_halve_indent(raw_line) for raw_line in cleaned.splitlines())


def render_about_grm() -> None:
    """Render the "About GRM Score" heading, overview blocks, and formulas.

    Overview blocks are emitted as markdown; each formula is wrapped in a
    ``formula-line`` paragraph and emitted as raw HTML.
    """
    st.markdown("## About GRM Score")

    for overview_text in OVERVIEW_BLOCKS:
        st.markdown(overview_text)

    for expression in FORMULAS:
        formula_html = f'<p class="formula-line">{expression}</p>'
        st.markdown(formula_html, unsafe_allow_html=True)


def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Show the values returned by ``score_stats`` as a row of five metrics.

    The pairing of columns and stats is strict, so ``zip`` raises
    ``ValueError`` if ``score_stats`` does not yield exactly five entries.
    """
    metrics = score_stats(include_closed=include_closed, parameter_range=parameter_range)
    slots = st.columns(5)
    for slot, entry in zip(slots, metrics.items(), strict=True):
        label, value = entry
        slot.metric(label, value)


def render_score_controls() -> dict:
    """Draw the leaderboard filter widgets and return their current values.

    The first row holds model visibility, score view, category and search;
    the second row holds source, domain, the parameter-size slider and the
    "Filtered score" checkbox. Defaults are "Open-source only" visibility and
    the third score view option (``VIEW_MATRIX``).

    Returns:
        A dict with the keys ``include_closed``, ``view``, ``category_label``,
        ``search``, ``sources``, ``domains``, ``parameter_range`` and
        ``recalculate_visible``.
    """
    visibility_col, view_col, category_col, search_col = st.columns([1.2, 1.2, 1.2, 1.4])
    visibility_choice = visibility_col.radio(
        "Model visibility",
        ["All models", "Open-source only"],
        index=1,
        horizontal=True,
        key="leaderboard_model_visibility",
    )
    score_view = view_col.radio(
        "Score view",
        [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX],
        index=2,
        horizontal=True,
        key="leaderboard_score_view",
    )
    chosen_category = category_col.selectbox(
        "Category",
        category_options(),
        key="leaderboard_category",
    )
    search_text = search_col.text_input(
        "Search",
        placeholder="Model or benchmark",
        key="leaderboard_search",
    )

    source_col, domain_col, size_col, toggle_col = st.columns([1.5, 1.8, 1.8, 1.2])
    picked_sources = source_col.multiselect(
        "Source",
        available_sources(),
        placeholder="All sources",
        key="leaderboard_source",
    )
    picked_domains = domain_col.multiselect(
        "Domain",
        available_domains(),
        placeholder="All domains",
        key="leaderboard_domain",
    )

    smallest, largest = parameter_bounds()
    # Initial selection spans 0.0B up to 4.0B, capped at the largest bound.
    default_span = (0.0, min(4.0, largest))
    size_span = size_col.slider(
        "Model parameter class (B)",
        min_value=smallest,
        max_value=largest,
        value=default_span,
        step=0.5,
        format="%.1fB",
        key="leaderboard_parameter_range_v2",
        help="Use 0.0B for <0.5B class. The top end includes 120B+ models.",
    )
    use_filtered_score = toggle_col.checkbox(
        "Filtered score",
        value=False,
        key="leaderboard_filtered_score",
    )

    return dict(
        include_closed=visibility_choice == "All models",
        view=score_view,
        category_label=chosen_category,
        search=search_text,
        sources=picked_sources,
        domains=picked_domains,
        parameter_range=size_span,
        recalculate_visible=use_filtered_score,
    )


def render_score_explorer() -> None:
    """Render the Leaderboard tab: controls, stats, score table, and details.

    The widget values from ``render_score_controls`` drive the benchmark
    filter, the stats row, and the score table. Below the table, model detail
    and benchmark detail are shown side by side.
    """
    st.markdown("## Leaderboard")
    st.caption(
        "Current score values are static PRD-backed values with TBD entries shown as missing. "
        "Filtered score recalculates an exploratory score from visible benchmarks only."
    )

    state = render_score_controls()
    picked_label = state["category_label"]
    active_view = state["view"]
    show_closed = state["include_closed"]
    size_span = state["parameter_range"]

    visible_benchmarks = filter_benchmarks(
        category=category_from_label(picked_label),
        sources=state["sources"],
        domains=state["domains"],
        search=state["search"],
    )

    render_stats(include_closed=show_closed, parameter_range=size_span)

    score_table = build_score_explorer_frame(
        view=active_view,
        category_label=picked_label,
        benchmarks=visible_benchmarks,
        include_closed=show_closed,
        recalculate_visible=state["recalculate_visible"],
        parameter_range=size_span,
    )
    render_dataframe(score_table, width="stretch", hide_index=True)

    # The column-count note is only shown for the matrix view.
    if active_view == VIEW_MATRIX:
        st.caption(f"Showing {len(visible_benchmarks)} benchmark columns from the active filters.")

    model_pane, benchmark_pane = st.columns(2)
    with model_pane:
        render_model_detail(show_closed, size_span)
    with benchmark_pane:
        render_benchmark_detail(visible_benchmarks, key="leaderboard_benchmark_detail")


def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render a model picker, its detail table, and strongest/weakest tabs.

    Nothing is drawn when ``model_options`` returns no models for the given
    visibility and parameter range.
    """
    candidates = model_options(include_closed=include_closed, parameter_range=parameter_range)
    if not candidates:
        return

    chosen = st.selectbox("Model detail", candidates, key="leaderboard_model_detail")
    detail_table = build_model_detail_frame(chosen)
    render_dataframe(detail_table, width="stretch", hide_index=True)

    # One tab per ranking direction: strongest first, then weakest.
    tab_panes = st.tabs(["Strongest", "Weakest"])
    for pane, want_strongest in zip(tab_panes, (True, False)):
        with pane:
            render_dataframe(
                build_model_benchmark_scores(chosen, strongest=want_strongest),
                width="stretch",
                hide_index=True,
            )


def render_benchmark_detail(benchmarks: list[dict], key: str) -> None:
    """Render a benchmark picker and the detail panel for the selection.

    Args:
        benchmarks: Benchmarks offered in the picker. When they yield no
            options, the picker falls back to the options built from
            ``filter_benchmarks()`` called with no arguments.
        key: Streamlit widget key for the select box.

    Nothing is drawn when there are no options at all, or when the selected
    name cannot be resolved by ``find_benchmark_by_name``.
    """
    options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks())
    if not options:
        return

    selected_name = st.selectbox("Benchmark detail", options, key=key)
    benchmark = find_benchmark_by_name(selected_name)
    if benchmark is None:
        return

    # Every child element is indented by two spaces, matching the summary
    # card. A line indented four or more spaces after a blank line is parsed
    # by markdown as an indented code block, which would show the kicker's
    # HTML as literal text instead of rendering it.
    st.markdown(
        f"""

<div class="detail-panel">

  <div class="detail-kicker">{benchmark['source']} · weight {benchmark['calc_weight']}</div>

  <h3>{benchmark['name']}</h3>

  <p>{benchmark['description']}</p>

  <p>{benchmark['summary']}</p>

</div>

""",
        unsafe_allow_html=True,
    )

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    if benchmark.get("detection_scope"):
        st.markdown("#### Detection Scope")
        render_dataframe(
            benchmark["detection_scope"],
            width="stretch",
            hide_index=True,
        )

    with st.expander("Methodology", expanded=False):
        st.write(benchmark["methodology"])
        # Both status messages are complete sentences ending in a period; the
        # positive one previously ended in a colon with nothing following it.
        st.write(
            "Included in official GRM Score."
            if benchmark["included_in_grm"]
            else "Not included in official GRM Score."
        )


def render_benchmark_summary_card(benchmark: dict) -> None:
    """Render a summary card for one benchmark.

    The card shows a kicker line (category display name, source, weight),
    the name, description and summary. A paper link follows when the
    benchmark has a ``paper`` value, and an expander holds the methodology
    plus the detection scope table when one is present.
    """
    kicker = (
        f"{CATEGORY_DISPLAY[benchmark['category']]} · {benchmark['source']} "
        f"· weight {benchmark['calc_weight']}"
    )
    card_html = f"""

<div class="detail-panel">

  <div class="detail-kicker">{kicker}</div>

  <h3>{benchmark['name']}</h3>

  <p>{benchmark['description']}</p>

  <p>{benchmark['summary']}</p>

</div>

"""
    st.markdown(card_html, unsafe_allow_html=True)

    paper_url = benchmark.get("paper")
    if paper_url:
        st.markdown(f"[Paper / Source]({paper_url})")

    expander_title = f"{benchmark['name']} methodology and scope"
    with st.expander(expander_title, expanded=False):
        st.write(benchmark["methodology"])
        scope = benchmark.get("detection_scope")
        if scope:
            render_dataframe(scope, width="stretch", hide_index=True)


def render_grm_methodology() -> None:
    """Render the methodology write-up for every GRM-Bench benchmark.

    For each benchmark whose source is ``SOURCE_GRM_BENCH`` (non-scored ones
    included) this shows the name with any "GRM - " text removed, the
    summary, the methodology, and the detection scope table when present.
    Representative samples are listed only for the benchmark whose id is
    ``grm_coherence``.
    """
    st.markdown("## GRM-Bench Methodology")
    intro = (
        "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, "
        "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below "
        "preserve the authored benchmark methodology, failure modes, and representative examples."
    )
    st.markdown(intro)

    suite = filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True)
    for entry in suite:
        heading = entry["name"].replace("GRM - ", "")
        st.markdown(f"### {heading}")
        st.write(entry["summary"])
        st.markdown("#### Test Methodology")
        st.write(entry["methodology"])

        scope = entry.get("detection_scope")
        if scope:
            st.markdown("#### Detection Scope")
            render_dataframe(scope, width="stretch", hide_index=True)

        # Only the coherence benchmark carries a samples section.
        if entry["id"] != "grm_coherence":
            continue

        st.markdown("#### Representative Samples")
        for sample in entry.get("samples", []):
            with st.expander(sample["id"], expanded=False):
                for label, value in sample["metadata"]:
                    st.markdown(f"**{label}:** {value}")
                snippet = format_sample_code(sample["code"])
                st.code(snippet, language="json", wrap_lines=True)


def render_benchmark_library() -> None:
    """Render the Benchmark Library tab.

    Shows category/source/domain/non-scored filters and a search box, then
    the registry table for the matching benchmarks, followed by the
    GRM-Bench methodology section.
    """
    st.markdown("## Benchmark Library")
    st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.")

    category_col, source_col, domain_col, toggle_col = st.columns([1.2, 1.4, 1.8, 1.2])
    picked_label = category_col.selectbox(
        "Library category",
        category_options(),
        key="library_category",
    )
    picked_sources = source_col.multiselect(
        "Library source",
        available_sources(),
        placeholder="All sources",
        key="library_source",
    )
    picked_domains = domain_col.multiselect(
        "Library domain",
        available_domains(),
        placeholder="All domains",
        key="library_domain",
    )
    show_non_scored = toggle_col.checkbox(
        "Show non-scored",
        value=True,
        key="library_show_non_scored",
    )
    search_text = st.text_input(
        "Benchmark library search",
        placeholder="Benchmark, domain, description",
        key="library_search",
    )

    matches = filter_benchmarks(
        category=category_from_label(picked_label),
        sources=picked_sources,
        domains=picked_domains,
        search=search_text,
        include_non_scored=show_non_scored,
    )
    registry_table = build_benchmark_registry_frame(matches)
    # Link and wide-text column settings for the registry table.
    registry_columns = {
        "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"),
        "Summary": st.column_config.TextColumn("Summary", width="large"),
        "Description": st.column_config.TextColumn("Description", width="large"),
    }
    render_dataframe(
        registry_table,
        width="stretch",
        hide_index=True,
        column_config=registry_columns,
    )

    render_grm_methodology()


# Script body: the About section is rendered first, above the tabs.
render_about_grm()

# Two top-level tabs; each tab's content is drawn by its own render function.
leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"])

with leaderboard_tab:
    render_score_explorer()

with library_tab:
    render_benchmark_library()