Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Streamlit implementation of the GRM Leaderboard Space.""" | |
| from textwrap import dedent | |
| import streamlit as st | |
| from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH | |
| from data_views import ( | |
| VIEW_CATEGORY, | |
| VIEW_MATRIX, | |
| VIEW_SUMMARY, | |
| available_domains, | |
| available_sources, | |
| benchmark_options, | |
| build_benchmark_registry_frame, | |
| build_model_benchmark_scores, | |
| build_model_detail_frame, | |
| build_score_explorer_frame, | |
| category_from_label, | |
| category_options, | |
| filter_benchmarks, | |
| find_benchmark_by_name, | |
| model_options, | |
| parameter_bounds, | |
| score_stats, | |
| ) | |
| from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css | |
# Configure the browser tab title and the wide layout before any widget is drawn.
st.set_page_config(page_title="GRM Score - Game Ready Leaderboard", layout="wide")

# The theme switch is placed in the narrow right-hand column of a two-column row.
theme_columns = st.columns([0.72, 0.28])
theme_mode = theme_columns[1].radio(
    label="Theme",
    options=["Night", "Day"],
    horizontal=True,
    key="grm_theme_mode",
)
# Only the "Day" choice switches the page (and the tables) to the light palette.
light_mode = theme_mode == "Day"

# Inject the shared stylesheet, then the palette for the chosen theme, then the header.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
st.markdown(palette_css(light_mode), unsafe_allow_html=True)
st.markdown(HEADER_HTML, unsafe_allow_html=True)
def themed_dataframe(data):
    """Return ``data`` restyled for the light theme, or unchanged otherwise.

    The light styling is applied only when the module-level ``light_mode`` flag
    is set and ``data`` exposes a ``style`` attribute; anything else is
    returned as-is.
    """
    if light_mode and hasattr(data, "style"):
        cell_properties = {
            "background-color": "#ffffff",
            "color": "#15181b",
            "border-color": "#d9dee3",
        }
        header_rule = {
            "selector": "th",
            "props": [
                ("background-color", "#eef1f4"),
                ("color", "#15181b"),
                ("border-color", "#d9dee3"),
            ],
        }
        styler = data.style.set_properties(**cell_properties)
        return styler.set_table_styles([header_rule])
    return data
def render_dataframe(data, **kwargs) -> None:
    """Show ``data`` via ``st.dataframe`` after passing it through ``themed_dataframe``.

    All keyword arguments are forwarded to ``st.dataframe`` unchanged.
    """
    styled = themed_dataframe(data)
    st.dataframe(styled, **kwargs)
def format_sample_code(code: str) -> str:
    """Dedent and trim ``code``, then halve the leading whitespace of each line.

    Each line keeps ``leading_whitespace_count // 2`` spaces of indentation;
    lines that contain only whitespace become empty strings. The lines are
    joined back with ``"\\n"``.
    """

    def _halve_indent(line: str) -> str:
        # Re-indent one line with half (floor) of its original leading whitespace.
        content = line.lstrip()
        if not content:
            return ""
        indent_width = (len(line) - len(content)) // 2
        return f"{' ' * indent_width}{content}"

    source_lines = dedent(code).strip().splitlines()
    return "\n".join(_halve_indent(line) for line in source_lines)
def render_about_grm() -> None:
    """Render the "About GRM Score" section: overview blocks, then formula lines.

    Every entry of ``OVERVIEW_BLOCKS`` is emitted as markdown; every entry of
    ``FORMULAS`` is wrapped in a ``<p class="formula-line">`` element.
    """
    st.markdown("## About GRM Score")
    for overview_text in OVERVIEW_BLOCKS:
        st.markdown(overview_text)
    for expression in FORMULAS:
        formula_html = f'<p class="formula-line">{expression}</p>'
        st.markdown(formula_html, unsafe_allow_html=True)
def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render one metric tile per entry returned by ``score_stats``.

    Args:
        include_closed: Forwarded to ``score_stats``.
        parameter_range: Forwarded to ``score_stats``.

    The number of columns follows the number of stats instead of being fixed
    at five, so adding or removing a stat no longer makes the strict ``zip``
    raise ``ValueError`` and take the page down. Nothing is rendered when
    there are no stats.
    """
    stats = score_stats(include_closed=include_closed, parameter_range=parameter_range)
    if not stats:
        # st.columns needs at least one column; there is nothing to show anyway.
        return
    columns = st.columns(len(stats))
    for column, (label, value) in zip(columns, stats.items(), strict=True):
        column.metric(label, value)
def render_score_controls() -> dict:
    """Render the leaderboard filter widgets and return their current values.

    Returns:
        A dict with the keys ``include_closed``, ``view``, ``category_label``,
        ``search``, ``sources``, ``domains``, ``parameter_range`` and
        ``recalculate_visible``.
    """
    top = st.columns([1.2, 1.2, 1.2, 1.4])
    model_visibility = top[0].radio(
        "Model visibility",
        ["All models", "Open-source only"],
        index=1,
        horizontal=True,
        key="leaderboard_model_visibility",
    )
    view = top[1].radio(
        "Score view",
        [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX],
        index=2,
        horizontal=True,
        key="leaderboard_score_view",
    )
    category_label = top[2].selectbox("Category", category_options(), key="leaderboard_category")
    search = top[3].text_input("Search", placeholder="Model or benchmark", key="leaderboard_search")

    filters = st.columns([1.5, 1.8, 1.8, 1.2])
    sources = filters[0].multiselect(
        "Source", available_sources(), placeholder="All sources", key="leaderboard_source"
    )
    domains = filters[1].multiselect(
        "Domain", available_domains(), placeholder="All domains", key="leaderboard_domain"
    )

    # The slider needs min_value, max_value, value and step to share one numeric
    # type; step is a float, so coerce the bounds to float as well.
    minimum_size, maximum_size = (float(bound) for bound in parameter_bounds())
    # Keep the default selection inside the slider bounds. The previous
    # hard-coded (0.0, min(4.0, max)) is rejected by Streamlit as soon as the
    # lower bound is above 0.
    default_low = min(max(0.0, minimum_size), maximum_size)
    default_high = max(default_low, min(4.0, maximum_size))
    parameter_range = filters[2].slider(
        "Model parameter class (B)",
        min_value=minimum_size,
        max_value=maximum_size,
        value=(default_low, default_high),
        step=0.5,
        format="%.1fB",
        key="leaderboard_parameter_range_v2",
        help="Use 0.0B for <0.5B class. The top end includes 120B+ models.",
    )
    recalculate_visible = filters[3].checkbox(
        "Filtered score", value=False, key="leaderboard_filtered_score"
    )

    return {
        "include_closed": model_visibility == "All models",
        "view": view,
        "category_label": category_label,
        "search": search,
        "sources": sources,
        "domains": domains,
        "parameter_range": parameter_range,
        "recalculate_visible": recalculate_visible,
    }
def render_score_explorer() -> None:
    """Render the leaderboard: controls, stats, score table, and detail panels.

    The control values select the benchmarks through ``filter_benchmarks`` and
    drive the frame built by ``build_score_explorer_frame``. Model and
    benchmark detail panels are rendered side by side below the table.
    """
    st.markdown("## Leaderboard")
    st.caption(
        "Current score values are static PRD-backed values with TBD entries shown as missing. "
        "Filtered score recalculates an exploratory score from visible benchmarks only."
    )

    controls = render_score_controls()
    include_closed = controls["include_closed"]
    parameter_range = controls["parameter_range"]
    score_view = controls["view"]
    category_label = controls["category_label"]

    visible_benchmarks = filter_benchmarks(
        category=category_from_label(category_label),
        sources=controls["sources"],
        domains=controls["domains"],
        search=controls["search"],
    )

    render_stats(include_closed=include_closed, parameter_range=parameter_range)

    score_frame = build_score_explorer_frame(
        view=score_view,
        category_label=category_label,
        benchmarks=visible_benchmarks,
        include_closed=include_closed,
        recalculate_visible=controls["recalculate_visible"],
        parameter_range=parameter_range,
    )
    render_dataframe(score_frame, width="stretch", hide_index=True)

    # The column count is only reported for the matrix view.
    if score_view == VIEW_MATRIX:
        st.caption(f"Showing {len(visible_benchmarks)} benchmark columns from the active filters.")

    model_column, benchmark_column = st.columns(2)
    with model_column:
        render_model_detail(include_closed, parameter_range)
    with benchmark_column:
        render_benchmark_detail(visible_benchmarks, key="leaderboard_benchmark_detail")
def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render a model picker with its detail table and strongest/weakest tabs.

    Args:
        include_closed: Forwarded to ``model_options``.
        parameter_range: Forwarded to ``model_options``.

    Nothing is rendered when ``model_options`` returns no models.
    """
    models = model_options(include_closed=include_closed, parameter_range=parameter_range)
    if not models:
        return

    selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail")
    detail_frame = build_model_detail_frame(selected_model)
    render_dataframe(detail_frame, width="stretch", hide_index=True)

    strongest_tab, weakest_tab = st.tabs(["Strongest", "Weakest"])
    for tab, show_strongest in ((strongest_tab, True), (weakest_tab, False)):
        with tab:
            render_dataframe(
                build_model_benchmark_scores(selected_model, strongest=show_strongest),
                width="stretch",
                hide_index=True,
            )
def render_benchmark_detail(benchmarks: list[dict], key: str) -> None:
    """Render a benchmark picker and the detail panel for the selected benchmark.

    Args:
        benchmarks: Benchmarks offered in the picker. When they yield no
            options, the options from an unfiltered ``filter_benchmarks()``
            call are used instead.
        key: Streamlit widget key for the select box.

    Nothing is rendered when there are no options or when the selected name
    cannot be resolved by ``find_benchmark_by_name``.
    """
    options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks())
    if not options:
        return

    selected_name = st.selectbox("Benchmark detail", options, key=key)
    benchmark = find_benchmark_by_name(selected_name)
    if benchmark is None:
        return

    # Middle dot, written as an escape so it cannot be mis-decoded again; the
    # separator previously rendered as the mojibake character "路".
    separator = "\u00b7"
    panel_html = (
        '<div class="detail-panel">\n'
        f'<div class="detail-kicker">{benchmark["source"]} {separator} '
        f'weight {benchmark["calc_weight"]}</div>\n'
        f'<h3>{benchmark["name"]}</h3>\n'
        f'<p>{benchmark["description"]}</p>\n'
        f'<p>{benchmark["summary"]}</p>\n'
        "</div>"
    )
    st.markdown(panel_html, unsafe_allow_html=True)

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    if benchmark.get("detection_scope"):
        st.markdown("#### Detection Scope")
        render_dataframe(
            benchmark["detection_scope"],
            width="stretch",
            hide_index=True,
        )

    with st.expander("Methodology", expanded=False):
        st.write(benchmark["methodology"])
        # Both variants are complete sentences; the positive one used to end
        # with a dangling colon.
        inclusion_note = (
            "Included in official GRM Score."
            if benchmark["included_in_grm"]
            else "Not included in official GRM Score."
        )
        st.write(inclusion_note)
def render_benchmark_summary_card(benchmark: dict) -> None:
    """Render a summary card for one benchmark plus its methodology expander.

    Args:
        benchmark: Benchmark record. The keys ``category``, ``source``,
            ``calc_weight``, ``name``, ``description``, ``summary`` and
            ``methodology`` are read directly; ``paper`` and
            ``detection_scope`` are optional.
    """
    # Middle dot, written as an escape so it cannot be mis-decoded again; the
    # separator previously rendered as the mojibake character "路".
    separator = "\u00b7"
    category_name = CATEGORY_DISPLAY[benchmark["category"]]
    card_html = (
        '<div class="detail-panel">\n'
        f'<div class="detail-kicker">{category_name} {separator} {benchmark["source"]} '
        f'{separator} weight {benchmark["calc_weight"]}</div>\n'
        f'<h3>{benchmark["name"]}</h3>\n'
        f'<p>{benchmark["description"]}</p>\n'
        f'<p>{benchmark["summary"]}</p>\n'
        "</div>"
    )
    st.markdown(card_html, unsafe_allow_html=True)

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    with st.expander(f"{benchmark['name']} methodology and scope", expanded=False):
        st.write(benchmark["methodology"])
        if benchmark.get("detection_scope"):
            render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True)
def render_grm_methodology() -> None:
    """Render the methodology write-up for every GRM-Bench benchmark.

    Benchmarks come from ``filter_benchmarks`` restricted to
    ``SOURCE_GRM_BENCH`` with non-scored entries included. Each one gets its
    summary, methodology and, when present, its detection scope table. Only
    the benchmark with id ``"grm_coherence"`` additionally lists its samples.
    """
    st.markdown("## GRM-Bench Methodology")
    st.markdown(
        "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, "
        "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below "
        "preserve the authored benchmark methodology, failure modes, and representative examples."
    )

    for benchmark in filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True):
        section_title = benchmark["name"].replace("GRM - ", "")
        st.markdown(f"### {section_title}")
        st.write(benchmark["summary"])

        st.markdown("#### Test Methodology")
        st.write(benchmark["methodology"])

        if benchmark.get("detection_scope"):
            st.markdown("#### Detection Scope")
            render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True)

        # Representative samples are shown for the coherence benchmark only.
        if benchmark["id"] != "grm_coherence":
            continue

        st.markdown("#### Representative Samples")
        for sample in benchmark.get("samples", []):
            with st.expander(sample["id"], expanded=False):
                for label, value in sample["metadata"]:
                    st.markdown(f"**{label}:** {value}")
                sample_code = format_sample_code(sample["code"])
                st.code(sample_code, language="json", wrap_lines=True)
def render_benchmark_library() -> None:
    """Render the benchmark library: filters, registry table, GRM-Bench methodology.

    The filter widgets feed ``filter_benchmarks``; the result is shown as the
    frame built by ``build_benchmark_registry_frame``, followed by the
    GRM-Bench methodology section.
    """
    st.markdown("## Benchmark Library")
    st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.")

    category_col, source_col, domain_col, scored_col = st.columns([1.2, 1.4, 1.8, 1.2])
    category_label = category_col.selectbox(
        "Library category", category_options(), key="library_category"
    )
    sources = source_col.multiselect(
        "Library source", available_sources(), placeholder="All sources", key="library_source"
    )
    domains = domain_col.multiselect(
        "Library domain", available_domains(), placeholder="All domains", key="library_domain"
    )
    include_non_scored = scored_col.checkbox(
        "Show non-scored", value=True, key="library_show_non_scored"
    )
    search = st.text_input(
        "Benchmark library search",
        placeholder="Benchmark, domain, description",
        key="library_search",
    )

    matching_benchmarks = filter_benchmarks(
        category=category_from_label(category_label),
        sources=sources,
        domains=domains,
        search=search,
        include_non_scored=include_non_scored,
    )

    registry_frame = build_benchmark_registry_frame(matching_benchmarks)
    registry_columns = {
        "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"),
        "Summary": st.column_config.TextColumn("Summary", width="large"),
        "Description": st.column_config.TextColumn("Description", width="large"),
    }
    render_dataframe(
        registry_frame,
        width="stretch",
        hide_index=True,
        column_config=registry_columns,
    )

    render_grm_methodology()
# Page body: the "About" section first, then one tab per main view.
render_about_grm()

leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"])
_tab_renderers = (
    (leaderboard_tab, render_score_explorer),
    (library_tab, render_benchmark_library),
)
for _tab, _render in _tab_renderers:
    with _tab:
        _render()