Spaces:

nvidia
/

GRM

Running on CPU Upgrade

App Files Files Community

GRM / streamlit_app.py

mbagdasarova-nvidia

Upload 9 files

5c49242 verified 22 days ago

raw

history blame contribute delete

12.3 kB

	"""Streamlit implementation of the GRM Leaderboard Space."""

	from textwrap import dedent

	import streamlit as st

	from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH
	from data_views import (
	VIEW_CATEGORY,
	VIEW_MATRIX,
	VIEW_SUMMARY,
	available_domains,
	available_sources,
	benchmark_options,
	build_benchmark_registry_frame,
	build_model_benchmark_scores,
	build_model_detail_frame,
	build_score_explorer_frame,
	category_from_label,
	category_options,
	filter_benchmarks,
	find_benchmark_by_name,
	model_options,
	parameter_bounds,
	score_stats,
	)
	from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css


	# Page-level configuration: browser tab title and the wide layout.
	st.set_page_config(
	page_title="GRM Score - Game Ready Leaderboard",
	layout="wide",
	)

	# Theme selector rendered in the second (narrower) of two columns.
	theme_columns = st.columns([0.72, 0.28])
	theme_mode = theme_columns[1].radio(
	"Theme",
	["Night", "Day"],
	horizontal=True,
	key="grm_theme_mode",
	)
	# True only when "Day" is selected; themed_dataframe() reads this
	# module-level flag to decide whether to restyle tables.
	light_mode = theme_mode == "Day"

	# Emit the shared CSS, the palette CSS for the selected mode, and the page
	# header as raw HTML (unsafe_allow_html=True on each call).
	st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
	st.markdown(palette_css(light_mode), unsafe_allow_html=True)
	st.markdown(HEADER_HTML, unsafe_allow_html=True)


	def themed_dataframe(data):
	    """Return *data* with light-theme table styling when the Day theme is active.

	    The input is returned untouched when the module-level ``light_mode`` flag
	    is false or when *data* exposes no ``style`` attribute. Otherwise the
	    result of ``data.style.set_properties(...).set_table_styles(...)`` is
	    returned, carrying fixed cell and header colours.
	    """
	    if light_mode and hasattr(data, "style"):
	        cell_properties = {
	            "background-color": "#ffffff",
	            "color": "#15181b",
	            "border-color": "#d9dee3",
	        }
	        header_rule = {
	            "selector": "th",
	            "props": [
	                ("background-color", "#eef1f4"),
	                ("color", "#15181b"),
	                ("border-color", "#d9dee3"),
	            ],
	        }
	        styler = data.style.set_properties(**cell_properties)
	        return styler.set_table_styles([header_rule])

	    return data


	def render_dataframe(data, **kwargs) -> None:
	    """Show *data* with ``st.dataframe`` after passing it through ``themed_dataframe``.

	    All keyword arguments are forwarded to ``st.dataframe`` unchanged.
	    """
	    styled = themed_dataframe(data)
	    st.dataframe(styled, **kwargs)


	def format_sample_code(code: str) -> str:
	    """Return *code* dedented, trimmed, and with every line's indentation halved.

	    The text is passed through ``dedent`` and ``strip`` and split into lines.
	    For each line the leading whitespace is replaced by
	    ``len(leading_whitespace) // 2`` space characters; lines that are empty
	    after stripping become empty strings. The lines are re-joined with
	    ``"\\n"``.
	    """

	    def _halve_indent(raw: str) -> str:
	        # Rebuild one line with half as many leading characters, as spaces.
	        content = raw.lstrip()
	        if not content:
	            return ""
	        indent_width = (len(raw) - len(content)) // 2
	        return " " * indent_width + content

	    source_lines = dedent(code).strip().splitlines()
	    return "\n".join(_halve_indent(raw) for raw in source_lines)


	def render_about_grm() -> None:
	    """Render the "About GRM Score" heading, overview blocks, and formula lines.

	    Each entry of ``OVERVIEW_BLOCKS`` is emitted as markdown; each entry of
	    ``FORMULAS`` is wrapped in a ``<p class="formula-line">`` element and
	    emitted as raw HTML.
	    """
	    st.markdown("## About GRM Score")

	    for overview_text in OVERVIEW_BLOCKS:
	        st.markdown(overview_text)

	    for expression in FORMULAS:
	        formula_html = f'<p class="formula-line">{expression}</p>'
	        st.markdown(formula_html, unsafe_allow_html=True)


	def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None:
	    """Render the ``score_stats`` result as a row of five metric tiles.

	    Args:
	        include_closed: Forwarded to ``score_stats``.
	        parameter_range: Forwarded to ``score_stats``.

	    Raises:
	        ValueError: From ``zip(..., strict=True)`` when the number of stats
	            does not match the five columns.
	    """
	    metrics = score_stats(include_closed=include_closed, parameter_range=parameter_range)
	    metric_slots = st.columns(5)
	    for slot, label_and_value in zip(metric_slots, metrics.items(), strict=True):
	        slot.metric(*label_and_value)


	def render_score_controls() -> dict:
	    """Render the leaderboard filter widgets and return their current values.

	    Two rows of widgets are drawn: model visibility, score view, category and
	    search on the first row; source, domain, parameter-class slider and the
	    "Filtered score" checkbox on the second.

	    Returns:
	        A dict with the keys ``include_closed``, ``view``, ``category_label``,
	        ``search``, ``sources``, ``domains``, ``parameter_range`` and
	        ``recalculate_visible``.
	    """
	    visibility_col, view_col, category_col, search_col = st.columns([1.2, 1.2, 1.2, 1.4])
	    model_visibility = visibility_col.radio(
	        "Model visibility",
	        ["All models", "Open-source only"],
	        index=1,
	        horizontal=True,
	        key="leaderboard_model_visibility",
	    )
	    view = view_col.radio(
	        "Score view",
	        [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX],
	        index=2,
	        horizontal=True,
	        key="leaderboard_score_view",
	    )
	    category_label = category_col.selectbox("Category", category_options(), key="leaderboard_category")
	    search = search_col.text_input("Search", placeholder="Model or benchmark", key="leaderboard_search")

	    source_col, domain_col, size_col, filtered_col = st.columns([1.5, 1.8, 1.8, 1.2])
	    sources = source_col.multiselect("Source", available_sources(), placeholder="All sources", key="leaderboard_source")
	    domains = domain_col.multiselect("Domain", available_domains(), placeholder="All domains", key="leaderboard_domain")

	    minimum_size, maximum_size = parameter_bounds()
	    # Clamp both ends of the preferred 0.0-4.0 default into the available
	    # bounds. Previously only the upper end was clamped, so a lower bound
	    # above 0.0 produced a default outside [min_value, max_value].
	    default_low = min(max(0.0, minimum_size), maximum_size)
	    default_high = max(default_low, min(4.0, maximum_size))
	    parameter_range = size_col.slider(
	        "Model parameter class (B)",
	        min_value=minimum_size,
	        max_value=maximum_size,
	        value=(default_low, default_high),
	        step=0.5,
	        format="%.1fB",
	        key="leaderboard_parameter_range_v2",
	        help="Use 0.0B for <0.5B class. The top end includes 120B+ models.",
	    )
	    recalculate_visible = filtered_col.checkbox("Filtered score", value=False, key="leaderboard_filtered_score")

	    return {
	        "include_closed": model_visibility == "All models",
	        "view": view,
	        "category_label": category_label,
	        "search": search,
	        "sources": sources,
	        "domains": domains,
	        "parameter_range": parameter_range,
	        "recalculate_visible": recalculate_visible,
	    }


	def render_score_explorer() -> None:
	    """Render the Leaderboard section: controls, stats, score table, detail panels.

	    The control values from ``render_score_controls`` drive the benchmark
	    filter, the stats row and the score frame. A caption with the benchmark
	    count is added in the matrix view, and the model and benchmark detail
	    panels are placed side by side underneath.
	    """
	    st.markdown("## Leaderboard")
	    st.caption(
	        "Current score values are static PRD-backed values with TBD entries shown as missing. "
	        "Filtered score recalculates an exploratory score from visible benchmarks only."
	    )

	    controls = render_score_controls()
	    include_closed = controls["include_closed"]
	    parameter_range = controls["parameter_range"]
	    category_label = controls["category_label"]
	    active_view = controls["view"]

	    benchmarks = filter_benchmarks(
	        category=category_from_label(category_label),
	        sources=controls["sources"],
	        domains=controls["domains"],
	        search=controls["search"],
	    )

	    render_stats(include_closed=include_closed, parameter_range=parameter_range)

	    score_frame = build_score_explorer_frame(
	        view=active_view,
	        category_label=category_label,
	        benchmarks=benchmarks,
	        include_closed=include_closed,
	        recalculate_visible=controls["recalculate_visible"],
	        parameter_range=parameter_range,
	    )
	    render_dataframe(score_frame, width="stretch", hide_index=True)

	    if active_view == VIEW_MATRIX:
	        st.caption(f"Showing {len(benchmarks)} benchmark columns from the active filters.")

	    model_panel, benchmark_panel = st.columns(2)
	    with model_panel:
	        render_model_detail(include_closed, parameter_range)
	    with benchmark_panel:
	        render_benchmark_detail(benchmarks, key="leaderboard_benchmark_detail")


	def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None:
	    """Render a model picker, its detail table, and Strongest/Weakest score tabs.

	    Nothing is rendered when ``model_options`` returns no models for the
	    given visibility and parameter range.

	    Args:
	        include_closed: Forwarded to ``model_options``.
	        parameter_range: Forwarded to ``model_options``.
	    """
	    models = model_options(include_closed=include_closed, parameter_range=parameter_range)
	    if not models:
	        return

	    selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail")
	    detail_frame = build_model_detail_frame(selected_model)
	    render_dataframe(detail_frame, width="stretch", hide_index=True)

	    strongest_tab, weakest_tab = st.tabs(["Strongest", "Weakest"])
	    # Same table in both tabs; only the ``strongest`` flag differs.
	    for tab, strongest in ((strongest_tab, True), (weakest_tab, False)):
	        with tab:
	            render_dataframe(
	                build_model_benchmark_scores(selected_model, strongest=strongest),
	                width="stretch",
	                hide_index=True,
	            )


	def render_benchmark_detail(benchmarks: list[dict], key: str) -> None:
	    """Render a benchmark picker and the detail panel for the chosen benchmark.

	    The picker lists ``benchmark_options(benchmarks)``; when that is empty it
	    falls back to the options for an unfiltered ``filter_benchmarks()`` call.
	    Nothing is rendered when there are no options at all or when the selected
	    name cannot be resolved by ``find_benchmark_by_name``.

	    Args:
	        benchmarks: Benchmark records currently selected by the active filters.
	        key: Streamlit widget key for the select box.
	    """
	    options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks())
	    if not options:
	        return

	    selected_name = st.selectbox("Benchmark detail", options, key=key)
	    benchmark = find_benchmark_by_name(selected_name)
	    if benchmark is None:
	        return

	    st.markdown(
	        f"""
	<div class="detail-panel">
	<div class="detail-kicker">{benchmark['source']} · weight {benchmark['calc_weight']}</div>
	<h3>{benchmark['name']}</h3>
	<p>{benchmark['description']}</p>
	<p>{benchmark['summary']}</p>
	</div>
	""",
	        unsafe_allow_html=True,
	    )

	    if benchmark.get("paper"):
	        st.markdown(f"[Paper / Source]({benchmark['paper']})")

	    if benchmark.get("detection_scope"):
	        st.markdown("#### Detection Scope")
	        render_dataframe(
	            benchmark["detection_scope"],
	            width="stretch",
	            hide_index=True,
	        )

	    with st.expander("Methodology", expanded=False):
	        st.write(benchmark["methodology"])
	        # Both outcomes are complete sentences; the positive one previously
	        # ended with a dangling colon.
	        if benchmark["included_in_grm"]:
	            st.write("Included in official GRM Score.")
	        else:
	            st.write("Not included in official GRM Score.")


	def render_benchmark_summary_card(benchmark: dict) -> None:
	    """Render a summary card for one benchmark record.

	    The card shows the display category, source and weight, then the name,
	    description and summary. An optional paper link follows, then an expander
	    holding the methodology text and, when present, the detection-scope table.

	    Args:
	        benchmark: Record read via the keys ``category``, ``source``,
	            ``calc_weight``, ``name``, ``description``, ``summary``,
	            ``methodology`` and the optional ``paper`` / ``detection_scope``.
	    """
	    card_html = f"""
	<div class="detail-panel">
	<div class="detail-kicker">{CATEGORY_DISPLAY[benchmark['category']]} · {benchmark['source']} · weight {benchmark['calc_weight']}</div>
	<h3>{benchmark['name']}</h3>
	<p>{benchmark['description']}</p>
	<p>{benchmark['summary']}</p>
	</div>
	"""
	    st.markdown(card_html, unsafe_allow_html=True)

	    if benchmark.get("paper"):
	        st.markdown(f"[Paper / Source]({benchmark['paper']})")

	    expander_title = f"{benchmark['name']} methodology and scope"
	    with st.expander(expander_title, expanded=False):
	        st.write(benchmark["methodology"])
	        if benchmark.get("detection_scope"):
	            render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True)


	def render_grm_methodology() -> None:
	    """Render the GRM-Bench methodology section, one subsection per benchmark.

	    Benchmarks come from ``filter_benchmarks`` restricted to
	    ``SOURCE_GRM_BENCH`` with non-scored entries included. Each one gets its
	    summary, test methodology and optional detection-scope table. The
	    benchmark whose ``id`` is ``"grm_coherence"`` also gets an expander per
	    representative sample, with its metadata lines and formatted code.
	    """
	    st.markdown("## GRM-Bench Methodology")
	    st.markdown(
	        "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, "
	        "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below "
	        "preserve the authored benchmark methodology, failure modes, and representative examples."
	    )

	    for benchmark in filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True):
	        # Every occurrence of the "GRM - " marker is removed from the heading.
	        heading = benchmark["name"].replace("GRM - ", "")
	        st.markdown(f"### {heading}")
	        st.write(benchmark["summary"])
	        st.markdown("#### Test Methodology")
	        st.write(benchmark["methodology"])

	        if benchmark.get("detection_scope"):
	            st.markdown("#### Detection Scope")
	            render_dataframe(benchmark["detection_scope"], width="stretch", hide_index=True)

	        if benchmark["id"] != "grm_coherence":
	            continue

	        st.markdown("#### Representative Samples")
	        for sample in benchmark.get("samples", []):
	            with st.expander(sample["id"], expanded=False):
	                for label, value in sample["metadata"]:
	                    st.markdown(f"{label}: {value}")
	                sample_code = format_sample_code(sample["code"])
	                st.code(sample_code, language="json", wrap_lines=True)


	def render_benchmark_library() -> None:
	    """Render the Benchmark Library section followed by the GRM-Bench methodology.

	    Draws the library filters (category, source, domain, non-scored toggle,
	    search), shows the registry table for the matching benchmarks, and then
	    calls ``render_grm_methodology``.
	    """
	    st.markdown("## Benchmark Library")
	    st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.")

	    category_col, source_col, domain_col, toggle_col = st.columns([1.2, 1.4, 1.8, 1.2])
	    category_label = category_col.selectbox("Library category", category_options(), key="library_category")
	    sources = source_col.multiselect("Library source", available_sources(), placeholder="All sources", key="library_source")
	    domains = domain_col.multiselect("Library domain", available_domains(), placeholder="All domains", key="library_domain")
	    include_non_scored = toggle_col.checkbox("Show non-scored", value=True, key="library_show_non_scored")
	    search = st.text_input("Benchmark library search", placeholder="Benchmark, domain, description", key="library_search")

	    matching = filter_benchmarks(
	        category=category_from_label(category_label),
	        sources=sources,
	        domains=domains,
	        search=search,
	        include_non_scored=include_non_scored,
	    )
	    registry_frame = build_benchmark_registry_frame(matching)
	    # Column presentation: the paper column as links, the two text columns wide.
	    registry_columns = {
	        "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"),
	        "Summary": st.column_config.TextColumn("Summary", width="large"),
	        "Description": st.column_config.TextColumn("Description", width="large"),
	    }
	    render_dataframe(
	        registry_frame,
	        width="stretch",
	        hide_index=True,
	        column_config=registry_columns,
	    )

	    render_grm_methodology()


	# Page body: the About section is rendered first, before the tabs.
	render_about_grm()

	# Two tabs, each hosting one of the page sections defined above.
	leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"])

	# Leaderboard tab: controls, stats, score table and detail panels.
	with leaderboard_tab:
	render_score_explorer()

	# Benchmark Library tab: registry table plus the GRM-Bench methodology.
	with library_tab:
	render_benchmark_library()