Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Nvidia Game Ready Model Score (GRM Score) Gradio app.""" | |
| from html import escape | |
| from pathlib import Path | |
| import gradio as gr | |
| from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category | |
| from scores import MODEL_SCORES | |
| from scoring import build_leaderboard | |
# Column headers of the main leaderboard table, in display order. The same
# strings are used as keys when get_leaderboard_rows reads each leaderboard row.
LEADERBOARD_COLUMNS = [
    "Rank",
    "Model",
    "GRM Score",
    "Roleplay (33%)",
    "Actions (33%)",
    "General (33%)",
]
# Leading component of the installed Gradio version string. It decides whether
# the theme and CSS are passed to gr.Blocks (< 6) or to demo.launch (>= 6).
GRADIO_MAJOR_VERSION = int(gr.__version__.split(".", 1)[0])
# Directory that contains this file, and the "ref" directory beside it from
# which _read_reference_file loads optional reference text.
APP_ROOT = Path(__file__).resolve().parent
REF_ROOT = APP_ROOT / "ref"
# Fallback overview copy, one entry per paragraph, used by _load_overview_html
# when the "Overview" reference file is missing or empty. _build_overview_html
# renders the entries starting with "GRM Score =" / "Category Score =" as formula
# lines and bolds the "Nvidia Game Ready Model Score (GRM)" prefix.
DEFAULT_OVERVIEW_BLOCKS = [
    "Nvidia Game Ready Model Score (GRM) is an aggregated quality metric designed to assess LLM capabilities in gaming use cases.",
    "General state-of-the-art language models are optimized for broad benchmarks such as math, code, and general knowledge. That does not reliably translate to in-game performance, and it does not reliably predict NPC quality, gameplay actions, or immersion.",
    "With game model evaluation, game developers can accelerate AI integration pipelines by reducing time spent on model evaluation and narrowing model choice earlier. The overall score is the average of Roleplay, Actions, and General, while benchmarks inside each category are combined with weighted averaging using core weights of 1.0 and supplementary weights of 0.5.",
    "GRM Score = (Roleplay + Actions + General) / 3",
    "Category Score = sum(score x weight) / sum(weight)",
]
# Model names treated as proprietary. _include_model drops these from the
# leaderboard and score tables when proprietary models are not requested.
PROPRIETARY_MODELS = frozenset({"GPT-5.4", "Gemini 2.5 Pro"})
# Built-in GRM-Bench section definitions rendered by build_grm_bench_section_html.
# Each entry carries:
#   "title"       - section heading
#   "summary"     - list of paragraphs
#   "methodology" - single paragraph
#   "scope"       - list of [category, description] rows for the scope table
#   "samples"     - optional; a plain string here, or a list of sample dicts
#                   when the section is loaded from the reference file
# The first entry (Coherence) is the one _load_grm_bench_sections replaces when
# the "Coherence_Summary" reference file parses successfully.
BASE_GRM_BENCH_SECTIONS = [
    {
        "title": "Coherence",
        "summary": [
            "Above all other failure modes that break immersion in character and NPC interactions are responses that feel illogical, inconsistent, or irrelevant to the active game state.",
            "Incoherence can surface as hallucinated details, role confusion, contradictions across turns, or answers that stop tracking the subject under discussion.",
        ],
        "methodology": (
            "Because coherence can fail in many different ways, the authored scenarios are designed to trigger a common failure mode and then measure whether the model stays grounded under pressure."
        ),
        "scope": [
            [
                "Factual / Logical",
                "Objectively false or contradicted by the system prompt or game state, including invented entities, rules, or details.",
            ],
            ["Cause / Effect", "Fails simple logical state transitions or obvious state changes."],
            [
                "Contradiction",
                "Contradicts something previously said or done without an in-world justification.",
            ],
            [
                "Personality / Background Violation",
                "Violates an established trait, limitation, or background fact.",
            ],
            [
                "Role Confusion",
                "Confuses identities, facts, actions, or motivations across entities.",
            ],
            [
                "Irrelevance",
                "Stops tracking the active subject or responds in a way that is not relevant to the discussion.",
            ],
            [
                "Knowledge Boundary",
                "Invents knowledge the character cannot have instead of separating observation from speculation.",
            ],
            [
                "False Premise",
                "Incorrectly accepts a smuggled-in user premise about something that never happened.",
            ],
        ],
        "samples": (
            "Representative cases include long multi-turn identity-confusion exchanges and hidden-information prompts where the character must avoid inventing unseen facts."
        ),
    },
    {
        "title": "Response Diversity",
        "summary": [
            "Response Diversity measures whether a model stays engaging without collapsing into repetitive wording, sentence structure, or stock phrasing across similar prompts and multi-turn play.",
            "The goal is not randomness. The goal is controlled variation that still preserves the correct task intent, tone, and world state.",
        ],
        "methodology": (
            "Equivalent requests are expressed across repeated turns and neighboring scenarios so the evaluation can separate healthy consistency from repetitive degeneration."
        ),
        "scope": [
            ["Repetition Loop", "Repeats phrases, clauses, or sentence frames across adjacent responses."],
            ["Lexical Compression", "Collapses to a narrow vocabulary even when there is room for variation."],
            ["Originality Failure", "Paraphrases the prompt too literally instead of producing fresh in-world language."],
            ["Near-Duplicate Continuation", "Makes only superficial wording changes while repeating the same response content."],
            ["Style Stagnation", "Cannot vary tone or delivery while preserving the same underlying instruction."],
        ],
    },
    {
        "title": "Tool Recovery",
        "summary": [
            "Tool Recovery evaluates whether the model can recognize a failed tool step, repair the plan, and continue without fabricating results.",
            "This matters for assistants that need to survive partial failures instead of derailing the whole interaction after one bad tool call.",
        ],
        "methodology": (
            "Benchmarks inject missing tool calls, malformed arguments, or explicit tool failures and then measure whether the model retries correctly, replans, or asks for the right follow-up."
        ),
        "scope": [
            ["Missed Invocation", "Fails to issue a required tool call at all."],
            ["Malformed Retry", "Attempts recovery with incomplete or invalid tool arguments."],
            ["Fabricated Output", "Invents tool output after a failure instead of acknowledging the error."],
            ["Recovery Sequencing", "Does not replan correctly after a tool error or partial result."],
            ["Silent Drop", "Continues as if the failed tool step never mattered to the task."],
        ],
    },
    {
        "title": "Context Adaptation",
        "summary": [
            "Context Adaptation measures whether a model tracks a changing world state without letting values, locations, inventories, or statuses drift across turns.",
            "These tests target dynamic sessions where the model must stay synchronized with the newest state while preserving earlier facts that still remain true.",
        ],
        "methodology": (
            "Stateful scenarios update facts mid-conversation and require the model to carry forward the latest values while also keeping dependent details accurate."
        ),
        "scope": [
            ["State Drift", "Values change without cause as the conversation continues."],
            ["Temporal Mismatch", "Old state is treated as current after a newer update is provided."],
            ["Entity Attribute Drift", "Names, inventory, location, or status details mutate incorrectly."],
            ["Partial Update Failure", "One field is updated but dependent fields are left stale."],
            ["Conflict Resolution", "Cannot reconcile new information with earlier context in a coherent way."],
        ],
    },
    {
        "title": "Prompt Robustness",
        "summary": [
            "Prompt Robustness checks whether the same underlying intent is handled reliably across terse prompts, verbose instructions, structured payloads, and mixed formatting.",
            "A model should not need one exact prompt style in order to understand the task, infer the right tool path, or preserve the requested output behavior.",
        ],
        "methodology": (
            "Equivalent requests are expressed in long-form prose, shorthand, JSON, XML, and other wrappers to measure sensitivity to presentation rather than intent."
        ),
        "scope": [
            ["Format Sensitivity", "Succeeds in plain prose but fails when the request is wrapped in JSON, XML, or other structure."],
            ["Instruction Alias Failure", "Equivalent wording changes alter behavior more than they should."],
            ["Verbosity Dependency", "Requires unusually long prompting to perform a task it should infer directly."],
            ["Tool Intent Drift", "Misses the right tool plan when the same task is phrased in a different form."],
            ["Structure Overfitting", "Responds too literally to markup or formatting instead of following the underlying request."],
        ],
    },
]
| def _read_reference_file(name: str) -> str | None: | |
| try: | |
| return (REF_ROOT / name).read_text(encoding="utf-8").strip() | |
| except OSError: | |
| return None | |
| def _split_reference_blocks(text: str) -> list[str]: | |
| blocks = [] | |
| for chunk in text.split("\n\n"): | |
| block = " ".join(line.strip() for line in chunk.splitlines() if line.strip()) | |
| if block: | |
| blocks.append(block) | |
| return blocks | |
| def _build_overview_html(blocks: list[str]) -> str: | |
| parts = ['<div class="longform-copy">'] | |
| title_prefix = "Nvidia Game Ready Model Score (GRM)" | |
| for block in blocks: | |
| if block.startswith("GRM Score ="): | |
| parts.append(f'<p class="formula-line">{escape(block)}</p>') | |
| continue | |
| if block.startswith("Category Score ="): | |
| parts.append(f'<p class="formula-line subdued">{escape(block)}</p>') | |
| continue | |
| if block.startswith(title_prefix): | |
| suffix = block[len(title_prefix) :] | |
| parts.append(f"<p><strong>{escape(title_prefix)}</strong>{escape(suffix)}</p>") | |
| continue | |
| parts.append(f"<p>{escape(block)}</p>") | |
| parts.append("</div>") | |
| return "".join(parts) | |
def _load_overview_html() -> str:
    """Build the overview HTML from the "Overview" reference file.

    Falls back to ``DEFAULT_OVERVIEW_BLOCKS`` when the file yields no text.
    """
    reference_text = _read_reference_file("Overview")
    if reference_text:
        return _build_overview_html(_split_reference_blocks(reference_text))
    return _build_overview_html(DEFAULT_OVERVIEW_BLOCKS)
def _collect_until(lines: list[str], start: int, marker: str) -> tuple[list[str], int]:
    """Gather stripped, non-blank lines from ``start`` up to the ``marker`` line.

    Returns the collected lines together with the index of the marker line, or
    ``len(lines)`` when the marker never appears.
    """
    collected = []
    index = start
    while index < len(lines):
        stripped = lines[index].strip()
        if stripped == marker:
            break
        if stripped:
            collected.append(stripped)
        index += 1
    return collected, index


def _parse_coherence_samples(lines: list[str], start: int) -> list[dict]:
    """Parse every sample whose header line starts with ``TEST_`` from ``start`` on."""
    samples = []
    index = start
    while index < len(lines):
        sample_id = lines[index].strip()
        index += 1
        if not sample_id.startswith("TEST_"):
            # Blank lines and stray text between samples are skipped.
            continue
        metadata = []
        code_lines = []
        while index < len(lines):
            stripped = lines[index].strip()
            if stripped.startswith("TEST_"):
                # Next sample header: leave it for the outer loop.
                break
            index += 1
            if stripped == "Messages:":
                # Everything after "Messages:" up to the next sample header is
                # kept as the sample's code, preserving leading indentation.
                while index < len(lines) and not lines[index].strip().startswith("TEST_"):
                    code_lines.append(lines[index].rstrip())
                    index += 1
                break
            if ":" in stripped:
                label, value = stripped.split(":", 1)
                metadata.append([label.strip(), value.strip()])
        samples.append(
            {
                "id": sample_id,
                "metadata": metadata,
                "code": "\n".join(code_lines).strip(),
            }
        )
    return samples


def _parse_coherence_section(text: str) -> dict | None:
    """Parse the text of a coherence summary into a GRM-Bench section dict.

    Expected layout: a title line, summary lines, a "Test Methodology" line,
    methodology lines, a "Detection Scope:" line, "category - description"
    rows, and optionally a "Test Samples" line followed by ``TEST_`` samples.
    Returns ``None`` when a required marker or section body is missing.
    """
    lines = text.splitlines()
    index = 0
    while index < len(lines) and not lines[index].strip():
        index += 1
    if index >= len(lines):
        return None
    title = lines[index].strip()

    summary_lines, index = _collect_until(lines, index + 1, "Test Methodology")
    if index >= len(lines):
        return None
    methodology_lines, index = _collect_until(lines, index + 1, "Detection Scope:")
    if index >= len(lines):
        return None
    # The "Test Samples" marker is optional; without it the scope rows run to
    # the end of the text and there are no samples.
    scope_lines, index = _collect_until(lines, index + 1, "Test Samples")

    scope = []
    for entry in scope_lines:
        category, _, description = entry.partition(" - ")
        scope.append([category.strip(), description.strip()])

    samples = _parse_coherence_samples(lines, index + 1) if index < len(lines) else []

    if not summary_lines or not methodology_lines or not scope:
        return None
    return {
        "title": title,
        "summary": [" ".join(summary_lines)],
        "methodology": " ".join(methodology_lines),
        "scope": scope,
        "samples": samples,
    }


def _load_coherence_section() -> dict | None:
    """Load the Coherence section from the "Coherence_Summary" reference file.

    Returns:
        A section dict with "title", "summary", "methodology", "scope" and
        "samples" keys, or ``None`` when the file is unavailable, empty, or
        does not follow the expected layout.
    """
    text = _read_reference_file("Coherence_Summary")
    if not text:
        return None
    return _parse_coherence_section(text)
def _load_grm_bench_sections() -> list[dict]:
    """Return the GRM-Bench sections, preferring the file-backed Coherence entry.

    Starts from a copy of ``BASE_GRM_BENCH_SECTIONS`` and, when the coherence
    reference file parses, substitutes the result for the first entry.
    """
    merged = [*BASE_GRM_BENCH_SECTIONS]
    override = _load_coherence_section()
    if override is None:
        return merged
    merged[0] = override
    return merged
# Section list rendered on the GRM-Bench tab; computed once at import time.
GRM_BENCH_SECTIONS = _load_grm_bench_sections()
| def _fmt(value: float | None) -> str: | |
| return f"{value:.1f}" if value is not None else "-" | |
| def _fmt_weight(value: float) -> str: | |
| return f"{value:.2f}" | |
| def _include_model(model_name: str, include_proprietary: bool) -> bool: | |
| return include_proprietary or model_name not in PROPRIETARY_MODELS | |
def build_html_table(
    headers: list[str],
    rows: list[list[str]],
    table_class: str = "",
    shell_class: str = "table-scroll-shell",
) -> str:
    """Render a header row and body rows as an HTML table inside a wrapper div.

    Args:
        headers: Column titles, escaped and emitted as ``<th>`` cells.
        rows: Body rows; every cell is converted with ``str`` and escaped.
        table_class: Extra class appended after ``data-table`` when non-empty.
        shell_class: Extra class appended after ``table-shell`` on the wrapper
            when non-empty.

    Returns:
        The complete table markup as a single string.
    """
    table_classes = f"data-table {table_class}" if table_class else "data-table"
    wrapper_classes = " ".join(name for name in ("table-shell", shell_class) if name)
    head_cells = "".join(f"<th>{escape(header)}</th>" for header in headers)
    body_rows = "".join(
        "<tr>" + "".join(f"<td>{escape(str(cell))}</td>" for cell in row) + "</tr>"
        for row in rows
    )
    return (
        f'<div class="{wrapper_classes}">'
        f'<table class="{table_classes}">'
        f"<thead><tr>{head_cells}</tr></thead>"
        f"<tbody>{body_rows}</tbody></table></div>"
    )
def get_leaderboard_entries(include_proprietary: bool = True) -> list[dict]:
    """Return leaderboard rows that pass the proprietary filter, re-ranked.

    Each returned dict is a copy of the row from ``build_leaderboard`` whose
    "Rank" is its 1-based position among the rows that were kept.
    """
    visible = (
        row for row in build_leaderboard() if _include_model(row["Model"], include_proprietary)
    )
    return [{**row, "Rank": position} for position, row in enumerate(visible, start=1)]
def get_leaderboard_rows(include_proprietary: bool = True) -> list[list[str]]:
    """Return the leaderboard as rows of display strings.

    Cells follow the order rank, model name, then the overall and three
    category scores formatted with ``_fmt``.
    """
    score_columns = ("GRM Score", "Roleplay (33%)", "Actions (33%)", "General (33%)")
    return [
        [str(entry["Rank"]), entry["Model"], *(_fmt(entry[column]) for column in score_columns)]
        for entry in get_leaderboard_entries(include_proprietary)
    ]
def get_ranked_model_names(include_proprietary: bool = True) -> list[str]:
    """Return the model names in leaderboard order under the given filter."""
    names = []
    for entry in get_leaderboard_entries(include_proprietary):
        names.append(entry["Model"])
    return names
def build_evaluation_suite_html() -> str:
    """Render the evaluation-suite table listing every benchmark by category.

    The category name appears once per category in a cell spanning all of that
    category's benchmark rows. Weights are formatted with ``_fmt_weight``.
    """
    html = [
        '<div class="table-shell evaluation-suite-shell">'
        '<table class="data-table evaluation-suite-table">'
        "<colgroup>"
        '<col class="evaluation-suite-category-col">'
        '<col class="evaluation-suite-benchmark-col">'
        '<col class="evaluation-suite-description-col">'
        '<col class="evaluation-suite-weight-col">'
        "</colgroup>"
        "<thead><tr>"
        "<th>Category</th>"
        "<th>Benchmark</th>"
        "<th>Description</th>"
        '<th class="weight-column" title="Weight">Wt.</th>'
        "</tr></thead><tbody>"
    ]
    for category in CATEGORIES:
        members = get_benchmarks_by_category(category)
        for position, entry in enumerate(members):
            category_cell = ""
            if position == 0:
                # Only the first row of a category carries the spanning cell.
                category_cell = (
                    f'<td class="category-cell" rowspan="{len(members)}">'
                    f"{escape(CATEGORY_DISPLAY[category])}</td>"
                )
            html.append(
                "<tr>"
                f"{category_cell}"
                f'<td class="benchmark-cell">{escape(entry["name"])}</td>'
                f'<td class="description-cell">{escape(entry["description"])}</td>'
                f'<td class="weight-cell">{_fmt_weight(entry["calc_weight"])}</td>'
                "</tr>"
            )
    html.append("</tbody></table></div>")
    return "".join(html)
def build_leaderboard_html(include_proprietary: bool = True) -> str:
    """Render the main leaderboard table for the given proprietary filter."""
    body_rows = get_leaderboard_rows(include_proprietary)
    return build_html_table(
        LEADERBOARD_COLUMNS,
        body_rows,
        table_class="leaderboard-table",
        shell_class="leaderboard-shell",
    )
def build_category_score_table_html(category: str, include_proprietary: bool = True) -> str:
    """Render one category's per-benchmark score table.

    There is one row per model in leaderboard order and one column per
    benchmark in the category. Stored scores are multiplied by 100 and shown
    with one decimal place; a missing score is shown as "-".
    """
    columns = [entry["name"] for entry in get_benchmarks_by_category(category)]

    def score_cell(model_name: str, column: str) -> str:
        raw = MODEL_SCORES[model_name].get(column)
        if raw is None:
            return "-"
        return f"{raw * 100:.1f}"

    table_rows = [
        [model_name, *(score_cell(model_name, column) for column in columns)]
        for model_name in get_ranked_model_names(include_proprietary)
    ]
    return build_html_table(["Model"] + columns, table_rows, table_class="category-score-table")
def update_leaderboard_tables(include_proprietary: bool) -> list[str]:
    """Re-render the leaderboard followed by one score table per category.

    The result order matches the ``outputs`` list wired to the checkbox: the
    leaderboard HTML first, then the category tables in ``CATEGORIES`` order.
    """
    return [
        build_leaderboard_html(include_proprietary),
        *(
            build_category_score_table_html(category, include_proprietary)
            for category in CATEGORIES
        ),
    ]
def build_benchmark_details_html() -> str:
    """Render the detailed benchmark write-ups, grouped into category sections.

    Each benchmark shows its name, a weight label ("Core" when the weight is
    1.0, otherwise "Supplementary"), its description and summary, and a
    "Paper / Source" link when the benchmark has a "paper" value.
    """

    def render_entry(entry: dict) -> str:
        tier = "Core" if entry["calc_weight"] == 1.0 else "Supplementary"
        if entry.get("paper"):
            source_link = (
                '<div class="benchmark-link">'
                f'<a href="{escape(entry["paper"])}" target="_blank" rel="noreferrer">'
                "Paper / Source"
                "</a>"
                "</div>"
            )
        else:
            source_link = ""
        return (
            '<article class="benchmark-entry">'
            '<div class="benchmark-entry-top">'
            f'<h4>{escape(entry["name"])}</h4>'
            f'<span class="benchmark-weight">{tier} · {entry["calc_weight"]}</span>'
            "</div>"
            f'<p class="benchmark-description">{escape(entry["description"])}</p>'
            f'<p>{escape(entry["summary"])}</p>'
            f"{source_link}"
            "</article>"
        )

    html = []
    for category in CATEGORIES:
        html.append(
            '<section class="benchmark-section">'
            f"<h3>{escape(CATEGORY_DISPLAY[category])}</h3>"
        )
        html.extend(render_entry(entry) for entry in get_benchmarks_by_category(category))
        html.append("</section>")
    return "".join(html)
| def _build_grm_bench_sample_html(sample: dict) -> str: | |
| parts = [ | |
| '<article class="grm-bench-sample">', | |
| f'<div class="grm-bench-sample-id">{escape(sample["id"])}</div>', | |
| ] | |
| for label, value in sample.get("metadata", []): | |
| parts.append( | |
| '<p class="grm-bench-sample-meta">' | |
| f'<span class="grm-bench-sample-label">{escape(label)}:</span> {escape(value)}' | |
| "</p>" | |
| ) | |
| if sample.get("code"): | |
| parts.append('<pre class="grm-bench-sample-code"><code>') | |
| parts.append(escape(sample["code"])) | |
| parts.append("</code></pre>") | |
| parts.append("</article>") | |
| return "".join(parts) | |
def build_grm_bench_section_html(section: dict) -> str:
    """Render one GRM-Bench section: title, summary, methodology, scope, samples.

    "samples" is optional. A string is shown as a single paragraph; any other
    non-empty value is treated as a sequence of sample dicts and rendered with
    ``_build_grm_bench_sample_html``.
    """
    fragments = [
        '<section class="grm-bench-section">',
        '<div class="grm-bench-kicker">Nvidia-Authored Benchmark</div>',
        f'<h2>{escape(section["title"])}</h2>',
        "".join(f"<p>{escape(paragraph)}</p>" for paragraph in section["summary"]),
        '<div class="grm-bench-subtitle">Test Methodology</div>',
        f'<p>{escape(section["methodology"])}</p>',
        '<div class="grm-bench-subtitle">Detection Scope</div>',
        build_html_table(
            ["Category", "Description"],
            section["scope"],
            table_class="grm-bench-scope-table",
        ),
    ]
    samples = section.get("samples")
    if samples:
        fragments.append('<div class="grm-bench-subtitle">Representative Samples</div>')
        if isinstance(samples, str):
            fragments.append(f"<p>{escape(samples)}</p>")
        else:
            fragments.extend(_build_grm_bench_sample_html(sample) for sample in samples)
    fragments.append("</section>")
    return "".join(fragments)
def build_grm_bench_html() -> str:
    """Render the GRM-Bench tab body: intro copy followed by every section."""
    intro = (
        '<div class="longform-copy">'
        "<p><strong>GRM-Bench</strong> is the in-house authored benchmark suite for game-facing assistants, companions, and NPC behaviors that are not well-covered by broad academic leaderboards.</p>"
        "<p>The sections below describe the initial authored benchmark families and the concrete failure modes each family is designed to surface.</p>"
        "</div>"
    )
    sections_html = "".join(
        build_grm_bench_section_html(section) for section in GRM_BENCH_SECTIONS
    )
    return intro + sections_html
# Static page header markup shown above the tabs.
HEADER_HTML = """
<section class="page-header">
<div class="page-eyebrow">NVIDIA Game Ready Evaluation</div>
<h1>Game Ready Leaderboard</h1>
<p>
An open game model evaluation surface for comparing LLMs across roleplay, gameplay
actions, and practical in-game reasoning.
</p>
</section>
"""
# Overview copy rendered once at import time (reference file or built-in fallback).
OVERVIEW_HTML = _load_overview_html()
# Stylesheet for the whole app. It is passed as `css` to gr.Blocks on Gradio < 6
# and to demo.launch on Gradio >= 6. Covers page layout, tab navigation, the
# shared .data-table styling, and the benchmark / GRM-Bench section classes
# emitted by the HTML builders in this file.
CUSTOM_CSS = """
:root {
--bg-top: #202327;
--bg-bottom: #0f1012;
--surface: #15181b;
--surface-strong: #24282d;
--surface-alt: #1d2126;
--surface-alt-2: #262a2f;
--text-main: #f5f7f8;
--text-muted: #c1c6cb;
--text-soft: #a2a8ae;
--accent: #76b900;
--rule: rgba(255, 255, 255, 0.08);
--rule-soft: rgba(255, 255, 255, 0.05);
}
html,
body {
display: block !important;
height: auto !important;
min-height: 100%;
overflow-x: hidden !important;
overflow-y: auto !important;
scroll-behavior: auto !important;
}
body {
background: linear-gradient(180deg, var(--bg-top) 0%, var(--bg-bottom) 100%) !important;
}
.gradio-container,
.gradio-container .main,
.gradio-container .wrap,
.gradio-container .contain,
.gradio-container [role="tabpanel"] {
overflow: visible !important;
max-height: none !important;
}
.gradio-container {
max-width: 1260px !important;
margin: 0 auto !important;
padding: 24px 24px 48px !important;
background: transparent !important;
color: var(--text-main) !important;
font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
}
.page-header {
text-align: center;
margin: 4px auto 26px;
}
.page-eyebrow {
color: var(--text-soft);
text-transform: uppercase;
letter-spacing: 0.16em;
font-size: 0.76rem;
margin-bottom: 12px;
}
.page-header h1 {
color: var(--text-main);
font-size: 2.35rem;
line-height: 1.1;
letter-spacing: -0.02em;
margin: 0;
font-weight: 650;
}
.page-header p {
max-width: 860px;
margin: 12px auto 0;
color: var(--text-muted);
font-size: 1rem;
line-height: 1.65;
}
.gradio-container .tab-nav {
border-bottom: 1px solid var(--rule) !important;
gap: 18px;
margin: 0 0 18px 0 !important;
}
.gradio-container .tab-nav button {
background: transparent !important;
border: none !important;
border-radius: 0 !important;
color: var(--text-soft) !important;
font-size: 0.8rem !important;
font-weight: 650 !important;
letter-spacing: 0.08em !important;
min-width: unset !important;
padding: 0 0 12px 0 !important;
text-transform: uppercase !important;
}
.gradio-container .tab-nav button.selected,
.gradio-container .tab-nav button[aria-selected="true"] {
box-shadow: inset 0 -2px 0 var(--accent) !important;
color: var(--text-main) !important;
}
.gradio-container .prose {
color: var(--text-muted) !important;
}
.gradio-container .prose h2 {
color: var(--text-main) !important;
font-size: 1.6rem !important;
font-weight: 600 !important;
margin: 1.65rem 0 0.4rem !important;
letter-spacing: -0.01em;
}
.gradio-container .prose h3 {
color: var(--text-main) !important;
font-size: 1.1rem !important;
font-weight: 600 !important;
margin: 1.1rem 0 0.4rem !important;
}
.gradio-container .prose p,
.gradio-container .prose li {
color: var(--text-muted) !important;
font-size: 0.98rem !important;
line-height: 1.65 !important;
}
.gradio-container .prose strong {
color: var(--text-main) !important;
}
.gradio-container .prose a,
.benchmark-link a {
color: var(--accent) !important;
text-decoration: none !important;
}
.section-note {
color: var(--text-soft);
font-size: 0.88rem;
margin-top: 8px;
}
.longform-copy p {
color: var(--text-muted);
font-size: 0.98rem;
line-height: 1.68;
margin: 0 0 10px 0;
}
.formula-line {
color: var(--text-main) !important;
font-weight: 600;
margin-top: 12px !important;
}
.formula-line.subdued {
color: var(--text-soft) !important;
font-weight: 500;
margin-top: -1px !important;
}
.table-shell {
width: 100%;
margin-top: 10px;
}
.table-scroll-shell {
overflow-x: auto;
overflow-y: visible;
}
.evaluation-suite-shell {
overflow: visible;
}
.leaderboard-shell {
overflow: visible;
}
.data-table {
width: 100%;
border-collapse: collapse;
border-spacing: 0;
}
.data-table thead th {
background: #2b2f34;
color: #d2d7dc;
font-size: 0.8rem;
font-weight: 650;
text-transform: uppercase;
letter-spacing: 0.04em;
text-align: left;
padding: 11px 12px;
}
.data-table tbody tr:nth-child(odd) td {
background: #1c2024;
}
.data-table tbody tr:nth-child(even) td {
background: #24282d;
}
.data-table td {
color: var(--text-main);
font-size: 0.94rem;
line-height: 1.45;
padding: 10px 12px;
vertical-align: top;
}
.evaluation-suite-table thead th {
padding: 9px 11px;
font-size: 0.78rem;
}
.evaluation-suite-table {
table-layout: fixed;
}
.evaluation-suite-table td {
padding: 7px 11px;
font-size: 0.9rem;
line-height: 1.3;
}
.evaluation-suite-category-col {
width: 8rem;
}
.evaluation-suite-benchmark-col {
width: 12.5rem;
}
.evaluation-suite-weight-col {
width: 5ch;
}
.evaluation-suite-table .weight-column,
.evaluation-suite-table .weight-cell {
font-variant-numeric: tabular-nums;
max-width: 5ch;
min-width: 5ch;
text-align: center;
white-space: nowrap;
width: 5ch;
padding-left: 4px;
padding-right: 4px;
}
.evaluation-suite-table .category-cell {
color: var(--text-soft);
font-size: 0.79rem;
font-weight: 650;
text-transform: uppercase;
letter-spacing: 0.06em;
vertical-align: top;
min-width: 8rem;
}
.evaluation-suite-table .benchmark-cell,
.evaluation-suite-table .description-cell {
overflow-wrap: anywhere;
word-break: normal;
}
.evaluation-suite-table .benchmark-cell {
width: 12.5rem;
}
.evaluation-suite-table .description-cell {
min-width: 0;
}
.leaderboard-table tbody tr:first-child td {
background: #252d1d;
}
.data-table tbody tr:hover td {
background: #30353a;
}
.gradio-accordion {
background: transparent !important;
border: none !important;
box-shadow: none !important;
margin-bottom: 6px !important;
}
.gradio-accordion > .label-wrap {
background: #23272c !important;
color: var(--text-main) !important;
border: none !important;
border-radius: 8px !important;
padding: 0.75rem 0.9rem !important;
}
.gradio-accordion > .label-wrap:hover {
background: #2a2f34 !important;
}
.benchmark-section {
margin-top: 18px;
}
.benchmark-section h3 {
color: var(--text-main);
font-size: 1.15rem;
font-weight: 600;
margin: 0 0 8px 0;
}
.benchmark-entry {
padding: 12px 0;
border-bottom: 1px solid var(--rule-soft);
}
.benchmark-entry-top {
display: flex;
align-items: baseline;
justify-content: space-between;
gap: 12px;
flex-wrap: wrap;
}
.benchmark-entry-top h4 {
margin: 0;
color: var(--text-main);
font-size: 1rem;
font-weight: 600;
}
.benchmark-weight {
color: var(--accent);
font-size: 0.84rem;
white-space: nowrap;
}
.benchmark-description {
color: var(--text-soft) !important;
margin: 6px 0 6px 0 !important;
}
.benchmark-entry p {
color: var(--text-muted);
line-height: 1.62;
margin: 0;
}
.benchmark-link {
margin-top: 7px;
font-size: 0.86rem;
}
.grm-bench-section {
border-top: 1px solid var(--rule-soft);
margin-top: 24px;
padding-top: 18px;
}
.grm-bench-section:first-of-type {
margin-top: 16px;
}
.grm-bench-kicker,
.grm-bench-subtitle {
color: var(--text-soft);
font-size: 0.78rem;
font-weight: 650;
letter-spacing: 0.08em;
text-transform: uppercase;
}
.grm-bench-section h2 {
color: var(--text-main);
font-size: 1.32rem;
font-weight: 620;
letter-spacing: -0.01em;
margin: 4px 0 10px 0;
}
.grm-bench-section p {
color: var(--text-muted);
font-size: 0.97rem;
line-height: 1.66;
margin: 0 0 10px 0;
}
.grm-bench-subtitle {
margin: 14px 0 6px 0;
}
.grm-bench-sample {
background: #171b1f;
border: 1px solid var(--rule);
border-radius: 10px;
margin-top: 12px;
padding: 14px 16px;
}
.grm-bench-sample-id {
color: var(--text-main);
font-size: 0.84rem;
font-weight: 700;
letter-spacing: 0.08em;
text-transform: uppercase;
}
.grm-bench-sample-meta {
margin: 6px 0 0 0 !important;
}
.grm-bench-sample-label {
color: var(--text-main);
font-weight: 600;
}
.grm-bench-sample-code {
background: #0f1215;
border: 1px solid var(--rule-soft);
border-radius: 8px;
color: #d7dde3;
font-family: Consolas, "SFMono-Regular", monospace;
font-size: 0.84rem;
line-height: 1.55;
margin: 12px 0 0 0;
overflow-x: auto;
padding: 12px 14px;
white-space: pre-wrap;
}
.grm-bench-sample-code code {
font-family: inherit;
}
.grm-bench-scope-table th:first-child,
.grm-bench-scope-table td:first-child {
min-width: 180px;
width: 180px;
}
@media (max-width: 720px) {
.gradio-container {
padding: 20px 14px 40px !important;
}
.page-header h1 {
font-size: 2rem;
}
.data-table thead th,
.data-table td {
padding: 10px 9px;
}
.leaderboard-shell {
overflow-x: auto;
overflow-y: visible;
}
.evaluation-suite-shell {
overflow-x: auto;
overflow-y: visible;
}
.evaluation-suite-table {
min-width: 38rem;
}
}
"""
# Keyword arguments for gr.Blocks. On Gradio versions before 6 the theme and
# CSS are supplied here; on 6 and later they are supplied to demo.launch().
blocks_kwargs = {"title": "GRM Score - Game Ready Leaderboard"}
if GRADIO_MAJOR_VERSION < 6:
    blocks_kwargs["theme"] = gr.themes.Base()
    blocks_kwargs["css"] = CUSTOM_CSS
# UI layout: a header followed by two tabs (leaderboard + methodology, GRM-Bench).
# NOTE(review): nesting below was reconstructed from a listing that had lost its
# indentation - confirm against the deployed file.
with gr.Blocks(**blocks_kwargs) as demo:
    gr.HTML(HEADER_HTML)
    with gr.Tabs():
        with gr.Tab("Game Ready Leaderboard"):
            gr.Markdown("## Overview")
            gr.HTML(OVERVIEW_HTML)
            gr.Markdown("## Leaderboard")
            gr.Markdown(
                "The leaderboard now sits directly after the overview so rankings are visible before the deeper methodology sections."
            )
            show_proprietary_models = gr.Checkbox(label="Show proprietary models", value=True)
            gr.HTML(
                "<div class=\"section-note\">Turn this off to switch the ranking and score breakdowns to an open-source-only view.</div>"
            )
            leaderboard_html = gr.HTML(build_leaderboard_html())
            gr.HTML("<div class=\"section-note\">Placeholder data for layout validation. Replace with real benchmark outputs when ready.</div>")
            gr.Markdown("## Per-Benchmark Score Breakdown")
            gr.Markdown("Expand a category to inspect the individual benchmark scores backing the leaderboard.")
            # One collapsed accordion per category; the HTML components are kept
            # so the checkbox handler can refresh them.
            category_score_tables = []
            for category in CATEGORIES:
                with gr.Accordion(f"{CATEGORY_DISPLAY[category]} benchmark scores", open=False):
                    category_score_tables.append(gr.HTML(build_category_score_table_html(category)))
            # Toggling the checkbox re-renders the leaderboard and every category
            # table; the outputs order matches update_leaderboard_tables' result.
            show_proprietary_models.change(
                fn=update_leaderboard_tables,
                inputs=show_proprietary_models,
                outputs=[leaderboard_html, *category_score_tables],
            )
            gr.Markdown("## Evaluation Suite")
            gr.Markdown(
                "Benchmarks are grouped into fused category cells so the suite reads more like a methodology table than a generic spreadsheet."
            )
            gr.HTML(build_evaluation_suite_html())
            gr.Markdown("## Benchmark Details")
            gr.Markdown(
                "Detailed summaries of each benchmark in the evaluation suite, grouped by category."
            )
            gr.HTML(build_benchmark_details_html())
        with gr.Tab("GRM-Bench"):
            gr.Markdown("## GRM-Bench")
            gr.Markdown(
                "Nvidia-authored benchmark families targeting in-house game interaction failure modes and evaluation surfaces."
            )
            gr.HTML(build_grm_bench_html())
if __name__ == "__main__":
    # On Gradio 6 and newer the theme and CSS are handed to launch(); older
    # versions already received them through blocks_kwargs.
    launch_kwargs = {}
    if GRADIO_MAJOR_VERSION >= 6:
        launch_kwargs["theme"] = gr.themes.Base()
        launch_kwargs["css"] = CUSTOM_CSS
    demo.launch(**launch_kwargs)