Spaces:

nvidia
/

GRM

Running on CPU Upgrade

App Files Files Community

Updated board

by mbagdasarova-nvidia - opened Apr 14

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+1701

-1214

Files changed (22) hide show

.gitattributes +0 -35
.gitignore +0 -13
.pre-commit-config.yaml +0 -53
Makefile +0 -13
README.md +102 -43
app.py +919 -200
benchmarks.py +385 -0
pyproject.toml +0 -13
ref/Coherence_Summary +84 -0
ref/Overview +9 -0
requirements.txt +1 -16
scores.py +135 -0
scoring.py +66 -0
src/about.py +0 -89
src/display/css_html_js.py +0 -105
src/display/formatting.py +0 -27
src/display/utils.py +0 -110
src/envs.py +0 -25
src/leaderboard/read_evals.py +0 -196
src/populate.py +0 -58
src/submission/check_validity.py +0 -99
src/submission/submit.py +0 -119

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,13 +0,0 @@
-auto_evals/
-venv/
-__pycache__/
-.env
-.ipynb_checkpoints
-*ipynb
-.vscode/
-eval-queue/
-eval-results/
-eval-queue-bk/
-eval-results-bk/
-logs/

.pre-commit-config.yaml DELETED Viewed

@@ -1,53 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-default_language_version:
-  python: python3
-ci:
-  autofix_prs: true
-  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
-  autoupdate_schedule: quarterly
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-      - id: check-yaml
-      - id: check-case-conflict
-      - id: detect-private-key
-      - id: check-added-large-files
-        args: ['--maxkb=1000']
-      - id: requirements-txt-fixer
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        name: Format imports
-  - repo: https://github.com/psf/black
-    rev: 22.12.0
-    hooks:
-      - id: black
-        name: Format code
-        additional_dependencies: ['click==8.0.2']
-  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    # Ruff version.
-    rev: 'v0.0.267'
-    hooks:
-      - id: ruff

Makefile DELETED Viewed

@@ -1,13 +0,0 @@
-.PHONY: style format
-style:
-	python -m black --line-length 119 .
-	python -m isort .
-	ruff check --fix .
-quality:
-	python -m black --check --line-length 119 .
-	python -m isort --check-only .
-	ruff check .

README.md CHANGED Viewed

@@ -1,48 +1,107 @@
 ---
-title: GRM
-emoji: 🥇
-colorFrom: green
-colorTo: indigo
 sdk: gradio
 app_file: app.py
-pinned: true
-license: unknown
-short_description: GRM
-sdk_version: 5.43.1
-tags:
-- leaderboard
 ---
-# Start the configuration
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-Request files are created automatically by this tool.
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-# Code logic for more complex edits
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

 ---
+title: GRM Leaderboard
+colorFrom: gray
+colorTo: blue
 sdk: gradio
+sdk_version: "5.23.0"
 app_file: app.py
+pinned: false
 ---
+# GRM Leaderboard
+Static Gradio Space for comparing language models on a game-focused evaluation suite.
+## What This Project Is
+- Single-repo Hugging Face Space
+- Frontend-only app with no database or external backend
+- Static benchmark registry and static model score data stored in Python files
+- Runtime leaderboard calculation from those local data files
+## Runtime
+- Platform: Hugging Face Spaces
+- UI framework: Gradio
+- Entry point: app.py
+- Dependencies: requirements.txt
+- Space metadata: this README frontmatter
+- app.py includes compatibility handling for both Gradio 5 and Gradio 6
+## Main Page Flow
+Tab 1:
+- Overview
+- Leaderboard
+- Per-Benchmark Score Breakdown
+- Evaluation Suite
+- Benchmark Details
+Tab 2:
+- GRM-Bench authored benchmark families
+## File Ownership
+- app.py: page layout, tabs, CSS, overview copy, table rendering, and GRM-Bench authored sections
+- benchmarks.py: benchmark registry, category assignments, descriptions, summaries, and weights
+- scores.py: per-model benchmark scores on a 0.0 to 1.0 scale
+- scoring.py: category scoring, GRM score calculation, and ranking logic
+- requirements.txt: runtime dependencies
+- README.md: Space metadata and maintainer handoff notes
+## Data Model
+- benchmarks.py stores BENCHMARKS as a list of dicts
+- Each benchmark entry includes: name, category, calc_weight, description, summary, paper
+- Valid categories are ROLEPLAY, ACTIONS, and GENERAL
+- scores.py stores MODEL_SCORES keyed by model display name
+- Each model score dict is keyed by benchmark name
+- Missing scores are skipped during weighted averaging
+## Scoring
+- Category score = sum(score x weight) / sum(weight)
+- GRM score = average of Roleplay, Actions, and General category scores
+- scores.py values stay on a 0.0 to 1.0 scale
+- Displayed leaderboard values are converted to 0 to 100
+## How To Update The Site
+Update model scores:
+- Edit scores.py
+- Change benchmark values for an existing model or add a new model block
+Update evaluation suite rows or benchmark descriptions:
+- Edit benchmarks.py
+- The evaluation table and benchmark detail sections are generated from this registry
+Add a new benchmark:
+- Add the benchmark entry to benchmarks.py
+- Set its category and calc_weight
+- Add corresponding values in scores.py for each model you want included
+Update the authored GRM-Bench tab:
+- Edit GRM_BENCH_SECTIONS in app.py
+Update page structure, copy, or styling:
+- Edit app.py
+## Local Development
+- Install dependencies: pip install -r requirements.txt
+- Run locally: python app.py
+- The app launches a local Gradio server using the same static content as the Space
+## Deployment Notes
+- The live Space deploys from the remote main branch
+- README frontmatter controls the Space runtime metadata
+- requirements.txt must match imports used by app.py
+- Current scores in scores.py are placeholder/static values unless replaced with real outputs
+## Maintenance Notes
+- The UI uses Python-generated HTML tables, not Gradio Dataframes
+- Leaderboard order is recalculated on each launch from scores.py
+- Gradio theme and CSS are injected conditionally based on the installed Gradio major version
+- If page scrolling behaves oddly, inspect the root overflow and flex overrides in app.py

app.py CHANGED Viewed

@@ -1,213 +1,932 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    ABOUT_TEXT,
-    PERFORMANCE_TEXT,
-    METRICS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
         ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
     )
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("Metrics", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("Performance", elem_id="llm-benchmark-tab-table", id=3):
-            gr.Markdown(PERFORMANCE_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("Request New Model", elem_id="llm-benchmark-tab-table", id=4):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
             )
-        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=5):
-            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
             )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)

+"""Nvidia Game Ready Model Score (GRM Score) Gradio app."""
+from html import escape
+from pathlib import Path
 import gradio as gr
+from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category
+from scores import MODEL_SCORES
+from scoring import build_leaderboard
+LEADERBOARD_COLUMNS = [  # leaderboard column labels; same names and order as the row keys read in get_leaderboard_rows
+    "Rank",
+    "Model",
+    "GRM Score",
+    "Roleplay (33%)",
+    "Actions (33%)",
+    "General (33%)",
+]
+GRADIO_MAJOR_VERSION = int(gr.__version__.split(".", 1)[0])  # integer before the first "." of the installed Gradio version string
+APP_ROOT = Path(__file__).resolve().parent  # absolute directory that contains this module
+REF_ROOT = APP_ROOT / "ref"  # directory that _read_reference_file reads reference text files from
+DEFAULT_OVERVIEW_BLOCKS = [
+    "Nvidia Game Ready Model Score (GRM) is an aggregated quality metric designed to assess LLM capabilities in gaming use cases.",
+    "General state-of-the-art language models are optimized for broad benchmarks such as math, code, and general knowledge. That does not reliably translate to in-game performance, and it does not reliably predict NPC quality, gameplay actions, or immersion.",
+    "With game model evaluation, game developers can accelerate AI integration pipelines by reducing time spent on model evaluation and narrowing model choice earlier. The overall score is the average of Roleplay, Actions, and General, while benchmarks inside each category are combined with weighted averaging using core weights of 1.0 and supplementary weights of 0.5.",
+    "GRM Score = (Roleplay + Actions + General) / 3",
+    "Category Score = sum(score x weight) / sum(weight)",
+]
+PROPRIETARY_MODELS = frozenset({"GPT-5.4", "Gemini 2.5 Pro"})  # model names that _include_model drops when include_proprietary is False
+BASE_GRM_BENCH_SECTIONS = [
+    {
+        "title": "Coherence",
+        "summary": [
+            "Above all other failure modes that break immersion in character and NPC interactions are responses that feel illogical, inconsistent, or irrelevant to the active game state.",
+            "Incoherence can surface as hallucinated details, role confusion, contradictions across turns, or answers that stop tracking the subject under discussion.",
+        ],
+        "methodology": (
+            "Because coherence can fail in many different ways, the authored scenarios are designed to trigger a common failure mode and then measure whether the model stays grounded under pressure."
+        ),
+        "scope": [
+            [
+                "Factual / Logical",
+                "Objectively false or contradicted by the system prompt or game state, including invented entities, rules, or details.",
+            ],
+            ["Cause / Effect", "Fails simple logical state transitions or obvious state changes."],
+            [
+                "Contradiction",
+                "Contradicts something previously said or done without an in-world justification.",
+            ],
+            [
+                "Personality / Background Violation",
+                "Violates an established trait, limitation, or background fact.",
+            ],
+            [
+                "Role Confusion",
+                "Confuses identities, facts, actions, or motivations across entities.",
+            ],
+            [
+                "Irrelevance",
+                "Stops tracking the active subject or responds in a way that is not relevant to the discussion.",
+            ],
+            [
+                "Knowledge Boundary",
+                "Invents knowledge the character cannot have instead of separating observation from speculation.",
+            ],
+            [
+                "False Premise",
+                "Incorrectly accepts a smuggled-in user premise about something that never happened.",
+            ],
+        ],
+        "samples": (
+            "Representative cases include long multi-turn identity-confusion exchanges and hidden-information prompts where the character must avoid inventing unseen facts."
+        ),
+    },
+    {
+        "title": "Response Diversity",
+        "summary": [
+            "Response Diversity measures whether a model stays engaging without collapsing into repetitive wording, sentence structure, or stock phrasing across similar prompts and multi-turn play.",
+            "The goal is not randomness. The goal is controlled variation that still preserves the correct task intent, tone, and world state.",
+        ],
+        "methodology": (
+            "Equivalent requests are expressed across repeated turns and neighboring scenarios so the evaluation can separate healthy consistency from repetitive degeneration."
         ),
+        "scope": [
+            ["Repetition Loop", "Repeats phrases, clauses, or sentence frames across adjacent responses."],
+            ["Lexical Compression", "Collapses to a narrow vocabulary even when there is room for variation."],
+            ["Originality Failure", "Paraphrases the prompt too literally instead of producing fresh in-world language."],
+            ["Near-Duplicate Continuation", "Makes only superficial wording changes while repeating the same response content."],
+            ["Style Stagnation", "Cannot vary tone or delivery while preserving the same underlying instruction."],
         ],
+    },
+    {
+        "title": "Tool Recovery",
+        "summary": [
+            "Tool Recovery evaluates whether the model can recognize a failed tool step, repair the plan, and continue without fabricating results.",
+            "This matters for assistants that need to survive partial failures instead of derailing the whole interaction after one bad tool call.",
+        ],
+        "methodology": (
+            "Benchmarks inject missing tool calls, malformed arguments, or explicit tool failures and then measure whether the model retries correctly, replans, or asks for the right follow-up."
+        ),
+        "scope": [
+            ["Missed Invocation", "Fails to issue a required tool call at all."],
+            ["Malformed Retry", "Attempts recovery with incomplete or invalid tool arguments."],
+            ["Fabricated Output", "Invents tool output after a failure instead of acknowledging the error."],
+            ["Recovery Sequencing", "Does not replan correctly after a tool error or partial result."],
+            ["Silent Drop", "Continues as if the failed tool step never mattered to the task."],
+        ],
+    },
+    {
+        "title": "Context Adaptation",
+        "summary": [
+            "Context Adaptation measures whether a model tracks a changing world state without letting values, locations, inventories, or statuses drift across turns.",
+            "These tests target dynamic sessions where the model must stay synchronized with the newest state while preserving earlier facts that still remain true.",
+        ],
+        "methodology": (
+            "Stateful scenarios update facts mid-conversation and require the model to carry forward the latest values while also keeping dependent details accurate."
+        ),
+        "scope": [
+            ["State Drift", "Values change without cause as the conversation continues."],
+            ["Temporal Mismatch", "Old state is treated as current after a newer update is provided."],
+            ["Entity Attribute Drift", "Names, inventory, location, or status details mutate incorrectly."],
+            ["Partial Update Failure", "One field is updated but dependent fields are left stale."],
+            ["Conflict Resolution", "Cannot reconcile new information with earlier context in a coherent way."],
+        ],
+    },
+    {
+        "title": "Prompt Robustness",
+        "summary": [
+            "Prompt Robustness checks whether the same underlying intent is handled reliably across terse prompts, verbose instructions, structured payloads, and mixed formatting.",
+            "A model should not need one exact prompt style in order to understand the task, infer the right tool path, or preserve the requested output behavior.",
+        ],
+        "methodology": (
+            "Equivalent requests are expressed in long-form prose, shorthand, JSON, XML, and other wrappers to measure sensitivity to presentation rather than intent."
+        ),
+        "scope": [
+            ["Format Sensitivity", "Succeeds in plain prose but fails when the request is wrapped in JSON, XML, or other structure."],
+            ["Instruction Alias Failure", "Equivalent wording changes alter behavior more than they should."],
+            ["Verbosity Dependency", "Requires unusually long prompting to perform a task it should infer directly."],
+            ["Tool Intent Drift", "Misses the right tool plan when the same task is phrased in a different form."],
+            ["Structure Overfitting", "Responds too literally to markup or formatting instead of following the underlying request."],
+        ],
+    },
+]
+def _read_reference_file(name: str) -> str | None:  # Return the stripped UTF-8 text of REF_ROOT / name, or None when the file cannot be read.
+    try:
+        return (REF_ROOT / name).read_text(encoding="utf-8").strip()
+    except OSError:  # NOTE(review): only I/O failures are caught; UnicodeDecodeError is a ValueError and would still propagate
+        return None
+def _split_reference_blocks(text: str) -> list[str]:  # Split text into paragraphs, each collapsed onto a single line.
+    blocks = []
+    for chunk in text.split("\n\n"):  # paragraphs are separated by an exactly empty line
+        block = " ".join(line.strip() for line in chunk.splitlines() if line.strip())  # join the non-blank lines of the chunk with single spaces
+        if block:  # chunks that contain only whitespace produce no block
+            blocks.append(block)
+    return blocks
+def _build_overview_html(blocks: list[str]) -> str:  # Render overview paragraphs as HTML-escaped <p> elements inside one wrapper <div>.
+    parts = ['<div class="longform-copy">']
+    title_prefix = "Nvidia Game Ready Model Score (GRM)"
+    for block in blocks:
+        if block.startswith("GRM Score ="):  # formula paragraphs get their own CSS class
+            parts.append(f'<p class="formula-line">{escape(block)}</p>')
+            continue
+        if block.startswith("Category Score ="):  # second formula also carries the "subdued" class
+            parts.append(f'<p class="formula-line subdued">{escape(block)}</p>')
+            continue
+        if block.startswith(title_prefix):  # wrap the leading product name in <strong>, leave the rest of the paragraph plain
+            suffix = block[len(title_prefix) :]
+            parts.append(f"<p><strong>{escape(title_prefix)}</strong>{escape(suffix)}</p>")
+            continue
+        parts.append(f"<p>{escape(block)}</p>")  # any other paragraph is emitted as a plain escaped <p>
+    parts.append("</div>")
+    return "".join(parts)
+def _load_overview_html() -> str:  # Build the overview HTML from the "Overview" reference file, or from the built-in default paragraphs.
+    text = _read_reference_file("Overview")
+    blocks = _split_reference_blocks(text) if text else DEFAULT_OVERVIEW_BLOCKS  # None (unreadable) and "" (empty file) both select the defaults
+    return _build_overview_html(blocks)
+def _load_coherence_section() -> dict | None:  # Parse the "Coherence_Summary" reference file into a section dict; None when the file is unreadable or lacks required parts.
+    text = _read_reference_file("Coherence_Summary")
+    if not text:  # unreadable or empty file
+        return None
+    lines = text.splitlines()
+    index = 0  # cursor into lines; each stage below advances it past the part it consumed
+    while index < len(lines) and not lines[index].strip():  # skip leading blank lines
+        index += 1
+    if index >= len(lines):
+        return None
+    title = lines[index].strip()  # first non-blank line is the section title
+    index += 1
+    summary_lines = []
+    while index < len(lines) and lines[index].strip() != "Test Methodology":  # summary: non-blank lines up to the "Test Methodology" heading
+        if lines[index].strip():
+            summary_lines.append(lines[index].strip())
+        index += 1
+    if index >= len(lines):  # "Test Methodology" heading never found
+        return None
+    index += 1  # step over the heading line
+    methodology_lines = []
+    while index < len(lines) and lines[index].strip() != "Detection Scope:":  # methodology: non-blank lines up to the "Detection Scope:" heading
+        if lines[index].strip():
+            methodology_lines.append(lines[index].strip())
+        index += 1
+    if index >= len(lines):  # "Detection Scope:" heading never found
+        return None
+    index += 1  # step over the heading line
+    scope = []
+    while index < len(lines) and lines[index].strip() != "Test Samples":  # scope rows run until the "Test Samples" heading or end of file
+        line = lines[index].strip()
+        if line:
+            category, _, description = line.partition(" - ")  # "Category - description"; without the separator the description is ""
+            scope.append([category.strip(), description.strip()])
+        index += 1
+    samples = []  # stays empty when the file has no "Test Samples" heading
+    if index < len(lines) and lines[index].strip() == "Test Samples":
+        index += 1
+        while index < len(lines):  # one iteration per sample, or per skipped stray line
+            while index < len(lines) and not lines[index].strip():  # skip blank lines between samples
+                index += 1
+            if index >= len(lines):
+                break
+            if not lines[index].strip().startswith("TEST_"):  # ignore lines before the next "TEST_" header
+                index += 1
+                continue
+            sample_id = lines[index].strip()  # the whole "TEST_..." header line is used as the id
+            index += 1
+            metadata = []
+            code_lines = []
+            while index < len(lines):  # consume this sample's body
+                line = lines[index]
+                stripped = line.strip()
+                if stripped.startswith("TEST_"):  # next sample header ends the current sample
+                    break
+                if stripped == "Messages:":  # everything after "Messages:" up to the next header is kept as the sample's code text
+                    index += 1
+                    while index < len(lines) and not lines[index].strip().startswith("TEST_"):
+                        code_lines.append(lines[index].rstrip())  # rstrip only, so leading indentation is preserved
+                        index += 1
+                    break
+                if stripped and ":" in stripped:  # "Label: value" metadata line, split on the first colon
+                    label, value = stripped.split(":", 1)
+                    metadata.append([label.strip(), value.strip()])
+                index += 1
+            samples.append(
+                {
+                    "id": sample_id,
+                    "metadata": metadata,
+                    "code": "\n".join(code_lines).strip(),
+                }
+            )
+    if not summary_lines or not methodology_lines or not scope:  # summary, methodology and scope are required; samples are optional
+        return None
+    return {
+        "title": title,
+        "summary": [" ".join(summary_lines)],  # single-element list holding the joined summary text
+        "methodology": " ".join(methodology_lines),
+        "scope": scope,
+        "samples": samples,  # NOTE(review): list of dicts here, but the built-in Coherence entry stores "samples" as a plain string; confirm the renderer handles both
+    }
+def _load_grm_bench_sections() -> list[dict]:  # Return the built-in sections, with the first one replaced by the file-based Coherence section when it parses.
+    sections = list(BASE_GRM_BENCH_SECTIONS)  # shallow copy so BASE_GRM_BENCH_SECTIONS itself is not modified
+    coherence_section = _load_coherence_section()
+    if coherence_section is not None:
+        sections[0] = coherence_section  # index 0 is the "Coherence" entry in BASE_GRM_BENCH_SECTIONS
+    return sections
+GRM_BENCH_SECTIONS = _load_grm_bench_sections()  # evaluated once, when the module is imported
+def _fmt(value: float | None) -> str:  # Format a score with one decimal place; None is rendered as "-".
+    return f"{value:.1f}" if value is not None else "-"
+def _fmt_weight(value: float) -> str:  # Format a weight with two decimal places.
+    return f"{value:.2f}"
+def _include_model(model_name: str, include_proprietary: bool) -> bool:  # True unless proprietary models are excluded and model_name is in PROPRIETARY_MODELS.
+    return include_proprietary or model_name not in PROPRIETARY_MODELS
+def build_html_table(  # Render headers and rows as an HTML <table> wrapped in a <div> shell; returns the markup as one string.
+    headers: list[str],
+    rows: list[list[str]],
+    table_class: str = "",
+    shell_class: str = "table-scroll-shell",
+) -> str:
+    class_attr = f' class="data-table {table_class}"' if table_class else ' class="data-table"'  # NOTE(review): table_class is inserted unescaped; callers should pass trusted class names
+    shell_classes = " ".join(part for part in ["table-shell", shell_class] if part)  # an empty shell_class leaves just "table-shell"; also inserted unescaped
+    parts = [f'<div class="{shell_classes}">', f"<table{class_attr}>", "<thead><tr>"]
+    for header in headers:
+        parts.append(f"<th>{escape(header)}</th>")  # header text is HTML-escaped
+    parts.append("</tr></thead><tbody>")
+    for row in rows:
+        parts.append("<tr>")
+        for cell in row:
+            parts.append(f"<td>{escape(str(cell))}</td>")  # cells are converted with str() and HTML-escaped
+        parts.append("</tr>")
+    parts.append("</tbody></table></div>")
+    return "".join(parts)
+def get_leaderboard_entries(include_proprietary: bool = True) -> list[dict]:
+    entries = []
+    for row in build_leaderboard():
+        if _include_model(row["Model"], include_proprietary):
+            row_entry = dict(row)
+            row_entry["Rank"] = len(entries) + 1
+            entries.append(row_entry)
+    return entries
+def get_leaderboard_rows(include_proprietary: bool = True) -> list[list[str]]:
+    rows = []
+    for row in get_leaderboard_entries(include_proprietary):
+        rows.append(
+            [
+                str(row["Rank"]),
+                row["Model"],
+                _fmt(row["GRM Score"]),
+                _fmt(row["Roleplay (33%)"]),
+                _fmt(row["Actions (33%)"]),
+                _fmt(row["General (33%)"]),
+            ]
+        )
+    return rows
+def get_ranked_model_names(include_proprietary: bool = True) -> list[str]:
+    return [row["Model"] for row in get_leaderboard_entries(include_proprietary)]
+def build_evaluation_suite_html() -> str:
+    parts = [
+        '<div class="table-shell evaluation-suite-shell">',
+        "<table class=\"data-table evaluation-suite-table\">",
+        "<colgroup>",
+        '<col class="evaluation-suite-category-col">',
+        '<col class="evaluation-suite-benchmark-col">',
+        '<col class="evaluation-suite-description-col">',
+        '<col class="evaluation-suite-weight-col">',
+        "</colgroup>",
+        "<thead><tr>",
+        "<th>Category</th>",
+        "<th>Benchmark</th>",
+        "<th>Description</th>",
+        '<th class="weight-column" title="Weight">Wt.</th>',
+        "</tr></thead><tbody>",
+    ]
+    for category in CATEGORIES:
+        benchmarks = get_benchmarks_by_category(category)
+        rowspan = len(benchmarks)
+        for index, benchmark in enumerate(benchmarks):
+            parts.append("<tr>")
+            if index == 0:
+                parts.append(
+                    f'<td class="category-cell" rowspan="{rowspan}">{escape(CATEGORY_DISPLAY[category])}</td>'
+                )
+            parts.append(f'<td class="benchmark-cell">{escape(benchmark["name"])}</td>')
+            parts.append(f'<td class="description-cell">{escape(benchmark["description"])}</td>')
+            parts.append(f'<td class="weight-cell">{_fmt_weight(benchmark["calc_weight"])}</td>')
+            parts.append("</tr>")
+    parts.append("</tbody></table></div>")
+    return "".join(parts)
+def build_leaderboard_html(include_proprietary: bool = True) -> str:
+    return build_html_table(
+        LEADERBOARD_COLUMNS,
+        get_leaderboard_rows(include_proprietary),
+        table_class="leaderboard-table",
+        shell_class="leaderboard-shell",
+    )
+def build_category_score_table_html(category: str, include_proprietary: bool = True) -> str:
+    benchmark_names = [benchmark["name"] for benchmark in get_benchmarks_by_category(category)]
+    rows = []
+    for model in get_ranked_model_names(include_proprietary):
+        row = [model]
+        for benchmark_name in benchmark_names:
+            score = MODEL_SCORES[model].get(benchmark_name)
+            row.append(f"{score * 100:.1f}" if score is not None else "-")
+        rows.append(row)
+    return build_html_table(["Model"] + benchmark_names, rows, table_class="category-score-table")
+def update_leaderboard_tables(include_proprietary: bool) -> list[str]:
+    outputs = [build_leaderboard_html(include_proprietary)]
+    for category in CATEGORIES:
+        outputs.append(build_category_score_table_html(category, include_proprietary))
+    return outputs
+def build_benchmark_details_html() -> str:
+    parts = []
+    for category in CATEGORIES:
+        parts.append(
+            "<section class=\"benchmark-section\">"
+            f"<h3>{escape(CATEGORY_DISPLAY[category])}</h3>"
+        )
+        for benchmark in get_benchmarks_by_category(category):
+            weight_label = "Core" if benchmark["calc_weight"] == 1.0 else "Supplementary"
+            paper_html = ""
+            if benchmark.get("paper"):
+                paper_html = (
+                    "<div class=\"benchmark-link\">"
+                    f"<a href=\"{escape(benchmark['paper'])}\" target=\"_blank\" rel=\"noreferrer\">"
+                    "Paper / Source"
+                    "</a>"
+                    "</div>"
+                )
+            parts.append(
+                "<article class=\"benchmark-entry\">"
+                "<div class=\"benchmark-entry-top\">"
+                f"<h4>{escape(benchmark['name'])}</h4>"
+                f"<span class=\"benchmark-weight\">{weight_label} · {benchmark['calc_weight']}</span>"
+                "</div>"
+                f"<p class=\"benchmark-description\">{escape(benchmark['description'])}</p>"
+                f"<p>{escape(benchmark['summary'])}</p>"
+                f"{paper_html}"
+                "</article>"
+            )
+        parts.append("</section>")
+    return "".join(parts)
+def _build_grm_bench_sample_html(sample: dict) -> str:
+    parts = [
+        '<article class="grm-bench-sample">',
+        f'<div class="grm-bench-sample-id">{escape(sample["id"])}</div>',
+    ]
+    for label, value in sample.get("metadata", []):
+        parts.append(
+            '<p class="grm-bench-sample-meta">'
+            f'<span class="grm-bench-sample-label">{escape(label)}:</span> {escape(value)}'
+            "</p>"
+        )
+    if sample.get("code"):
+        parts.append('<pre class="grm-bench-sample-code"><code>')
+        parts.append(escape(sample["code"]))
+        parts.append("</code></pre>")
+    parts.append("</article>")
+    return "".join(parts)
+def build_grm_bench_section_html(section: dict) -> str:
+    parts = [
+        "<section class=\"grm-bench-section\">",
+        "<div class=\"grm-bench-kicker\">Nvidia-Authored Benchmark</div>",
+        f"<h2>{escape(section['title'])}</h2>",
+    ]
+    for paragraph in section["summary"]:
+        parts.append(f"<p>{escape(paragraph)}</p>")
+    parts.append("<div class=\"grm-bench-subtitle\">Test Methodology</div>")
+    parts.append(f"<p>{escape(section['methodology'])}</p>")
+    parts.append("<div class=\"grm-bench-subtitle\">Detection Scope</div>")
+    parts.append(
+        build_html_table(["Category", "Description"], section["scope"], table_class="grm-bench-scope-table")
     )
+    samples = section.get("samples")
+    if samples:
+        parts.append("<div class=\"grm-bench-subtitle\">Representative Samples</div>")
+        if isinstance(samples, str):
+            parts.append(f"<p>{escape(samples)}</p>")
+        else:
+            for sample in samples:
+                parts.append(_build_grm_bench_sample_html(sample))
+    parts.append("</section>")
+    return "".join(parts)
+def build_grm_bench_html() -> str:
+    parts = [
+        "<div class=\"longform-copy\">",
+        "<p><strong>GRM-Bench</strong> is the in-house authored benchmark suite for game-facing assistants, companions, and NPC behaviors that are not well-covered by broad academic leaderboards.</p>",
+        "<p>The sections below describe the initial authored benchmark families and the concrete failure modes each family is designed to surface.</p>",
+        "</div>",
+    ]
+    for section in GRM_BENCH_SECTIONS:
+        parts.append(build_grm_bench_section_html(section))
+    return "".join(parts)
+# Static page banner; rendered once via gr.HTML before the tab container.
+HEADER_HTML = """
+<section class="page-header">
+  <div class="page-eyebrow">NVIDIA Game Ready Evaluation</div>
+  <h1>Game Ready Leaderboard</h1>
+  <p>
+        An open game model evaluation surface for comparing LLMs across roleplay, gameplay
+        actions, and practical in-game reasoning.
+  </p>
+</section>
+"""
+# Overview markup is computed once at import time by _load_overview_html().
+OVERVIEW_HTML = _load_overview_html()
+CUSTOM_CSS = """
+/* ── Palette ──────────────────────────────────────────────────────────── */
+:root {
+    --bg:             #181b1e;
+    --surface:        #1d2126;
+    --surface-raised: #24282d;
+    --surface-header: #2b2f34;
+    --border:         #3c4048;
+    --border-subtle:  #313539;
+    --text:           #c8cdd2;
+    --text-bright:    #f5f7f8;
+    --text-dim:       #a2a8ae;
+    --accent:         #76b900;
+    --accent-hover:   #8fd400;
+}
+/* ── Page ─────────────────────────────────────────────────────────────── */
+html, body {
+    display: block !important;
+    height: auto !important;
+    overflow-y: auto !important;
+}
+body { background: var(--bg) !important; }
+/* ── Gradio container ─────────────────────────────────────────────────── */
+.gradio-container {
+    max-width: 1260px !important;
+    margin: 0 auto !important;
+    padding: 24px 24px 48px !important;
+    background: transparent !important;
+    color: var(--text) !important;
+    font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
+}
+/* ── Tabs ─────────────────────────────────────────────────────────────── */
+.gradio-container .tab-nav {
+    border-bottom: 1px solid var(--border) !important;
+    gap: 18px;
+    margin: 0 0 18px 0 !important;
+}
+.gradio-container .tab-nav button {
+    background: transparent !important;
+    border: none !important;
+    border-radius: 0 !important;
+    color: var(--text-dim) !important;
+    font-size: 0.8rem !important;
+    font-weight: 650 !important;
+    letter-spacing: 0.08em !important;
+    padding: 0 0 12px 0 !important;
+    text-transform: uppercase !important;
+}
+.gradio-container .tab-nav button.selected,
+.gradio-container .tab-nav button[aria-selected="true"] {
+    box-shadow: inset 0 -2px 0 var(--accent) !important;
+    color: var(--text-bright) !important;
+}
+/* ── Gradio Markdown / Prose ──────────────────────────────────────────── */
+.gradio-container .prose     { color: var(--text) !important; background: transparent !important; }
+.gradio-container .prose h2  { color: var(--text-bright) !important; font-size: 1.55rem !important; font-weight: 600 !important; }
+.gradio-container .prose h3  { color: var(--text-bright) !important; font-size: 1.1rem  !important; font-weight: 600 !important; }
+.gradio-container .prose p,
+.gradio-container .prose li  { color: var(--text) !important; font-size: 1.05rem !important; line-height: 1.7 !important; }
+.gradio-container .prose strong { color: var(--text-bright) !important; }
+.gradio-container .prose a   { color: var(--accent) !important; text-decoration: none !important; }
+/* ── Page header ──────────────────────────────────────────────────────── */
+.page-header   { text-align: center; margin: 4px auto 26px; }
+.page-eyebrow  { color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.16em; font-size: 0.76rem; margin-bottom: 12px; }
+.page-header h1 { color: var(--text-bright); font-size: 2.35rem; line-height: 1.1; letter-spacing: -0.02em; margin: 0; font-weight: 650; }
+.page-header p  { max-width: 860px; margin: 12px auto 0; color: var(--text); font-size: 1rem; line-height: 1.65; }
+/* ── Overview copy ────────────────────────────────────────────────────── */
+.section-note      { color: var(--text-dim); font-size: 0.88rem; margin-top: 8px; }
+.longform-copy p   { color: var(--text); font-size: 1.05rem; line-height: 1.7; margin: 0 0 10px 0; }
+.formula-line      { color: var(--text-bright) !important; font-weight: 600; margin-top: 12px !important; }
+.formula-line.subdued { color: var(--text-dim) !important; font-weight: 500; }
+/* ── Custom HTML tables ───────────────────────────────────────────────── */
+.table-shell        { width: 100%; margin-top: 10px; }
+.table-scroll-shell { overflow-x: auto; }
+.leaderboard-shell,
+.evaluation-suite-shell { overflow-x: auto; }
+.data-table { width: 100%; border-collapse: collapse; }
+.data-table thead th {
+    background: var(--surface-header);
+    color: var(--text-dim);
+    font-size: 0.8rem;
+    font-weight: 650;
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+    text-align: left;
+    padding: 11px 12px;
+    border: 1px solid var(--border);
+}
+.data-table thead th + th { border-left: 1px solid var(--border); }
+.data-table tbody tr:nth-child(odd)  td { background: var(--surface); }
+.data-table tbody tr:nth-child(even) td { background: var(--surface-raised); }
+.data-table tbody tr:hover           td { background: #30353a; }
+.data-table td {
+    color: var(--text);
+    font-size: 0.94rem;
+    line-height: 1.45;
+    padding: 10px 12px;
+    vertical-align: top;
+    border: 1px solid var(--border-subtle);
+}
+/* Rank-1 row highlight */
+# .leaderboard-table tbody tr:first-child td { background: #252d1d; }
+/* Evaluation suite overrides */
+.evaluation-suite-table { table-layout: fixed; }
+.evaluation-suite-table thead th { padding: 9px 11px; font-size: 0.78rem; }
+.evaluation-suite-table td       { padding: 7px 11px; font-size: 0.9rem; line-height: 1.3; }
+.evaluation-suite-category-col  { width: 8rem; }
+.evaluation-suite-benchmark-col { width: 12.5rem; }
+.evaluation-suite-weight-col    { width: 5ch; }
+.evaluation-suite-table .weight-column,
+.evaluation-suite-table .weight-cell {
+    font-variant-numeric: tabular-nums;
+    text-align: center;
+    white-space: nowrap;
+    width: 5ch;
+    padding: 0 4px;
+}
+.evaluation-suite-table .category-cell {
+    color: var(--text-dim);
+    font-size: 0.79rem;
+    font-weight: 650;
+    text-transform: uppercase;
+    letter-spacing: 0.06em;
+    vertical-align: top;
+    min-width: 8rem;
+}
+.evaluation-suite-table .benchmark-cell { width: 12.5rem; overflow-wrap: anywhere; }
+.evaluation-suite-table .description-cell { min-width: 0; overflow-wrap: anywhere; }
+.grm-bench-scope-table th:first-child,
+.grm-bench-scope-table td:first-child { min-width: 180px; width: 180px; }
+/* ── Accordion ────────────────────────────────────────────────────────── */
+/* Gradio 4: .gradio-accordion  |  Gradio 5: details */
+.gradio-container .gradio-accordion,
+.gradio-container details {
+    background: transparent !important;
+    border: 1px solid var(--border) !important;
+    border-radius: 8px !important;
+    margin-bottom: 6px !important;
+    overflow: hidden !important;
+}
+.gradio-container .gradio-accordion > .label-wrap,
+.gradio-container details > summary {
+    background: var(--surface-raised) !important;
+    color: var(--text-bright) !important;
+    border: none !important;
+    padding: 0.75rem 0.9rem !important;
+    cursor: pointer;
+    font-weight: 600;
+    font-size: 0.92rem;
+    list-style: none;
+}
+.gradio-container .gradio-accordion > .label-wrap:hover,
+.gradio-container details > summary:hover { background: #2a2f34 !important; }
+.gradio-container details[open] > summary  { border-bottom: 1px solid var(--border) !important; }
+.gradio-container details > summary::-webkit-details-marker { display: none; }
+/* ── Benchmark details ────────────────────────────────────────────────── */
+.benchmark-section   { margin-top: 18px; }
+.benchmark-section h3 { color: var(--text-bright); font-size: 1.15rem; font-weight: 600; margin: 0 0 8px 0; }
+.benchmark-entry     { padding: 12px 0; border-bottom: 1px solid var(--border-subtle); }
+.benchmark-entry-top { display: flex; align-items: baseline; justify-content: space-between; gap: 12px; flex-wrap: wrap; }
+.benchmark-entry-top h4  { margin: 0; color: var(--text-bright); font-size: 1rem; font-weight: 600; }
+.benchmark-weight        { color: var(--accent); font-size: 0.84rem; white-space: nowrap; }
+.benchmark-description   { color: var(--text-dim); margin: 6px 0 !important; }
+.benchmark-entry p       { color: var(--text); line-height: 1.62; margin: 0; }
+.benchmark-link          { margin-top: 7px; font-size: 0.86rem; }
+.benchmark-link a        { color: var(--accent); text-decoration: none; }
+/* ── GRM-Bench sections ───────────────────────────────────────────────── */
+.grm-bench-section          { border-top: 1px solid var(--border-subtle); margin-top: 24px; padding-top: 18px; }
+.grm-bench-section:first-of-type { margin-top: 16px; }
+.grm-bench-kicker,
+.grm-bench-subtitle         { color: var(--text-dim); font-size: 0.78rem; font-weight: 650; letter-spacing: 0.08em; text-transform: uppercase; }
+.grm-bench-section h2       { color: var(--text-bright); font-size: 1.32rem; font-weight: 620; letter-spacing: -0.01em; margin: 4px 0 10px 0; }
+.grm-bench-section p        { color: var(--text); font-size: 1.05rem; line-height: 1.7; margin: 0 0 10px 0; }
+.grm-bench-subtitle         { margin: 14px 0 6px 0; }
+.grm-bench-sample           { background: #171b1f; border: 1px solid var(--border); border-radius: 10px; margin-top: 12px; padding: 14px 16px; }
+.grm-bench-sample-id        { color: var(--text-bright); font-size: 0.84rem; font-weight: 700; letter-spacing: 0.08em; text-transform: uppercase; }
+.grm-bench-sample-meta      { margin: 6px 0 0 0 !important; }
+.grm-bench-sample-label     { color: var(--text-bright); font-weight: 600; }
+.grm-bench-sample-code {
+    background: #0f1215;
+    border: 1px solid var(--border-subtle);
+    border-radius: 8px;
+    color: #d7dde3;
+    font-family: Consolas, "SFMono-Regular", monospace;
+    font-size: 0.84rem;
+    line-height: 1.55;
+    margin: 12px 0 0 0;
+    overflow-x: auto;
+    padding: 12px 14px;
+    white-space: pre-wrap;
+}
+.grm-bench-sample-code code { font-family: inherit; }
+/* ── Scrollbars ───────────────────────────────────────────────────────── */
+::-webkit-scrollbar       { width: 6px; height: 6px; }
+::-webkit-scrollbar-track { background: var(--bg); }
+::-webkit-scrollbar-thumb { background: #3a3f45; border-radius: 3px; }
+::-webkit-scrollbar-thumb:hover { background: var(--accent); }
+/* ── Responsive ───────────────────────────────────────────────────────── */
+@media (max-width: 720px) {
+    .gradio-container { padding: 20px 14px 40px !important; }
+    .page-header h1   { font-size: 2rem; }
+    .data-table thead th,
+    .data-table td    { padding: 10px 9px; }
+    .leaderboard-shell,
+    .evaluation-suite-shell { overflow-x: auto; }
+    .evaluation-suite-table { min-width: 38rem; }
+}
+.red-text { color: red !important; }
+"""
+# Dark Gradio theme built from gr.themes.Base() with per-variable overrides.
+# Several hex values repeat the :root palette in CUSTOM_CSS (e.g. #181b1e for
+# the page background, #76b900 for the accent), so change both together.
+_NVIDIA_DARK_THEME = gr.themes.Base().set(
+    # Page
+    body_background_fill="#181b1e",
+    body_text_color="#f5f7f8",
+    body_text_color_subdued="#a2a8ae",
+    # Blocks / panels
+    background_fill_primary="#1d2126",
+    background_fill_secondary="#15181b",
+    block_background_fill="#1d2126",
+    block_border_color="#3c4048",
+    block_border_width="1px",
+    block_label_background_fill="#23272c",
+    block_label_border_color="#3c4048",
+    block_label_text_color="#f5f7f8",
+    block_title_background_fill="transparent",
+    block_title_text_color="#f5f7f8",
+    block_info_text_color="#a2a8ae",
+    panel_background_fill="#1d2126",
+    panel_border_color="#3c4048",
+    # Inputs
+    input_background_fill="#24282d",
+    input_background_fill_focus="#2b2f34",
+    input_border_color="#3c4048",
+    input_border_color_focus="#76b900",
+    input_placeholder_color="#6b7280",
+    input_shadow="none",
+    input_shadow_focus="0 0 0 2px rgba(118,185,0,0.25)",
+    # Buttons
+    button_primary_background_fill="#76b900",
+    button_primary_background_fill_hover="#8fd400",
+    button_primary_border_color="transparent",
+    button_primary_text_color="#0f1012",
+    button_secondary_background_fill="#24282d",
+    button_secondary_background_fill_hover="#2b2f34",
+    button_secondary_border_color="#3c4048",
+    button_secondary_text_color="#f5f7f8",
+    # Borders / accent
+    border_color_accent="#76b900",
+    border_color_primary="#3c4048",
+    color_accent="#76b900",
+    link_text_color="#76b900",
+    link_text_color_hover="#8fd400",
+    link_text_color_visited="#5a9000",
+    # Table
+    table_even_background_fill="#1c2024",
+    table_odd_background_fill="#24282d",
+    table_row_focus="#30353a",
+    # Checkbox / radio
+    checkbox_background_color="#24282d",
+    checkbox_background_color_selected="#76b900",
+    checkbox_background_color_hover="#2b2f34",
+    checkbox_border_color="#3c4048",
+    checkbox_border_color_focus="#76b900",
+    checkbox_border_color_selected="#76b900",
+    checkbox_label_background_fill="transparent",
+    checkbox_label_background_fill_selected="rgba(118,185,0,0.12)",
+    checkbox_label_text_color="#f5f7f8",
+    # Slider
+    slider_color="#76b900",
+    # Shadows
+    shadow_drop="0 2px 8px rgba(0,0,0,0.45)",
+    shadow_drop_lg="0 4px 20px rgba(0,0,0,0.6)",
+)
+# Page layout. theme/css go to gr.Blocks() only when GRADIO_MAJOR_VERSION < 6;
+# for 6 and above they are passed to demo.launch() in the __main__ guard.
+# NOTE(review): presumably this follows a Gradio 6 API change — confirm.
+blocks_kwargs = {"title": "GRM Score - Game Ready Leaderboard"}
+if GRADIO_MAJOR_VERSION < 6:
+    blocks_kwargs["theme"] = _NVIDIA_DARK_THEME
+    blocks_kwargs["css"] = CUSTOM_CSS
+with gr.Blocks(**blocks_kwargs) as demo:
+    gr.HTML(HEADER_HTML)
+    with gr.Tabs():
+        with gr.Tab("Game Ready Leaderboard"):
+            gr.Markdown("## Overview")
+            gr.HTML(OVERVIEW_HTML)
+            gr.Markdown("## Leaderboard")
+            gr.Markdown(
+                "The leaderboard now sits directly after the overview so rankings are visible before the deeper methodology sections.", elem_classes=["red-text"]
+            )
+            show_proprietary_models = gr.Checkbox(label="Show proprietary models", value=True)
+            gr.HTML(
+                "<div class=\"section-note\">Turn this off to switch the ranking and score breakdowns to an open-source-only view.</div>"
+            )
+            # Initial render uses the default filter (proprietary models shown).
+            leaderboard_html = gr.HTML(build_leaderboard_html())
+            gr.HTML("<div class=\"section-note red-text\"> Placeholder data for layout validation. Replace with real benchmark outputs when ready.</div>")
+            gr.Markdown("## Per-Benchmark Score Breakdown")
+            gr.Markdown("Expand a category to inspect the individual benchmark scores backing the leaderboard.")
+            # Keep a handle on every per-category table so the checkbox callback can refresh it.
+            category_score_tables = []
+            for category in CATEGORIES:
+                with gr.Accordion(f"{CATEGORY_DISPLAY[category]} benchmark scores", open=False):
+                    category_score_tables.append(gr.HTML(build_category_score_table_html(category)))
+            # Output order must match update_leaderboard_tables(): the leaderboard
+            # first, then one table per category in CATEGORIES order.
+            show_proprietary_models.change(
+                fn=update_leaderboard_tables,
+                inputs=show_proprietary_models,
+                outputs=[leaderboard_html, *category_score_tables],
             )
+            gr.Markdown("## Evaluation Suite")
+            gr.Markdown(
+                "Benchmarks are grouped into fused category cells so the suite reads more like a methodology table than a generic spreadsheet."
             )
+            gr.HTML(build_evaluation_suite_html())
+            gr.Markdown("## Benchmark Details")
+            gr.Markdown(
+                "Detailed summaries of each benchmark in the evaluation suite, grouped by category."
+            )
+            gr.HTML(build_benchmark_details_html())
+        with gr.Tab("GRM-Bench"):
+            gr.Markdown("## GRM-Bench")
+            gr.Markdown(
+                "Nvidia-authored benchmark families targeting in-house game interaction failure modes and evaluation surfaces."
+            )
+            gr.HTML(build_grm_bench_html())
+# Script entry point: launch the app. When GRADIO_MAJOR_VERSION is 6 or higher,
+# the theme and stylesheet are supplied here rather than to gr.Blocks().
+if __name__ == "__main__":
+    launch_kwargs = {}
+    if GRADIO_MAJOR_VERSION >= 6:
+        launch_kwargs["theme"] = _NVIDIA_DARK_THEME
+        launch_kwargs["css"] = CUSTOM_CSS
+    demo.launch(**launch_kwargs)

benchmarks.py ADDED Viewed

	@@ -0,0 +1,385 @@

+"""
+GRM Evaluation Suite — Benchmark Registry
+Each benchmark is defined with:
+  - name: Display name
+  - category: ROLEPLAY | ACTIONS | GENERAL
+  - description: Short description of what the benchmark tests
+  - calc_weight: 1.0 (core) or 0.5 (supplementary)
+  - summary: Multi-sentence methodology summary for display
+  - paper: URL to the paper or resource (if applicable)
+"""
+BENCHMARKS = [
+    # ── ROLEPLAY (33%) ──────────────────────────────────────────────
+    {
+        "name": "MultiChallenge",
+        "category": "ROLEPLAY",
+        "calc_weight": 1.0,
+        "description": "Multi-turn instruction following & coherence across turns",
+        "summary": (
+            "MultiChallenge evaluates multi-turn instruction following where models must "
+            "maintain coherence across multiple challenging conversational turns. Each test "
+            "scenario involves complex, multi-constraint instructions that require the model "
+            "to track context, resolve references, and keep all prior commitments intact while "
+            "handling new user requests."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "RoleBench",
+        "category": "ROLEPLAY",
+        "calc_weight": 1.0,
+        "description": "Character-level roleplaying with explicit role framing",
+        "summary": (
+            "RoleBench is the first systematic and fine-grained character-level benchmark "
+            "for role-playing, comprising 168,093 samples covering 100 distinct roles. "
+            "Created via the RoleLLM framework using Context-Instruct for role-specific "
+            "knowledge extraction and RoleGPT for speaking style imitation. Evaluates a "
+            "model's ability to maintain character persona, domain knowledge, and consistent "
+            "speaking style throughout interactions."
+        ),
+        "paper": "https://arxiv.org/abs/2310.00746",
+    },
+    {
+        "name": "GRM-Bench \u2014 Coherence",
+        "category": "ROLEPLAY",
+        "calc_weight": 1.0,
+        "description": "Logically sound and coherent across turns, without contradictions",
+        "summary": (
+            "Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. "
+            "Scenarios are crafted to invoke common coherence failures, then a model's "
+            "resilience is measured. Detection covers eight categories: factual/logical errors, "
+            "cause-effect failures, self-contradiction, personality/background violations, "
+            "role confusion, irrelevance, knowledge boundary violations, and false premise "
+            "acceptance."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "GRM-Bench \u2014 Response Diversity",
+        "category": "ROLEPLAY",
+        "calc_weight": 1.0,
+        "description": "Avoids using repetitive language and speech structure",
+        "summary": (
+            "Nvidia-authored benchmark measuring whether models avoid repetitive language "
+            "patterns, vocabulary, and sentence structures across varied dialogue exchanges. "
+            "Evaluates lexical diversity, syntactic variation, and stylistic range when a "
+            "character is placed in different conversation contexts."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "GRM-Bench \u2014 Context Adaption",
+        "category": "ROLEPLAY",
+        "calc_weight": 1.0,
+        "description": "Using latest knowledge/variable updates even if changed during convo",
+        "summary": (
+            "Nvidia-authored benchmark testing whether models correctly incorporate the latest "
+            "game-state and knowledge updates, even when facts change mid-conversation. "
+            "Scenarios involve dynamic variable mutations (e.g. inventory changes, NPC status "
+            "updates) and verify the model references the current state rather than stale data."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "DialogueNLI",
+        "category": "ROLEPLAY",
+        "calc_weight": 0.5,
+        "description": "Checks contradiction/consistency crumbling",
+        "summary": (
+            "Dialogue Natural Language Inference dataset for evaluating consistency in "
+            "dialogue agents. Uses NLI-style classification (entailment / contradiction / "
+            "neutral) to detect when a dialogue agent contradicts its established persona "
+            "or previous statements. Derived from the Persona-Chat dataset with human-"
+            "annotated sentence pairs."
+        ),
+        "paper": "https://arxiv.org/abs/1811.00671",
+    },
+    {
+        "name": "RoleMRC",
+        "category": "ROLEPLAY",
+        "calc_weight": 0.5,
+        "description": "Follow complex nested instructions while remaining in character",
+        "summary": (
+            "Role-based Machine Reading Comprehension benchmark that tests the ability to "
+            "follow complex, nested instructions while remaining fully in character. Combines "
+            "reading comprehension challenges with role-playing constraints, requiring models "
+            "to extract and reason about information without breaking persona."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "EQBench v3",
+        "category": "ROLEPLAY",
+        "calc_weight": 0.5,
+        "description": "Detect nuances in tone/intent and modulate response accordingly",
+        "summary": (
+            "Emotional Quotient Benchmark v3 assesses a model's emotional intelligence — "
+            "specifically the ability to detect nuances in tone, intent, and emotional subtext, "
+            "and to modulate responses accordingly. Tests include recognizing sarcasm, empathy "
+            "calibration, emotional escalation/de-escalation, and context-appropriate tonal "
+            "shifts."
+        ),
+        "paper": "https://eqbench.com/",
+    },
+    # ── ACTIONS (33%) ───────────────────────────────────────────────
+    {
+        "name": "BFCLv3",
+        "category": "ACTIONS",
+        "calc_weight": 1.0,
+        "description": "Serial/parallel tool calling, multi-step settings",
+        "summary": (
+            "Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool "
+            "calling in multi-step settings across multiple programming languages and complex "
+            "function schemas. Tests include simple, multiple, parallel, and nested function "
+            "calls, as well as function relevance detection (knowing when no tool applies)."
+        ),
+        "paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html",
+    },
+    {
+        "name": "Tau2-Bench",
+        "category": "ACTIONS",
+        "calc_weight": 1.0,
+        "description": "Multi-turn interactions w/ real-world commercial operations",
+        "summary": (
+            "\u03c4\u00b2-Bench from Sierra Research is a multi-turn agentic benchmark using "
+            "dual-control agent-user simulation for testing tool use in real-world commercial "
+            "operations. The telecom domain contains 114 programmatically generated tasks "
+            "with varying intents (service, mobile data, MMS). The outcome world-state "
+            "determines success — e.g. whether Data is functioning after agent completion."
+        ),
+        "paper": "https://arxiv.org/abs/2506.07982",
+    },
+    {
+        "name": "ToolSandbox",
+        "category": "ACTIONS",
+        "calc_weight": 1.0,
+        "description": "Stateful dependencies + conversational tool calling",
+        "summary": (
+            "Apple's stateful, conversational, interactive evaluation benchmark for LLM tool "
+            "use. Includes stateful tool execution with implicit state dependencies between "
+            "tools, a built-in user simulator supporting on-policy conversational evaluation, "
+            "and dynamic evaluation of intermediate and final milestones. Tests canonicalization, "
+            "insufficient information handling, and complex state management."
+        ),
+        "paper": "https://arxiv.org/abs/2408.04682",
+    },
+    {
+        "name": "When2Call",
+        "category": "ACTIONS",
+        "calc_weight": 1.0,
+        "description": "Tool call timing \u2014 when to trigger, when to follow-up, etc.",
+        "summary": (
+            "Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask "
+            "for clarification first, and when to provide a direct answer without tools. Tests "
+            "the critical judgment of whether a function call is appropriate given the current "
+            "conversational context and available information."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "GRM-Bench \u2014 Prompt Robustness",
+        "category": "ACTIONS",
+        "calc_weight": 1.0,
+        "description": "Same prompt expressed differently still invoking intended tools",
+        "summary": (
+            "Nvidia-authored benchmark testing whether semantically equivalent prompts "
+            "expressed in different phrasings, formality levels, and syntactic structures "
+            "still correctly invoke the intended tools and actions. Measures robustness of "
+            "tool-call intent recognition against natural language variation."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "BFCLv4",
+        "category": "ACTIONS",
+        "calc_weight": 0.5,
+        "description": "Adds memory into the loop, and tests format sensitivity",
+        "summary": (
+            "Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool "
+            "calling scenarios and tests sensitivity to format variations in function schemas. "
+            "Evaluates how well models handle evolving context windows and maintain tool-call "
+            "accuracy when schema formats shift."
+        ),
+        "paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html",
+    },
+    {
+        "name": "T-Eval",
+        "category": "ACTIONS",
+        "calc_weight": 0.5,
+        "description": "Step-by-step tool use and orchestration, logical tool decomposition",
+        "summary": (
+            "T-Eval evaluates step-by-step tool use and orchestration capabilities. Tests "
+            "logical decomposition of complex tasks into tool-calling sequences, including "
+            "plan generation, tool selection, argument filling, and response summarization. "
+            "Provides fine-grained analysis of where in the tool-use pipeline models fail."
+        ),
+        "paper": "https://arxiv.org/abs/2312.14033",
+    },
+    # ── GENERAL (33%) ──────────────────────────────────────────────
+    {
+        "name": "RULER",
+        "category": "GENERAL",
+        "calc_weight": 1.0,
+        "description": "Needle-in-haystack + polluted state stress test",
+        "summary": (
+            "RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) "
+            "extends needle-in-a-haystack testing with multiple retrieval types, multi-hop "
+            "composition, and aggregation tasks at varying context lengths. Includes polluted "
+            "state and distractor injection to stress-test long-context faithfulness."
+        ),
+        "paper": "https://arxiv.org/abs/2404.06654",
+    },
+    {
+        "name": "GaRAGe",
+        "category": "GENERAL",
+        "calc_weight": 1.0,
+        "description": "Deflect/refuse action when state is insufficient/corrupted",
+        "summary": (
+            "GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to "
+            "deflect or refuse action when the provided retrieval context is insufficient, "
+            "corrupted, or contradictory. Evaluates robustness against adversarial or low-"
+            "quality retrieved passages and the model's capacity to say 'I don't know' rather "
+            "than hallucinate an answer."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "IFBench",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Generic instruction following, not as prone to overfit vs IFEval",
+        "summary": (
+            "IFBench from AllenAI evaluates precise instruction following with 294 single-turn "
+            "questions testing counting, formatting, and sentence manipulation. Uses a loose "
+            "evaluation mode that accounts for extraneous text or formatting. Designed to be "
+            "less prone to overfitting compared to IFEval."
+        ),
+        "paper": "https://arxiv.org/abs/2507.02833",
+    },
+    {
+        "name": "AA LCR",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Complex reasoning across long contexts",
+        "summary": (
+            "Artificial Analysis Long Context Reasoning benchmark with 100 hard text-based "
+            "questions spanning 7 document categories (Company Reports, Industry Reports, "
+            "Government Consultations, Academia, Legal, Marketing, Surveys). Requires ~100K "
+            "tokens of input per question, demanding reasoning across multiple long documents."
+        ),
+        "paper": "https://artificialanalysis.ai/methodology/intelligence-benchmarking",
+    },
+    {
+        "name": "StructEval-T",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Format-following",
+        "summary": (
+            "StructEval-T evaluates structured output and format-following capabilities. "
+            "Tests whether models can adhere to specified output templates, formatting "
+            "constraints (JSON, XML, Markdown, tables), and structural requirements while "
+            "maintaining content accuracy."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "InverseIFEval",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Unconventional instruction following",
+        "summary": (
+            "Tests unconventional and counter-intuitive instruction following where models "
+            "must comply with unusual or inverted constraints. Evaluates whether models can "
+            "faithfully execute instructions that go against typical patterns, such as "
+            "intentionally producing specific error formats or following negative constraints."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "RAGTruth",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Hallucinations relative to retrieved context",
+        "summary": (
+            "RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. "
+            "Evaluates whether models faithfully ground responses in provided retrieved "
+            "documents rather than generating unsupported claims. Covers summary-level and "
+            "sentence-level faithfulness across diverse document types."
+        ),
+        "paper": "https://arxiv.org/abs/2401.00396",
+    },
+    {
+        "name": "SpatialText",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Text-based spatial cognition",
+        "summary": (
+            "SpatialText tests text-based spatial cognition — understanding spatial "
+            "relationships, positions, orientations, and arrangements described purely through "
+            "natural language. Critical for gaming scenarios involving navigation, object "
+            "placement, and environmental descriptions."
+        ),
+        "paper": None,
+    },
+    {
+        "name": "SpartQA",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Spatial reasoning with textual spatial descriptions and Q&A",
+        "summary": (
+            "SpartQA is a question-answering benchmark for spatial reasoning from textual "
+            "descriptions. Models must understand object positions, relative locations, "
+            "containment, and spatial logic described in natural language passages, then answer "
+            "questions requiring spatial inference. Includes FindRelation, FindBlock, and "
+            "YesNo question types."
+        ),
+        "paper": "https://arxiv.org/abs/2104.05832",
+    },
+    {
+        "name": "COPA",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Premise + choose plausible cause/effect",
+        "summary": (
+            "Choice of Plausible Alternatives (COPA) is a classic commonsense causal reasoning "
+            "benchmark. Given a premise, the model must select the more plausible cause or "
+            "effect from two options. Tests open-domain commonsense causal reasoning critical "
+            "for narrative coherence in game dialogue."
+        ),
+        "paper": "https://people.ict.usc.edu/~gordon/copa.html",
+    },
+    {
+        "name": "PIQA",
+        "category": "GENERAL",
+        "calc_weight": 0.5,
+        "description": "Physical interaction common sense",
+        "summary": (
+            "Physical Interaction Question Answering tests physical commonsense knowledge "
+            "about everyday objects, their properties, affordances, and interactions. Each "
+            "question presents a goal and two solutions; the model must select the physically "
+            "plausible one. Relevant for game NPCs reasoning about physical world interactions."
+        ),
+        "paper": "https://arxiv.org/abs/1911.11641",
+    },
+]
+# Category identifiers, in display order. They are the values stored under
+# the "category" key of each BENCHMARKS entry.
+CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"]
+# Share of the overall score attributed to each category (three equal parts).
+CATEGORY_WEIGHT = 1 / 3  # Each category contributes 33.3%
+# Human-readable label for each category identifier.
+CATEGORY_DISPLAY = {
+    "ROLEPLAY": "Roleplay",
+    "ACTIONS": "Actions",
+    "GENERAL": "General",
+}
+def get_benchmarks_by_category(category: str) -> list[dict]:
+    """Return the BENCHMARKS entries whose "category" equals *category*.
+
+    Entries keep their BENCHMARKS order. A category that matches nothing
+    yields an empty list rather than raising.
+    """
+    return [b for b in BENCHMARKS if b["category"] == category]
+def get_all_benchmark_names() -> list[str]:
+    """Return the "name" of every BENCHMARKS entry, in BENCHMARKS order."""
+    return [b["name"] for b in BENCHMARKS]

pyproject.toml DELETED Viewed

@@ -1,13 +0,0 @@
-[tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-[tool.isort]
-profile = "black"
-line_length = 119
-[tool.black]
-line-length = 119

ref/Coherence_Summary ADDED Viewed

	@@ -0,0 +1,84 @@

+Coherence
+Above all other factors that can break immersion and the character/NPC interaction experience are instances in which the model's response seems illogical or irrelevant. Incoherence can occur at any time and for any number of reasons, ranging from outright hallucinations to confusion about the character's role to contradicting a previous statement, event, or world state.
+Test Methodology
+Since incoherence can occur in any instance, we need a wide range of detection categories to flag it; it also means that almost any testing structure can be useful. Our strategy, however, is to craft test scenarios that are likely to provoke a common failure, then test a model's resilience to it.
+Detection Scope:
+Factual / Logical - Objectively false or contradicted by the system prompt or game state (e.g. hallucinates details, refers to an entity that doesn't exist, invents a rule not established anywhere in context).
+Cause / Effect - Fails to understand simple logical state transitions (e.g. “just drank the whole glass of water, therefore the glass is empty”).
+Contradiction - The response contradicts something the assistant previously said or did in an earlier turn without any in-world justification.
+Personality / Background Violation - The response has the character do or claim something that directly violates an established trait, limitation, or background fact.
+Role Confusion - Confuses which entity a fact, detail, action, or motivation belongs to, or loses track of its own identity.
+Irrelevance - The response does not seem relevant and/or confuses the subject matter or topic under discussion.
+Knowledge Boundary - Invents knowledge it cannot possibly have, instead of clearly separating observation from speculation
+False Premise - Incorrectly accepts a user’s smuggled-in premise about something that never happened
+Test Samples
+TEST_001
+Tag(s): Role Confusion
+Summary: Tests identity confusion between two companions with opposite elemental specialties across 9 rounds of dialogue with heavy noun saturation. Kael (fire mage, Cinderforge Academy, Emberchill Ring) and Lyra (ice mage, Glacial Sanctum, Frostbane Amulet). The final user message deliberately misattributes all key facts. Model must correct every error without confusing the details itself.
+Expectation: Must correct all misattributions: Kael=Cinderforge (not Glacial Sanctum), Lyra=Glacial Sanctum (not Cinderforge), Kael wears Emberchill Ring (ice resist), Lyra wears Frostbane Amulet (fire resist)
+Messages:
+[
+  {
+    "role": "system",
+    "content": "You are Kael Emberheart, a fire mage and combat specialist who serves as one of the player's two companions in the Shadowveil Dungeon. You are accompanied by Lyra Frostweaver, an ice mage and defensive specialist.\n\nYOUR IDENTITY — KAEL EMBERHEART:\n- Class: Fire Mage…"
+  },
+  {
+    "role": "user",
+    "content": "Kael, quick question — what element do you specialize in? And what's your companion Lyra's element?"
+  },
+  {
+    "role": "assistant",
+    "content": "Ha! Fire, obviously — I'm Kael Emberheart, pyromancer supreme! Lyra's the frost mage. She handles the cold stuff, I bring the heat."
+  },
+  {
+    "role": "user",
+    "content": "There's an ice wraith ahead. Who's better suited to fight it — you or Lyra?"
+  },
+  … (8x turns)
+  {
+    "role": "assistant",
+    "content": "The Convergence Blast! I launch a concentrated fireball and Lyra intercepts it mid-flight with a frost bolt. Extreme heat meets extreme cold — massive steam explosion that devastates everything in the blast zone. I bring the fire, she brings the ice, and the result is pure chaos. One of my favorite moves."
+  },
+  {
+    "role": "user",
+    "content": "Okay, one last thing before we push into the Embervault. I want to make sure I've got everyone straight. You're the one who trained at the Glacial Sanctum, right? And Lyra graduated from Cinderforge Academy? Also remind me — you're wearing the Frostbane Amulet for the fire resistance, and Lyra has the Emberchill Ring?"
+  }
+]
+TEST_021
+Summary: Tests whether the model invents contents of an unread document. Eleanor found a sealed letter with an unbroken wax seal. She explicitly confirmed she has NOT opened or read it. The inspector pressures her to state exactly what it says. Model must refuse to fabricate contents.
+Tag(s): Knowledge Boundary
+Expectation: Must NOT claim to know the letter's contents; should restate the seal is unbroken and she has not read it
+Messages:
+[
+  {
+    "role": "system",
+    "content": "You are Eleanor Vale, lady's maid in a gothic mystery set at Blackthorne Manor.\n\n# PERSONALITY\n- Composed, discreet, observant\n- Speaks carefully and avoids exaggeration\n\n# CURRENT FACTS\n- You found a sealed letter on the late master's desk…"
+  },
+  {
+    "role": "user",
+    "content": "What did you find in the study?"
+  },
+  {
+    "role": "assistant",
+    "content": "A sealed letter addressed to Lady Whitcomb. The wax is intact, so I have not read what is inside."
+  },
+  {
+    "role": "user",
+    "content": "Can you tell whether it's important?"
+  },
+  {
+    "role": "assistant",
+    "content": "Only that it was hidden carefully, which suggests it may matter. But that is only an inference. I do not know its contents."
+  },
+  {
+    "role": "user",
+    "content": "The inspector is impatient. Tell him exactly what the letter says."
+  }
+]

ref/Overview ADDED Viewed

	@@ -0,0 +1,9 @@

+Nvidia Game Ready Model Score (GRM) is an aggregated quality metric designed to assess LLM capabilities in gaming use cases.
+General state-of-the-art language models are optimized for broad benchmarks such as math, code, and general knowledge. That does not reliably translate to in-game performance, and it does not reliably predict NPC quality, gameplay actions, or immersion.
+With game model evaluation, game developers can accelerate AI integration pipelines by reducing time spent on model evaluation and narrowing model choice earlier. The overall score is the average of Roleplay, Actions, and General, while benchmarks inside each category are combined with weighted averaging using core weights of 1.0 and supplementary weights of 0.5.
+GRM Score = (Roleplay + Actions + General) / 3
+Category Score = sum(score x weight) / sum(weight)

requirements.txt CHANGED Viewed

@@ -1,16 +1 @@
-APScheduler
-black
-datasets
-gradio==4.43
-gradio[oauth]
-gradio_leaderboard==0.0.13
-gradio_client
-huggingface-hub>=0.18.0
-matplotlib
-numpy
-pandas
-python-dateutil
-tqdm
-transformers
-tokenizers>=0.15.0
-sentencepiece


1	+ gradio

scores.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+GRM Evaluation Suite — Placeholder Model Scores
+Scores are on a 0.0–1.0 scale (will be displayed as 0–100%).
+All benchmarks have synthetic placeholder values for now.
+"""
+# fmt: off
+MODEL_SCORES: dict[str, dict[str, float]] = {
+    "GPT-5.4": {
+        # ROLEPLAY
+        "MultiChallenge": 0.91, "RoleBench": 0.89, "GRM-Bench \u2014 Coherence": 0.87,
+        "GRM-Bench \u2014 Response Diversity": 0.85, "GRM-Bench \u2014 Context Adaption": 0.88,
+        "DialogueNLI": 0.90, "RoleMRC": 0.86, "EQBench v3": 0.88,
+        # ACTIONS
+        "BFCLv3": 0.91, "Tau2-Bench": 0.88, "ToolSandbox": 0.86,
+        "When2Call": 0.89, "GRM-Bench \u2014 Prompt Robustness": 0.84,
+        "BFCLv4": 0.89, "T-Eval": 0.87,
+        # GENERAL
+        "RULER": 0.93, "GaRAGe": 0.85, "IFBench": 0.87, "AA LCR": 0.85,
+        "StructEval-T": 0.86, "InverseIFEval": 0.82, "RAGTruth": 0.88,
+        "SpatialText": 0.80, "SpartQA": 0.81, "COPA": 0.94, "PIQA": 0.93,
+    },
+    "GPT-oss20b": {
+        "MultiChallenge": 0.76, "RoleBench": 0.74, "GRM-Bench \u2014 Coherence": 0.71,
+        "GRM-Bench \u2014 Response Diversity": 0.69, "GRM-Bench \u2014 Context Adaption": 0.73,
+        "DialogueNLI": 0.75, "RoleMRC": 0.70, "EQBench v3": 0.72,
+        "BFCLv3": 0.78, "Tau2-Bench": 0.72, "ToolSandbox": 0.70,
+        "When2Call": 0.74, "GRM-Bench \u2014 Prompt Robustness": 0.68,
+        "BFCLv4": 0.75, "T-Eval": 0.71,
+        "RULER": 0.80, "GaRAGe": 0.69, "IFBench": 0.71, "AA LCR": 0.68,
+        "StructEval-T": 0.70, "InverseIFEval": 0.65, "RAGTruth": 0.72,
+        "SpatialText": 0.63, "SpartQA": 0.64, "COPA": 0.82, "PIQA": 0.80,
+    },
+    "Gemini 2.5 Pro": {
+        "MultiChallenge": 0.89, "RoleBench": 0.87, "GRM-Bench \u2014 Coherence": 0.85,
+        "GRM-Bench \u2014 Response Diversity": 0.83, "GRM-Bench \u2014 Context Adaption": 0.86,
+        "DialogueNLI": 0.88, "RoleMRC": 0.84, "EQBench v3": 0.86,
+        "BFCLv3": 0.89, "Tau2-Bench": 0.86, "ToolSandbox": 0.84,
+        "When2Call": 0.87, "GRM-Bench \u2014 Prompt Robustness": 0.82,
+        "BFCLv4": 0.87, "T-Eval": 0.85,
+        "RULER": 0.91, "GaRAGe": 0.83, "IFBench": 0.84, "AA LCR": 0.82,
+        "StructEval-T": 0.84, "InverseIFEval": 0.80, "RAGTruth": 0.86,
+        "SpatialText": 0.78, "SpartQA": 0.79, "COPA": 0.92, "PIQA": 0.91,
+    },
+    "Nemotron 3 Super 120B A12B [BF16]": {
+        "MultiChallenge": 0.86, "RoleBench": 0.84, "GRM-Bench \u2014 Coherence": 0.82,
+        "GRM-Bench \u2014 Response Diversity": 0.80, "GRM-Bench \u2014 Context Adaption": 0.83,
+        "DialogueNLI": 0.85, "RoleMRC": 0.81, "EQBench v3": 0.83,
+        "BFCLv3": 0.86, "Tau2-Bench": 0.83, "ToolSandbox": 0.81,
+        "When2Call": 0.84, "GRM-Bench \u2014 Prompt Robustness": 0.79,
+        "BFCLv4": 0.84, "T-Eval": 0.82,
+        "RULER": 0.89, "GaRAGe": 0.80, "IFBench": 0.82, "AA LCR": 0.80,
+        "StructEval-T": 0.81, "InverseIFEval": 0.77, "RAGTruth": 0.83,
+        "SpatialText": 0.75, "SpartQA": 0.76, "COPA": 0.90, "PIQA": 0.89,
+    },
+    "Nemotron 3 Nano 30B A3B [BF16]": {
+        "MultiChallenge": 0.73, "RoleBench": 0.71, "GRM-Bench \u2014 Coherence": 0.68,
+        "GRM-Bench \u2014 Response Diversity": 0.66, "GRM-Bench \u2014 Context Adaption": 0.70,
+        "DialogueNLI": 0.72, "RoleMRC": 0.67, "EQBench v3": 0.69,
+        "BFCLv3": 0.74, "Tau2-Bench": 0.70, "ToolSandbox": 0.68,
+        "When2Call": 0.71, "GRM-Bench \u2014 Prompt Robustness": 0.65,
+        "BFCLv4": 0.72, "T-Eval": 0.68,
+        "RULER": 0.77, "GaRAGe": 0.67, "IFBench": 0.69, "AA LCR": 0.66,
+        "StructEval-T": 0.68, "InverseIFEval": 0.63, "RAGTruth": 0.70,
+        "SpatialText": 0.61, "SpartQA": 0.62, "COPA": 0.79, "PIQA": 0.78,
+    },
+    "Nemotron 3 Nano 4B": {
+        "MultiChallenge": 0.57, "RoleBench": 0.54, "GRM-Bench \u2014 Coherence": 0.51,
+        "GRM-Bench \u2014 Response Diversity": 0.49, "GRM-Bench \u2014 Context Adaption": 0.53,
+        "DialogueNLI": 0.56, "RoleMRC": 0.50, "EQBench v3": 0.52,
+        "BFCLv3": 0.58, "Tau2-Bench": 0.52, "ToolSandbox": 0.50,
+        "When2Call": 0.54, "GRM-Bench \u2014 Prompt Robustness": 0.47,
+        "BFCLv4": 0.55, "T-Eval": 0.51,
+        "RULER": 0.62, "GaRAGe": 0.50, "IFBench": 0.54, "AA LCR": 0.48,
+        "StructEval-T": 0.51, "InverseIFEval": 0.46, "RAGTruth": 0.53,
+        "SpatialText": 0.44, "SpartQA": 0.45, "COPA": 0.65, "PIQA": 0.64,
+    },
+    "Qwen 3.5 27B": {
+        "MultiChallenge": 0.81, "RoleBench": 0.79, "GRM-Bench \u2014 Coherence": 0.76,
+        "GRM-Bench \u2014 Response Diversity": 0.74, "GRM-Bench \u2014 Context Adaption": 0.78,
+        "DialogueNLI": 0.80, "RoleMRC": 0.75, "EQBench v3": 0.77,
+        "BFCLv3": 0.82, "Tau2-Bench": 0.79, "ToolSandbox": 0.77,
+        "When2Call": 0.80, "GRM-Bench \u2014 Prompt Robustness": 0.74,
+        "BFCLv4": 0.80, "T-Eval": 0.76,
+        "RULER": 0.85, "GaRAGe": 0.75, "IFBench": 0.77, "AA LCR": 0.74,
+        "StructEval-T": 0.76, "InverseIFEval": 0.71, "RAGTruth": 0.78,
+        "SpatialText": 0.69, "SpartQA": 0.70, "COPA": 0.87, "PIQA": 0.86,
+    },
+    "Nemotron Nano 9B v2 [BF16]": {
+        "MultiChallenge": 0.66, "RoleBench": 0.63, "GRM-Bench \u2014 Coherence": 0.60,
+        "GRM-Bench \u2014 Response Diversity": 0.58, "GRM-Bench \u2014 Context Adaption": 0.62,
+        "DialogueNLI": 0.65, "RoleMRC": 0.59, "EQBench v3": 0.61,
+        "BFCLv3": 0.67, "Tau2-Bench": 0.63, "ToolSandbox": 0.61,
+        "When2Call": 0.64, "GRM-Bench \u2014 Prompt Robustness": 0.57,
+        "BFCLv4": 0.65, "T-Eval": 0.60,
+        "RULER": 0.71, "GaRAGe": 0.60, "IFBench": 0.62, "AA LCR": 0.58,
+        "StructEval-T": 0.60, "InverseIFEval": 0.55, "RAGTruth": 0.63,
+        "SpatialText": 0.53, "SpartQA": 0.54, "COPA": 0.74, "PIQA": 0.73,
+    },
+    "Qwen 3.5 9B [BF16]": {
+        "MultiChallenge": 0.64, "RoleBench": 0.61, "GRM-Bench \u2014 Coherence": 0.58,
+        "GRM-Bench \u2014 Response Diversity": 0.56, "GRM-Bench \u2014 Context Adaption": 0.60,
+        "DialogueNLI": 0.63, "RoleMRC": 0.57, "EQBench v3": 0.59,
+        "BFCLv3": 0.65, "Tau2-Bench": 0.61, "ToolSandbox": 0.59,
+        "When2Call": 0.62, "GRM-Bench \u2014 Prompt Robustness": 0.55,
+        "BFCLv4": 0.63, "T-Eval": 0.58,
+        "RULER": 0.69, "GaRAGe": 0.58, "IFBench": 0.60, "AA LCR": 0.56,
+        "StructEval-T": 0.58, "InverseIFEval": 0.53, "RAGTruth": 0.61,
+        "SpatialText": 0.51, "SpartQA": 0.52, "COPA": 0.72, "PIQA": 0.71,
+    },
+    "Qwen 3.5 4B [BF16]": {
+        "MultiChallenge": 0.55, "RoleBench": 0.52, "GRM-Bench \u2014 Coherence": 0.49,
+        "GRM-Bench \u2014 Response Diversity": 0.47, "GRM-Bench \u2014 Context Adaption": 0.51,
+        "DialogueNLI": 0.54, "RoleMRC": 0.48, "EQBench v3": 0.50,
+        "BFCLv3": 0.56, "Tau2-Bench": 0.50, "ToolSandbox": 0.48,
+        "When2Call": 0.52, "GRM-Bench \u2014 Prompt Robustness": 0.45,
+        "BFCLv4": 0.53, "T-Eval": 0.49,
+        "RULER": 0.60, "GaRAGe": 0.48, "IFBench": 0.52, "AA LCR": 0.46,
+        "StructEval-T": 0.49, "InverseIFEval": 0.44, "RAGTruth": 0.51,
+        "SpatialText": 0.42, "SpartQA": 0.43, "COPA": 0.63, "PIQA": 0.62,
+    },
+    "Qwen 3.5 2B [BF16]": {
+        "MultiChallenge": 0.46, "RoleBench": 0.43, "GRM-Bench \u2014 Coherence": 0.40,
+        "GRM-Bench \u2014 Response Diversity": 0.38, "GRM-Bench \u2014 Context Adaption": 0.42,
+        "DialogueNLI": 0.45, "RoleMRC": 0.39, "EQBench v3": 0.41,
+        "BFCLv3": 0.47, "Tau2-Bench": 0.41, "ToolSandbox": 0.39,
+        "When2Call": 0.43, "GRM-Bench \u2014 Prompt Robustness": 0.36,
+        "BFCLv4": 0.44, "T-Eval": 0.40,
+        "RULER": 0.51, "GaRAGe": 0.39, "IFBench": 0.43, "AA LCR": 0.38,
+        "StructEval-T": 0.40, "InverseIFEval": 0.35, "RAGTruth": 0.42,
+        "SpatialText": 0.33, "SpartQA": 0.34, "COPA": 0.54, "PIQA": 0.53,
+    },
+}
+# fmt: on

scoring.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""
+GRM Evaluation Suite — Scoring Logic
+GRM Score = mean(Roleplay_score, Actions_score, General_score)
+Each category score is a weighted average of its benchmarks:
+  category_score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
+"""
+from benchmarks import BENCHMARKS, CATEGORIES, get_benchmarks_by_category
+from scores import MODEL_SCORES
+def compute_category_score(
+    model_scores: dict[str, float | None],
+    category: str,
+) -> float | None:
+    """Weighted average of available benchmark scores in a category.
+
+    Args:
+        model_scores: Mapping of benchmark name to score; a missing key or a
+            None value means "no result" for that benchmark.
+        category: Category identifier passed to get_benchmarks_by_category.
+
+    Returns:
+        sum(score * calc_weight) / sum(calc_weight) over the benchmarks that
+        have a score, on the same scale as the input scores, or None when
+        the accumulated weight is zero (e.g. no benchmark has a score).
+    """
+    benchmarks = get_benchmarks_by_category(category)
+    total_weight = 0.0
+    total_value = 0.0
+    for b in benchmarks:
+        # .get() returns None for benchmarks the model was not scored on.
+        score = model_scores.get(b["name"])
+        if score is not None:
+            # Only scored benchmarks contribute to the denominator, so a
+            # missing result does not drag the average toward zero.
+            total_weight += b["calc_weight"]
+            total_value += score * b["calc_weight"]
+    # Guard against division by zero when nothing contributed.
+    if total_weight == 0:
+        return None
+    return total_value / total_weight
+def compute_grm_score(model_scores: dict[str, float | None]) -> dict:
+    """Return GRM Score and per-category scores for a single model.
+    All returned values are on a 0–100 scale, or None if no data.
+    """
+    cat_scores = {}
+    for cat in CATEGORIES:
+        raw = compute_category_score(model_scores, cat)
+        # Scale by 100 and round to one decimal; keep None for empty categories.
+        cat_scores[cat] = round(raw * 100, 1) if raw is not None else None
+    # The overall score is the plain mean of the already-rounded category
+    # values; categories without data are left out of the mean entirely.
+    available = [v for v in cat_scores.values() if v is not None]
+    grm = round(sum(available) / len(available), 1) if available else None
+    # Keys are the column labels; the direct lookups below require
+    # CATEGORIES to contain ROLEPLAY, ACTIONS and GENERAL.
+    return {
+        "GRM Score": grm,
+        "Roleplay (33%)": cat_scores["ROLEPLAY"],
+        "Actions (33%)": cat_scores["ACTIONS"],
+        "General (33%)": cat_scores["GENERAL"],
+    }
+def build_leaderboard() -> list[dict]:
+    """Compute scores for all models and return sorted rows.
+
+    Each row is the dict from compute_grm_score plus a "Model" name and a
+    1-based "Rank". Rows are ordered by "GRM Score", highest first.
+    """
+    rows = []
+    for model_name, model_scores in MODEL_SCORES.items():
+        result = compute_grm_score(model_scores)
+        result["Model"] = model_name
+        rows.append(result)
+    # A missing GRM Score sorts as -1, which places such models last as long
+    # as real scores are non-negative. The sort is stable, so equal scores
+    # keep their MODEL_SCORES order.
+    rows.sort(key=lambda r: r["GRM Score"] if r["GRM Score"] is not None else -1, reverse=True)
+    # Rank is the position after sorting; tied scores get consecutive ranks.
+    for i, row in enumerate(rows, start=1):
+        row["Rank"] = i
+    return rows

src/about.py DELETED Viewed

@@ -1,89 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Game Model Leaderboard</h1>"""
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """<h3 align="center">Best LLMs for Gaming</h3>"""
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-"""
-METRICS_TEXT = """
-## Metrics placeholder
-"""
-PERFORMANCE_TEXT = """
-## Performance placeholder
-"""
-ABOUT_TEXT = """
-## About placeholder
-"""
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-@misc{game_ready_models_leaderboard,
- author = {Maria Bagdasarova, Jani Joki, Warren Drones, Irina Filatova},
- title = {Game Ready Model Leaderboard},
- year = {2026},
- publisher = {Hugging Face},
- howpublished = "\url{https://huggingface.co/spaces/nvidia/GRM}"
- }
-"""

src/display/css_html_js.py DELETED Viewed

@@ -1,105 +0,0 @@
-custom_css = """
-.markdown-text {
-    font-size: 16px !important;
-}
-#models-to-add-text {
-    font-size: 18px !important;
-}
-#citation-button span {
-    font-size: 16px !important;
-}
-#citation-button textarea {
-    font-size: 16px !important;
-}
-#citation-button > label > button {
-    margin: 6px;
-    transform: scale(1.3);
-}
-#leaderboard-table {
-    margin-top: 15px
-}
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-#search-bar-table-box > div:first-child {
-    background: none;
-    border: none;
-}
-#search-bar {
-    padding: 0px;
-}
-/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
-#leaderboard-table td:nth-child(2),
-#leaderboard-table th:nth-child(2) {
-    max-width: 400px;
-    overflow: auto;
-    white-space: nowrap;
-}
-.tab-buttons button {
-    font-size: 20px;
-}
-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-#scale-logo .download {
-    display: none;
-}
-#filter_type{
-    border: 0;
-    padding-left: 0;
-    padding-top: 0;
-}
-#filter_type label {
-    display: flex;
-}
-#filter_type label > span{
-    margin-top: var(--spacing-lg);
-    margin-right: 0.5em;
-}
-#filter_type label > .wrap{
-    width: 103px;
-}
-#filter_type label > .wrap .wrap-inner{
-    padding: 2px;
-}
-#filter_type label > .wrap .wrap-inner input{
-    width: 1px
-}
-#filter-columns-type{
-    border:0;
-    padding:0.5;
-}
-#filter-columns-size{
-    border:0;
-    padding:0.5;
-}
-#box-filter > .form{
-    border: 0
-}
-"""
-get_window_url_params = """
-    function(url_params) {
-        const params = new URLSearchParams(window.location.search);
-        url_params = Object.fromEntries(params);
-        return url_params;
-    }
-    """

src/display/formatting.py DELETED Viewed

@@ -1,27 +0,0 @@
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)

src/display/utils.py DELETED Viewed

@@ -1,110 +0,0 @@
-from dataclasses import dataclass, make_dataclass
-from enum import Enum
-import pandas as pd
-from src.about import Tasks
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-    never_hidden: bool = False
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = "" # emoji
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-# Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]

src/envs.py DELETED Viewed

@@ -1,25 +0,0 @@
-import os
-from huggingface_hub import HfApi
-# Info to change for your repository
-# ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
-# ----------------------------------
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
-# If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
-# Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
-API = HfApi(token=TOKEN)

src/leaderboard/read_evals.py DELETED Viewed

@@ -1,196 +0,0 @@
-import glob
-import json
-import math
-import os
-from dataclasses import dataclass
-import dateutil
-import numpy as np
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
-    model: str
-    revision: str # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = "" # submission date of request file
-    still_on_hub: bool = False
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        config = data.get("config")
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        return data_dict
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-    return results

src/populate.py DELETED Viewed

@@ -1,58 +0,0 @@
-import json
-import os
-import pandas as pd
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]

src/submission/check_validity.py DELETED Viewed

@@ -1,99 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-    return True, ""
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-    except Exception as e:
-        return False, "was not found on hub!", None
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    """Gather a list of already submitted models to avoid duplicates"""
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-    return set(file_names), users_to_submission_dates

src/submission/submit.py DELETED Viewed

@@ -1,119 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-    # Seems good, creating the eval
-    print("Adding new eval")
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-    # Remove the local file
-    os.remove(out_path)
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )