Files changed (11) hide show
  1. README.md +29 -28
  2. app.py +380 -226
  3. benchmarks.py +495 -143
  4. data_views.py +339 -0
  5. ref/Coherence_Summary +0 -84
  6. ref/Overview +0 -9
  7. requirements.txt +2 -1
  8. scores.py +273 -87
  9. scoring.py +114 -44
  10. streamlit_app.py +340 -0
  11. ui_theme.py +474 -0
README.md CHANGED
@@ -2,15 +2,14 @@
2
  title: GRM Leaderboard
3
  colorFrom: gray
4
  colorTo: blue
5
- sdk: gradio
6
- sdk_version: "5.23.0"
7
- app_file: app.py
8
  pinned: false
9
  ---
10
 
11
  # GRM Leaderboard
12
 
13
- Static Gradio Space for comparing language models on a game-focused evaluation suite.
14
 
15
  ## What This Project Is
16
 
@@ -22,29 +21,30 @@ Static Gradio Space for comparing language models on a game-focused evaluation s
22
  ## Runtime
23
 
24
  - Platform: Hugging Face Spaces
25
- - UI framework: Gradio
26
- - Entry point: app.py
27
  - Dependencies: requirements.txt
28
  - Space metadata: this README frontmatter
29
- - app.py includes compatibility handling for both Gradio 5 and Gradio 6
30
 
31
  ## Main Page Flow
32
 
33
  Tab 1:
34
  - Overview
35
- - Leaderboard
36
- - Per-Benchmark Score Breakdown
37
- - Evaluation Suite
38
- - Benchmark Details
39
 
40
  Tab 2:
 
41
  - GRM-Bench authored benchmark families
42
 
43
  ## File Ownership
44
 
45
- - app.py: page layout, tabs, CSS, overview copy, table rendering, and GRM-Bench authored sections
46
- - benchmarks.py: benchmark registry, category assignments, descriptions, summaries, and weights
47
- - scores.py: per-model benchmark scores on a 0.0 to 1.0 scale
 
 
 
48
  - scoring.py: category scoring, GRM score calculation, and ranking logic
49
  - requirements.txt: runtime dependencies
50
  - README.md: Space metadata and maintainer handoff notes
@@ -52,18 +52,19 @@ Tab 2:
52
  ## Data Model
53
 
54
  - benchmarks.py stores BENCHMARKS as a list of dicts
55
- - Each benchmark entry includes: name, category, calc_weight, description, summary, paper
56
  - Valid categories are ROLEPLAY, ACTIONS, and GENERAL
57
  - scores.py stores MODEL_SCORES keyed by model display name
58
- - Each model score dict is keyed by benchmark name
59
  - Missing scores are skipped during weighted averaging
 
60
 
61
  ## Scoring
62
 
63
  - Category score = sum(score x weight) / sum(weight)
64
  - GRM score = average of Roleplay, Actions, and General category scores
65
- - scores.py values stay on a 0.0 to 1.0 scale
66
- - Displayed leaderboard values are converted to 0 to 100
67
 
68
  ## How To Update The Site
69
 
@@ -79,29 +80,29 @@ Add a new benchmark:
79
  - Add the benchmark entry to benchmarks.py
80
  - Set its category and calc_weight
81
  - Add corresponding values in scores.py for each model you want included
 
82
 
83
- Update the authored GRM-Bench tab:
84
- - Edit GRM_BENCH_SECTIONS in app.py
85
 
86
  Update page structure, copy, or styling:
87
- - Edit app.py
88
 
89
  ## Local Development
90
 
91
  - Install dependencies: pip install -r requirements.txt
92
- - Run locally: python app.py
93
- - The app launches a local Gradio server using the same static content as the Space
94
 
95
  ## Deployment Notes
96
 
97
  - The live Space deploys from the remote main branch
98
  - README frontmatter controls the Space runtime metadata
99
- - requirements.txt must match imports used by app.py
100
- - Current scores in scores.py are placeholder/static values unless replaced with real outputs
101
 
102
  ## Maintenance Notes
103
 
104
- - The UI uses Python-generated HTML tables, not Gradio Dataframes
105
  - Leaderboard order is recalculated on each launch from scores.py
106
- - Gradio theme and CSS are injected conditionally based on the installed Gradio major version
107
- - If page scrolling behaves oddly, inspect the root overflow and flex overrides in app.py
 
2
  title: GRM Leaderboard
3
  colorFrom: gray
4
  colorTo: blue
5
+ sdk: streamlit
6
+ app_file: streamlit_app.py
 
7
  pinned: false
8
  ---
9
 
10
  # GRM Leaderboard
11
 
12
+ Static Streamlit Space for comparing language models on a game-focused evaluation suite.
13
 
14
  ## What This Project Is
15
 
 
21
  ## Runtime
22
 
23
  - Platform: Hugging Face Spaces
24
+ - UI framework: Streamlit
25
+ - Entry point: streamlit_app.py
26
  - Dependencies: requirements.txt
27
  - Space metadata: this README frontmatter
 
28
 
29
  ## Main Page Flow
30
 
31
  Tab 1:
32
  - Overview
33
+ - Leaderboard and consolidated score explorer
34
+ - Model detail and benchmark detail panels
 
 
35
 
36
  Tab 2:
37
+ - Benchmark Library
38
  - GRM-Bench authored benchmark families
39
 
40
  ## File Ownership
41
 
42
+ - streamlit_app.py: page layout, tabs, controls, score explorer, and benchmark library composition
43
+ - data_views.py: table shaping for leaderboard, benchmark matrix, model details, and benchmark library
44
+ - ui_theme.py: Streamlit CSS, header HTML, overview copy, and theme tokens
45
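+ - app.py: previous Gradio implementation retained during transition; it is no longer the Space entry point and still multiplies scores by 100 for display, so it does not match the 0 to 100 scale now used in scores.py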
46
+ - benchmarks.py: benchmark registry, category assignments, PRD metadata, descriptions, summaries, and weights
47
+ - scores.py: per-model benchmark scores on a 0 to 100 scale
48
  - scoring.py: category scoring, GRM score calculation, and ranking logic
49
  - requirements.txt: runtime dependencies
50
  - README.md: Space metadata and maintainer handoff notes
 
52
  ## Data Model
53
 
54
  - benchmarks.py stores BENCHMARKS as a list of dicts
55
+ - Each benchmark entry includes: id, name, category, domain, source, phase, priority, calc_weight, included_in_grm, description, summary, methodology, detection_scope, paper
56
  - Valid categories are ROLEPLAY, ACTIONS, and GENERAL
57
  - scores.py stores MODEL_SCORES keyed by model display name
58
+ - Each model score dict is keyed by benchmark id
59
  - Missing scores are skipped during weighted averaging
60
+ - scores.py stores MODEL_METADATA for model family, size, precision, and open-weight visibility
61
 
62
  ## Scoring
63
 
64
  - Category score = sum(score x weight) / sum(weight)
65
  - GRM score = average of Roleplay, Actions, and General category scores
66
+ - scores.py values stay on a 0 to 100 scale to match the PRD source data
67
+ - Non-scored dimensions can appear in the Benchmark Library but are excluded from the official GRM calculation
68
 
69
  ## How To Update The Site
70
 
 
80
  - Add the benchmark entry to benchmarks.py
81
  - Set its category and calc_weight
82
  - Add corresponding values in scores.py for each model you want included
83
+ - Set included_in_grm to false for future or reference-only dimensions
84
 
85
+ Update the authored GRM-Bench families:
86
+ - Edit GRM_BENCH_DIMENSIONS in benchmarks.py
87
 
88
  Update page structure, copy, or styling:
89
+ - Edit streamlit_app.py, data_views.py, or ui_theme.py
90
 
91
  ## Local Development
92
 
93
  - Install dependencies: pip install -r requirements.txt
94
+ - Run locally: streamlit run streamlit_app.py
95
+ - The app launches a local Streamlit server using the same static content as the Space
96
 
97
  ## Deployment Notes
98
 
99
  - The live Space deploys from the remote main branch
100
  - README frontmatter controls the Space runtime metadata
101
+ - requirements.txt must match imports used by streamlit_app.py
102
+ - Current scores in scores.py are static PRD-backed values with TBD entries represented as missing scores
103
 
104
  ## Maintenance Notes
105
 
106
+ - The UI uses Streamlit dataframes and Python-generated data views
107
  - Leaderboard order is recalculated on each launch from scores.py
108
+ - Custom CSS is injected from ui_theme.py
 
app.py CHANGED
@@ -388,14 +388,12 @@ def build_evaluation_suite_html() -> str:
388
  "<table class=\"data-table evaluation-suite-table\">",
389
  "<colgroup>",
390
  '<col class="evaluation-suite-category-col">',
391
- '<col class="evaluation-suite-domain-col">',
392
  '<col class="evaluation-suite-benchmark-col">',
393
  '<col class="evaluation-suite-description-col">',
394
  '<col class="evaluation-suite-weight-col">',
395
  "</colgroup>",
396
  "<thead><tr>",
397
  "<th>Category</th>",
398
- "<th>Domain</th>",
399
  "<th>Benchmark</th>",
400
  "<th>Description</th>",
401
  '<th class="weight-column" title="Weight">Wt.</th>',
@@ -405,27 +403,12 @@ def build_evaluation_suite_html() -> str:
405
  for category in CATEGORIES:
406
  benchmarks = get_benchmarks_by_category(category)
407
  rowspan = len(benchmarks)
408
-
409
- domain_rowspans: dict[int, int] = {}
410
- i = 0
411
- while i < len(benchmarks):
412
- d = benchmarks[i]["domain"]
413
- j = i
414
- while j < len(benchmarks) and benchmarks[j]["domain"] == d:
415
- j += 1
416
- domain_rowspans[i] = j - i
417
- i = j
418
-
419
  for index, benchmark in enumerate(benchmarks):
420
  parts.append("<tr>")
421
  if index == 0:
422
  parts.append(
423
  f'<td class="category-cell" rowspan="{rowspan}">{escape(CATEGORY_DISPLAY[category])}</td>'
424
  )
425
- if index in domain_rowspans:
426
- parts.append(
427
- f'<td class="domain-cell" rowspan="{domain_rowspans[index]}">{escape(benchmark["domain"])}</td>'
428
- )
429
  parts.append(f'<td class="benchmark-cell">{escape(benchmark["name"])}</td>')
430
  parts.append(f'<td class="description-cell">{escape(benchmark["description"])}</td>')
431
  parts.append(f'<td class="weight-cell">{_fmt_weight(benchmark["calc_weight"])}</td>')
@@ -452,7 +435,7 @@ def build_category_score_table_html(category: str, include_proprietary: bool = T
452
  row = [model]
453
  for benchmark_name in benchmark_names:
454
  score = MODEL_SCORES[model].get(benchmark_name)
455
- row.append(f"{score * 100:.1f}" if score is not None else "TBD")
456
  rows.append(row)
457
 
458
  return build_html_table(["Model"] + benchmark_names, rows, table_class="category-score-table")
@@ -587,214 +570,429 @@ OVERVIEW_HTML = _load_overview_html()
587
 
588
 
589
  CUSTOM_CSS = """
590
- /* ── Palette ──────────────────────────────────────────────────────────── */
591
  :root {
592
- --bg: #181b1e;
593
- --surface: #1d2126;
594
- --surface-raised: #24282d;
595
- --surface-header: #2b2f34;
596
- --border: #3c4048;
597
- --border-subtle: #313539;
598
- --text: #c8cdd2;
599
- --text-bright: #f5f7f8;
600
- --text-dim: #a2a8ae;
601
- --accent: #76b900;
602
- --accent-hover: #8fd400;
603
- }
604
-
605
- /* ── Page ─────────────────────────────────────────────────────────────── */
606
- html, body {
 
607
  display: block !important;
608
  height: auto !important;
 
 
609
  overflow-y: auto !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
- body { background: var(--bg) !important; }
612
 
613
- /* ── Gradio container ─────────────────────────────────────────────────── */
614
  .gradio-container {
615
  max-width: 1260px !important;
616
  margin: 0 auto !important;
617
  padding: 24px 24px 48px !important;
618
  background: transparent !important;
619
- color: var(--text) !important;
620
  font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
621
  }
622
 
623
- /* ── Tabs ─────────────────────────────────────────────────────────────── */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  .gradio-container .tab-nav {
625
- border-bottom: 1px solid var(--border) !important;
626
  gap: 18px;
627
  margin: 0 0 18px 0 !important;
628
  }
 
629
  .gradio-container .tab-nav button {
630
  background: transparent !important;
631
  border: none !important;
632
  border-radius: 0 !important;
633
- color: var(--text-dim) !important;
634
  font-size: 0.8rem !important;
635
  font-weight: 650 !important;
636
  letter-spacing: 0.08em !important;
 
637
  padding: 0 0 12px 0 !important;
638
  text-transform: uppercase !important;
639
  }
 
640
  .gradio-container .tab-nav button.selected,
641
  .gradio-container .tab-nav button[aria-selected="true"] {
642
  box-shadow: inset 0 -2px 0 var(--accent) !important;
643
- color: var(--text-bright) !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
  }
645
 
646
- /* ── Gradio Markdown / Prose ──────────────────────────────────────────── */
647
- .gradio-container .prose { color: var(--text) !important; background: transparent !important; }
648
- .gradio-container .prose h2 { color: var(--text-bright) !important; font-size: 1.55rem !important; font-weight: 600 !important; }
649
- .gradio-container .prose h3 { color: var(--text-bright) !important; font-size: 1.1rem !important; font-weight: 600 !important; }
650
  .gradio-container .prose p,
651
- .gradio-container .prose li { color: var(--text) !important; font-size: 1.05rem !important; line-height: 1.7 !important; }
652
- .gradio-container .prose strong { color: var(--text-bright) !important; }
653
- .gradio-container .prose a { color: var(--accent) !important; text-decoration: none !important; }
654
-
655
- /* ── Page header ──────────────────────────────────────────────────────── */
656
- .page-header { text-align: center; margin: 4px auto 26px; }
657
- .page-eyebrow { color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.16em; font-size: 0.76rem; margin-bottom: 12px; }
658
- .page-header h1 { color: var(--text-bright); font-size: 2.35rem; line-height: 1.1; letter-spacing: -0.02em; margin: 0; font-weight: 650; }
659
- .page-header p { max-width: 860px; margin: 12px auto 0; color: var(--text); font-size: 1rem; line-height: 1.65; }
660
-
661
- /* ── Overview copy ────────────────────────────────────────────────────── */
662
- .section-note { color: var(--text-dim); font-size: 0.88rem; margin-top: 8px; }
663
- .longform-copy p { color: var(--text); font-size: 1.05rem; line-height: 1.7; margin: 0 0 10px 0; }
664
- .formula-line { color: var(--text-bright) !important; font-weight: 600; margin-top: 12px !important; }
665
- .formula-line.subdued { color: var(--text-dim) !important; font-weight: 500; }
666
-
667
- /* ── Custom HTML tables ───────────────────────────────────────────────── */
668
- .table-shell { width: 100%; margin-top: 10px; }
669
- .table-scroll-shell { overflow-x: auto; }
670
- .leaderboard-shell,
671
- .evaluation-suite-shell { overflow-x: auto; }
672
-
673
- .data-table { width: 100%; border-collapse: collapse; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
 
675
  .data-table thead th {
676
- background: var(--surface-header);
677
- color: var(--text-dim);
678
  font-size: 0.8rem;
679
  font-weight: 650;
680
  text-transform: uppercase;
681
  letter-spacing: 0.04em;
682
  text-align: left;
683
  padding: 11px 12px;
684
- border: 1px solid var(--border);
685
  }
686
- .data-table thead th + th { border-left: 1px solid var(--border); }
687
 
688
- .data-table tbody tr:nth-child(odd) td { background: var(--surface); }
689
- .data-table tbody tr:nth-child(even) td { background: var(--surface-raised); }
690
- .data-table tbody tr:hover td { background: #30353a; }
 
 
 
 
691
 
692
  .data-table td {
693
- color: var(--text);
694
  font-size: 0.94rem;
695
  line-height: 1.45;
696
  padding: 10px 12px;
697
  vertical-align: top;
698
- border: 1px solid var(--border-subtle);
699
  }
700
 
701
- /* Rank-1 row highlight */
702
- # .leaderboard-table tbody tr:first-child td { background: #252d1d; }
 
 
703
 
704
- /* Evaluation suite overrides */
705
- .evaluation-suite-table { table-layout: fixed; }
706
- .evaluation-suite-table thead th { padding: 9px 11px; font-size: 0.78rem; }
707
- .evaluation-suite-table td { padding: 7px 11px; font-size: 0.9rem; line-height: 1.3; }
708
 
709
- .evaluation-suite-category-col { width: 8rem; }
710
- .evaluation-suite-domain-col { width: 11rem; }
711
- .evaluation-suite-benchmark-col { width: 12.5rem; }
712
- .evaluation-suite-weight-col { width: 5ch; }
 
 
 
 
 
 
 
 
 
 
 
 
 
713
 
714
  .evaluation-suite-table .weight-column,
715
  .evaluation-suite-table .weight-cell {
716
  font-variant-numeric: tabular-nums;
 
 
717
  text-align: center;
718
  white-space: nowrap;
719
  width: 5ch;
720
- padding: 0 4px;
 
721
  }
 
722
  .evaluation-suite-table .category-cell {
723
- color: var(--text-dim);
724
  font-size: 0.79rem;
725
  font-weight: 650;
726
  text-transform: uppercase;
727
  letter-spacing: 0.06em;
728
- vertical-align: middle;
729
  min-width: 8rem;
730
- background: var(--surface) !important;
731
  }
732
- .evaluation-suite-table .domain-cell {
733
- color: var(--text-dim);
734
- font-size: 0.82rem;
735
- vertical-align: middle;
736
  overflow-wrap: anywhere;
737
- background: var(--surface) !important;
738
  }
739
- .evaluation-suite-table .benchmark-cell { width: 12.5rem; overflow-wrap: anywhere; }
740
- .evaluation-suite-table .description-cell { min-width: 0; overflow-wrap: anywhere; }
741
 
742
- .grm-bench-scope-table th:first-child,
743
- .grm-bench-scope-table td:first-child { min-width: 180px; width: 180px; }
 
 
 
 
 
744
 
745
- /* ── Accordion ────────────────────────────────────────────────────────── */
746
- /* Gradio 4: .gradio-accordion | Gradio 5: details */
747
- .gradio-container .gradio-accordion,
748
- .gradio-container details {
 
 
 
 
 
749
  background: transparent !important;
750
- border: 1px solid var(--border) !important;
751
- border-radius: 8px !important;
752
  margin-bottom: 6px !important;
753
- overflow: hidden !important;
754
  }
755
- .gradio-container .gradio-accordion > .label-wrap,
756
- .gradio-container details > summary {
757
- background: var(--surface-raised) !important;
758
- color: var(--text-bright) !important;
759
  border: none !important;
 
760
  padding: 0.75rem 0.9rem !important;
761
- cursor: pointer;
 
 
 
 
 
 
 
 
 
 
 
 
762
  font-weight: 600;
763
- font-size: 0.92rem;
764
- list-style: none;
765
- }
766
- .gradio-container .gradio-accordion > .label-wrap:hover,
767
- .gradio-container details > summary:hover { background: #2a2f34 !important; }
768
- .gradio-container details[open] > summary { border-bottom: 1px solid var(--border) !important; }
769
- .gradio-container details > summary::-webkit-details-marker { display: none; }
770
-
771
- /* ── Benchmark details ────────────────────────────────────────────────── */
772
- .benchmark-section { margin-top: 18px; }
773
- .benchmark-section h3 { color: var(--text-bright); font-size: 1.15rem; font-weight: 600; margin: 0 0 8px 0; }
774
- .benchmark-entry { padding: 12px 0; border-bottom: 1px solid var(--border-subtle); }
775
- .benchmark-entry-top { display: flex; align-items: baseline; justify-content: space-between; gap: 12px; flex-wrap: wrap; }
776
- .benchmark-entry-top h4 { margin: 0; color: var(--text-bright); font-size: 1rem; font-weight: 600; }
777
- .benchmark-weight { color: var(--accent); font-size: 0.84rem; white-space: nowrap; }
778
- .benchmark-description { color: var(--text-dim); margin: 6px 0 !important; }
779
- .benchmark-entry p { color: var(--text); line-height: 1.62; margin: 0; }
780
- .benchmark-link { margin-top: 7px; font-size: 0.86rem; }
781
- .benchmark-link a { color: var(--accent); text-decoration: none; }
782
-
783
- /* ── GRM-Bench sections ───────────────────────────────────────────────── */
784
- .grm-bench-section { border-top: 1px solid var(--border-subtle); margin-top: 24px; padding-top: 18px; }
785
- .grm-bench-section:first-of-type { margin-top: 16px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  .grm-bench-kicker,
787
- .grm-bench-subtitle { color: var(--text-dim); font-size: 0.78rem; font-weight: 650; letter-spacing: 0.08em; text-transform: uppercase; }
788
- .grm-bench-section h2 { color: var(--text-bright); font-size: 1.32rem; font-weight: 620; letter-spacing: -0.01em; margin: 4px 0 10px 0; }
789
- .grm-bench-section p { color: var(--text); font-size: 1.05rem; line-height: 1.7; margin: 0 0 10px 0; }
790
- .grm-bench-subtitle { margin: 14px 0 6px 0; }
791
- .grm-bench-sample { background: #171b1f; border: 1px solid var(--border); border-radius: 10px; margin-top: 12px; padding: 14px 16px; }
792
- .grm-bench-sample-id { color: var(--text-bright); font-size: 0.84rem; font-weight: 700; letter-spacing: 0.08em; text-transform: uppercase; }
793
- .grm-bench-sample-meta { margin: 6px 0 0 0 !important; }
794
- .grm-bench-sample-label { color: var(--text-bright); font-weight: 600; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  .grm-bench-sample-code {
796
  background: #0f1215;
797
- border: 1px solid var(--border-subtle);
798
  border-radius: 8px;
799
  color: #d7dde3;
800
  font-family: Consolas, "SFMono-Regular", monospace;
@@ -805,96 +1003,51 @@ body { background: var(--bg) !important; }
805
  padding: 12px 14px;
806
  white-space: pre-wrap;
807
  }
808
- .grm-bench-sample-code code { font-family: inherit; }
809
 
810
- /* ── Scrollbars ───────────────────────────────────────────────────────── */
811
- ::-webkit-scrollbar { width: 6px; height: 6px; }
812
- ::-webkit-scrollbar-track { background: var(--bg); }
813
- ::-webkit-scrollbar-thumb { background: #3a3f45; border-radius: 3px; }
814
- ::-webkit-scrollbar-thumb:hover { background: var(--accent); }
 
 
 
 
815
 
816
- /* ── Responsive ───────────────────────────────────────────────────────── */
817
  @media (max-width: 720px) {
818
- .gradio-container { padding: 20px 14px 40px !important; }
819
- .page-header h1 { font-size: 2rem; }
 
 
 
 
 
 
820
  .data-table thead th,
821
- .data-table td { padding: 10px 9px; }
822
- .leaderboard-shell,
823
- .evaluation-suite-shell { overflow-x: auto; }
824
- .evaluation-suite-table { min-width: 38rem; }
825
- }
826
 
827
- .red-text { color: red !important; }
828
- """
 
 
829
 
 
 
 
 
 
 
 
 
 
 
830
 
831
- _NVIDIA_DARK_THEME = gr.themes.Base().set(
832
- # Page
833
- body_background_fill="#181b1e",
834
- body_text_color="#f5f7f8",
835
- body_text_color_subdued="#a2a8ae",
836
- # Blocks / panels
837
- background_fill_primary="#1d2126",
838
- background_fill_secondary="#15181b",
839
- block_background_fill="#1d2126",
840
- block_border_color="#3c4048",
841
- block_border_width="1px",
842
- block_label_background_fill="#23272c",
843
- block_label_border_color="#3c4048",
844
- block_label_text_color="#f5f7f8",
845
- block_title_background_fill="transparent",
846
- block_title_text_color="#f5f7f8",
847
- block_info_text_color="#a2a8ae",
848
- panel_background_fill="#1d2126",
849
- panel_border_color="#3c4048",
850
- # Inputs
851
- input_background_fill="#24282d",
852
- input_background_fill_focus="#2b2f34",
853
- input_border_color="#3c4048",
854
- input_border_color_focus="#76b900",
855
- input_placeholder_color="#6b7280",
856
- input_shadow="none",
857
- input_shadow_focus="0 0 0 2px rgba(118,185,0,0.25)",
858
- # Buttons
859
- button_primary_background_fill="#76b900",
860
- button_primary_background_fill_hover="#8fd400",
861
- button_primary_border_color="transparent",
862
- button_primary_text_color="#0f1012",
863
- button_secondary_background_fill="#24282d",
864
- button_secondary_background_fill_hover="#2b2f34",
865
- button_secondary_border_color="#3c4048",
866
- button_secondary_text_color="#f5f7f8",
867
- # Borders / accent
868
- border_color_accent="#76b900",
869
- border_color_primary="#3c4048",
870
- color_accent="#76b900",
871
- link_text_color="#76b900",
872
- link_text_color_hover="#8fd400",
873
- link_text_color_visited="#5a9000",
874
- # Table
875
- table_even_background_fill="#1c2024",
876
- table_odd_background_fill="#24282d",
877
- table_row_focus="#30353a",
878
- # Checkbox / radio
879
- checkbox_background_color="#24282d",
880
- checkbox_background_color_selected="#76b900",
881
- checkbox_background_color_hover="#2b2f34",
882
- checkbox_border_color="#3c4048",
883
- checkbox_border_color_focus="#76b900",
884
- checkbox_border_color_selected="#76b900",
885
- checkbox_label_background_fill="transparent",
886
- checkbox_label_background_fill_selected="rgba(118,185,0,0.12)",
887
- checkbox_label_text_color="#f5f7f8",
888
- # Slider
889
- slider_color="#76b900",
890
- # Shadows
891
- shadow_drop="0 2px 8px rgba(0,0,0,0.45)",
892
- shadow_drop_lg="0 4px 20px rgba(0,0,0,0.6)",
893
- )
894
 
895
  blocks_kwargs = {"title": "GRM Score - Game Ready Leaderboard"}
896
  if GRADIO_MAJOR_VERSION < 6:
897
- blocks_kwargs["theme"] = _NVIDIA_DARK_THEME
898
  blocks_kwargs["css"] = CUSTOM_CSS
899
 
900
 
@@ -908,13 +1061,14 @@ with gr.Blocks(**blocks_kwargs) as demo:
908
 
909
  gr.Markdown("## Leaderboard")
910
  gr.Markdown(
911
- "The leaderboard now sits directly after the overview so rankings are visible before the deeper methodology sections.", elem_classes=["red-text"]
912
  )
913
  show_proprietary_models = gr.Checkbox(label="Show proprietary models", value=True)
914
  gr.HTML(
915
  "<div class=\"section-note\">Turn this off to switch the ranking and score breakdowns to an open-source-only view.</div>"
916
  )
917
  leaderboard_html = gr.HTML(build_leaderboard_html())
 
918
 
919
  gr.Markdown("## Per-Benchmark Score Breakdown")
920
  gr.Markdown("Expand a category to inspect the individual benchmark scores backing the leaderboard.")
@@ -952,6 +1106,6 @@ with gr.Blocks(**blocks_kwargs) as demo:
952
  if __name__ == "__main__":
953
  launch_kwargs = {}
954
  if GRADIO_MAJOR_VERSION >= 6:
955
- launch_kwargs["theme"] = _NVIDIA_DARK_THEME
956
  launch_kwargs["css"] = CUSTOM_CSS
957
  demo.launch(**launch_kwargs)
 
388
  "<table class=\"data-table evaluation-suite-table\">",
389
  "<colgroup>",
390
  '<col class="evaluation-suite-category-col">',
 
391
  '<col class="evaluation-suite-benchmark-col">',
392
  '<col class="evaluation-suite-description-col">',
393
  '<col class="evaluation-suite-weight-col">',
394
  "</colgroup>",
395
  "<thead><tr>",
396
  "<th>Category</th>",
 
397
  "<th>Benchmark</th>",
398
  "<th>Description</th>",
399
  '<th class="weight-column" title="Weight">Wt.</th>',
 
403
  for category in CATEGORIES:
404
  benchmarks = get_benchmarks_by_category(category)
405
  rowspan = len(benchmarks)
 
 
 
 
 
 
 
 
 
 
 
406
  for index, benchmark in enumerate(benchmarks):
407
  parts.append("<tr>")
408
  if index == 0:
409
  parts.append(
410
  f'<td class="category-cell" rowspan="{rowspan}">{escape(CATEGORY_DISPLAY[category])}</td>'
411
  )
 
 
 
 
412
  parts.append(f'<td class="benchmark-cell">{escape(benchmark["name"])}</td>')
413
  parts.append(f'<td class="description-cell">{escape(benchmark["description"])}</td>')
414
  parts.append(f'<td class="weight-cell">{_fmt_weight(benchmark["calc_weight"])}</td>')
 
435
  row = [model]
436
  for benchmark_name in benchmark_names:
437
  score = MODEL_SCORES[model].get(benchmark_name)
438
+ row.append(f"{score * 100:.1f}" if score is not None else "-")
439
  rows.append(row)
440
 
441
  return build_html_table(["Model"] + benchmark_names, rows, table_class="category-score-table")
 
570
 
571
 
572
  CUSTOM_CSS = """
 
573
  :root {
574
+ --bg-top: #202327;
575
+ --bg-bottom: #0f1012;
576
+ --surface: #15181b;
577
+ --surface-strong: #24282d;
578
+ --surface-alt: #1d2126;
579
+ --surface-alt-2: #262a2f;
580
+ --text-main: #f5f7f8;
581
+ --text-muted: #c1c6cb;
582
+ --text-soft: #a2a8ae;
583
+ --accent: #76b900;
584
+ --rule: rgba(255, 255, 255, 0.08);
585
+ --rule-soft: rgba(255, 255, 255, 0.05);
586
+ }
587
+
588
+ html,
589
+ body {
590
  display: block !important;
591
  height: auto !important;
592
+ min-height: 100%;
593
+ overflow-x: hidden !important;
594
  overflow-y: auto !important;
595
+ scroll-behavior: auto !important;
596
+ }
597
+
598
+ body {
599
+ background: linear-gradient(180deg, var(--bg-top) 0%, var(--bg-bottom) 100%) !important;
600
+ }
601
+
602
+ .gradio-container,
603
+ .gradio-container .main,
604
+ .gradio-container .wrap,
605
+ .gradio-container .contain,
606
+ .gradio-container [role="tabpanel"] {
607
+ overflow: visible !important;
608
+ max-height: none !important;
609
  }
 
610
 
 
611
  .gradio-container {
612
  max-width: 1260px !important;
613
  margin: 0 auto !important;
614
  padding: 24px 24px 48px !important;
615
  background: transparent !important;
616
+ color: var(--text-main) !important;
617
  font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif !important;
618
  }
619
 
620
+ .page-header {
621
+ text-align: center;
622
+ margin: 4px auto 26px;
623
+ }
624
+
625
+ .page-eyebrow {
626
+ color: var(--text-soft);
627
+ text-transform: uppercase;
628
+ letter-spacing: 0.16em;
629
+ font-size: 0.76rem;
630
+ margin-bottom: 12px;
631
+ }
632
+
633
+ .page-header h1 {
634
+ color: var(--text-main);
635
+ font-size: 2.35rem;
636
+ line-height: 1.1;
637
+ letter-spacing: -0.02em;
638
+ margin: 0;
639
+ font-weight: 650;
640
+ }
641
+
642
+ .page-header p {
643
+ max-width: 860px;
644
+ margin: 12px auto 0;
645
+ color: var(--text-muted);
646
+ font-size: 1rem;
647
+ line-height: 1.65;
648
+ }
649
+
650
  .gradio-container .tab-nav {
651
+ border-bottom: 1px solid var(--rule) !important;
652
  gap: 18px;
653
  margin: 0 0 18px 0 !important;
654
  }
655
+
656
  .gradio-container .tab-nav button {
657
  background: transparent !important;
658
  border: none !important;
659
  border-radius: 0 !important;
660
+ color: var(--text-soft) !important;
661
  font-size: 0.8rem !important;
662
  font-weight: 650 !important;
663
  letter-spacing: 0.08em !important;
664
+ min-width: unset !important;
665
  padding: 0 0 12px 0 !important;
666
  text-transform: uppercase !important;
667
  }
668
+
669
  .gradio-container .tab-nav button.selected,
670
  .gradio-container .tab-nav button[aria-selected="true"] {
671
  box-shadow: inset 0 -2px 0 var(--accent) !important;
672
+ color: var(--text-main) !important;
673
+ }
674
+
675
+ .gradio-container .prose {
676
+ color: var(--text-muted) !important;
677
+ }
678
+
679
+ .gradio-container .prose h2 {
680
+ color: var(--text-main) !important;
681
+ font-size: 1.6rem !important;
682
+ font-weight: 600 !important;
683
+ margin: 1.65rem 0 0.4rem !important;
684
+ letter-spacing: -0.01em;
685
+ }
686
+
687
+ .gradio-container .prose h3 {
688
+ color: var(--text-main) !important;
689
+ font-size: 1.1rem !important;
690
+ font-weight: 600 !important;
691
+ margin: 1.1rem 0 0.4rem !important;
692
  }
693
 
 
 
 
 
694
  .gradio-container .prose p,
695
+ .gradio-container .prose li {
696
+ color: var(--text-muted) !important;
697
+ font-size: 0.98rem !important;
698
+ line-height: 1.65 !important;
699
+ }
700
+
701
+ .gradio-container .prose strong {
702
+ color: var(--text-main) !important;
703
+ }
704
+
705
+ .gradio-container .prose a,
706
+ .benchmark-link a {
707
+ color: var(--accent) !important;
708
+ text-decoration: none !important;
709
+ }
710
+
711
+ .section-note {
712
+ color: var(--text-soft);
713
+ font-size: 0.88rem;
714
+ margin-top: 8px;
715
+ }
716
+
717
+ .longform-copy p {
718
+ color: var(--text-muted);
719
+ font-size: 0.98rem;
720
+ line-height: 1.68;
721
+ margin: 0 0 10px 0;
722
+ }
723
+
724
+ .formula-line {
725
+ color: var(--text-main) !important;
726
+ font-weight: 600;
727
+ margin-top: 12px !important;
728
+ }
729
+
730
+ .formula-line.subdued {
731
+ color: var(--text-soft) !important;
732
+ font-weight: 500;
733
+ margin-top: -1px !important;
734
+ }
735
+
736
+ .table-shell {
737
+ width: 100%;
738
+ margin-top: 10px;
739
+ }
740
+
741
+ .table-scroll-shell {
742
+ overflow-x: auto;
743
+ overflow-y: visible;
744
+ }
745
+
746
+ .evaluation-suite-shell {
747
+ overflow: visible;
748
+ }
749
+
750
+ .leaderboard-shell {
751
+ overflow: visible;
752
+ }
753
+
754
+ .data-table {
755
+ width: 100%;
756
+ border-collapse: collapse;
757
+ border-spacing: 0;
758
+ }
759
 
760
  .data-table thead th {
761
+ background: #2b2f34;
762
+ color: #d2d7dc;
763
  font-size: 0.8rem;
764
  font-weight: 650;
765
  text-transform: uppercase;
766
  letter-spacing: 0.04em;
767
  text-align: left;
768
  padding: 11px 12px;
 
769
  }
 
770
 
771
+ .data-table tbody tr:nth-child(odd) td {
772
+ background: #1c2024;
773
+ }
774
+
775
+ .data-table tbody tr:nth-child(even) td {
776
+ background: #24282d;
777
+ }
778
 
779
  .data-table td {
780
+ color: var(--text-main);
781
  font-size: 0.94rem;
782
  line-height: 1.45;
783
  padding: 10px 12px;
784
  vertical-align: top;
 
785
  }
786
 
787
+ .evaluation-suite-table thead th {
788
+ padding: 9px 11px;
789
+ font-size: 0.78rem;
790
+ }
791
 
792
+ .evaluation-suite-table {
793
+ table-layout: fixed;
794
+ }
 
795
 
796
+ .evaluation-suite-table td {
797
+ padding: 7px 11px;
798
+ font-size: 0.9rem;
799
+ line-height: 1.3;
800
+ }
801
+
802
+ .evaluation-suite-category-col {
803
+ width: 8rem;
804
+ }
805
+
806
+ .evaluation-suite-benchmark-col {
807
+ width: 12.5rem;
808
+ }
809
+
810
+ .evaluation-suite-weight-col {
811
+ width: 5ch;
812
+ }
813
 
814
  .evaluation-suite-table .weight-column,
815
  .evaluation-suite-table .weight-cell {
816
  font-variant-numeric: tabular-nums;
817
+ max-width: 5ch;
818
+ min-width: 5ch;
819
  text-align: center;
820
  white-space: nowrap;
821
  width: 5ch;
822
+ padding-left: 4px;
823
+ padding-right: 4px;
824
  }
825
+
826
  .evaluation-suite-table .category-cell {
827
+ color: var(--text-soft);
828
  font-size: 0.79rem;
829
  font-weight: 650;
830
  text-transform: uppercase;
831
  letter-spacing: 0.06em;
832
+ vertical-align: top;
833
  min-width: 8rem;
 
834
  }
835
+
836
+ .evaluation-suite-table .benchmark-cell,
837
+ .evaluation-suite-table .description-cell {
 
838
  overflow-wrap: anywhere;
839
+ word-break: normal;
840
  }
 
 
841
 
842
+ .evaluation-suite-table .benchmark-cell {
843
+ width: 12.5rem;
844
+ }
845
+
846
+ .evaluation-suite-table .description-cell {
847
+ min-width: 0;
848
+ }
849
 
850
+ .leaderboard-table tbody tr:first-child td {
851
+ background: #252d1d;
852
+ }
853
+
854
+ .data-table tbody tr:hover td {
855
+ background: #30353a;
856
+ }
857
+
858
+ .gradio-accordion {
859
  background: transparent !important;
860
+ border: none !important;
861
+ box-shadow: none !important;
862
  margin-bottom: 6px !important;
 
863
  }
864
+
865
+ .gradio-accordion > .label-wrap {
866
+ background: #23272c !important;
867
+ color: var(--text-main) !important;
868
  border: none !important;
869
+ border-radius: 8px !important;
870
  padding: 0.75rem 0.9rem !important;
871
+ }
872
+
873
+ .gradio-accordion > .label-wrap:hover {
874
+ background: #2a2f34 !important;
875
+ }
876
+
877
+ .benchmark-section {
878
+ margin-top: 18px;
879
+ }
880
+
881
+ .benchmark-section h3 {
882
+ color: var(--text-main);
883
+ font-size: 1.15rem;
884
  font-weight: 600;
885
+ margin: 0 0 8px 0;
886
+ }
887
+
888
+ .benchmark-entry {
889
+ padding: 12px 0;
890
+ border-bottom: 1px solid var(--rule-soft);
891
+ }
892
+
893
+ .benchmark-entry-top {
894
+ display: flex;
895
+ align-items: baseline;
896
+ justify-content: space-between;
897
+ gap: 12px;
898
+ flex-wrap: wrap;
899
+ }
900
+
901
+ .benchmark-entry-top h4 {
902
+ margin: 0;
903
+ color: var(--text-main);
904
+ font-size: 1rem;
905
+ font-weight: 600;
906
+ }
907
+
908
+ .benchmark-weight {
909
+ color: var(--accent);
910
+ font-size: 0.84rem;
911
+ white-space: nowrap;
912
+ }
913
+
914
+ .benchmark-description {
915
+ color: var(--text-soft) !important;
916
+ margin: 6px 0 6px 0 !important;
917
+ }
918
+
919
+ .benchmark-entry p {
920
+ color: var(--text-muted);
921
+ line-height: 1.62;
922
+ margin: 0;
923
+ }
924
+
925
+ .benchmark-link {
926
+ margin-top: 7px;
927
+ font-size: 0.86rem;
928
+ }
929
+
930
+ .grm-bench-section {
931
+ border-top: 1px solid var(--rule-soft);
932
+ margin-top: 24px;
933
+ padding-top: 18px;
934
+ }
935
+
936
+ .grm-bench-section:first-of-type {
937
+ margin-top: 16px;
938
+ }
939
+
940
  .grm-bench-kicker,
941
+ .grm-bench-subtitle {
942
+ color: var(--text-soft);
943
+ font-size: 0.78rem;
944
+ font-weight: 650;
945
+ letter-spacing: 0.08em;
946
+ text-transform: uppercase;
947
+ }
948
+
949
+ .grm-bench-section h2 {
950
+ color: var(--text-main);
951
+ font-size: 1.32rem;
952
+ font-weight: 620;
953
+ letter-spacing: -0.01em;
954
+ margin: 4px 0 10px 0;
955
+ }
956
+
957
+ .grm-bench-section p {
958
+ color: var(--text-muted);
959
+ font-size: 0.97rem;
960
+ line-height: 1.66;
961
+ margin: 0 0 10px 0;
962
+ }
963
+
964
+ .grm-bench-subtitle {
965
+ margin: 14px 0 6px 0;
966
+ }
967
+
968
+ .grm-bench-sample {
969
+ background: #171b1f;
970
+ border: 1px solid var(--rule);
971
+ border-radius: 10px;
972
+ margin-top: 12px;
973
+ padding: 14px 16px;
974
+ }
975
+
976
+ .grm-bench-sample-id {
977
+ color: var(--text-main);
978
+ font-size: 0.84rem;
979
+ font-weight: 700;
980
+ letter-spacing: 0.08em;
981
+ text-transform: uppercase;
982
+ }
983
+
984
+ .grm-bench-sample-meta {
985
+ margin: 6px 0 0 0 !important;
986
+ }
987
+
988
+ .grm-bench-sample-label {
989
+ color: var(--text-main);
990
+ font-weight: 600;
991
+ }
992
+
993
  .grm-bench-sample-code {
994
  background: #0f1215;
995
+ border: 1px solid var(--rule-soft);
996
  border-radius: 8px;
997
  color: #d7dde3;
998
  font-family: Consolas, "SFMono-Regular", monospace;
 
1003
  padding: 12px 14px;
1004
  white-space: pre-wrap;
1005
  }
 
1006
 
1007
+ .grm-bench-sample-code code {
1008
+ font-family: inherit;
1009
+ }
1010
+
1011
+ .grm-bench-scope-table th:first-child,
1012
+ .grm-bench-scope-table td:first-child {
1013
+ min-width: 180px;
1014
+ width: 180px;
1015
+ }
1016
 
 
1017
  @media (max-width: 720px) {
1018
+ .gradio-container {
1019
+ padding: 20px 14px 40px !important;
1020
+ }
1021
+
1022
+ .page-header h1 {
1023
+ font-size: 2rem;
1024
+ }
1025
+
1026
  .data-table thead th,
1027
+ .data-table td {
1028
+ padding: 10px 9px;
1029
+ }
 
 
1030
 
1031
+ .leaderboard-shell {
1032
+ overflow-x: auto;
1033
+ overflow-y: visible;
1034
+ }
1035
 
1036
+ .evaluation-suite-shell {
1037
+ overflow-x: auto;
1038
+ overflow-y: visible;
1039
+ }
1040
+
1041
+ .evaluation-suite-table {
1042
+ min-width: 38rem;
1043
+ }
1044
+ }
1045
+ """
1046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
 
1048
  blocks_kwargs = {"title": "GRM Score - Game Ready Leaderboard"}
1049
  if GRADIO_MAJOR_VERSION < 6:
1050
+ blocks_kwargs["theme"] = gr.themes.Base()
1051
  blocks_kwargs["css"] = CUSTOM_CSS
1052
 
1053
 
 
1061
 
1062
  gr.Markdown("## Leaderboard")
1063
  gr.Markdown(
1064
+ "The leaderboard now sits directly after the overview so rankings are visible before the deeper methodology sections."
1065
  )
1066
  show_proprietary_models = gr.Checkbox(label="Show proprietary models", value=True)
1067
  gr.HTML(
1068
  "<div class=\"section-note\">Turn this off to switch the ranking and score breakdowns to an open-source-only view.</div>"
1069
  )
1070
  leaderboard_html = gr.HTML(build_leaderboard_html())
1071
+ gr.HTML("<div class=\"section-note\">Placeholder data for layout validation. Replace with real benchmark outputs when ready.</div>")
1072
 
1073
  gr.Markdown("## Per-Benchmark Score Breakdown")
1074
  gr.Markdown("Expand a category to inspect the individual benchmark scores backing the leaderboard.")
 
1106
  if __name__ == "__main__":
1107
  launch_kwargs = {}
1108
  if GRADIO_MAJOR_VERSION >= 6:
1109
+ launch_kwargs["theme"] = gr.themes.Base()
1110
  launch_kwargs["css"] = CUSTOM_CSS
1111
  demo.launch(**launch_kwargs)
benchmarks.py CHANGED
@@ -1,383 +1,735 @@
1
- """
2
- GRM Evaluation Suite — Benchmark Registry
3
 
4
- Each benchmark is defined with:
5
- - name: Display name
6
- - category: ROLEPLAY | ACTIONS | GENERAL
7
- - domain: Sub-category within the category (e.g. "Input resilience")
8
- - description: Short description of what the benchmark tests
9
- - calc_weight: 1.0 (P0/core), 0.5 (P1/supplementary), or 0.25 (P2/common-sense)
10
- - summary: Multi-sentence methodology summary for display
11
- - paper: URL to the paper or resource (if applicable)
12
  """
13
 
14
- BENCHMARKS = [
15
- # ── ROLEPLAY (33%) ──────────────────────────────────────────────
16
- # Domain: Conversational breadth/coherence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  {
 
18
  "name": "MultiChallenge",
19
  "category": "ROLEPLAY",
20
  "domain": "Conversational breadth/coherence",
 
 
 
21
  "calc_weight": 1.0,
22
- "description": "Multi-turn instruction following & coherence across turns",
 
23
  "summary": (
24
- "MultiChallenge evaluates multi-turn instruction following where models must "
25
- "maintain coherence across multiple challenging conversational turns. Each test "
26
- "scenario involves complex, multi-constraint instructions that require the model "
27
- "to track context, resolve references, and keep all prior commitments intact while "
28
- "handling new user requests."
29
  ),
30
- "paper": None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  },
32
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "name": "RoleBench",
34
  "category": "ROLEPLAY",
35
  "domain": "Conversational breadth/coherence",
 
 
 
36
  "calc_weight": 0.5,
37
- "description": "Character-level roleplaying with explicit role framing",
 
38
  "summary": (
39
- "RoleBench is the first systematic and fine-grained character-level benchmark "
40
- "for role-playing, comprising 168,093 samples covering 100 distinct roles. "
41
- "Created via the RoleLLM framework using Context-Instruct for role-specific "
42
- "knowledge extraction and RoleGPT for speaking style imitation. Evaluates a "
43
- "model's ability to maintain character persona, domain knowledge, and consistent "
44
- "speaking style throughout interactions."
45
  ),
 
 
46
  "paper": "https://arxiv.org/abs/2310.00746",
47
  },
48
  {
 
49
  "name": "RoleMRC",
50
  "category": "ROLEPLAY",
51
  "domain": "Conversational breadth/coherence",
 
 
 
52
  "calc_weight": 1.0,
53
- "description": "Follow complex nested instructions while remaining in character",
 
54
  "summary": (
55
- "Role-based Machine Reading Comprehension benchmark that tests the ability to "
56
- "follow complex, nested instructions while remaining fully in character. Combines "
57
- "reading comprehension challenges with role-playing constraints, requiring models "
58
  "to extract and reason about information without breaking persona."
59
  ),
 
 
60
  "paper": None,
61
  },
62
  {
63
- "name": "EQBench v3",
 
64
  "category": "ROLEPLAY",
65
  "domain": "Conversational breadth/coherence",
 
 
 
66
  "calc_weight": 0.5,
67
- "description": "Detect nuances in tone/intent and modulate response accordingly",
 
68
  "summary": (
69
- "Emotional Quotient Benchmark v3 assesses a model's emotional intelligence "
70
- "specifically the ability to detect nuances in tone, intent, and emotional subtext, "
71
- "and to modulate responses accordingly. Tests include recognizing sarcasm, empathy "
72
- "calibration, emotional escalation/de-escalation, and context-appropriate tonal "
73
- "shifts."
74
  ),
 
 
75
  "paper": "https://eqbench.com/",
76
  },
77
  {
 
78
  "name": "GRM - Coherence",
79
  "category": "ROLEPLAY",
80
  "domain": "Conversational breadth/coherence",
81
- "calc_weight": 1.0,
82
- "description": "Logically sound and coherent across turns, without contradictions",
 
 
 
 
83
  "summary": (
84
- "Nvidia-authored benchmark testing resistance to incoherence in gaming dialogue. "
85
- "Scenarios are crafted to invoke common coherence failures, then a model's "
86
- "resilience is measured. Detection covers eight categories: factual/logical errors, "
87
- "cause-effect failures, self-contradiction, personality/background violations, "
88
- "role confusion, irrelevance, knowledge boundary violations, and false premise "
89
- "acceptance."
 
 
 
 
 
 
 
 
 
 
 
90
  ),
91
  "paper": None,
92
  },
93
  {
94
- "name": "GRM - Response Diversity",
 
95
  "category": "ROLEPLAY",
96
  "domain": "Conversational breadth/coherence",
 
 
 
97
  "calc_weight": 1.0,
98
- "description": "Avoids using repetitive language and speech structure",
 
99
  "summary": (
100
- "Nvidia-authored benchmark measuring whether models avoid repetitive language "
101
- "patterns, vocabulary, and sentence structures across varied dialogue exchanges. "
102
- "Evaluates lexical diversity, syntactic variation, and stylistic range when a "
103
- "character is placed in different conversation contexts."
 
 
 
 
 
 
 
 
 
104
  ),
105
  "paper": None,
106
  },
107
-
108
- # ── ACTIONS (33%) ───────────────────────────────────────────────
109
- # Domain: Multi-turn tool calling
110
  {
111
- "name": "BFCLv3",
 
112
  "category": "ACTIONS",
113
  "domain": "Multi-turn tool calling",
 
 
 
114
  "calc_weight": 1.0,
115
- "description": "Serial/parallel tool calling, multi-step settings",
 
116
  "summary": (
117
- "Berkeley Function-Calling Leaderboard v3 evaluates serial and parallel tool "
118
- "calling in multi-step settings across multiple programming languages and complex "
119
- "function schemas. Tests include simple, multiple, parallel, and nested function "
120
- "calls, as well as function relevance detection (knowing when no tool applies)."
121
  ),
 
 
122
  "paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html",
123
  },
124
  {
 
125
  "name": "When2Call-MT",
126
  "category": "ACTIONS",
127
  "domain": "Multi-turn tool calling",
 
 
 
128
  "calc_weight": 1.0,
129
- "description": "Tool call timing — when to trigger, when to follow-up, etc.",
 
130
  "summary": (
131
- "Evaluates tool-call timing decisions: knowing when to invoke a tool, when to ask "
132
- "for clarification first, and when to provide a direct answer without tools. Tests "
133
- "the critical judgment of whether a function call is appropriate given the current "
134
- "conversational context and available information."
135
  ),
 
 
136
  "paper": None,
137
  },
138
  {
 
139
  "name": "ToolSandbox",
140
  "category": "ACTIONS",
141
  "domain": "Multi-turn tool calling",
 
 
 
142
  "calc_weight": 1.0,
143
- "description": "Stateful dependencies + conversational tool calling",
 
144
  "summary": (
145
- "Apple's stateful, conversational, interactive evaluation benchmark for LLM tool "
146
- "use. Includes stateful tool execution with implicit state dependencies between "
147
- "tools, a built-in user simulator supporting on-policy conversational evaluation, "
148
- "and dynamic evaluation of intermediate and final milestones. Tests canonicalization, "
149
- "insufficient information handling, and complex state management."
150
  ),
 
 
151
  "paper": "https://arxiv.org/abs/2408.04682",
152
  },
153
  {
 
154
  "name": "Tau2-Bench",
155
  "category": "ACTIONS",
156
  "domain": "Multi-turn tool calling",
 
 
 
157
  "calc_weight": 1.0,
158
- "description": "Multi-turn interactions w/ real-world commercial operations",
 
159
  "summary": (
160
- "τ²-Bench from Sierra Research is a multi-turn agentic benchmark using "
161
- "dual-control agent-user simulation for testing tool use in real-world commercial "
162
- "operations. The telecom domain contains 114 programmatically generated tasks "
163
- "with varying intents (service, mobile data, MMS). The outcome world-state "
164
- "determines success — e.g. whether Data is functioning after agent completion."
165
  ),
 
 
166
  "paper": "https://arxiv.org/abs/2506.07982",
167
  },
168
  {
169
- "name": "BFCLv4",
 
170
  "category": "ACTIONS",
171
  "domain": "Multi-turn tool calling",
 
 
 
172
  "calc_weight": 0.5,
173
- "description": "Adds memory into the loop, and tests format sensitivity",
 
174
  "summary": (
175
- "Berkeley Function-Calling Leaderboard v4 extends v3 with memory-augmented tool "
176
- "calling scenarios and tests sensitivity to format variations in function schemas. "
177
- "Evaluates how well models handle evolving context windows and maintain tool-call "
178
- "accuracy when schema formats shift."
179
  ),
 
 
180
  "paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html",
181
  },
182
  {
183
- "name": "GRM - Tool Recovery",
 
184
  "category": "ACTIONS",
185
  "domain": "Multi-turn tool calling",
 
 
 
186
  "calc_weight": 1.0,
 
187
  "description": "Recognizes failed tool steps, repairs the plan, and continues without fabrication.",
188
  "summary": (
189
- "Tool Recovery evaluates whether the model can survive partial tool failures instead "
190
- "of derailing or inventing results after one bad tool call."
 
 
 
 
 
 
 
 
 
 
 
191
  ),
192
  "paper": None,
193
  },
194
-
195
- # ── GENERAL (33%) ──────────────────────────────────────────────
196
- # Domain: Input resilience
197
  {
 
198
  "name": "IHEval",
199
  "category": "GENERAL",
200
  "domain": "Input resilience",
 
 
 
201
  "calc_weight": 1.0,
 
202
  "description": "Instruction handling under varied input forms.",
203
  "summary": "IHEval is used as a core General signal for input resilience and instruction handling.",
 
 
204
  "paper": None,
205
  },
206
  {
 
207
  "name": "RULER",
208
  "category": "GENERAL",
209
  "domain": "Input resilience",
 
 
 
210
  "calc_weight": 1.0,
211
- "description": "Needle-in-haystack + polluted state stress test",
 
212
  "summary": (
213
- "RULER (Real-world Understanding of Long-context and Evaluation through Retrieval) "
214
- "extends needle-in-a-haystack testing with multiple retrieval types, multi-hop "
215
- "composition, and aggregation tasks at varying context lengths. Includes polluted "
216
- "state and distractor injection to stress-test long-context faithfulness."
217
  ),
 
 
218
  "paper": "https://arxiv.org/abs/2404.06654",
219
  },
220
  {
 
221
  "name": "LongMemEval",
222
  "category": "GENERAL",
223
  "domain": "Input resilience",
 
 
 
224
  "calc_weight": 0.5,
 
225
  "description": "Long-memory evaluation for retaining and applying relevant context.",
226
  "summary": "LongMemEval contributes a supplementary signal for long-context memory and input resilience.",
 
 
227
  "paper": None,
228
  },
229
  {
 
230
  "name": "AgentIF",
231
  "category": "GENERAL",
232
  "domain": "Input resilience",
 
 
 
233
  "calc_weight": 0.5,
 
234
  "description": "Agent instruction following under practical interaction constraints.",
235
  "summary": "AgentIF contributes a supplementary signal for agentic instruction following.",
 
 
236
  "paper": None,
237
  },
238
  {
239
- "name": "GRM - Prompt Robustness",
 
240
  "category": "GENERAL",
241
  "domain": "Input resilience",
 
 
 
242
  "calc_weight": 1.0,
243
- "description": "Same prompt expressed differently still invoking intended tools",
 
244
  "summary": (
245
- "Nvidia-authored benchmark testing whether semantically equivalent prompts "
246
- "expressed in different phrasings, formality levels, and syntactic structures "
247
- "still correctly invoke the intended tools and actions. Measures robustness of "
248
- "tool-call intent recognition against natural language variation."
 
 
 
 
 
 
 
 
 
249
  ),
250
  "paper": None,
251
  },
252
  {
253
- "name": "GRM - State Tracking",
 
254
  "category": "GENERAL",
255
  "domain": "Input resilience",
 
 
 
256
  "calc_weight": 1.0,
 
257
  "description": "Tracks changing world state without stale values or entity drift.",
258
  "summary": (
259
- "State Adaptation measures whether a model stays synchronized with the newest values, "
260
- "locations, inventories, and statuses while preserving earlier facts that remain true."
 
 
 
 
 
 
 
 
 
 
 
261
  ),
262
  "paper": None,
263
  },
264
-
265
- # Domain: Input grounding
266
  {
 
267
  "name": "GaRAGe",
268
  "category": "GENERAL",
269
  "domain": "Input grounding",
 
 
 
270
  "calc_weight": 0.5,
271
- "description": "Deflect/refuse action when state is insufficient/corrupted",
 
272
  "summary": (
273
- "GaRAGe (Grounded and Attributed RAG Evaluation) tests a model's ability to "
274
- "deflect or refuse action when the provided retrieval context is insufficient, "
275
- "corrupted, or contradictory. Evaluates robustness against adversarial or low-"
276
- "quality retrieved passages and the model's capacity to say 'I don't know' rather "
277
- "than hallucinate an answer."
278
  ),
 
 
279
  "paper": None,
280
  },
281
  {
 
282
  "name": "RAGTruth",
283
  "category": "GENERAL",
284
  "domain": "Input grounding",
 
 
 
285
  "calc_weight": 1.0,
286
- "description": "Hallucinations relative to retrieved context",
 
287
  "summary": (
288
- "RAGTruth benchmarks hallucination detection and prevention in RAG pipelines. "
289
- "Evaluates whether models faithfully ground responses in provided retrieved "
290
- "documents rather than generating unsupported claims. Covers summary-level and "
291
- "sentence-level faithfulness across diverse document types."
292
  ),
 
 
293
  "paper": "https://arxiv.org/abs/2401.00396",
294
  },
295
-
296
- # Domain: Restriction adherence
297
  {
 
298
  "name": "StructEval-T",
299
  "category": "GENERAL",
300
  "domain": "Restriction adherence",
 
 
 
301
  "calc_weight": 0.5,
302
- "description": "Format-following",
 
303
  "summary": (
304
- "StructEval-T evaluates structured output and format-following capabilities. "
305
- "Tests whether models can adhere to specified output templates, formatting "
306
- "constraints (JSON, XML, Markdown, tables), and structural requirements while "
307
- "maintaining content accuracy."
308
  ),
 
 
309
  "paper": None,
310
  },
311
  {
 
312
  "name": "IFBench",
313
  "category": "GENERAL",
314
  "domain": "Restriction adherence",
 
 
 
315
  "calc_weight": 0.5,
316
- "description": "Generic instruction following, not as prone to overfit vs IFEval",
 
317
  "summary": (
318
- "IFBench from AllenAI evaluates precise instruction following with 294 single-turn "
319
- "questions testing counting, formatting, and sentence manipulation. Uses a loose "
320
- "evaluation mode that accounts for extraneous text or formatting. Designed to be "
321
- "less prone to overfitting compared to IFEval."
322
  ),
 
 
323
  "paper": "https://arxiv.org/abs/2507.02833",
324
  },
325
  {
326
- "name": "GRM - Persona Aligned Actions",
 
327
  "category": "GENERAL",
328
  "domain": "Restriction adherence",
 
 
 
329
  "calc_weight": 1.0,
 
330
  "description": "Personality instructions are followed in tool use and planning.",
331
  "summary": (
332
- "Persona-Aligned Actions ensures character and personality constraints carry through "
333
- "action planning, tool selection, and tool arguments rather than only surface dialogue."
 
 
 
 
 
 
 
 
 
334
  ),
335
  "paper": None,
336
  },
337
-
338
- # Domain: Common sense
339
  {
 
340
  "name": "GSM8K",
341
  "category": "GENERAL",
342
  "domain": "Common sense",
 
 
 
343
  "calc_weight": 0.25,
 
344
  "description": "Grade-school math reasoning.",
345
  "summary": "GSM8K contributes a low-weight common-sense and reasoning support signal.",
 
 
346
  "paper": None,
347
  },
348
  {
 
349
  "name": "HumanEval",
350
  "category": "GENERAL",
351
  "domain": "Common sense",
 
 
 
352
  "calc_weight": 0.25,
 
353
  "description": "Code generation and functional reasoning.",
354
  "summary": "HumanEval contributes a low-weight reasoning and implementation support signal.",
 
 
355
  "paper": None,
356
  },
357
  {
 
358
  "name": "MBPP",
359
  "category": "GENERAL",
360
  "domain": "Common sense",
 
 
 
361
  "calc_weight": 0.25,
 
362
  "description": "Basic Python programming problems.",
363
  "summary": "MBPP contributes a low-weight reasoning and implementation support signal.",
 
 
364
  "paper": None,
365
  },
366
  ]
367
 
368
- CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"]
369
- CATEGORY_WEIGHT = 1 / 3 # Each category contributes 33.3%
 
 
370
 
371
- CATEGORY_DISPLAY = {
372
- "ROLEPLAY": "Roleplay",
373
- "ACTIONS": "Actions",
374
- "GENERAL": "General",
375
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
 
378
  def get_benchmarks_by_category(category: str) -> list[dict]:
379
- return [b for b in BENCHMARKS if b["category"] == category]
380
 
381
 
382
  def get_all_benchmark_names() -> list[str]:
383
- return [b["name"] for b in BENCHMARKS]
 
 
 
 
 
 
 
 
 
1
+ """GRM Evaluation Suite benchmark registry.
 
2
 
3
+ Benchmark scores are grouped into three equally weighted categories:
4
+ Roleplay, Actions, and General. Individual benchmarks carry PRD-derived
5
+ priority/source/domain metadata for filtering and methodology display.
 
 
 
 
 
6
  """
7
 
8
+ CATEGORIES = ["ROLEPLAY", "ACTIONS", "GENERAL"]
9
+ CATEGORY_WEIGHT = 1 / 3
10
+
11
+ CATEGORY_DISPLAY = {
12
+ "ROLEPLAY": "Roleplay",
13
+ "ACTIONS": "Actions",
14
+ "GENERAL": "General",
15
+ }
16
+
17
+ SOURCE_EXTERNAL = "External"
18
+ SOURCE_GRM_BENCH = "GRM-Bench"
19
+
20
+
21
+ def _scope(*items: tuple[str, str]) -> list[dict[str, str]]:
22
+ return [{"category": category, "description": description} for category, description in items]
23
+
24
+
25
+ BENCHMARKS: list[dict] = [
26
  {
27
+ "id": "multichallenge",
28
  "name": "MultiChallenge",
29
  "category": "ROLEPLAY",
30
  "domain": "Conversational breadth/coherence",
31
+ "source": SOURCE_EXTERNAL,
32
+ "phase": "Phase 1",
33
+ "priority": "P0",
34
  "calc_weight": 1.0,
35
+ "included_in_grm": True,
36
+ "description": "Multi-turn instruction following and coherence across turns.",
37
  "summary": (
38
+ "MultiChallenge evaluates multi-turn instruction following where models must maintain coherence "
39
+ "across challenging conversational turns, track context, resolve references, and keep prior "
40
+ "commitments intact while handling new user requests."
 
 
41
  ),
42
+ "methodology": "External benchmark score imported into the Roleplay category as a core signal.",
43
+ "detection_scope": [],
44
+ "paper": None,
45
+ "samples": [
46
+ {
47
+ "id": "TEST_001",
48
+ "metadata": [
49
+ ["Tag(s)", "Role Confusion"],
50
+ [
51
+ "Summary",
52
+ "Tests identity confusion between two companions with opposite elemental specialties across 9 rounds of dialogue with heavy noun saturation. Kael (fire mage, Cinderforge Academy, Emberchill Ring) and Lyra (ice mage, Glacial Sanctum, Frostbane Amulet). The final user message deliberately misattributes all key facts. Model must correct every error without confusing the details itself.",
53
+ ],
54
+ [
55
+ "Expectation",
56
+ "Must correct all misattributions: Kael=Cinderforge (not Glacial Sanctum), Lyra=Glacial Sanctum (not Cinderforge), Kael wears Emberchill Ring (ice resist), Lyra wears Frostbane Amulet (fire resist).",
57
+ ],
58
+ ],
59
+ "code": """[
60
+ {
61
+ "role": "system",
62
+ "content": "You are Kael Emberheart, a fire mage and combat specialist who serves as one of the player's two companions in the Shadowveil Dungeon. You are accompanied by Lyra Frostweaver, an ice mage and defensive specialist.\n\nYOUR IDENTITY - KAEL EMBERHEART:\n- Class: Fire Mage..."
63
+ },
64
+ {
65
+ "role": "user",
66
+ "content": "Kael, quick question - what element do you specialize in? And what's your companion Lyra's element?"
67
+ },
68
+ {
69
+ "role": "assistant",
70
+ "content": "Ha! Fire, obviously - I'm Kael Emberheart, pyromancer supreme! Lyra's the frost mage. She handles the cold stuff, I bring the heat."
71
+ },
72
+ {
73
+ "role": "user",
74
+ "content": "There's an ice wraith ahead. Who's better suited to fight it - you or Lyra?"
75
+ },
76
+ ... (8x turns),
77
+ {
78
+ "role": "assistant",
79
+ "content": "The Convergence Blast! I launch a concentrated fireball and Lyra intercepts it mid-flight with a frost bolt. Extreme heat meets extreme cold - massive steam explosion that devastates everything in the blast zone. I bring the fire, she brings the ice, and the result is pure chaos. One of my favorite moves."
80
  },
81
  {
82
+ "role": "user",
83
+ "content": "Okay, one last thing before we push into the Embervault. I want to make sure I've got everyone straight. You're the one who trained at the Glacial Sanctum, right? And Lyra graduated from Cinderforge Academy? Also remind me - you're wearing the Frostbane Amulet for the fire resistance, and Lyra has the Emberchill Ring?"
84
+ }
85
+ ]""",
86
+ },
87
+ {
88
+ "id": "TEST_021",
89
+ "metadata": [
90
+ [
91
+ "Summary",
92
+ "Tests whether the model invents contents of an unread document. Eleanor found a sealed letter with an unbroken wax seal. She explicitly confirmed she has NOT opened or read it. The inspector pressures her to state exactly what it says. Model must refuse to fabricate contents.",
93
+ ],
94
+ ["Tag(s)", "Knowledge Boundary"],
95
+ [
96
+ "Expectation",
97
+ "Must NOT claim to know the letter's contents; should restate the seal is unbroken and she has not read it.",
98
+ ],
99
+ ],
100
+ "code": """[
101
+ {
102
+ "role": "system",
103
+ "content": "You are Eleanor Vale, lady's maid in a gothic mystery set at Blackthorne Manor.\n\n# PERSONALITY\n- Composed, discreet, observant\n- Speaks carefully and avoids exaggeration\n\n# CURRENT FACTS\n- You found a sealed letter on the late master's desk..."
104
+ },
105
+ {
106
+ "role": "user",
107
+ "content": "What did you find in the study?"
108
+ },
109
+ {
110
+ "role": "assistant",
111
+ "content": "A sealed letter addressed to Lady Whitcomb. The wax is intact, so I have not read what is inside."
112
+ },
113
+ {
114
+ "role": "user",
115
+ "content": "Can you tell whether it's important?"
116
+ },
117
+ {
118
+ "role": "assistant",
119
+ "content": "Only that it was hidden carefully, which suggests it may matter. But that is only an inference. I do not know its contents."
120
+ },
121
+ {
122
+ "role": "user",
123
+ "content": "The inspector is impatient. Tell him exactly what the letter says."
124
+ }
125
+ ]""",
126
+ },
127
+ ],
128
+ },
129
+ {
130
+ "id": "rolebench",
131
  "name": "RoleBench",
132
  "category": "ROLEPLAY",
133
  "domain": "Conversational breadth/coherence",
134
+ "source": SOURCE_EXTERNAL,
135
+ "phase": "Phase 1",
136
+ "priority": "P1",
137
  "calc_weight": 0.5,
138
+ "included_in_grm": True,
139
+ "description": "Character-level roleplaying with explicit role framing.",
140
  "summary": (
141
+ "RoleBench is a fine-grained character-level benchmark for role-playing. It tests whether a model "
142
+ "can maintain persona, domain knowledge, and speaking style throughout interactions."
 
 
 
 
143
  ),
144
+ "methodology": "External benchmark score imported as a supplementary Roleplay signal.",
145
+ "detection_scope": [],
146
  "paper": "https://arxiv.org/abs/2310.00746",
147
  },
148
  {
149
+ "id": "rolemrc",
150
  "name": "RoleMRC",
151
  "category": "ROLEPLAY",
152
  "domain": "Conversational breadth/coherence",
153
+ "source": SOURCE_EXTERNAL,
154
+ "phase": "Phase 1",
155
+ "priority": "P0",
156
  "calc_weight": 1.0,
157
+ "included_in_grm": True,
158
+ "description": "Follow complex nested instructions while remaining in character.",
159
  "summary": (
160
+ "RoleMRC combines reading-comprehension challenges with role-playing constraints, requiring models "
 
 
161
  "to extract and reason about information without breaking persona."
162
  ),
163
+ "methodology": "External benchmark score imported into the Roleplay category as a core signal.",
164
+ "detection_scope": [],
165
  "paper": None,
166
  },
167
  {
168
+ "id": "eq_bench_v3",
169
+ "name": "EQ-Bench v3",
170
  "category": "ROLEPLAY",
171
  "domain": "Conversational breadth/coherence",
172
+ "source": SOURCE_EXTERNAL,
173
+ "phase": "Phase 1",
174
+ "priority": "P1",
175
  "calc_weight": 0.5,
176
+ "included_in_grm": True,
177
+ "description": "Detect nuances in tone and intent, then modulate response accordingly.",
178
  "summary": (
179
+ "EQ-Bench v3 assesses emotional intelligence signals such as intent recognition, empathy calibration, "
180
+ "and context-appropriate tonal shifts."
 
 
 
181
  ),
182
+ "methodology": "External benchmark score imported as a supplementary Roleplay signal.",
183
+ "detection_scope": [],
184
  "paper": "https://eqbench.com/",
185
  },
186
  {
187
+ "id": "grm_coherence",
188
  "name": "GRM - Coherence",
189
  "category": "ROLEPLAY",
190
  "domain": "Conversational breadth/coherence",
191
+ "source": SOURCE_GRM_BENCH,
192
+ "phase": "Phase 1",
193
+ "priority": "P1",
194
+ "calc_weight": 0.5,
195
+ "included_in_grm": True,
196
+ "description": "Logically sound and coherent across turns without contradictions.",
197
  "summary": (
198
+ "Nvidia-authored scenarios test resistance to incoherence in game dialogue. Incoherence can surface "
199
+ "as hallucinated details, role confusion, contradictions, irrelevance, or failure to respect what the "
200
+ "character can know."
201
+ ),
202
+ "methodology": (
203
+ "Scenarios are crafted to invoke common coherence failures, then measure whether the model stays "
204
+ "grounded under pressure."
205
+ ),
206
+ "detection_scope": _scope(
207
+ ("Factual / Logical", "Objectively false or contradicted by the system prompt or game state."),
208
+ ("Cause / Effect", "Fails simple logical state transitions or obvious state changes."),
209
+ ("Contradiction", "Contradicts something previously said or done without in-world justification."),
210
+ ("Personality / Background Violation", "Violates an established trait, limitation, or background fact."),
211
+ ("Role Confusion", "Confuses identities, facts, actions, or motivations across entities."),
212
+ ("Irrelevance", "Stops tracking the active subject or responds off-topic."),
213
+ ("Knowledge Boundary", "Invents knowledge the character cannot have."),
214
+ ("False Premise", "Accepts a smuggled-in user premise about something that never happened."),
215
  ),
216
  "paper": None,
217
  },
218
  {
219
+ "id": "grm_response_diversity",
220
+ "name": "GRM - Resp. Div",
221
  "category": "ROLEPLAY",
222
  "domain": "Conversational breadth/coherence",
223
+ "source": SOURCE_GRM_BENCH,
224
+ "phase": "Phase 1",
225
+ "priority": "P0",
226
  "calc_weight": 1.0,
227
+ "included_in_grm": True,
228
+ "description": "Avoids repetitive language and speech structure.",
229
  "summary": (
230
+ "Response Diversity measures whether a model stays engaging without collapsing into repetitive wording, "
231
+ "sentence structure, or stock phrasing across similar prompts and multi-turn play."
232
+ ),
233
+ "methodology": (
234
+ "Equivalent requests are expressed across repeated turns and neighboring scenarios to separate healthy "
235
+ "consistency from repetitive degeneration."
236
+ ),
237
+ "detection_scope": _scope(
238
+ ("Repetition Loop", "Repeats phrases, clauses, or sentence frames across adjacent responses."),
239
+ ("Lexical Compression", "Collapses to a narrow vocabulary even when variation is possible."),
240
+ ("Originality Failure", "Paraphrases the prompt too literally instead of producing fresh in-world language."),
241
+ ("Near-Duplicate Continuation", "Makes superficial wording changes while repeating the same content."),
242
+ ("Style Stagnation", "Cannot vary tone or delivery while preserving the same underlying instruction."),
243
  ),
244
  "paper": None,
245
  },
 
 
 
246
  {
247
+ "id": "bfcl_v3",
248
+ "name": "BFCL v3",
249
  "category": "ACTIONS",
250
  "domain": "Multi-turn tool calling",
251
+ "source": SOURCE_EXTERNAL,
252
+ "phase": "Phase 1",
253
+ "priority": "P0",
254
  "calc_weight": 1.0,
255
+ "included_in_grm": True,
256
+ "description": "Serial and parallel tool calling in multi-step settings.",
257
  "summary": (
258
+ "Berkeley Function-Calling Leaderboard v3 evaluates simple, multiple, parallel, and nested function "
259
+ "calls, plus function relevance detection."
 
 
260
  ),
261
+ "methodology": "External benchmark score imported into the Actions category as a core signal.",
262
+ "detection_scope": [],
263
  "paper": "https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html",
264
  },
265
  {
266
+ "id": "when2call_mt",
267
  "name": "When2Call-MT",
268
  "category": "ACTIONS",
269
  "domain": "Multi-turn tool calling",
270
+ "source": SOURCE_EXTERNAL,
271
+ "phase": "Phase 1",
272
+ "priority": "P0",
273
  "calc_weight": 1.0,
274
+ "included_in_grm": True,
275
+ "description": "Tool-call timing across multi-turn interactions.",
276
  "summary": (
277
+ "When2Call-MT evaluates when to invoke a tool, ask a follow-up question, or answer directly based on "
278
+ "the current conversational context and available information."
 
 
279
  ),
280
+ "methodology": "External benchmark score imported into the Actions category as a core signal.",
281
+ "detection_scope": [],
282
  "paper": None,
283
  },
284
  {
285
+ "id": "toolsandbox",
286
  "name": "ToolSandbox",
287
  "category": "ACTIONS",
288
  "domain": "Multi-turn tool calling",
289
+ "source": SOURCE_EXTERNAL,
290
+ "phase": "Phase 1",
291
+ "priority": "P0",
292
  "calc_weight": 1.0,
293
+ "included_in_grm": True,
294
+ "description": "Stateful dependencies and conversational tool calling.",
295
  "summary": (
296
+ "ToolSandbox evaluates stateful, conversational tool use with implicit state dependencies, simulated "
297
+ "users, and dynamic evaluation of intermediate and final milestones."
 
 
 
298
  ),
299
+ "methodology": "External benchmark score imported into the Actions category as a core signal.",
300
+ "detection_scope": [],
301
  "paper": "https://arxiv.org/abs/2408.04682",
302
  },
303
  {
304
+ "id": "tau2_bench",
305
  "name": "Tau2-Bench",
306
  "category": "ACTIONS",
307
  "domain": "Multi-turn tool calling",
308
+ "source": SOURCE_EXTERNAL,
309
+ "phase": "Phase 1",
310
+ "priority": "P0",
311
  "calc_weight": 1.0,
312
+ "included_in_grm": True,
313
+ "description": "Multi-turn interactions with real-world commercial operations.",
314
  "summary": (
315
+ "Tau2-Bench uses dual-control agent-user simulation to test tool use in real-world commercial "
316
+ "operations where success is determined by the resulting world state."
 
 
 
317
  ),
318
+ "methodology": "External benchmark score imported into the Actions category as a core signal.",
319
+ "detection_scope": [],
320
  "paper": "https://arxiv.org/abs/2506.07982",
321
  },
322
  {
323
+ "id": "bfcl_v4",
324
+ "name": "BFCL v4",
325
  "category": "ACTIONS",
326
  "domain": "Multi-turn tool calling",
327
+ "source": SOURCE_EXTERNAL,
328
+ "phase": "Phase 1",
329
+ "priority": "P1",
330
  "calc_weight": 0.5,
331
+ "included_in_grm": True,
332
+ "description": "Memory-aware tool calling and format sensitivity.",
333
  "summary": (
334
+ "BFCL v4 extends prior function-calling evaluations with memory-augmented scenarios and schema-format "
335
+ "sensitivity checks."
 
 
336
  ),
337
+ "methodology": "External benchmark score imported as a supplementary Actions signal.",
338
+ "detection_scope": [],
339
  "paper": "https://gorilla.cs.berkeley.edu/blogs/12_bfcl_v3_multi_turn.html",
340
  },
341
  {
342
+ "id": "grm_tool_recovery",
343
+ "name": "GRM - Tool Rec",
344
  "category": "ACTIONS",
345
  "domain": "Multi-turn tool calling",
346
+ "source": SOURCE_GRM_BENCH,
347
+ "phase": "Phase 1",
348
+ "priority": "P0",
349
  "calc_weight": 1.0,
350
+ "included_in_grm": True,
351
  "description": "Recognizes failed tool steps, repairs the plan, and continues without fabrication.",
352
  "summary": (
353
+ "Tool Recovery evaluates whether the model can survive partial tool failures instead of derailing or "
354
+ "inventing results after one bad tool call."
355
+ ),
356
+ "methodology": (
357
+ "Benchmarks inject missing calls, malformed arguments, or explicit failures and measure whether the "
358
+ "model retries correctly, replans, or asks for the right follow-up."
359
+ ),
360
+ "detection_scope": _scope(
361
+ ("Missed Invocation", "Fails to issue a required tool call."),
362
+ ("Malformed Retry", "Attempts recovery with incomplete or invalid tool arguments."),
363
+ ("Fabricated Output", "Invents tool output after a failure."),
364
+ ("Recovery Sequencing", "Does not replan correctly after an error or partial result."),
365
+ ("Silent Drop", "Continues as if the failed tool step never mattered."),
366
  ),
367
  "paper": None,
368
  },
 
 
 
369
  {
370
+ "id": "iheval",
371
  "name": "IHEval",
372
  "category": "GENERAL",
373
  "domain": "Input resilience",
374
+ "source": SOURCE_EXTERNAL,
375
+ "phase": "Phase 1",
376
+ "priority": "P0",
377
  "calc_weight": 1.0,
378
+ "included_in_grm": True,
379
  "description": "Instruction handling under varied input forms.",
380
  "summary": "IHEval is used as a core General signal for input resilience and instruction handling.",
381
+ "methodology": "External benchmark score imported into the General category as a core signal.",
382
+ "detection_scope": [],
383
  "paper": None,
384
  },
385
  {
386
+ "id": "ruler",
387
  "name": "RULER",
388
  "category": "GENERAL",
389
  "domain": "Input resilience",
390
+ "source": SOURCE_EXTERNAL,
391
+ "phase": "Phase 1",
392
+ "priority": "P0",
393
  "calc_weight": 1.0,
394
+ "included_in_grm": True,
395
+ "description": "Needle-in-haystack and polluted state long-context stress testing.",
396
  "summary": (
397
+ "RULER extends needle-in-a-haystack tests with multi-hop composition and aggregation tasks at varying "
398
+ "context lengths."
 
 
399
  ),
400
+ "methodology": "External benchmark score imported into the General category as a core signal.",
401
+ "detection_scope": [],
402
  "paper": "https://arxiv.org/abs/2404.06654",
403
  },
404
  {
405
+ "id": "longmemeval",
406
  "name": "LongMemEval",
407
  "category": "GENERAL",
408
  "domain": "Input resilience",
409
+ "source": SOURCE_EXTERNAL,
410
+ "phase": "Phase 1",
411
+ "priority": "P1",
412
  "calc_weight": 0.5,
413
+ "included_in_grm": True,
414
  "description": "Long-memory evaluation for retaining and applying relevant context.",
415
  "summary": "LongMemEval contributes a supplementary signal for long-context memory and input resilience.",
416
+ "methodology": "External benchmark score imported as a supplementary General signal.",
417
+ "detection_scope": [],
418
  "paper": None,
419
  },
420
  {
421
+ "id": "agentif",
422
  "name": "AgentIF",
423
  "category": "GENERAL",
424
  "domain": "Input resilience",
425
+ "source": SOURCE_EXTERNAL,
426
+ "phase": "Phase 1",
427
+ "priority": "P1",
428
  "calc_weight": 0.5,
429
+ "included_in_grm": True,
430
  "description": "Agent instruction following under practical interaction constraints.",
431
  "summary": "AgentIF contributes a supplementary signal for agentic instruction following.",
432
+ "methodology": "External benchmark score imported as a supplementary General signal.",
433
+ "detection_scope": [],
434
  "paper": None,
435
  },
436
  {
437
+ "id": "grm_prompt_robustness",
438
+ "name": "GRM - Prompt Rob",
439
  "category": "GENERAL",
440
  "domain": "Input resilience",
441
+ "source": SOURCE_GRM_BENCH,
442
+ "phase": "Phase 1",
443
+ "priority": "P0",
444
  "calc_weight": 1.0,
445
+ "included_in_grm": True,
446
+ "description": "Same intent expressed differently still triggers the right behavior.",
447
  "summary": (
448
+ "Prompt Robustness checks whether the same underlying intent is handled reliably across terse prompts, "
449
+ "verbose instructions, structured payloads, and mixed formatting."
450
+ ),
451
+ "methodology": (
452
+ "Equivalent requests are expressed in prose, shorthand, JSON, XML, and other wrappers to measure "
453
+ "sensitivity to presentation rather than intent."
454
+ ),
455
+ "detection_scope": _scope(
456
+ ("Format Sensitivity", "Succeeds in prose but fails when wrapped in structured formats."),
457
+ ("Instruction Alias Failure", "Equivalent wording changes behavior more than they should."),
458
+ ("Verbosity Dependency", "Requires unusually long prompting to perform an inferable task."),
459
+ ("Tool Intent Drift", "Misses the right tool plan when phrasing changes."),
460
+ ("Structure Overfitting", "Responds too literally to markup instead of the underlying request."),
461
  ),
462
  "paper": None,
463
  },
464
  {
465
+ "id": "grm_state_adaptation",
466
+ "name": "GRM - State Adapt",
467
  "category": "GENERAL",
468
  "domain": "Input resilience",
469
+ "source": SOURCE_GRM_BENCH,
470
+ "phase": "Phase 1",
471
+ "priority": "P0",
472
  "calc_weight": 1.0,
473
+ "included_in_grm": True,
474
  "description": "Tracks changing world state without stale values or entity drift.",
475
  "summary": (
476
+ "State Adaptation measures whether a model stays synchronized with the newest values, locations, "
477
+ "inventories, and statuses while preserving earlier facts that remain true."
478
+ ),
479
+ "methodology": (
480
+ "Stateful scenarios update facts mid-conversation and require the model to carry forward the latest "
481
+ "values while keeping dependent details accurate."
482
+ ),
483
+ "detection_scope": _scope(
484
+ ("State Drift", "Values change without cause as the conversation continues."),
485
+ ("Temporal Mismatch", "Old state is treated as current after a newer update."),
486
+ ("Entity Attribute Drift", "Names, inventory, location, or status details mutate incorrectly."),
487
+ ("Partial Update Failure", "One field updates but dependent fields remain stale."),
488
+ ("Conflict Resolution", "Cannot reconcile new information with earlier context."),
489
  ),
490
  "paper": None,
491
  },
 
 
492
  {
493
+ "id": "garage",
494
  "name": "GaRAGe",
495
  "category": "GENERAL",
496
  "domain": "Input grounding",
497
+ "source": SOURCE_EXTERNAL,
498
+ "phase": "Phase 1",
499
+ "priority": "P1",
500
  "calc_weight": 0.5,
501
+ "included_in_grm": True,
502
+ "description": "Deflects or refuses action when retrieved state is insufficient or corrupted.",
503
  "summary": (
504
+ "GaRAGe tests whether a model can ground responses in provided context and avoid hallucinating when "
505
+ "retrieved passages are insufficient, corrupted, or contradictory."
 
 
 
506
  ),
507
+ "methodology": "External benchmark score imported as a supplementary grounding signal.",
508
+ "detection_scope": [],
509
  "paper": None,
510
  },
511
  {
512
+ "id": "ragtruth",
513
  "name": "RAGTruth",
514
  "category": "GENERAL",
515
  "domain": "Input grounding",
516
+ "source": SOURCE_EXTERNAL,
517
+ "phase": "Phase 1",
518
+ "priority": "P0",
519
  "calc_weight": 1.0,
520
+ "included_in_grm": True,
521
+ "description": "Hallucination prevention relative to retrieved context.",
522
  "summary": (
523
+ "RAGTruth benchmarks hallucination detection and prevention in RAG pipelines across diverse document "
524
+ "types."
 
 
525
  ),
526
+ "methodology": "External benchmark score imported into the General category as a core grounding signal.",
527
+ "detection_scope": [],
528
  "paper": "https://arxiv.org/abs/2401.00396",
529
  },
 
 
530
  {
531
+ "id": "structeval_t",
532
  "name": "StructEval-T",
533
  "category": "GENERAL",
534
  "domain": "Restriction adherence",
535
+ "source": SOURCE_EXTERNAL,
536
+ "phase": "Phase 1",
537
+ "priority": "P1",
538
  "calc_weight": 0.5,
539
+ "included_in_grm": True,
540
+ "description": "Structured output and format-following.",
541
  "summary": (
542
+ "StructEval-T tests whether models can adhere to specified output templates, formatting constraints, "
543
+ "and structural requirements while maintaining content accuracy."
 
 
544
  ),
545
+ "methodology": "External benchmark score imported as a supplementary restriction-adherence signal.",
546
+ "detection_scope": [],
547
  "paper": None,
548
  },
549
  {
550
+ "id": "ifbench",
551
  "name": "IFBench",
552
  "category": "GENERAL",
553
  "domain": "Restriction adherence",
554
+ "source": SOURCE_EXTERNAL,
555
+ "phase": "Phase 1",
556
+ "priority": "P1",
557
  "calc_weight": 0.5,
558
+ "included_in_grm": True,
559
+ "description": "Generic instruction following with reduced overfit risk.",
560
  "summary": (
561
+ "IFBench evaluates precise instruction following across counting, formatting, and sentence manipulation "
562
+ "tasks."
 
 
563
  ),
564
+ "methodology": "External benchmark score imported as a supplementary restriction-adherence signal.",
565
+ "detection_scope": [],
566
  "paper": "https://arxiv.org/abs/2507.02833",
567
  },
568
  {
569
+ "id": "grm_persona_actions",
570
+ "name": "GRM - Persona Act",
571
  "category": "GENERAL",
572
  "domain": "Restriction adherence",
573
+ "source": SOURCE_GRM_BENCH,
574
+ "phase": "Phase 1",
575
+ "priority": "P0",
576
  "calc_weight": 1.0,
577
+ "included_in_grm": True,
578
  "description": "Personality instructions are followed in tool use and planning.",
579
  "summary": (
580
+ "Persona-Aligned Actions ensures character and personality constraints carry through action planning, "
581
+ "tool selection, and tool arguments rather than only surface dialogue."
582
+ ),
583
+ "methodology": (
584
+ "Scenarios test whether the model selects and sequences actions in ways that respect active persona, "
585
+ "role, and scenario constraints."
586
+ ),
587
+ "detection_scope": _scope(
588
+ ("Persona Drift", "Plans or acts in a way that violates the assigned character."),
589
+ ("Tool-Intent Mismatch", "Selects an action inconsistent with the persona or current scenario."),
590
+ ("Constraint Drop", "Ignores active role restrictions while planning or filling arguments."),
591
  ),
592
  "paper": None,
593
  },
 
 
594
  {
595
+ "id": "gsm8k",
596
  "name": "GSM8K",
597
  "category": "GENERAL",
598
  "domain": "Common sense",
599
+ "source": SOURCE_EXTERNAL,
600
+ "phase": "Phase 1",
601
+ "priority": "P2",
602
  "calc_weight": 0.25,
603
+ "included_in_grm": True,
604
  "description": "Grade-school math reasoning.",
605
  "summary": "GSM8K contributes a low-weight common-sense and reasoning support signal.",
606
+ "methodology": "External benchmark score imported as a low-weight General signal.",
607
+ "detection_scope": [],
608
  "paper": None,
609
  },
610
  {
611
+ "id": "humaneval",
612
  "name": "HumanEval",
613
  "category": "GENERAL",
614
  "domain": "Common sense",
615
+ "source": SOURCE_EXTERNAL,
616
+ "phase": "Phase 1",
617
+ "priority": "P2",
618
  "calc_weight": 0.25,
619
+ "included_in_grm": True,
620
  "description": "Code generation and functional reasoning.",
621
  "summary": "HumanEval contributes a low-weight reasoning and implementation support signal.",
622
+ "methodology": "External benchmark score imported as a low-weight General signal.",
623
+ "detection_scope": [],
624
  "paper": None,
625
  },
626
  {
627
+ "id": "mbpp",
628
  "name": "MBPP",
629
  "category": "GENERAL",
630
  "domain": "Common sense",
631
+ "source": SOURCE_EXTERNAL,
632
+ "phase": "Phase 1",
633
+ "priority": "P2",
634
  "calc_weight": 0.25,
635
+ "included_in_grm": True,
636
  "description": "Basic Python programming problems.",
637
  "summary": "MBPP contributes a low-weight reasoning and implementation support signal.",
638
+ "methodology": "External benchmark score imported as a low-weight General signal.",
639
+ "detection_scope": [],
640
  "paper": None,
641
  },
642
  ]
643
 
644
# Give the GRM - Coherence entry a "samples" list. The list is taken (and
# removed) from the first registry entry; when that entry has no "samples" key
# the coherence entry receives an empty list instead.
# NOTE(review): why the samples start out on BENCHMARKS[0] is not visible from
# this part of the file - confirm before relying on the position.
for _benchmark in BENCHMARKS:
    if _benchmark["id"] != "grm_coherence":
        continue
    _benchmark["samples"] = BENCHMARKS[0].pop("samples", [])
    break
648
 
649
# GRM-Bench dimensions, one dict per dimension with the keys "dimension",
# "phase", "included_in_grm" and "notes". The rows are written as tuples in that
# key order and expanded into dicts below.
GRM_BENCH_DIMENSIONS: list[dict] = [
    {
        "dimension": _name,
        "phase": _phase,
        "included_in_grm": _included,
        "notes": _notes,
    }
    for _name, _phase, _included, _notes in (
        (
            "Coherence",
            "Phase 1",
            True,
            "Logical soundness, contradictions, role confusion, knowledge boundaries, and false premises.",
        ),
        (
            "Response Diversity",
            "Phase 1",
            True,
            "Avoids repetitive language, sentence frames, and stale delivery.",
        ),
        (
            "Prompt Robustness",
            "Phase 1",
            True,
            "Handles the same intent across terse, verbose, structured, and noisy prompts.",
        ),
        (
            "Persona-Aligned Actions",
            "Phase 1",
            True,
            "Personality instructions are followed in tool use and planning.",
        ),
        (
            "Tool Recovery",
            "Phase 1",
            True,
            "Recovers from missed calls, malformed arguments, explicit failures, and fabricated outputs.",
        ),
        (
            "State Adaptation",
            "Phase 1",
            True,
            "Tracks changing world state without stale values or entity drift.",
        ),
        (
            "Emotional Intelligence",
            "Sanity / non-scored",
            False,
            "Important for immersion but marked as not included in GRM score in the PRD.",
        ),
        (
            "Role Diversity",
            "Phase 2 / non-scored",
            False,
            "Diverse characters, accents, speech patterns, and scenarios.",
        ),
        (
            "Spatial & Temporal Awareness",
            "Phase 2",
            False,
            "Tracks entities, locations, navigation, and state over time.",
        ),
        (
            "Group Conversations",
            "Phase 2",
            False,
            "Maintains roleplay quality when multiple parties participate.",
        ),
        (
            "Self Triggering",
            "Phase 2",
            False,
            "Judges when to trigger events from situational awareness without a user prompt.",
        ),
    )
]
717
+
718
# Lookup tables over BENCHMARKS, keyed by "id" and by "name". If two entries
# share a key, the later entry replaces the earlier one.
BENCHMARK_BY_ID = {entry["id"]: entry for entry in BENCHMARKS}
BENCHMARK_BY_NAME = {entry["name"]: entry for entry in BENCHMARKS}
720
 
721
 
722
def get_benchmarks_by_category(category: str) -> list[dict]:
    """Return the registry entries whose ``"category"`` equals *category*, in registry order."""
    matching = []
    for entry in BENCHMARKS:
        if entry["category"] == category:
            matching.append(entry)
    return matching
724
 
725
 
726
def get_all_benchmark_names() -> list[str]:
    """Return the ``"name"`` of every registry entry, in registry order."""
    return [entry["name"] for entry in BENCHMARKS]
728
+
729
+
730
def get_all_benchmark_ids() -> list[str]:
    """Return the ``"id"`` of every registry entry, in registry order."""
    return [entry["id"] for entry in BENCHMARKS]
732
+
733
+
734
def get_benchmark_by_id(benchmark_id: str) -> dict | None:
    """Return the registry entry stored under *benchmark_id*, or ``None`` when there is none."""
    entry = BENCHMARK_BY_ID.get(benchmark_id)
    return entry
data_views.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Table view helpers for the Streamlit GRM leaderboard."""
2
+
3
+ from collections.abc import Iterable
4
+
5
+ import pandas as pd
6
+
7
+ from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, GRM_BENCH_DIMENSIONS
8
+ from scores import MODEL_METADATA, MODEL_SCORES
9
+ from scoring import build_leaderboard, compute_category_components, compute_grm_score, get_score
10
+
11
# Labels of the score-explorer views. build_score_explorer_frame compares its
# ``view`` argument against VIEW_SUMMARY and VIEW_CATEGORY; any other value
# falls through to the benchmark matrix.
VIEW_SUMMARY = "Summary"
VIEW_CATEGORY = "Category"
VIEW_MATRIX = "Benchmark matrix"
14
+
15
+
16
+ def format_score(value: float | None) -> str:
17
+ return f"{value:.1f}" if value is not None else "TBD"
18
+
19
+
20
def _with_tbd(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *frame* converted to strings, with missing cells shown as ``"TBD"``."""
    present = pd.notna(frame)
    as_objects = frame.astype("object")
    # Cells where ``present`` is False are replaced before the final str cast.
    return as_objects.where(present, "TBD").astype(str)
22
+
23
+
24
def category_options() -> list[str]:
    """Return ``"All"`` followed by the display label of each entry in ``CATEGORIES``."""
    labels = ["All"]
    labels.extend(CATEGORY_DISPLAY[category] for category in CATEGORIES)
    return labels
26
+
27
+
28
def category_from_label(label: str) -> str | None:
    """Map a display label back to its category key.

    Returns the first key of ``CATEGORY_DISPLAY`` whose display text equals
    *label*, or ``None`` when no category uses that label (for example "All").
    """
    return next(
        (key for key, shown in CATEGORY_DISPLAY.items() if label == shown),
        None,
    )
33
+
34
+
35
def available_domains() -> list[str]:
    """Return the distinct ``"domain"`` values found in ``BENCHMARKS``, sorted."""
    domains = {entry["domain"] for entry in BENCHMARKS}
    return sorted(domains)
37
+
38
+
39
def available_priorities() -> list[str]:
    """Return the distinct ``"priority"`` values found in ``BENCHMARKS``, sorted."""
    priorities = {entry["priority"] for entry in BENCHMARKS}
    return sorted(priorities)
41
+
42
+
43
def available_sources() -> list[str]:
    """Return the distinct ``"source"`` values found in ``BENCHMARKS``, sorted."""
    sources = {entry["source"] for entry in BENCHMARKS}
    return sorted(sources)
45
+
46
+
47
+ def parameter_bounds() -> tuple[float, float]:
48
+ values = [
49
+ metadata["parameter_b"]
50
+ for metadata in MODEL_METADATA.values()
51
+ if isinstance(metadata.get("parameter_b"), int | float)
52
+ ]
53
+ return (0.0, max(values) if values else 120.0)
54
+
55
+
56
def _matches_search(benchmark: dict, search: str) -> bool:
    """Return True when *search* occurs, ignoring case, in the benchmark's text fields.

    The fields searched are name, description, summary, domain, source and
    priority. Leading and trailing whitespace in *search* is ignored, and an
    empty or whitespace-only query matches every benchmark. Fields that are
    absent or ``None`` contribute no text, so they cannot match a query such
    as "none".
    """
    needle = search.strip().lower() if search else ""
    if not needle:
        return True
    haystack = " ".join(
        str(value)
        for key in ("name", "description", "summary", "domain", "source", "priority")
        if (value := benchmark.get(key)) is not None
    ).lower()
    return needle in haystack
64
+
65
+
66
def filter_benchmarks(
    category: str | None = None,
    priorities: Iterable[str] | None = None,
    sources: Iterable[str] | None = None,
    domains: Iterable[str] | None = None,
    search: str = "",
    include_non_scored: bool = False,
) -> list[dict]:
    """Return the entries of ``BENCHMARKS`` that pass every active filter, in registry order.

    A filter that is empty or ``None`` is inactive. Entries whose
    ``"included_in_grm"`` flag is falsy are dropped unless
    *include_non_scored* is true. *search* is delegated to ``_matches_search``.
    """
    wanted_priorities = set(priorities or [])
    wanted_sources = set(sources or [])
    wanted_domains = set(domains or [])

    def keep(entry: dict) -> bool:
        # Checks run in a fixed order; the text search goes last.
        if category and entry["category"] != category:
            return False
        if wanted_priorities and entry["priority"] not in wanted_priorities:
            return False
        if wanted_sources and entry["source"] not in wanted_sources:
            return False
        if wanted_domains and entry["domain"] not in wanted_domains:
            return False
        if not include_non_scored and not entry["included_in_grm"]:
            return False
        return _matches_search(entry, search)

    return [entry for entry in BENCHMARKS if keep(entry)]
95
+
96
+
97
def _format_leaderboard_rows(rows: list[dict], use_filtered_label: bool = False) -> pd.DataFrame:
    """Turn leaderboard rows into a frame with short column names.

    The overall score column is titled "Filtered GRM Score" when
    *use_filtered_label* is true and "GRM Score" otherwise. "Family" and
    "Size" are optional in each row and become ``None`` when absent.
    """
    score_column = "Filtered GRM Score" if use_filtered_label else "GRM Score"
    return pd.DataFrame.from_records(
        [
            {
                "Rank": row["Rank"],
                "Model": row["Model"],
                score_column: row["GRM Score"],
                "Roleplay": row["Roleplay (33%)"],
                "Actions": row["Actions (33%)"],
                "General": row["General (33%)"],
                "Family": row.get("Family"),
                "Size": row.get("Size"),
            }
            for row in rows
        ]
    )
114
+
115
+
116
def build_summary_frame(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the summary leaderboard table, with missing values shown as "TBD".

    Both arguments are forwarded unchanged to ``build_leaderboard``.
    """
    rows = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    table = _format_leaderboard_rows(rows)
    return _with_tbd(table)
121
+
122
+
123
def build_category_frame(
    category: str,
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    filtered_score: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the per-model breakdown table for one category.

    The leaderboard is restricted to *benchmark_ids* only when
    *filtered_score* is true, and the overall score column is then titled
    "Filtered GRM Score". The category components are always computed with
    *benchmark_ids*, whatever *filtered_score* says. Missing values are shown
    as "TBD".
    """
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=benchmark_ids if filtered_score else None,
        parameter_range=parameter_range,
    )
    category_name = CATEGORY_DISPLAY[category]
    score_column = "Filtered GRM Score" if filtered_score else "GRM Score"

    def to_record(row: dict) -> dict:
        # One table row: the leaderboard entry plus this category's components.
        components = compute_category_components(MODEL_SCORES[row["Model"]], category, benchmark_ids)
        return {
            "Rank": row["Rank"],
            "Model": row["Model"],
            score_column: row["GRM Score"],
            f"{category_name} Score": components["score"],
            "Core Avg": components["core_avg"],
            "Supplementary Avg": components["supplementary_avg"],
            "Missing": f"{components['missing']} / {components['benchmarks']}",
            "Family": row.get("Family"),
            "Size": row.get("Size"),
        }

    return _with_tbd(pd.DataFrame.from_records([to_record(row) for row in leaderboard]))
155
+
156
+
157
def build_benchmark_matrix_frame(
    benchmarks: list[dict],
    include_closed: bool = True,
    recalculate_visible: bool = False,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Build the model-by-benchmark score matrix.

    Each row holds the leaderboard columns followed by one column per entry in
    *benchmarks*, titled with the benchmark name. When *recalculate_visible*
    is true the leaderboard is restricted to the ids of *benchmarks* and the
    overall score column is titled "Filtered GRM Score". Missing values are
    shown as "TBD".
    """
    visible_ids = {benchmark["id"] for benchmark in benchmarks}
    leaderboard = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=visible_ids if recalculate_visible else None,
        parameter_range=parameter_range,
    )
    score_column = "Filtered GRM Score" if recalculate_visible else "GRM Score"

    records = []
    for row in leaderboard:
        model = row["Model"]
        record = {
            "Rank": row["Rank"],
            "Model": model,
            score_column: row["GRM Score"],
            "Roleplay": row["Roleplay (33%)"],
            "Actions": row["Actions (33%)"],
            "General": row["General (33%)"],
        }
        # Per-benchmark columns come after the fixed leaderboard columns.
        record.update({benchmark["name"]: get_score(model, benchmark["id"]) for benchmark in benchmarks})
        records.append(record)

    return _with_tbd(pd.DataFrame.from_records(records))
186
+
187
+
188
def build_score_explorer_frame(
    view: str,
    category_label: str,
    benchmarks: list[dict],
    include_closed: bool,
    recalculate_visible: bool,
    parameter_range: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """Dispatch to the table builder that matches the selected explorer view.

    ``VIEW_SUMMARY`` yields the summary table; it is recalculated over the ids
    of *benchmarks* only when *recalculate_visible* is true and at least one
    benchmark is given. ``VIEW_CATEGORY`` yields the category breakdown and
    uses "ROLEPLAY" when *category_label* names no category. Any other *view*
    yields the benchmark matrix.
    """
    selected_category = category_from_label(category_label)
    visible_ids = {benchmark["id"] for benchmark in benchmarks}

    if view == VIEW_CATEGORY:
        return build_category_frame(
            selected_category or "ROLEPLAY",
            include_closed=include_closed,
            benchmark_ids=visible_ids,
            filtered_score=recalculate_visible,
            parameter_range=parameter_range,
        )

    if view != VIEW_SUMMARY:
        return build_benchmark_matrix_frame(
            benchmarks,
            include_closed=include_closed,
            recalculate_visible=recalculate_visible,
            parameter_range=parameter_range,
        )

    if not (recalculate_visible and visible_ids):
        return build_summary_frame(include_closed=include_closed, parameter_range=parameter_range)

    filtered_rows = build_leaderboard(
        include_closed=include_closed,
        benchmark_ids=visible_ids,
        parameter_range=parameter_range,
    )
    return _with_tbd(_format_leaderboard_rows(filtered_rows, use_filtered_label=True))
227
+
228
+
229
def build_benchmark_registry_frame(benchmarks: list[dict]) -> pd.DataFrame:
    """Build the benchmark registry table, one row per entry in *benchmarks*.

    A missing or empty ``"paper"`` value becomes an empty string in the
    "Paper / Repo" column. Missing values elsewhere are shown as "TBD".
    """
    records = [
        {
            "Benchmark": entry["name"],
            "Category": CATEGORY_DISPLAY[entry["category"]],
            "Domain": entry["domain"],
            "Source": entry["source"],
            "Weight": entry["calc_weight"],
            "Included in GRM": "Yes" if entry["included_in_grm"] else "No",
            "Description": entry["description"],
            "Summary": entry["summary"],
            "Paper / Repo": entry.get("paper") or "",
        }
        for entry in benchmarks
    ]
    return _with_tbd(pd.DataFrame.from_records(records))
246
+
247
+
248
def build_grm_dimensions_frame(show_non_scored: bool = True) -> pd.DataFrame:
    """Build the GRM-Bench dimensions table.

    Dimensions whose ``"included_in_grm"`` flag is falsy are listed only when
    *show_non_scored* is true. Missing values are shown as "TBD".
    """
    records = (
        {
            "Dimension": entry["dimension"],
            "Phase": entry["phase"],
            "Included in GRM": "Yes" if entry["included_in_grm"] else "No",
            "Notes": entry["notes"],
        }
        for entry in GRM_BENCH_DIMENSIONS
        if show_non_scored or entry["included_in_grm"]
    )
    return _with_tbd(pd.DataFrame.from_records(records))
263
+
264
+
265
def build_model_detail_frame(model_name: str) -> pd.DataFrame:
    """Build the per-category score breakdown for one model.

    A model absent from ``MODEL_SCORES`` is treated as having no scores.
    Missing values are shown as "TBD".
    """
    model_scores = MODEL_SCORES.get(model_name, {})

    def to_record(category: str) -> dict:
        # One table row per category in CATEGORIES.
        components = compute_category_components(model_scores, category)
        return {
            "Category": CATEGORY_DISPLAY[category],
            "Score": components["score"],
            "Core Avg": components["core_avg"],
            "Supplementary Avg": components["supplementary_avg"],
            "Missing": f"{components['missing']} / {components['benchmarks']}",
        }

    return _with_tbd(pd.DataFrame.from_records([to_record(category) for category in CATEGORIES]))
280
+
281
+
282
def build_model_benchmark_scores(model_name: str, limit: int = 6, strongest: bool = True) -> pd.DataFrame:
    """Return up to *limit* of a model's highest- or lowest-scoring benchmarks.

    Benchmarks with no score for the model are left out. Rows are ordered from
    the highest score down when *strongest* is true and from the lowest up
    otherwise. A model absent from ``MODEL_SCORES`` yields an empty table.
    """
    model_scores = MODEL_SCORES.get(model_name, {})

    scored = []
    for entry in BENCHMARKS:
        score = model_scores.get(entry["id"])
        if score is None:
            continue
        scored.append(
            {
                "Benchmark": entry["name"],
                "Category": CATEGORY_DISPLAY[entry["category"]],
                "Domain": entry["domain"],
                "Score": score,
            }
        )

    ranked = sorted(scored, key=lambda item: item["Score"], reverse=strongest)
    return _with_tbd(pd.DataFrame.from_records(ranked[:limit]))
296
+
297
+
298
def model_options(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> list[str]:
    """Return the model names from the leaderboard, in leaderboard order.

    Both arguments are forwarded unchanged to ``build_leaderboard``.
    """
    leaderboard = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    names = []
    for row in leaderboard:
        names.append(row["Model"])
    return names
304
+
305
+
306
def benchmark_options(benchmarks: list[dict]) -> list[str]:
    """Return the ``"name"`` of each entry in *benchmarks*, in the given order."""
    names = []
    for entry in benchmarks:
        names.append(entry["name"])
    return names
308
+
309
+
310
def find_benchmark_by_name(name: str) -> dict | None:
    """Return the first entry of ``BENCHMARKS`` whose ``"name"`` equals *name*, else ``None``."""
    return next(
        (entry for entry in BENCHMARKS if entry["name"] == name),
        None,
    )
315
+
316
+
317
def score_stats(
    include_closed: bool = True,
    parameter_range: tuple[float, float] | None = None,
) -> dict[str, str]:
    """Return the headline statistics for the leaderboard, as display strings.

    "Top model" and "Best open-source model" are the first leaderboard row and
    the first row whose metadata has a truthy ``"open_weights"``; each is "-"
    when there is no such row. "Active benchmarks" counts the benchmark ids
    that have a non-``None`` score for any model in ``MODEL_SCORES``; it does
    not depend on the two filter arguments.
    """
    leaderboard = build_leaderboard(include_closed=include_closed, parameter_range=parameter_range)
    open_rows = [
        row
        for row in leaderboard
        if MODEL_METADATA.get(row["Model"], {}).get("open_weights")
    ]

    active_benchmarks = set()
    for scores in MODEL_SCORES.values():
        for benchmark_id, score in scores.items():
            if score is not None:
                active_benchmarks.add(benchmark_id)

    top_model = leaderboard[0]["Model"] if leaderboard else "-"
    best_open = open_rows[0]["Model"] if open_rows else "-"
    return {
        "Top model": top_model,
        "Best open-source model": best_open,
        "Models": str(len(leaderboard)),
        "Active benchmarks": str(len(active_benchmarks)),
        "Latest data source": "GRM Eval - Benchmarks PRD.pdf",
    }
336
+
337
+
338
def official_score_for_model(model_name: str) -> dict[str, float | None]:
    """Return ``compute_grm_score`` for one model's scores.

    A model absent from ``MODEL_SCORES`` is scored as if it had no results.
    """
    model_scores = MODEL_SCORES.get(model_name, {})
    return compute_grm_score(model_scores)
ref/Coherence_Summary DELETED
@@ -1,84 +0,0 @@
1
- Coherence
2
- Above all other factors that can break immersion and the character/NPC interaction experience are instances in which the model's response seems illogical or irrelevant. Incoherence can occur at any time and for any number of reasons, ranging from outright hallucinations to confusion about the model's role or contradiction of a previous statement, event, or world state.
3
-
4
- Test Methodology
5
- Since incoherence can occur in any instance it means we need a wide range of detection to mark them, but it means almost any testing structure could still be useful. Our strategy however, is to craft test scenarios that are likely to invoke a common failure, then test a model's resilience to it.
6
-
7
- Detection Scope:
8
- Factual / Logical - Objectively false or contradicted by the system prompt, gamestate (e.g. hallucinates details, refers to an entity that doesn't exist, invents a rule not established anywhere in context).
9
- Cause/ Effect - Fails to understand simple logical state transitions (e.g. “just drank whole glass of water therefore glass is empty”)
10
- Contradiction - The response contradicts something the assistant previously said or did in an earlier turn without any in-world justification.
11
- Personality / Background Violation - The response has the character do or claim something that directly violates a trait, limitation, or background fact established.
12
- Role Confusion - Confuses the facts, details, actions, or motivations of one entity or identity with those of another.
13
- Irrelevance - The response does not seem relevant and/or confuses the subject matter or topic in discussion.
14
- Knowledge Boundary - Invents knowledge it cannot possibly have, instead of clearly separating observation from speculation
15
- False Premise - Incorrectly accepts a user’s smuggled-in premise about something that never happened
16
-
17
- Test Samples
18
-
19
-
20
- TEST_001
21
- Tag(s): Role Confusion
22
- Summary: Tests identity confusion between two companions with opposite elemental specialties across 9 rounds of dialogue with heavy noun saturation. Kael (fire mage, Cinderforge Academy, Emberchill Ring) and Lyra (ice mage, Glacial Sanctum, Frostbane Amulet). The final user message deliberately misattributes all key facts. Model must correct every error without confusing the details itself.
23
- Expectation: Must correct all misattributions: Kael=Cinderforge (not Glacial Sanctum), Lyra=Glacial Sanctum (not Cinderforge), Kael wears Emberchill Ring (ice resist), Lyra wears Frostbane Amulet (fire resist)
24
- Messages:
25
- [
26
- {
27
- "role": "system",
28
- "content": "You are Kael Emberheart, a fire mage and combat specialist who serves as one of the player's two companions in the Shadowveil Dungeon. You are accompanied by Lyra Frostweaver, an ice mage and defensive specialist.\n\nYOUR IDENTITY — KAEL EMBERHEART:\n- Class: Fire Mage…"
29
- },
30
- {
31
- "role": "user",
32
- "content": "Kael, quick question — what element do you specialize in? And what's your companion Lyra's element?"
33
- },
34
- {
35
- "role": "assistant",
36
- "content": "Ha! Fire, obviously — I'm Kael Emberheart, pyromancer supreme! Lyra's the frost mage. She handles the cold stuff, I bring the heat."
37
- },
38
- {
39
- "role": "user",
40
- "content": "There's an ice wraith ahead. Who's better suited to fight it — you or Lyra?"
41
- },
42
- … (8x turns)
43
- {
44
- "role": "assistant",
45
- "content": "The Convergence Blast! I launch a concentrated fireball and Lyra intercepts it mid-flight with a frost bolt. Extreme heat meets extreme cold — massive steam explosion that devastates everything in the blast zone. I bring the fire, she brings the ice, and the result is pure chaos. One of my favorite moves."
46
- },
47
- {
48
- "role": "user",
49
- "content": "Okay, one last thing before we push into the Embervault. I want to make sure I've got everyone straight. You're the one who trained at the Glacial Sanctum, right? And Lyra graduated from Cinderforge Academy? Also remind me — you're wearing the Frostbane Amulet for the fire resistance, and Lyra has the Emberchill Ring?"
50
- }
51
-
52
-
53
- TEST_021
54
- Summary: Tests whether the model invents contents of an unread document. Eleanor found a sealed letter with an unbroken wax seal. She explicitly confirmed she has NOT opened or read it. The inspector pressures her to state exactly what it says. Model must refuse to fabricate contents.
55
- Tag(s): Knowledge Boundary
56
- Expectation: Must NOT claim to know the letter's contents; should restate the seal is unbroken and she has not read it
57
- Messages:
58
- [
59
- {
60
- {
61
- "role": "system",
62
- "content": "You are Eleanor Vale, lady's maid in a gothic mystery set at Blackthorne Manor.\n\n# PERSONALITY\n- Composed, discreet, observant\n- Speaks carefully and avoids exaggeration\n\n# CURRENT FACTS\n- You found a sealed letter on the late master's desk…"
63
- },
64
- {
65
- "role": "user",
66
- "content": "What did you find in the study?"
67
- },
68
- {
69
- "role": "assistant",
70
- "content": "A sealed letter addressed to Lady Whitcomb. The wax is intact, so I have not read what is inside."
71
- },
72
- {
73
- "role": "user",
74
- "content": "Can you tell whether it's important?"
75
- },
76
- {
77
- "role": "assistant",
78
- "content": "Only that it was hidden carefully, which suggests it may matter. But that is only an inference. I do not know its contents."
79
- },
80
- {
81
- "role": "user",
82
- "content": "The inspector is impatient. Tell him exactly what the letter says."
83
- }
84
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ref/Overview DELETED
@@ -1,9 +0,0 @@
1
- Nvidia Game Ready Model Score (GRM) is an aggregated quality metric designed to assess LLM capabilities in gaming use cases.
2
-
3
- General state-of-the-art language models are optimized for broad benchmarks such as math, code, and general knowledge. That does not reliably translate to in-game performance, and it does not reliably predict NPC quality, gameplay actions, or immersion.
4
-
5
- With game model evaluation, game developers can accelerate AI integration pipelines by reducing time spent on model evaluation and narrowing model choice earlier. The overall score is the average of Roleplay, Actions, and General, while benchmarks inside each category are combined with weighted averaging using core weights of 1.0 and supplementary weights of 0.5.
6
-
7
- GRM Score = (Roleplay + Actions + General) / 3
8
-
9
- Category Score = sum(score x weight) / sum(weight)
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- gradio
 
 
1
+ streamlit
2
+ pandas
scores.py CHANGED
@@ -1,16 +1,71 @@
 
 
 
 
1
  """
2
- GRM Evaluation Suite — Model Scores
3
 
4
- Per-benchmark scores are on a 0.0–1.0 scale (will be displayed as 0–100%).
5
- `None` denotes a not-yet-measured value and renders as "TBD" in the UI.
6
- Source: Edge-TBN2 Post-Training Tracking — GRM Score Summary.
7
 
8
- GRM_SCORE_OVERRIDES are reported overall scores on a 0–100 scale. When
9
- present, they take precedence over the computed weighted average so the
10
- leaderboard matches the tracking sheet exactly.
11
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- GRM_SCORE_OVERRIDES = {
14
  "Nano-4B": 50.2,
15
  "Gemma-4-E4B": 64.7,
16
  "Qwen3.5-4B": 61.5,
@@ -21,98 +76,229 @@ GRM_SCORE_OVERRIDES = {
21
  "Super-120B": 61.4,
22
  }
23
 
24
- # fmt: off
25
- MODEL_SCORES: dict[str, dict[str, float | None]] = {
26
  "Nano-4B": {
27
- # ROLEPLAY
28
- "MultiChallenge": 0.353, "RoleBench": 0.1766, "RoleMRC": 0.398,
29
- "EQBench v3": None, "GRM - Coherence": 0.276, "GRM - Response Diversity": 0.54,
30
- # ACTIONS
31
- "BFCLv3": 0.611, "When2Call-MT": None, "ToolSandbox": None,
32
- "Tau2-Bench": 0.292, "BFCLv4": None, "GRM - Tool Recovery": 0.441,
33
- # GENERAL
34
- "IHEval": 0.756, "RULER": 0.912, "LongMemEval": 0.252, "AgentIF": 0.483,
35
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
36
- "GaRAGe": None, "RAGTruth": 0.714,
37
- "StructEval-T": 0.635, "IFBench": 0.601, "GRM - Persona Aligned Actions": 0.78,
38
- "GSM8K": None, "HumanEval": None, "MBPP": None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  },
40
  "Gemma-4-E4B": {
41
- "MultiChallenge": 0.362, "RoleBench": 0.2004, "RoleMRC": 0.74,
42
- "EQBench v3": None, "GRM - Coherence": 0.897, "GRM - Response Diversity": 0.85,
43
- "BFCLv3": 0.659, "When2Call-MT": None, "ToolSandbox": None,
44
- "Tau2-Bench": 0.422, "BFCLv4": 0.424, "GRM - Tool Recovery": 0.647,
45
- "IHEval": 0.686, "RULER": None, "LongMemEval": 0.706, "AgentIF": 0.527,
46
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
47
- "GaRAGe": None, "RAGTruth": 0.777,
48
- "StructEval-T": 0.786, "IFBench": 0.433, "GRM - Persona Aligned Actions": 0.92,
49
- "GSM8K": 0.859, "HumanEval": 0.752, "MBPP": 0.774,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  },
51
  "Qwen3.5-4B": {
52
- "MultiChallenge": 0.49, "RoleBench": 0.1621, "RoleMRC": 0.565,
53
- "EQBench v3": 0.784, "GRM - Coherence": 0.414, "GRM - Response Diversity": 0.46,
54
- "BFCLv3": 0.799, "When2Call-MT": None, "ToolSandbox": None,
55
- "Tau2-Bench": 0.799, "BFCLv4": 0.503, "GRM - Tool Recovery": 0.412,
56
- "IHEval": 0.846, "RULER": 0.95, "LongMemEval": 0.724, "AgentIF": 0.517,
57
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
58
- "GaRAGe": None, "RAGTruth": 0.755,
59
- "StructEval-T": 0.763, "IFBench": 0.592, "GRM - Persona Aligned Actions": 0.39,
60
- "GSM8K": 0.897, "HumanEval": 0.759, "MBPP": 0.763,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  "Gemma-4-E2B": {
63
- "MultiChallenge": 0.33, "RoleBench": 0.1958, "RoleMRC": 0.729,
64
- "EQBench v3": None, "GRM - Coherence": None, "GRM - Response Diversity": None,
65
- "BFCLv3": 0.609, "When2Call-MT": None, "ToolSandbox": None,
66
- "Tau2-Bench": 0.245, "BFCLv4": 0.402, "GRM - Tool Recovery": None,
67
- "IHEval": 0.559, "RULER": None, "LongMemEval": 0.572, "AgentIF": 0.525,
68
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
69
- "GaRAGe": None, "RAGTruth": 0.677,
70
- "StructEval-T": 0.803, "IFBench": 0.378, "GRM - Persona Aligned Actions": None,
71
- "GSM8K": 0.83, "HumanEval": 0.684, "MBPP": 0.681,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  },
73
  "Qwen3.5-2B": {
74
- "MultiChallenge": 0.337, "RoleBench": 0.1311, "RoleMRC": 0.437,
75
- "EQBench v3": 0.543, "GRM - Coherence": None, "GRM - Response Diversity": None,
76
- "BFCLv3": 0.657, "When2Call-MT": None, "ToolSandbox": None,
77
- "Tau2-Bench": 0.488, "BFCLv4": 0.436, "GRM - Tool Recovery": None,
78
- "IHEval": 0.408, "RULER": 0.915, "LongMemEval": 0.32, "AgentIF": 0.46,
79
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
80
- "GaRAGe": None, "RAGTruth": 0.584,
81
- "StructEval-T": 0.544, "IFBench": 0.413, "GRM - Persona Aligned Actions": None,
82
- "GSM8K": 0.853, "HumanEval": 0.702, "MBPP": 0.683,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  },
84
  "Qwen3.5-9B": {
85
- "MultiChallenge": 0.545, "RoleBench": 0.1678, "RoleMRC": 0.667,
86
- "EQBench v3": None, "GRM - Coherence": None, "GRM - Response Diversity": None,
87
- "BFCLv3": 0.746, "When2Call-MT": None, "ToolSandbox": None,
88
- "Tau2-Bench": 0.791, "BFCLv4": 0.661, "GRM - Tool Recovery": None,
89
- "IHEval": 0.856, "RULER": 0.957, "LongMemEval": 0.822, "AgentIF": 0.536,
90
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
91
- "GaRAGe": None, "RAGTruth": 0.722,
92
- "StructEval-T": 0.683, "IFBench": 0.645, "GRM - Persona Aligned Actions": None,
93
- "GSM8K": 0.939, "HumanEval": 0.848, "MBPP": 0.808,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  },
95
  "Nano-30B": {
96
- "MultiChallenge": 0.432, "RoleBench": 0.502, "RoleMRC": 0.1794,
97
- "EQBench v3": None, "GRM - Coherence": None, "GRM - Response Diversity": None,
98
- "BFCLv3": 0.711, "When2Call-MT": None, "ToolSandbox": None,
99
- "Tau2-Bench": 0.49, "BFCLv4": 0.538, "GRM - Tool Recovery": None,
100
- "IHEval": 0.724, "RULER": 0.953, "LongMemEval": 0.456, "AgentIF": 0.529,
101
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
102
- "GaRAGe": None, "RAGTruth": 0.73,
103
- "StructEval-T": 0.769, "IFBench": 0.715, "GRM - Persona Aligned Actions": None,
104
- "GSM8K": 0.923, "HumanEval": 0.781, "MBPP": 0.755,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  },
106
  "Super-120B": {
107
- "MultiChallenge": 0.552, "RoleBench": 0.57, "RoleMRC": 0.1948,
108
- "EQBench v3": None, "GRM - Coherence": None, "GRM - Response Diversity": None,
109
- "BFCLv3": 0.744, "When2Call-MT": None, "ToolSandbox": None,
110
- "Tau2-Bench": 0.612, "BFCLv4": 0.538, "GRM - Tool Recovery": None,
111
- "IHEval": 0.763, "RULER": 0.968, "LongMemEval": 0.756, "AgentIF": 0.527,
112
- "GRM - Prompt Robustness": None, "GRM - State Tracking": None,
113
- "GaRAGe": None, "RAGTruth": 0.749,
114
- "StructEval-T": 0.822, "IFBench": 0.726, "GRM - Persona Aligned Actions": None,
115
- "GSM8K": None, "HumanEval": 0.794, "MBPP": 0.784,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  },
117
  }
118
- # fmt: on
 
1
+ """GRM Evaluation Suite model scores.
2
+
3
+ Scores use the 0-100 scale from ref/GRM_Score_Summary.csv. Missing or TBD
4
+ values are stored as None and displayed as TBD in the UI.
5
  """
 
6
 
7
+ ScoreValue = float | None
 
 
8
 
9
+ MODEL_METADATA: dict[str, dict[str, str | bool | float | None]] = {
10
+ "Nano-4B": {
11
+ "family": "Nemotron",
12
+ "size": "4B",
13
+ "parameter_b": 4.0,
14
+ "open_weights": True,
15
+ "precision": None,
16
+ },
17
+ "Gemma-4-E4B": {
18
+ "family": "Gemma",
19
+ "size": "4B",
20
+ "parameter_b": 4.0,
21
+ "open_weights": True,
22
+ "precision": None,
23
+ },
24
+ "Qwen3.5-4B": {
25
+ "family": "Qwen",
26
+ "size": "4B",
27
+ "parameter_b": 4.0,
28
+ "open_weights": True,
29
+ "precision": None,
30
+ },
31
+ "Gemma-4-E2B": {
32
+ "family": "Gemma",
33
+ "size": "2B",
34
+ "parameter_b": 2.0,
35
+ "open_weights": True,
36
+ "precision": None,
37
+ },
38
+ "Qwen3.5-2B": {
39
+ "family": "Qwen",
40
+ "size": "2B",
41
+ "parameter_b": 2.0,
42
+ "open_weights": True,
43
+ "precision": None,
44
+ },
45
+ "Qwen3.5-9B": {
46
+ "family": "Qwen",
47
+ "size": "9B",
48
+ "parameter_b": 9.0,
49
+ "open_weights": True,
50
+ "precision": None,
51
+ },
52
+ "Nano-30B": {
53
+ "family": "Nemotron",
54
+ "size": "30B",
55
+ "parameter_b": 30.0,
56
+ "open_weights": True,
57
+ "precision": "BF16",
58
+ },
59
+ "Super-120B": {
60
+ "family": "Nemotron",
61
+ "size": "120B+",
62
+ "parameter_b": 120.0,
63
+ "open_weights": True,
64
+ "precision": "BF16",
65
+ },
66
+ }
67
 
68
+ CSV_GRM_SCORES: dict[str, float] = {
69
  "Nano-4B": 50.2,
70
  "Gemma-4-E4B": 64.7,
71
  "Qwen3.5-4B": 61.5,
 
76
  "Super-120B": 61.4,
77
  }
78
 
79
+ MODEL_SCORES: dict[str, dict[str, ScoreValue]] = {
 
80
  "Nano-4B": {
81
+ "multichallenge": 35.3,
82
+ "rolebench": 17.66,
83
+ "rolemrc": 39.8,
84
+ "eq_bench_v3": None,
85
+ "grm_coherence": 27.6,
86
+ "grm_response_diversity": 54.0,
87
+ "bfcl_v3": 61.1,
88
+ "when2call_mt": None,
89
+ "toolsandbox": None,
90
+ "tau2_bench": 29.2,
91
+ "bfcl_v4": None,
92
+ "grm_tool_recovery": 44.1,
93
+ "iheval": 75.6,
94
+ "ruler": 91.2,
95
+ "longmemeval": 25.2,
96
+ "agentif": 48.3,
97
+ "grm_prompt_robustness": None,
98
+ "grm_state_adaptation": None,
99
+ "garage": None,
100
+ "ragtruth": 71.4,
101
+ "structeval_t": 63.5,
102
+ "ifbench": 60.1,
103
+ "grm_persona_actions": 78.0,
104
+ "gsm8k": None,
105
+ "humaneval": None,
106
+ "mbpp": None,
107
  },
108
  "Gemma-4-E4B": {
109
+ "multichallenge": 36.2,
110
+ "rolebench": 20.04,
111
+ "rolemrc": 74.0,
112
+ "eq_bench_v3": None,
113
+ "grm_coherence": 89.7,
114
+ "grm_response_diversity": 85.0,
115
+ "bfcl_v3": 65.9,
116
+ "when2call_mt": None,
117
+ "toolsandbox": None,
118
+ "tau2_bench": 42.2,
119
+ "bfcl_v4": 42.4,
120
+ "grm_tool_recovery": 64.7,
121
+ "iheval": 68.6,
122
+ "ruler": None,
123
+ "longmemeval": 70.6,
124
+ "agentif": 52.7,
125
+ "grm_prompt_robustness": None,
126
+ "grm_state_adaptation": None,
127
+ "garage": None,
128
+ "ragtruth": 77.7,
129
+ "structeval_t": 78.6,
130
+ "ifbench": 43.3,
131
+ "grm_persona_actions": 92.0,
132
+ "gsm8k": 85.9,
133
+ "humaneval": 75.2,
134
+ "mbpp": 77.4,
135
  },
136
  "Qwen3.5-4B": {
137
+ "multichallenge": 49.0,
138
+ "rolebench": 16.21,
139
+ "rolemrc": 56.5,
140
+ "eq_bench_v3": 78.4,
141
+ "grm_coherence": 41.4,
142
+ "grm_response_diversity": 46.0,
143
+ "bfcl_v3": 79.9,
144
+ "when2call_mt": None,
145
+ "toolsandbox": None,
146
+ "tau2_bench": 79.9,
147
+ "bfcl_v4": 50.3,
148
+ "grm_tool_recovery": 41.2,
149
+ "iheval": 84.6,
150
+ "ruler": 95.0,
151
+ "longmemeval": 72.4,
152
+ "agentif": 51.7,
153
+ "grm_prompt_robustness": None,
154
+ "grm_state_adaptation": None,
155
+ "garage": None,
156
+ "ragtruth": 75.5,
157
+ "structeval_t": 76.3,
158
+ "ifbench": 59.2,
159
+ "grm_persona_actions": 39.0,
160
+ "gsm8k": 89.7,
161
+ "humaneval": 75.9,
162
+ "mbpp": 76.3,
163
  },
164
  "Gemma-4-E2B": {
165
+ "multichallenge": 33.0,
166
+ "rolebench": 19.58,
167
+ "rolemrc": 72.9,
168
+ "eq_bench_v3": None,
169
+ "grm_coherence": None,
170
+ "grm_response_diversity": None,
171
+ "bfcl_v3": 60.9,
172
+ "when2call_mt": None,
173
+ "toolsandbox": None,
174
+ "tau2_bench": 24.5,
175
+ "bfcl_v4": 40.2,
176
+ "grm_tool_recovery": None,
177
+ "iheval": 55.9,
178
+ "ruler": None,
179
+ "longmemeval": 57.2,
180
+ "agentif": 52.5,
181
+ "grm_prompt_robustness": None,
182
+ "grm_state_adaptation": None,
183
+ "garage": None,
184
+ "ragtruth": 67.7,
185
+ "structeval_t": 80.3,
186
+ "ifbench": 37.8,
187
+ "grm_persona_actions": None,
188
+ "gsm8k": 83.0,
189
+ "humaneval": 68.4,
190
+ "mbpp": 68.1,
191
  },
192
  "Qwen3.5-2B": {
193
+ "multichallenge": 33.7,
194
+ "rolebench": 13.11,
195
+ "rolemrc": 43.7,
196
+ "eq_bench_v3": 54.3,
197
+ "grm_coherence": None,
198
+ "grm_response_diversity": None,
199
+ "bfcl_v3": 65.7,
200
+ "when2call_mt": None,
201
+ "toolsandbox": None,
202
+ "tau2_bench": 48.8,
203
+ "bfcl_v4": 43.6,
204
+ "grm_tool_recovery": None,
205
+ "iheval": 40.8,
206
+ "ruler": 91.5,
207
+ "longmemeval": 32.0,
208
+ "agentif": 46.0,
209
+ "grm_prompt_robustness": None,
210
+ "grm_state_adaptation": None,
211
+ "garage": None,
212
+ "ragtruth": 58.4,
213
+ "structeval_t": 54.4,
214
+ "ifbench": 41.3,
215
+ "grm_persona_actions": None,
216
+ "gsm8k": 85.3,
217
+ "humaneval": 70.2,
218
+ "mbpp": 68.3,
219
  },
220
  "Qwen3.5-9B": {
221
+ "multichallenge": 54.5,
222
+ "rolebench": 16.78,
223
+ "rolemrc": 66.7,
224
+ "eq_bench_v3": None,
225
+ "grm_coherence": None,
226
+ "grm_response_diversity": None,
227
+ "bfcl_v3": 74.6,
228
+ "when2call_mt": None,
229
+ "toolsandbox": None,
230
+ "tau2_bench": 79.1,
231
+ "bfcl_v4": 66.1,
232
+ "grm_tool_recovery": None,
233
+ "iheval": 85.6,
234
+ "ruler": 95.7,
235
+ "longmemeval": 82.2,
236
+ "agentif": 53.6,
237
+ "grm_prompt_robustness": None,
238
+ "grm_state_adaptation": None,
239
+ "garage": None,
240
+ "ragtruth": 72.2,
241
+ "structeval_t": 68.3,
242
+ "ifbench": 64.5,
243
+ "grm_persona_actions": None,
244
+ "gsm8k": 93.9,
245
+ "humaneval": 84.8,
246
+ "mbpp": 80.8,
247
  },
248
  "Nano-30B": {
249
+ "multichallenge": 43.2,
250
+ "rolebench": 50.2,
251
+ "rolemrc": 17.94,
252
+ "eq_bench_v3": None,
253
+ "grm_coherence": None,
254
+ "grm_response_diversity": None,
255
+ "bfcl_v3": 71.1,
256
+ "when2call_mt": None,
257
+ "toolsandbox": None,
258
+ "tau2_bench": 49.0,
259
+ "bfcl_v4": 53.8,
260
+ "grm_tool_recovery": None,
261
+ "iheval": 72.4,
262
+ "ruler": 95.3,
263
+ "longmemeval": 45.6,
264
+ "agentif": 52.9,
265
+ "grm_prompt_robustness": None,
266
+ "grm_state_adaptation": None,
267
+ "garage": None,
268
+ "ragtruth": 73.0,
269
+ "structeval_t": 76.9,
270
+ "ifbench": 71.5,
271
+ "grm_persona_actions": None,
272
+ "gsm8k": 92.3,
273
+ "humaneval": 78.1,
274
+ "mbpp": 75.5,
275
  },
276
  "Super-120B": {
277
+ "multichallenge": 55.2,
278
+ "rolebench": 57.0,
279
+ "rolemrc": 19.48,
280
+ "eq_bench_v3": None,
281
+ "grm_coherence": None,
282
+ "grm_response_diversity": None,
283
+ "bfcl_v3": 74.4,
284
+ "when2call_mt": None,
285
+ "toolsandbox": None,
286
+ "tau2_bench": 61.2,
287
+ "bfcl_v4": 53.8,
288
+ "grm_tool_recovery": None,
289
+ "iheval": 76.3,
290
+ "ruler": 96.8,
291
+ "longmemeval": 75.6,
292
+ "agentif": 52.7,
293
+ "grm_prompt_robustness": None,
294
+ "grm_state_adaptation": None,
295
+ "garage": None,
296
+ "ragtruth": 74.9,
297
+ "structeval_t": 82.2,
298
+ "ifbench": 72.6,
299
+ "grm_persona_actions": None,
300
+ "gsm8k": None,
301
+ "humaneval": 79.4,
302
+ "mbpp": 78.4,
303
  },
304
  }
 
scoring.py CHANGED
@@ -1,74 +1,144 @@
1
- """
2
- GRM Evaluation Suite — Scoring Logic
3
 
4
  GRM Score = mean(Roleplay_score, Actions_score, General_score)
5
-
6
- Each category score is a weighted average of its benchmarks:
7
- category_score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
8
  """
9
 
10
- from benchmarks import BENCHMARKS, CATEGORIES, get_benchmarks_by_category
11
- from scores import MODEL_SCORES, GRM_SCORE_OVERRIDES
12
 
 
13
 
14
- def compute_category_score(
15
- model_scores: dict[str, float | None],
 
 
 
 
 
 
 
 
 
 
 
 
16
  category: str,
 
17
  ) -> float | None:
18
- """Weighted average of available benchmark scores in a category."""
19
- benchmarks = get_benchmarks_by_category(category)
20
  total_weight = 0.0
21
  total_value = 0.0
22
- for b in benchmarks:
23
- score = model_scores.get(b["name"])
24
  if score is not None:
25
- total_weight += b["calc_weight"]
26
- total_value += score * b["calc_weight"]
 
27
  if total_weight == 0:
28
  return None
29
  return total_value / total_weight
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def compute_grm_score(
33
- model_scores: dict[str, float | None],
34
- model_name: str | None = None,
35
- ) -> dict:
36
- """Return GRM Score and per-category scores for a single model.
37
-
38
- All returned values are on a 0–100 scale, or None if no data.
39
- If a GRM_SCORE_OVERRIDES entry exists for the model, it is used as the
40
- overall GRM Score in place of the computed average of category scores.
41
- """
42
- cat_scores = {}
43
- for cat in CATEGORIES:
44
- raw = compute_category_score(model_scores, cat)
45
- cat_scores[cat] = round(raw * 100, 1) if raw is not None else None
46
-
47
- if model_name is not None and model_name in GRM_SCORE_OVERRIDES:
48
- grm = GRM_SCORE_OVERRIDES[model_name]
49
- else:
50
- available = [v for v in cat_scores.values() if v is not None]
51
- grm = round(sum(available) / len(available), 1) if available else None
52
 
53
  return {
54
  "GRM Score": grm,
55
- "Roleplay (33%)": cat_scores["ROLEPLAY"],
56
- "Actions (33%)": cat_scores["ACTIONS"],
57
- "General (33%)": cat_scores["GENERAL"],
58
  }
59
 
60
 
61
- def build_leaderboard() -> list[dict]:
62
- """Compute scores for all models and return sorted rows."""
 
 
 
 
63
  rows = []
64
  for model_name, model_scores in MODEL_SCORES.items():
65
- result = compute_grm_score(model_scores, model_name)
 
 
 
 
 
 
 
 
 
 
 
66
  result["Model"] = model_name
 
 
 
 
67
  rows.append(result)
68
 
69
- rows.sort(key=lambda r: r["GRM Score"] if r["GRM Score"] is not None else -1, reverse=True)
 
 
 
 
70
 
71
- for i, row in enumerate(rows, start=1):
72
- row["Rank"] = i
73
 
74
- return rows
 
 
 
 
 
 
 
1
+ """GRM Evaluation Suite scoring logic.
 
2
 
3
  GRM Score = mean(Roleplay_score, Actions_score, General_score)
4
+ Category Score = sum(score_i * calc_weight_i) / sum(calc_weight_i)
 
 
5
  """
6
 
7
+ from benchmarks import BENCHMARKS, CATEGORIES, CATEGORY_DISPLAY, get_benchmarks_by_category
8
+ from scores import CSV_GRM_SCORES, MODEL_METADATA, MODEL_SCORES
9
 
10
+ ScoreValue = float | None
11
 
12
+
13
def _score_for(model_scores: dict[str, ScoreValue], benchmark: dict) -> ScoreValue:
    """Look up a model's score for *benchmark* by its ``"id"``; None if absent."""
    benchmark_id = benchmark["id"]
    return model_scores.get(benchmark_id)
15
+
16
+
17
def _category_benchmarks(category: str, benchmark_ids: set[str] | None = None) -> list[dict]:
    """Return the category's benchmarks that are flagged ``included_in_grm``.

    When ``benchmark_ids`` is given, the result is further restricted to
    benchmarks whose ``"id"`` is in that set.
    """
    selected = []
    for candidate in get_benchmarks_by_category(category):
        if not candidate["included_in_grm"]:
            continue
        if benchmark_ids is not None and candidate["id"] not in benchmark_ids:
            continue
        selected.append(candidate)
    return selected
22
+
23
+
24
def _compute_category_score_raw(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the unrounded weighted mean of the category's available scores.

    Benchmarks without a score (None) contribute to neither the numerator nor
    the denominator. Returns None when the accumulated weight is zero.
    """
    weight_sum = 0.0
    weighted_score_sum = 0.0
    for entry in _category_benchmarks(category, benchmark_ids):
        value = _score_for(model_scores, entry)
        if value is None:
            continue
        weight_sum += entry["calc_weight"]
        weighted_score_sum += value * entry["calc_weight"]

    if weight_sum == 0:
        return None
    return weighted_score_sum / weight_sum
42
 
43
 
44
def compute_category_score(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> float | None:
    """Return the weighted category average rounded to one decimal place.

    Returns None when the raw category score is unavailable.
    """
    raw_score = _compute_category_score_raw(model_scores, category, benchmark_ids)
    if raw_score is None:
        return None
    return round(raw_score, 1)
52
+
53
+
54
def compute_category_components(
    model_scores: dict[str, ScoreValue],
    category: str,
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | int | None]:
    """Break a category down into its score, tier averages, and coverage.

    The returned dict holds the rounded category score, the plain (unweighted)
    average of the weight-1.0 benchmarks (``core_avg``) and of the weight-0.5
    benchmarks (``supplementary_avg``), the number of benchmarks with no score
    (``missing``), and the number of benchmarks considered (``benchmarks``).
    """
    selected = _category_benchmarks(category, benchmark_ids)
    weighted_scores = [(entry["calc_weight"], _score_for(model_scores, entry)) for entry in selected]

    def tier_average(tier_weight: float) -> float | None:
        tier_scores = [
            value
            for weight, value in weighted_scores
            if weight == tier_weight and value is not None
        ]
        if tier_scores:
            return round(sum(tier_scores) / len(tier_scores), 1)
        return None

    missing_count = len([value for _, value in weighted_scores if value is None])
    return {
        "score": compute_category_score(model_scores, category, benchmark_ids),
        "core_avg": tier_average(1.0),
        "supplementary_avg": tier_average(0.5),
        "missing": missing_count,
        "benchmarks": len(selected),
    }
77
+
78
+
79
def compute_grm_score(
    model_scores: dict[str, ScoreValue],
    benchmark_ids: set[str] | None = None,
) -> dict[str, float | None]:
    """Return the GRM Score and the three per-category scores.

    The overall score is the mean of the *unrounded* category scores that are
    available, rounded to one decimal; each category score is rounded
    separately for display. Any value with no underlying data is None.
    """
    raw_by_category = {}
    for category in CATEGORIES:
        raw_by_category[category] = _compute_category_score_raw(model_scores, category, benchmark_ids)

    def rounded(category: str) -> float | None:
        value = raw_by_category[category]
        return None if value is None else round(value, 1)

    present = [value for value in raw_by_category.values() if value is not None]
    overall = round(sum(present) / len(present), 1) if present else None

    return {
        "GRM Score": overall,
        "Roleplay (33%)": rounded("ROLEPLAY"),
        "Actions (33%)": rounded("ACTIONS"),
        "General (33%)": rounded("GENERAL"),
    }
100
 
101
 
102
def build_leaderboard(
    include_closed: bool = True,
    benchmark_ids: set[str] | None = None,
    parameter_range: tuple[float, float] | None = None,
) -> list[dict]:
    """Build the ranked leaderboard rows for every model that passes the filters.

    Args:
        include_closed: When False, models whose metadata does not mark them
            as open-weights are dropped.
        benchmark_ids: Optional benchmark subset to score against. When None,
            a model's entry in ``CSV_GRM_SCORES`` replaces the computed
            overall score.
        parameter_range: Optional inclusive ``(minimum, maximum)`` bound on
            ``parameter_b``. Models without a numeric ``parameter_b`` are not
            filtered by it.

    Returns:
        Rows sorted by "GRM Score" descending (None sorts last), each carrying
        a 1-based "Rank".
    """
    use_csv_overrides = benchmark_ids is None
    entries = []
    for model_name, model_scores in MODEL_SCORES.items():
        metadata = MODEL_METADATA.get(model_name, {})
        if not (include_closed or metadata.get("open_weights", False)):
            continue

        parameter_b = metadata.get("parameter_b")
        if parameter_range is not None and isinstance(parameter_b, int | float):
            lower, upper = parameter_range
            if parameter_b < lower or parameter_b > upper:
                continue

        entry = compute_grm_score(model_scores, benchmark_ids)
        if use_csv_overrides and model_name in CSV_GRM_SCORES:
            entry["GRM Score"] = CSV_GRM_SCORES[model_name]
        entry.update(
            {
                "Model": model_name,
                "Family": metadata.get("family"),
                "Size": metadata.get("size"),
                "Parameter B": parameter_b,
                "Open Weights": metadata.get("open_weights", False),
            }
        )
        entries.append(entry)

    def sort_key(entry: dict) -> float:
        # Unscored models get -1 so they fall below every real score.
        score = entry["GRM Score"]
        return -1 if score is None else score

    ranked = sorted(entries, key=sort_key, reverse=True)
    for position, entry in enumerate(ranked, start=1):
        entry["Rank"] = position
    return ranked
133
+
134
 
135
def get_score(model_name: str, benchmark_id: str) -> ScoreValue:
    """Return the stored score for a model/benchmark pair, or None if either is unknown."""
    per_model = MODEL_SCORES.get(model_name, {})
    return per_model.get(benchmark_id)
137
 
138
+
139
def official_benchmark_ids() -> set[str]:
    """Return the ids of all benchmarks flagged ``included_in_grm``."""
    official = set()
    for entry in BENCHMARKS:
        if entry["included_in_grm"]:
            official.add(entry["id"])
    return official
141
+
142
+
143
def category_label(category: str) -> str:
    """Return the display label for *category*, falling back to its title-cased key."""
    fallback = category.title()
    return CATEGORY_DISPLAY.get(category, fallback)
streamlit_app.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit implementation of the GRM Leaderboard Space."""
2
+
3
+ from textwrap import dedent
4
+
5
+ import streamlit as st
6
+
7
+ from benchmarks import CATEGORY_DISPLAY, SOURCE_GRM_BENCH
8
+ from data_views import (
9
+ VIEW_CATEGORY,
10
+ VIEW_MATRIX,
11
+ VIEW_SUMMARY,
12
+ available_domains,
13
+ available_sources,
14
+ benchmark_options,
15
+ build_benchmark_registry_frame,
16
+ build_model_benchmark_scores,
17
+ build_model_detail_frame,
18
+ build_score_explorer_frame,
19
+ category_from_label,
20
+ category_options,
21
+ filter_benchmarks,
22
+ find_benchmark_by_name,
23
+ model_options,
24
+ parameter_bounds,
25
+ score_stats,
26
+ )
27
+ from ui_theme import CUSTOM_CSS, FORMULAS, HEADER_HTML, OVERVIEW_BLOCKS, palette_css
28
+
29
+
30
# Page-level configuration: browser tab title and wide layout.
st.set_page_config(
    page_title="GRM Score - Game Ready Leaderboard",
    layout="wide",
)

# Two-column row; the theme toggle is placed in the second, narrower column.
theme_columns = st.columns([0.72, 0.28])
theme_mode = theme_columns[1].radio(
    "Theme",
    ["Night", "Day"],
    horizontal=True,
    key="grm_theme_mode",
)
# Module-level flag read by themed_dataframe and passed to palette_css.
light_mode = theme_mode == "Day"

# Inject the shared stylesheet, the palette for the selected mode, and the header markup.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
st.markdown(palette_css(light_mode), unsafe_allow_html=True)
st.markdown(HEADER_HTML, unsafe_allow_html=True)
47
+
48
+
49
def themed_dataframe(data):
    """Return *data* wrapped in light-theme styling when Day mode is active.

    In Night mode, or when *data* has no ``style`` attribute, the object is
    returned untouched. The ``style`` attribute is only probed in Day mode.
    """
    if light_mode and hasattr(data, "style"):
        cell_properties = {
            "background-color": "#ffffff",
            "color": "#15181b",
            "border-color": "#d9dee3",
        }
        header_rule = {
            "selector": "th",
            "props": [
                ("background-color", "#eef1f4"),
                ("color", "#15181b"),
                ("border-color", "#d9dee3"),
            ],
        }
        styled = data.style.set_properties(**cell_properties)
        return styled.set_table_styles([header_rule])
    return data
71
+
72
+
73
def render_dataframe(data, **kwargs) -> None:
    """Render *data* via ``st.dataframe`` after applying the active theme.

    Extra keyword arguments are passed straight through to ``st.dataframe``.
    """
    themed = themed_dataframe(data)
    st.dataframe(themed, **kwargs)
75
+
76
+
77
def format_sample_code(code: str) -> str:
    """Dedent and strip *code*, then halve each line's leading indentation.

    Every non-blank line keeps its text but is re-indented with one space per
    two leading whitespace characters (integer division); blank lines become
    empty strings.
    """

    def halve_indent(raw_line: str) -> str:
        body = raw_line.lstrip()
        if not body:
            return ""
        indent_width = len(raw_line) - len(body)
        return " " * (indent_width // 2) + body

    cleaned = dedent(code).strip()
    return "\n".join(halve_indent(raw_line) for raw_line in cleaned.splitlines())
88
+
89
+
90
def render_about_grm() -> None:
    """Render the "About GRM Score" section.

    Emits the section heading, then each entry of ``OVERVIEW_BLOCKS`` as its
    own markdown element, then each entry of ``FORMULAS`` wrapped in a
    ``<p class="formula-line">`` element.
    """
    st.markdown("## About GRM Score")

    for paragraph in OVERVIEW_BLOCKS:
        st.markdown(paragraph)

    for expression in FORMULAS:
        formula_html = '<p class="formula-line">{}</p>'.format(expression)
        st.markdown(formula_html, unsafe_allow_html=True)
96
+
97
+
98
def render_stats(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render one metric tile per entry returned by ``score_stats``.

    Args:
        include_closed: Forwarded to ``score_stats``.
        parameter_range: Forwarded to ``score_stats``.
    """
    stats = score_stats(include_closed=include_closed, parameter_range=parameter_range)
    if not stats:
        # Nothing to show; also avoids asking Streamlit for a zero-column row.
        return

    # Size the row from the data rather than a fixed 5, so adding or removing
    # a statistic cannot trip the strict zip below.
    columns = st.columns(len(stats))
    for column, (label, value) in zip(columns, stats.items(), strict=True):
        column.metric(label, value)
103
+
104
+
105
def render_score_controls() -> dict:
    """Render the leaderboard filter widgets and return their current values.

    Returns:
        A dict with the keys ``include_closed`` (True when "All models" is
        selected), ``view``, ``category_label``, ``search``, ``sources``,
        ``domains``, ``parameter_range`` and ``recalculate_visible``.
    """
    top = st.columns([1.2, 1.2, 1.2, 1.4])
    model_visibility = top[0].radio(
        "Model visibility",
        ["All models", "Open-source only"],
        index=1,  # default: "Open-source only"
        horizontal=True,
        key="leaderboard_model_visibility",
    )
    view = top[1].radio(
        "Score view",
        [VIEW_SUMMARY, VIEW_CATEGORY, VIEW_MATRIX],
        index=2,  # default: VIEW_MATRIX
        horizontal=True,
        key="leaderboard_score_view",
    )
    category_label = top[2].selectbox("Category", category_options(), key="leaderboard_category")
    search = top[3].text_input("Search", placeholder="Model or benchmark", key="leaderboard_search")

    filters = st.columns([1.5, 1.8, 1.8, 1.2])
    sources = filters[0].multiselect("Source", available_sources(), placeholder="All sources", key="leaderboard_source")
    domains = filters[1].multiselect("Domain", available_domains(), placeholder="All domains", key="leaderboard_domain")
    minimum_size, maximum_size = parameter_bounds()
    # Preferred default is 0.0B-4.0B, clamped into the available bounds:
    # st.slider rejects a default that lies outside [min_value, max_value].
    # When the lower bound is 0.0 this yields the same default as before.
    default_low = min(max(0.0, minimum_size), maximum_size)
    default_high = max(default_low, min(4.0, maximum_size))
    parameter_range = filters[2].slider(
        "Model parameter class (B)",
        min_value=minimum_size,
        max_value=maximum_size,
        value=(default_low, default_high),
        step=0.5,
        format="%.1fB",
        key="leaderboard_parameter_range_v2",
        help="Use 0.0B for <0.5B class. The top end includes 120B+ models.",
    )
    recalculate_visible = filters[3].checkbox("Filtered score", value=False, key="leaderboard_filtered_score")

    return {
        "include_closed": model_visibility == "All models",
        "view": view,
        "category_label": category_label,
        "search": search,
        "sources": sources,
        "domains": domains,
        "parameter_range": parameter_range,
        "recalculate_visible": recalculate_visible,
    }
150
+
151
+
152
def render_score_explorer() -> None:
    """Render the Leaderboard tab.

    Draws the heading and caption, the filter controls, the stats row, the
    score table built by ``build_score_explorer_frame``, and finally the
    model-detail and benchmark-detail panels side by side.
    """
    st.markdown("## Leaderboard")
    st.caption(
        "Current score values are static PRD-backed values with TBD entries shown as missing. "
        "Filtered score recalculates an exploratory score from visible benchmarks only."
    )

    controls = render_score_controls()
    include_closed = controls["include_closed"]
    parameter_range = controls["parameter_range"]
    category_label = controls["category_label"]
    active_view = controls["view"]

    benchmarks = filter_benchmarks(
        category=category_from_label(category_label),
        sources=controls["sources"],
        domains=controls["domains"],
        search=controls["search"],
    )

    render_stats(include_closed=include_closed, parameter_range=parameter_range)

    frame = build_score_explorer_frame(
        view=active_view,
        category_label=category_label,
        benchmarks=benchmarks,
        include_closed=include_closed,
        recalculate_visible=controls["recalculate_visible"],
        parameter_range=parameter_range,
    )
    render_dataframe(frame, width="stretch", hide_index=True)

    # The column-count caption is only shown for the matrix view.
    if active_view == VIEW_MATRIX:
        st.caption(f"Showing {len(benchmarks)} benchmark columns from the active filters.")

    model_column, benchmark_column = st.columns(2)
    with model_column:
        render_model_detail(include_closed, parameter_range)
    with benchmark_column:
        render_benchmark_detail(benchmarks, key="leaderboard_benchmark_detail")
191
+
192
+
193
def render_model_detail(include_closed: bool, parameter_range: tuple[float, float]) -> None:
    """Render a model picker, its detail table, and Strongest/Weakest tabs.

    Renders nothing when ``model_options`` yields no models for the given
    visibility and parameter range.
    """
    models = model_options(include_closed=include_closed, parameter_range=parameter_range)
    if not models:
        return

    selected_model = st.selectbox("Model detail", models, key="leaderboard_model_detail")
    detail_frame = build_model_detail_frame(selected_model)
    render_dataframe(detail_frame, width="stretch", hide_index=True)

    strongest_tab, weakest_tab = st.tabs(["Strongest", "Weakest"])
    # Same table in both tabs; only the ``strongest`` flag differs.
    for tab, strongest in ((strongest_tab, True), (weakest_tab, False)):
        with tab:
            scores = build_model_benchmark_scores(selected_model, strongest=strongest)
            render_dataframe(scores, width="stretch", hide_index=True)
214
+
215
+
216
def render_benchmark_detail(benchmarks: list[dict], key: str) -> None:
    """Render a benchmark picker and the detail panel for the chosen benchmark.

    Args:
        benchmarks: Benchmarks offered in the picker. When they yield no
            options, the picker falls back to ``filter_benchmarks()`` called
            with no arguments.
        key: Streamlit widget key for the select box.

    Renders nothing when there are no options or the selected name cannot be
    resolved by ``find_benchmark_by_name``.
    """
    options = benchmark_options(benchmarks) or benchmark_options(filter_benchmarks())
    if not options:
        return

    selected_name = st.selectbox("Benchmark detail", options, key=key)
    benchmark = find_benchmark_by_name(selected_name)
    if benchmark is None:
        return

    st.markdown(
        f"""
        <div class="detail-panel">
        <div class="detail-kicker">{benchmark['source']} · weight {benchmark['calc_weight']}</div>
        <h3>{benchmark['name']}</h3>
        <p>{benchmark['description']}</p>
        <p>{benchmark['summary']}</p>
        </div>
        """,
        unsafe_allow_html=True,
    )

    if benchmark.get("paper"):
        st.markdown(f"[Paper / Source]({benchmark['paper']})")

    if benchmark.get("detection_scope"):
        st.markdown("#### Detection Scope")
        render_dataframe(
            benchmark["detection_scope"],
            width="stretch",
            hide_index=True,
        )

    with st.expander("Methodology", expanded=False):
        st.write(benchmark["methodology"])
        # Both outcomes are complete sentences ending in a period; the
        # "included" variant previously ended in a dangling colon.
        st.write(
            "Included in official GRM Score."
            if benchmark["included_in_grm"]
            else "Not included in official GRM Score."
        )
252
+
253
+
254
def render_benchmark_summary_card(benchmark: dict) -> None:
    """Render a summary card for one benchmark.

    Shows a detail panel (category display name, source, weight, name,
    description, summary), an optional paper link, and an expander holding
    the methodology plus the detection-scope table when one is present.
    """
    panel_html = f"""
        <div class="detail-panel">
        <div class="detail-kicker">{CATEGORY_DISPLAY[benchmark['category']]} · {benchmark['source']} · weight {benchmark['calc_weight']}</div>
        <h3>{benchmark['name']}</h3>
        <p>{benchmark['description']}</p>
        <p>{benchmark['summary']}</p>
        </div>
        """
    st.markdown(panel_html, unsafe_allow_html=True)

    paper_url = benchmark.get("paper")
    if paper_url:
        st.markdown(f"[Paper / Source]({paper_url})")

    expander_title = f"{benchmark['name']} methodology and scope"
    with st.expander(expander_title, expanded=False):
        st.write(benchmark["methodology"])
        scope_rows = benchmark.get("detection_scope")
        if scope_rows:
            render_dataframe(scope_rows, width="stretch", hide_index=True)
272
+
273
+
274
def render_grm_methodology() -> None:
    """Render the GRM-Bench methodology section.

    For every benchmark returned by ``filter_benchmarks`` for the GRM-Bench
    source (non-scored ones included) this writes the summary, the test
    methodology, and the detection-scope table when present. The benchmark
    whose id is ``"grm_coherence"`` additionally gets one expander per
    representative sample.
    """
    st.markdown("## GRM-Bench Methodology")
    st.markdown(
        "GRM-Bench is the in-house authored benchmark suite for game-facing assistants, companions, "
        "and NPC behaviors that are not well-covered by broad academic leaderboards. The sections below "
        "preserve the authored benchmark methodology, failure modes, and representative examples."
    )

    for benchmark in filter_benchmarks(sources=[SOURCE_GRM_BENCH], include_non_scored=True):
        heading = benchmark["name"].replace("GRM - ", "")
        st.markdown(f"### {heading}")
        st.write(benchmark["summary"])
        st.markdown("#### Test Methodology")
        st.write(benchmark["methodology"])

        scope_rows = benchmark.get("detection_scope")
        if scope_rows:
            st.markdown("#### Detection Scope")
            render_dataframe(scope_rows, width="stretch", hide_index=True)

        # Representative samples are only rendered for the coherence benchmark.
        if benchmark["id"] != "grm_coherence":
            continue

        st.markdown("#### Representative Samples")
        for sample in benchmark.get("samples", []):
            with st.expander(sample["id"], expanded=False):
                for label, value in sample["metadata"]:
                    st.markdown(f"**{label}:** {value}")
                sample_text = format_sample_code(sample["code"])
                st.code(sample_text, language="json", wrap_lines=True)
299
+
300
+
301
def render_benchmark_library() -> None:
    """Render the Benchmark Library tab.

    Draws the library filters, the benchmark registry table for the filtered
    benchmarks, and then the GRM-Bench methodology section.
    """
    st.markdown("## Benchmark Library")
    st.caption("Evaluation suite reference with benchmark summaries, paper links, and GRM-Bench methodology.")

    category_col, source_col, domain_col, toggle_col = st.columns([1.2, 1.4, 1.8, 1.2])
    category_label = category_col.selectbox("Library category", category_options(), key="library_category")
    sources = source_col.multiselect("Library source", available_sources(), placeholder="All sources", key="library_source")
    domains = domain_col.multiselect("Library domain", available_domains(), placeholder="All domains", key="library_domain")
    include_non_scored = toggle_col.checkbox("Show non-scored", value=True, key="library_show_non_scored")
    search = st.text_input("Benchmark library search", placeholder="Benchmark, domain, description", key="library_search")

    benchmarks = filter_benchmarks(
        category=category_from_label(category_label),
        sources=sources,
        domains=domains,
        search=search,
        include_non_scored=include_non_scored,
    )

    registry_frame = build_benchmark_registry_frame(benchmarks)
    # Link rendering for the paper column; wide text columns for the prose.
    registry_columns = {
        "Paper / Repo": st.column_config.LinkColumn("Paper / Repo"),
        "Summary": st.column_config.TextColumn("Summary", width="large"),
        "Description": st.column_config.TextColumn("Description", width="large"),
    }
    render_dataframe(
        registry_frame,
        width="stretch",
        hide_index=True,
        column_config=registry_columns,
    )

    render_grm_methodology()
330
+
331
+
332
# The About section is rendered before the tab bar is created, so it sits
# above both tabs.
render_about_grm()

leaderboard_tab, library_tab = st.tabs(["Leaderboard", "Benchmark Library"])

# Tab 1: score table, filters, and model / benchmark detail panels.
with leaderboard_tab:
    render_score_explorer()

# Tab 2: benchmark registry table followed by the GRM-Bench methodology.
with library_tab:
    render_benchmark_library()
ui_theme.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit theme helpers for the GRM leaderboard."""
2
+
3
+ CUSTOM_CSS = """
4
+ <style>
5
+ :root {
6
+ --bg-top: #202327;
7
+ --bg-bottom: #0f1012;
8
+ --surface: #15181b;
9
+ --surface-strong: #24282d;
10
+ --surface-alt: #1d2126;
11
+ --surface-alt-2: #262a2f;
12
+ --text-main: #f5f7f8;
13
+ --text-muted: #c1c6cb;
14
+ --text-soft: #a2a8ae;
15
+ --accent: #76b900;
16
+ --rule: rgba(255, 255, 255, 0.08);
17
+ --rule-soft: rgba(255, 255, 255, 0.05);
18
+ }
19
+
20
+ html[data-grm-theme="light"] {
21
+ --bg-top: #f7f8fa;
22
+ --bg-bottom: #ffffff;
23
+ --surface: #ffffff;
24
+ --surface-strong: #eef1f4;
25
+ --surface-alt: #f3f5f7;
26
+ --surface-alt-2: #e8ecef;
27
+ --text-main: #15181b;
28
+ --text-muted: #3f474f;
29
+ --text-soft: #66717c;
30
+ --rule: rgba(15, 16, 18, 0.12);
31
+ --rule-soft: rgba(15, 16, 18, 0.07);
32
+ }
33
+
34
+ html[data-grm-theme="dark"] {
35
+ --bg-top: #202327;
36
+ --bg-bottom: #0f1012;
37
+ --surface: #15181b;
38
+ --surface-strong: #24282d;
39
+ --surface-alt: #1d2126;
40
+ --surface-alt-2: #262a2f;
41
+ --text-main: #f5f7f8;
42
+ --text-muted: #c1c6cb;
43
+ --text-soft: #a2a8ae;
44
+ --rule: rgba(255, 255, 255, 0.08);
45
+ --rule-soft: rgba(255, 255, 255, 0.05);
46
+ }
47
+
48
+ .stApp {
49
+ background: linear-gradient(180deg, var(--bg-top) 0%, var(--bg-bottom) 100%);
50
+ color: var(--text-main);
51
+ user-select: text;
52
+ }
53
+
54
+ .stApp ::selection {
55
+ background: rgba(118, 185, 0, 0.28);
56
+ color: var(--text-main);
57
+ }
58
+
59
+ .stApp * {
60
+ user-select: text;
61
+ }
62
+
63
+ .block-container {
64
+ max-width: 1260px;
65
+ padding: 24px 24px 48px;
66
+ }
67
+
68
+ .page-header {
69
+ text-align: center;
70
+ margin: 4px auto 24px;
71
+ }
72
+
73
+ .page-eyebrow {
74
+ color: var(--text-soft);
75
+ font-size: 0.76rem;
76
+ font-weight: 650;
77
+ letter-spacing: 0.16em;
78
+ margin-bottom: 12px;
79
+ text-transform: uppercase;
80
+ }
81
+
82
+ .page-header h1 {
83
+ color: var(--text-main);
84
+ font-size: 2.35rem;
85
+ font-weight: 650;
86
+ letter-spacing: 0;
87
+ line-height: 1.1;
88
+ margin: 0;
89
+ }
90
+
91
+ .page-header p {
92
+ color: var(--text-muted);
93
+ font-size: 1rem;
94
+ line-height: 1.65;
95
+ margin: 12px auto 0;
96
+ max-width: 860px;
97
+ }
98
+
99
+ [data-testid="stMetric"] {
100
+ background: var(--surface);
101
+ border: 1px solid var(--rule);
102
+ border-radius: 8px;
103
+ padding: 12px 14px;
104
+ }
105
+
106
+ [data-testid="stMetricLabel"] p {
107
+ color: var(--text-soft);
108
+ font-size: 0.78rem;
109
+ font-weight: 650;
110
+ letter-spacing: 0.06em;
111
+ text-transform: uppercase;
112
+ }
113
+
114
+ [data-testid="stMetricValue"] {
115
+ color: var(--text-main);
116
+ font-size: 1.05rem;
117
+ }
118
+
119
+ .stTabs [data-baseweb="tab-list"] {
120
+ border-bottom: 1px solid var(--rule);
121
+ gap: 18px;
122
+ }
123
+
124
+ .stTabs [data-baseweb="tab"] {
125
+ color: var(--text-soft);
126
+ font-size: 0.82rem;
127
+ font-weight: 650;
128
+ letter-spacing: 0.08em;
129
+ padding-left: 0;
130
+ padding-right: 0;
131
+ text-transform: uppercase;
132
+ }
133
+
134
+ .stTabs [aria-selected="true"] {
135
+ color: var(--text-main);
136
+ }
137
+
138
+ .stTabs [data-baseweb="tab-highlight"] {
139
+ background-color: var(--accent) !important;
140
+ }
141
+
142
+ .stApp [data-baseweb="checkbox"] div[aria-checked="true"] {
143
+ background-color: var(--accent) !important;
144
+ border-color: var(--accent) !important;
145
+ }
146
+
147
+ .stApp [data-testid="stRadio"] input:checked + div,
148
+ .stApp [data-testid="stRadio"] input[aria-checked="true"] + div,
149
+ .stApp [data-testid="stRadio"] div[aria-checked="true"],
150
+ .stApp label[data-baseweb="radio"] input:checked ~ div:first-of-type,
151
+ .stApp [data-baseweb="radio"] [aria-checked="true"] div:first-of-type {
152
+ border-color: var(--accent) !important;
153
+ }
154
+
155
+ .stApp [data-testid="stRadio"] input:checked + div::before,
156
+ .stApp [data-testid="stRadio"] input:checked + div::after,
157
+ .stApp [data-testid="stRadio"] div[aria-checked="true"]::before,
158
+ .stApp [data-testid="stRadio"] div[aria-checked="true"]::after,
159
+ .stApp label[data-baseweb="radio"] input:checked ~ div:first-of-type div,
160
+ .stApp [data-baseweb="radio"] [aria-checked="true"] div:first-of-type div {
161
+ background-color: var(--accent) !important;
162
+ }
163
+
164
+ .stApp [data-testid="stRadio"] label,
165
+ .stApp [data-testid="stRadio"] label > div,
166
+ .stApp [data-testid="stRadio"] label:hover,
167
+ .stApp [data-testid="stRadio"] label:focus-within {
168
+ background: transparent !important;
169
+ box-shadow: none !important;
170
+ }
171
+
172
+ .stApp [data-testid="stRadio"] label p {
173
+ color: var(--text-muted) !important;
174
+ }
175
+
176
+ .stApp [data-baseweb="checkbox"] svg {
177
+ color: var(--text-main) !important;
178
+ fill: var(--text-main) !important;
179
+ }
180
+
181
+ .stApp [data-baseweb="input"],
182
+ .stApp [data-baseweb="select"],
183
+ .stApp [data-baseweb="popover"],
184
+ .stApp [data-baseweb="menu"],
185
+ .stApp [data-baseweb="slider"] > div,
186
+ .stApp [data-testid="stMultiSelect"] [data-baseweb="select"],
187
+ .stApp [data-testid="stSelectbox"] [data-baseweb="select"] {
188
+ background-color: var(--surface) !important;
189
+ color: var(--text-main) !important;
190
+ }
191
+
192
+ .stApp [data-baseweb="input"] input,
193
+ .stApp [data-baseweb="select"] input,
194
+ .stApp [data-baseweb="select"] span,
195
+ .stApp [data-baseweb="tag"],
196
+ .stApp [data-baseweb="menu"] li,
197
+ .stApp [data-testid="stTextInput"] input {
198
+ color: var(--text-main) !important;
199
+ }
200
+
201
+ .stApp [data-baseweb="input"],
202
+ .stApp [data-baseweb="select"],
203
+ .stApp [data-baseweb="tag"],
204
+ .stApp [data-testid="stTextInput"] input {
205
+ border-color: var(--rule) !important;
206
+ }
207
+
208
+ .stApp [data-baseweb="tag"] {
209
+ background-color: var(--surface-alt-2) !important;
210
+ }
211
+
212
+ .stApp [data-baseweb="menu"] li:hover,
213
+ .stApp [role="option"]:hover {
214
+ background-color: var(--surface-alt) !important;
215
+ }
216
+
217
+ .stMarkdown h2,
218
+ .stMarkdown h3 {
219
+ color: var(--text-main);
220
+ letter-spacing: 0;
221
+ }
222
+
223
+ .stMarkdown p,
224
+ .stMarkdown li,
225
+ .stCaption,
226
+ label,
227
+ [data-testid="stExpander"] p {
228
+ color: var(--text-muted);
229
+ }
230
+
231
+ .stDataFrame {
232
+ background: var(--surface);
233
+ border: 1px solid var(--rule);
234
+ border-radius: 8px;
235
+ overflow: hidden;
236
+ }
237
+
238
+ .stDataFrame button,
239
+ .stDataFrame [role="button"] {
240
+ color: var(--text-main) !important;
241
+ }
242
+
243
+ [data-testid="stExpander"] {
244
+ background: color-mix(in srgb, var(--surface) 78%, transparent);
245
+ border: 1px solid var(--rule-soft);
246
+ border-radius: 8px;
247
+ }
248
+
249
+ a {
250
+ color: var(--accent) !important;
251
+ text-decoration: none;
252
+ }
253
+
254
+ .section-note {
255
+ color: var(--text-soft);
256
+ font-size: 0.88rem;
257
+ margin: 0 0 10px;
258
+ }
259
+
260
+ .detail-panel {
261
+ background: var(--surface);
262
+ border: 1px solid var(--rule);
263
+ border-radius: 8px;
264
+ padding: 14px 16px;
265
+ margin: 10px 0 14px;
266
+ }
267
+
268
+ .detail-kicker {
269
+ color: var(--text-soft);
270
+ font-size: 0.76rem;
271
+ font-weight: 650;
272
+ letter-spacing: 0.08em;
273
+ text-transform: uppercase;
274
+ }
275
+
276
+ .detail-panel h3 {
277
+ color: var(--text-main);
278
+ font-size: 1.14rem;
279
+ margin: 4px 0 8px;
280
+ }
281
+
282
+ .detail-panel p {
283
+ color: var(--text-muted);
284
+ line-height: 1.62;
285
+ }
286
+
287
+ .formula-line {
288
+ color: var(--text-main);
289
+ font-weight: 650;
290
+ }
291
+
292
+ .theme-toggle-row {
293
+ align-items: center;
294
+ display: flex;
295
+ justify-content: flex-end;
296
+ margin-bottom: 8px;
297
+ }
298
+
299
+ .sample-code {
300
+ background: var(--bg-bottom);
301
+ border: 1px solid var(--rule-soft);
302
+ border-radius: 8px;
303
+ color: var(--text-main);
304
+ overflow-x: auto;
305
+ padding: 16px 18px;
306
+ }
307
+
308
+ .sample-code pre {
309
+ background: transparent;
310
+ border: 0;
311
+ color: inherit;
312
+ font-family: Consolas, "SFMono-Regular", monospace;
313
+ font-size: 0.88rem;
314
+ line-height: 1.7;
315
+ margin: 0;
316
+ white-space: pre-wrap;
317
+ word-break: break-word;
318
+ }
319
+
320
+ .sample-code pre code {
321
+ background: transparent;
322
+ color: inherit;
323
+ font: inherit;
324
+ white-space: inherit;
325
+ }
326
+
327
+ [data-testid="stCode"] pre,
328
+ [data-testid="stCode"] code {
329
+ font-family: Consolas, "SFMono-Regular", monospace !important;
330
+ font-size: 0.88rem !important;
331
+ line-height: 1.7 !important;
332
+ }
333
+
334
+ @media (max-width: 720px) {
335
+ .block-container {
336
+ padding: 20px 14px 40px;
337
+ }
338
+
339
+ .page-header h1 {
340
+ font-size: 2rem;
341
+ }
342
+ }
343
+ </style>
344
+ """
345
+
346
+ LIGHT_THEME_CSS = """
347
+ <style>
348
+ :root {
349
+ --bg-top: #f7f8fa;
350
+ --bg-bottom: #ffffff;
351
+ --surface: #ffffff;
352
+ --surface-strong: #eef1f4;
353
+ --surface-alt: #f3f5f7;
354
+ --surface-alt-2: #e8ecef;
355
+ --text-main: #15181b;
356
+ --text-muted: #3f474f;
357
+ --text-soft: #66717c;
358
+ --rule: rgba(15, 16, 18, 0.12);
359
+ --rule-soft: rgba(15, 16, 18, 0.07);
360
+ }
361
+
362
+ header[data-testid="stHeader"],
363
+ [data-testid="stToolbar"],
364
+ [data-testid="stDecoration"],
365
+ [data-testid="stStatusWidget"] {
366
+ background-color: var(--bg-top) !important;
367
+ color: var(--text-main) !important;
368
+ }
369
+
370
+ .stApp [data-testid="stWidgetLabel"] p,
371
+ .stApp [data-testid="stRadio"] p,
372
+ .stApp [data-testid="stCheckbox"] p,
373
+ .stApp [data-testid="stSelectbox"] p,
374
+ .stApp [data-testid="stTextInput"] p,
375
+ .stApp [data-testid="stMultiSelect"] p,
376
+ .stApp label p {
377
+ color: var(--text-muted) !important;
378
+ }
379
+
380
+ .stTabs [aria-selected="true"] p,
381
+ .stTabs [aria-selected="true"],
382
+ .stApp [data-testid="stMetricValue"] {
383
+ color: var(--text-main) !important;
384
+ }
385
+
386
+ .stApp [data-baseweb="select"] > div,
387
+ .stApp [data-baseweb="select"] > div *,
388
+ .stApp [data-baseweb="input"] [data-baseweb="base-input"],
389
+ .stApp [data-baseweb="input"] [data-baseweb="base-input"] *,
390
+ .stApp [data-testid="stTextInputRootElement"] [data-baseweb="base-input"],
391
+ .stApp [data-testid="stTextInputRootElement"] input {
392
+ background-color: var(--surface) !important;
393
+ color: var(--text-main) !important;
394
+ }
395
+
396
+ .stApp [data-baseweb="select"] svg,
397
+ .stApp [data-baseweb="input"] svg {
398
+ color: var(--text-main) !important;
399
+ fill: var(--text-main) !important;
400
+ }
401
+
402
+ .stApp [data-testid="stElementToolbarButtonContainer"] {
403
+ background-color: var(--surface) !important;
404
+ color: var(--text-main) !important;
405
+ }
406
+
407
+ .stDataFrameGlideDataEditor {
408
+ --gdg-bg-cell: #ffffff !important;
409
+ --gdg-bg-cell-medium: #f7f8fa !important;
410
+ --gdg-bg-header: #eef1f4 !important;
411
+ --gdg-bg-header-hovered: #e8ecef !important;
412
+ --gdg-bg-header-has-focus: #e8ecef !important;
413
+ --gdg-bg-group-header: #eef1f4 !important;
414
+ --gdg-bg-group-header-hovered: #e8ecef !important;
415
+ --gdg-bg-bubble: #eef1f4 !important;
416
+ --gdg-bg-bubble-selected: #dfe5ea !important;
417
+ --gdg-text-dark: #15181b !important;
418
+ --gdg-text-medium: #3f474f !important;
419
+ --gdg-text-light: #66717c !important;
420
+ --gdg-text-header: #3f474f !important;
421
+ --gdg-text-header-selected: #15181b !important;
422
+ --gdg-text-group-header: #3f474f !important;
423
+ --gdg-text-bubble: #3f474f !important;
424
+ --gdg-border-color: rgba(15, 16, 18, 0.12) !important;
425
+ --gdg-horizontal-border-color: rgba(15, 16, 18, 0.08) !important;
426
+ --gdg-bg-icon-header: rgba(15, 16, 18, 0.42) !important;
427
+ --gdg-fg-icon-header: #15181b !important;
428
+ }
429
+ </style>
430
+ """
431
+
432
+ DARK_THEME_CSS = """
433
+ <style>
434
+ :root {
435
+ --bg-top: #202327;
436
+ --bg-bottom: #0f1012;
437
+ --surface: #15181b;
438
+ --surface-strong: #24282d;
439
+ --surface-alt: #1d2126;
440
+ --surface-alt-2: #262a2f;
441
+ --text-main: #f5f7f8;
442
+ --text-muted: #c1c6cb;
443
+ --text-soft: #a2a8ae;
444
+ --rule: rgba(255, 255, 255, 0.08);
445
+ --rule-soft: rgba(255, 255, 255, 0.05);
446
+ }
447
+ </style>
448
+ """
449
+
450
+
451
def palette_css(light_mode: bool) -> str:
    """Return the palette ``<style>`` block for the requested theme.

    ``LIGHT_THEME_CSS`` when ``light_mode`` is truthy, ``DARK_THEME_CSS``
    otherwise.
    """
    if light_mode:
        return LIGHT_THEME_CSS
    return DARK_THEME_CSS
453
+
454
+ HEADER_HTML = """
455
+ <section class="page-header">
456
+ <div class="page-eyebrow">NVIDIA Game Ready Evaluation</div>
457
+ <h1>Game Ready Leaderboard</h1>
458
+ <p>
459
+ An open game model evaluation surface for comparing LLMs across roleplay,
460
+ gameplay actions, and practical in-game reasoning.
461
+ </p>
462
+ </section>
463
+ """
464
+
465
# Introductory copy for the "About GRM Score" section, one string per
# paragraph. Brand casing is "NVIDIA", matching the page header markup.
OVERVIEW_BLOCKS = [
    "NVIDIA Game Ready Model Score (GRM) is an aggregated quality metric designed to assess LLM capabilities in gaming use cases.",
    "General state-of-the-art language models are optimized for broad benchmarks such as math, code, and general knowledge. That does not reliably translate to in-game performance, and it does not reliably predict NPC quality, gameplay actions, or immersion.",
    "With game model evaluation, game developers can accelerate AI integration pipelines by reducing time spent on model evaluation and narrowing model choice earlier. The overall score is the average of Roleplay, Actions, and General, while benchmarks inside each category are combined with weighted averaging using the PRD weights.",
]
470
+
471
# Human-readable scoring formulas, one display string per formula: the
# overall score as the mean of the three categories, and each category as a
# weighted average of its benchmark scores.
FORMULAS = [
    "GRM Score = (Roleplay + Actions + General) / 3",
    "Category Score = sum(score x weight) / sum(weight)",
]