<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <title>LudoBench: Board Game Reasoning Benchmark</title>
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <link rel="stylesheet" href="css/main.css" />
  <link rel="stylesheet" href="css/leaderboard.css" />
  <link rel="stylesheet" href="css/viewer.css" />
</head>
<body>

  <nav id="topNav">
    <div class="nav-brand">LudoBench</div>
    <div class="nav-tabs">
      <button class="tab-btn active" data-tab="about">LudoBench Overview</button>
      <button class="tab-btn" data-tab="leaderboard">Leaderboard</button>
      <button class="tab-btn" data-tab="browser">Dataset Browser</button>
    </div>
    <div class="nav-links">
      <a href="https://openreview.net/forum?id=TOgQ00DEek" target="_blank" class="nav-badge">
        <img src="https://upload.wikimedia.org/wikipedia/commons/b/bc/ArXiv_logo_2022.svg" alt="arXiv" class="nav-icon" />
        Paper
      </a>
      <a href="https://huggingface.co/datasets/jpeper/LudoBench_test" target="_blank" class="nav-badge">
        <img src="https://huggingface.co/front/assets/huggingface_logo.svg" alt="HF" class="nav-icon" />
        Dataset
      </a>
      <a href="https://github.com/jpeper/LudoBench" target="_blank" class="nav-badge">
        <svg class="nav-icon" viewBox="0 0 16 16" fill="currentColor"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
        GitHub
      </a>
      <button class="nav-badge bibtex-btn" onclick="toggleBibtex()">
        <svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/><line x1="16" y1="13" x2="8" y2="13"/><line x1="16" y1="17" x2="8" y2="17"/><polyline points="10 9 9 9 8 9"/></svg>
        BibTeX
      </button>
    </div>
  </nav>

  <!-- BibTeX Modal -->
  <div id="bibtexModal" class="bibtex-modal" onclick="if(event.target===this)toggleBibtex()">
    <div class="bibtex-modal-content">
      <div class="bibtex-modal-header">
        <span>BibTeX Citation</span>
        <button class="bibtex-close" onclick="toggleBibtex()">&times;</button>
      </div>
      <pre id="bibtexCode" class="bibtex-code">@inproceedings{peper2026ludobench,
  title={{LLMs} as Rules Oracles: Exploring Real-World Multimodal Reasoning in Tabletop Strategy Game Environments},
  author={Peper, Joseph J. and Gandra, Sai Krishna and Zhang, Yunxiang and Chennareddy, Vaibhav and Jha, Shloki and Payani, Ali and Wang, Lu},
  booktitle={Proceedings of the Fourteenth International Conference on Learning Representations (ICLR)},
  year={2026},
  address={Rio de Janeiro, Brazil}
}</pre>
      <button class="bibtex-copy-btn" onclick="copyBibtex()">Copy to Clipboard</button>
    </div>
  </div>

  <main>

    <!-- ==================== LEADERBOARD TAB ==================== -->
    <section id="tab-leaderboard" class="tab-panel">
      <h2>Leaderboard</h2>
      <p class="tab-subtitle">
        Accuracy of multimodal models across <strong>5 board games</strong>,
        <strong>3 reasoning tiers</strong>, and <strong>3 rulebook modalities</strong>.
        Click any column header to sort.
      </p>

      <div id="leaderboard-filters">
        <div class="filter-group">
          <span class="filter-label">Tier:</span>
          <div class="filter-buttons" data-filter="tier">
            <button class="filter-btn active" data-value="all">All</button>
            <button class="filter-btn" data-value="T1">T1</button>
            <button class="filter-btn" data-value="T2">T2</button>
            <button class="filter-btn" data-value="T3">T3</button>
          </div>
        </div>
        <div class="filter-group">
          <span class="filter-label">Game:</span>
          <div class="filter-buttons" data-filter="game">
            <button class="filter-btn active" data-value="all">All</button>
            <button class="filter-btn" data-value="KingD">Kingdomino</button>
            <button class="filter-btn" data-value="Res Arcana">Res Arcana</button>
            <button class="filter-btn" data-value="Pax Ren.">Pax Ren.</button>
            <button class="filter-btn" data-value="Carca.">Carcassonne</button>
            <button class="filter-btn" data-value="Catan">Catan</button>
          </div>
        </div>
        <div class="filter-group">
          <span class="filter-label">Rules Modality:</span>
          <div class="filter-buttons" data-filter="modality">
            <button class="filter-btn active" data-value="all">All</button>
            <button class="filter-btn" data-value="None">None</button>
            <button class="filter-btn" data-value="Text">Text</button>
            <button class="filter-btn" data-value="Image">Image</button>
          </div>
        </div>
      </div>

      <div id="leaderboard-table-wrap">
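        <!-- Header and rows are generated at runtime (presumably by js/leaderboard.js from js/leaderboard-data.js); the filter buttons above are matched to the table via their data-filter / data-value attributes. -->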
        <table id="leaderboard-table">
          <thead id="leaderboard-thead"></thead>
          <tbody id="leaderboard-tbody"></tbody>
        </table>
      </div>
    </section>

    <!-- ==================== DATASET BROWSER TAB ==================== -->
    <section id="tab-browser" class="tab-panel">
      <h2>Dataset Browser</h2>
      <p class="tab-subtitle">Browse 638 annotated QA examples across 5 games and 3 difficulty tiers.</p>
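      <!-- The folder/example selectors and the Q&A panels below are populated at runtime (presumably by js/viewer.js). -->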

      <div id="controls">
        <label>
          Folder:
          <select id="folderSelect"></select>
        </label>
        <label>
          Example:
          <select id="fileSelect"></select>
        </label>
        <div id="navButtons">
          <button id="prevBtn"><span class="icon">&laquo;</span> Prev</button>
          <button id="nextBtn">Next <span class="icon">&raquo;</span></button>
        </div>
      </div>

      <div id="layout">
        <div>
          <div id="questionCard">Loading&hellip;</div>
          <div id="answerBlock">
            <label for="answerInput">Your answer:</label>
            <input id="answerInput" type="text" />
            <button id="checkButton">Check</button>
            <div id="answerInfo" class="answer-info"></div>
            <details>
              <summary>Show solution</summary>
              <div id="solutionText"></div>
            </details>
          </div>
        </div>
        <div>
          <div id="imageContainer" class="image-wrapper hidden">
            <h3>Game state</h3>
            <div id="multiImages"></div>
          </div>
        </div>
      </div>
    </section>

    <!-- ==================== ABOUT TAB ==================== -->
    <section id="tab-about" class="tab-panel active">
      <div class="about-content">
        <h2>LLMs as Rules Oracles: Exploring Real-World Multimodal Reasoning in Tabletop Strategy Game Environments <span class="venue-tag">[ICLR 2026]</span></h2>

        <div class="about-badges">
          <a href="https://openreview.net/forum?id=TOgQ00DEek" target="_blank" class="nav-badge">
            <img src="https://upload.wikimedia.org/wikipedia/commons/b/bc/ArXiv_logo_2022.svg" alt="arXiv" class="nav-icon" />
            Paper
          </a>
          <a href="https://huggingface.co/datasets/jpeper/LudoBench_test" target="_blank" class="nav-badge">
            <img src="https://huggingface.co/front/assets/huggingface_logo.svg" alt="HF" class="nav-icon" />
            Dataset
          </a>
          <a href="https://github.com/jpeper/LudoBench" target="_blank" class="nav-badge">
            <svg class="nav-icon" viewBox="0 0 16 16" fill="currentColor"><path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/></svg>
            GitHub
          </a>
          <button class="nav-badge bibtex-btn" onclick="toggleBibtex()">
            <svg class="nav-icon" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><polyline points="14 2 14 8 20 8"/><line x1="16" y1="13" x2="8" y2="13"/><line x1="16" y1="17" x2="8" y2="17"/><polyline points="10 9 9 9 8 9"/></svg>
            BibTeX
          </button>
        </div>

        <div class="about-section">
          <h3>Abstract</h3>
          <p>
            We introduce <strong>LudoBench</strong>, a multimodal reasoning benchmark that evaluates whether vision-enabled large language models (LMs) can acquire, integrate, and reason over heterogeneous game knowledge in <strong>mainstream analog tabletop games</strong>. Unlike prior work that emphasizes deep strategic mastery, LudoBench targets an initial reasoning challenge that uninitiated gamers face: <strong>correctly comprehending a new tabletop strategy game for the first time</strong>. We examine whether, given a visual depiction of a tabletop scene and a corresponding ruleset, a model can correctly answer grounded questions about the pictured scenario.
          </p>
          <p>Concretely, LudoBench tests three cumulative situated game-comprehension capabilities:</p>
          <ul>
            <li><strong>Tier 1: Environment Perception</strong> &mdash; recognizing objects, counting components, and identifying basic game state features</li>
            <li><strong>Tier 2: Heterogeneous Rules Integration</strong> &mdash; applying multimodal rulebook knowledge to answer grounded questions</li>
            <li><strong>Tier 3: Short-Horizon Optimization</strong> &mdash; planning optimal moves requiring strategic reasoning over game mechanics</li>
          </ul>
          <p>These progressively stress-test the foundational reasoning required for real-world game comprehension.</p>
          <p>
            Evaluating frontier LMs on five diverse strategy games, we find that even the strongest models achieve only <strong>~76% accuracy</strong> on simple T1 environment perception tasks and fall below <strong>13%</strong> on situated T3 multi-step comprehension puzzles that hobbyist gamers can routinely solve. Our extensive failure analysis and knowledge-ablation experiments reveal that models largely <strong>fail to comprehend rich cross-modal reference knowledge</strong> and are subsequently unable to apply this knowledge to messy and unfamiliar situated environments. Our findings highlight the many steps remaining for current methods to succeed on complex multimodal reasoning in the real world.
          </p>
        </div>

        <div class="about-section">
          <h3>Games Overview</h3>
          <p>
            The dataset consists of five tabletop strategy games that vary widely in complexity, components, and rule structure. The details of each game&mdash;along with representative sample game states&mdash;are shown below.
          </p>
          <img src="assets/games_overview_updated.png" alt="Games Overview" class="overview-img" />
          <table class="overview-table">
            <thead>
              <tr>
                <th>Game</th>
                <th>Rulebook</th>
                <th>Difficulty</th>
                <th>Unique Game Properties</th>
                <th># Rules</th>
                <th># Figs.</th>
              </tr>
            </thead>
            <tbody>
              <tr><td><em>Kingdomino</em></td><td>4 pg.</td><td>1.2</td><td>tile-laying, spatial scoring, individual player areas</td><td>35</td><td>6</td></tr>
              <tr><td><em>Carcassonne</em></td><td>8 pg.</td><td>1.9</td><td>shared tile-laying, dynamic board topology, position-coded roles</td><td>39</td><td>30</td></tr>
              <tr><td><em>Catan</em></td><td>16 pg.</td><td>2.3</td><td>network building, connectivity constraints, action chaining</td><td>44</td><td>19</td></tr>
              <tr><td><em>Res Arcana</em></td><td>12 pg.</td><td>2.6</td><td>card-based interactions, heavy symbol usage, card orientation, action sequencing</td><td>112</td><td>31</td></tr>
              <tr><td><em>Pax Ren. (2e)</em></td><td>44 pg.</td><td>4.6</td><td>shared map, private cards/tableau, large number of components, intricate ruleset</td><td>247</td><td>58</td></tr>
            </tbody>
          </table>
        </div>

        <div class="about-section">
          <h3>Tiers Overview</h3>
          <p>
            The benchmark evaluates models across three tiered reasoning levels that progressively increase in difficulty, from basic visual perception to rule integration and short-horizon planning. An example of how questions differ for each tier in Kingdomino is shown below:
          </p>
          <img src="assets/tier_overview.png" alt="Tier-wise Q&A Example" class="overview-img" />
        </div>

        <div class="about-section">
          <h3>Knowledge Ablation: Rules Modalities</h3>
          <p>
            A central question in LudoBench is whether models can acquire and apply game rules from different knowledge sources.
            To investigate this, every question is evaluated under three <strong>rules modality</strong> conditions that vary what reference knowledge is provided alongside the game-state image and question:
          </p>
          <div class="tier-cards">
            <div class="tier-card">
              <h4>None (Parametric)</h4>
              <p>No rulebook is provided. The model must rely entirely on <strong>parametric knowledge</strong> &mdash; whatever it has internalized about the game from pretraining. This baseline reveals how much a model already "knows" about a game's rules.</p>
            </div>
            <div class="tier-card">
              <h4>Text Rules</h4>
              <p>The game's rulebook is provided as <strong>extracted text</strong> in the prompt context. This tests whether explicit textual rule descriptions improve situated reasoning, and whether models can ground text-based rules against a visual game state.</p>
            </div>
            <div class="tier-card">
              <h4>Image Rules</h4>
              <p>The rulebook is provided as <strong>images of the original pages</strong>, including diagrams, icons, and annotated examples. This tests the model's ability to extract and apply rules from rich, cross-modal visual documents &mdash; the format real players actually encounter.</p>
            </div>
          </div>
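          <p>
            As a rough illustration, the sketch below shows how a single question's prompt payload might vary across the three conditions. This is a hypothetical outline, not the paper's actual evaluation harness; the field names (<code>rulebookText</code>, <code>rulebookPageImages</code>, etc.) are illustrative.
          </p>
          <pre class="citation-block">// Hypothetical sketch (not the paper's harness): assembling one question's
// prompt payload under each rules-modality condition.
function buildPayload(example, modality) {
  var payload = {
    question: example.question,         // grounded question about the scene
    images: [example.gameStateImage]    // tabletop game-state image (always provided)
  };
  if (modality === "Text") {
    // Text condition: extracted rulebook text is added to the prompt context.
    payload.rulesText = example.rulebookText;
  } else if (modality === "Image") {
    // Image condition: original rulebook pages are attached as additional images.
    payload.images = payload.images.concat(example.rulebookPageImages);
  }
  // None condition: nothing is added; the model relies on parametric knowledge.
  return payload;
}</pre>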
          <p>
            Across all three conditions, models consistently struggle to comprehend cross-modal reference knowledge. Notably, providing rulebook content &mdash; whether as text or images &mdash; does not uniformly improve performance, revealing fundamental gaps in how models integrate heterogeneous knowledge with situated visual environments.
          </p>
        </div>

        <div class="about-section">
          <h3>Failure Analysis</h3>
          <p>
            We analyze where models go wrong by collecting common failure cases across multiple models, illustrated here on Kingdomino. The table below summarizes the relevant rulebook rules, supporting annotations, and the observed model errors for each failure pattern.
          </p>
          <img src="assets/failure_analysis.png" alt="Failure Analysis" class="overview-img" />
        </div>

        <div class="about-section">
          <h3>Citation</h3>
          <pre class="citation-block">@inproceedings{peper2026ludobench,
  title={{LLMs} as Rules Oracles: Exploring Real-World Multimodal Reasoning in Tabletop Strategy Game Environments},
  author={Peper, Joseph J. and Gandra, Sai Krishna and Zhang, Yunxiang and Chennareddy, Vaibhav and Jha, Shloki and Payani, Ali and Wang, Lu},
  booktitle={Proceedings of the Fourteenth International Conference on Learning Representations (ICLR)},
  year={2026},
  address={Rio de Janeiro, Brazil}
}</pre>
        </div>

      </div>
    </section>

  </main>
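
  <!-- Script roles (presumed from file names): leaderboard-data.js holds the results data, leaderboard.js renders and filters the table, viewer.js drives the dataset browser, and app.js handles tab switching. -->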

  <script src="js/leaderboard-data.js"></script>
  <script src="js/leaderboard.js"></script>
  <script src="js/viewer.js"></script>
  <script src="js/app.js"></script>
  <script>
    // Show or hide the BibTeX modal (also closed by clicking its backdrop).
    function toggleBibtex() {
      var m = document.getElementById("bibtexModal");
      m.classList.toggle("open");
    }
    // Copy the citation text and briefly flash feedback on the button.
    function copyBibtex() {
      var text = document.getElementById("bibtexCode").textContent;
      if (!navigator.clipboard) return; // Clipboard API is unavailable in insecure contexts.
      navigator.clipboard.writeText(text).then(function() {
        var btn = document.querySelector(".bibtex-copy-btn");
        btn.textContent = "Copied!";
        setTimeout(function() { btn.textContent = "Copy to Clipboard"; }, 2000);
      });
    }
  </script>
</body>
</html>