jpeper committed on
Commit
626604a
·
verified ·
1 Parent(s): c9a41bc

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/failure_analysis.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/games_overview_updated.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/tier_overview.png filter=lfs diff=lfs merge=lfs -text
assets/failure_analysis.png ADDED

Git LFS Details

  • SHA256: 9a5bf4e38165055083c3d3998e04a7d010e9ad657f3cfc6e9ea056c8abb173a2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.98 MB
assets/games_overview_updated.png ADDED

Git LFS Details

  • SHA256: d8e0d03d18379d6a7cca2d777200ba07e0ef994a34bf448812c54338356ab791
  • Pointer size: 132 Bytes
  • Size of remote file: 1.67 MB
assets/tier_overview.png ADDED

Git LFS Details

  • SHA256: 18c7e1cbda91e534134e259079b458b904a1fac4132194f711f4cffc7b021c8b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.85 MB
css/main.css CHANGED
@@ -163,6 +163,39 @@ main { padding: 24px 32px; }
163
  color: #555;
164
  }
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  .citation-block {
167
  background: #f0f2f5;
168
  padding: 16px;
 
163
  color: #555;
164
  }
165
 
166
+ .overview-img {
167
+ width: 100%;
168
+ max-width: 860px;
169
+ height: auto;
170
+ border-radius: 8px;
171
+ border: 1px solid #ddd;
172
+ margin: 12px 0;
173
+ }
174
+
175
+ .overview-table {
176
+ width: 100%;
177
+ border-collapse: collapse;
178
+ font-size: 0.9rem;
179
+ margin: 12px 0;
180
+ }
181
+
182
+ .overview-table th,
183
+ .overview-table td {
184
+ padding: 8px 12px;
185
+ border: 1px solid #ddd;
186
+ text-align: left;
187
+ }
188
+
189
+ .overview-table th {
190
+ background: #f0f2f5;
191
+ font-weight: 600;
192
+ color: #333;
193
+ }
194
+
195
+ .overview-table td {
196
+ color: #444;
197
+ }
198
+
199
  .citation-block {
200
  background: #f0f2f5;
201
  padding: 16px;
index.html CHANGED
@@ -13,9 +13,9 @@
13
  <nav id="topNav">
14
  <div class="nav-brand">LudoBench</div>
15
  <div class="nav-tabs">
16
- <button class="tab-btn active" data-tab="leaderboard">Leaderboard</button>
 
17
  <button class="tab-btn" data-tab="browser">Dataset Browser</button>
18
- <button class="tab-btn" data-tab="about">About</button>
19
  </div>
20
  <div class="nav-links">
21
  <a href="#" target="_blank">Paper</a>
@@ -26,7 +26,7 @@
26
  <main>
27
 
28
  <!-- ==================== LEADERBOARD TAB ==================== -->
29
- <section id="tab-leaderboard" class="tab-panel active">
30
  <h2>Leaderboard</h2>
31
  <p class="tab-subtitle">
32
  Accuracy of multimodal models across <strong>5 board games</strong>,
@@ -56,7 +56,7 @@
56
  </div>
57
  </div>
58
  <div class="filter-group">
59
- <span class="filter-label">Modality:</span>
60
  <div class="filter-buttons" data-filter="modality">
61
  <button class="filter-btn active" data-value="all">All</button>
62
  <button class="filter-btn" data-value="None">None</button>
@@ -118,85 +118,78 @@
118
  </section>
119
 
120
  <!-- ==================== ABOUT TAB ==================== -->
121
- <section id="tab-about" class="tab-panel">
122
  <div class="about-content">
123
  <h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
124
 
125
  <div class="about-section">
126
  <h3>Abstract</h3>
127
  <p>
128
- LudoBench is a multimodal game reasoning benchmark designed to evaluate how well
129
- vision-enabled language models handle real-world tabletop strategy games. Unlike
130
- prior benchmarks that focus mainly on familiar board games, LudoBench recreates the
131
- experience of players learning and applying new rules for the first time.
132
  </p>
 
 
 
 
 
 
 
133
  <p>
134
- LudoBench includes 638 annotated question&ndash;answer examples drawn from five diverse
135
- games: <em>Kingdomino, Res Arcana, Pax Renaissance (2nd Edition), Carcassonne,</em> and
136
- <em>Catan</em>. These games are chosen based on increasing complexity of game states
137
- and rules comprehension.
138
  </p>
139
  </div>
140
 
141
  <div class="about-section">
142
- <h3>Reasoning Tiers</h3>
143
- <div class="tier-cards">
144
- <div class="tier-card">
145
- <h4>Tier 1: Environment Perception</h4>
146
- <p>Recognizing objects, counting components, and identifying basic game state features from board images.</p>
147
- </div>
148
- <div class="tier-card">
149
- <h4>Tier 2: Rules Integration</h4>
150
- <p>Applying multimodal rulebook knowledge to answer questions requiring rule understanding and application.</p>
151
- </div>
152
- <div class="tier-card">
153
- <h4>Tier 3: Short-Horizon Optimization</h4>
154
- <p>Planning optimal short-term moves that require strategic reasoning about game mechanics and resource management.</p>
155
- </div>
156
- </div>
 
 
 
 
 
 
 
 
 
157
  </div>
158
 
159
  <div class="about-section">
160
- <h3>Rulebook Modalities</h3>
161
- <p>Models are evaluated under three rulebook modalities:</p>
162
- <ul>
163
- <li><strong>None</strong> &mdash; parametric knowledge only (no rulebook provided)</li>
164
- <li><strong>Text</strong> &mdash; text-only rulebook provided in context</li>
165
- <li><strong>Image</strong> &mdash; image-based rulebook pages provided</li>
166
- </ul>
167
  </div>
168
 
169
  <div class="about-section">
170
- <h3>Games</h3>
171
- <div class="game-grid">
172
- <div class="game-card">
173
- <strong>Kingdomino</strong>
174
- <p>Domino-based kingdom building with terrain matching and crown scoring.</p>
175
- </div>
176
- <div class="game-card">
177
- <strong>Res Arcana</strong>
178
- <p>Engine-building card game with essence management and artifact abilities.</p>
179
- </div>
180
- <div class="game-card">
181
- <strong>Pax Renaissance</strong>
182
- <p>Complex political strategy game with multiple victory conditions and card-driven actions.</p>
183
- </div>
184
- <div class="game-card">
185
- <strong>Carcassonne</strong>
186
- <p>Tile-placement game with feature scoring (cities, roads, fields, monasteries).</p>
187
- </div>
188
- <div class="game-card">
189
- <strong>Catan</strong>
190
- <p>Resource management and trading game with settlement building and development.</p>
191
- </div>
192
- </div>
193
  </div>
194
 
195
  <div class="about-section">
196
  <h3>Citation</h3>
197
- <pre class="citation-block">@article{ludobench2026,
198
  title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
199
- year={2026}
 
 
 
200
  }</pre>
201
  </div>
202
 
 
13
  <nav id="topNav">
14
  <div class="nav-brand">LudoBench</div>
15
  <div class="nav-tabs">
16
+ <button class="tab-btn active" data-tab="about">LudoBench Overview</button>
17
+ <button class="tab-btn" data-tab="leaderboard">Leaderboard</button>
18
  <button class="tab-btn" data-tab="browser">Dataset Browser</button>
 
19
  </div>
20
  <div class="nav-links">
21
  <a href="#" target="_blank">Paper</a>
 
26
  <main>
27
 
28
  <!-- ==================== LEADERBOARD TAB ==================== -->
29
+ <section id="tab-leaderboard" class="tab-panel">
30
  <h2>Leaderboard</h2>
31
  <p class="tab-subtitle">
32
  Accuracy of multimodal models across <strong>5 board games</strong>,
 
56
  </div>
57
  </div>
58
  <div class="filter-group">
59
+ <span class="filter-label">Rules Modality:</span>
60
  <div class="filter-buttons" data-filter="modality">
61
  <button class="filter-btn active" data-value="all">All</button>
62
  <button class="filter-btn" data-value="None">None</button>
 
118
  </section>
119
 
120
  <!-- ==================== ABOUT TAB ==================== -->
121
+ <section id="tab-about" class="tab-panel active">
122
  <div class="about-content">
123
  <h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
124
 
125
  <div class="about-section">
126
  <h3>Abstract</h3>
127
  <p>
128
+ We introduce <strong>LudoBench</strong>, a multimodal reasoning benchmark that evaluates whether vision-enabled large language models (LMs) can acquire, integrate, and reason over heterogeneous game knowledge in <strong>mainstream analog tabletop games</strong>. Unlike prior works that emphasize deep strategic mastery, LudoBench targets an initial reasoning challenge uninitiated gamers face: <strong>correctly comprehending a new tabletop strategy game for the first time</strong>. We examine whether, given a visual depiction of a tabletop scene and a corresponding ruleset, a model can correctly answer grounded questions about the pictured scenario.
 
 
 
129
  </p>
130
+ <p>Concretely, LudoBench tests three cumulative situated game-comprehension capabilities:</p>
131
+ <ul>
132
+ <li><strong>Environment Perception</strong> &mdash; recognizing objects, counting components, and identifying basic game state features</li>
133
+ <li><strong>Heterogeneous Rules Integration</strong> &mdash; applying multimodal rulebook knowledge to answer grounded questions</li>
134
+ <li><strong>Short-Horizon Optimization</strong> &mdash; planning optimal moves requiring strategic reasoning over game mechanics</li>
135
+ </ul>
136
+ <p>These progressively stress-test the foundational reasoning required for real-world game comprehension.</p>
137
  <p>
138
+ Evaluating frontier LMs on five diverse strategy games, we find that even the strongest models achieve only <strong>~76% accuracy</strong> on simple environment perception tasks and fall below <strong>13%</strong> on situated multi-step comprehension puzzles that hobbyist gamers can routinely solve. Our extensive failure analysis and knowledge-ablation experiments reveal that models largely <strong>fail to comprehend rich cross-modal reference knowledge</strong> and are subsequently unable to apply this knowledge to messy and unfamiliar situated environments. Our findings highlight the many steps remaining for current methods to succeed on complex multimodal reasoning in the real world.
 
 
 
139
  </p>
140
  </div>
141
 
142
  <div class="about-section">
143
+ <h3>Games Overview</h3>
144
+ <p>
145
+ The dataset consists of five tabletop strategy games that vary widely in complexity, components, and rule structure. The details of each game&mdash;along with representative sample game states&mdash;are shown below.
146
+ </p>
147
+ <img src="assets/games_overview_updated.png" alt="Games Overview" class="overview-img" />
148
+ <table class="overview-table">
149
+ <thead>
150
+ <tr>
151
+ <th>Game</th>
152
+ <th>Rulebook</th>
153
+ <th>Diff.</th>
154
+ <th>Unique Game Properties</th>
155
+ <th># Rules</th>
156
+ <th># Figs.</th>
157
+ </tr>
158
+ </thead>
159
+ <tbody>
160
+ <tr><td><em>Kingdomino</em></td><td>4 pg.</td><td>1.2</td><td>tile-laying, spatial scoring, individual player areas</td><td>35</td><td>6</td></tr>
161
+ <tr><td><em>Carcassonne</em></td><td>8 pg.</td><td>1.9</td><td>shared tile-laying, dynamic board topology, position-coded roles</td><td>39</td><td>30</td></tr>
162
+ <tr><td><em>Catan</em></td><td>16 pg.</td><td>2.3</td><td>network building, connectivity constraints, action chaining</td><td>44</td><td>19</td></tr>
163
+ <tr><td><em>Res Arcana</em></td><td>12 pg.</td><td>2.6</td><td>card-based interactions, heavy symbol usage, card orientation, action sequencing</td><td>112</td><td>31</td></tr>
164
+ <tr><td><em>Pax Ren. (2e)</em></td><td>44 pg.</td><td>4.6</td><td>shared map, private cards/tableau, large number of components, intricate ruleset</td><td>247</td><td>58</td></tr>
165
+ </tbody>
166
+ </table>
167
  </div>
168
 
169
  <div class="about-section">
170
+ <h3>Tiers Overview</h3>
171
+ <p>
172
+ The benchmark evaluates models across three tiered reasoning levels that progressively increase in difficulty, from basic visual perception to rule integration and short-horizon planning. An example of how questions differ for each tier in Kingdomino is shown below:
173
+ </p>
174
+ <img src="assets/tier_overview.png" alt="Tier-wise Q&A Example" class="overview-img" />
 
 
175
  </div>
176
 
177
  <div class="about-section">
178
+ <h3>Failure Analysis</h3>
179
+ <p>
180
+ We analyze where models go wrong by collecting common failure cases across multiple models and organizing them for visualization on Kingdomino. The table below summarizes the relevant rulebook rules, supporting annotations, and the observed model errors for each failure pattern.
181
+ </p>
182
+ <img src="assets/failure_analysis.png" alt="Failure Analysis" class="overview-img" />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  </div>
184
 
185
  <div class="about-section">
186
  <h3>Citation</h3>
187
+ <pre class="citation-block">@inproceedings{peper2026ludobench,
188
  title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
189
+ author={Peper, Joseph J. and Gandra, Sai Krishna and Zhang, Yunxiang and Chennareddy, Vaibhav and Jha, Shloki and Payani, Ali and Wang, Lu},
190
+ booktitle={Proceedings of the Fourteenth International Conference on Learning Representations (ICLR)},
191
+ year={2026},
192
+ address={Rio de Janeiro, Brazil}
193
  }</pre>
194
  </div>
195
 
js/app.js CHANGED
@@ -14,10 +14,13 @@ document.addEventListener("DOMContentLoaded", () => {
14
  btn.classList.add("active");
15
  document.getElementById("tab-" + target).classList.add("active");
16
 
17
- // Lazy-init the dataset viewer on first access
18
  if (target === "browser" && typeof initViewer === "function") {
19
  initViewer();
20
  }
 
 
 
21
  });
22
  });
23
 
 
14
  btn.classList.add("active");
15
  document.getElementById("tab-" + target).classList.add("active");
16
 
17
+ // Lazy-init on first access
18
  if (target === "browser" && typeof initViewer === "function") {
19
  initViewer();
20
  }
21
+ if (target === "leaderboard" && typeof initLeaderboard === "function") {
22
+ initLeaderboard();
23
+ }
24
  });
25
  });
26