Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
- .gitattributes +3 -0
- assets/failure_analysis.png +3 -0
- assets/games_overview_updated.png +3 -0
- assets/tier_overview.png +3 -0
- css/main.css +33 -0
- index.html +53 -60
- js/app.js +4 -1
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/failure_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/games_overview_updated.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/tier_overview.png filter=lfs diff=lfs merge=lfs -text
|
assets/failure_analysis.png
ADDED
|
Git LFS Details
|
assets/games_overview_updated.png
ADDED
|
Git LFS Details
|
assets/tier_overview.png
ADDED
|
Git LFS Details
|
css/main.css
CHANGED
|
@@ -163,6 +163,39 @@ main { padding: 24px 32px; }
|
|
| 163 |
color: #555;
|
| 164 |
}
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
.citation-block {
|
| 167 |
background: #f0f2f5;
|
| 168 |
padding: 16px;
|
|
|
|
| 163 |
color: #555;
|
| 164 |
}
|
| 165 |
|
| 166 |
+
.overview-img {
|
| 167 |
+
width: 100%;
|
| 168 |
+
max-width: 860px;
|
| 169 |
+
height: auto;
|
| 170 |
+
border-radius: 8px;
|
| 171 |
+
border: 1px solid #ddd;
|
| 172 |
+
margin: 12px 0;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.overview-table {
|
| 176 |
+
width: 100%;
|
| 177 |
+
border-collapse: collapse;
|
| 178 |
+
font-size: 0.9rem;
|
| 179 |
+
margin: 12px 0;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.overview-table th,
|
| 183 |
+
.overview-table td {
|
| 184 |
+
padding: 8px 12px;
|
| 185 |
+
border: 1px solid #ddd;
|
| 186 |
+
text-align: left;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
.overview-table th {
|
| 190 |
+
background: #f0f2f5;
|
| 191 |
+
font-weight: 600;
|
| 192 |
+
color: #333;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
.overview-table td {
|
| 196 |
+
color: #444;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
.citation-block {
|
| 200 |
background: #f0f2f5;
|
| 201 |
padding: 16px;
|
index.html
CHANGED
|
@@ -13,9 +13,9 @@
|
|
| 13 |
<nav id="topNav">
|
| 14 |
<div class="nav-brand">LudoBench</div>
|
| 15 |
<div class="nav-tabs">
|
| 16 |
-
<button class="tab-btn active" data-tab="leaderboard">Leaderboard</button>
|
|
|
|
| 17 |
<button class="tab-btn" data-tab="browser">Dataset Browser</button>
|
| 18 |
-
<button class="tab-btn" data-tab="about">About</button>
|
| 19 |
</div>
|
| 20 |
<div class="nav-links">
|
| 21 |
<a href="#" target="_blank">Paper</a>
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
<main>
|
| 27 |
|
| 28 |
<!-- ==================== LEADERBOARD TAB ==================== -->
|
| 29 |
-
<section id="tab-leaderboard" class="tab-panel active">
|
| 30 |
<h2>Leaderboard</h2>
|
| 31 |
<p class="tab-subtitle">
|
| 32 |
Accuracy of multimodal models across <strong>5 board games</strong>,
|
|
@@ -56,7 +56,7 @@
|
|
| 56 |
</div>
|
| 57 |
</div>
|
| 58 |
<div class="filter-group">
|
| 59 |
-
<span class="filter-label">Modality:</span>
|
| 60 |
<div class="filter-buttons" data-filter="modality">
|
| 61 |
<button class="filter-btn active" data-value="all">All</button>
|
| 62 |
<button class="filter-btn" data-value="None">None</button>
|
|
@@ -118,85 +118,78 @@
|
|
| 118 |
</section>
|
| 119 |
|
| 120 |
<!-- ==================== ABOUT TAB ==================== -->
|
| 121 |
-
<section id="tab-about" class="tab-panel">
|
| 122 |
<div class="about-content">
|
| 123 |
<h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
|
| 124 |
|
| 125 |
<div class="about-section">
|
| 126 |
<h3>Abstract</h3>
|
| 127 |
<p>
|
| 128 |
-
|
| 129 |
-
vision-enabled language models handle real-world tabletop strategy games. Unlike
|
| 130 |
-
prior benchmarks that focus mainly on familiar board games, LudoBench recreates the
|
| 131 |
-
experience of players learning and applying new rules for the first time.
|
| 132 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
<p>
|
| 134 |
-
|
| 135 |
-
games: <em>Kingdomino, Res Arcana, Pax Renaissance (2nd Edition), Carcassonne,</em> and
|
| 136 |
-
<em>Catan</em>. These games are chosen based on increasing complexity of game states
|
| 137 |
-
and rules comprehension.
|
| 138 |
</p>
|
| 139 |
</div>
|
| 140 |
|
| 141 |
<div class="about-section">
|
| 142 |
-
<h3>
|
| 143 |
-
<
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
<
|
| 149 |
-
<
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
</div>
|
| 158 |
|
| 159 |
<div class="about-section">
|
| 160 |
-
<h3>
|
| 161 |
-
<p>
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
<li><strong>Image</strong> — image-based rulebook pages provided</li>
|
| 166 |
-
</ul>
|
| 167 |
</div>
|
| 168 |
|
| 169 |
<div class="about-section">
|
| 170 |
-
<h3>
|
| 171 |
-
<
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
</div>
|
| 176 |
-
<div class="game-card">
|
| 177 |
-
<strong>Res Arcana</strong>
|
| 178 |
-
<p>Engine-building card game with essence management and artifact abilities.</p>
|
| 179 |
-
</div>
|
| 180 |
-
<div class="game-card">
|
| 181 |
-
<strong>Pax Renaissance</strong>
|
| 182 |
-
<p>Complex political strategy game with multiple victory conditions and card-driven actions.</p>
|
| 183 |
-
</div>
|
| 184 |
-
<div class="game-card">
|
| 185 |
-
<strong>Carcassonne</strong>
|
| 186 |
-
<p>Tile-placement game with feature scoring (cities, roads, fields, monasteries).</p>
|
| 187 |
-
</div>
|
| 188 |
-
<div class="game-card">
|
| 189 |
-
<strong>Catan</strong>
|
| 190 |
-
<p>Resource management and trading game with settlement building and development.</p>
|
| 191 |
-
</div>
|
| 192 |
-
</div>
|
| 193 |
</div>
|
| 194 |
|
| 195 |
<div class="about-section">
|
| 196 |
<h3>Citation</h3>
|
| 197 |
-
<pre class="citation-block">@
|
| 198 |
title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
| 200 |
}</pre>
|
| 201 |
</div>
|
| 202 |
|
|
|
|
| 13 |
<nav id="topNav">
|
| 14 |
<div class="nav-brand">LudoBench</div>
|
| 15 |
<div class="nav-tabs">
|
| 16 |
+
<button class="tab-btn active" data-tab="about">LudoBench Overview</button>
|
| 17 |
+
<button class="tab-btn" data-tab="leaderboard">Leaderboard</button>
|
| 18 |
<button class="tab-btn" data-tab="browser">Dataset Browser</button>
|
|
|
|
| 19 |
</div>
|
| 20 |
<div class="nav-links">
|
| 21 |
<a href="#" target="_blank">Paper</a>
|
|
|
|
| 26 |
<main>
|
| 27 |
|
| 28 |
<!-- ==================== LEADERBOARD TAB ==================== -->
|
| 29 |
+
<section id="tab-leaderboard" class="tab-panel">
|
| 30 |
<h2>Leaderboard</h2>
|
| 31 |
<p class="tab-subtitle">
|
| 32 |
Accuracy of multimodal models across <strong>5 board games</strong>,
|
|
|
|
| 56 |
</div>
|
| 57 |
</div>
|
| 58 |
<div class="filter-group">
|
| 59 |
+
<span class="filter-label">Rules Modality:</span>
|
| 60 |
<div class="filter-buttons" data-filter="modality">
|
| 61 |
<button class="filter-btn active" data-value="all">All</button>
|
| 62 |
<button class="filter-btn" data-value="None">None</button>
|
|
|
|
| 118 |
</section>
|
| 119 |
|
| 120 |
<!-- ==================== ABOUT TAB ==================== -->
|
| 121 |
+
<section id="tab-about" class="tab-panel active">
|
| 122 |
<div class="about-content">
|
| 123 |
<h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
|
| 124 |
|
| 125 |
<div class="about-section">
|
| 126 |
<h3>Abstract</h3>
|
| 127 |
<p>
|
| 128 |
+
We introduce <strong>LudoBench</strong>, a multimodal reasoning benchmark that evaluates whether vision-enabled large language models (LMs) can acquire, integrate, and reason over heterogeneous game knowledge in <strong>mainstream analog tabletop games</strong>. Unlike prior works that emphasize deep strategic mastery, LudoBench targets an initial reasoning challenge uninitiated gamers face: <strong>correctly comprehending a new tabletop strategy game for the first time</strong>. We examine whether, given a visual depiction of a tabletop scene and a corresponding ruleset, a model can correctly answer grounded questions about the pictured scenario.
|
|
|
|
|
|
|
|
|
|
| 129 |
</p>
|
| 130 |
+
<p>Concretely, LudoBench tests three cumulative situated game-comprehension capabilities:</p>
|
| 131 |
+
<ul>
|
| 132 |
+
<li><strong>Environment Perception</strong> — recognizing objects, counting components, and identifying basic game state features</li>
|
| 133 |
+
<li><strong>Heterogeneous Rules Integration</strong> — applying multimodal rulebook knowledge to answer grounded questions</li>
|
| 134 |
+
<li><strong>Short-Horizon Optimization</strong> — planning optimal moves requiring strategic reasoning over game mechanics</li>
|
| 135 |
+
</ul>
|
| 136 |
+
<p>These progressively stress-test the foundational reasoning required for real-world game comprehension.</p>
|
| 137 |
<p>
|
| 138 |
+
Evaluating frontier LMs on five diverse strategy games, we find that even the strongest models achieve only <strong>~76% accuracy</strong> on simple environment perception tasks and fall below <strong>13%</strong> on situated multi-step comprehension puzzles that hobbyist gamers can routinely solve. Our extensive failure analysis and knowledge-ablation experiments reveal that models largely <strong>fail to comprehend rich cross-modal reference knowledge</strong> and are subsequently unable to apply this knowledge to messy and unfamiliar situated environments. Our findings highlight the many steps remaining for current methods to succeed on complex multimodal reasoning in the real world.
|
|
|
|
|
|
|
|
|
|
| 139 |
</p>
|
| 140 |
</div>
|
| 141 |
|
| 142 |
<div class="about-section">
|
| 143 |
+
<h3>Games Overview</h3>
|
| 144 |
+
<p>
|
| 145 |
+
The dataset consists of five tabletop strategy games that vary widely in complexity, components, and rule structure. The details of each game—along with representative sample game states—are shown below.
|
| 146 |
+
</p>
|
| 147 |
+
<img src="assets/games_overview_updated.png" alt="Games Overview" class="overview-img" />
|
| 148 |
+
<table class="overview-table">
|
| 149 |
+
<thead>
|
| 150 |
+
<tr>
|
| 151 |
+
<th>Game</th>
|
| 152 |
+
<th>Rulebook</th>
|
| 153 |
+
<th>Diff.</th>
|
| 154 |
+
<th>Unique Game Properties</th>
|
| 155 |
+
<th># Rules</th>
|
| 156 |
+
<th># Figs.</th>
|
| 157 |
+
</tr>
|
| 158 |
+
</thead>
|
| 159 |
+
<tbody>
|
| 160 |
+
<tr><td><em>Kingdomino</em></td><td>4 pg.</td><td>1.2</td><td>tile-laying, spatial scoring, individual player areas</td><td>35</td><td>6</td></tr>
|
| 161 |
+
<tr><td><em>Carcassonne</em></td><td>8 pg.</td><td>1.9</td><td>shared tile-laying, dynamic board topology, position-coded roles</td><td>39</td><td>30</td></tr>
|
| 162 |
+
<tr><td><em>Catan</em></td><td>16 pg.</td><td>2.3</td><td>network building, connectivity constraints, action chaining</td><td>44</td><td>19</td></tr>
|
| 163 |
+
<tr><td><em>Res Arcana</em></td><td>12 pg.</td><td>2.6</td><td>card-based interactions, heavy symbol usage, card orientation, action sequencing</td><td>112</td><td>31</td></tr>
|
| 164 |
+
<tr><td><em>Pax Ren. (2e)</em></td><td>44 pg.</td><td>4.6</td><td>shared map, private cards/tableau, large number of components, intricate ruleset</td><td>247</td><td>58</td></tr>
|
| 165 |
+
</tbody>
|
| 166 |
+
</table>
|
| 167 |
</div>
|
| 168 |
|
| 169 |
<div class="about-section">
|
| 170 |
+
<h3>Tiers Overview</h3>
|
| 171 |
+
<p>
|
| 172 |
+
The benchmark evaluates models across three tiered reasoning levels that progressively increase in difficulty, from basic visual perception to rule integration and short-horizon planning. An example of how questions differ for each tier in Kingdomino is shown below:
|
| 173 |
+
</p>
|
| 174 |
+
<img src="assets/tier_overview.png" alt="Tier-wise Q&A Example" class="overview-img" />
|
|
|
|
|
|
|
| 175 |
</div>
|
| 176 |
|
| 177 |
<div class="about-section">
|
| 178 |
+
<h3>Failure Analysis</h3>
|
| 179 |
+
<p>
|
| 180 |
+
We analyze where models go wrong by collecting common failure cases across multiple models and organizing them for visualization on Kingdomino. The table below summarizes the relevant rulebook rules, supporting annotations, and the observed model errors for each failure pattern.
|
| 181 |
+
</p>
|
| 182 |
+
<img src="assets/failure_analysis.png" alt="Failure Analysis" class="overview-img" />
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
</div>
|
| 184 |
|
| 185 |
<div class="about-section">
|
| 186 |
<h3>Citation</h3>
|
| 187 |
+
<pre class="citation-block">@inproceedings{peper2026ludobench,
|
| 188 |
title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
|
| 189 |
+
author={Peper, Joseph J. and Gandra, Sai Krishna and Zhang, Yunxiang and Chennareddy, Vaibhav and Jha, Shloki and Payani, Ali and Wang, Lu},
|
| 190 |
+
booktitle={Proceedings of the Fourteenth International Conference on Learning Representations (ICLR)},
|
| 191 |
+
year={2026},
|
| 192 |
+
address={Rio de Janeiro, Brazil}
|
| 193 |
}</pre>
|
| 194 |
</div>
|
| 195 |
|
js/app.js
CHANGED
|
@@ -14,10 +14,13 @@ document.addEventListener("DOMContentLoaded", () => {
|
|
| 14 |
btn.classList.add("active");
|
| 15 |
document.getElementById("tab-" + target).classList.add("active");
|
| 16 |
|
| 17 |
-
// Lazy-init
|
| 18 |
if (target === "browser" && typeof initViewer === "function") {
|
| 19 |
initViewer();
|
| 20 |
}
|
|
|
|
|
|
|
|
|
|
| 21 |
});
|
| 22 |
});
|
| 23 |
|
|
|
|
| 14 |
btn.classList.add("active");
|
| 15 |
document.getElementById("tab-" + target).classList.add("active");
|
| 16 |
|
| 17 |
+
// Lazy-init on first access
|
| 18 |
if (target === "browser" && typeof initViewer === "function") {
|
| 19 |
initViewer();
|
| 20 |
}
|
| 21 |
+
if (target === "leaderboard" && typeof initLeaderboard === "function") {
|
| 22 |
+
initLeaderboard();
|
| 23 |
+
}
|
| 24 |
});
|
| 25 |
});
|
| 26 |
|