Spaces:

jpeper
/

LudoBench_test

Running

App Files Files Community

jpeper committed on Mar 1

Commit

626604a

verified ·

1 Parent(s): c9a41bc

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.gitattributes +3 -0
assets/failure_analysis.png +3 -0
assets/games_overview_updated.png +3 -0
assets/tier_overview.png +3 -0
css/main.css +33 -0
index.html +53 -60
js/app.js +4 -1

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/failure_analysis.png filter=lfs diff=lfs merge=lfs -text
+assets/games_overview_updated.png filter=lfs diff=lfs merge=lfs -text
+assets/tier_overview.png filter=lfs diff=lfs merge=lfs -text

assets/failure_analysis.png ADDED Viewed

Git LFS Details

SHA256: 9a5bf4e38165055083c3d3998e04a7d010e9ad657f3cfc6e9ea056c8abb173a2
Pointer size: 132 Bytes
Size of remote file: 2.98 MB

assets/games_overview_updated.png ADDED Viewed

Git LFS Details

SHA256: d8e0d03d18379d6a7cca2d777200ba07e0ef994a34bf448812c54338356ab791
Pointer size: 132 Bytes
Size of remote file: 1.67 MB

assets/tier_overview.png ADDED Viewed

Git LFS Details

SHA256: 18c7e1cbda91e534134e259079b458b904a1fac4132194f711f4cffc7b021c8b
Pointer size: 132 Bytes
Size of remote file: 1.85 MB

css/main.css CHANGED Viewed

@@ -163,6 +163,39 @@ main { padding: 24px 32px; }
   color: #555;
 }
 .citation-block {
   background: #f0f2f5;
   padding: 16px;

   color: #555;
 }
+.overview-img {
+  width: 100%;
+  max-width: 860px;
+  height: auto;
+  border-radius: 8px;
+  border: 1px solid #ddd;
+  margin: 12px 0;
+}
+.overview-table {
+  width: 100%;
+  border-collapse: collapse;
+  font-size: 0.9rem;
+  margin: 12px 0;
+}
+.overview-table th,
+.overview-table td {
+  padding: 8px 12px;
+  border: 1px solid #ddd;
+  text-align: left;
+}
+.overview-table th {
+  background: #f0f2f5;
+  font-weight: 600;
+  color: #333;
+}
+.overview-table td {
+  color: #444;
+}
 .citation-block {
   background: #f0f2f5;
   padding: 16px;

index.html CHANGED Viewed

@@ -13,9 +13,9 @@
   <nav id="topNav">
     <div class="nav-brand">LudoBench</div>
     <div class="nav-tabs">
-      <button class="tab-btn active" data-tab="leaderboard">Leaderboard</button>
       <button class="tab-btn" data-tab="browser">Dataset Browser</button>
-      <button class="tab-btn" data-tab="about">About</button>
     </div>
     <div class="nav-links">
       <a href="#" target="_blank">Paper</a>
@@ -26,7 +26,7 @@
   <main>
     <!-- ==================== LEADERBOARD TAB ==================== -->
-    <section id="tab-leaderboard" class="tab-panel active">
       <h2>Leaderboard</h2>
       <p class="tab-subtitle">
         Accuracy of multimodal models across <strong>5 board games</strong>,
@@ -56,7 +56,7 @@
           </div>
         </div>
         <div class="filter-group">
-          <span class="filter-label">Modality:</span>
           <div class="filter-buttons" data-filter="modality">
             <button class="filter-btn active" data-value="all">All</button>
             <button class="filter-btn" data-value="None">None</button>
@@ -118,85 +118,78 @@
     </section>
     <!-- ==================== ABOUT TAB ==================== -->
-    <section id="tab-about" class="tab-panel">
       <div class="about-content">
         <h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
         <div class="about-section">
           <h3>Abstract</h3>
           <p>
-            LudoBench is a multimodal game reasoning benchmark designed to evaluate how well
-            vision-enabled language models handle real-world tabletop strategy games. Unlike
-            prior benchmarks that focus mainly on familiar board games, LudoBench recreates the
-            experience of players learning and applying new rules for the first time.
           </p>
           <p>
-            LudoBench includes 638 annotated question&ndash;answer examples drawn from five diverse
-            games: <em>Kingdomino, Res Arcana, Pax Renaissance (2nd Edition), Carcassonne,</em> and
-            <em>Catan</em>. These games are chosen based on increasing complexity of game states
-            and rules comprehension.
           </p>
         </div>
         <div class="about-section">
-          <h3>Reasoning Tiers</h3>
-          <div class="tier-cards">
-            <div class="tier-card">
-              <h4>Tier 1: Environment Perception</h4>
-              <p>Recognizing objects, counting components, and identifying basic game state features from board images.</p>
-            </div>
-            <div class="tier-card">
-              <h4>Tier 2: Rules Integration</h4>
-              <p>Applying multimodal rulebook knowledge to answer questions requiring rule understanding and application.</p>
-            </div>
-            <div class="tier-card">
-              <h4>Tier 3: Short-Horizon Optimization</h4>
-              <p>Planning optimal short-term moves that require strategic reasoning about game mechanics and resource management.</p>
-            </div>
-          </div>
         </div>
         <div class="about-section">
-          <h3>Rulebook Modalities</h3>
-          <p>Models are evaluated under three rulebook modalities:</p>
-          <ul>
-            <li><strong>None</strong> &mdash; parametric knowledge only (no rulebook provided)</li>
-            <li><strong>Text</strong> &mdash; text-only rulebook provided in context</li>
-            <li><strong>Image</strong> &mdash; image-based rulebook pages provided</li>
-          </ul>
         </div>
         <div class="about-section">
-          <h3>Games</h3>
-          <div class="game-grid">
-            <div class="game-card">
-              <strong>Kingdomino</strong>
-              <p>Domino-based kingdom building with terrain matching and crown scoring.</p>
-            </div>
-            <div class="game-card">
-              <strong>Res Arcana</strong>
-              <p>Engine-building card game with essence management and artifact abilities.</p>
-            </div>
-            <div class="game-card">
-              <strong>Pax Renaissance</strong>
-              <p>Complex political strategy game with multiple victory conditions and card-driven actions.</p>
-            </div>
-            <div class="game-card">
-              <strong>Carcassonne</strong>
-              <p>Tile-placement game with feature scoring (cities, roads, fields, monasteries).</p>
-            </div>
-            <div class="game-card">
-              <strong>Catan</strong>
-              <p>Resource management and trading game with settlement building and development.</p>
-            </div>
-          </div>
         </div>
         <div class="about-section">
           <h3>Citation</h3>
-          <pre class="citation-block">@article{ludobench2026,
   title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
-  year={2026}
 }</pre>
         </div>

   <nav id="topNav">
     <div class="nav-brand">LudoBench</div>
     <div class="nav-tabs">
+      <button class="tab-btn active" data-tab="about">LudoBench Overview</button>
+      <button class="tab-btn" data-tab="leaderboard">Leaderboard</button>
       <button class="tab-btn" data-tab="browser">Dataset Browser</button>
     </div>
     <div class="nav-links">
       <a href="#" target="_blank">Paper</a>
   <main>
     <!-- ==================== LEADERBOARD TAB ==================== -->
+    <section id="tab-leaderboard" class="tab-panel">
       <h2>Leaderboard</h2>
       <p class="tab-subtitle">
         Accuracy of multimodal models across <strong>5 board games</strong>,
           </div>
         </div>
         <div class="filter-group">
+          <span class="filter-label">Rules Modality:</span>
           <div class="filter-buttons" data-filter="modality">
             <button class="filter-btn active" data-value="all">All</button>
             <button class="filter-btn" data-value="None">None</button>
     </section>
     <!-- ==================== ABOUT TAB ==================== -->
+    <section id="tab-about" class="tab-panel active">
       <div class="about-content">
         <h2>LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games</h2>
         <div class="about-section">
           <h3>Abstract</h3>
           <p>
+            We introduce <strong>LudoBench</strong>, a multimodal reasoning benchmark that evaluates whether vision-enabled large language models (LMs) can acquire, integrate, and reason over heterogeneous game knowledge in <strong>mainstream analog tabletop games</strong>. Unlike prior works that emphasize deep strategic mastery, LudoBench targets an initial reasoning challenge uninitiated gamers face: <strong>correctly comprehending a new tabletop strategy game for the first time</strong>. We examine whether, given a visual depiction of a tabletop scene and a corresponding ruleset, a model can correctly answer grounded questions about the pictured scenario.
           </p>
+          <p>Concretely, LudoBench tests three cumulative situated game-comprehension capabilities:</p>
+          <ul>
+            <li><strong>Environment Perception</strong> &mdash; recognizing objects, counting components, and identifying basic game state features</li>
+            <li><strong>Heterogeneous Rules Integration</strong> &mdash; applying multimodal rulebook knowledge to answer grounded questions</li>
+            <li><strong>Short-Horizon Optimization</strong> &mdash; planning optimal moves requiring strategic reasoning over game mechanics</li>
+          </ul>
+          <p>These progressively stress-test the foundational reasoning required for real-world game comprehension.</p>
           <p>
+            Evaluating frontier LMs on five diverse strategy games, we find that even the strongest models achieve only <strong>~76% accuracy</strong> on simple environment perception tasks and fall below <strong>13%</strong> on situated multi-step comprehension puzzles that hobbyist gamers can routinely solve. Our extensive failure analysis and knowledge-ablation experiments reveal that models largely <strong>fail to comprehend rich cross-modal reference knowledge</strong> and are subsequently unable to apply this knowledge to messy and unfamiliar situated environments. Our findings highlight the many steps remaining for current methods to succeed on complex multimodal reasoning in the real world.
           </p>
         </div>
         <div class="about-section">
+          <!-- Games overview: a figure of the five games with sample states, followed by a table of per-game statistics. -->
+          <h3>Games Overview</h3>
+          <p>
+            The dataset consists of five tabletop strategy games that vary widely in complexity, components, and rule structure. The details of each game&mdash;along with representative sample game states&mdash;are shown below.
+          </p>
+          <img class="overview-img" src="assets/games_overview_updated.png" alt="Overview of the five LudoBench games with representative sample game states" decoding="async">
+          <!-- Column headers carry scope="col" so assistive technology associates each data cell with its column. -->
+          <table class="overview-table">
+            <thead>
+              <tr>
+                <th scope="col">Game</th>
+                <th scope="col">Rulebook</th>
+                <th scope="col">Diff.</th>
+                <th scope="col">Unique Game Properties</th>
+                <th scope="col"># Rules</th>
+                <th scope="col"># Figs.</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr><td><em>Kingdomino</em></td><td>4 pg.</td><td>1.2</td><td>tile-laying, spatial scoring, individual player areas</td><td>35</td><td>6</td></tr>
+              <tr><td><em>Carcassonne</em></td><td>8 pg.</td><td>1.9</td><td>shared tile-laying, dynamic board topology, position-coded roles</td><td>39</td><td>30</td></tr>
+              <tr><td><em>Catan</em></td><td>16 pg.</td><td>2.3</td><td>network building, connectivity constraints, action chaining</td><td>44</td><td>19</td></tr>
+              <tr><td><em>Res Arcana</em></td><td>12 pg.</td><td>2.6</td><td>card-based interactions, heavy symbol usage, card orientation, action sequencing</td><td>112</td><td>31</td></tr>
+              <tr><td><em>Pax Ren. (2e)</em></td><td>44 pg.</td><td>4.6</td><td>shared map, private cards/tableau, large number of components, intricate ruleset</td><td>247</td><td>58</td></tr>
+            </tbody>
+          </table>
         </div>
         <div class="about-section">
+          <!-- Tiers overview: one example figure showing how questions differ per reasoning tier. -->
+          <h3>Tiers Overview</h3>
+          <p>
+            The benchmark evaluates models across three tiered reasoning levels that progressively increase in difficulty, from basic visual perception to rule integration and short-horizon planning. An example of how questions differ for each tier in Kingdomino is shown below:
+          </p>
+          <!-- Large figure further down the panel: defer loading until it nears the viewport. -->
+          <img class="overview-img" src="assets/tier_overview.png" alt="Example Kingdomino questions and answers for each of the three reasoning tiers" loading="lazy" decoding="async">
         </div>
         <div class="about-section">
+          <!-- Failure analysis: a single figure summarising common model failure patterns on Kingdomino. -->
+          <h3>Failure Analysis</h3>
+          <p>
+            We analyze where models go wrong by collecting common failure cases across multiple models and organizing them for visualization on Kingdomino. The table below summarizes the relevant rulebook rules, supporting annotations, and the observed model errors for each failure pattern.
+          </p>
+          <!-- Large figure further down the panel: defer loading until it nears the viewport. -->
+          <img class="overview-img" src="assets/failure_analysis.png" alt="Common Kingdomino failure patterns with the relevant rulebook rules, supporting annotations, and observed model errors" loading="lazy" decoding="async">
         </div>
         <div class="about-section">
           <h3>Citation</h3>
+          <pre class="citation-block">@inproceedings{peper2026ludobench,
   title={LudoBench: Evaluating Multimodal Reasoning through Real-World Tabletop Strategy Games},
+  author={Peper, Joseph J. and Gandra, Sai Krishna and Zhang, Yunxiang and Chennareddy, Vaibhav and Jha, Shloki and Payani, Ali and Wang, Lu},
+  booktitle={Proceedings of the Fourteenth International Conference on Learning Representations (ICLR)},
+  year={2026},
+  address={Rio de Janeiro, Brazil}
 }</pre>
         </div>

js/app.js CHANGED Viewed

@@ -14,10 +14,13 @@ document.addEventListener("DOMContentLoaded", () => {
       btn.classList.add("active");
       document.getElementById("tab-" + target).classList.add("active");
-      // Lazy-init the dataset viewer on first access
       if (target === "browser" && typeof initViewer === "function") {
         initViewer();
       }
     });
   });

       btn.classList.add("active");
       document.getElementById("tab-" + target).classList.add("active");
+      // Lazy-init on first access
       if (target === "browser" && typeof initViewer === "function") {
         initViewer();
       }
+      if (target === "leaderboard" && typeof initLeaderboard === "function") {
+        initLeaderboard();
+      }
     });
   });