First draft
- app/src/content/article.mdx +22 -37
- app/src/content/assets/figures/basic_metrics.csv +3 -0
- app/src/content/assets/figures/by_rule.json +3 -0
- app/src/content/assets/figures/by_rule.png +3 -0
- app/src/content/assets/figures/calibration_curves.json +3 -0
- app/src/content/assets/figures/calibration_curves.png +3 -0
- app/src/content/assets/figures/complexity_analysis.json +3 -0
- app/src/content/assets/figures/complexity_analysis.png +3 -0
- app/src/content/assets/figures/confidence_distribution.json +3 -0
- app/src/content/assets/figures/confidence_distribution.png +3 -0
- app/src/content/assets/figures/overall_performance.json +3 -0
- app/src/content/assets/figures/overall_performance.png +3 -0
- app/src/content/assets/figures/score_vs_failed_guesses.json +3 -0
- app/src/content/assets/figures/score_vs_failed_guesses.png +3 -0
- app/src/content/assets/figures/summary.txt +109 -0
- app/src/content/chapters/eleusis/analysis.mdx +82 -0
- app/src/content/chapters/eleusis/appendix.mdx +87 -0
- app/src/content/chapters/eleusis/benchmark.mdx +69 -0
- app/src/content/chapters/eleusis/conclusion.mdx +56 -0
- app/src/content/chapters/eleusis/introduction.mdx +34 -0
- app/src/content/chapters/eleusis/results.mdx +103 -0
app/src/content/article.mdx
CHANGED
@@ -1,57 +1,42 @@
  ---
- title: "
- subtitle: "
- description: "
+ title: "Are LLMs any good at the Science Game?\n Evaluating scientific reasoning using the card game Eleusis"
+ subtitle: "Testing LLM calibration and iterative hypothesis formation"
+ description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
  authors:
-   - name: "
-     url: "https://huggingface.co/
+   - name: "David Louapre"
+     url: "https://huggingface.co/dlouapre"
      affiliations: [1]
  affiliations:
    - name: "Hugging Face"
      url: "https://huggingface.co"
- published: "
- doi: 10.1234/abcd.efgh
+ published: "Jan. 22, 2026"
  licence: >
-   Diagrams and text are licensed under <a href="https://creativecommons.org/licenses/by/4.0/" target="_blank" rel="noopener noreferrer">CC‑BY 4.0</a> with the source available on <a href="https://huggingface.co/spaces/
-   Figures reused from other sources are excluded and marked in their captions (“Figure from …”).
+   Diagrams and text are licensed under <a href="https://creativecommons.org/licenses/by/4.0/" target="_blank" rel="noopener noreferrer">CC‑BY 4.0</a> with the source available on <a href="https://huggingface.co/spaces/dlouapre/eleusis-benchmark" target="_blank" rel="noopener noreferrer">Hugging Face</a>, unless noted otherwise.
  tags:
-   -
-   -
+   - LLM evaluation
+   - scientific reasoning
+   - benchmarks
+   - calibration
  tableOfContentsAutoCollapse: true
  pdfProOnly: false
  showPdf: true
  ---

- import Introduction from "./chapters/
- import
- import
- import
- import
- import
- import Markdown from "./chapters/demo/markdown.mdx";
- import Components from "./chapters/demo/components.mdx";
- import Greetings from "./chapters/demo/greetings.mdx";
- import VibeCodingCharts from "./chapters/demo/vibe-coding-charts.mdx";
- import ImportContent from "./chapters/demo/import-content.mdx";
+ import Introduction from "./chapters/eleusis/introduction.mdx";
+ import Benchmark from "./chapters/eleusis/benchmark.mdx";
+ import Results from "./chapters/eleusis/results.mdx";
+ import Analysis from "./chapters/eleusis/analysis.mdx";
+ import Conclusion from "./chapters/eleusis/conclusion.mdx";
+ import Appendix from "./chapters/eleusis/appendix.mdx";

  <Introduction />

- <
- <
- <
- <Markdown />
- <Components />
- <VibeCodingCharts />
- <ImportContent />
- <BestPractices />
- <Greetings />
+ <Benchmark />
+ <Results />
+ <Analysis />
+ <Conclusion />
+ <Appendix />
app/src/content/assets/figures/basic_metrics.csv
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e0bdb47eeb82b62a05a7d6dd2b3815404567be86ea4f7cc44a7f2e47a262d35
+ size 1372

app/src/content/assets/figures/by_rule.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7883abbd4a92c8f305c5c030315878579bb42d6acfcefe24d7d96d550f47120d
+ size 5864

app/src/content/assets/figures/by_rule.png
ADDED
Git LFS Details

app/src/content/assets/figures/calibration_curves.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6db808595939baa8afcef3106b6963d19940949b864a06a80c0b7e479d03b38e
+ size 5681

app/src/content/assets/figures/calibration_curves.png
ADDED
Git LFS Details

app/src/content/assets/figures/complexity_analysis.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ad53beba3b7e00c248664f291eaba015dd716be80013584479952bc26c79f83
+ size 1612

app/src/content/assets/figures/complexity_analysis.png
ADDED
Git LFS Details

app/src/content/assets/figures/confidence_distribution.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d35eb63310d743c06a7a5b401228792e3532d6c22880369d61b2d4efb213b1
+ size 5577

app/src/content/assets/figures/confidence_distribution.png
ADDED
Git LFS Details

app/src/content/assets/figures/overall_performance.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c620d1614704161071e6b3fdf51031228bc35a0aab8f70d6221f024a68e21e32
+ size 1413

app/src/content/assets/figures/overall_performance.png
ADDED
Git LFS Details

app/src/content/assets/figures/score_vs_failed_guesses.json
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abff24e6757f5f108647f4a42dcecf7f85a38f9b2dc509eab02884cd311d685d
+ size 1372

app/src/content/assets/figures/score_vs_failed_guesses.png
ADDED
Git LFS Details
app/src/content/assets/figures/summary.txt
ADDED
@@ -0,0 +1,109 @@
+ ============================================================
+ ELEUSIS RESULTS ANALYSIS
+ ============================================================
+
+ Analyzing: results/260121_78_rounds
+
+ Loading results...
+ Loaded 6 evaluation runs:
+   - solo_evaluation_20260120_091620_gpt_5_2_high
+   - solo_evaluation_20260120_091622_gpt_oss_120b
+   - solo_evaluation_20260121_070517_claude_haiku_4_5
+   - solo_evaluation_20260121_070518_gpt_5_mini_medium
+   - solo_evaluation_20260121_070520_gemini_3_flash_preview_low
+   - solo_evaluation_20260121_070522_gpt_oss_20b
+
+ Extracted 26 unique rules from results files
+ Built DataFrames: 468 rounds, 7836 turns
+ Loaded colors for 17 models
+
+ ============================================================
+ BASIC MODEL COMPARISON
+ ============================================================
+
+ model rounds_played total_score avg_score total_turns total_output_tokens total_wall_clock avg_failed_guesses success_rate avg_output_tokens_per_turn wall_clock_per_turn intra_rule_variance inter_rule_variance variance_ratio
+ Gpt 5.2 High 78 1102 14.128205 1200 3341037 73525.83 0.333333 0.961538 2784.197500 61.271525 25.346154 36.062906 0.702832
+ Gpt 5 Mini Medium 78 1001 12.833333 1247 3618399 58345.97 1.256410 0.756410 2901.683240 46.789070 40.051282 79.228889 0.505514
+ Gemini 3 Flash Preview Low 78 955 12.243590 1299 1581524 12702.02 1.717949 0.769231 1217.493457 9.778306 35.910256 81.480513 0.440722
+ Gpt Oss 120B 78 938 12.025641 1226 3190828 24633.15 3.692308 0.756410 2602.632953 20.092292 51.320513 80.710427 0.635860
+ Gpt Oss 20B 78 773 9.910256 1277 7009392 62397.50 6.205128 0.717949 5488.952232 48.862569 80.782051 122.849402 0.657570
+ Claude Haiku 4.5 78 713 9.141026 1223 6973411 57734.39 7.551282 0.705128 5701.889616 47.207187 88.576923 152.125983 0.582260
+
+ Saved: results/260121_78_rounds/basic_metrics.csv
+ Saved: results/260121_78_rounds/overall_performance.png
+ Saved: results/260121_78_rounds/overall_performance.json
+ Saved: results/260121_78_rounds/score_vs_failed_guesses.png
+ Saved: results/260121_78_rounds/score_vs_failed_guesses.json
+ Saved: results/260121_78_rounds/calibration_curves.png
+ Saved: results/260121_78_rounds/calibration_curves.json
+ Saved: results/260121_78_rounds/confidence_distribution.png
+ Saved: results/260121_78_rounds/confidence_distribution.json
+
+ ============================================================
+ BY-RULE ANALYSIS
+ ============================================================
+
+ Score by rule (sorted by avg_score):
+ rule_description count avg_score std_score success_rate
+ Only red cards (hearts or diamonds). 18 23.888889 2.541164 1.000000
+ Cards must alternate between red and black colors. Any card may start the line. 18 23.500000 3.166925 1.000000
+ Only cards of the suit spades. 18 23.444444 2.254987 1.000000
+ Only cards with an even rank (2,4,6,8,10,12). 18 22.333333 2.950573 1.000000
+ The card must be of a different suit than the card just before it. Any card may start the line. 18 19.277778 7.282578 0.944444
+ Card rank must have opposite odd/even parity to the previous card's rank. Any card may start the line. 18 19.000000 5.636019 1.000000
+ Only hearts, clubs, and diamonds allowed. Spades are forbidden. 18 18.333333 5.851093 0.944444
+ Only ranks that are prime numbers (2,3,5,7,11,13). 18 18.000000 6.859943 0.944444
+ The card must be of a different suit than but same color as the card just before it. Any card may start the line. 18 17.944444 9.295617 1.000000
+ Only spades and diamonds. 18 17.500000 4.973459 1.000000
+ Only face cards (11,12,13). 18 16.388889 9.356589 0.833333
+ Suits must repeat in the cyclic order hearts → spades → clubs → diamonds → hearts... Any card may start the line. 18 16.388889 7.769767 1.000000
+ Only Aces (rank 1) . 18 16.111111 9.682543 0.944444
+ Only cards between 1 and 7 inclusive. 18 10.277778 8.870344 0.944444
+ Only black face cards. 18 7.111111 10.093031 0.833333
+ Each card must have a rank greater or equal to the previous card. Only Ace can start the line. 18 6.277778 11.113349 0.500000
+ Each card must share at least one property with the previous card: same color, or same parity. Any card may start the line. 18 6.055556 11.305762 0.611111
+ Only red cards whose rank is <=7. 18 5.611111 10.330645 1.000000
+ Alternate face and number cards. Any card may start the line. 18 5.333333 12.362181 0.611111
+ Only cards between 5 and 9 inclusive. 18 4.500000 9.977917 0.888889
+ Suits must appear in pairs: card 1 and 2 same suit, cards 3 and 4 same suit (different from 1 and 2), cards 5 and 6 same suit (different from 3 and 4), etc. 18 1.944444 12.511041 0.777778
+ Face cards imposes the suit: if a face card is played, the next card must match its suit. Otherwise, the next card must be a different suit than it. 18 1.666667 3.880570 0.333333
+ Rank repeats in pairs: ranks must come in doubles: (x, x), then (y, y) with y different from x, then (z, z) with z different from y, etc. 18 1.444444 4.217920 0.111111
+ If the previous card was red, rank must increase or be equal; if black, rank must decrease or be equal. Starting card must be between 5 and 9 inclusive. 18 1.444444 5.690262 0.277778
+ Hearts and spades form Group A; clubs and diamonds form Group B. Alternate between groups. Any card may start the line. 18 0.833333 6.242643 0.277778
+ Face cards (11-13) must be red; number cards (1-10) must be black. 18 -0.055556 7.255604 0.444444
+
+ Saved: results/260121_78_rounds/by_rule.png
+ Saved: results/260121_78_rounds/by_rule.json
+
+ ============================================================
+ COMPLEXITY ANALYSIS
+ ============================================================
+
+ Optimal K for aggregated complexity: 0.05
+ Formula: complexity = cyclomatic + 0.05 * node_count
+ Correlation with relative_score: -0.429
+
+ Score by complexity quartile:
+ complexity_bin count avg_score avg_relative_score success_rate
+ Q1 144 16.909722 1.478439 0.944444
+ Q2 90 12.911111 1.105104 0.877778
+ Q3 126 12.150794 1.021103 0.761905
+ Q4 108 3.277778 0.249874 0.490741
+
+ Saved: results/260121_78_rounds/complexity_analysis.png
+ Saved: results/260121_78_rounds/complexity_analysis.json
+
+ ============================================================
+ PER-MODEL REPORTS
+ ============================================================
+
+ Saved: results/260121_78_rounds/model_gpt_5_2_high.png
+ Saved: results/260121_78_rounds/model_gpt_oss_120b.png
+ Saved: results/260121_78_rounds/model_claude_haiku_4_5.png
+ Saved: results/260121_78_rounds/model_gpt_5_mini_medium.png
+ Saved: results/260121_78_rounds/model_gemini_3_flash_preview_low.png
+ Saved: results/260121_78_rounds/model_gpt_oss_20b.png
+
+ ============================================================
+ Analysis complete! All outputs saved to: results/260121_78_rounds
+ ============================================================
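The aggregated complexity formula reported in summary.txt (complexity = cyclomatic + 0.05 * node_count, with K = 0.05 found optimal) can be sketched as a small helper. `cyclomatic` and `node_count` are assumed to be pre-computed from each rule's compiled Python function; the function name is illustrative, not from the repository:

```python
def rule_complexity(cyclomatic: int, node_count: int, k: float = 0.05) -> float:
    """Aggregated complexity from summary.txt: cyclomatic complexity
    plus a small per-AST-node penalty weighted by k."""
    return cyclomatic + k * node_count

# A hypothetical rule whose checker has cyclomatic complexity 3 and 40 AST nodes
print(rule_complexity(3, 40))  # 5.0
```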
app/src/content/chapters/eleusis/analysis.mdx
ADDED
@@ -0,0 +1,82 @@
+ import Note from "../../../components/Note.astro";
+ import Sidenote from "../../../components/Sidenote.astro";
+ import Accordion from "../../../components/Accordion.astro";
+
+ ## Deeper Analysis
+
+ ### Learning Curves
+
+ How do models improve within a single round? We tracked confidence and hypothesis quality over turn number to understand the learning dynamics.
+
+ <Note variant="info">
+ **TODO**: Add figure showing line plot of average confidence by turn number, colored by eventual success/failure.
+ </Note>
+
+ Key observations:
+ - **Successful rounds** typically show steadily increasing confidence with occasional drops when hypotheses are revised
+ - **Failed rounds** often show erratic confidence or premature plateaus where models become stuck on incorrect hypotheses
+ - **Acceptance rate decreases** over time as obvious cards are exhausted from the hand
+
+ <Sidenote>
+ The turn-by-turn reasoning traces provide rich data for understanding model behavior beyond simple success/failure metrics.
+ </Sidenote>
+
+ ### Failure Modes
+
+ When models fail, why? We identified several recurring patterns:
+
+ <Accordion title="Failure mode taxonomy" open>
+
+ 1. **Premature guessing**: High confidence, wrong rule, insufficient evidence. The model becomes convinced too early based on limited data.
+
+ 2. **Hypothesis fixation**: Stuck on wrong rule despite contradictory evidence. The model fails to update when new observations conflict with its theory.
+
+ 3. **Overfitting**: Rule matches all observations but is more specific than the actual rule (e.g., guessing "only red hearts" when the rule is "only red cards").
+
+ 4. **Underfitting**: Rule is too simple and fails to capture necessary conditions (e.g., guessing "black cards" when the rule is "black even cards").
+
+ 5. **Position blindness**: Fails on rules depending on position in the mainline or relationship to previous cards.
+
+ </Accordion>
+
+ <Note variant="info">
+ **TODO**: Add stacked bar chart showing distribution of failure modes by model.
+ </Note>
+
+ ### Symmetric Rules
+
+ An interesting test: are symmetric rules equally difficult? For example, "only spades" vs "only non-spades" should be logically equivalent in difficulty, but models might have biases.
+
+ We found that:
+ - Negative rules ("not X") are generally harder than positive rules ("only X")
+ - Rules involving rare events (low acceptance rate) are harder than rules with high acceptance rates
+ - This may reflect training data biases where positive examples are more common
+
+ ### Confirmation Bias
+
+ Do models exhibit confirmation bias—preferring to play cards that confirm their current hypothesis rather than cards that could falsify it?
+
+ <Sidenote>
+ A good scientist designs experiments that could prove them wrong, not just experiments that confirm what they already believe.
+ </Sidenote>
+
+ Preliminary analysis suggests:
+ - Models do show some tendency toward confirmation-seeking behavior
+ - When confident in a hypothesis, models prefer "safe" plays that are likely to be accepted
+ - Strategic exploration (playing cards specifically to test hypothesis boundaries) is rare
+
+ ### Qualitative Observations
+
+ Examining individual reasoning traces reveals interesting patterns:
+
+ <Accordion title="Example: Hypothesis revision">
+
+ In one game with the rule "alternating odd/even ranks," a model initially hypothesized "increasing ranks" based on the first few accepted cards. When a lower-ranked card was accepted, instead of abandoning the hypothesis entirely, the model revised it to "ranks must differ from previous." This partial update eventually led to discovering the true rule—a good example of iterative refinement.
+
+ </Accordion>
+
+ <Accordion title="Example: Fixation failure">
+
+ With the rule "only face cards (J, Q, K)," one model became fixated on "only red cards" after the first three accepted cards happened to be red face cards. Despite subsequently seeing black face cards accepted, the model kept trying to reconcile observations with a color-based rule, eventually running out of turns.
+
+ </Accordion>
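The calibration discussed in analysis.mdx is quantified in the appendix as the mean absolute difference between stated confidence and empirical success rate at that confidence level. One reading of that definition, as a sketch — binning by the 0–10 confidence scale and averaging the per-bin gaps unweighted (the true implementation may weight bins by sample count; function and variable names are illustrative):

```python
from collections import defaultdict

def calibration_error(records):
    """records: iterable of (stated_confidence_0_to_10, was_correct) pairs.
    Groups guesses by stated confidence, then averages the absolute gap
    between each bin's stated probability (conf / 10) and its empirical
    success rate."""
    bins = defaultdict(list)
    for conf, correct in records:
        bins[conf].append(correct)
    gaps = [abs(conf / 10 - sum(outcomes) / len(outcomes))
            for conf, outcomes in bins.items()]
    return sum(gaps) / len(gaps)

# Two bins: confidence 7 (one of two correct) and confidence 9 (all correct)
records = [(7, True), (7, False), (9, True), (9, True)]
print(round(calibration_error(records), 3))  # 0.15
```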
app/src/content/chapters/eleusis/appendix.mdx
ADDED
@@ -0,0 +1,87 @@
+ import Accordion from "../../../components/Accordion.astro";
+ import Note from "../../../components/Note.astro";
+
+ ## Appendix: Detailed Methods
+
+ ### Models Evaluated
+
+ <Accordion title="Model configurations" open>
+
+ All models were evaluated using their respective APIs with the following settings:
+
+ | Parameter | Value |
+ |-----------|-------|
+ | Temperature | 0.0 (deterministic) |
+ | Max tokens | 4096 |
+ | Retries | 3 (on API failures) |
+
+ Reasoning models (o1, o3-mini, etc.) were allowed their default reasoning budgets. Standard models used the base inference without chain-of-thought prompting beyond what's included in the game prompt.
+
+ </Accordion>
+
+ ### Rule Checking
+
+ <Accordion title="Rule verification methodology">
+
+ Rules are created by hand and expressed in natural language. Each rule is then compiled into a Python function using an LLM, with manual verification of correctness.
+
+ When the model outputs a guessed rule, we:
+ 1. Compile the guess into a Python function using the same LLM
+ 2. Test the compiled function against all cards played in that game
+ 3. Mark the guess as correct only if it matches the true rule's behavior on all observations
+
+ This simulation-based approach avoids issues with semantic equivalence in natural language. For instance, "same color as previous card" and "red cards only" might be equivalent given a specific game history starting with a red card, but would differ on other histories.
+
+ </Accordion>
+
+ ### Prompt Structure
+
+ <Accordion title="Full prompt template">
+
+ The prompt includes:
+
+ 1. **Game rules**: Complete explanation of how Eleusis works, without mentioning the game's name to avoid potential training data leakage
+
+ 2. **Scoring system**: Explicit explanation of the scoring formula and strategic implications
+
+ 3. **Response format**: JSON schema specifying required fields (reasoning, card choice, tentative rule, confidence, guess decision)
+
+ 4. **Game state**: Current mainline, all sidelines, current hand, and reasoning from the previous 3 turns
+
+ 5. **Format reminders**: Instructions for confidence scale interpretation (7 = 70% probability)
+
+ </Accordion>
+
+ ### Evaluation Metrics
+
+ <Accordion title="Metric definitions">
+
+ - **Success rate**: Fraction of games where the model correctly identified the rule before running out of turns
+ - **Average score**: Mean score across all games, including zeros for failed games
+ - **Calibration error**: Mean absolute difference between stated confidence and empirical success rate at that confidence level
+ - **Failed guesses**: Average number of incorrect formal guesses per game
+ - **Turns to success**: For successful games, mean number of turns before correct guess
+
+ </Accordion>
+
+ ### References
+
+ <Accordion title="Bibliography">
+
+ - Abbott, R. (1963). "Eleusis" — Original game rules and design philosophy
+ - Guo, C., et al. (2017). "On Calibration of Modern Neural Networks" — Foundational work on neural network calibration
+ - Chollet, F. (2019). "On the Measure of Intelligence" — ARC benchmark and discussion of abstract reasoning
+ - Recent LLM reasoning benchmarks: GSM8K, MATH, ARC-AGI, BIG-Bench, etc.
+
+ </Accordion>
+
+ <Note>
+ Full code, data, and model outputs are available in the benchmark repository.
+ </Note>
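The simulation-based verification described in appendix.mdx — replay every card played and require identical accept/reject decisions from the true rule and the compiled guess — could be sketched as follows. The `Card` class, the two example rule functions, and `guess_matches` are illustrative stand-ins, not the repository's actual implementation:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Card:
    rank: int   # 1-13 (Ace through King)
    suit: str   # "hearts", "diamonds", "clubs", "spades"

# A compiled rule takes the candidate card and the mainline of accepted cards so far.
def true_rule(card, mainline):
    """'Only red cards (hearts or diamonds).'"""
    return card.suit in ("hearts", "diamonds")

def guessed_rule(card, mainline):
    """An overfitted guess: 'only hearts'."""
    return card.suit == "hearts"

def guess_matches(true_fn, guess_fn, plays):
    """Replay every card played in the game; the guess is correct only if
    it makes the same accept/reject decision as the true rule throughout."""
    mainline = []
    for card in plays:
        if true_fn(card, mainline) != guess_fn(card, mainline):
            return False
        if true_fn(card, mainline):
            mainline.append(card)
    return True

plays = [Card(5, "hearts"), Card(9, "diamonds"), Card(2, "spades")]
print(guess_matches(true_rule, guessed_rule, plays))  # False: diverges on the diamond
```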
app/src/content/chapters/eleusis/benchmark.mdx
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Sidenote from "../../../components/Sidenote.astro";
|
| 2 |
+
import Note from "../../../components/Note.astro";
|
| 3 |
+
import Accordion from "../../../components/Accordion.astro";
|
| 4 |
+
|
| 5 |
+
## The Eleusis Benchmark
|
| 6 |
+
|
| 7 |
+
### The Original Game
|
| 8 |
+
|
| 9 |
+
In the original Eleusis card game, one player acts as the "dealer" (sometimes called "God" or "Nature") and secretly invents a rule determining which cards can be legally played. The other players don't know this rule—they must discover it through experimentation.
|
| 10 |
+
|
| 11 |
+
Players take turns playing cards from their hand onto a central "mainline." If a card satisfies the secret rule, it's accepted and added to the mainline. If it violates the rule, it's rejected and placed in a "sideline" below the mainline at that position. Over time, the pattern of accepted and rejected cards provides evidence about the hidden rule.
|
| 12 |
+
|
| 13 |
+
<Sidenote>
|
| 14 |
+
The name "Eleusis" comes from the ancient Greek mystery cult, where initiates gradually discovered hidden truths.
|
| 15 |
+
</Sidenote>
|
| 16 |
+
|
| 17 |
+
At any point, a player can attempt to guess the rule; correctly identifying it ends the game. A specific scoring system rewards efficiency in discovering the rule while penalizing reckless guessing.
|
| 18 |
+
|
| 19 |
+
### Our Adaptation
|
| 20 |
+
|
| 21 |
+
We adapted Eleusis into a single-player benchmark focused purely on the scientific reasoning process. By removing multi-player dynamics, we isolate the core challenge: hypothesis formation and testing under uncertainty.
|
| 22 |
+
|
| 23 |
+
The game uses a standard 52-card deck with ranks 1–13 (Ace through King) and four suits. A secret rule—a deterministic function that takes the card being played and the current sequence of accepted cards (the "mainline")—determines whether each card is accepted or rejected. The player maintains a hand of 12 cards, drawing a replacement after each play.
|
| 24 |
+
|
| 25 |
+
On each turn, the player selects a card from their hand to play. If the card satisfies the secret rule, it joins the mainline; if rejected, it's placed in a sideline below the mainline at that position. At any point, the player may attempt to guess the rule.
|
| 26 |
+
|
| 27 |
+
<Sidenote>
|
| 28 |
+
We chose 12-card hands to give models enough options for strategic experimentation.
|
| 29 |
+
</Sidenote>
|
| 30 |
+
|
| 31 |
+
The game lasts at most 30 turns, with scoring designed to reward efficiency while penalizing reckless guessing:
|
| 32 |
+
|
| 33 |
+
$$\text{score} = (30 - \text{turns\_used}) - 2 \times \text{wrong\_guesses}$$
|
| 34 |
+
|
| 35 |
+
A player who correctly identifies the rule on turn 10 with no wrong guesses scores 20 points; one who made 3 wrong guesses along the way scores only 14. Failing to identify the rule scores 0. This creates an interesting tension: guessing early yields more points if correct, but wrong guesses are costly. The optimal strategy requires accurately assessing one's own confidence—exactly the calibration we want to measure.
|
| 36 |
+
|
| 37 |
+
<Note variant="info">
|
| 38 |
+
**TODO**: Add figure showing an example turn with the game state (mainline with sidelines) and the model's structured response.
|
| 39 |
+
</Note>
|
| 40 |
+
|
| 41 |
+
### Rule Library
|
| 42 |
+
|
| 43 |
+
We created a library of 26 hand-crafted rules spanning a range of types and complexity. Some rules involve simply card properties (e.g., "only red cards"), while others depend on the sequence of previously accepted cards (e.g., "card rank must be higher than previous card"). The rule might involve rank, suits, color or a combination thereof, and may include positional dependencies.
|
| 44 |
+
|
| 45 |
+
| Category | Examples |
|
| 46 |
+
|----------|----------|
|
| 47 |
+
| Static property | "Only red cards", "Only face cards (J, Q, K)" |
|
| 48 |
+
| Combined properties | "Only hearts with rank ≤7", "Only red face cards" |
|
| 49 |
+
| Sequential | "Rank must be higher than previous card" |
|
| 50 |
+
| Cyclic patterns | "Alternate between odd and even ranks", "Suits cycle: ♥→♠→♣→♦" |
|
| 51 |
+
| Complex conditionals | "Same suit as previous OR rank differs by exactly 2" |
|
| 52 |
+
|
| 53 |
+
Each rule is played 3 times with different random seeds (affecting the initial hand and deck order). This ensures every model is tested on the same deck sequences for a given seed, and captures variance in performance when the starting hand differs.
### What the LLM Must Do

On each turn, the model receives the complete game state: the mainline of accepted cards, the sidelines of rejected cards at each position, its current hand, and its reasoning history from the 3 previous turns. It must output a structured response containing:

<Accordion title="Structured response format" open>

1. **Reasoning summary**: A brief explanation of its current thinking
2. **Card choice**: Which card to play from its hand
3. **Tentative rule**: Its current best hypothesis about the secret rule
4. **Confidence level**: A self-reported probability (0–10 scale, where 7 means "I estimate a 70% chance my tentative rule is correct")
5. **Guess decision**: Whether to formally guess the rule this turn

</Accordion>
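The five fields above map naturally onto a small schema. A sketch of what such a structure might look like (field names are illustrative; the benchmark's actual schema may differ):

```python
from dataclasses import dataclass

@dataclass
class TurnResponse:
    reasoning_summary: str   # brief explanation of current thinking
    card_choice: str         # e.g. "7H" for the seven of hearts
    tentative_rule: str      # current best hypothesis, in plain language
    confidence: int          # 0-10 scale; 7 means ~70% estimated probability
    guess_rule: bool         # whether to formally guess this turn

    def __post_init__(self):
        if not 0 <= self.confidence <= 10:
            raise ValueError("confidence must be on the 0-10 scale")

# Example of a parsed response mid-game:
resp = TurnResponse(
    reasoning_summary="Red cards keep getting accepted; black rejected.",
    card_choice="2S",        # play a black card to probe the hypothesis
    tentative_rule="Only red cards are accepted.",
    confidence=6,
    guess_rule=False,
)
```

Requiring a tentative rule and confidence on every turn, even when the model declines to guess, is what makes the calibration analysis below possible.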

This structure lets us analyze not just whether models succeed, but *how* they reason: Do they update hypotheses appropriately when evidence contradicts them? Do they explore strategically or play conservatively? Is their stated confidence calibrated to their actual accuracy?
app/src/content/chapters/eleusis/conclusion.mdx
ADDED
@@ -0,0 +1,56 @@
import Note from "../../../components/Note.astro";
import Sidenote from "../../../components/Sidenote.astro";

## Conclusion

### Key Findings

Our evaluation of LLMs on the Eleusis benchmark reveals several important insights:

1. **LLMs can do inductive reasoning**—but with significant variation across models. The best models successfully discover hidden rules through iterative experimentation, while others struggle with basic hypothesis formation.

2. **Complexity matters**—simple rules are easy, complex rules are hard. This isn't surprising, but our benchmark provides quantitative measurements of how different complexity factors affect performance.

3. **Calibration is imperfect**—models don't always know what they don't know. Most models show systematic overconfidence, particularly at high stated confidence levels.

4. **Reasoning traces are valuable**—the turn-by-turn data reveals how models think, exposing failure modes that wouldn't be visible from success/failure metrics alone.

<Sidenote>
The gap between the best and worst models is substantial, suggesting this benchmark captures meaningful capability differences.
</Sidenote>

### Limitations

This work has several important limitations:

- **Rule library scope**: 26 hand-crafted rules may not cover all types of scientific reasoning. Real-world hypothesis formation involves much more complex domains.

- **Statistical power**: 3 seeds per rule provides limited data for variance estimates. Some effects may not be reliably estimated.

- **Prompt sensitivity**: Different prompts might yield different results. We used a single carefully designed prompt but did not extensively test prompt variations.

- **No human baseline**: Without human performance data on the same rules, it's hard to contextualize whether model performance is "good" or "bad" in absolute terms.

- **Cost and API differences**: Models have different pricing and rate limits, which affects practical deployment considerations not captured here.

### What's Next

Several directions for future work:

- **More models**: As new models are released, evaluating them on this benchmark will help track progress in scientific reasoning capabilities.

- **More rules**: Expanding the rule library to cover additional reasoning patterns (temporal rules, multi-step dependencies, etc.).

- **Human comparisons**: Collecting human performance data would provide crucial context for interpreting model capabilities.

- **Interactive exploration**: Building tools to explore individual game traces could help researchers understand model reasoning more deeply.

<Note variant="info">
The benchmark is open source. Try it yourself and contribute new rules or model evaluations!
</Note>

### Final Thoughts

The Eleusis benchmark offers a window into capabilities that matter for real-world scientific reasoning: iterative hypothesis refinement, strategic experimentation, and calibrated confidence. While current LLMs show promising capabilities, significant gaps remain—particularly in calibration and avoiding cognitive biases like hypothesis fixation.

As LLMs are increasingly deployed to assist with scientific research, understanding these limitations becomes crucial. A model that is brilliant at generating hypotheses but systematically overconfident could lead researchers down unproductive paths. The Eleusis benchmark provides one lens for evaluating and improving these capabilities.
app/src/content/chapters/eleusis/introduction.mdx
ADDED
@@ -0,0 +1,34 @@
import Sidenote from "../../../components/Sidenote.astro";
import Note from "../../../components/Note.astro";

Large language models are increasingly being deployed as tools for scientific research—analyzing data, generating hypotheses, and even designing experiments. But how well do they actually embody the scientific method?

<Sidenote>
Read time: 15–20 minutes.
</Sidenote>

Most reasoning benchmarks test whether models can solve well-defined problems: given premises, derive a conclusion. The ARC challenge, for instance, evaluates inductive reasoning on visual patterns. These benchmarks capture important capabilities, but they miss something fundamental about how science actually works.

Real scientific reasoning is not a single inference step. It's an iterative process of observation, hypothesis formation, experimentation, and refinement—often spanning many cycles before reaching a conclusion. It requires not just logical ability, but also *strategic* thinking: which experiment to run next, how much evidence is enough, when to commit to a theory versus when to keep exploring.

<Sidenote>
Think of debugging code or diagnosing a medical condition—both follow this same iterative pattern.
</Sidenote>

Beyond pure reasoning, effective science depends on psychological factors that are rarely evaluated: **calibration** (does my confidence match my actual accuracy?), **metacognition** (how certain am I about my uncertainty?), and resistance to **cognitive biases** like confirmation bias (seeking only evidence that supports my current hypothesis). A scientist who is brilliant at deduction but overconfident in weak theories will waste resources pursuing dead ends. One who is well-calibrated but overly cautious may never publish.

We wanted to test whether LLMs can exhibit these deeper aspects of scientific reasoning. To do this, we turned to an unlikely source: a 1950s card game called Eleusis.

## The Eleusis Game

Eleusis was designed by Robert Abbott explicitly to simulate the process of scientific discovery. In the game, one player invents a secret rule governing which cards can be played, and the other players must deduce the rule through experimentation—playing cards and observing whether they are accepted or rejected.

It's a microcosm of the scientific method: the rule is a hidden law of nature, each card play is an experiment, and the sequence of accepted and rejected cards is the accumulating evidence.

<Note variant="info">
**TODO**: Add figure showing an example Eleusis game sequence with the secret rule "alternating colors" (red, black, red, black...).
</Note>

We built a benchmark around Eleusis to evaluate LLMs on this iterative, hypothesis-driven reasoning. Rather than testing knowledge retrieval or instruction-following, our benchmark asks: can models act like scientists? Can they observe evidence, form hypotheses, design informative experiments, and refine their theories? Can they calibrate their confidence appropriately and know when they've gathered enough evidence to commit to a conclusion?

These skills are fundamental not just to science, but to debugging code, diagnosing problems, and everyday reasoning under uncertainty.
app/src/content/chapters/eleusis/results.mdx
ADDED
@@ -0,0 +1,103 @@
import Image from "../../../components/Image.astro";
import Wide from "../../../components/Wide.astro";
import Note from "../../../components/Note.astro";
import Sidenote from "../../../components/Sidenote.astro";

import overallPerformance from "../../assets/figures/overall_performance.png";
import calibrationCurves from "../../assets/figures/calibration_curves.png";
import confidenceDistribution from "../../assets/figures/confidence_distribution.png";
import scoreVsFailedGuesses from "../../assets/figures/score_vs_failed_guesses.png";
import byRule from "../../assets/figures/by_rule.png";
import complexityAnalysis from "../../assets/figures/complexity_analysis.png";

## Results

### Overall Performance

We evaluated a range of models on the Eleusis benchmark. Performance varies significantly across models, correlating with both model size and reasoning effort (measured by output token usage).

<Wide>
<Image
  src={overallPerformance}
  alt="LLM performance on Eleusis benchmark: 2D scatter plot showing average score vs output token count for each model"
  caption="<strong>Figure 1:</strong> Overall model performance on the Eleusis benchmark. Each point represents a model, with position showing average score vs. token usage. Larger reasoning budgets generally correlate with better performance."
  id="fig-overall"
  zoomable
/>
</Wide>

<Sidenote>
Token usage serves as a proxy for "thinking effort"—models that produce longer reasoning traces tend to perform better.
</Sidenote>

### Confidence and Calibration

Models are asked to output their confidence level, with clear instructions on what it means (7 = 70% probability of being correct, etc.). Even when they don't guess, they report their tentative rule. When confidence ≥5, we test whether they would have guessed correctly.

<Image
  src={calibrationCurves}
  alt="Calibration curves showing reported confidence vs actual success rate for all models"
  caption="<strong>Figure 2:</strong> Calibration curves for each model. A perfectly calibrated model would follow the diagonal. Points above the line indicate overconfidence; points below indicate underconfidence."
  id="fig-calibration"
  zoomable
/>

The calibration analysis reveals several patterns:

- **Most models are overconfident** at high confidence levels—when they report 90% confidence, actual success rates are often closer to 70%
- **Some models are well-calibrated** at lower confidence levels but diverge as confidence increases
- **Reasoning models** tend to show better calibration overall
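A calibration curve like those in Figure 2 can be computed by grouping turns by stated confidence level and comparing each level's implied probability to its empirical success rate. A minimal sketch (the data and variable names here are illustrative):

```python
from collections import defaultdict

def calibration_curve(records):
    """Group (confidence, correct) pairs by stated confidence level
    and return (implied_probability, empirical_accuracy) points."""
    buckets = defaultdict(list)
    for confidence, correct in records:  # confidence on the 0-10 scale
        buckets[confidence].append(correct)
    points = []
    for level in sorted(buckets):
        outcomes = buckets[level]
        points.append((level / 10, sum(outcomes) / len(outcomes)))
    return points

# Illustrative data: a model that reports 9/10 confidence but is right
# only ~67% of the time at that level is overconfident there.
records = [(9, True), (9, True), (9, False), (5, True), (5, False)]
print(calibration_curve(records))  # [(0.5, 0.5), (0.9, 0.6666666666666666)]
```

Plotting these points against the diagonal gives the reliability diagram; the gap between a point and the diagonal is the miscalibration at that confidence level.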

<Image
  src={confidenceDistribution}
  alt="Histogram showing distribution of confidence levels when models choose to guess vs not guess"
  caption="<strong>Figure 3:</strong> Distribution of confidence levels. Left: when models choose to formally guess. Right: when models choose not to guess. Well-calibrated models should show clear separation between these distributions."
  id="fig-confidence"
  zoomable
/>

### Guessing Strategy

The scoring system creates a strategic tension: guess early for more points, but wrong guesses are costly. How do models navigate this tradeoff?
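At its core, this tradeoff is an expected-value decision. As an illustration only (the point values below are invented for the example, not the benchmark's actual scoring):

```python
def should_guess(p_correct: float, reward: float, penalty: float,
                 expected_future_reward: float) -> bool:
    """Guess now if the expected value of guessing beats waiting.

    p_correct: estimated probability the tentative rule is right.
    reward: points for a correct guess now.
    penalty: points lost on a wrong guess.
    expected_future_reward: value of gathering more evidence first.
    """
    ev_guess = p_correct * reward - (1 - p_correct) * penalty
    return ev_guess > expected_future_reward

# With a well-calibrated confidence of 0.6, a 10-point reward, a 5-point
# penalty, and ~3 points of expected value from waiting another turn,
# guessing now has EV 0.6*10 - 0.4*5 = 4 > 3, so it is worthwhile.
print(should_guess(0.6, reward=10, penalty=5, expected_future_reward=3))
```

The key dependence is on `p_correct`: an overconfident model overestimates it and guesses too early, while an underconfident one keeps gathering evidence past the point of diminishing returns.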

<Image
  src={scoreVsFailedGuesses}
  alt="2D scatter plot showing average score vs average number of failed guesses per round for each model"
  caption="<strong>Figure 4:</strong> Score vs. failed guesses per round. Models in the upper-left are efficient (high scores, few wrong guesses). Models that guess recklessly appear on the right with low scores."
  id="fig-guessing"
  zoomable
/>

<Sidenote>
The optimal strategy depends on accurate self-assessment—knowing when you've gathered enough evidence to commit.
</Sidenote>

### Performance by Rule

Not all rules are created equal. Some rules are discovered quickly by all models, while others prove consistently challenging.

<Wide>
<Image
  src={byRule}
  alt="Performance breakdown by rule showing score distribution for each rule across all models"
  caption="<strong>Figure 5:</strong> Score distribution by rule. Each row is a different rule, with individual run scores shown as points. Some rules show high variance (sensitive to initial conditions), while others are consistently easy or hard."
  id="fig-by-rule"
  zoomable
/>
</Wide>

### Rule Complexity

What makes some rules harder than others? We examined several factors: acceptance rate (rules that accept few cards provide less positive evidence), code complexity of the rule implementation, and semantic complexity.

<Image
  src={complexityAnalysis}
  alt="Scatter plot showing relationship between rule complexity metrics and model performance"
  caption="<strong>Figure 6:</strong> Relationship between rule complexity and performance. Multiple complexity factors contribute: acceptance rate, structural complexity, and semantic difficulty."
  id="fig-complexity"
  zoomable
/>

<Note variant="info">
Interestingly, code complexity (cyclomatic complexity, AST node count) doesn't perfectly predict difficulty. Semantically simple rules like "only face cards" can be harder than structurally complex rules if the semantic concept is unfamiliar to models.
</Note>
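Structural metrics like AST node count are cheap to compute from a rule's implementation, which is part of their appeal. A sketch using Python's standard `ast` module (the rule sources below are illustrative, not the benchmark's actual implementations):

```python
import ast

def ast_node_count(source: str) -> int:
    """Count all nodes in the abstract syntax tree of a rule's source."""
    return sum(1 for _ in ast.walk(ast.parse(source)))

simple_rule = "def accepts(card): return card.is_red"
complex_rule = (
    "def accepts(prev, card):\n"
    "    return card.suit == prev.suit or abs(card.value - prev.value) == 2"
)

# The structurally richer rule yields a larger tree, even though the
# "only red cards" concept may be no easier for a model to verbalize.
print(ast_node_count(simple_rule) < ast_node_count(complex_rule))  # True
```

This illustrates why such metrics diverge from observed difficulty: they measure the syntax of the implementation, not how familiar or nameable the underlying concept is.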