eleusis-benchmark

Running

dlouapre HF Staff

Title

a2c0d8a about 2 months ago

1.45 kB

	---
	# Front matter: metadata for this article (title block, authorship,
	# licence notice, tags and display switches).
	title: "Can LLMs Play the Game of Science?"
	subtitle: "Evaluating scientific reasoning and metacognition using the card game Eleusis reveals distinct scientist personalities in large language models"
	description: "A benchmark for evaluating LLM scientific reasoning using the card game Eleusis, testing iterative hypothesis formation, calibration, and strategic experimentation."
	# One entry per author. The numbers in an author's `affiliations` list
	# presumably refer to entries of the top-level `affiliations` list below
	# (1-based) — TODO confirm against the site template.
	authors:
	  - name: "David Louapre"
	    url: "https://huggingface.co/dlouapre"
	    affiliations: [1]
	# Institutions referenced by the authors above.
	affiliations:
	  - name: "Hugging Face"
	    url: "https://huggingface.co"
	published: "Feb. 09, 2026"
	# Folded block scalar containing inline HTML; its content must stay
	# indented deeper than the key or the value parses as empty.
	# NOTE(review): the key is spelled `licence`; kept as-is because the
	# consumer of this front matter is not visible here.
	licence: >
	  Diagrams and text are licensed under <a href="https://creativecommons.org/licenses/by/4.0/" target="_blank" rel="noopener noreferrer">CC‑BY 4.0</a> with the source available on <a href="https://huggingface.co/spaces/dlouapre/eleusis-benchmark" target="_blank" rel="noopener noreferrer">Hugging Face</a>, unless noted otherwise.
	tags:
	  - LLM evaluation
	  - scientific reasoning
	  - benchmarks
	  - calibration
	# Display switches read by the rendering template (semantics not visible
	# in this file).
	tableOfContentsAutoCollapse: true
	pdfProOnly: false
	showPdf: true
	---

	import Introduction from "./chapters/eleusis/introduction.mdx";
	import Benchmark from "./chapters/eleusis/benchmark.mdx";
	import Results from "./chapters/eleusis/results.mdx";
	import Discussion from "./chapters/eleusis/discussion.mdx";
	import Appendix from "./chapters/eleusis/appendix.mdx";

	{/* Article body: renders the chapter components imported from ./chapters/eleusis/ in this order. */}
	<Introduction />

	<Benchmark />

	<Results />

	<Discussion />

	<Appendix />