EASI / leaderboard.json
Hrant's picture
Update leaderboard via Leaderboarder
6f644d3 verified
[
{
"model_name":"Human",
"score":79.2,
"VSI [66]":"79.2",
"SITE [57]":"67.5",
"MMSI [68]":"97.2",
"OmniSpatial [23]":"92.63",
"MindCube ∗ [69]":"94.55",
"STARE [32]":"96.50",
"CoreCognition [33]":"86.98",
"SpatialViz [55]":"82.46",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Qwen3-8B-Instruct [65]",
"score":57.9,
"VSI [66]":"57.90",
"SITE [57]":"45.83",
"MMSI [68]":"31.10",
"OmniSpatial [23]":"45.73",
"MindCube ∗ [69]":"29.42",
"STARE [32]":"39.76",
"CoreCognition [33]":"69.67",
"SpatialViz [55]":"17.54 †",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"† indicates cases where generations were truncated due to overlong chains of thought, yielding no final answer; such instances are counted as incorrect, which depresses the score."
},
{
"model_name":"InternVL3.5-8B [56]",
"score":56.05,
"VSI [66]":"56.05",
"SITE [57]":"43.79",
"MMSI [68]":"27.30",
"OmniSpatial [23]":"46.71",
"MindCube ∗ [69]":"42.50",
"STARE [32]":"40.18",
"CoreCognition [33]":"66.40",
"SpatialViz [55]":"23.98",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"GPT-5-2025-08-07 [45]",
"score":55.03,
"VSI [66]":"55.03",
"SITE [57]":"61.88",
"MMSI [68]":"41.80",
"OmniSpatial [23]":"59.90",
"MindCube ∗ [69]":"56.30",
"STARE [32]":"54.59",
"CoreCognition [33]":"84.37",
"SpatialViz [55]":"51.27",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Gemini-2.5-pro-2025-06 [52]",
"score":53.57,
"VSI [66]":"53.57",
"SITE [57]":"57.06",
"MMSI [68]":"38.00",
"OmniSpatial [23]":"55.38",
"MindCube ∗ [69]":"57.60",
"STARE [32]":"49.14",
"CoreCognition [33]":"76.70",
"SpatialViz [55]":"42.71",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Seed-1.6-2025-06-15 [51]",
"score":49.91,
"VSI [66]":"49.91",
"SITE [57]":"54.61",
"MMSI [68]":"38.30",
"OmniSpatial [23]":"49.32",
"MindCube ∗ [69]":"48.75",
"STARE [32]":"46.06",
"CoreCognition [33]":"77.17",
"SpatialViz [55]":"34.58",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"GPT-5-mini-2025-08-07 [45]",
"score":48.67,
"VSI [66]":"48.67",
"SITE [57]":"52.47",
"MMSI [68]":"34.10",
"OmniSpatial [23]":"55.52",
"MindCube ∗ [69]":"56.69",
"STARE [32]":"52.51",
"CoreCognition [33]":"77.77",
"SpatialViz [55]":"44.66",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Grok-4-2025-07-09 [62]",
"score":47.92,
"VSI [66]":"47.92",
"SITE [57]":"47.01",
"MMSI [68]":"37.80",
"OmniSpatial [23]":"46.84",
"MindCube ∗ [69]":"63.56",
"STARE [32]":"26.90",
"CoreCognition [33]":"79.27",
"SpatialViz [55]":"19.40 †",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"† indicates cases where generations were truncated due to overlong chains of thought, yielding no final answer; such instances are counted as incorrect, which depresses the score."
},
{
"model_name":"InternVL3-78B [79]",
"score":47.55,
"VSI [66]":"47.55",
"SITE [57]":"52.72",
"MMSI [68]":"30.50",
"OmniSpatial [23]":"50.95",
"MindCube ∗ [69]":"49.52",
"STARE [32]":"42.00",
"CoreCognition [33]":"71.16",
"SpatialViz [55]":"31.10",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"GPT-5-nano-2025-08-07 [45]",
"score":43.22,
"VSI [66]":"43.22",
"SITE [57]":"35.81",
"MMSI [68]":"28.90",
"OmniSpatial [23]":"47.81",
"MindCube ∗ [69]":"41.48",
"STARE [32]":"46.05",
"CoreCognition [33]":"67.92",
"SpatialViz [55]":"35.59",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"InternVL3-8B [79]",
"score":42.14,
"VSI [66]":"42.14",
"SITE [57]":"41.15",
"MMSI [68]":"28.00",
"OmniSpatial [23]":"46.25",
"MindCube ∗ [69]":"41.54",
"STARE [32]":"41.36",
"CoreCognition [33]":"60.92",
"SpatialViz [55]":"30.00",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Qwen2.5-VL-72B-Instruct [1]",
"score":35.77,
"VSI [66]":"35.77",
"SITE [57]":"47.41",
"MMSI [68]":"32.50",
"OmniSpatial [23]":"47.81",
"MindCube ∗ [69]":"42.40",
"STARE [32]":"38.37",
"CoreCognition [33]":"69.22",
"SpatialViz [55]":"32.54",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Random Choice",
"score":34.0,
"VSI [66]":"34.00",
"SITE [57]":"0.0",
"MMSI [68]":"25.00",
"OmniSpatial [23]":"24.98",
"MindCube ∗ [69]":"32.35",
"STARE [32]":"34.80",
"CoreCognition [33]":"33.93",
"SpatialViz [55]":"25.08",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"VSI random choice here is chance level (Frequency)."
},
{
"model_name":"Qwen2.5-VL-7B-Instruct [1]",
"score":32.3,
"VSI [66]":"32.30",
"SITE [57]":"37.64",
"MMSI [68]":"26.80",
"OmniSpatial [23]":"39.07",
"MindCube ∗ [69]":"36.05",
"STARE [32]":"35.03",
"CoreCognition [33]":"62.16",
"SpatialViz [55]":"26.78",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
},
{
"model_name":"Qwen2.5-VL-3B-Instruct [1]",
"score":27.0,
"VSI [66]":"27.00",
"SITE [57]":"33.14",
"MMSI [68]":"28.60",
"OmniSpatial [23]":"42.47",
"MindCube ∗ [69]":"37.60",
"STARE [32]":"37.83",
"CoreCognition [33]":"60.19",
"SpatialViz [55]":"21.86",
"source_title":"EASI",
"source_url":"https:\/\/arxiv.org\/pdf\/2508.13142.pdf",
"notes":"None"
}
]