File size: 11,333 Bytes
// Auto-generated from figure3_master_heatmap.csv
// Structure: array of { model, tier, scores: { "Game|Modality": value } }
/**
 * Short game identifiers. Each one is the `Game` half of a "Game|Modality"
 * score key in BENCHMARK_DATA and a key of GAME_LABELS. The order here is the
 * order in which the games appear inside every record's `scores` object.
 */
const GAMES = [
  "KingD",
  "Res Arcana",
  "Pax Ren.",
  "Carca.",
  "Catan",
];
/**
 * Maps each short game identifier (as used in the score keys) to the game's
 * full display name.
 */
const GAME_LABELS = {
  "KingD": "Kingdomino",
  "Res Arcana": "Res Arcana",
  "Pax Ren.": "Pax Renaissance",
  "Carca.": "Carcassonne",
  "Catan": "Catan",
};
/**
 * Modality names. Each one is the `Modality` half of a "Game|Modality" score
 * key in BENCHMARK_DATA; within a game, keys appear in this order.
 */
const MODALITIES = [
  "None",
  "Text",
  "Image",
];
/**
 * Tier identifiers. These match the `tier` field of BENCHMARK_DATA records
 * and the keys of TIER_LABELS.
 */
const TIERS = [
  "T1",
  "T2",
  "T3",
];
/**
 * Maps each tier identifier to its human-readable heading.
 */
const TIER_LABELS = {
  T1: "Tier 1: Environment Perception",
  T2: "Tier 2: Rules Integration",
  T3: "Tier 3: Short-Horizon Optimization",
};
/**
 * Score table: one record per (model, tier) pair.
 *
 * Each record has the shape { model, tier, scores }, where `scores` maps a
 * "Game|Modality" key to a number, or to `null` for cells that carry no score
 * (the "Pax Ren.|Image" and "Catan|Image" cells of Claude 4.5 Sonnet).
 *
 * Records are grouped by tier (T1, then T2, then T3); inside every tier the
 * same nine models appear in the same order. Inside every `scores` object the
 * keys are ordered game by game, and None / Text / Image within a game.
 */
const BENCHMARK_DATA = [
  // ---- T1 (Tier 1: Environment Perception) ----
  {
    model: "GPT-4o",
    tier: "T1",
    scores: {
      "KingD|None": 0.381, "KingD|Text": 0.571, "KingD|Image": 0.429,
      "Res Arcana|None": 0.650, "Res Arcana|Text": 0.650, "Res Arcana|Image": 0.575,
      "Pax Ren.|None": 0.375, "Pax Ren.|Text": 0.475, "Pax Ren.|Image": 0.600,
      "Carca.|None": 0.425, "Carca.|Text": 0.350, "Carca.|Image": 0.400,
      "Catan|None": 0.450, "Catan|Text": 0.450, "Catan|Image": 0.450,
    },
  },
  {
    model: "o1",
    tier: "T1",
    scores: {
      "KingD|None": 0.524, "KingD|Text": 0.429, "KingD|Image": 0.429,
      "Res Arcana|None": 0.675, "Res Arcana|Text": 0.525, "Res Arcana|Image": 0.600,
      "Pax Ren.|None": 0.450, "Pax Ren.|Text": 0.550, "Pax Ren.|Image": 0.475,
      "Carca.|None": 0.450, "Carca.|Text": 0.450, "Carca.|Image": 0.375,
      "Catan|None": 0.475, "Catan|Text": 0.550, "Catan|Image": 0.550,
    },
  },
  {
    model: "GPT-4.1",
    tier: "T1",
    scores: {
      "KingD|None": 0.619, "KingD|Text": 0.524, "KingD|Image": 0.619,
      "Res Arcana|None": 0.775, "Res Arcana|Text": 0.725, "Res Arcana|Image": 0.750,
      "Pax Ren.|None": 0.525, "Pax Ren.|Text": 0.575, "Pax Ren.|Image": 0.600,
      "Carca.|None": 0.575, "Carca.|Text": 0.400, "Carca.|Image": 0.475,
      "Catan|None": 0.575, "Catan|Text": 0.450, "Catan|Image": 0.650,
    },
  },
  {
    model: "o3",
    tier: "T1",
    scores: {
      "KingD|None": 0.750, "KingD|Text": 0.675, "KingD|Image": 0.650,
      "Res Arcana|None": 0.775, "Res Arcana|Text": 0.775, "Res Arcana|Image": 0.700,
      "Pax Ren.|None": 0.475, "Pax Ren.|Text": 0.675, "Pax Ren.|Image": 0.650,
      "Carca.|None": 0.450, "Carca.|Text": 0.500, "Carca.|Image": 0.575,
      "Catan|None": 0.600, "Catan|Text": 0.575, "Catan|Image": 0.550,
    },
  },
  {
    model: "GPT-5.1",
    tier: "T1",
    scores: {
      "KingD|None": 0.750, "KingD|Text": 0.650, "KingD|Image": 0.675,
      "Res Arcana|None": 0.775, "Res Arcana|Text": 0.800, "Res Arcana|Image": 0.800,
      "Pax Ren.|None": 0.600, "Pax Ren.|Text": 0.600, "Pax Ren.|Image": 0.650,
      "Carca.|None": 0.725, "Carca.|Text": 0.525, "Carca.|Image": 0.550,
      "Catan|None": 0.600, "Catan|Text": 0.575, "Catan|Image": 0.575,
    },
  },
  {
    model: "Gemini 2.5 Flash",
    tier: "T1",
    scores: {
      "KingD|None": 0.524, "KingD|Text": 0.476, "KingD|Image": 0.381,
      "Res Arcana|None": 0.675, "Res Arcana|Text": 0.775, "Res Arcana|Image": 0.500,
      "Pax Ren.|None": 0.550, "Pax Ren.|Text": 0.400, "Pax Ren.|Image": 0.375,
      "Carca.|None": 0.450, "Carca.|Text": 0.400, "Carca.|Image": 0.350,
      "Catan|None": 0.625, "Catan|Text": 0.500, "Catan|Image": 0.325,
    },
  },
  {
    model: "Gemini 2.5 Pro",
    tier: "T1",
    scores: {
      "KingD|None": 0.524, "KingD|Text": 0.524, "KingD|Image": 0.333,
      "Res Arcana|None": 0.650, "Res Arcana|Text": 0.700, "Res Arcana|Image": 0.525,
      "Pax Ren.|None": 0.650, "Pax Ren.|Text": 0.725, "Pax Ren.|Image": 0.375,
      "Carca.|None": 0.425, "Carca.|Text": 0.350, "Carca.|Image": 0.450,
      "Catan|None": 0.575, "Catan|Text": 0.650, "Catan|Image": 0.425,
    },
  },
  {
    model: "Gemini 3 Pro",
    tier: "T1",
    scores: {
      "KingD|None": 0.825, "KingD|Text": 0.800, "KingD|Image": 0.850,
      "Res Arcana|None": 0.875, "Res Arcana|Text": 0.850, "Res Arcana|Image": 0.775,
      "Pax Ren.|None": 0.775, "Pax Ren.|Text": 0.750, "Pax Ren.|Image": 0.800,
      "Carca.|None": 0.825, "Carca.|Text": 0.750, "Carca.|Image": 0.650,
      "Catan|None": 0.525, "Catan|Text": 0.625, "Catan|Image": 0.700,
    },
  },
  {
    model: "Claude 4.5 Sonnet",
    tier: "T1",
    scores: {
      "KingD|None": 0.571, "KingD|Text": 0.619, "KingD|Image": 0.524,
      "Res Arcana|None": 0.650, "Res Arcana|Text": 0.800, "Res Arcana|Image": 0.800,
      "Pax Ren.|None": 0.600, "Pax Ren.|Text": 0.650, "Pax Ren.|Image": null,
      "Carca.|None": 0.375, "Carca.|Text": 0.325, "Carca.|Image": 0.300,
      "Catan|None": 0.600, "Catan|Text": 0.625, "Catan|Image": null,
    },
  },

  // ---- T2 (Tier 2: Rules Integration) ----
  {
    model: "GPT-4o",
    tier: "T2",
    scores: {
      "KingD|None": 0.300, "KingD|Text": 0.433, "KingD|Image": 0.300,
      "Res Arcana|None": 0.225, "Res Arcana|Text": 0.400, "Res Arcana|Image": 0.350,
      "Pax Ren.|None": 0.125, "Pax Ren.|Text": 0.525, "Pax Ren.|Image": 0.475,
      "Carca.|None": 0.125, "Carca.|Text": 0.150, "Carca.|Image": 0.225,
      "Catan|None": 0.225, "Catan|Text": 0.200, "Catan|Image": 0.200,
    },
  },
  {
    model: "o1",
    tier: "T2",
    scores: {
      "KingD|None": 0.200, "KingD|Text": 0.400, "KingD|Image": 0.267,
      "Res Arcana|None": 0.350, "Res Arcana|Text": 0.350, "Res Arcana|Image": 0.350,
      "Pax Ren.|None": 0.250, "Pax Ren.|Text": 0.400, "Pax Ren.|Image": 0.450,
      "Carca.|None": 0.225, "Carca.|Text": 0.250, "Carca.|Image": 0.300,
      "Catan|None": 0.150, "Catan|Text": 0.250, "Catan|Image": 0.225,
    },
  },
  {
    model: "GPT-4.1",
    tier: "T2",
    scores: {
      "KingD|None": 0.333, "KingD|Text": 0.433, "KingD|Image": 0.300,
      "Res Arcana|None": 0.400, "Res Arcana|Text": 0.500, "Res Arcana|Image": 0.475,
      "Pax Ren.|None": 0.275, "Pax Ren.|Text": 0.400, "Pax Ren.|Image": 0.375,
      "Carca.|None": 0.175, "Carca.|Text": 0.200, "Carca.|Image": 0.150,
      "Catan|None": 0.325, "Catan|Text": 0.225, "Catan|Image": 0.325,
    },
  },
  {
    model: "o3",
    tier: "T2",
    scores: {
      "KingD|None": 0.350, "KingD|Text": 0.400, "KingD|Image": 0.375,
      "Res Arcana|None": 0.325, "Res Arcana|Text": 0.625, "Res Arcana|Image": 0.475,
      "Pax Ren.|None": 0.300, "Pax Ren.|Text": 0.550, "Pax Ren.|Image": 0.575,
      "Carca.|None": 0.275, "Carca.|Text": 0.275, "Carca.|Image": 0.275,
      "Catan|None": 0.375, "Catan|Text": 0.275, "Catan|Image": 0.275,
    },
  },
  {
    model: "GPT-5.1",
    tier: "T2",
    scores: {
      "KingD|None": 0.300, "KingD|Text": 0.325, "KingD|Image": 0.275,
      "Res Arcana|None": 0.325, "Res Arcana|Text": 0.525, "Res Arcana|Image": 0.525,
      "Pax Ren.|None": 0.200, "Pax Ren.|Text": 0.600, "Pax Ren.|Image": 0.467,
      "Carca.|None": 0.250, "Carca.|Text": 0.250, "Carca.|Image": 0.325,
      "Catan|None": 0.275, "Catan|Text": 0.275, "Catan|Image": 0.300,
    },
  },
  {
    model: "Gemini 2.5 Flash",
    tier: "T2",
    scores: {
      "KingD|None": 0.167, "KingD|Text": 0.300, "KingD|Image": 0.267,
      "Res Arcana|None": 0.225, "Res Arcana|Text": 0.300, "Res Arcana|Image": 0.300,
      "Pax Ren.|None": 0.250, "Pax Ren.|Text": 0.300, "Pax Ren.|Image": 0.375,
      "Carca.|None": 0.250, "Carca.|Text": 0.300, "Carca.|Image": 0.200,
      "Catan|None": 0.250, "Catan|Text": 0.275, "Catan|Image": 0.125,
    },
  },
  {
    model: "Gemini 2.5 Pro",
    tier: "T2",
    scores: {
      "KingD|None": 0.367, "KingD|Text": 0.367, "KingD|Image": 0.267,
      "Res Arcana|None": 0.375, "Res Arcana|Text": 0.475, "Res Arcana|Image": 0.375,
      "Pax Ren.|None": 0.100, "Pax Ren.|Text": 0.400, "Pax Ren.|Image": 0.250,
      "Carca.|None": 0.225, "Carca.|Text": 0.275, "Carca.|Image": 0.150,
      "Catan|None": 0.275, "Catan|Text": 0.350, "Catan|Image": 0.175,
    },
  },
  {
    model: "Gemini 3 Pro",
    tier: "T2",
    scores: {
      "KingD|None": 0.725, "KingD|Text": 0.675, "KingD|Image": 0.750,
      "Res Arcana|None": 0.550, "Res Arcana|Text": 0.625, "Res Arcana|Image": 0.650,
      "Pax Ren.|None": 0.325, "Pax Ren.|Text": 0.475, "Pax Ren.|Image": 0.425,
      "Carca.|None": 0.425, "Carca.|Text": 0.425, "Carca.|Image": 0.325,
      "Catan|None": 0.500, "Catan|Text": 0.325, "Catan|Image": 0.500,
    },
  },
  {
    model: "Claude 4.5 Sonnet",
    tier: "T2",
    scores: {
      "KingD|None": 0.300, "KingD|Text": 0.333, "KingD|Image": 0.233,
      "Res Arcana|None": 0.433, "Res Arcana|Text": 0.633, "Res Arcana|Image": 0.400,
      "Pax Ren.|None": 0.300, "Pax Ren.|Text": 0.533, "Pax Ren.|Image": null,
      "Carca.|None": 0.275, "Carca.|Text": 0.275, "Carca.|Image": 0.375,
      "Catan|None": 0.225, "Catan|Text": 0.275, "Catan|Image": null,
    },
  },

  // ---- T3 (Tier 3: Short-Horizon Optimization) ----
  {
    model: "GPT-4o",
    tier: "T3",
    scores: {
      "KingD|None": 0.000, "KingD|Text": 0.000, "KingD|Image": 0.000,
      "Res Arcana|None": 0.020, "Res Arcana|Text": 0.060, "Res Arcana|Image": 0.000,
      "Pax Ren.|None": 0.053, "Pax Ren.|Text": 0.053, "Pax Ren.|Image": 0.018,
      "Carca.|None": 0.040, "Carca.|Text": 0.000, "Carca.|Image": 0.040,
      "Catan|None": 0.065, "Catan|Text": 0.000, "Catan|Image": 0.032,
    },
  },
  {
    model: "o1",
    tier: "T3",
    scores: {
      "KingD|None": 0.000, "KingD|Text": 0.020, "KingD|Image": 0.000,
      "Res Arcana|None": 0.040, "Res Arcana|Text": 0.060, "Res Arcana|Image": 0.040,
      "Pax Ren.|None": 0.105, "Pax Ren.|Text": 0.018, "Pax Ren.|Image": 0.053,
      "Carca.|None": 0.080, "Carca.|Text": 0.040, "Carca.|Image": 0.040,
      "Catan|None": 0.032, "Catan|Text": 0.065, "Catan|Image": 0.032,
    },
  },
  {
    model: "GPT-4.1",
    tier: "T3",
    scores: {
      "KingD|None": 0.040, "KingD|Text": 0.080, "KingD|Image": 0.100,
      "Res Arcana|None": 0.020, "Res Arcana|Text": 0.020, "Res Arcana|Image": 0.020,
      "Pax Ren.|None": 0.123, "Pax Ren.|Text": 0.123, "Pax Ren.|Image": 0.088,
      "Carca.|None": 0.020, "Carca.|Text": 0.000, "Carca.|Image": 0.060,
      "Catan|None": 0.194, "Catan|Text": 0.065, "Catan|Image": 0.032,
    },
  },
  {
    model: "o3",
    tier: "T3",
    scores: {
      "KingD|None": 0.060, "KingD|Text": 0.120, "KingD|Image": 0.040,
      "Res Arcana|None": 0.000, "Res Arcana|Text": 0.060, "Res Arcana|Image": 0.060,
      "Pax Ren.|None": 0.070, "Pax Ren.|Text": 0.175, "Pax Ren.|Image": 0.211,
      "Carca.|None": 0.120, "Carca.|Text": 0.060, "Carca.|Image": 0.100,
      "Catan|None": 0.194, "Catan|Text": 0.226, "Catan|Image": 0.258,
    },
  },
  {
    model: "GPT-5.1",
    tier: "T3",
    scores: {
      "KingD|None": 0.160, "KingD|Text": 0.060, "KingD|Image": 0.060,
      "Res Arcana|None": 0.080, "Res Arcana|Text": 0.080, "Res Arcana|Image": 0.080,
      "Pax Ren.|None": 0.123, "Pax Ren.|Text": 0.088, "Pax Ren.|Image": 0.211,
      "Carca.|None": 0.020, "Carca.|Text": 0.100, "Carca.|Image": 0.100,
      "Catan|None": 0.129, "Catan|Text": 0.161, "Catan|Image": 0.161,
    },
  },
  {
    model: "Gemini 2.5 Flash",
    tier: "T3",
    scores: {
      "KingD|None": 0.000, "KingD|Text": 0.020, "KingD|Image": 0.000,
      "Res Arcana|None": 0.000, "Res Arcana|Text": 0.080, "Res Arcana|Image": 0.080,
      "Pax Ren.|None": 0.018, "Pax Ren.|Text": 0.053, "Pax Ren.|Image": 0.000,
      "Carca.|None": 0.020, "Carca.|Text": 0.060, "Carca.|Image": 0.040,
      "Catan|None": 0.032, "Catan|Text": 0.000, "Catan|Image": 0.065,
    },
  },
  {
    model: "Gemini 2.5 Pro",
    tier: "T3",
    scores: {
      "KingD|None": 0.040, "KingD|Text": 0.020, "KingD|Image": 0.000,
      "Res Arcana|None": 0.120, "Res Arcana|Text": 0.300, "Res Arcana|Image": 0.080,
      "Pax Ren.|None": 0.123, "Pax Ren.|Text": 0.053, "Pax Ren.|Image": 0.088,
      "Carca.|None": 0.060, "Carca.|Text": 0.060, "Carca.|Image": 0.040,
      "Catan|None": 0.129, "Catan|Text": 0.097, "Catan|Image": 0.161,
    },
  },
  {
    model: "Gemini 3 Pro",
    tier: "T3",
    scores: {
      "KingD|None": 0.160, "KingD|Text": 0.160, "KingD|Image": 0.140,
      "Res Arcana|None": 0.200, "Res Arcana|Text": 0.260, "Res Arcana|Image": 0.180,
      "Pax Ren.|None": 0.053, "Pax Ren.|Text": 0.105, "Pax Ren.|Image": 0.193,
      "Carca.|None": 0.060, "Carca.|Text": 0.020, "Carca.|Image": 0.040,
      "Catan|None": 0.129, "Catan|Text": 0.161, "Catan|Image": 0.032,
    },
  },
  {
    model: "Claude 4.5 Sonnet",
    tier: "T3",
    scores: {
      "KingD|None": 0.020, "KingD|Text": 0.040, "KingD|Image": 0.040,
      "Res Arcana|None": 0.040, "Res Arcana|Text": 0.060, "Res Arcana|Image": 0.080,
      "Pax Ren.|None": 0.053, "Pax Ren.|Text": 0.088, "Pax Ren.|Image": null,
      "Carca.|None": 0.040, "Carca.|Text": 0.020, "Carca.|Image": 0.060,
      "Catan|None": 0.000, "Catan|Text": 0.032, "Catan|Image": null,
    },
  },
];
|