// Source: DDR_Bench / data.js (repo: thinkwee, commit cf573f9, ~32.4 kB)
// DDR-Bench Visualization Data
// Auto-generated data for interactive charts
// Top-level data object consumed by the interactive charts.
const DDR_DATA = {
  // Color scheme for models (hex colors used for chart series/legends).
  // NOTE(review): keys here use hyphenated names ('GLM-4.6', 'DeepSeek-V3.2'),
  // while the `ranking` and `turn` sections below use unhyphenated names
  // ('GLM4.6', 'DeepSeekV3.2') — confirm consumers normalize model names
  // before indexing into this map.
  modelColors: {
    'GPT-5.2': '#00C853',
    'Claude-4.5-Sonnet': '#FF6D00',
    'Gemini-3-Flash': '#2196F3',
    'GLM-4.6': '#9C27B0',
    'DeepSeek-V3.2': '#E91E63',
    'Qwen3-Next-80B-A3B': '#FFC107',
    'Kimi-K2': '#FFA500',
    'MiniMax-M2': '#20B2AA',
    // Probing models
    'Qwen2.5-32B': '#4A90D9',
    'Qwen2.5-72B': '#1A5FB4',
    'Qwen3-4B': '#57E389',
    'Qwen3-30B-A3B': '#26A269',
  },
  // Scaling Analysis Data.
  // Keyed by dataset (mimic / '10k' / globem), then by model. Each model has
  // four parallel arrays indexed by probing turn: `turns` (turn number),
  // `tokens` (cumulative token count), `costs` (cumulative cost, presumably
  // USD — confirm with generator), and `accuracy` (percent).
  // mimic/'10k' run 20 turns; globem runs 15.
  scaling: {
    mimic: {
      'GPT-5.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460, 16840, 17761, 18642, 19456, 20194],
        costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516, 0.0595, 0.0680, 0.0772, 0.0860, 0.0947],
        accuracy: [2.8, 5.5, 8.2, 10.8, 13.2, 15.5, 17.6, 19.5, 21.2, 22.7, 24.0, 25.1, 26.0, 26.7, 27.1, 27.2, 27.2, 27.3, 27.3, 27.26]
      },
      'Claude-4.5-Sonnet': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378, 14190, 15001, 15723, 16457, 17218],
        costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.1410, 0.1580, 0.1758, 0.1944, 0.2138],
        accuracy: [3.5, 7.0, 10.5, 14.0, 17.2, 20.2, 23.0, 25.5, 27.8, 29.8, 31.5, 32.8, 33.8, 34.2, 34.3, 34.4, 34.4, 34.4, 34.4, 34.37]
      },
      'Gemini-3-Flash': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357, 24415, 25207, 25977, 26542, 26964],
        costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173, 0.0196, 0.0219, 0.0240, 0.0263, 0.0284],
        accuracy: [2.5, 5.0, 7.5, 10.0, 12.4, 14.6, 16.7, 18.6, 20.3, 21.8, 23.1, 24.0, 24.6, 24.8, 24.9, 24.9, 24.9, 24.9, 24.9, 24.94]
      },
      'GLM-4.6': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542, 11095, 11598, 12149, 12657, 13099],
        costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190, 0.0210, 0.0231, 0.0253, 0.0275, 0.0298],
        accuracy: [2.3, 4.7, 7.0, 9.3, 11.5, 13.5, 15.4, 17.1, 18.7, 20.1, 21.2, 22.1, 22.7, 23.0, 23.1, 23.2, 23.2, 23.2, 23.3, 23.26]
      },
      'DeepSeek-V3.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610, 13470, 14320, 15170, 16020, 16870],
        costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252, 0.0284, 0.0318, 0.0354, 0.0392, 0.0431],
        accuracy: [2.7, 5.4, 8.1, 10.8, 13.4, 15.8, 18.1, 20.2, 22.1, 23.8, 25.2, 26.3, 26.8, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.00]
      }
    },
    '10k': {
      'GPT-5.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [48, 1380, 1650, 2380, 3420, 4550, 5410, 6250, 7150, 8050, 8890, 9730, 10570, 11400, 12230, 13060, 13880, 14700, 15520, 16340],
        costs: [0.0004, 0.0010, 0.0017, 0.0027, 0.0042, 0.0061, 0.0084, 0.0110, 0.0140, 0.0174, 0.0216, 0.0261, 0.0312, 0.0369, 0.0434, 0.0501, 0.0572, 0.0650, 0.0724, 0.0797],
        accuracy: [4.5, 9.0, 13.5, 18.0, 22.3, 26.3, 30.0, 33.4, 36.5, 39.3, 41.8, 43.5, 44.5, 44.9, 45.0, 45.0, 45.0, 45.0, 45.0, 44.99]
      },
      'Claude-4.5-Sonnet': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [30, 1420, 1580, 2970, 4200, 5550, 6200, 6870, 7830, 8570, 9130, 9870, 10710, 11620, 12410, 13150, 13890, 14550, 15220, 15920],
        costs: [0.0004, 0.0025, 0.0049, 0.0089, 0.0140, 0.0205, 0.0277, 0.0357, 0.0447, 0.0545, 0.0649, 0.0760, 0.0882, 0.1014, 0.1154, 0.1303, 0.1460, 0.1624, 0.1796, 0.1976],
        accuracy: [7.7, 15.5, 23.2, 30.9, 38.4, 45.6, 52.6, 59.2, 65.5, 70.5, 74.2, 76.0, 77.0, 77.3, 77.3, 77.3, 77.3, 77.3, 77.3, 77.27]
      },
      'Gemini-3-Flash': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [420, 1980, 2400, 3990, 5140, 6910, 8210, 9880, 11700, 13180, 15180, 17220, 18940, 20300, 21510, 22480, 23210, 23920, 24440, 24830],
        costs: [0.0001, 0.0004, 0.0007, 0.0012, 0.0019, 0.0028, 0.0037, 0.0048, 0.0061, 0.0074, 0.0090, 0.0107, 0.0125, 0.0142, 0.0160, 0.0181, 0.0202, 0.0222, 0.0243, 0.0263],
        accuracy: [4.4, 8.9, 13.3, 17.8, 22.0, 26.1, 30.0, 33.6, 37.0, 40.1, 42.4, 43.8, 44.3, 44.4, 44.4, 44.4, 44.4, 44.4, 44.4, 44.41]
      },
      'GLM-4.6': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [54, 1400, 1625, 2545, 3196, 3860, 4273, 4888, 5645, 6474, 7330, 8036, 8576, 9120, 9697, 10210, 10678, 11192, 11662, 12080],
        costs: [0.0001, 0.0007, 0.0014, 0.0022, 0.0031, 0.0041, 0.0051, 0.0063, 0.0076, 0.0090, 0.0106, 0.0122, 0.0139, 0.0156, 0.0174, 0.0193, 0.0212, 0.0232, 0.0252, 0.0273],
        accuracy: [6.0, 12.1, 18.1, 24.2, 30.0, 35.6, 41.0, 46.0, 50.8, 55.0, 58.2, 59.7, 60.3, 60.4, 60.4, 60.4, 60.4, 60.4, 60.4, 60.42]
      },
      'DeepSeek-V3.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        tokens: [42, 1305, 1555, 2250, 3235, 4295, 5105, 5895, 6750, 7600, 8395, 9190, 9985, 10775, 11565, 12355, 13140, 13925, 14710, 15495],
        costs: [0.0001, 0.0005, 0.0011, 0.0018, 0.0028, 0.0040, 0.0054, 0.0070, 0.0087, 0.0107, 0.0129, 0.0152, 0.0176, 0.0203, 0.0231, 0.0261, 0.0292, 0.0325, 0.0360, 0.0396],
        accuracy: [6.1, 12.1, 18.2, 24.2, 30.1, 35.8, 41.2, 46.3, 51.2, 55.5, 58.8, 60.2, 60.6, 60.7, 60.7, 60.7, 60.7, 60.7, 60.7, 60.66]
      }
    },
    globem: {
      'GPT-5.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460],
        costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516],
        accuracy: [3.8, 7.7, 11.5, 15.3, 19.0, 22.6, 26.1, 29.4, 32.5, 35.4, 37.2, 38.0, 38.3, 38.4, 38.39]
      },
      'Claude-4.5-Sonnet': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378],
        costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249],
        accuracy: [4.0, 8.0, 12.1, 16.1, 20.0, 23.9, 27.6, 31.2, 34.6, 37.0, 39.0, 40.0, 40.2, 40.2, 40.23]
      },
      'Gemini-3-Flash': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357],
        costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173],
        accuracy: [3.5, 7.1, 10.6, 14.1, 17.5, 20.8, 24.0, 27.1, 29.9, 32.2, 33.8, 34.9, 35.2, 35.3, 35.29]
      },
      'GLM-4.6': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542],
        costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190],
        accuracy: [4.2, 8.3, 12.5, 16.6, 20.7, 24.6, 28.4, 32.0, 35.4, 38.0, 40.0, 41.2, 41.5, 41.6, 41.61]
      },
      'DeepSeek-V3.2': {
        turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610],
        costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252],
        accuracy: [3.8, 7.6, 11.5, 15.3, 19.0, 22.7, 26.2, 29.6, 32.8, 35.5, 37.2, 38.0, 38.1, 38.2, 38.16]
      }
    }
  },
// Ranking Comparison Data
ranking: {
MIMIC: [
{ model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 87.5, accuracy: 33.66, acc_rank: 1, is_proprietary: true },
{ model: 'Kimi-K2', bt_rank: 2, win_rate: 82.1, accuracy: 30.17, acc_rank: 2, is_proprietary: false },
{ model: 'GPT5.1', bt_rank: 3, win_rate: 78.3, accuracy: 30.10, acc_rank: 3, is_proprietary: true },
{ model: 'Gemini3-Flash', bt_rank: 4, win_rate: 75.0, accuracy: 29.28, acc_rank: 4, is_proprietary: true },
{ model: 'GPT5.2', bt_rank: 5, win_rate: 71.2, accuracy: 28.88, acc_rank: 5, is_proprietary: true },
{ model: 'DeepSeek-V3.2', bt_rank: 6, win_rate: 68.5, accuracy: 27.65, acc_rank: 6, is_proprietary: false },
{ model: 'GPT5-mini', bt_rank: 7, win_rate: 65.0, accuracy: 27.59, acc_rank: 7, is_proprietary: true },
{ model: 'GLM4.6', bt_rank: 8, win_rate: 61.8, accuracy: 23.84, acc_rank: 8, is_proprietary: false },
{ model: 'MiniMax-M2', bt_rank: 9, win_rate: 58.2, accuracy: 23.52, acc_rank: 9, is_proprietary: false },
{ model: 'Qwen3', bt_rank: 10, win_rate: 54.5, accuracy: 19.13, acc_rank: 11, is_proprietary: false },
{ model: 'Gemini2.5-Pro', bt_rank: 11, win_rate: 51.0, accuracy: 19.00, acc_rank: 12, is_proprietary: true },
{ model: 'Qwen3-Next-80B-A3B', bt_rank: 12, win_rate: 47.5, accuracy: 18.80, acc_rank: 10, is_proprietary: false },
{ model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 44.0, accuracy: 18.61, acc_rank: 13, is_proprietary: true },
{ model: 'Qwen3-4B', bt_rank: 14, win_rate: 40.5, accuracy: 16.93, acc_rank: 14, is_proprietary: false },
{ model: 'Gemini2.5-Flash-Lite', bt_rank: 15, win_rate: 37.0, accuracy: 16.64, acc_rank: 15, is_proprietary: true },
{ model: 'Qwen2.5-72B', bt_rank: 16, win_rate: 33.5, accuracy: 14.92, acc_rank: 16, is_proprietary: false },
{ model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 30.0, accuracy: 14.08, acc_rank: 18, is_proprietary: false },
{ model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 26.5, accuracy: 14.15, acc_rank: 17, is_proprietary: false },
{ model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 23.0, accuracy: 13.12, acc_rank: 19, is_proprietary: false },
{ model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 19.5, accuracy: 10.79, acc_rank: 20, is_proprietary: false },
{ model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 16.0, accuracy: 9.08, acc_rank: 21, is_proprietary: false },
{ model: 'Llama3.3-70B', bt_rank: 22, win_rate: 12.5, accuracy: 7.30, acc_rank: 22, is_proprietary: false }
],
'10K': [
{ model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 92.0, accuracy: 69.26, acc_rank: 1, is_proprietary: true },
{ model: 'DeepSeek-V3.2', bt_rank: 2, win_rate: 85.5, accuracy: 49.41, acc_rank: 2, is_proprietary: false },
{ model: 'GLM4.6', bt_rank: 3, win_rate: 82.0, accuracy: 48.29, acc_rank: 3, is_proprietary: false },
{ model: 'GPT5.2', bt_rank: 4, win_rate: 78.0, accuracy: 43.11, acc_rank: 4, is_proprietary: true },
{ model: 'GPT5-mini', bt_rank: 5, win_rate: 74.5, accuracy: 41.56, acc_rank: 5, is_proprietary: true },
{ model: 'GPT5.1', bt_rank: 6, win_rate: 71.0, accuracy: 41.23, acc_rank: 6, is_proprietary: true },
{ model: 'Kimi-K2', bt_rank: 7, win_rate: 67.5, accuracy: 41.17, acc_rank: 7, is_proprietary: false },
{ model: 'Gemini3-Flash', bt_rank: 8, win_rate: 64.0, accuracy: 39.50, acc_rank: 8, is_proprietary: true },
{ model: 'Qwen3-Next-80B-A3B', bt_rank: 9, win_rate: 60.5, accuracy: 38.34, acc_rank: 9, is_proprietary: false },
{ model: 'MiniMax-M2', bt_rank: 10, win_rate: 57.0, accuracy: 35.74, acc_rank: 10, is_proprietary: false },
{ model: 'Qwen3-4B', bt_rank: 11, win_rate: 53.5, accuracy: 30.43, acc_rank: 11, is_proprietary: false },
{ model: 'Qwen3', bt_rank: 12, win_rate: 50.0, accuracy: 28.23, acc_rank: 12, is_proprietary: false },
{ model: 'Gemini2.5-Pro', bt_rank: 13, win_rate: 46.5, accuracy: 20.91, acc_rank: 13, is_proprietary: true },
{ model: 'Qwen2.5-72B', bt_rank: 14, win_rate: 43.0, accuracy: 20.79, acc_rank: 14, is_proprietary: false },
{ model: 'Qwen2.5-32B', bt_rank: 15, win_rate: 39.5, accuracy: 17.83, acc_rank: 15, is_proprietary: false },
{ model: 'Qwen2.5-14B-1M', bt_rank: 16, win_rate: 36.0, accuracy: 16.67, acc_rank: 16, is_proprietary: false },
{ model: 'Qwen2.5-14B', bt_rank: 17, win_rate: 32.5, accuracy: 14.65, acc_rank: 17, is_proprietary: false },
{ model: 'Gemini2.5-Flash-Lite', bt_rank: 18, win_rate: 29.0, accuracy: 14.37, acc_rank: 18, is_proprietary: true },
{ model: 'Gemini2.5-Flash', bt_rank: 19, win_rate: 25.5, accuracy: 12.61, acc_rank: 19, is_proprietary: true },
{ model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 7.53, acc_rank: 20, is_proprietary: false },
{ model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 18.5, accuracy: 6.68, acc_rank: 21, is_proprietary: false },
{ model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 6.51, acc_rank: 22, is_proprietary: false }
],
GLOBEM: [
{ model: 'GLM4.6', bt_rank: 1, win_rate: 78.0, accuracy: 39.77, acc_rank: 1, is_proprietary: false },
{ model: 'Claude4.5-Sonnet', bt_rank: 2, win_rate: 75.5, accuracy: 39.54, acc_rank: 2, is_proprietary: true },
{ model: 'GPT5.2', bt_rank: 3, win_rate: 72.0, accuracy: 38.39, acc_rank: 3, is_proprietary: true },
{ model: 'DeepSeek-V3.2', bt_rank: 4, win_rate: 69.5, accuracy: 38.39, acc_rank: 4, is_proprietary: false },
{ model: 'Kimi-K2', bt_rank: 5, win_rate: 66.0, accuracy: 37.01, acc_rank: 5, is_proprietary: false },
{ model: 'MiniMax-M2', bt_rank: 6, win_rate: 63.5, accuracy: 36.90, acc_rank: 6, is_proprietary: false },
{ model: 'GPT5.1', bt_rank: 7, win_rate: 61.0, accuracy: 36.76, acc_rank: 7, is_proprietary: true },
{ model: 'Qwen3', bt_rank: 8, win_rate: 58.0, accuracy: 36.32, acc_rank: 8, is_proprietary: false },
{ model: 'Gemini3-Flash', bt_rank: 9, win_rate: 55.5, accuracy: 35.46, acc_rank: 9, is_proprietary: true },
{ model: 'Gemini2.5-Pro', bt_rank: 10, win_rate: 52.0, accuracy: 34.60, acc_rank: 10, is_proprietary: true },
{ model: 'Qwen3-Next-80B-A3B', bt_rank: 11, win_rate: 49.5, accuracy: 34.14, acc_rank: 11, is_proprietary: false },
{ model: 'GPT5-mini', bt_rank: 12, win_rate: 46.0, accuracy: 33.91, acc_rank: 12, is_proprietary: true },
{ model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 43.5, accuracy: 28.62, acc_rank: 13, is_proprietary: true },
{ model: 'Qwen2.5-7B-1M', bt_rank: 14, win_rate: 40.0, accuracy: 27.15, acc_rank: 14, is_proprietary: false },
{ model: 'Qwen2.5-72B', bt_rank: 15, win_rate: 37.5, accuracy: 27.13, acc_rank: 15, is_proprietary: false },
{ model: 'Qwen3-4B', bt_rank: 16, win_rate: 34.0, accuracy: 26.90, acc_rank: 16, is_proprietary: false },
{ model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 31.5, accuracy: 26.47, acc_rank: 17, is_proprietary: false },
{ model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 28.0, accuracy: 26.13, acc_rank: 18, is_proprietary: false },
{ model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 25.5, accuracy: 25.90, acc_rank: 19, is_proprietary: false },
{ model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 25.64, acc_rank: 20, is_proprietary: false },
{ model: 'Gemini2.5-Flash-Lite', bt_rank: 21, win_rate: 19.5, accuracy: 25.52, acc_rank: 21, is_proprietary: true },
{ model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 22.65, acc_rank: 22, is_proprietary: false }
]
},
// Turn Distribution Data (distribution: percentage in bins [0-10, 10-20, ..., 90-100])
turn: {
mimic: [
{ model: 'DeepSeekV3.2', median: 21, distribution: [0, 0, 2, 8, 15, 22, 25, 18, 7, 3] },
{ model: 'GLM4.6', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
{ model: 'Gemini3-Flash', median: 18, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
{ model: 'GPT5.1', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
{ model: 'Kimi-K2', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
{ model: 'Claude4.5-Sonnet', median: 14, distribution: [0, 0, 5, 15, 25, 30, 15, 7, 2, 1] },
{ model: 'MiniMax-M2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
{ model: 'GPT5.2', median: 12, distribution: [0, 2, 8, 20, 30, 25, 10, 3, 1, 1] },
{ model: 'Qwen3-30B-A3B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
{ model: 'Qwen3-Next-80B-A3B', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
{ model: 'Qwen2.5-72B', median: 10, distribution: [1, 5, 15, 28, 28, 15, 5, 2, 1, 0] },
{ model: 'Qwen3-4B', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
{ model: 'GPT5-mini', median: 8, distribution: [2, 8, 18, 28, 25, 12, 5, 1, 1, 0] },
{ model: 'Llama3.3-70B', median: 5, distribution: [12, 25, 30, 20, 8, 3, 1, 1, 0, 0] }
],
'10k': [
{ model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
{ model: 'Gemini3-Flash', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
{ model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
{ model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
{ model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
{ model: 'Claude4.5-Sonnet', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
{ model: 'Qwen3-30B-A3B', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
{ model: 'GPT5.2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
{ model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
{ model: 'GPT5.1', median: 13, distribution: [0, 2, 8, 20, 28, 24, 12, 4, 1, 1] },
{ model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
{ model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
{ model: 'GPT5-mini', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
{ model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 30, 22, 10, 4, 1, 1, 0, 0] }
],
globem: [
{ model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 6, 14, 22, 26, 20, 7, 3] },
{ model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
{ model: 'Qwen3-30B-A3B', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
{ model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
{ model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
{ model: 'Gemini3-Flash', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
{ model: 'Claude4.5-Sonnet', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
{ model: 'GPT5.1', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
{ model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
{ model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
{ model: 'GPT5.2', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
{ model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
{ model: 'GPT5-mini', median: 8, distribution: [3, 10, 20, 30, 22, 10, 3, 1, 1, 0] },
{ model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 32, 22, 9, 3, 1, 1, 0, 0] }
]
},
  // Entropy Analysis Data.
  // Per dataset, per model: three parallel 10-element sample arrays —
  // `entropy`, `coverage`, and `accuracy` (accuracy in %, the other two
  // presumably normalized to [0, 1] — confirm with the generator).
  entropy: {
    mimic: {
      'GPT-5.2': { entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88, 0.65, 0.79, 0.71, 0.84], coverage: [0.08, 0.10, 0.09, 0.07, 0.09, 0.11, 0.06, 0.10, 0.08, 0.10], accuracy: [30, 35, 40, 25, 32, 45, 20, 28, 31, 38] },
      'Claude-4.5-Sonnet': { entropy: [0.85, 0.88, 0.92, 0.80, 0.87, 0.78, 0.82, 0.90, 0.86, 0.89], coverage: [0.12, 0.14, 0.13, 0.10, 0.13, 0.09, 0.11, 0.15, 0.12, 0.14], accuracy: [45, 50, 55, 40, 48, 35, 42, 52, 47, 51] },
      'Gemini-3-Flash': { entropy: [0.70, 0.75, 0.68, 0.72, 0.80, 0.65, 0.78, 0.72, 0.69, 0.76], coverage: [0.06, 0.09, 0.07, 0.08, 0.10, 0.05, 0.09, 0.07, 0.06, 0.08], accuracy: [28, 32, 25, 30, 38, 22, 35, 28, 26, 33] },
      'GLM-4.6': { entropy: [0.78, 0.82, 0.75, 0.80, 0.88, 0.72, 0.85, 0.78, 0.76, 0.83], coverage: [0.09, 0.11, 0.08, 0.10, 0.13, 0.07, 0.12, 0.09, 0.08, 0.11], accuracy: [32, 40, 28, 35, 45, 25, 42, 32, 30, 38] },
      'DeepSeek-V3.2': { entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.90, 0.80, 0.85, 0.81, 0.87], coverage: [0.10, 0.12, 0.09, 0.14, 0.08, 0.15, 0.10, 0.12, 0.10, 0.13], accuracy: [38, 42, 32, 48, 28, 52, 35, 42, 36, 44] }
    },
    '10k': {
      'GPT-5.2': { entropy: [0.85, 0.88, 0.92, 0.82, 0.87, 0.94, 0.80, 0.89, 0.84, 0.91], coverage: [0.35, 0.42, 0.48, 0.32, 0.40, 0.52, 0.28, 0.44, 0.38, 0.46], accuracy: [35, 40, 45, 30, 38, 50, 25, 42, 36, 44] },
      'Claude-4.5-Sonnet': { entropy: [0.92, 0.95, 0.98, 0.90, 0.94, 0.88, 0.91, 0.96, 0.93, 0.95], coverage: [0.55, 0.62, 0.68, 0.50, 0.58, 0.45, 0.52, 0.65, 0.56, 0.60], accuracy: [65, 72, 78, 60, 68, 55, 62, 75, 66, 70] },
      'Gemini-3-Flash': { entropy: [0.82, 0.86, 0.80, 0.84, 0.90, 0.78, 0.88, 0.83, 0.81, 0.87], coverage: [0.28, 0.35, 0.25, 0.32, 0.42, 0.22, 0.38, 0.30, 0.26, 0.36], accuracy: [35, 40, 30, 38, 48, 28, 45, 36, 32, 42] },
      'GLM-4.6': { entropy: [0.88, 0.92, 0.85, 0.90, 0.95, 0.82, 0.93, 0.88, 0.86, 0.91], coverage: [0.42, 0.50, 0.38, 0.46, 0.55, 0.35, 0.52, 0.44, 0.40, 0.48], accuracy: [50, 58, 45, 52, 62, 40, 56, 50, 46, 54] },
      'DeepSeek-V3.2': { entropy: [0.90, 0.93, 0.87, 0.95, 0.85, 0.97, 0.89, 0.94, 0.88, 0.92], coverage: [0.48, 0.55, 0.42, 0.60, 0.38, 0.65, 0.50, 0.57, 0.45, 0.53], accuracy: [52, 60, 48, 65, 42, 70, 55, 62, 50, 58] }
    },
    globem: {
      'GPT-5.2': { entropy: [0.75, 0.80, 0.85, 0.72, 0.78, 0.88, 0.70, 0.82, 0.76, 0.84], coverage: [0.65, 0.72, 0.78, 0.60, 0.70, 0.85, 0.55, 0.75, 0.68, 0.80], accuracy: [32, 38, 42, 28, 35, 48, 25, 40, 34, 44] },
      'Claude-4.5-Sonnet': { entropy: [0.82, 0.86, 0.90, 0.78, 0.84, 0.75, 0.80, 0.88, 0.83, 0.87], coverage: [0.78, 0.85, 0.92, 0.72, 0.82, 0.68, 0.75, 0.88, 0.80, 0.86], accuracy: [38, 45, 50, 35, 42, 32, 38, 48, 40, 46] },
      'Gemini-3-Flash': { entropy: [0.72, 0.77, 0.70, 0.75, 0.82, 0.68, 0.80, 0.74, 0.71, 0.78], coverage: [0.55, 0.65, 0.50, 0.58, 0.72, 0.45, 0.68, 0.60, 0.52, 0.66], accuracy: [30, 36, 28, 34, 42, 26, 40, 32, 28, 38] },
      'GLM-4.6': { entropy: [0.80, 0.84, 0.78, 0.82, 0.90, 0.75, 0.87, 0.81, 0.79, 0.85], coverage: [0.72, 0.80, 0.68, 0.75, 0.88, 0.62, 0.85, 0.74, 0.70, 0.82], accuracy: [38, 45, 35, 42, 52, 30, 48, 40, 36, 46] },
      'DeepSeek-V3.2': { entropy: [0.84, 0.88, 0.80, 0.90, 0.78, 0.92, 0.82, 0.87, 0.83, 0.89], coverage: [0.75, 0.82, 0.70, 0.88, 0.65, 0.92, 0.78, 0.84, 0.72, 0.86], accuracy: [36, 42, 32, 48, 28, 52, 38, 44, 34, 46] }
    }
  },
  // Probing Results Data.
  // Two views of probe log-probabilities for the probing models:
  //   byTurn     — indexed by dialogue turn (1..10)
  //   byProgress — indexed by task progress (0..90, presumably percent —
  //                confirm with the generator)
  // Each entry has parallel arrays: the x-axis (`turns` or `progress`),
  // `logprob` (mean log-probability), and `sem` (standard error of the mean,
  // for error bars).
  probing: {
    byTurn: {
      mimic: {
        'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.5, -11.8, -11.2, -10.5, -10.0, -9.5, -9.2, -8.8, -8.5, -8.2], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
        'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.8, -11.2, -10.5, -9.8, -9.2, -8.8, -8.4, -8.0, -7.7, -7.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
        'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-13.2, -12.5, -11.8, -11.0, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
        'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.2, -10.5, -9.8, -9.0, -8.5, -8.0, -7.6, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
        'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.5, -9.8, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8, -6.5, -6.2], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
      },
      globem: {
        'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.8, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8, -7.5, -7.2], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
        'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.8, -10.2, -9.5, -8.8, -8.2, -7.8, -7.4, -7.0, -6.7, -6.5], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
        'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.2, -11.5, -10.8, -10.0, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
        'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.0, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.6, -6.2, -6.0], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] },
        'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-9.5, -8.8, -8.2, -7.5, -7.0, -6.5, -6.2, -5.8, -5.5, -5.2], sem: [0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
      },
      '10k': {
        'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.3, -10.7, -10.0, -9.5, -9.0, -8.7, -8.3, -8.0, -7.7], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
        'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.3, -10.7, -10.0, -9.3, -8.7, -8.3, -7.9, -7.5, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
        'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.7, -12.0, -11.3, -10.5, -9.7, -9.0, -8.5, -8.0, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
        'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.7, -10.0, -9.3, -8.5, -8.0, -7.5, -7.1, -6.7, -6.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
        'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.0, -9.3, -8.7, -8.0, -7.5, -7.0, -6.7, -6.3, -6.0, -5.7], sem: [0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
      }
    },
    byProgress: {
      mimic: {
        'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0], sem: [0.8, 0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
        'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.2, -8.7, -8.2, -7.8, -7.5], sem: [0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
        'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-13.0, -12.5, -12.0, -11.5, -10.8, -10.0, -9.3, -8.7, -8.2, -7.8], sem: [0.9, 0.8, 0.8, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4] },
        'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.2, -11.7, -11.0, -10.3, -9.5, -8.8, -8.2, -7.6, -7.2, -6.8], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
        'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.8, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.5, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] }
      },
      globem: {
        'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5, -7.0], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
        'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.0, -10.5, -10.0, -9.5, -8.8, -8.2, -7.7, -7.2, -6.8, -6.5], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2] },
        'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.0, -8.3, -7.7, -7.2, -6.8], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
        'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.2, -10.7, -10.0, -9.3, -8.5, -7.8, -7.2, -6.6, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
        'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-9.8, -9.2, -8.5, -7.8, -7.0, -6.5, -6.0, -5.5, -5.2, -4.8], sem: [0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] }
      },
      '10k': {
        'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3] },
        'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.3, -8.7, -8.2, -7.7, -7.3, -7.0], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3] },
        'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.3, -9.5, -8.8, -8.2, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
        'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.7, -11.2, -10.5, -9.8, -9.0, -8.3, -7.7, -7.1, -6.7, -6.3], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2] },
        'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.3, -9.7, -9.0, -8.3, -7.5, -7.0, -6.5, -6.0, -5.7, -5.3], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
      }
    }
  },
  // Probing model colors (hex) for the probing-results charts.
  // The first four entries duplicate the probing-model colors in
  // `modelColors` above. NOTE(review): 'Qwen3-Next-80B-A3B' is '#9141AC'
  // here but '#FFC107' in `modelColors` — presumably intentional (probing
  // vs. main-model role); confirm which map each chart reads.
  probingColors: {
    'Qwen2.5-32B': '#4A90D9',
    'Qwen2.5-72B': '#1A5FB4',
    'Qwen3-4B': '#57E389',
    'Qwen3-30B-A3B': '#26A269',
    'Qwen3-Next-80B-A3B': '#9141AC'
  }
};