diff --git "a/data.js" "b/data.js" --- "a/data.js" +++ "b/data.js" @@ -1,297 +1,3625 @@ -// DDR-Bench Visualization Data - Auto-generated from original data sources -// Generated from Python analysis scripts +// DDR-Bench Visualization Data +// Auto-generated by export_web_data.py - matches Python plotting scripts exactly const DDR_DATA = { modelColors: { - 'GPT-5.2': '#00C853', - 'Claude-4.5-Sonnet': '#FF6D00', - 'Gemini-3-Flash': '#2196F3', - 'GLM-4.6': '#9C27B0', - 'DeepSeek-V3.2': '#E91E63', - 'Qwen3-Next-80B-A3B': '#FFC107', - 'Kimi-K2': '#FFA500', - 'MiniMax-M2': '#20B2AA', - 'Qwen2.5-32B': '#4A90D9', - 'Qwen2.5-72B': '#1A5FB4', - 'Qwen3-4B': '#57E389', - 'Qwen3-30B-A3B': '#26A269', + "GPT-5.2": "#00C853", + "Claude-4.5-Sonnet": "#FF6D00", + "Gemini-3-Flash": "#2196F3", + "GLM-4.6": "#9C27B0" }, scaling: { - 'mimic': { - 'GPT-5.2': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [51, 1475, 1796, 2543, 3737, 4926, 5784, 6681, 7562, 8577, 10444, 11611, 12837, 14128, 15459, 16839, 17760, 18642, 19455, 20193], - costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.005, 0.0072, 0.01, 0.0131, 0.0167, 0.0207, 0.0257, 0.031, 0.0371, 0.0439, 0.0516, 0.0595, 0.068, 0.0772, 0.086, 0.0947], - accuracy: [2.02, 3.99, 5.9, 7.75, 9.55, 11.29, 12.97, 14.59, 16.14, 17.62, 19.03, 20.36, 21.62, 22.78, 23.85, 24.82, 25.68, 26.4, 26.96, 27.26] + "mimic": { + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55 + ], + "tokens": [ + 3737, + 8577, + 15459, + 20193, + 24028, + 26951, + 28820, + 29751, + 30405, + 30968, + 31260 + ], + "costs": [ + 0.005, + 0.0207, + 0.0516, + 0.0947, + 0.1522, + 0.2153, + 0.2799, + 0.3597, + 0.4373, + 0.4906, + 0.635 + ], + "accuracy": [ + 10.85, + 15.25, + 18.35, + 20.41, + 23.26, + 24.42, + 25.32, + 25.97, + 26.36, + 26.87, + 27.26 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25 + ], + "tokens": [ + 5580, + 14305, + 23357, + 26964, + 27542 + ], + "costs": [ + 0.002, + 0.008, + 0.0173, + 0.0284, + 0.045 + ], + "accuracy": [ + 7.62, + 13.44, + 19.77, + 24.03, + 24.94 + ] + }, + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75 + ], + "tokens": [ + 4513, + 9214, + 13378, + 17217, + 20275, + 22878, + 25379, + 27559, + 29532, + 31239, + 32395, + 33382, + 33796, + 33968, + 34140 + ], + "costs": [ + 0.0152, + 0.059, + 0.1249, + 0.2138, + 0.3214, + 0.4458, + 0.5823, + 0.7212, + 0.842, + 0.9656, + 1.0851, + 1.1605, + 1.3008, + 1.4081, + 1.3369 + ], + "accuracy": [ + 8.14, + 9.17, + 11.89, + 14.73, + 16.67, + 18.22, + 19.77, + 22.87, + 26.61, + 29.46, + 31.78, + 33.59, + 33.98, + 34.24, + 34.37 + ] + }, + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90, + 95 + ], + "tokens": [ + 3488, + 7059, + 10542, + 13099, + 14972, + 16484, + 17524, + 18410, + 19112, + 19728, + 20259, + 20715, + 21135, + 21489, + 21858, + 22169, + 22422, + 22613, + 22802 + ], + "costs": [ + 0.0026, + 0.0097, + 0.0217, + 0.0369, + 0.0552, + 0.0743, + 0.0969, + 0.1204, + 0.1489, + 0.1769, + 0.2074, + 0.24, + 0.2763, + 0.3114, + 0.3522, + 0.3935, + 0.4408, + 0.4741, + 0.5461 + ], + "accuracy": [ + 9.43, + 11.11, + 13.57, + 16.02, + 17.57, + 18.86, + 19.77, + 20.16, + 20.8, + 21.19, + 21.45, + 22.09, + 22.48, + 22.61, + 22.87, + 23.13, + 23.13, + 23.13, + 23.26 + ] + } + }, + "10k": { + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80 + ], + "tokens": [ + 2561, + 7208, + 10978, + 14050, + 17080, + 19955, + 22501, + 25124, + 27696, + 30109, + 32363, + 34247, + 35154, + 35907, + 36148, + 36277 + ], + "costs": [ + 0.0094, + 0.0414, + 0.0955, + 0.1682, + 0.2576, + 0.3598, + 0.4751, + 0.5932, + 0.7209, + 0.8684, + 1.0029, + 1.0913, + 1.2015, + 1.3713, + 1.4854, + 1.5611 + ], + "accuracy": [ + 0.82, + 1.06, + 1.41, + 3.18, + 5.65, + 9.78, + 15.19, + 22.85, + 31.8, + 46.64, + 60.42, + 69.02, + 73.26, + 75.62, + 76.68, + 77.27 + ] + }, + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60 + ], + "tokens": [ + 2823, + 7902, + 11581, + 14651, + 16306, + 17356, + 17871, + 18251, + 18345, + 18398, + 18441, + 18468 + ], + "costs": [ + 0.0037, + 0.0199, + 0.0454, + 0.0774, + 0.1125, + 0.1524, + 0.1983, + 0.2657, + 0.3601, + 0.4706, + 0.5641, + 0.6699 + ], + "accuracy": [ + 0.82, + 8.36, + 22.85, + 32.98, + 37.57, + 40.52, + 43.23, + 44.29, + 44.41, + 44.52, + 44.76, + 44.99 + ] + }, + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90 + ], + "tokens": [ + 1838, + 3927, + 6056, + 8124, + 10227, + 12399, + 14608, + 16757, + 18614, + 20290, + 21537, + 22418, + 23164, + 23553, + 23781, + 23950, + 24062, + 24102 + ], + "costs": [ + 0.0013, + 0.0053, + 0.012, + 0.0214, + 0.0334, + 0.0481, + 0.0654, + 0.0855, + 0.1047, + 0.1277, + 0.1498, + 0.1724, + 0.2004, + 0.223, + 0.2716, + 0.3281, + 0.3281, + 0.4018 + ], + "accuracy": [ + 0.24, + 0.59, + 2.0, + 4.48, + 8.72, + 13.19, + 19.08, + 26.27, + 35.34, + 41.22, + 47.7, + 52.3, + 54.77, + 56.54, + 57.83, + 59.25, + 60.19, + 60.42 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "tokens": [ + 4473, + 12616, + 18404, + 20077, + 20729, + 20883, + 20935, + 21004 + ], + "costs": [ + 0.002, + 0.008, + 0.0171, + 0.0275, + 0.0385, + 0.0405, + 0.0538, + 0.0688 + ], + "accuracy": [ + 1.88, + 18.61, + 38.4, + 42.17, + 43.7, + 44.05, + 44.05, + 44.41 + ] + } + }, + "globem": { + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "tokens": [ + 3851, + 8817, + 16221, + 26186, + 32076, + 34058, + 34425 + ], + "costs": [ + 0.0027, + 0.0115, + 0.028, + 0.0544, + 0.0891, + 0.1409, + 0.1581 + ], + "accuracy": [ + 2.3, + 2.3, + 5.52, + 22.99, + 36.32, + 40.0, + 41.61 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "tokens": [ + 6260, + 14825, + 25972, + 35526, + 40312, + 41787, + 42167, + 42236 + ], + "costs": [ + 0.0021, + 0.0074, + 0.0166, + 0.0292, + 0.047, + 0.0828, + 0.1479, + 0.2258 + ], + "accuracy": [ + 1.88, + 2.12, + 5.88, + 21.41, + 30.35, + 34.35, + 35.06, + 35.29 + ] + }, + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30 + ], + "tokens": [ + 4579, + 11405, + 21188, + 32526, + 44888, + 49137 + ], + "costs": [ + 0.0152, + 0.0638, + 0.1611, + 0.3124, + 0.4877, + 0.6335 + ], + "accuracy": [ + 2.53, + 2.53, + 3.45, + 12.64, + 30.57, + 40.23 + ] + }, + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20 + ], + "tokens": [ + 3477, + 10218, + 17672, + 19878 + ], + "costs": [ + 0.0048, + 0.0236, + 0.0652, + 0.1238 + ], + "accuracy": [ + 0.92, + 5.98, + 34.02, + 38.39 + ] + } } - ,'Claude-4.5-Sonnet': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [33, 1527, 1714, 3192, 4513, 5965, 6664, 7386, 8417, 9214, 9822, 10619, 11532, 12516, 13378, 14190, 15000, 15722, 16457, 17217], - costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.03, 0.0386, 0.0484, 0.059, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.141, 0.158, 0.1758, 0.1944, 0.2138], - accuracy: [2.55, 5.02, 7.44, 9.78, 12.05, 14.24, 16.36, 18.4, 20.35, 22.22, 23.99, 25.68, 27.25, 28.72, 30.07, 31.3, 32.37, 33.28, 33.99, 34.37] - } - ,'Gemini-3-Flash': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [457, 2153, 2605, 4331, 5580, 7502, 8911, 10725, 12697, 14305, 16480, 18695, 20559, 22036, 23357, 24415, 25207, 25977, 26542, 26964], - costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.002, 0.003, 0.004, 0.0052, 0.0066, 0.008, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173, 0.0196, 0.0219, 0.024, 0.0263, 0.0284], - accuracy: [1.85, 3.65, 5.4, 7.09, 8.74, 10.33, 11.87, 13.35, 14.77, 16.12, 17.41, 18.63, 19.78, 20.84, 21.82, 22.71, 23.49, 24.15, 24.66, 24.94] - } - ,'GLM-4.6': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [59, 1528, 1774, 2778, 3488, 4210, 4664, 5337, 6158, 7059, 7996, 8765, 9344, 9928, 10542, 11095, 11598, 12149, 12657, 13099], - costs: [0.0001, 0.0006, 0.001, 0.0017, 0.0026, 0.0037, 0.0049, 0.0063, 0.0079, 0.0097, 0.0118, 0.014, 0.0164, 0.019, 0.0217, 0.0245, 0.0275, 0.0306, 0.0337, 0.0369], - accuracy: [1.72, 3.4, 5.03, 6.62, 8.15, 9.64, 11.07, 12.45, 13.77, 15.04, 16.24, 17.38, 18.44, 19.44, 20.35, 21.18, 21.91, 22.52, 23.0, 23.26] - } - } - ,'10k': { - 'GPT-5.2': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [56, 318, 1162, 1828, 2823, 3790, 4901, 5967, 6858, 7902, 8585, 9384, 10024, 10939, 11581, 12226, 12917, 13514, 14106, 14651], - costs: [0.0005, 0.0007, 0.0013, 0.0021, 0.0037, 0.0057, 0.0081, 0.0113, 0.015, 0.0199, 0.0243, 0.0298, 0.0343, 0.0398, 0.0454, 0.0521, 0.0575, 0.0631, 0.0713, 0.0774], - accuracy: [3.33, 6.58, 9.73, 12.8, 15.77, 18.64, 21.41, 24.08, 26.64, 29.08, 31.41, 33.61, 35.67, 37.6, 39.37, 40.97, 42.38, 43.57, 44.49, 44.99] - } - ,'Claude-4.5-Sonnet': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [40, 361, 1119, 1794, 2561, 3410, 4413, 5431, 6339, 7208, 7983, 8720, 9502, 10235, 10978, 11679, 12286, 12899, 13469, 14050], - costs: [0.0005, 0.0017, 0.0034, 0.006, 0.0094, 0.0138, 0.0192, 0.0256, 0.0331, 0.0414, 0.0506, 0.0606, 0.0714, 0.083, 0.0955, 0.1087, 0.1226, 0.1371, 0.1523, 0.1682], - accuracy: [5.72, 11.3, 16.72, 21.98, 27.08, 32.02, 36.78, 41.36, 45.75, 49.95, 53.94, 57.72, 61.27, 64.57, 67.61, 70.36, 72.78, 74.83, 76.41, 77.27] - } - ,'Gemini-3-Flash': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [561, 1108, 2384, 3420, 4473, 5692, 7504, 9142, 10958, 12616, 14312, 15667, 16667, 17523, 18404, 19118, 19469, 19722, 19908, 20077], - costs: [0.0001, 0.0004, 0.0008, 0.0013, 0.002, 0.0028, 0.004, 0.0052, 0.0066, 0.008, 0.0098, 0.0111, 0.013, 0.0149, 0.0171, 0.0192, 0.0224, 0.0251, 0.0246, 0.0275], - accuracy: [3.29, 6.49, 9.61, 12.63, 15.56, 18.4, 21.14, 23.77, 26.3, 28.71, 31.0, 33.18, 35.21, 37.11, 38.86, 40.44, 41.83, 43.01, 43.91, 44.41] - } - ,'GLM-4.6': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [58, 339, 973, 1327, 1838, 2223, 2604, 3020, 3477, 3927, 4339, 4764, 5206, 5662, 6056, 6495, 6894, 7329, 7709, 8124], - costs: [0.0001, 0.0003, 0.0005, 0.0009, 0.0013, 0.0019, 0.0026, 0.0034, 0.0042, 0.0053, 0.0064, 0.0076, 0.0089, 0.0104, 0.012, 0.0136, 0.0154, 0.0173, 0.0193, 0.0214], - accuracy: [4.47, 8.83, 13.07, 17.19, 21.18, 25.03, 28.76, 32.34, 35.78, 39.06, 42.18, 45.13, 47.91, 50.49, 52.87, 55.02, 56.91, 58.51, 59.74, 60.42] - } - } - ,'globem': { - 'GPT-5.2': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [58, 875, 1559, 2562, 3477, 4756, 6053, 7393, 8608, 10218, 11988, 13748, 15107, 16631, 17672, 18592, 19144, 19498, 19696, 19878], - costs: [0.0005, 0.0013, 0.002, 0.0032, 0.0048, 0.007, 0.0098, 0.0135, 0.0178, 0.0236, 0.0294, 0.0385, 0.0468, 0.0562, 0.0652, 0.0767, 0.0879, 0.1002, 0.1082, 0.1238], - accuracy: [2.84, 5.61, 8.31, 10.92, 13.45, 15.91, 18.27, 20.55, 22.73, 24.82, 26.8, 28.68, 30.44, 32.08, 33.59, 34.96, 36.16, 37.18, 37.96, 38.39] - } - ,'Claude-4.5-Sonnet': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [54, 930, 2128, 3337, 4579, 5649, 6915, 8193, 9731, 11405, 13210, 15065, 17143, 19238, 21188, 23277, 25394, 27614, 30130, 32526], - costs: [0.0008, 0.0032, 0.006, 0.0099, 0.0152, 0.0216, 0.0296, 0.0393, 0.0507, 0.0638, 0.0789, 0.096, 0.1155, 0.1372, 0.1611, 0.1873, 0.2158, 0.247, 0.2805, 0.3124], - accuracy: [2.98, 5.88, 8.7, 11.44, 14.1, 16.67, 19.15, 21.53, 23.82, 26.01, 28.09, 30.05, 31.9, 33.62, 35.2, 36.63, 37.89, 38.96, 39.78, 40.23] - } - ,'Gemini-3-Flash': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [549, 1839, 3441, 4928, 6260, 8046, 9776, 11341, 13250, 14825, 16374, 18786, 20565, 24046, 25972, 28004, 30001, 31784, 33556, 35526], - costs: [0.0002, 0.0005, 0.0009, 0.0015, 0.0021, 0.0029, 0.0038, 0.0049, 0.0061, 0.0074, 0.0089, 0.0105, 0.0123, 0.0144, 0.0166, 0.019, 0.0213, 0.0235, 0.0263, 0.0292], - accuracy: [2.61, 5.16, 7.63, 10.04, 12.37, 14.62, 16.8, 18.89, 20.9, 22.81, 24.64, 26.36, 27.98, 29.49, 30.88, 32.13, 33.24, 34.17, 34.9, 35.29] - } - ,'GLM-4.6': { - turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], - tokens: [58, 903, 1849, 2854, 3851, 4830, 5779, 6760, 7791, 8817, 10040, 11362, 12855, 14434, 16221, 18101, 20062, 22187, 24211, 26186], - costs: [0.0001, 0.0005, 0.001, 0.0017, 0.0027, 0.004, 0.0055, 0.0072, 0.0092, 0.0115, 0.0141, 0.017, 0.0203, 0.0238, 0.028, 0.0325, 0.0372, 0.0423, 0.0482, 0.0544], - accuracy: [3.08, 6.08, 9.0, 11.84, 14.58, 17.24, 19.8, 22.27, 24.64, 26.9, 29.05, 31.08, 32.99, 34.77, 36.41, 37.89, 39.19, 40.29, 41.14, 41.61] - } - } }, ranking: { - 'MIMIC': [ - {model: 'gpt5-mini', bt_rank: 1, win_rate: 100.0, accuracy: 27.59, acc_rank: 7, is_proprietary: true} - ,{model: 'claude4.5-sonnet', bt_rank: 2, win_rate: 94.6, accuracy: 33.66, acc_rank: 1, is_proprietary: true} - ,{model: 'gpt5mini', bt_rank: 3, win_rate: 87.8, accuracy: 27.59, acc_rank: 8, is_proprietary: true} - ,{model: 'gpt5.2', bt_rank: 4, win_rate: 83.6, accuracy: 28.88, acc_rank: 5, is_proprietary: true} - ,{model: 'gpt5.1', bt_rank: 5, win_rate: 80.6, accuracy: 30.1, acc_rank: 3, is_proprietary: true} - ,{model: 'gemini3-flash', bt_rank: 6, win_rate: 76.5, accuracy: 29.28, acc_rank: 4, is_proprietary: true} - ,{model: 'kimi-k2', bt_rank: 7, win_rate: 73.1, accuracy: 30.17, acc_rank: 2, is_proprietary: false} - ,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 8, win_rate: 70.5, accuracy: 27.65, acc_rank: 6, is_proprietary: false} - ,{model: 'gemini2.5-pro', bt_rank: 9, win_rate: 63.9, accuracy: 19.0, acc_rank: 14, is_proprietary: true} - ,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 10, win_rate: 59.5, accuracy: 18.8, acc_rank: 15, is_proprietary: false} - ,{model: 'minimax-m2', bt_rank: 11, win_rate: 59.7, accuracy: 23.52, acc_rank: 10, is_proprietary: false} - ,{model: 'glm4.6', bt_rank: 12, win_rate: 52.1, accuracy: 23.84, acc_rank: 9, is_proprietary: false} - ,{model: 'qwen3', bt_rank: 13, win_rate: 51.7, accuracy: 19.13, acc_rank: 13, is_proprietary: false} - ,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 14, win_rate: 40.3, accuracy: 20, acc_rank: 11, is_proprietary: false} - ,{model: 'gemini2.5-flash-lite', bt_rank: 15, win_rate: 35.4, accuracy: 16.64, acc_rank: 18, is_proprietary: true} - ,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 32.4, accuracy: 14.15, acc_rank: 20, is_proprietary: false} - ,{model: 'qwen2.5-32b-instruct', bt_rank: 17, win_rate: 32.3, accuracy: 13.12, acc_rank: 21, is_proprietary: false} - ,{model: 'gemini2.5-flash', bt_rank: 18, win_rate: 31.2, accuracy: 18.61, acc_rank: 16, is_proprietary: true} - ,{model: 'qwen2.5-72B-Instruct', bt_rank: 19, win_rate: 29.5, accuracy: 14.92, acc_rank: 19, is_proprietary: false} - ,{model: 'qwen3-4B-Instruct-2507', bt_rank: 20, win_rate: 27.3, accuracy: 16.93, acc_rank: 17, is_proprietary: false} - ,{model: 'qwen2.5-7B-Instruct-1M', bt_rank: 21, win_rate: 17.3, accuracy: 20, acc_rank: 12, is_proprietary: false} - ] - ,'10K': [ - {model: 'claude4.5-sonnet', bt_rank: 1, win_rate: 92.8, accuracy: 69.26, acc_rank: 1, is_proprietary: true} - ,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 2, win_rate: 80.6, accuracy: 49.41, acc_rank: 2, is_proprietary: false} - ,{model: 'gpt5mini', bt_rank: 3, win_rate: 80.4, accuracy: 41.56, acc_rank: 5, is_proprietary: true} - ,{model: 'gpt5.2', bt_rank: 4, win_rate: 78.0, accuracy: 43.11, acc_rank: 4, is_proprietary: true} - ,{model: 'kimi-k2', bt_rank: 5, win_rate: 77.0, accuracy: 41.17, acc_rank: 7, is_proprietary: false} - ,{model: 'glm4.6', bt_rank: 6, win_rate: 71.4, accuracy: 48.29, acc_rank: 3, is_proprietary: false} - ,{model: 'gemini3-flash', bt_rank: 7, win_rate: 63.6, accuracy: 39.5, acc_rank: 8, is_proprietary: true} - ,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 8, win_rate: 59.2, accuracy: 38.34, acc_rank: 9, is_proprietary: false} - ,{model: 'minimax-m2', bt_rank: 9, win_rate: 54.4, accuracy: 35.74, acc_rank: 10, is_proprietary: false} - ,{model: 'gpt5.1', bt_rank: 10, win_rate: 54.0, accuracy: 41.23, acc_rank: 6, is_proprietary: true} - ,{model: 'qwen3', bt_rank: 11, win_rate: 51.0, accuracy: 28.23, acc_rank: 12, is_proprietary: false} - ,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 12, win_rate: 45.6, accuracy: 20, acc_rank: 15, is_proprietary: false} - ,{model: 'gemini2.5-pro', bt_rank: 13, win_rate: 44.8, accuracy: 20.91, acc_rank: 13, is_proprietary: true} - ,{model: 'qwen2.5-32b-instruct', bt_rank: 14, win_rate: 41.2, accuracy: 17.83, acc_rank: 17, is_proprietary: false} - ,{model: 'qwen2.5-72B-Instruct', bt_rank: 15, win_rate: 34.6, accuracy: 20.79, acc_rank: 14, is_proprietary: false} - ,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 31.6, accuracy: 14.65, acc_rank: 18, is_proprietary: false} - ,{model: 'qwen3-4B-Instruct-2507', bt_rank: 17, win_rate: 30.0, accuracy: 30.43, acc_rank: 11, is_proprietary: false} - ,{model: 'gemini2.5-flash-lite', bt_rank: 18, win_rate: 29.6, accuracy: 14.37, acc_rank: 19, is_proprietary: true} - ,{model: 'qwen2.5-7B-Instruct-1M', bt_rank: 19, win_rate: 27.4, accuracy: 20, acc_rank: 16, is_proprietary: false} - ,{model: 'gemini2.5-flash', bt_rank: 20, win_rate: 25.2, accuracy: 12.61, acc_rank: 20, is_proprietary: true} - ,{model: 'qwen2.5-7B-Instruct', bt_rank: 21, win_rate: 22.0, accuracy: 7.53, acc_rank: 21, is_proprietary: false} - ] - ,'GLOBEM': [ - {model: 'claude4.5-sonnet', bt_rank: 1, win_rate: 93.0, accuracy: 39.54, acc_rank: 2, is_proprietary: true} - ,{model: 'gpt5-mini', bt_rank: 2, win_rate: 60.0, accuracy: 33.91, acc_rank: 12, is_proprietary: true} - ,{model: 'gemini3-flash', bt_rank: 3, win_rate: 81.2, accuracy: 35.46, acc_rank: 9, is_proprietary: true} - ,{model: 'minimax-m2', bt_rank: 4, win_rate: 77.8, accuracy: 36.9, acc_rank: 6, is_proprietary: false} - ,{model: 'gpt5mini', bt_rank: 5, win_rate: 73.8, accuracy: 33.91, acc_rank: 13, is_proprietary: true} - ,{model: 'gpt5.1', bt_rank: 6, win_rate: 67.5, accuracy: 36.76, acc_rank: 7, is_proprietary: true} - ,{model: 'gpt5.2', bt_rank: 7, win_rate: 64.4, accuracy: 38.39, acc_rank: 3, is_proprietary: true} - ,{model: 'qwen3', bt_rank: 8, win_rate: 64.7, accuracy: 36.32, acc_rank: 8, is_proprietary: false} - ,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 9, win_rate: 64.5, accuracy: 38.39, acc_rank: 4, is_proprietary: false} - ,{model: 'glm4.6', bt_rank: 10, win_rate: 53.6, accuracy: 39.77, acc_rank: 1, is_proprietary: false} - ,{model: 'kimi-k2', bt_rank: 11, win_rate: 52.2, accuracy: 37.01, acc_rank: 5, is_proprietary: false} - ,{model: 'gemini2.5-pro', bt_rank: 12, win_rate: 45.6, accuracy: 34.6, acc_rank: 10, is_proprietary: true} - ,{model: 'qwen2.5-72B-Instruct', bt_rank: 13, win_rate: 43.3, accuracy: 27.13, acc_rank: 14, is_proprietary: false} - ,{model: 'qwen2.5-32B-Instruct', bt_rank: 14, win_rate: 42.1, accuracy: 20, acc_rank: 20, is_proprietary: false} - ,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 15, win_rate: 41.5, accuracy: 34.14, acc_rank: 11, is_proprietary: false} - ,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 40.8, accuracy: 26.13, acc_rank: 16, is_proprietary: false} - ,{model: 'gemini2.5-flash-lite', bt_rank: 17, win_rate: 37.4, accuracy: 25.52, acc_rank: 18, is_proprietary: true} - ,{model: 'qwen3-4B-Instruct-2507', bt_rank: 18, win_rate: 36.6, accuracy: 26.9, acc_rank: 15, is_proprietary: false} - ,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 19, win_rate: 32.0, accuracy: 20, acc_rank: 21, is_proprietary: false} - ,{model: 'llama3.3-70B', bt_rank: 20, win_rate: 28.1, accuracy: 22.65, acc_rank: 19, is_proprietary: false} - ,{model: 'qwen2.5-7B-Instruct', bt_rank: 21, win_rate: 22.2, accuracy: 25.64, acc_rank: 17, is_proprietary: false} - ] + "MIMIC": [ + { + "model": "gpt5-mini", + "bt_rank": 1, + "win_rate": 100.0, + "accuracy": 27.59, + "acc_rank": 7, + "is_proprietary": true + }, + { + "model": "claude4.5-sonnet", + "bt_rank": 2, + "win_rate": 94.6, + "accuracy": 33.66, + "acc_rank": 1, + "is_proprietary": true + }, + { + "model": "gpt5mini", + "bt_rank": 3, + "win_rate": 87.8, + "accuracy": 27.59, + "acc_rank": 8, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 4, + "win_rate": 83.6, + "accuracy": 28.88, + "acc_rank": 5, + "is_proprietary": true + }, + { + "model": "gpt5.1", + "bt_rank": 5, + "win_rate": 80.6, + "accuracy": 30.1, + "acc_rank": 3, + "is_proprietary": true + }, + { + "model": "gemini3-flash", + "bt_rank": 6, + "win_rate": 76.5, + "accuracy": 29.28, + "acc_rank": 4, + "is_proprietary": true + }, + { + "model": "kimi-k2", + "bt_rank": 7, + "win_rate": 73.1, + "accuracy": 30.17, + "acc_rank": 2, + "is_proprietary": false + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 8, + "win_rate": 70.5, + "accuracy": 27.65, + "acc_rank": 6, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 9, + "win_rate": 63.9, + "accuracy": 19.0, + "acc_rank": 14, + "is_proprietary": true + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 10, + "win_rate": 59.5, + "accuracy": 18.8, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "minimax-m2", + "bt_rank": 11, + "win_rate": 59.7, + "accuracy": 23.52, + "acc_rank": 10, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 12, + "win_rate": 52.1, + "accuracy": 23.84, + "acc_rank": 9, + "is_proprietary": false + }, + { + "model": "qwen3", + "bt_rank": 13, + "win_rate": 51.7, + "accuracy": 19.13, + "acc_rank": 13, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 14, + "win_rate": 40.3, + "accuracy": 20, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 15, + "win_rate": 35.4, + "accuracy": 16.64, + "acc_rank": 18, + "is_proprietary": true + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 32.4, + "accuracy": 14.15, + "acc_rank": 20, + "is_proprietary": false + }, + { + "model": "qwen2.5-32b-instruct", + "bt_rank": 17, + "win_rate": 32.3, + "accuracy": 13.12, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash", + "bt_rank": 18, + "win_rate": 31.2, + "accuracy": 18.61, + "acc_rank": 16, + "is_proprietary": true + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 19, + "win_rate": 29.5, + "accuracy": 14.92, + "acc_rank": 19, + "is_proprietary": false + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 20, + "win_rate": 27.3, + "accuracy": 16.93, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 21, + "win_rate": 17.3, + "accuracy": 20, + "acc_rank": 12, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 22, + "win_rate": 14.2, + "accuracy": 7.3, + "acc_rank": 22, + "is_proprietary": false + } + ], + "10K": [ + { + "model": "claude4.5-sonnet", + "bt_rank": 1, + "win_rate": 92.8, + "accuracy": 69.26, + "acc_rank": 1, + "is_proprietary": true + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 2, + "win_rate": 80.6, + "accuracy": 49.41, + "acc_rank": 2, + "is_proprietary": false + }, + { + "model": "gpt5mini", + "bt_rank": 3, + "win_rate": 80.4, + "accuracy": 41.56, + "acc_rank": 5, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 4, + "win_rate": 78.0, + "accuracy": 43.11, + "acc_rank": 4, + "is_proprietary": true + }, + { + "model": "kimi-k2", + "bt_rank": 5, + "win_rate": 77.0, + "accuracy": 41.17, + "acc_rank": 7, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 6, + "win_rate": 71.4, + "accuracy": 48.29, + "acc_rank": 3, + "is_proprietary": false + }, + { + "model": "gemini3-flash", + "bt_rank": 7, + "win_rate": 63.6, + "accuracy": 39.5, + "acc_rank": 8, + "is_proprietary": true + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 8, + "win_rate": 59.2, + "accuracy": 38.34, + "acc_rank": 9, + "is_proprietary": false + }, + { + "model": "minimax-m2", + "bt_rank": 9, + "win_rate": 54.4, + "accuracy": 35.74, + "acc_rank": 10, + "is_proprietary": false + }, + { + "model": "gpt5.1", + "bt_rank": 10, + "win_rate": 54.0, + "accuracy": 41.23, + "acc_rank": 6, + "is_proprietary": true + }, + { + "model": "qwen3", + "bt_rank": 11, + "win_rate": 51.0, + "accuracy": 28.23, + "acc_rank": 12, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 12, + "win_rate": 45.6, + "accuracy": 20, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 13, + "win_rate": 44.8, + "accuracy": 20.91, + "acc_rank": 13, + "is_proprietary": true + }, + { + "model": "qwen2.5-32b-instruct", + "bt_rank": 14, + "win_rate": 41.2, + "accuracy": 17.83, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 15, + "win_rate": 34.6, + "accuracy": 20.79, + "acc_rank": 14, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 31.6, + "accuracy": 14.65, + "acc_rank": 18, + "is_proprietary": false + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 17, + "win_rate": 30.0, + "accuracy": 30.43, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 18, + "win_rate": 29.6, + "accuracy": 14.37, + "acc_rank": 19, + "is_proprietary": true + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 19, + "win_rate": 27.4, + "accuracy": 20, + "acc_rank": 16, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash", + "bt_rank": 20, + "win_rate": 25.2, + "accuracy": 12.61, + "acc_rank": 20, + "is_proprietary": true + }, + { + "model": "qwen2.5-7B-Instruct", + "bt_rank": 21, + "win_rate": 22.0, + "accuracy": 7.53, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 22, + "win_rate": 18.6, + "accuracy": 6.51, + "acc_rank": 22, + "is_proprietary": false + } + ], + "GLOBEM": [ + { + "model": "claude4.5-sonnet", + "bt_rank": 1, + "win_rate": 93.0, + "accuracy": 39.54, + "acc_rank": 2, + "is_proprietary": true + }, + { + "model": "gpt5-mini", + "bt_rank": 2, + "win_rate": 60.0, + "accuracy": 33.91, + "acc_rank": 12, + "is_proprietary": true + }, + { + "model": "gemini3-flash", + "bt_rank": 3, + "win_rate": 81.2, + "accuracy": 35.46, + "acc_rank": 9, + "is_proprietary": true + }, + { + "model": "minimax-m2", + "bt_rank": 4, + "win_rate": 77.8, + "accuracy": 36.9, + "acc_rank": 6, + "is_proprietary": false + }, + { + "model": "gpt5mini", + "bt_rank": 5, + "win_rate": 73.8, + "accuracy": 33.91, + "acc_rank": 13, + "is_proprietary": true + }, + { + "model": "gpt5.1", + "bt_rank": 6, + "win_rate": 67.5, + "accuracy": 36.76, + "acc_rank": 7, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 7, + "win_rate": 64.4, + "accuracy": 38.39, + "acc_rank": 3, + "is_proprietary": true + }, + { + "model": "qwen3", + "bt_rank": 8, + "win_rate": 64.7, + "accuracy": 36.32, + "acc_rank": 8, + "is_proprietary": false + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 9, + "win_rate": 64.5, + "accuracy": 38.39, + "acc_rank": 4, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 10, + "win_rate": 53.6, + "accuracy": 39.77, + "acc_rank": 1, + "is_proprietary": false + }, + { + "model": "kimi-k2", + "bt_rank": 11, + "win_rate": 52.2, + "accuracy": 37.01, + "acc_rank": 5, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 12, + "win_rate": 45.6, + "accuracy": 34.6, + "acc_rank": 10, + "is_proprietary": true + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 13, + "win_rate": 43.3, + "accuracy": 27.13, + "acc_rank": 14, + "is_proprietary": false + }, + { + "model": "qwen2.5-32B-Instruct", + "bt_rank": 14, + "win_rate": 42.1, + "accuracy": 20, + "acc_rank": 20, + "is_proprietary": false + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 15, + "win_rate": 41.5, + "accuracy": 34.14, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 40.8, + "accuracy": 26.13, + "acc_rank": 16, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 17, + "win_rate": 37.4, + "accuracy": 25.52, + "acc_rank": 18, + "is_proprietary": true + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 18, + "win_rate": 36.6, + "accuracy": 26.9, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 19, + "win_rate": 32.0, + "accuracy": 20, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 20, + "win_rate": 28.1, + "accuracy": 22.65, + "acc_rank": 19, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct", + "bt_rank": 21, + "win_rate": 22.2, + "accuracy": 25.64, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 22, + "win_rate": 19.7, + "accuracy": 20, + "acc_rank": 22, + "is_proprietary": false + } + ] }, turn: { - 'mimic': [ - {model: 'Claude4.5-Sonnet', median: 52, distribution: [0.0, 0.0, 1.0, 5.0, 31.0, 43.0, 13.0, 7.0, 0.0, 0.0]} - ,{model: 'GPT5-mini', median: 39, distribution: [0.0, 0.0, 9.0, 42.0, 36.0, 12.0, 1.0, 0.0, 0.0, 0.0]} - ,{model: 'GLM4.6', median: 39, distribution: [0.0, 6.3, 23.4, 20.7, 7.2, 13.5, 3.6, 6.3, 4.5, 14.4]} - ,{model: 'DeepSeekV3.2', median: 33, distribution: [0.0, 2.0, 22.0, 60.0, 16.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.2', median: 30, distribution: [0.0, 10.0, 36.0, 32.0, 12.0, 10.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.1', median: 23, distribution: [1.5, 39.7, 29.4, 19.9, 9.6, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Kimi-K2', median: 19, distribution: [0.0, 55.0, 44.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'MiniMax-M2', median: 18, distribution: [0.0, 70.0, 30.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-Next-80B-A3B', median: 17, distribution: [12.0, 52.0, 24.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-30B-A3B', median: 17, distribution: [12.0, 52.0, 24.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini3-Flash', median: 15, distribution: [7.0, 71.0, 22.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini2.5-Pro', median: 15, distribution: [10.6, 70.2, 19.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen2.5-72B', median: 11, distribution: [15.0, 85.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Llama3.3-70B', median: 6, distribution: [99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ] - ,'10k': [ - {model: 'Claude4.5-Sonnet', median: 56, distribution: [0.0, 0.0, 1.0, 6.0, 13.0, 44.0, 27.0, 6.0, 3.0, 0.0]} - ,{model: 'GLM4.6', median: 52, distribution: [0.0, 0.0, 3.8, 10.4, 27.4, 27.4, 18.9, 5.7, 4.7, 1.9]} - ,{model: 'DeepSeekV3.2', median: 39, distribution: [0.0, 0.0, 11.0, 40.0, 37.0, 9.0, 3.0, 0.0, 0.0, 0.0]} - ,{model: 'Kimi-K2', median: 24, distribution: [0.0, 29.0, 48.0, 21.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.2', median: 20, distribution: [0.0, 43.0, 41.0, 12.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0]} - ,{model: 'MiniMax-M2', median: 20, distribution: [0.0, 43.0, 48.0, 9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.1', median: 17, distribution: [1.0, 69.0, 29.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini2.5-Pro', median: 15, distribution: [7.0, 73.0, 18.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini3-Flash', median: 13, distribution: [10.0, 82.0, 7.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-Next-80B-A3B', median: 8, distribution: [81.0, 19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-30B-A3B', median: 8, distribution: [81.0, 19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen2.5-72B', median: 7, distribution: [75.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Llama3.3-70B', median: 1, distribution: [92.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ] - ,'globem': [ - {model: 'Claude4.5-Sonnet', median: 25, distribution: [0.0, 6.0, 87.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini3-Flash', median: 21, distribution: [2.0, 36.0, 58.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GLM4.6', median: 21, distribution: [0.0, 23.0, 66.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'DeepSeekV3.2', median: 20, distribution: [0.0, 32.0, 68.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5-mini', median: 17, distribution: [2.0, 78.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Kimi-K2', median: 17, distribution: [0.0, 82.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'MiniMax-M2', median: 17, distribution: [0.0, 80.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.2', median: 15, distribution: [0.0, 92.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen2.5-72B', median: 14, distribution: [4.0, 78.0, 17.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Gemini2.5-Pro', median: 12, distribution: [3.0, 94.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0.0, 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Qwen3-30B-A3B', median: 12, distribution: [0.0, 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'GPT5.1', median: 11, distribution: [30.0, 70.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ,{model: 'Llama3.3-70B', median: 6, distribution: [98.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]} - ] - }, - entropy: { - 'mimic': { - 'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]} - ,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]} - ,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]} - ,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]} - ,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]} - } - ,'10k': { - 'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]} - ,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]} - ,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]} - ,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]} - ,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]} - } - ,'globem': { - 'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]} - ,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]} - ,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]} - ,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]} - ,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]} - } + "mimic": [ + { + "model": "claude4.5-sonnet", + "median": 52, + "distribution": [ + 0.0, + 0.0, + 1.0, + 5.0, + 31.0, + 43.0, + 13.0, + 7.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 43, + "distribution": [ + 0.0, + 1.0, + 12.0, + 29.0, + 13.0, + 9.0, + 3.0, + 2.0, + 0.0, + 31.0 + ] + }, + { + "model": "gpt5-mini", + "median": 39, + "distribution": [ + 0.0, + 0.0, + 9.0, + 42.0, + 36.0, + 12.0, + 1.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 39, + "distribution": [ + 0.0, + 6.3, + 23.4, + 20.7, + 7.2, + 13.5, + 3.6, + 6.3, + 4.5, + 14.4 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 33, + "distribution": [ + 0.0, + 2.0, + 22.0, + 60.0, + 16.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.2", + "median": 30, + "distribution": [ + 0.0, + 10.0, + 36.0, + 32.0, + 12.0, + 10.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 23, + "distribution": [ + 1.5, + 39.7, + 29.4, + 19.9, + 9.6, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "kimi-k2", + "median": 19, + "distribution": [ + 0.0, + 55.0, + 44.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "minimax-m2", + "median": 18, + "distribution": [ + 0.0, + 70.0, + 30.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 17, + "distribution": [ + 12.0, + 52.0, + 24.0, + 10.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 15, + "distribution": [ + 10.6, + 70.2, + 19.2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 15, + "distribution": [ + 7.0, + 71.0, + 22.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 14, + "distribution": [ + 0.0, + 98.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 14, + "distribution": [ + 7.0, + 68.0, + 22.0, + 2.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 12, + "distribution": [ + 23.0, + 62.0, + 12.0, + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 0.0, + 1.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 11, + "distribution": [ + 15.0, + 85.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 11, + "distribution": [ + 24.0, + 74.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32b-instruct", + "median": 11, + "distribution": [ + 33.0, + 67.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 11, + "distribution": [ + 29.0, + 71.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 10, + "distribution": [ + 34.0, + 65.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 10, + "distribution": [ + 34.0, + 65.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 9, + "distribution": [ + 64.0, + 36.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 8, + "distribution": [ + 73.0, + 27.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 7, + "distribution": [ + 90.0, + 10.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama-3.3-70B", + "median": 6, + "distribution": [ + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 6, + "distribution": [ + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 4, + "distribution": [ + 91.0, + 9.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ], + "10k": [ + { + "model": "claude4.5-sonnet", + "median": 56, + "distribution": [ + 0.0, + 0.0, + 1.0, + 6.0, + 13.0, + 44.0, + 27.0, + 6.0, + 3.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 52, + "distribution": [ + 0.0, + 0.0, + 3.8, + 10.4, + 27.4, + 27.4, + 18.9, + 5.7, + 4.7, + 1.9 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 39, + "distribution": [ + 0.0, + 0.0, + 11.0, + 40.0, + 37.0, + 9.0, + 3.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5mini", + "median": 35, + "distribution": [ + 0.0, + 4.0, + 27.8, + 36.5, + 24.6, + 6.3, + 0.8, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 26, + "distribution": [ + 0.8, + 25.4, + 30.2, + 7.9, + 2.4, + 0.0, + 0.0, + 0.8, + 0.0, + 32.5 + ] + }, + { + "model": "kimi-k2", + "median": 24, + "distribution": [ + 0.0, + 29.0, + 48.0, + 21.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "minimax-m2", + "median": 20, + "distribution": [ + 0.0, + 43.0, + 48.0, + 9.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 20, + "distribution": [ + 0.0, + 46.0, + 27.0, + 11.0, + 2.0, + 6.0, + 3.0, + 1.0, + 1.0, + 3.0 + ] + }, + { + "model": "gpt5.2", + "median": 20, + "distribution": [ + 0.0, + 43.0, + 41.0, + 12.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 17, + "distribution": [ + 1.0, + 69.0, + 29.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 16, + "distribution": [ + 17.0, + 44.0, + 27.0, + 10.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 15, + "distribution": [ + 7.0, + 73.0, + 18.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 14, + "distribution": [ + 14.0, + 78.0, + 8.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 13, + "distribution": [ + 10.0, + 82.0, + 7.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 12, + "distribution": [ + 21.0, + 69.0, + 8.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 12, + "distribution": [ + 4.0, + 91.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 11, + "distribution": [ + 31.0, + 64.3, + 4.0, + 0.8, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 11, + "distribution": [ + 28.0, + 67.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32b-instruct", + "median": 10, + "distribution": [ + 34.1, + 65.9, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 9, + "distribution": [ + 58.0, + 41.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 9, + "distribution": [ + 58.0, + 42.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 8, + "distribution": [ + 81.0, + 19.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 8, + "distribution": [ + 70.0, + 29.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 7, + "distribution": [ + 75.0, + 25.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 7, + "distribution": [ + 84.0, + 16.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 1, + "distribution": [ + 92.0, + 7.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ], + "globem": [ + { + "model": "claude4.5-sonnet", + "median": 25, + "distribution": [ + 0.0, + 6.0, + 87.0, + 7.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 21, + "distribution": [ + 2.0, + 36.0, + 58.0, + 3.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 21, + "distribution": [ + 0.0, + 23.0, + 66.0, + 11.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 20, + "distribution": [ + 0.0, + 32.0, + 68.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 19, + "distribution": [ + 16.0, + 36.0, + 33.0, + 9.0, + 5.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 19, + "distribution": [ + 0.0, + 50.0, + 38.0, + 9.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0 + ] + }, + { + "model": "minimax-m2", + "median": 17, + "distribution": [ + 0.0, + 80.0, + 20.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5-mini", + "median": 17, + "distribution": [ + 2.0, + 78.0, + 20.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "kimi-k2", + "median": 17, + "distribution": [ + 0.0, + 82.0, + 18.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32B-Instruct", + "median": 15, + "distribution": [ + 1.0, + 84.0, + 14.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.2", + "median": 15, + "distribution": [ + 0.0, + 92.0, + 8.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 14, + "distribution": [ + 4.0, + 78.0, + 17.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 14, + "distribution": [ + 7.0, + 80.0, + 12.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 14, + "distribution": [ + 13.0, + 66.0, + 16.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 13, + "distribution": [ + 16.0, + 82.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 12, + "distribution": [ + 0.0, + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 12, + "distribution": [ + 3.0, + 94.0, + 3.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 12, + "distribution": [ + 18.0, + 73.0, + 7.0, + 0.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 12, + "distribution": [ + 15.0, + 85.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 12, + "distribution": [ + 12.0, + 83.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 11, + "distribution": [ + 30.0, + 70.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 9, + "distribution": [ + 57.0, + 42.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 9, + "distribution": [ + 69.0, + 30.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 9, + "distribution": [ + 66.0, + 34.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 9, + "distribution": [ + 53.0, + 45.0, + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 6, + "distribution": [ + 98.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ] }, probing: { - byTurn: { - 'mimic': { - 'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.3, -4.21, -4.04, -3.87, -3.59, -3.62, -3.33, -3.4, -2.93, -3.21], sem: [0.25, 0.27, 0.32, 0.35, 0.35, 0.36, 0.34, 0.35, 0.32, 0.4]} - ,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.51, -3.98, -3.68, -3.8, -3.26, -3.22, -3.12, -3.24, -3.08, -2.84], sem: [0.15, 0.21, 0.21, 0.23, 0.23, 0.21, 0.25, 0.25, 0.28, 0.08]} - ,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.48, -3.25, -3.3, -2.74, -2.75, -2.73, -2.72, -2.67, -2.62, -2.25], sem: [0.04, 0.05, 0.04, 0.07, 0.06, 0.07, 0.07, 0.07, 0.06, 0.06]} - ,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.94, -5.21, -5.51, -5.05, -4.96, -4.95, -4.75, -4.73, -4.6, -4.72], sem: [0.15, 0.18, 0.2, 0.18, 0.19, 0.19, 0.17, 0.18, 0.16, 0.18]} - ,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-2.85, -2.86, -2.74, -2.65, -2.31, -2.14, -1.98, -2.03, -1.88, -1.82], sem: [0.1, 0.1, 0.11, 0.11, 0.11, 0.13, 0.13, 0.18, 0.17, 0.09]} - } - ,'globem': { - 'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.48, -5.83, -5.84, -5.91, -6.01, -6.03, -5.86, -5.73, -5.78, -5.73], sem: [0.24, 0.28, 0.31, 0.33, 0.33, 0.35, 0.33, 0.35, 0.35, 0.36]} - ,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.68, -5.56, -5.65, -5.59, -5.59, -5.49, -5.54, -5.4, -5.57, -5.53], sem: [0.13, 0.18, 0.23, 0.23, 0.25, 0.25, 0.29, 0.32, 0.38, 0.46]} - ,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.67, -4.16, -3.9, -3.76, -3.6, -3.47, -3.05, -2.99, -2.93, -2.78], sem: [0.08, 0.07, 0.06, 0.06, 0.07, 0.08, 0.07, 0.08, 0.08, 0.09]} - ,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.28, -5.23, -5.2, -5.19, -5.2, -5.01, -5.21, -4.95, -4.93, -4.81], sem: [0.09, 0.09, 0.09, 0.08, 0.08, 0.08, 0.09, 0.09, 0.1, 0.1]} - ,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46, -1.89], sem: [0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14, 0.1]} - } - ,'10k': { - 'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-6.59, -7.15, -6.99, -6.95, -6.82, -6.88, -6.71, -6.58, -6.67, -6.45], sem: [0.26, 0.28, 0.29, 0.3, 0.29, 0.29, 0.29, 0.32, 0.36, 0.41]} - ,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.51, -7.02, -6.45, -6.11, -5.98, -6.52, -7.02, -7.88, -8.05, -7.66], sem: [0.26, 0.34, 0.34, 0.36, 0.4, 0.53, 0.62, 0.71, 0.81, 0.92]} - ,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.68, -4.3, -3.57, -3.33, -3.27, -3.22, -3.06, -2.9, -2.75, -2.57], sem: [0.18, 0.17, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14]} - ,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.23, -3.31, -3.28, -3.16, -3.06, -2.97, -2.94, -2.87, -2.83, -2.73], sem: [0.17, 0.17, 0.17, 0.17, 0.17, 0.16, 0.17, 0.18, 0.18, 0.17]} - ,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.25, -3.42, -3.21, -2.94, -2.81, -2.75, -2.7, -2.65, -2.55, -2.45], sem: [0.16, 0.17, 0.17, 0.17, 0.16, 0.17, 0.16, 0.16, 0.16, 0.16]} - } - }, - byProgress: { - 'mimic': { - 'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.3, -4.12, -3.73, -3.62, -3.36, -3.05, -2.94, -3.12, -4.6, -4.42], sem: [0.25, 0.21, 0.25, 0.36, 0.24, 0.25, 0.38, 0.45, 1.5, 0.1]} - ,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.51, -3.98, -3.74, -3.26, -3.17, -3.24, -2.99, -2.53, -2.58, -2.42], sem: [0.15, 0.21, 0.16, 0.23, 0.17, 0.25, 0.18, 0.09, 0.09, 0.2]} - ,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.37, -2.93, -2.71, -2.33, -1.99, -2.04, -1.57, -1.46, -1.48, -1.44], sem: [0.03, 0.04, 0.04, 0.04, 0.05, 0.08, 0.1, 0.05, 0.0, 0.01]} - ,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.13, -4.72, -4.42, -4.17, -4.04, -3.9, -3.64, -3.45, -3.36, -3.17], sem: [0.08, 0.07, 0.07, 0.07, 0.07, 0.08, 0.1, 0.14, 0.15, 0.26]} - ,'Qwen3-Next-80B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-2.85, -2.8, -2.65, -2.22, -1.98, -1.96, -1.79, -1.74, -1.83, -1.85], sem: [0.1, 0.07, 0.11, 0.09, 0.13, 0.12, 0.08, 0.16, 0.15, 0.39]} - } - ,'globem': { - 'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.66, -5.92, -5.88, -5.79, -5.79, -5.55, -5.47, -4.8, -3.55, -3.24], sem: [0.18, 0.19, 0.2, 0.21, 0.29, 0.29, 0.47, 0.63, 0.19, 0.47]} - ,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.68, -5.56, -5.62, -5.59, -5.51, -5.4, -5.56, -5.03, -5.77, -7.71], sem: [0.13, 0.18, 0.16, 0.25, 0.19, 0.32, 0.29, 0.55, 0.83, 0.1]} - ,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.42, -3.83, -3.38, -2.96, -2.71, -2.6, -2.46, -2.53, -2.63, -2.61], sem: [0.06, 0.04, 0.04, 0.05, 0.07, 0.08, 0.12, 0.14, 0.25, 0.04]} - ,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.26, -5.2, -5.06, -4.82, -4.5, -4.51, -4.37, -4.1, -4.03, -3.74], sem: [0.06, 0.05, 0.05, 0.06, 0.07, 0.08, 0.1, 0.29, 0.25, 0.11]} - ,'Qwen3-Next-80B-A3B': {progress: [10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46], sem: [0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14]} - } - ,'10k': { - 'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-6.59, -7.07, -6.89, -6.8, -6.58, -6.58, -6.76, -8.0, -8.59, -8.83], sem: [0.26, 0.2, 0.21, 0.2, 0.32, 0.27, 0.39, 0.57, 0.84, 1.12]} - ,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.51, -7.02, -6.28, -5.98, -6.52, -7.33, -8.05, -7.85, -8.41, -7.15], sem: [0.26, 0.34, 0.25, 0.4, 0.53, 0.47, 0.81, 0.79, 1.45, 1.26]} - ,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.49, -3.45, -3.19, -2.83, -2.5, -2.27, -2.31, -2.31, -2.35, -1.73], sem: [0.12, 0.1, 0.08, 0.1, 0.1, 0.11, 0.2, 0.29, 0.36, 0.03]} - ,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.14, -2.66, -2.29, -2.26, -1.97, -1.88, -1.52, -1.36, -1.61, -1.61], sem: [0.06, 0.06, 0.07, 0.1, 0.14, 0.18, 0.08, 0.02, 0.05, 0.08]} - ,'Qwen3-Next-80B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.34, -2.99, -2.7, -2.5, -2.43, -2.55, -2.18, -2.28, -2.19, -2.5], sem: [0.12, 0.1, 0.1, 0.11, 0.11, 0.15, 0.21, 0.22, 0.26, 0.38]} + "byTurn": { + "mimic": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.3, + -4.21, + -4.04, + -3.87, + -3.59, + -3.62, + -3.33, + -3.4, + -2.93, + -3.21 + ], + "sem": [ + 0.25, + 0.27, + 0.32, + 0.35, + 0.35, + 0.36, + 0.34, + 0.35, + 0.32, + 0.4 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.51, + -3.98, + -3.68, + -3.8, + -3.26, + -3.22, + -3.12, + -3.24, + -3.08, + -2.84 + ], + "sem": [ + 0.15, + 0.21, + 0.21, + 0.23, + 0.23, + 0.21, + 0.25, + 0.25, + 0.28, + 0.08 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.48, + -3.25, + -3.3, + -2.74, + -2.75, + -2.73, + -2.72, + -2.67, + -2.62, + -2.25 + ], + "sem": [ + 0.04, + 0.05, + 0.04, + 0.07, + 0.06, + 0.07, + 0.07, + 0.07, + 0.06, + 0.06 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.94, + -5.21, + -5.51, + -5.05, + -4.96, + -4.95, + -4.75, + -4.73, + -4.6, + -4.72 + ], + "sem": [ + 0.15, + 0.18, + 0.2, + 0.18, + 0.19, + 0.19, + 0.17, + 0.18, + 0.16, + 0.18 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -2.85, + -2.86, + -2.74, + -2.65, + -2.31, + -2.14, + -1.98, + -2.03, + -1.88, + -1.82 + ], + "sem": [ + 0.1, + 0.1, + 0.11, + 0.11, + 0.11, + 0.13, + 0.13, + 0.18, + 0.17, + 0.09 + ] + } + }, + "globem": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.48, + -5.83, + -5.84, + -5.91, + -6.01, + -6.03, + -5.86, + -5.73, + -5.78, + -5.73 + ], + "sem": [ + 0.24, + 0.28, + 0.31, + 0.33, + 0.33, + 0.35, + 0.33, + 0.35, + 0.35, + 0.36 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.68, + -5.56, + -5.65, + -5.59, + -5.59, + -5.49, + -5.54, + -5.4, + -5.57, + -5.53 + ], + "sem": [ + 0.13, + 0.18, + 0.23, + 0.23, + 0.25, + 0.25, + 0.29, + 0.32, + 0.38, + 0.46 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.67, + -4.16, + -3.9, + -3.76, + -3.6, + -3.47, + -3.05, + -2.99, + -2.93, + -2.78 + ], + "sem": [ + 0.08, + 0.07, + 0.06, + 0.06, + 0.07, + 0.08, + 0.07, + 0.08, + 0.08, + 0.09 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.28, + -5.23, + -5.2, + -5.19, + -5.2, + -5.01, + -5.21, + -4.95, + -4.93, + -4.81 + ], + "sem": [ + 0.09, + 0.09, + 0.09, + 0.08, + 0.08, + 0.08, + 0.09, + 0.09, + 0.1, + 0.1 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.1, + -3.15, + -3.06, + -3.01, + -2.95, + -2.88, + -2.78, + -2.4, + -2.46, + -1.89 + ], + "sem": [ + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.07, + 0.06, + 0.14, + 0.1 + ] + } + }, + "10k": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -6.59, + -7.15, + -6.99, + -6.95, + -6.82, + -6.88, + -6.71, + -6.58, + -6.67, + -6.45 + ], + "sem": [ + 0.26, + 0.28, + 0.29, + 0.3, + 0.29, + 0.29, + 0.29, + 0.32, + 0.36, + 0.41 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.51, + -7.02, + -6.45, + -6.11, + -5.98, + -6.52, + -7.02, + -7.88, + -8.05, + -7.66 + ], + "sem": [ + 0.26, + 0.34, + 0.34, + 0.36, + 0.4, + 0.53, + 0.62, + 0.71, + 0.81, + 0.92 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.68, + -4.3, + -3.57, + -3.33, + -3.27, + -3.22, + -3.06, + -2.9, + -2.75, + -2.57 + ], + "sem": [ + 0.18, + 0.17, + 0.15, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.23, + -3.31, + -3.28, + -3.16, + -3.06, + -2.97, + -2.94, + -2.87, + -2.83, + -2.73 + ], + "sem": [ + 0.17, + 0.17, + 0.17, + 0.17, + 0.17, + 0.16, + 0.17, + 0.18, + 0.18, + 0.17 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.25, + -3.42, + -3.21, + -2.94, + -2.81, + -2.75, + -2.7, + -2.65, + -2.55, + -2.45 + ], + "sem": [ + 0.16, + 0.17, + 0.17, + 0.17, + 0.16, + 0.17, + 0.16, + 0.16, + 0.16, + 0.16 + ] + } + } + }, + "byProgress": { + "mimic": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.3, + -4.12, + -3.73, + -3.62, + -3.36, + -3.05, + -2.94, + -3.12, + -4.6, + -4.42 + ], + "sem": [ + 0.25, + 0.21, + 0.25, + 0.36, + 0.24, + 0.25, + 0.38, + 0.45, + 1.5, + 0.1 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.51, + -3.98, + -3.74, + -3.26, + -3.17, + -3.24, + -2.99, + -2.53, + -2.58, + -2.42 + ], + "sem": [ + 0.15, + 0.21, + 0.16, + 0.23, + 0.17, + 0.25, + 0.18, + 0.09, + 0.09, + 0.2 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.37, + -2.93, + -2.71, + -2.33, + -1.99, + -2.04, + -1.57, + -1.46, + -1.48, + -1.44 + ], + "sem": [ + 0.03, + 0.04, + 0.04, + 0.04, + 0.05, + 0.08, + 0.1, + 0.05, + 0.0, + 0.01 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.13, + -4.72, + -4.42, + -4.17, + -4.04, + -3.9, + -3.64, + -3.45, + -3.36, + -3.17 + ], + "sem": [ + 0.08, + 0.07, + 0.07, + 0.07, + 0.07, + 0.08, + 0.1, + 0.14, + 0.15, + 0.26 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -2.85, + -2.8, + -2.65, + -2.22, + -1.98, + -1.96, + -1.79, + -1.74, + -1.83, + -1.85 + ], + "sem": [ + 0.1, + 0.07, + 0.11, + 0.09, + 0.13, + 0.12, + 0.08, + 0.16, + 0.15, + 0.39 + ] + } + }, + "globem": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.66, + -5.92, + -5.88, + -5.79, + -5.79, + -5.55, + -5.47, + -4.8, + -3.55, + -3.24 + ], + "sem": [ + 0.18, + 0.19, + 0.2, + 0.21, + 0.29, + 0.29, + 0.47, + 0.63, + 0.19, + 0.47 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.68, + -5.56, + -5.62, + -5.59, + -5.51, + -5.4, + -5.56, + -5.03, + -5.77, + -7.71 + ], + "sem": [ + 0.13, + 0.18, + 0.16, + 0.25, + 0.19, + 0.32, + 0.29, + 0.55, + 0.83, + 0.1 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.42, + -3.83, + -3.38, + -2.96, + -2.71, + -2.6, + -2.46, + -2.53, + -2.63, + -2.61 + ], + "sem": [ + 0.06, + 0.04, + 0.04, + 0.05, + 0.07, + 0.08, + 0.12, + 0.14, + 0.25, + 0.04 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.26, + -5.2, + -5.06, + -4.82, + -4.5, + -4.51, + -4.37, + -4.1, + -4.03, + -3.74 + ], + "sem": [ + 0.06, + 0.05, + 0.05, + 0.06, + 0.07, + 0.08, + 0.1, + 0.29, + 0.25, + 0.11 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.1, + -3.15, + -3.06, + -3.01, + -2.95, + -2.88, + -2.78, + -2.4, + -2.46 + ], + "sem": [ + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.07, + 0.06, + 0.14 + ] + } + }, + "10k": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -6.59, + -7.07, + -6.89, + -6.8, + -6.58, + -6.58, + -6.76, + -8.0, + -8.59, + -8.83 + ], + "sem": [ + 0.26, + 0.2, + 0.21, + 0.2, + 0.32, + 0.27, + 0.39, + 0.57, + 0.84, + 1.12 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.51, + -7.02, + -6.28, + -5.98, + -6.52, + -7.33, + -8.05, + -7.85, + -8.41, + -7.15 + ], + "sem": [ + 0.26, + 0.34, + 0.25, + 0.4, + 0.53, + 0.47, + 0.81, + 0.79, + 1.45, + 1.26 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.49, + -3.45, + -3.19, + -2.83, + -2.5, + -2.27, + -2.31, + -2.31, + -2.35, + -1.73 + ], + "sem": [ + 0.12, + 0.1, + 0.08, + 0.1, + 0.1, + 0.11, + 0.2, + 0.29, + 0.36, + 0.03 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.14, + -2.66, + -2.29, + -2.26, + -1.97, + -1.88, + -1.52, + -1.36, + -1.61, + -1.61 + ], + "sem": [ + 0.06, + 0.06, + 0.07, + 0.1, + 0.14, + 0.18, + 0.08, + 0.02, + 0.05, + 0.08 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.34, + -2.99, + -2.7, + -2.5, + -2.43, + -2.55, + -2.18, + -2.28, + -2.19, + -2.5 + ], + "sem": [ + 0.12, + 0.1, + 0.1, + 0.11, + 0.11, + 0.15, + 0.21, + 0.22, + 0.26, + 0.38 + ] + } + } } - } }, probingColors: { - 'Qwen2.5-32B': '#4A90D9', - 'Qwen2.5-72B': '#1A5FB4', - 'Qwen3-4B': '#57E389', - 'Qwen3-30B-A3B': '#26A269', - 'Qwen3-Next-80B-A3B': '#9141AC' + "Qwen2.5-32B": "#4A90D9", + "Qwen2.5-72B": "#1A5FB4", + "Qwen3-4B": "#57E389", + "Qwen3-30B-A3B": "#26A269", + "Qwen3-Next-80B-A3B": "#9141AC" } -}; +}; \ No newline at end of file