diff --git "a/data.js" "b/data.js" --- "a/data.js" +++ "b/data.js" @@ -2,3817 +2,3818 @@ // Auto-generated by export_web_data.py - matches Python plotting scripts exactly const DDR_DATA = { - modelColors: { - "GPT-5.2": "#00C853", - "Claude-4.5-Sonnet": "#FF6D00", - "Gemini-3-Flash": "#2196F3", - "GLM-4.6": "#9C27B0", - "DeepSeek-V3": "#E91E63" - }, - scaling: { - "mimic": { - "GPT-5.2": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55 - ], - "tokens": [ - 3737, - 8577, - 15459, - 20193, - 24028, - 26951, - 28820, - 29751, - 30405, - 30968, - 31260 - ], - "costs": [ - 0.005, - 0.0207, - 0.0516, - 0.0947, - 0.1522, - 0.2153, - 0.2799, - 0.3597, - 0.4373, - 0.4906, - 0.635 - ], - "accuracy": [ - 10.85, - 15.25, - 18.35, - 20.41, - 23.26, - 24.42, - 25.32, - 25.97, - 26.36, - 26.87, - 27.26 - ] - }, - "Gemini-3-Flash": { - "turns": [ - 5, - 10, - 15, - 20, - 25 - ], - "tokens": [ - 5580, - 14305, - 23357, - 26964, - 27542 - ], - "costs": [ - 0.002, - 0.008, - 0.0173, - 0.0284, - 0.045 - ], - "accuracy": [ - 7.62, - 13.44, - 19.77, - 24.03, - 24.94 - ] - }, - "Claude-4.5-Sonnet": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60, - 65, - 70, - 75 - ], - "tokens": [ - 4513, - 9214, - 13378, - 17217, - 20275, - 22878, - 25379, - 27559, - 29532, - 31239, - 32395, - 33382, - 33796, - 33968, - 34140 - ], - "costs": [ - 0.0152, - 0.059, - 0.1249, - 0.2138, - 0.3214, - 0.4458, - 0.5823, - 0.7212, - 0.842, - 0.9656, - 1.0851, - 1.1605, - 1.3008, - 1.4081, - 1.3369 - ], - "accuracy": [ - 8.14, - 9.17, - 11.89, - 14.73, - 16.67, - 18.22, - 19.77, - 22.87, - 26.61, - 29.46, - 31.78, - 33.59, - 33.98, - 34.24, - 34.37 - ] - }, - "GLM-4.6": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60, - 65, - 70, - 75, - 80, - 85, - 90, - 95 - ], - "tokens": [ - 3488, - 7059, - 10542, - 13099, - 14972, - 16484, - 17524, - 18410, - 19112, - 19728, - 20259, - 20715, - 21135, - 21489, - 21858, - 22169, - 22422, - 22613, - 22802 - ], - "costs": [ - 0.0026, - 0.0097, - 0.0217, - 0.0369, - 0.0552, - 0.0743, - 0.0969, - 0.1204, - 0.1489, - 0.1769, - 0.2074, - 0.24, - 0.2763, - 0.3114, - 0.3522, - 0.3935, - 0.4408, - 0.4741, - 0.5461 - ], - "accuracy": [ - 9.43, - 11.11, - 13.57, - 16.02, - 17.57, - 18.86, - 19.77, - 20.16, - 20.8, - 21.19, - 21.45, - 22.09, - 22.48, - 22.61, - 22.87, - 23.13, - 23.13, - 23.13, - 23.26 - ] - }, - "deepseek-v3.2": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45 - ], - "tokens": [ - 3447, - 9155, - 13730, - 17912, - 21270, - 23962, - 26205, - 27253, - 27411 - ], - "costs": [ - 0.001, - 0.0048, - 0.011, - 0.0192, - 0.0282, - 0.0385, - 0.0499, - 0.0642, - 0.0694 - ], - "accuracy": [ - 9.3, - 12.53, - 14.73, - 17.05, - 20.16, - 23.9, - 25.97, - 26.87, - 27.0 - ] - } - }, - "10k": { - "Claude-4.5-Sonnet": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60, - 65, - 70, - 75, - 80 - ], - "tokens": [ - 2561, - 7208, - 10978, - 14050, - 17080, - 19955, - 22501, - 25124, - 27696, - 30109, - 32363, - 34247, - 35154, - 35907, - 36148, - 36277 - ], - "costs": [ - 0.0094, - 0.0414, - 0.0955, - 0.1682, - 0.2576, - 0.3598, - 0.4751, - 0.5932, - 0.7209, - 0.8684, - 1.0029, - 1.0913, - 1.2015, - 1.3713, - 1.4854, - 1.5611 - ], - "accuracy": [ - 0.82, - 1.06, - 1.41, - 3.18, - 5.65, - 9.78, - 15.19, - 22.85, - 31.8, - 46.64, - 60.42, - 69.02, - 73.26, - 75.62, - 76.68, - 77.27 - ] - }, - "GPT-5.2": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60 - ], - "tokens": [ - 2823, - 7902, - 11581, - 14651, - 16306, - 17356, - 17871, - 18251, - 18345, - 18398, - 18441, - 18468 - ], - "costs": [ - 0.0037, - 0.0199, - 0.0454, - 0.0774, - 0.1125, - 0.1524, - 0.1983, - 0.2657, - 0.3601, - 0.4706, - 0.5641, - 0.6699 - ], - "accuracy": [ - 0.82, - 8.36, - 22.85, - 32.98, - 37.57, - 40.52, - 43.23, - 44.29, - 44.41, - 44.52, - 44.76, - 44.99 - ] - }, - "GLM-4.6": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60, - 65, - 70, - 75, - 80, - 85, - 90 - ], - "tokens": [ - 1838, - 3927, - 6056, - 8124, - 10227, - 12399, - 14608, - 16757, - 18614, - 20290, - 21537, - 22418, - 23164, - 23553, - 23781, - 23950, - 24062, - 24102 - ], - "costs": [ - 0.0013, - 0.0053, - 0.012, - 0.0214, - 0.0334, - 0.0481, - 0.0654, - 0.0855, - 0.1047, - 0.1277, - 0.1498, - 0.1724, - 0.2004, - 0.223, - 0.2716, - 0.3281, - 0.3281, - 0.4018 - ], - "accuracy": [ - 0.24, - 0.59, - 2.0, - 4.48, - 8.72, - 13.19, - 19.08, - 26.27, - 35.34, - 41.22, - 47.7, - 52.3, - 54.77, - 56.54, - 57.83, - 59.25, - 60.19, - 60.42 - ] - }, - "deepseek-v3.2": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40, - 45, - 50, - 55, - 60 - ], - "tokens": [ - 1997, - 4260, - 6707, - 9238, - 11703, - 14313, - 16306, - 18027, - 19074, - 19698, - 19875, - 19988 - ], - "costs": [ - 0.0006, - 0.0024, - 0.0054, - 0.0097, - 0.0153, - 0.0214, - 0.028, - 0.0355, - 0.0437, - 0.0505, - 0.06, - 0.0694 - ], - "accuracy": [ - 1.18, - 1.88, - 2.83, - 6.12, - 14.13, - 27.09, - 37.34, - 48.53, - 55.48, - 58.54, - 59.72, - 60.42 - ] - }, - "Gemini-3-Flash": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40 - ], - "tokens": [ - 4473, - 12616, - 18404, - 20077, - 20729, - 20883, - 20935, - 21004 - ], - "costs": [ - 0.002, - 0.008, - 0.0171, - 0.0275, - 0.0385, - 0.0405, - 0.0538, - 0.0688 - ], - "accuracy": [ - 1.88, - 18.61, - 38.4, - 42.17, - 43.7, - 44.05, - 44.05, - 44.41 - ] - } - }, - "globem": { - "deepseek-v3.2": { - "turns": [ - 5, - 10, - 15, - 20, - 25 - ], - "tokens": [ - 3972, - 10497, - 20470, - 32293, - 36396 - ], - "costs": [ - 0.001, - 0.005, - 0.0135, - 0.0262, - 0.0382 - ], - "accuracy": [ - 0.92, - 0.92, - 4.6, - 27.59, - 36.78 - ] - }, - "GLM-4.6": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35 - ], - "tokens": [ - 3851, - 8817, - 16221, - 26186, - 32076, - 34058, - 34425 - ], - "costs": [ - 0.0027, - 0.0115, - 0.028, - 0.0544, - 0.0891, - 0.1409, - 0.1581 - ], - "accuracy": [ - 2.3, - 2.3, - 5.52, - 22.99, - 36.32, - 40.0, - 41.61 - ] - }, - "Gemini-3-Flash": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30, - 35, - 40 - ], - "tokens": [ - 6260, - 14825, - 25972, - 35526, - 40312, - 41787, - 42167, - 42236 - ], - "costs": [ - 0.0021, - 0.0074, - 0.0166, - 0.0292, - 0.047, - 0.0828, - 0.1479, - 0.2258 - ], - "accuracy": [ - 1.88, - 2.12, - 5.88, - 21.41, - 30.35, - 34.35, - 35.06, - 35.29 - ] - }, - "Claude-4.5-Sonnet": { - "turns": [ - 5, - 10, - 15, - 20, - 25, - 30 - ], - "tokens": [ - 4579, - 11405, - 21188, - 32526, - 44888, - 49137 - ], - "costs": [ - 0.0152, - 0.0638, - 0.1611, - 0.3124, - 0.4877, - 0.6335 - ], - "accuracy": [ - 2.53, - 2.53, - 3.45, - 12.64, - 30.57, - 40.23 - ] - }, - "GPT-5.2": { - "turns": [ - 5, - 10, - 15, - 20 - ], - "tokens": [ - 3477, - 10218, - 17672, - 19878 - ], - "costs": [ - 0.0048, - 0.0236, - 0.0652, - 0.1238 - ], - "accuracy": [ - 0.92, - 5.98, - 34.02, - 38.39 - ] - } - } - }, - ranking: { - "MIMIC": [ - { - "model": "gpt5-mini", - "bt_rank": 1, - "win_rate": 100.0, - "accuracy": 27.59, - "acc_rank": 7, - "is_proprietary": true - }, - { - "model": "claude4.5-sonnet", - "bt_rank": 2, - "win_rate": 94.6, - "accuracy": 33.66, - "acc_rank": 1, - "is_proprietary": true - }, - { - "model": "gpt5mini", - "bt_rank": 3, - "win_rate": 87.8, - "accuracy": 27.59, - "acc_rank": 8, - "is_proprietary": true - }, - { - "model": "gpt5.2", - "bt_rank": 4, - "win_rate": 83.6, - "accuracy": 28.88, - "acc_rank": 5, - "is_proprietary": true - }, - { - "model": "gpt5.1", - "bt_rank": 5, - "win_rate": 80.6, - "accuracy": 30.1, - "acc_rank": 3, - "is_proprietary": true - }, - { - "model": "gemini3-flash", - "bt_rank": 6, - "win_rate": 76.5, - "accuracy": 29.28, - "acc_rank": 4, - "is_proprietary": true - }, - { - "model": "kimi-k2", - "bt_rank": 7, - "win_rate": 73.1, - "accuracy": 30.17, - "acc_rank": 2, - "is_proprietary": false - }, - { - "model": "run_api_deepseek_deepseek-chat", - "bt_rank": 8, - "win_rate": 70.5, - "accuracy": 27.65, - "acc_rank": 6, - "is_proprietary": false - }, - { - "model": "gemini2.5-pro", - "bt_rank": 9, - "win_rate": 63.9, - "accuracy": 19.0, - "acc_rank": 14, - "is_proprietary": true - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "bt_rank": 10, - "win_rate": 59.5, - "accuracy": 18.8, - "acc_rank": 15, - "is_proprietary": false - }, - { - "model": "minimax-m2", - "bt_rank": 11, - "win_rate": 59.7, - "accuracy": 23.52, - "acc_rank": 10, - "is_proprietary": false - }, - { - "model": "glm4.6", - "bt_rank": 12, - "win_rate": 52.1, - "accuracy": 23.84, - "acc_rank": 9, - "is_proprietary": false - }, - { - "model": "qwen3", - "bt_rank": 13, - "win_rate": 51.7, - "accuracy": 19.13, - "acc_rank": 13, - "is_proprietary": false - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "bt_rank": 14, - "win_rate": 40.3, - "accuracy": 20, - "acc_rank": 11, - "is_proprietary": false - }, - { - "model": "gemini2.5-flash-lite", - "bt_rank": 15, - "win_rate": 35.4, - "accuracy": 16.64, - "acc_rank": 18, - "is_proprietary": true - }, - { - "model": "qwen2.5-14B-Instruct", - "bt_rank": 16, - "win_rate": 32.4, - "accuracy": 14.15, - "acc_rank": 20, - "is_proprietary": false - }, - { - "model": "qwen2.5-32b-instruct", - "bt_rank": 17, - "win_rate": 32.3, - "accuracy": 13.12, - "acc_rank": 21, - "is_proprietary": false - }, - { - "model": "gemini2.5-flash", - "bt_rank": 18, - "win_rate": 31.2, - "accuracy": 18.61, - "acc_rank": 16, - "is_proprietary": true - }, - { - "model": "qwen2.5-72B-Instruct", - "bt_rank": 19, - "win_rate": 29.5, - "accuracy": 14.92, - "acc_rank": 19, - "is_proprietary": false - }, - { - "model": "qwen3-4B-Instruct-2507", - "bt_rank": 20, - "win_rate": 27.3, - "accuracy": 16.93, - "acc_rank": 17, - "is_proprietary": false - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "bt_rank": 21, - "win_rate": 17.3, - "accuracy": 20, - "acc_rank": 12, - "is_proprietary": false - }, - { - "model": "llama3.3-70B", - "bt_rank": 22, - "win_rate": 14.2, - "accuracy": 7.3, - "acc_rank": 22, - "is_proprietary": false - } - ], - "10K": [ - { - "model": "claude4.5-sonnet", - "bt_rank": 1, - "win_rate": 92.8, - "accuracy": 69.26, - "acc_rank": 1, - "is_proprietary": true - }, - { - "model": "run_api_deepseek_deepseek-chat", - "bt_rank": 2, - "win_rate": 80.6, - "accuracy": 49.41, - "acc_rank": 2, - "is_proprietary": false - }, - { - "model": "gpt5mini", - "bt_rank": 3, - "win_rate": 80.4, - "accuracy": 41.56, - "acc_rank": 5, - "is_proprietary": true - }, - { - "model": "gpt5.2", - "bt_rank": 4, - "win_rate": 78.0, - "accuracy": 43.11, - "acc_rank": 4, - "is_proprietary": true - }, - { - "model": "kimi-k2", - "bt_rank": 5, - "win_rate": 77.0, - "accuracy": 41.17, - "acc_rank": 7, - "is_proprietary": false - }, - { - "model": "glm4.6", - "bt_rank": 6, - "win_rate": 71.4, - "accuracy": 48.29, - "acc_rank": 3, - "is_proprietary": false - }, - { - "model": "gemini3-flash", - "bt_rank": 7, - "win_rate": 63.6, - "accuracy": 39.5, - "acc_rank": 8, - "is_proprietary": true - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "bt_rank": 8, - "win_rate": 59.2, - "accuracy": 38.34, - "acc_rank": 9, - "is_proprietary": false - }, - { - "model": "minimax-m2", - "bt_rank": 9, - "win_rate": 54.4, - "accuracy": 35.74, - "acc_rank": 10, - "is_proprietary": false - }, - { - "model": "gpt5.1", - "bt_rank": 10, - "win_rate": 54.0, - "accuracy": 41.23, - "acc_rank": 6, - "is_proprietary": true - }, - { - "model": "qwen3", - "bt_rank": 11, - "win_rate": 51.0, - "accuracy": 28.23, - "acc_rank": 12, - "is_proprietary": false - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "bt_rank": 12, - "win_rate": 45.6, - "accuracy": 20, - "acc_rank": 15, - "is_proprietary": false - }, - { - "model": "gemini2.5-pro", - "bt_rank": 13, - "win_rate": 44.8, - "accuracy": 20.91, - "acc_rank": 13, - "is_proprietary": true - }, - { - "model": "qwen2.5-32b-instruct", - "bt_rank": 14, - "win_rate": 41.2, - "accuracy": 17.83, - "acc_rank": 17, - "is_proprietary": false - }, - { - "model": "qwen2.5-72B-Instruct", - "bt_rank": 15, - "win_rate": 34.6, - "accuracy": 20.79, - "acc_rank": 14, - "is_proprietary": false - }, - { - "model": "qwen2.5-14B-Instruct", - "bt_rank": 16, - "win_rate": 31.6, - "accuracy": 14.65, - "acc_rank": 18, - "is_proprietary": false - }, - { - "model": "qwen3-4B-Instruct-2507", - "bt_rank": 17, - "win_rate": 30.0, - "accuracy": 30.43, - "acc_rank": 11, - "is_proprietary": false - }, - { - "model": "gemini2.5-flash-lite", - "bt_rank": 18, - "win_rate": 29.6, - "accuracy": 14.37, - "acc_rank": 19, - "is_proprietary": true - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "bt_rank": 19, - "win_rate": 27.4, - "accuracy": 20, - "acc_rank": 16, - "is_proprietary": false - }, - { - "model": "gemini2.5-flash", - "bt_rank": 20, - "win_rate": 25.2, - "accuracy": 12.61, - "acc_rank": 20, - "is_proprietary": true - }, - { - "model": "qwen2.5-7B-Instruct", - "bt_rank": 21, - "win_rate": 22.0, - "accuracy": 7.53, - "acc_rank": 21, - "is_proprietary": false - }, - { - "model": "llama3.3-70B", - "bt_rank": 22, - "win_rate": 18.6, - "accuracy": 6.51, - "acc_rank": 22, - "is_proprietary": false - } - ], - "GLOBEM": [ - { - "model": "claude4.5-sonnet", - "bt_rank": 1, - "win_rate": 93.0, - "accuracy": 39.54, - "acc_rank": 2, - "is_proprietary": true - }, - { - "model": "gpt5-mini", - "bt_rank": 2, - "win_rate": 60.0, - "accuracy": 33.91, - "acc_rank": 12, - "is_proprietary": true - }, - { - "model": "gemini3-flash", - "bt_rank": 3, - "win_rate": 81.2, - "accuracy": 35.46, - "acc_rank": 9, - "is_proprietary": true - }, - { - "model": "minimax-m2", - "bt_rank": 4, - "win_rate": 77.8, - "accuracy": 36.9, - "acc_rank": 6, - "is_proprietary": false - }, - { - "model": "gpt5mini", - "bt_rank": 5, - "win_rate": 73.8, - "accuracy": 33.91, - "acc_rank": 13, - "is_proprietary": true - }, - { - "model": "gpt5.1", - "bt_rank": 6, - "win_rate": 67.5, - "accuracy": 36.76, - "acc_rank": 7, - "is_proprietary": true - }, - { - "model": "gpt5.2", - "bt_rank": 7, - "win_rate": 64.4, - "accuracy": 38.39, - "acc_rank": 3, - "is_proprietary": true - }, - { - "model": "qwen3", - "bt_rank": 8, - "win_rate": 64.7, - "accuracy": 36.32, - "acc_rank": 8, - "is_proprietary": false - }, - { - "model": "run_api_deepseek_deepseek-chat", - "bt_rank": 9, - "win_rate": 64.5, - "accuracy": 38.39, - "acc_rank": 4, - "is_proprietary": false - }, - { - "model": "glm4.6", - "bt_rank": 10, - "win_rate": 53.6, - "accuracy": 39.77, - "acc_rank": 1, - "is_proprietary": false - }, - { - "model": "kimi-k2", - "bt_rank": 11, - "win_rate": 52.2, - "accuracy": 37.01, - "acc_rank": 5, - "is_proprietary": false - }, - { - "model": "gemini2.5-pro", - "bt_rank": 12, - "win_rate": 45.6, - "accuracy": 34.6, - "acc_rank": 10, - "is_proprietary": true - }, - { - "model": "qwen2.5-72B-Instruct", - "bt_rank": 13, - "win_rate": 43.3, - "accuracy": 27.13, - "acc_rank": 14, - "is_proprietary": false - }, - { - "model": "qwen2.5-32B-Instruct", - "bt_rank": 14, - "win_rate": 42.1, - "accuracy": 20, - "acc_rank": 20, - "is_proprietary": false - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "bt_rank": 15, - "win_rate": 41.5, - "accuracy": 34.14, - "acc_rank": 11, - "is_proprietary": false - }, - { - "model": "qwen2.5-14B-Instruct", - "bt_rank": 16, - "win_rate": 40.8, - "accuracy": 26.13, - "acc_rank": 16, - "is_proprietary": false - }, - { - "model": "gemini2.5-flash-lite", - "bt_rank": 17, - "win_rate": 37.4, - "accuracy": 25.52, - "acc_rank": 18, - "is_proprietary": true - }, - { - "model": "qwen3-4B-Instruct-2507", - "bt_rank": 18, - "win_rate": 36.6, - "accuracy": 26.9, - "acc_rank": 15, - "is_proprietary": false - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "bt_rank": 19, - "win_rate": 32.0, - "accuracy": 20, - "acc_rank": 21, - "is_proprietary": false - }, - { - "model": "llama3.3-70B", - "bt_rank": 20, - "win_rate": 28.1, - "accuracy": 22.65, - "acc_rank": 19, - "is_proprietary": false - }, - { - "model": "qwen2.5-7B-Instruct", - "bt_rank": 21, - "win_rate": 22.2, - "accuracy": 25.64, - "acc_rank": 17, - "is_proprietary": false - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "bt_rank": 22, - "win_rate": 19.7, - "accuracy": 20, - "acc_rank": 22, - "is_proprietary": false - } - ] - }, - turn: { - "mimic": [ - { - "model": "claude4.5-sonnet", - "median": 52, - "distribution": [ - 0.0, - 0.0, - 1.0, - 5.0, - 31.0, - 43.0, - 13.0, - 7.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3", - "median": 43, - "distribution": [ - 0.0, - 1.0, - 12.0, - 29.0, - 13.0, - 9.0, - 3.0, - 2.0, - 0.0, - 31.0 - ] - }, - { - "model": "gpt5-mini", - "median": 39, - "distribution": [ - 0.0, - 0.0, - 9.0, - 42.0, - 36.0, - 12.0, - 1.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "glm4.6", - "median": 39, - "distribution": [ - 0.0, - 6.3, - 23.4, - 20.7, - 7.2, - 13.5, - 3.6, - 6.3, - 4.5, - 14.4 - ] - }, - { - "model": "run_api_deepseek_deepseek-chat", - "median": 33, - "distribution": [ - 0.0, - 2.0, - 22.0, - 60.0, - 16.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5.2", - "median": 30, - "distribution": [ - 0.0, - 10.0, - 36.0, - 32.0, - 12.0, - 10.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5.1", - "median": 23, - "distribution": [ - 1.5, - 39.7, - 29.4, - 19.9, - 9.6, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "kimi-k2", - "median": 19, - "distribution": [ - 0.0, - 55.0, - 44.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "minimax-m2", - "median": 18, - "distribution": [ - 0.0, - 70.0, - 30.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-note", - "median": 17, - "distribution": [ - 12.0, - 52.0, - 24.0, - 10.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-pro", - "median": 15, - "distribution": [ - 10.6, - 70.2, - 19.2, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini3-flash", - "median": 15, - "distribution": [ - 7.0, - 71.0, - 22.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-4B-Instruct-2507", - "median": 14, - "distribution": [ - 0.0, - 98.0, - 1.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-noreasoning", - "median": 14, - "distribution": [ - 7.0, - 68.0, - 22.0, - 2.0, - 0.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "median": 12, - "distribution": [ - 23.0, - 62.0, - 12.0, - 0.0, - 1.0, - 0.0, - 1.0, - 0.0, - 0.0, - 1.0 - ] - }, - { - "model": "qwen2.5-72B-Instruct", - "median": 11, - "distribution": [ - 15.0, - 85.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-longreasoning", - "median": 11, - "distribution": [ - 24.0, - 74.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-32b-instruct", - "median": 11, - "distribution": [ - 33.0, - 67.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash-lite", - "median": 11, - "distribution": [ - 29.0, - 71.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash", - "median": 10, - "distribution": [ - 34.0, - 65.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "median": 10, - "distribution": [ - 34.0, - 65.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-shortreasoning", - "median": 9, - "distribution": [ - 64.0, - 36.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct", - "median": 8, - "distribution": [ - 73.0, - 27.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct", - "median": 7, - "distribution": [ - 90.0, - 10.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "llama-3.3-70B", - "median": 6, - "distribution": [ - 99.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "llama3.3-70B", - "median": 6, - "distribution": [ - 99.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "median": 4, - "distribution": [ - 91.0, - 9.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - } - ], - "10k": [ - { - "model": "claude4.5-sonnet", - "median": 56, - "distribution": [ - 0.0, - 0.0, - 1.0, - 6.0, - 13.0, - 44.0, - 27.0, - 6.0, - 3.0, - 0.0 - ] - }, - { - "model": "glm4.6", - "median": 52, - "distribution": [ - 0.0, - 0.0, - 3.8, - 10.4, - 27.4, - 27.4, - 18.9, - 5.7, - 4.7, - 1.9 - ] - }, - { - "model": "run_api_deepseek_deepseek-chat", - "median": 39, - "distribution": [ - 0.0, - 0.0, - 11.0, - 40.0, - 37.0, - 9.0, - 3.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5mini", - "median": 35, - "distribution": [ - 0.0, - 4.0, - 27.8, - 36.5, - 24.6, - 6.3, - 0.8, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3", - "median": 26, - "distribution": [ - 0.8, - 25.4, - 30.2, - 7.9, - 2.4, - 0.0, - 0.0, - 0.8, - 0.0, - 32.5 - ] - }, - { - "model": "kimi-k2", - "median": 24, - "distribution": [ - 0.0, - 29.0, - 48.0, - 21.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "minimax-m2", - "median": 20, - "distribution": [ - 0.0, - 43.0, - 48.0, - 9.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "median": 20, - "distribution": [ - 0.0, - 46.0, - 27.0, - 11.0, - 2.0, - 6.0, - 3.0, - 1.0, - 1.0, - 3.0 - ] - }, - { - "model": "gpt5.2", - "median": 20, - "distribution": [ - 0.0, - 43.0, - 41.0, - 12.0, - 3.0, - 0.0, - 1.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5.1", - "median": 17, - "distribution": [ - 1.0, - 69.0, - 29.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-note", - "median": 16, - "distribution": [ - 17.0, - 44.0, - 27.0, - 10.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-pro", - "median": 15, - "distribution": [ - 7.0, - 73.0, - 18.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash-lite", - "median": 14, - "distribution": [ - 14.0, - 78.0, - 8.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini3-flash", - "median": 13, - "distribution": [ - 10.0, - 82.0, - 7.0, - 0.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash", - "median": 12, - "distribution": [ - 21.0, - 69.0, - 8.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-4B-Instruct-2507", - "median": 12, - "distribution": [ - 4.0, - 91.0, - 5.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "median": 11, - "distribution": [ - 31.0, - 64.3, - 4.0, - 0.8, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-longreasoning", - "median": 11, - "distribution": [ - 28.0, - 67.0, - 5.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-32b-instruct", - "median": 10, - "distribution": [ - 34.1, - 65.9, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-noreasoning", - "median": 9, - "distribution": [ - 58.0, - 41.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct", - "median": 9, - "distribution": [ - 58.0, - 42.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-shortreasoning", - "median": 8, - "distribution": [ - 81.0, - 19.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "median": 8, - "distribution": [ - 70.0, - 29.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-72B-Instruct", - "median": 7, - "distribution": [ - 75.0, - 25.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct", - "median": 7, - "distribution": [ - 84.0, - 16.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "llama3.3-70B", - "median": 1, - "distribution": [ - 92.0, - 7.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - } - ], - "globem": [ - { - "model": "claude4.5-sonnet", - "median": 25, - "distribution": [ - 0.0, - 6.0, - 87.0, - 7.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini3-flash", - "median": 21, - "distribution": [ - 2.0, - 36.0, - 58.0, - 3.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "glm4.6", - "median": 21, - "distribution": [ - 0.0, - 23.0, - 66.0, - 11.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "run_api_deepseek_deepseek-chat", - "median": 20, - "distribution": [ - 0.0, - 32.0, - 68.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-note", - "median": 19, - "distribution": [ - 16.0, - 36.0, - 33.0, - 9.0, - 5.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3", - "median": 19, - "distribution": [ - 0.0, - 50.0, - 38.0, - 9.0, - 0.0, - 0.0, - 1.0, - 0.0, - 0.0, - 2.0 - ] - }, - { - "model": "minimax-m2", - "median": 17, - "distribution": [ - 0.0, - 80.0, - 20.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5-mini", - "median": 17, - "distribution": [ - 2.0, - 78.0, - 20.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "kimi-k2", - "median": 17, - "distribution": [ - 0.0, - 82.0, - 18.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-32B-Instruct", - "median": 15, - "distribution": [ - 1.0, - 84.0, - 14.0, - 0.0, - 0.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5.2", - "median": 15, - "distribution": [ - 0.0, - 92.0, - 8.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-72B-Instruct", - "median": 14, - "distribution": [ - 4.0, - 78.0, - 17.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash-lite", - "median": 14, - "distribution": [ - 7.0, - 80.0, - 12.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct-1M", - "median": 14, - "distribution": [ - 13.0, - 66.0, - 16.0, - 5.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-14B-Instruct", - "median": 13, - "distribution": [ - 16.0, - 82.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct", - "median": 12, - "distribution": [ - 0.0, - 99.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-pro", - "median": 12, - "distribution": [ - 3.0, - 94.0, - 3.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct-1M", - "median": 12, - "distribution": [ - 18.0, - 73.0, - 7.0, - 0.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gemini2.5-flash", - "median": 12, - "distribution": [ - 15.0, - 85.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-4B-Instruct-2507", - "median": 12, - "distribution": [ - 12.0, - 83.0, - 5.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "gpt5.1", - "median": 11, - "distribution": [ - 30.0, - 70.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-noreasoning", - "median": 9, - "distribution": [ - 57.0, - 42.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-longreasoning", - "median": 9, - "distribution": [ - 69.0, - 30.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen3-next-80b-a3b-instruct-shortreasoning", - "median": 9, - "distribution": [ - 66.0, - 34.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "qwen2.5-7B-Instruct", - "median": 9, - "distribution": [ - 53.0, - 45.0, - 1.0, - 0.0, - 0.0, - 1.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - }, - { - "model": "llama3.3-70B", - "median": 6, - "distribution": [ - 98.0, - 2.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] - } - ] - }, - probing: { - "byTurn": { - "mimic": { - "Qwen2.5-32B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -4.3, - -4.21, - -4.04, - -3.87, - -3.59, - -3.62, - -3.33, - -3.4, - -2.93, - -3.21 - ], - "sem": [ - 0.25, - 0.27, - 0.32, - 0.35, - 0.35, - 0.36, - 0.34, - 0.35, - 0.32, - 0.4 - ] - }, - "Qwen2.5-72B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -3.51, - -3.98, - -3.68, - -3.8, - -3.26, - -3.22, - -3.12, - -3.24, - -3.08, - -2.84 - ], - "sem": [ - 0.15, - 0.21, - 0.21, - 0.23, - 0.23, - 0.21, - 0.25, - 0.25, - 0.28, - 0.08 - ] - }, - "Qwen3-4B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -3.48, - -3.25, - -3.3, - -2.74, - -2.75, - -2.73, - -2.72, - -2.67, - -2.62, - -2.25 - ], - "sem": [ - 0.04, - 0.05, - 0.04, - 0.07, - 0.06, - 0.07, - 0.07, - 0.07, - 0.06, - 0.06 - ] - }, - "Qwen3-30B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -4.94, - -5.21, - -5.51, - -5.05, - -4.96, - -4.95, - -4.75, - -4.73, - -4.6, - -4.72 - ], - "sem": [ - 0.15, - 0.18, - 0.2, - 0.18, - 0.19, - 0.19, - 0.17, - 0.18, - 0.16, - 0.18 - ] - }, - "Qwen3-Next-80B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -2.85, - -2.86, - -2.74, - -2.65, - -2.31, - -2.14, - -1.98, - -2.03, - -1.88, - -1.82 - ], - "sem": [ - 0.1, - 0.1, - 0.11, - 0.11, - 0.11, - 0.13, - 0.13, - 0.18, - 0.17, - 0.09 - ] - } - }, - "globem": { - "Qwen2.5-32B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -5.48, - -5.83, - -5.84, - -5.91, - -6.01, - -6.03, - -5.86, - -5.73, - -5.78, - -5.73 - ], - "sem": [ - 0.24, - 0.28, - 0.31, - 0.33, - 0.33, - 0.35, - 0.33, - 0.35, - 0.35, - 0.36 - ] - }, - "Qwen2.5-72B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -4.68, - -5.56, - -5.65, - -5.59, - -5.59, - -5.49, - -5.54, - -5.4, - -5.57, - -5.53 - ], - "sem": [ - 0.13, - 0.18, - 0.23, - 0.23, - 0.25, - 0.25, - 0.29, - 0.32, - 0.38, - 0.46 - ] - }, - "Qwen3-4B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -4.67, - -4.16, - -3.9, - -3.76, - -3.6, - -3.47, - -3.05, - -2.99, - -2.93, - -2.78 - ], - "sem": [ - 0.08, - 0.07, - 0.06, - 0.06, - 0.07, - 0.08, - 0.07, - 0.08, - 0.08, - 0.09 - ] - }, - "Qwen3-30B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -5.28, - -5.23, - -5.2, - -5.19, - -5.2, - -5.01, - -5.21, - -4.95, - -4.93, - -4.81 - ], - "sem": [ - 0.09, - 0.09, - 0.09, - 0.08, - 0.08, - 0.08, - 0.09, - 0.09, - 0.1, - 0.1 - ] - }, - "Qwen3-Next-80B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -3.1, - -3.15, - -3.06, - -3.01, - -2.95, - -2.88, - -2.78, - -2.4, - -2.46, - -1.89 - ], - "sem": [ - 0.06, - 0.06, - 0.06, - 0.06, - 0.06, - 0.06, - 0.07, - 0.06, - 0.14, - 0.1 - ] - } - }, - "10k": { - "Qwen2.5-32B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -6.59, - -7.15, - -6.99, - -6.95, - -6.82, - -6.88, - -6.71, - -6.58, - -6.67, - -6.45 - ], - "sem": [ - 0.26, - 0.28, - 0.29, - 0.3, - 0.29, - 0.29, - 0.29, - 0.32, - 0.36, - 0.41 - ] - }, - "Qwen2.5-72B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -5.51, - -7.02, - -6.45, - -6.11, - -5.98, - -6.52, - -7.02, - -7.88, - -8.05, - -7.66 - ], - "sem": [ - 0.26, - 0.34, - 0.34, - 0.36, - 0.4, - 0.53, - 0.62, - 0.71, - 0.81, - 0.92 - ] - }, - "Qwen3-4B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -4.68, - -4.3, - -3.57, - -3.33, - -3.27, - -3.22, - -3.06, - -2.9, - -2.75, - -2.57 - ], - "sem": [ - 0.18, - 0.17, - 0.15, - 0.14, - 0.14, - 0.14, - 0.14, - 0.14, - 0.14, - 0.14 - ] - }, - "Qwen3-30B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -3.23, - -3.31, - -3.28, - -3.16, - -3.06, - -2.97, - -2.94, - -2.87, - -2.83, - -2.73 - ], - "sem": [ - 0.17, - 0.17, - 0.17, - 0.17, - 0.17, - 0.16, - 0.17, - 0.18, - 0.18, - 0.17 - ] - }, - "Qwen3-Next-80B-A3B": { - "turns": [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 - ], - "logprob": [ - -3.25, - -3.42, - -3.21, - -2.94, - -2.81, - -2.75, - -2.7, - -2.65, - -2.55, - -2.45 - ], - "sem": [ - 0.16, - 0.17, - 0.17, - 0.17, - 0.16, - 0.17, - 0.16, - 0.16, - 0.16, - 0.16 - ] - } - } - }, - "byProgress": { - "mimic": { - "Qwen2.5-32B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -4.3, - -4.12, - -3.73, - -3.62, - -3.36, - -3.05, - -2.94, - -3.12, - -4.6, - -4.42 - ], - "sem": [ - 0.25, - 0.21, - 0.25, - 0.36, - 0.24, - 0.25, - 0.38, - 0.45, - 1.5, - 0.1 - ] - }, - "Qwen2.5-72B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -3.51, - -3.98, - -3.74, - -3.26, - -3.17, - -3.24, - -2.99, - -2.53, - -2.58, - -2.42 - ], - "sem": [ - 0.15, - 0.21, - 0.16, - 0.23, - 0.17, - 0.25, - 0.18, - 0.09, - 0.09, - 0.2 - ] - }, - "Qwen3-4B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -3.37, - -2.93, - -2.71, - -2.33, - -1.99, - -2.04, - -1.57, - -1.46, - -1.48, - -1.44 - ], - "sem": [ - 0.03, - 0.04, - 0.04, - 0.04, - 0.05, - 0.08, - 0.1, - 0.05, - 0.0, - 0.01 - ] - }, - "Qwen3-30B-A3B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -5.13, - -4.72, - -4.42, - -4.17, - -4.04, - -3.9, - -3.64, - -3.45, - -3.36, - -3.17 - ], - "sem": [ - 0.08, - 0.07, - 0.07, - 0.07, - 0.07, - 0.08, - 0.1, - 0.14, - 0.15, - 0.26 - ] - }, - "Qwen3-Next-80B-A3B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -2.85, - -2.8, - -2.65, - -2.22, - -1.98, - -1.96, - -1.79, - -1.74, - -1.83, - -1.85 - ], - "sem": [ - 0.1, - 0.07, - 0.11, - 0.09, - 0.13, - 0.12, - 0.08, - 0.16, - 0.15, - 0.39 - ] - } - }, - "globem": { - "Qwen2.5-32B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -5.66, - -5.92, - -5.88, - -5.79, - -5.79, - -5.55, - -5.47, - -4.8, - -3.55, - -3.24 - ], - "sem": [ - 0.18, - 0.19, - 0.2, - 0.21, - 0.29, - 0.29, - 0.47, - 0.63, - 0.19, - 0.47 - ] - }, - "Qwen2.5-72B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -4.68, - -5.56, - -5.62, - -5.59, - -5.51, - -5.4, - -5.56, - -5.03, - -5.77, - -7.71 - ], - "sem": [ - 0.13, - 0.18, - 0.16, - 0.25, - 0.19, - 0.32, - 0.29, - 0.55, - 0.83, - 0.1 - ] - }, - "Qwen3-4B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -4.42, - -3.83, - -3.38, - -2.96, - -2.71, - -2.6, - -2.46, - -2.53, - -2.63, - -2.61 - ], - "sem": [ - 0.06, - 0.04, - 0.04, - 0.05, - 0.07, - 0.08, - 0.12, - 0.14, - 0.25, - 0.04 - ] - }, - "Qwen3-30B-A3B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -5.26, - -5.2, - -5.06, - -4.82, - -4.5, - -4.51, - -4.37, - -4.1, - -4.03, - -3.74 - ], - "sem": [ - 0.06, - 0.05, - 0.05, - 0.06, - 0.07, - 0.08, - 0.1, - 0.29, - 0.25, - 0.11 - ] - }, - "Qwen3-Next-80B-A3B": { - "progress": [ - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -3.1, - -3.15, - -3.06, - -3.01, - -2.95, - -2.88, - -2.78, - -2.4, - -2.46 - ], - "sem": [ - 0.06, - 0.06, - 0.06, - 0.06, - 0.06, - 0.06, - 0.07, - 0.06, - 0.14 - ] - } - }, - "10k": { - "Qwen2.5-32B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -6.59, - -7.07, - -6.89, - -6.8, - -6.58, - -6.58, - -6.76, - -8.0, - -8.59, - -8.83 - ], - "sem": [ - 0.26, - 0.2, - 0.21, - 0.2, - 0.32, - 0.27, - 0.39, - 0.57, - 0.84, - 1.12 - ] - }, - "Qwen2.5-72B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -5.51, - -7.02, - -6.28, - -5.98, - -6.52, - -7.33, - -8.05, - -7.85, - -8.41, - -7.15 - ], - "sem": [ - 0.26, - 0.34, - 0.25, - 0.4, - 0.53, - 0.47, - 0.81, - 0.79, - 1.45, - 1.26 - ] - }, - "Qwen3-4B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -4.49, - -3.45, - -3.19, - -2.83, - -2.5, - -2.27, - -2.31, - -2.31, - -2.35, - -1.73 - ], - "sem": [ - 0.12, - 0.1, - 0.08, - 0.1, - 0.1, - 0.11, - 0.2, - 0.29, - 0.36, - 0.03 - ] - }, - "Qwen3-30B-A3B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -3.14, - -2.66, - -2.29, - -2.26, - -1.97, - -1.88, - -1.52, - -1.36, - -1.61, - -1.61 - ], - "sem": [ - 0.06, - 0.06, - 0.07, - 0.1, - 0.14, - 0.18, - 0.08, - 0.02, - 0.05, - 0.08 - ] - }, - "Qwen3-Next-80B-A3B": { - "progress": [ - 0, - 10, - 20, - 30, - 40, - 50, - 60, - 70, - 80, - 90 - ], - "logprob": [ - -3.34, - -2.99, - -2.7, - -2.5, - -2.43, - -2.55, - -2.18, - -2.28, - -2.19, - -2.5 - ], - "sem": [ - 0.12, - 0.1, - 0.1, - 0.11, - 0.11, - 0.15, - 0.21, - 0.22, - 0.26, - 0.38 - ] - } - } - } - }, - probingColors: { - "Qwen2.5-32B": "#4A90D9", - "Qwen2.5-72B": "#1A5FB4", - "Qwen3-4B": "#57E389", - "Qwen3-30B-A3B": "#26A269", - "Qwen3-Next-80B-A3B": "#9141AC" - }, - error: [ - { - "main_category": "Fail in Exploration", - "subcategory": "Insufficient Breadth", - "count": 64, - "percentage": 31.1, - "color": "#1565C0" - }, - { - "main_category": "Fail in Exploration", - "subcategory": "Insufficient Depth", - "count": 56, - "percentage": 27.2, - "color": "#42A5F5" - }, - { - "main_category": "Poor Data-to-Insight", - "subcategory": "Insight Misinterpretation", - "count": 19, - "percentage": 9.2, - "color": "#2E7D32" - }, - { - "main_category": "Poor Data-to-Insight", - "subcategory": "Superficial Analysis", - "count": 16, - "percentage": 7.8, - "color": "#43A047" - }, - { - "main_category": "Poor Data-to-Insight", - "subcategory": "Over Reasoning", - "count": 15, - "percentage": 7.3, - "color": "#81C784" - }, - { - "main_category": "Lost in Context", - "subcategory": "Lost in Debugging", - "count": 18, - "percentage": 8.7, - "color": "#C62828" - }, - { - "main_category": "Lost in Context", - "subcategory": "Fail in Summarization", - "count": 10, - "percentage": 4.9, - "color": "#E53935" - }, - { - "main_category": "Lost in Context", - "subcategory": "Poor Instruction Following", - "count": 8, - "percentage": 3.9, - "color": "#EF9A9A" - } - ] + modelColors: { + "GPT-5.2": "#00C853", + "Claude-4.5-Sonnet": "#FF6D00", + "Gemini-3-Flash": "#2196F3", + "GLM-4.6": "#9C27B0", + "DeepSeek-V3": "#E91E63", + "deepseek-v3.2": "#E91E63" + }, + scaling: { + "mimic": { + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55 + ], + "tokens": [ + 3737, + 8577, + 15459, + 20193, + 24028, + 26951, + 28820, + 29751, + 30405, + 30968, + 31260 + ], + "costs": [ + 0.005, + 0.0207, + 0.0516, + 0.0947, + 0.1522, + 0.2153, + 0.2799, + 0.3597, + 0.4373, + 0.4906, + 0.635 + ], + "accuracy": [ + 10.85, + 15.25, + 18.35, + 20.41, + 23.26, + 24.42, + 25.32, + 25.97, + 26.36, + 26.87, + 27.26 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25 + ], + "tokens": [ + 5580, + 14305, + 23357, + 26964, + 27542 + ], + "costs": [ + 0.002, + 0.008, + 0.0173, + 0.0284, + 0.045 + ], + "accuracy": [ + 7.62, + 13.44, + 19.77, + 24.03, + 24.94 + ] + }, + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75 + ], + "tokens": [ + 4513, + 9214, + 13378, + 17217, + 20275, + 22878, + 25379, + 27559, + 29532, + 31239, + 32395, + 33382, + 33796, + 33968, + 34140 + ], + "costs": [ + 0.0152, + 0.059, + 0.1249, + 0.2138, + 0.3214, + 0.4458, + 0.5823, + 0.7212, + 0.842, + 0.9656, + 1.0851, + 1.1605, + 1.3008, + 1.4081, + 1.3369 + ], + "accuracy": [ + 8.14, + 9.17, + 11.89, + 14.73, + 16.67, + 18.22, + 19.77, + 22.87, + 26.61, + 29.46, + 31.78, + 33.59, + 33.98, + 34.24, + 34.37 + ] + }, + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90, + 95 + ], + "tokens": [ + 3488, + 7059, + 10542, + 13099, + 14972, + 16484, + 17524, + 18410, + 19112, + 19728, + 20259, + 20715, + 21135, + 21489, + 21858, + 22169, + 22422, + 22613, + 22802 + ], + "costs": [ + 0.0026, + 0.0097, + 0.0217, + 0.0369, + 0.0552, + 0.0743, + 0.0969, + 0.1204, + 0.1489, + 0.1769, + 0.2074, + 0.24, + 0.2763, + 0.3114, + 0.3522, + 0.3935, + 0.4408, + 0.4741, + 0.5461 + ], + "accuracy": [ + 9.43, + 11.11, + 13.57, + 16.02, + 17.57, + 18.86, + 19.77, + 20.16, + 20.8, + 21.19, + 21.45, + 22.09, + 22.48, + 22.61, + 22.87, + 23.13, + 23.13, + 23.13, + 23.26 + ] + }, + "deepseek-v3.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45 + ], + "tokens": [ + 3447, + 9155, + 13730, + 17912, + 21270, + 23962, + 26205, + 27253, + 27411 + ], + "costs": [ + 0.001, + 0.0048, + 0.011, + 0.0192, + 0.0282, + 0.0385, + 0.0499, + 0.0642, + 0.0694 + ], + "accuracy": [ + 9.3, + 12.53, + 14.73, + 17.05, + 20.16, + 23.9, + 25.97, + 26.87, + 27.0 + ] + } + }, + "10k": { + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80 + ], + "tokens": [ + 2561, + 7208, + 10978, + 14050, + 17080, + 19955, + 22501, + 25124, + 27696, + 30109, + 32363, + 34247, + 35154, + 35907, + 36148, + 36277 + ], + "costs": [ + 0.0094, + 0.0414, + 0.0955, + 0.1682, + 0.2576, + 0.3598, + 0.4751, + 0.5932, + 0.7209, + 0.8684, + 1.0029, + 1.0913, + 1.2015, + 1.3713, + 1.4854, + 1.5611 + ], + "accuracy": [ + 0.82, + 1.06, + 1.41, + 3.18, + 5.65, + 9.78, + 15.19, + 22.85, + 31.8, + 46.64, + 60.42, + 69.02, + 73.26, + 75.62, + 76.68, + 77.27 + ] + }, + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60 + ], + "tokens": [ + 2823, + 7902, + 11581, + 14651, + 16306, + 17356, + 17871, + 18251, + 18345, + 18398, + 18441, + 18468 + ], + "costs": [ + 0.0037, + 0.0199, + 0.0454, + 0.0774, + 0.1125, + 0.1524, + 0.1983, + 0.2657, + 0.3601, + 0.4706, + 0.5641, + 0.6699 + ], + "accuracy": [ + 0.82, + 8.36, + 22.85, + 32.98, + 37.57, + 40.52, + 43.23, + 44.29, + 44.41, + 44.52, + 44.76, + 44.99 + ] + }, + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90 + ], + "tokens": [ + 1838, + 3927, + 6056, + 8124, + 10227, + 12399, + 14608, + 16757, + 18614, + 20290, + 21537, + 22418, + 23164, + 23553, + 23781, + 23950, + 24062, + 24102 + ], + "costs": [ + 0.0013, + 0.0053, + 0.012, + 0.0214, + 0.0334, + 0.0481, + 0.0654, + 0.0855, + 0.1047, + 0.1277, + 0.1498, + 0.1724, + 0.2004, + 0.223, + 0.2716, + 0.3281, + 0.3281, + 0.4018 + ], + "accuracy": [ + 0.24, + 0.59, + 2.0, + 4.48, + 8.72, + 13.19, + 19.08, + 26.27, + 35.34, + 41.22, + 47.7, + 52.3, + 54.77, + 56.54, + 57.83, + 59.25, + 60.19, + 60.42 + ] + }, + "deepseek-v3.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60 + ], + "tokens": [ + 1997, + 4260, + 6707, + 9238, + 11703, + 14313, + 16306, + 18027, + 19074, + 19698, + 19875, + 19988 + ], + "costs": [ + 0.0006, + 0.0024, + 0.0054, + 0.0097, + 0.0153, + 0.0214, + 0.028, + 0.0355, + 0.0437, + 0.0505, + 0.06, + 0.0694 + ], + "accuracy": [ + 1.18, + 1.88, + 2.83, + 6.12, + 14.13, + 27.09, + 37.34, + 48.53, + 55.48, + 58.54, + 59.72, + 60.42 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "tokens": [ + 4473, + 12616, + 18404, + 20077, + 20729, + 20883, + 20935, + 21004 + ], + "costs": [ + 0.002, + 0.008, + 0.0171, + 0.0275, + 0.0385, + 0.0405, + 0.0538, + 0.0688 + ], + "accuracy": [ + 1.88, + 18.61, + 38.4, + 42.17, + 43.7, + 44.05, + 44.05, + 44.41 + ] + } + }, + "globem": { + "deepseek-v3.2": { + "turns": [ + 5, + 10, + 15, + 20, + 25 + ], + "tokens": [ + 3972, + 10497, + 20470, + 32293, + 36396 + ], + "costs": [ + 0.001, + 0.005, + 0.0135, + 0.0262, + 0.0382 + ], + "accuracy": [ + 0.92, + 0.92, + 4.6, + 27.59, + 36.78 + ] + }, + "GLM-4.6": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35 + ], + "tokens": [ + 3851, + 8817, + 16221, + 26186, + 32076, + 34058, + 34425 + ], + "costs": [ + 0.0027, + 0.0115, + 0.028, + 0.0544, + 0.0891, + 0.1409, + 0.1581 + ], + "accuracy": [ + 2.3, + 2.3, + 5.52, + 22.99, + 36.32, + 40.0, + 41.61 + ] + }, + "Gemini-3-Flash": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40 + ], + "tokens": [ + 6260, + 14825, + 25972, + 35526, + 40312, + 41787, + 42167, + 42236 + ], + "costs": [ + 0.0021, + 0.0074, + 0.0166, + 0.0292, + 0.047, + 0.0828, + 0.1479, + 0.2258 + ], + "accuracy": [ + 1.88, + 2.12, + 5.88, + 21.41, + 30.35, + 34.35, + 35.06, + 35.29 + ] + }, + "Claude-4.5-Sonnet": { + "turns": [ + 5, + 10, + 15, + 20, + 25, + 30 + ], + "tokens": [ + 4579, + 11405, + 21188, + 32526, + 44888, + 49137 + ], + "costs": [ + 0.0152, + 0.0638, + 0.1611, + 0.3124, + 0.4877, + 0.6335 + ], + "accuracy": [ + 2.53, + 2.53, + 3.45, + 12.64, + 30.57, + 40.23 + ] + }, + "GPT-5.2": { + "turns": [ + 5, + 10, + 15, + 20 + ], + "tokens": [ + 3477, + 10218, + 17672, + 19878 + ], + "costs": [ + 0.0048, + 0.0236, + 0.0652, + 0.1238 + ], + "accuracy": [ + 0.92, + 5.98, + 34.02, + 38.39 + ] + } + } + }, + ranking: { + "MIMIC": [ + { + "model": "gpt5-mini", + "bt_rank": 1, + "win_rate": 100.0, + "accuracy": 27.59, + "acc_rank": 7, + "is_proprietary": true + }, + { + "model": "claude4.5-sonnet", + "bt_rank": 2, + "win_rate": 94.6, + "accuracy": 33.66, + "acc_rank": 1, + "is_proprietary": true + }, + { + "model": "gpt5mini", + "bt_rank": 3, + "win_rate": 87.8, + "accuracy": 27.59, + "acc_rank": 8, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 4, + "win_rate": 83.6, + "accuracy": 28.88, + "acc_rank": 5, + "is_proprietary": true + }, + { + "model": "gpt5.1", + "bt_rank": 5, + "win_rate": 80.6, + "accuracy": 30.1, + "acc_rank": 3, + "is_proprietary": true + }, + { + "model": "gemini3-flash", + "bt_rank": 6, + "win_rate": 76.5, + "accuracy": 29.28, + "acc_rank": 4, + "is_proprietary": true + }, + { + "model": "kimi-k2", + "bt_rank": 7, + "win_rate": 73.1, + "accuracy": 30.17, + "acc_rank": 2, + "is_proprietary": false + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 8, + "win_rate": 70.5, + "accuracy": 27.65, + "acc_rank": 6, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 9, + "win_rate": 63.9, + "accuracy": 19.0, + "acc_rank": 14, + "is_proprietary": true + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 10, + "win_rate": 59.5, + "accuracy": 18.8, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "minimax-m2", + "bt_rank": 11, + "win_rate": 59.7, + "accuracy": 23.52, + "acc_rank": 10, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 12, + "win_rate": 52.1, + "accuracy": 23.84, + "acc_rank": 9, + "is_proprietary": false + }, + { + "model": "qwen3", + "bt_rank": 13, + "win_rate": 51.7, + "accuracy": 19.13, + "acc_rank": 13, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 14, + "win_rate": 40.3, + "accuracy": 20, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 15, + "win_rate": 35.4, + "accuracy": 16.64, + "acc_rank": 18, + "is_proprietary": true + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 32.4, + "accuracy": 14.15, + "acc_rank": 20, + "is_proprietary": false + }, + { + "model": "qwen2.5-32b-instruct", + "bt_rank": 17, + "win_rate": 32.3, + "accuracy": 13.12, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash", + "bt_rank": 18, + "win_rate": 31.2, + "accuracy": 18.61, + "acc_rank": 16, + "is_proprietary": true + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 19, + "win_rate": 29.5, + "accuracy": 14.92, + "acc_rank": 19, + "is_proprietary": false + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 20, + "win_rate": 27.3, + "accuracy": 16.93, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 21, + "win_rate": 17.3, + "accuracy": 20, + "acc_rank": 12, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 22, + "win_rate": 14.2, + "accuracy": 7.3, + "acc_rank": 22, + "is_proprietary": false + } + ], + "10K": [ + { + "model": "claude4.5-sonnet", + "bt_rank": 1, + "win_rate": 92.8, + "accuracy": 69.26, + "acc_rank": 1, + "is_proprietary": true + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 2, + "win_rate": 80.6, + "accuracy": 49.41, + "acc_rank": 2, + "is_proprietary": false + }, + { + "model": "gpt5mini", + "bt_rank": 3, + "win_rate": 80.4, + "accuracy": 41.56, + "acc_rank": 5, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 4, + "win_rate": 78.0, + "accuracy": 43.11, + "acc_rank": 4, + "is_proprietary": true + }, + { + "model": "kimi-k2", + "bt_rank": 5, + "win_rate": 77.0, + "accuracy": 41.17, + "acc_rank": 7, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 6, + "win_rate": 71.4, + "accuracy": 48.29, + "acc_rank": 3, + "is_proprietary": false + }, + { + "model": "gemini3-flash", + "bt_rank": 7, + "win_rate": 63.6, + "accuracy": 39.5, + "acc_rank": 8, + "is_proprietary": true + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 8, + "win_rate": 59.2, + "accuracy": 38.34, + "acc_rank": 9, + "is_proprietary": false + }, + { + "model": "minimax-m2", + "bt_rank": 9, + "win_rate": 54.4, + "accuracy": 35.74, + "acc_rank": 10, + "is_proprietary": false + }, + { + "model": "gpt5.1", + "bt_rank": 10, + "win_rate": 54.0, + "accuracy": 41.23, + "acc_rank": 6, + "is_proprietary": true + }, + { + "model": "qwen3", + "bt_rank": 11, + "win_rate": 51.0, + "accuracy": 28.23, + "acc_rank": 12, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 12, + "win_rate": 45.6, + "accuracy": 20, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 13, + "win_rate": 44.8, + "accuracy": 20.91, + "acc_rank": 13, + "is_proprietary": true + }, + { + "model": "qwen2.5-32b-instruct", + "bt_rank": 14, + "win_rate": 41.2, + "accuracy": 17.83, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 15, + "win_rate": 34.6, + "accuracy": 20.79, + "acc_rank": 14, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 31.6, + "accuracy": 14.65, + "acc_rank": 18, + "is_proprietary": false + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 17, + "win_rate": 30.0, + "accuracy": 30.43, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 18, + "win_rate": 29.6, + "accuracy": 14.37, + "acc_rank": 19, + "is_proprietary": true + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 19, + "win_rate": 27.4, + "accuracy": 20, + "acc_rank": 16, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash", + "bt_rank": 20, + "win_rate": 25.2, + "accuracy": 12.61, + "acc_rank": 20, + "is_proprietary": true + }, + { + "model": "qwen2.5-7B-Instruct", + "bt_rank": 21, + "win_rate": 22.0, + "accuracy": 7.53, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 22, + "win_rate": 18.6, + "accuracy": 6.51, + "acc_rank": 22, + "is_proprietary": false + } + ], + "GLOBEM": [ + { + "model": "claude4.5-sonnet", + "bt_rank": 1, + "win_rate": 93.0, + "accuracy": 39.54, + "acc_rank": 2, + "is_proprietary": true + }, + { + "model": "gpt5-mini", + "bt_rank": 2, + "win_rate": 60.0, + "accuracy": 33.91, + "acc_rank": 12, + "is_proprietary": true + }, + { + "model": "gemini3-flash", + "bt_rank": 3, + "win_rate": 81.2, + "accuracy": 35.46, + "acc_rank": 9, + "is_proprietary": true + }, + { + "model": "minimax-m2", + "bt_rank": 4, + "win_rate": 77.8, + "accuracy": 36.9, + "acc_rank": 6, + "is_proprietary": false + }, + { + "model": "gpt5mini", + "bt_rank": 5, + "win_rate": 73.8, + "accuracy": 33.91, + "acc_rank": 13, + "is_proprietary": true + }, + { + "model": "gpt5.1", + "bt_rank": 6, + "win_rate": 67.5, + "accuracy": 36.76, + "acc_rank": 7, + "is_proprietary": true + }, + { + "model": "gpt5.2", + "bt_rank": 7, + "win_rate": 64.4, + "accuracy": 38.39, + "acc_rank": 3, + "is_proprietary": true + }, + { + "model": "qwen3", + "bt_rank": 8, + "win_rate": 64.7, + "accuracy": 36.32, + "acc_rank": 8, + "is_proprietary": false + }, + { + "model": "run_api_deepseek_deepseek-chat", + "bt_rank": 9, + "win_rate": 64.5, + "accuracy": 38.39, + "acc_rank": 4, + "is_proprietary": false + }, + { + "model": "glm4.6", + "bt_rank": 10, + "win_rate": 53.6, + "accuracy": 39.77, + "acc_rank": 1, + "is_proprietary": false + }, + { + "model": "kimi-k2", + "bt_rank": 11, + "win_rate": 52.2, + "accuracy": 37.01, + "acc_rank": 5, + "is_proprietary": false + }, + { + "model": "gemini2.5-pro", + "bt_rank": 12, + "win_rate": 45.6, + "accuracy": 34.6, + "acc_rank": 10, + "is_proprietary": true + }, + { + "model": "qwen2.5-72B-Instruct", + "bt_rank": 13, + "win_rate": 43.3, + "accuracy": 27.13, + "acc_rank": 14, + "is_proprietary": false + }, + { + "model": "qwen2.5-32B-Instruct", + "bt_rank": 14, + "win_rate": 42.1, + "accuracy": 20, + "acc_rank": 20, + "is_proprietary": false + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "bt_rank": 15, + "win_rate": 41.5, + "accuracy": 34.14, + "acc_rank": 11, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct", + "bt_rank": 16, + "win_rate": 40.8, + "accuracy": 26.13, + "acc_rank": 16, + "is_proprietary": false + }, + { + "model": "gemini2.5-flash-lite", + "bt_rank": 17, + "win_rate": 37.4, + "accuracy": 25.52, + "acc_rank": 18, + "is_proprietary": true + }, + { + "model": "qwen3-4B-Instruct-2507", + "bt_rank": 18, + "win_rate": 36.6, + "accuracy": 26.9, + "acc_rank": 15, + "is_proprietary": false + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "bt_rank": 19, + "win_rate": 32.0, + "accuracy": 20, + "acc_rank": 21, + "is_proprietary": false + }, + { + "model": "llama3.3-70B", + "bt_rank": 20, + "win_rate": 28.1, + "accuracy": 22.65, + "acc_rank": 19, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct", + "bt_rank": 21, + "win_rate": 22.2, + "accuracy": 25.64, + "acc_rank": 17, + "is_proprietary": false + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "bt_rank": 22, + "win_rate": 19.7, + "accuracy": 20, + "acc_rank": 22, + "is_proprietary": false + } + ] + }, + turn: { + "mimic": [ + { + "model": "claude4.5-sonnet", + "median": 52, + "distribution": [ + 0.0, + 0.0, + 1.0, + 5.0, + 31.0, + 43.0, + 13.0, + 7.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 43, + "distribution": [ + 0.0, + 1.0, + 12.0, + 29.0, + 13.0, + 9.0, + 3.0, + 2.0, + 0.0, + 31.0 + ] + }, + { + "model": "gpt5-mini", + "median": 39, + "distribution": [ + 0.0, + 0.0, + 9.0, + 42.0, + 36.0, + 12.0, + 1.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 39, + "distribution": [ + 0.0, + 6.3, + 23.4, + 20.7, + 7.2, + 13.5, + 3.6, + 6.3, + 4.5, + 14.4 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 33, + "distribution": [ + 0.0, + 2.0, + 22.0, + 60.0, + 16.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.2", + "median": 30, + "distribution": [ + 0.0, + 10.0, + 36.0, + 32.0, + 12.0, + 10.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 23, + "distribution": [ + 1.5, + 39.7, + 29.4, + 19.9, + 9.6, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "kimi-k2", + "median": 19, + "distribution": [ + 0.0, + 55.0, + 44.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "minimax-m2", + "median": 18, + "distribution": [ + 0.0, + 70.0, + 30.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 17, + "distribution": [ + 12.0, + 52.0, + 24.0, + 10.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 15, + "distribution": [ + 10.6, + 70.2, + 19.2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 15, + "distribution": [ + 7.0, + 71.0, + 22.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 14, + "distribution": [ + 0.0, + 98.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 14, + "distribution": [ + 7.0, + 68.0, + 22.0, + 2.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 12, + "distribution": [ + 23.0, + 62.0, + 12.0, + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 0.0, + 1.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 11, + "distribution": [ + 15.0, + 85.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 11, + "distribution": [ + 24.0, + 74.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32b-instruct", + "median": 11, + "distribution": [ + 33.0, + 67.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 11, + "distribution": [ + 29.0, + 71.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 10, + "distribution": [ + 34.0, + 65.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 10, + "distribution": [ + 34.0, + 65.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 9, + "distribution": [ + 64.0, + 36.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 8, + "distribution": [ + 73.0, + 27.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 7, + "distribution": [ + 90.0, + 10.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama-3.3-70B", + "median": 6, + "distribution": [ + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 6, + "distribution": [ + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 4, + "distribution": [ + 91.0, + 9.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ], + "10k": [ + { + "model": "claude4.5-sonnet", + "median": 56, + "distribution": [ + 0.0, + 0.0, + 1.0, + 6.0, + 13.0, + 44.0, + 27.0, + 6.0, + 3.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 52, + "distribution": [ + 0.0, + 0.0, + 3.8, + 10.4, + 27.4, + 27.4, + 18.9, + 5.7, + 4.7, + 1.9 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 39, + "distribution": [ + 0.0, + 0.0, + 11.0, + 40.0, + 37.0, + 9.0, + 3.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5mini", + "median": 35, + "distribution": [ + 0.0, + 4.0, + 27.8, + 36.5, + 24.6, + 6.3, + 0.8, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 26, + "distribution": [ + 0.8, + 25.4, + 30.2, + 7.9, + 2.4, + 0.0, + 0.0, + 0.8, + 0.0, + 32.5 + ] + }, + { + "model": "kimi-k2", + "median": 24, + "distribution": [ + 0.0, + 29.0, + 48.0, + 21.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "minimax-m2", + "median": 20, + "distribution": [ + 0.0, + 43.0, + 48.0, + 9.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 20, + "distribution": [ + 0.0, + 46.0, + 27.0, + 11.0, + 2.0, + 6.0, + 3.0, + 1.0, + 1.0, + 3.0 + ] + }, + { + "model": "gpt5.2", + "median": 20, + "distribution": [ + 0.0, + 43.0, + 41.0, + 12.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 17, + "distribution": [ + 1.0, + 69.0, + 29.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 16, + "distribution": [ + 17.0, + 44.0, + 27.0, + 10.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 15, + "distribution": [ + 7.0, + 73.0, + 18.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 14, + "distribution": [ + 14.0, + 78.0, + 8.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 13, + "distribution": [ + 10.0, + 82.0, + 7.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 12, + "distribution": [ + 21.0, + 69.0, + 8.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 12, + "distribution": [ + 4.0, + 91.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 11, + "distribution": [ + 31.0, + 64.3, + 4.0, + 0.8, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 11, + "distribution": [ + 28.0, + 67.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32b-instruct", + "median": 10, + "distribution": [ + 34.1, + 65.9, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 9, + "distribution": [ + 58.0, + 41.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 9, + "distribution": [ + 58.0, + 42.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 8, + "distribution": [ + 81.0, + 19.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 8, + "distribution": [ + 70.0, + 29.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 7, + "distribution": [ + 75.0, + 25.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 7, + "distribution": [ + 84.0, + 16.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 1, + "distribution": [ + 92.0, + 7.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ], + "globem": [ + { + "model": "claude4.5-sonnet", + "median": 25, + "distribution": [ + 0.0, + 6.0, + 87.0, + 7.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini3-flash", + "median": 21, + "distribution": [ + 2.0, + 36.0, + 58.0, + 3.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "glm4.6", + "median": 21, + "distribution": [ + 0.0, + 23.0, + 66.0, + 11.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "run_api_deepseek_deepseek-chat", + "median": 20, + "distribution": [ + 0.0, + 32.0, + 68.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-note", + "median": 19, + "distribution": [ + 16.0, + 36.0, + 33.0, + 9.0, + 5.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3", + "median": 19, + "distribution": [ + 0.0, + 50.0, + 38.0, + 9.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0 + ] + }, + { + "model": "minimax-m2", + "median": 17, + "distribution": [ + 0.0, + 80.0, + 20.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5-mini", + "median": 17, + "distribution": [ + 2.0, + 78.0, + 20.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "kimi-k2", + "median": 17, + "distribution": [ + 0.0, + 82.0, + 18.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-32B-Instruct", + "median": 15, + "distribution": [ + 1.0, + 84.0, + 14.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.2", + "median": 15, + "distribution": [ + 0.0, + 92.0, + 8.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-72B-Instruct", + "median": 14, + "distribution": [ + 4.0, + 78.0, + 17.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash-lite", + "median": 14, + "distribution": [ + 7.0, + 80.0, + 12.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct-1M", + "median": 14, + "distribution": [ + 13.0, + 66.0, + 16.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-14B-Instruct", + "median": 13, + "distribution": [ + 16.0, + 82.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct", + "median": 12, + "distribution": [ + 0.0, + 99.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-pro", + "median": 12, + "distribution": [ + 3.0, + 94.0, + 3.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct-1M", + "median": 12, + "distribution": [ + 18.0, + 73.0, + 7.0, + 0.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gemini2.5-flash", + "median": 12, + "distribution": [ + 15.0, + 85.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-4B-Instruct-2507", + "median": 12, + "distribution": [ + 12.0, + 83.0, + 5.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "gpt5.1", + "median": 11, + "distribution": [ + 30.0, + 70.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-noreasoning", + "median": 9, + "distribution": [ + 57.0, + 42.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-longreasoning", + "median": 9, + "distribution": [ + 69.0, + 30.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen3-next-80b-a3b-instruct-shortreasoning", + "median": 9, + "distribution": [ + 66.0, + 34.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "qwen2.5-7B-Instruct", + "median": 9, + "distribution": [ + 53.0, + 45.0, + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + { + "model": "llama3.3-70B", + "median": 6, + "distribution": [ + 98.0, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } + ] + }, + probing: { + "byTurn": { + "mimic": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.3, + -4.21, + -4.04, + -3.87, + -3.59, + -3.62, + -3.33, + -3.4, + -2.93, + -3.21 + ], + "sem": [ + 0.25, + 0.27, + 0.32, + 0.35, + 0.35, + 0.36, + 0.34, + 0.35, + 0.32, + 0.4 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.51, + -3.98, + -3.68, + -3.8, + -3.26, + -3.22, + -3.12, + -3.24, + -3.08, + -2.84 + ], + "sem": [ + 0.15, + 0.21, + 0.21, + 0.23, + 0.23, + 0.21, + 0.25, + 0.25, + 0.28, + 0.08 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.48, + -3.25, + -3.3, + -2.74, + -2.75, + -2.73, + -2.72, + -2.67, + -2.62, + -2.25 + ], + "sem": [ + 0.04, + 0.05, + 0.04, + 0.07, + 0.06, + 0.07, + 0.07, + 0.07, + 0.06, + 0.06 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.94, + -5.21, + -5.51, + -5.05, + -4.96, + -4.95, + -4.75, + -4.73, + -4.6, + -4.72 + ], + "sem": [ + 0.15, + 0.18, + 0.2, + 0.18, + 0.19, + 0.19, + 0.17, + 0.18, + 0.16, + 0.18 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -2.85, + -2.86, + -2.74, + -2.65, + -2.31, + -2.14, + -1.98, + -2.03, + -1.88, + -1.82 + ], + "sem": [ + 0.1, + 0.1, + 0.11, + 0.11, + 0.11, + 0.13, + 0.13, + 0.18, + 0.17, + 0.09 + ] + } + }, + "globem": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.48, + -5.83, + -5.84, + -5.91, + -6.01, + -6.03, + -5.86, + -5.73, + -5.78, + -5.73 + ], + "sem": [ + 0.24, + 0.28, + 0.31, + 0.33, + 0.33, + 0.35, + 0.33, + 0.35, + 0.35, + 0.36 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.68, + -5.56, + -5.65, + -5.59, + -5.59, + -5.49, + -5.54, + -5.4, + -5.57, + -5.53 + ], + "sem": [ + 0.13, + 0.18, + 0.23, + 0.23, + 0.25, + 0.25, + 0.29, + 0.32, + 0.38, + 0.46 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.67, + -4.16, + -3.9, + -3.76, + -3.6, + -3.47, + -3.05, + -2.99, + -2.93, + -2.78 + ], + "sem": [ + 0.08, + 0.07, + 0.06, + 0.06, + 0.07, + 0.08, + 0.07, + 0.08, + 0.08, + 0.09 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.28, + -5.23, + -5.2, + -5.19, + -5.2, + -5.01, + -5.21, + -4.95, + -4.93, + -4.81 + ], + "sem": [ + 0.09, + 0.09, + 0.09, + 0.08, + 0.08, + 0.08, + 0.09, + 0.09, + 0.1, + 0.1 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.1, + -3.15, + -3.06, + -3.01, + -2.95, + -2.88, + -2.78, + -2.4, + -2.46, + -1.89 + ], + "sem": [ + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.07, + 0.06, + 0.14, + 0.1 + ] + } + }, + "10k": { + "Qwen2.5-32B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -6.59, + -7.15, + -6.99, + -6.95, + -6.82, + -6.88, + -6.71, + -6.58, + -6.67, + -6.45 + ], + "sem": [ + 0.26, + 0.28, + 0.29, + 0.3, + 0.29, + 0.29, + 0.29, + 0.32, + 0.36, + 0.41 + ] + }, + "Qwen2.5-72B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -5.51, + -7.02, + -6.45, + -6.11, + -5.98, + -6.52, + -7.02, + -7.88, + -8.05, + -7.66 + ], + "sem": [ + 0.26, + 0.34, + 0.34, + 0.36, + 0.4, + 0.53, + 0.62, + 0.71, + 0.81, + 0.92 + ] + }, + "Qwen3-4B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -4.68, + -4.3, + -3.57, + -3.33, + -3.27, + -3.22, + -3.06, + -2.9, + -2.75, + -2.57 + ], + "sem": [ + 0.18, + 0.17, + 0.15, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14, + 0.14 + ] + }, + "Qwen3-30B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.23, + -3.31, + -3.28, + -3.16, + -3.06, + -2.97, + -2.94, + -2.87, + -2.83, + -2.73 + ], + "sem": [ + 0.17, + 0.17, + 0.17, + 0.17, + 0.17, + 0.16, + 0.17, + 0.18, + 0.18, + 0.17 + ] + }, + "Qwen3-Next-80B-A3B": { + "turns": [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "logprob": [ + -3.25, + -3.42, + -3.21, + -2.94, + -2.81, + -2.75, + -2.7, + -2.65, + -2.55, + -2.45 + ], + "sem": [ + 0.16, + 0.17, + 0.17, + 0.17, + 0.16, + 0.17, + 0.16, + 0.16, + 0.16, + 0.16 + ] + } + } + }, + "byProgress": { + "mimic": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.3, + -4.12, + -3.73, + -3.62, + -3.36, + -3.05, + -2.94, + -3.12, + -4.6, + -4.42 + ], + "sem": [ + 0.25, + 0.21, + 0.25, + 0.36, + 0.24, + 0.25, + 0.38, + 0.45, + 1.5, + 0.1 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.51, + -3.98, + -3.74, + -3.26, + -3.17, + -3.24, + -2.99, + -2.53, + -2.58, + -2.42 + ], + "sem": [ + 0.15, + 0.21, + 0.16, + 0.23, + 0.17, + 0.25, + 0.18, + 0.09, + 0.09, + 0.2 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.37, + -2.93, + -2.71, + -2.33, + -1.99, + -2.04, + -1.57, + -1.46, + -1.48, + -1.44 + ], + "sem": [ + 0.03, + 0.04, + 0.04, + 0.04, + 0.05, + 0.08, + 0.1, + 0.05, + 0.0, + 0.01 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.13, + -4.72, + -4.42, + -4.17, + -4.04, + -3.9, + -3.64, + -3.45, + -3.36, + -3.17 + ], + "sem": [ + 0.08, + 0.07, + 0.07, + 0.07, + 0.07, + 0.08, + 0.1, + 0.14, + 0.15, + 0.26 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -2.85, + -2.8, + -2.65, + -2.22, + -1.98, + -1.96, + -1.79, + -1.74, + -1.83, + -1.85 + ], + "sem": [ + 0.1, + 0.07, + 0.11, + 0.09, + 0.13, + 0.12, + 0.08, + 0.16, + 0.15, + 0.39 + ] + } + }, + "globem": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.66, + -5.92, + -5.88, + -5.79, + -5.79, + -5.55, + -5.47, + -4.8, + -3.55, + -3.24 + ], + "sem": [ + 0.18, + 0.19, + 0.2, + 0.21, + 0.29, + 0.29, + 0.47, + 0.63, + 0.19, + 0.47 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.68, + -5.56, + -5.62, + -5.59, + -5.51, + -5.4, + -5.56, + -5.03, + -5.77, + -7.71 + ], + "sem": [ + 0.13, + 0.18, + 0.16, + 0.25, + 0.19, + 0.32, + 0.29, + 0.55, + 0.83, + 0.1 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.42, + -3.83, + -3.38, + -2.96, + -2.71, + -2.6, + -2.46, + -2.53, + -2.63, + -2.61 + ], + "sem": [ + 0.06, + 0.04, + 0.04, + 0.05, + 0.07, + 0.08, + 0.12, + 0.14, + 0.25, + 0.04 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.26, + -5.2, + -5.06, + -4.82, + -4.5, + -4.51, + -4.37, + -4.1, + -4.03, + -3.74 + ], + "sem": [ + 0.06, + 0.05, + 0.05, + 0.06, + 0.07, + 0.08, + 0.1, + 0.29, + 0.25, + 0.11 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.1, + -3.15, + -3.06, + -3.01, + -2.95, + -2.88, + -2.78, + -2.4, + -2.46 + ], + "sem": [ + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.06, + 0.07, + 0.06, + 0.14 + ] + } + }, + "10k": { + "Qwen2.5-32B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -6.59, + -7.07, + -6.89, + -6.8, + -6.58, + -6.58, + -6.76, + -8.0, + -8.59, + -8.83 + ], + "sem": [ + 0.26, + 0.2, + 0.21, + 0.2, + 0.32, + 0.27, + 0.39, + 0.57, + 0.84, + 1.12 + ] + }, + "Qwen2.5-72B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -5.51, + -7.02, + -6.28, + -5.98, + -6.52, + -7.33, + -8.05, + -7.85, + -8.41, + -7.15 + ], + "sem": [ + 0.26, + 0.34, + 0.25, + 0.4, + 0.53, + 0.47, + 0.81, + 0.79, + 1.45, + 1.26 + ] + }, + "Qwen3-4B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -4.49, + -3.45, + -3.19, + -2.83, + -2.5, + -2.27, + -2.31, + -2.31, + -2.35, + -1.73 + ], + "sem": [ + 0.12, + 0.1, + 0.08, + 0.1, + 0.1, + 0.11, + 0.2, + 0.29, + 0.36, + 0.03 + ] + }, + "Qwen3-30B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.14, + -2.66, + -2.29, + -2.26, + -1.97, + -1.88, + -1.52, + -1.36, + -1.61, + -1.61 + ], + "sem": [ + 0.06, + 0.06, + 0.07, + 0.1, + 0.14, + 0.18, + 0.08, + 0.02, + 0.05, + 0.08 + ] + }, + "Qwen3-Next-80B-A3B": { + "progress": [ + 0, + 10, + 20, + 30, + 40, + 50, + 60, + 70, + 80, + 90 + ], + "logprob": [ + -3.34, + -2.99, + -2.7, + -2.5, + -2.43, + -2.55, + -2.18, + -2.28, + -2.19, + -2.5 + ], + "sem": [ + 0.12, + 0.1, + 0.1, + 0.11, + 0.11, + 0.15, + 0.21, + 0.22, + 0.26, + 0.38 + ] + } + } + } + }, + probingColors: { + "Qwen2.5-32B": "#4A90D9", + "Qwen2.5-72B": "#1A5FB4", + "Qwen3-4B": "#57E389", + "Qwen3-30B-A3B": "#26A269", + "Qwen3-Next-80B-A3B": "#9141AC" + }, + error: [ + { + "main_category": "Fail in Exploration", + "subcategory": "Insufficient Breadth", + "count": 64, + "percentage": 31.1, + "color": "#1565C0" + }, + { + "main_category": "Fail in Exploration", + "subcategory": "Insufficient Depth", + "count": 56, + "percentage": 27.2, + "color": "#42A5F5" + }, + { + "main_category": "Poor Data-to-Insight", + "subcategory": "Insight Misinterpretation", + "count": 19, + "percentage": 9.2, + "color": "#2E7D32" + }, + { + "main_category": "Poor Data-to-Insight", + "subcategory": "Superficial Analysis", + "count": 16, + "percentage": 7.8, + "color": "#43A047" + }, + { + "main_category": "Poor Data-to-Insight", + "subcategory": "Over Reasoning", + "count": 15, + "percentage": 7.3, + "color": "#81C784" + }, + { + "main_category": "Lost in Context", + "subcategory": "Lost in Debugging", + "count": 18, + "percentage": 8.7, + "color": "#C62828" + }, + { + "main_category": "Lost in Context", + "subcategory": "Fail in Summarization", + "count": 10, + "percentage": 4.9, + "color": "#E53935" + }, + { + "main_category": "Lost in Context", + "subcategory": "Poor Instruction Following", + "count": 8, + "percentage": 3.9, + "color": "#EF9A9A" + } + ] }; \ No newline at end of file