thinkwee commited on
Commit ·
f17ef98
1
Parent(s): cf573f9
correct data
Browse files
data.js
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
-
// DDR-Bench Visualization Data
|
| 2 |
-
//
|
| 3 |
|
| 4 |
const DDR_DATA = {
|
| 5 |
-
// Color scheme for models
|
| 6 |
modelColors: {
|
| 7 |
'GPT-5.2': '#00C853',
|
| 8 |
'Claude-4.5-Sonnet': '#FF6D00',
|
|
@@ -12,317 +11,282 @@ const DDR_DATA = {
|
|
| 12 |
'Qwen3-Next-80B-A3B': '#FFC107',
|
| 13 |
'Kimi-K2': '#FFA500',
|
| 14 |
'MiniMax-M2': '#20B2AA',
|
| 15 |
-
// Probing models
|
| 16 |
'Qwen2.5-32B': '#4A90D9',
|
| 17 |
'Qwen2.5-72B': '#1A5FB4',
|
| 18 |
'Qwen3-4B': '#57E389',
|
| 19 |
'Qwen3-30B-A3B': '#26A269',
|
| 20 |
},
|
| 21 |
-
|
| 22 |
-
// Scaling Analysis Data
|
| 23 |
scaling: {
|
| 24 |
-
mimic: {
|
| 25 |
'GPT-5.2': {
|
| 26 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 27 |
-
tokens: [51,
|
| 28 |
-
costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.
|
| 29 |
-
accuracy: [2.
|
| 30 |
-
}
|
| 31 |
-
'Claude-4.5-Sonnet': {
|
| 32 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 33 |
-
tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378, 14190, 15001, 15723, 16457, 17218],
|
| 34 |
-
costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.1410, 0.1580, 0.1758, 0.1944, 0.2138],
|
| 35 |
-
accuracy: [3.5, 7.0, 10.5, 14.0, 17.2, 20.2, 23.0, 25.5, 27.8, 29.8, 31.5, 32.8, 33.8, 34.2, 34.3, 34.4, 34.4, 34.4, 34.4, 34.37]
|
| 36 |
-
},
|
| 37 |
-
'Gemini-3-Flash': {
|
| 38 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 39 |
-
tokens: [
|
| 40 |
-
costs: [0.
|
| 41 |
-
accuracy: [2.
|
| 42 |
-
}
|
| 43 |
-
'
|
| 44 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 45 |
-
tokens: [
|
| 46 |
-
costs: [0.0001, 0.
|
| 47 |
-
accuracy: [
|
| 48 |
-
}
|
| 49 |
-
'
|
| 50 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 51 |
-
tokens: [
|
| 52 |
-
costs: [0.0001, 0.0006, 0.
|
| 53 |
-
accuracy: [
|
| 54 |
}
|
| 55 |
-
}
|
| 56 |
-
'10k': {
|
| 57 |
'GPT-5.2': {
|
| 58 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 59 |
-
tokens: [
|
| 60 |
-
costs: [0.
|
| 61 |
-
accuracy: [
|
| 62 |
-
}
|
| 63 |
-
'Claude-4.5-Sonnet': {
|
| 64 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 65 |
-
tokens: [30, 1420, 1580, 2970, 4200, 5550, 6200, 6870, 7830, 8570, 9130, 9870, 10710, 11620, 12410, 13150, 13890, 14550, 15220, 15920],
|
| 66 |
-
costs: [0.0004, 0.0025, 0.0049, 0.0089, 0.0140, 0.0205, 0.0277, 0.0357, 0.0447, 0.0545, 0.0649, 0.0760, 0.0882, 0.1014, 0.1154, 0.1303, 0.1460, 0.1624, 0.1796, 0.1976],
|
| 67 |
-
accuracy: [7.7, 15.5, 23.2, 30.9, 38.4, 45.6, 52.6, 59.2, 65.5, 70.5, 74.2, 76.0, 77.0, 77.3, 77.3, 77.3, 77.3, 77.3, 77.3, 77.27]
|
| 68 |
-
},
|
| 69 |
-
'Gemini-3-Flash': {
|
| 70 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 71 |
-
tokens: [
|
| 72 |
-
costs: [0.
|
| 73 |
-
accuracy: [
|
| 74 |
-
}
|
| 75 |
-
'
|
| 76 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 77 |
-
tokens: [
|
| 78 |
-
costs: [0.0001, 0.
|
| 79 |
-
accuracy: [
|
| 80 |
-
}
|
| 81 |
-
'
|
| 82 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 83 |
-
tokens: [
|
| 84 |
-
costs: [0.0001, 0.
|
| 85 |
-
accuracy: [
|
| 86 |
}
|
| 87 |
-
}
|
| 88 |
-
globem: {
|
| 89 |
'GPT-5.2': {
|
| 90 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
| 91 |
-
tokens: [
|
| 92 |
-
costs: [0.0005, 0.
|
| 93 |
-
accuracy: [
|
| 94 |
-
}
|
| 95 |
-
'Claude-4.5-Sonnet': {
|
| 96 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
| 97 |
-
tokens: [
|
| 98 |
-
costs: [0.
|
| 99 |
-
accuracy: [
|
| 100 |
-
}
|
| 101 |
-
'Gemini-3-Flash': {
|
| 102 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
| 103 |
-
tokens: [
|
| 104 |
-
costs: [0.
|
| 105 |
-
accuracy: [
|
| 106 |
-
}
|
| 107 |
-
'GLM-4.6': {
|
| 108 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
| 109 |
-
tokens: [
|
| 110 |
-
costs: [0.0001, 0.
|
| 111 |
-
accuracy: [
|
| 112 |
-
},
|
| 113 |
-
'DeepSeek-V3.2': {
|
| 114 |
-
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
| 115 |
-
tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610],
|
| 116 |
-
costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252],
|
| 117 |
-
accuracy: [3.8, 7.6, 11.5, 15.3, 19.0, 22.7, 26.2, 29.6, 32.8, 35.5, 37.2, 38.0, 38.1, 38.2, 38.16]
|
| 118 |
}
|
| 119 |
}
|
| 120 |
},
|
| 121 |
-
|
| 122 |
-
// Ranking Comparison Data
|
| 123 |
ranking: {
|
| 124 |
-
MIMIC: [
|
| 125 |
-
{
|
| 126 |
-
{
|
| 127 |
-
{
|
| 128 |
-
{
|
| 129 |
-
{
|
| 130 |
-
{
|
| 131 |
-
{
|
| 132 |
-
{
|
| 133 |
-
{
|
| 134 |
-
{
|
| 135 |
-
{
|
| 136 |
-
{
|
| 137 |
-
{
|
| 138 |
-
{
|
| 139 |
-
{
|
| 140 |
-
{
|
| 141 |
-
{
|
| 142 |
-
{
|
| 143 |
-
{
|
| 144 |
-
{
|
| 145 |
-
{
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
{
|
| 150 |
-
{
|
| 151 |
-
{
|
| 152 |
-
{
|
| 153 |
-
{
|
| 154 |
-
{
|
| 155 |
-
{
|
| 156 |
-
{
|
| 157 |
-
{
|
| 158 |
-
{
|
| 159 |
-
{
|
| 160 |
-
{
|
| 161 |
-
{
|
| 162 |
-
{
|
| 163 |
-
{
|
| 164 |
-
{
|
| 165 |
-
{
|
| 166 |
-
{
|
| 167 |
-
{
|
| 168 |
-
{
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
{
|
| 174 |
-
{
|
| 175 |
-
{
|
| 176 |
-
{
|
| 177 |
-
{
|
| 178 |
-
{
|
| 179 |
-
{
|
| 180 |
-
{
|
| 181 |
-
{
|
| 182 |
-
{
|
| 183 |
-
{
|
| 184 |
-
{
|
| 185 |
-
{
|
| 186 |
-
{
|
| 187 |
-
{
|
| 188 |
-
{
|
| 189 |
-
{
|
| 190 |
-
{
|
| 191 |
-
{
|
| 192 |
-
{ model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 25.64, acc_rank: 20, is_proprietary: false },
|
| 193 |
-
{ model: 'Gemini2.5-Flash-Lite', bt_rank: 21, win_rate: 19.5, accuracy: 25.52, acc_rank: 21, is_proprietary: true },
|
| 194 |
-
{ model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 22.65, acc_rank: 22, is_proprietary: false }
|
| 195 |
]
|
| 196 |
},
|
| 197 |
-
|
| 198 |
-
// Turn Distribution Data (distribution: percentage in bins [0-10, 10-20, ..., 90-100])
|
| 199 |
turn: {
|
| 200 |
-
mimic: [
|
| 201 |
-
{
|
| 202 |
-
{
|
| 203 |
-
{
|
| 204 |
-
{
|
| 205 |
-
{
|
| 206 |
-
{
|
| 207 |
-
{
|
| 208 |
-
{
|
| 209 |
-
{
|
| 210 |
-
{
|
| 211 |
-
{
|
| 212 |
-
{
|
| 213 |
-
{
|
| 214 |
-
{
|
| 215 |
-
]
|
| 216 |
-
'10k': [
|
| 217 |
-
{
|
| 218 |
-
{
|
| 219 |
-
{
|
| 220 |
-
{
|
| 221 |
-
{
|
| 222 |
-
{
|
| 223 |
-
{
|
| 224 |
-
{
|
| 225 |
-
{
|
| 226 |
-
{
|
| 227 |
-
{
|
| 228 |
-
{
|
| 229 |
-
{
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
{
|
| 234 |
-
{
|
| 235 |
-
{
|
| 236 |
-
{
|
| 237 |
-
{
|
| 238 |
-
{
|
| 239 |
-
{
|
| 240 |
-
{
|
| 241 |
-
{
|
| 242 |
-
{
|
| 243 |
-
{
|
| 244 |
-
{
|
| 245 |
-
{
|
| 246 |
-
{ model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 32, 22, 9, 3, 1, 1, 0, 0] }
|
| 247 |
]
|
| 248 |
},
|
| 249 |
-
|
| 250 |
-
// Entropy Analysis Data
|
| 251 |
entropy: {
|
| 252 |
-
mimic: {
|
| 253 |
-
'GPT-5.2': {
|
| 254 |
-
'Claude-4.5-Sonnet': {
|
| 255 |
-
'Gemini-3-Flash': {
|
| 256 |
-
'GLM-4.6': {
|
| 257 |
-
'DeepSeek-V3.2': {
|
| 258 |
-
}
|
| 259 |
-
'10k': {
|
| 260 |
-
'GPT-5.2': {
|
| 261 |
-
'Claude-4.5-Sonnet': {
|
| 262 |
-
'Gemini-3-Flash': {
|
| 263 |
-
'GLM-4.6': {
|
| 264 |
-
'DeepSeek-V3.2': {
|
| 265 |
-
}
|
| 266 |
-
globem: {
|
| 267 |
-
'GPT-5.2': {
|
| 268 |
-
'Claude-4.5-Sonnet': {
|
| 269 |
-
'Gemini-3-Flash': {
|
| 270 |
-
'GLM-4.6': {
|
| 271 |
-
'DeepSeek-V3.2': {
|
| 272 |
}
|
| 273 |
},
|
| 274 |
-
|
| 275 |
-
// Probing Results Data
|
| 276 |
probing: {
|
| 277 |
byTurn: {
|
| 278 |
-
mimic: {
|
| 279 |
-
'Qwen2.5-32B': {
|
| 280 |
-
'Qwen2.5-72B': {
|
| 281 |
-
'Qwen3-4B': {
|
| 282 |
-
'Qwen3-30B-A3B': {
|
| 283 |
-
'Qwen3-Next-80B-A3B': {
|
| 284 |
-
}
|
| 285 |
-
globem: {
|
| 286 |
-
'Qwen2.5-32B': {
|
| 287 |
-
'Qwen2.5-72B': {
|
| 288 |
-
'Qwen3-4B': {
|
| 289 |
-
'Qwen3-30B-A3B': {
|
| 290 |
-
'Qwen3-Next-80B-A3B': {
|
| 291 |
-
}
|
| 292 |
-
'10k': {
|
| 293 |
-
'Qwen2.5-32B': {
|
| 294 |
-
'Qwen2.5-72B': {
|
| 295 |
-
'Qwen3-4B': {
|
| 296 |
-
'Qwen3-30B-A3B': {
|
| 297 |
-
'Qwen3-Next-80B-A3B': {
|
| 298 |
}
|
| 299 |
},
|
| 300 |
byProgress: {
|
| 301 |
-
mimic: {
|
| 302 |
-
'Qwen2.5-32B': {
|
| 303 |
-
'Qwen2.5-72B': {
|
| 304 |
-
'Qwen3-4B': {
|
| 305 |
-
'Qwen3-30B-A3B': {
|
| 306 |
-
'Qwen3-Next-80B-A3B': {
|
| 307 |
-
}
|
| 308 |
-
globem: {
|
| 309 |
-
'Qwen2.5-32B': {
|
| 310 |
-
'Qwen2.5-72B': {
|
| 311 |
-
'Qwen3-4B': {
|
| 312 |
-
'Qwen3-30B-A3B': {
|
| 313 |
-
'Qwen3-Next-80B-A3B': {
|
| 314 |
-
}
|
| 315 |
-
'10k': {
|
| 316 |
-
'Qwen2.5-32B': {
|
| 317 |
-
'Qwen2.5-72B': {
|
| 318 |
-
'Qwen3-4B': {
|
| 319 |
-
'Qwen3-30B-A3B': {
|
| 320 |
-
'Qwen3-Next-80B-A3B': {
|
| 321 |
}
|
| 322 |
}
|
| 323 |
},
|
| 324 |
-
|
| 325 |
-
// Probing model colors
|
| 326 |
probingColors: {
|
| 327 |
'Qwen2.5-32B': '#4A90D9',
|
| 328 |
'Qwen2.5-72B': '#1A5FB4',
|
|
|
|
| 1 |
+
// DDR-Bench Visualization Data - Auto-generated from original data sources
|
| 2 |
+
// Generated from Python analysis scripts
|
| 3 |
|
| 4 |
const DDR_DATA = {
|
|
|
|
| 5 |
modelColors: {
|
| 6 |
'GPT-5.2': '#00C853',
|
| 7 |
'Claude-4.5-Sonnet': '#FF6D00',
|
|
|
|
| 11 |
'Qwen3-Next-80B-A3B': '#FFC107',
|
| 12 |
'Kimi-K2': '#FFA500',
|
| 13 |
'MiniMax-M2': '#20B2AA',
|
|
|
|
| 14 |
'Qwen2.5-32B': '#4A90D9',
|
| 15 |
'Qwen2.5-72B': '#1A5FB4',
|
| 16 |
'Qwen3-4B': '#57E389',
|
| 17 |
'Qwen3-30B-A3B': '#26A269',
|
| 18 |
},
|
|
|
|
|
|
|
| 19 |
scaling: {
|
| 20 |
+
'mimic': {
|
| 21 |
'GPT-5.2': {
|
| 22 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 23 |
+
tokens: [51, 1475, 1796, 2543, 3737, 4926, 5784, 6681, 7562, 8577, 10444, 11611, 12837, 14128, 15459, 16839, 17760, 18642, 19455, 20193],
|
| 24 |
+
costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.005, 0.0072, 0.01, 0.0131, 0.0167, 0.0207, 0.0257, 0.031, 0.0371, 0.0439, 0.0516, 0.0595, 0.068, 0.0772, 0.086, 0.0947],
|
| 25 |
+
accuracy: [2.02, 3.99, 5.9, 7.75, 9.55, 11.29, 12.97, 14.59, 16.14, 17.62, 19.03, 20.36, 21.62, 22.78, 23.85, 24.82, 25.68, 26.4, 26.96, 27.26]
|
| 26 |
+
}
|
| 27 |
+
,'Claude-4.5-Sonnet': {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 29 |
+
tokens: [33, 1527, 1714, 3192, 4513, 5965, 6664, 7386, 8417, 9214, 9822, 10619, 11532, 12516, 13378, 14190, 15000, 15722, 16457, 17217],
|
| 30 |
+
costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.03, 0.0386, 0.0484, 0.059, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.141, 0.158, 0.1758, 0.1944, 0.2138],
|
| 31 |
+
accuracy: [2.55, 5.02, 7.44, 9.78, 12.05, 14.24, 16.36, 18.4, 20.35, 22.22, 23.99, 25.68, 27.25, 28.72, 30.07, 31.3, 32.37, 33.28, 33.99, 34.37]
|
| 32 |
+
}
|
| 33 |
+
,'Gemini-3-Flash': {
|
| 34 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 35 |
+
tokens: [457, 2153, 2605, 4331, 5580, 7502, 8911, 10725, 12697, 14305, 16480, 18695, 20559, 22036, 23357, 24415, 25207, 25977, 26542, 26964],
|
| 36 |
+
costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.002, 0.003, 0.004, 0.0052, 0.0066, 0.008, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173, 0.0196, 0.0219, 0.024, 0.0263, 0.0284],
|
| 37 |
+
accuracy: [1.85, 3.65, 5.4, 7.09, 8.74, 10.33, 11.87, 13.35, 14.77, 16.12, 17.41, 18.63, 19.78, 20.84, 21.82, 22.71, 23.49, 24.15, 24.66, 24.94]
|
| 38 |
+
}
|
| 39 |
+
,'GLM-4.6': {
|
| 40 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 41 |
+
tokens: [59, 1528, 1774, 2778, 3488, 4210, 4664, 5337, 6158, 7059, 7996, 8765, 9344, 9928, 10542, 11095, 11598, 12149, 12657, 13099],
|
| 42 |
+
costs: [0.0001, 0.0006, 0.001, 0.0017, 0.0026, 0.0037, 0.0049, 0.0063, 0.0079, 0.0097, 0.0118, 0.014, 0.0164, 0.019, 0.0217, 0.0245, 0.0275, 0.0306, 0.0337, 0.0369],
|
| 43 |
+
accuracy: [1.72, 3.4, 5.03, 6.62, 8.15, 9.64, 11.07, 12.45, 13.77, 15.04, 16.24, 17.38, 18.44, 19.44, 20.35, 21.18, 21.91, 22.52, 23.0, 23.26]
|
| 44 |
}
|
| 45 |
+
}
|
| 46 |
+
,'10k': {
|
| 47 |
'GPT-5.2': {
|
| 48 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 49 |
+
tokens: [56, 318, 1162, 1828, 2823, 3790, 4901, 5967, 6858, 7902, 8585, 9384, 10024, 10939, 11581, 12226, 12917, 13514, 14106, 14651],
|
| 50 |
+
costs: [0.0005, 0.0007, 0.0013, 0.0021, 0.0037, 0.0057, 0.0081, 0.0113, 0.015, 0.0199, 0.0243, 0.0298, 0.0343, 0.0398, 0.0454, 0.0521, 0.0575, 0.0631, 0.0713, 0.0774],
|
| 51 |
+
accuracy: [3.33, 6.58, 9.73, 12.8, 15.77, 18.64, 21.41, 24.08, 26.64, 29.08, 31.41, 33.61, 35.67, 37.6, 39.37, 40.97, 42.38, 43.57, 44.49, 44.99]
|
| 52 |
+
}
|
| 53 |
+
,'Claude-4.5-Sonnet': {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 55 |
+
tokens: [40, 361, 1119, 1794, 2561, 3410, 4413, 5431, 6339, 7208, 7983, 8720, 9502, 10235, 10978, 11679, 12286, 12899, 13469, 14050],
|
| 56 |
+
costs: [0.0005, 0.0017, 0.0034, 0.006, 0.0094, 0.0138, 0.0192, 0.0256, 0.0331, 0.0414, 0.0506, 0.0606, 0.0714, 0.083, 0.0955, 0.1087, 0.1226, 0.1371, 0.1523, 0.1682],
|
| 57 |
+
accuracy: [5.72, 11.3, 16.72, 21.98, 27.08, 32.02, 36.78, 41.36, 45.75, 49.95, 53.94, 57.72, 61.27, 64.57, 67.61, 70.36, 72.78, 74.83, 76.41, 77.27]
|
| 58 |
+
}
|
| 59 |
+
,'Gemini-3-Flash': {
|
| 60 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 61 |
+
tokens: [561, 1108, 2384, 3420, 4473, 5692, 7504, 9142, 10958, 12616, 14312, 15667, 16667, 17523, 18404, 19118, 19469, 19722, 19908, 20077],
|
| 62 |
+
costs: [0.0001, 0.0004, 0.0008, 0.0013, 0.002, 0.0028, 0.004, 0.0052, 0.0066, 0.008, 0.0098, 0.0111, 0.013, 0.0149, 0.0171, 0.0192, 0.0224, 0.0251, 0.0246, 0.0275],
|
| 63 |
+
accuracy: [3.29, 6.49, 9.61, 12.63, 15.56, 18.4, 21.14, 23.77, 26.3, 28.71, 31.0, 33.18, 35.21, 37.11, 38.86, 40.44, 41.83, 43.01, 43.91, 44.41]
|
| 64 |
+
}
|
| 65 |
+
,'GLM-4.6': {
|
| 66 |
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 67 |
+
tokens: [58, 339, 973, 1327, 1838, 2223, 2604, 3020, 3477, 3927, 4339, 4764, 5206, 5662, 6056, 6495, 6894, 7329, 7709, 8124],
|
| 68 |
+
costs: [0.0001, 0.0003, 0.0005, 0.0009, 0.0013, 0.0019, 0.0026, 0.0034, 0.0042, 0.0053, 0.0064, 0.0076, 0.0089, 0.0104, 0.012, 0.0136, 0.0154, 0.0173, 0.0193, 0.0214],
|
| 69 |
+
accuracy: [4.47, 8.83, 13.07, 17.19, 21.18, 25.03, 28.76, 32.34, 35.78, 39.06, 42.18, 45.13, 47.91, 50.49, 52.87, 55.02, 56.91, 58.51, 59.74, 60.42]
|
| 70 |
}
|
| 71 |
+
}
|
| 72 |
+
,'globem': {
|
| 73 |
'GPT-5.2': {
|
| 74 |
+
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 75 |
+
tokens: [58, 875, 1559, 2562, 3477, 4756, 6053, 7393, 8608, 10218, 11988, 13748, 15107, 16631, 17672, 18592, 19144, 19498, 19696, 19878],
|
| 76 |
+
costs: [0.0005, 0.0013, 0.002, 0.0032, 0.0048, 0.007, 0.0098, 0.0135, 0.0178, 0.0236, 0.0294, 0.0385, 0.0468, 0.0562, 0.0652, 0.0767, 0.0879, 0.1002, 0.1082, 0.1238],
|
| 77 |
+
accuracy: [2.84, 5.61, 8.31, 10.92, 13.45, 15.91, 18.27, 20.55, 22.73, 24.82, 26.8, 28.68, 30.44, 32.08, 33.59, 34.96, 36.16, 37.18, 37.96, 38.39]
|
| 78 |
+
}
|
| 79 |
+
,'Claude-4.5-Sonnet': {
|
| 80 |
+
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 81 |
+
tokens: [54, 930, 2128, 3337, 4579, 5649, 6915, 8193, 9731, 11405, 13210, 15065, 17143, 19238, 21188, 23277, 25394, 27614, 30130, 32526],
|
| 82 |
+
costs: [0.0008, 0.0032, 0.006, 0.0099, 0.0152, 0.0216, 0.0296, 0.0393, 0.0507, 0.0638, 0.0789, 0.096, 0.1155, 0.1372, 0.1611, 0.1873, 0.2158, 0.247, 0.2805, 0.3124],
|
| 83 |
+
accuracy: [2.98, 5.88, 8.7, 11.44, 14.1, 16.67, 19.15, 21.53, 23.82, 26.01, 28.09, 30.05, 31.9, 33.62, 35.2, 36.63, 37.89, 38.96, 39.78, 40.23]
|
| 84 |
+
}
|
| 85 |
+
,'Gemini-3-Flash': {
|
| 86 |
+
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 87 |
+
tokens: [549, 1839, 3441, 4928, 6260, 8046, 9776, 11341, 13250, 14825, 16374, 18786, 20565, 24046, 25972, 28004, 30001, 31784, 33556, 35526],
|
| 88 |
+
costs: [0.0002, 0.0005, 0.0009, 0.0015, 0.0021, 0.0029, 0.0038, 0.0049, 0.0061, 0.0074, 0.0089, 0.0105, 0.0123, 0.0144, 0.0166, 0.019, 0.0213, 0.0235, 0.0263, 0.0292],
|
| 89 |
+
accuracy: [2.61, 5.16, 7.63, 10.04, 12.37, 14.62, 16.8, 18.89, 20.9, 22.81, 24.64, 26.36, 27.98, 29.49, 30.88, 32.13, 33.24, 34.17, 34.9, 35.29]
|
| 90 |
+
}
|
| 91 |
+
,'GLM-4.6': {
|
| 92 |
+
turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
|
| 93 |
+
tokens: [58, 903, 1849, 2854, 3851, 4830, 5779, 6760, 7791, 8817, 10040, 11362, 12855, 14434, 16221, 18101, 20062, 22187, 24211, 26186],
|
| 94 |
+
costs: [0.0001, 0.0005, 0.001, 0.0017, 0.0027, 0.004, 0.0055, 0.0072, 0.0092, 0.0115, 0.0141, 0.017, 0.0203, 0.0238, 0.028, 0.0325, 0.0372, 0.0423, 0.0482, 0.0544],
|
| 95 |
+
accuracy: [3.08, 6.08, 9.0, 11.84, 14.58, 17.24, 19.8, 22.27, 24.64, 26.9, 29.05, 31.08, 32.99, 34.77, 36.41, 37.89, 39.19, 40.29, 41.14, 41.61]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
}
|
| 97 |
}
|
| 98 |
},
|
|
|
|
|
|
|
| 99 |
ranking: {
|
| 100 |
+
'MIMIC': [
|
| 101 |
+
{model: 'gpt5-mini', bt_rank: 1, win_rate: 100.0, accuracy: 27.59, acc_rank: 7, is_proprietary: true}
|
| 102 |
+
,{model: 'claude4.5-sonnet', bt_rank: 2, win_rate: 94.6, accuracy: 33.66, acc_rank: 1, is_proprietary: true}
|
| 103 |
+
,{model: 'gpt5mini', bt_rank: 3, win_rate: 87.8, accuracy: 27.59, acc_rank: 8, is_proprietary: true}
|
| 104 |
+
,{model: 'gpt5.2', bt_rank: 4, win_rate: 83.6, accuracy: 28.88, acc_rank: 5, is_proprietary: true}
|
| 105 |
+
,{model: 'gpt5.1', bt_rank: 5, win_rate: 80.6, accuracy: 30.1, acc_rank: 3, is_proprietary: true}
|
| 106 |
+
,{model: 'gemini3-flash', bt_rank: 6, win_rate: 76.5, accuracy: 29.28, acc_rank: 4, is_proprietary: true}
|
| 107 |
+
,{model: 'kimi-k2', bt_rank: 7, win_rate: 73.1, accuracy: 30.17, acc_rank: 2, is_proprietary: false}
|
| 108 |
+
,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 8, win_rate: 70.5, accuracy: 27.65, acc_rank: 6, is_proprietary: false}
|
| 109 |
+
,{model: 'gemini2.5-pro', bt_rank: 9, win_rate: 63.9, accuracy: 19.0, acc_rank: 14, is_proprietary: true}
|
| 110 |
+
,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 10, win_rate: 59.5, accuracy: 18.8, acc_rank: 15, is_proprietary: false}
|
| 111 |
+
,{model: 'minimax-m2', bt_rank: 11, win_rate: 59.7, accuracy: 23.52, acc_rank: 10, is_proprietary: false}
|
| 112 |
+
,{model: 'glm4.6', bt_rank: 12, win_rate: 52.1, accuracy: 23.84, acc_rank: 9, is_proprietary: false}
|
| 113 |
+
,{model: 'qwen3', bt_rank: 13, win_rate: 51.7, accuracy: 19.13, acc_rank: 13, is_proprietary: false}
|
| 114 |
+
,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 14, win_rate: 40.3, accuracy: 20, acc_rank: 11, is_proprietary: false}
|
| 115 |
+
,{model: 'gemini2.5-flash-lite', bt_rank: 15, win_rate: 35.4, accuracy: 16.64, acc_rank: 18, is_proprietary: true}
|
| 116 |
+
,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 32.4, accuracy: 14.15, acc_rank: 20, is_proprietary: false}
|
| 117 |
+
,{model: 'qwen2.5-32b-instruct', bt_rank: 17, win_rate: 32.3, accuracy: 13.12, acc_rank: 21, is_proprietary: false}
|
| 118 |
+
,{model: 'gemini2.5-flash', bt_rank: 18, win_rate: 31.2, accuracy: 18.61, acc_rank: 16, is_proprietary: true}
|
| 119 |
+
,{model: 'qwen2.5-72B-Instruct', bt_rank: 19, win_rate: 29.5, accuracy: 14.92, acc_rank: 19, is_proprietary: false}
|
| 120 |
+
,{model: 'qwen3-4B-Instruct-2507', bt_rank: 20, win_rate: 27.3, accuracy: 16.93, acc_rank: 17, is_proprietary: false}
|
| 121 |
+
,{model: 'qwen2.5-7B-Instruct-1M', bt_rank: 21, win_rate: 17.3, accuracy: 20, acc_rank: 12, is_proprietary: false}
|
| 122 |
+
]
|
| 123 |
+
,'10K': [
|
| 124 |
+
{model: 'claude4.5-sonnet', bt_rank: 1, win_rate: 92.8, accuracy: 69.26, acc_rank: 1, is_proprietary: true}
|
| 125 |
+
,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 2, win_rate: 80.6, accuracy: 49.41, acc_rank: 2, is_proprietary: false}
|
| 126 |
+
,{model: 'gpt5mini', bt_rank: 3, win_rate: 80.4, accuracy: 41.56, acc_rank: 5, is_proprietary: true}
|
| 127 |
+
,{model: 'gpt5.2', bt_rank: 4, win_rate: 78.0, accuracy: 43.11, acc_rank: 4, is_proprietary: true}
|
| 128 |
+
,{model: 'kimi-k2', bt_rank: 5, win_rate: 77.0, accuracy: 41.17, acc_rank: 7, is_proprietary: false}
|
| 129 |
+
,{model: 'glm4.6', bt_rank: 6, win_rate: 71.4, accuracy: 48.29, acc_rank: 3, is_proprietary: false}
|
| 130 |
+
,{model: 'gemini3-flash', bt_rank: 7, win_rate: 63.6, accuracy: 39.5, acc_rank: 8, is_proprietary: true}
|
| 131 |
+
,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 8, win_rate: 59.2, accuracy: 38.34, acc_rank: 9, is_proprietary: false}
|
| 132 |
+
,{model: 'minimax-m2', bt_rank: 9, win_rate: 54.4, accuracy: 35.74, acc_rank: 10, is_proprietary: false}
|
| 133 |
+
,{model: 'gpt5.1', bt_rank: 10, win_rate: 54.0, accuracy: 41.23, acc_rank: 6, is_proprietary: true}
|
| 134 |
+
,{model: 'qwen3', bt_rank: 11, win_rate: 51.0, accuracy: 28.23, acc_rank: 12, is_proprietary: false}
|
| 135 |
+
,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 12, win_rate: 45.6, accuracy: 20, acc_rank: 15, is_proprietary: false}
|
| 136 |
+
,{model: 'gemini2.5-pro', bt_rank: 13, win_rate: 44.8, accuracy: 20.91, acc_rank: 13, is_proprietary: true}
|
| 137 |
+
,{model: 'qwen2.5-32b-instruct', bt_rank: 14, win_rate: 41.2, accuracy: 17.83, acc_rank: 17, is_proprietary: false}
|
| 138 |
+
,{model: 'qwen2.5-72B-Instruct', bt_rank: 15, win_rate: 34.6, accuracy: 20.79, acc_rank: 14, is_proprietary: false}
|
| 139 |
+
,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 31.6, accuracy: 14.65, acc_rank: 18, is_proprietary: false}
|
| 140 |
+
,{model: 'qwen3-4B-Instruct-2507', bt_rank: 17, win_rate: 30.0, accuracy: 30.43, acc_rank: 11, is_proprietary: false}
|
| 141 |
+
,{model: 'gemini2.5-flash-lite', bt_rank: 18, win_rate: 29.6, accuracy: 14.37, acc_rank: 19, is_proprietary: true}
|
| 142 |
+
,{model: 'qwen2.5-7B-Instruct-1M', bt_rank: 19, win_rate: 27.4, accuracy: 20, acc_rank: 16, is_proprietary: false}
|
| 143 |
+
,{model: 'gemini2.5-flash', bt_rank: 20, win_rate: 25.2, accuracy: 12.61, acc_rank: 20, is_proprietary: true}
|
| 144 |
+
,{model: 'qwen2.5-7B-Instruct', bt_rank: 21, win_rate: 22.0, accuracy: 7.53, acc_rank: 21, is_proprietary: false}
|
| 145 |
+
]
|
| 146 |
+
,'GLOBEM': [
|
| 147 |
+
{model: 'claude4.5-sonnet', bt_rank: 1, win_rate: 93.0, accuracy: 39.54, acc_rank: 2, is_proprietary: true}
|
| 148 |
+
,{model: 'gpt5-mini', bt_rank: 2, win_rate: 60.0, accuracy: 33.91, acc_rank: 12, is_proprietary: true}
|
| 149 |
+
,{model: 'gemini3-flash', bt_rank: 3, win_rate: 81.2, accuracy: 35.46, acc_rank: 9, is_proprietary: true}
|
| 150 |
+
,{model: 'minimax-m2', bt_rank: 4, win_rate: 77.8, accuracy: 36.9, acc_rank: 6, is_proprietary: false}
|
| 151 |
+
,{model: 'gpt5mini', bt_rank: 5, win_rate: 73.8, accuracy: 33.91, acc_rank: 13, is_proprietary: true}
|
| 152 |
+
,{model: 'gpt5.1', bt_rank: 6, win_rate: 67.5, accuracy: 36.76, acc_rank: 7, is_proprietary: true}
|
| 153 |
+
,{model: 'gpt5.2', bt_rank: 7, win_rate: 64.4, accuracy: 38.39, acc_rank: 3, is_proprietary: true}
|
| 154 |
+
,{model: 'qwen3', bt_rank: 8, win_rate: 64.7, accuracy: 36.32, acc_rank: 8, is_proprietary: false}
|
| 155 |
+
,{model: 'run_api_deepseek_deepseek-chat', bt_rank: 9, win_rate: 64.5, accuracy: 38.39, acc_rank: 4, is_proprietary: false}
|
| 156 |
+
,{model: 'glm4.6', bt_rank: 10, win_rate: 53.6, accuracy: 39.77, acc_rank: 1, is_proprietary: false}
|
| 157 |
+
,{model: 'kimi-k2', bt_rank: 11, win_rate: 52.2, accuracy: 37.01, acc_rank: 5, is_proprietary: false}
|
| 158 |
+
,{model: 'gemini2.5-pro', bt_rank: 12, win_rate: 45.6, accuracy: 34.6, acc_rank: 10, is_proprietary: true}
|
| 159 |
+
,{model: 'qwen2.5-72B-Instruct', bt_rank: 13, win_rate: 43.3, accuracy: 27.13, acc_rank: 14, is_proprietary: false}
|
| 160 |
+
,{model: 'qwen2.5-32B-Instruct', bt_rank: 14, win_rate: 42.1, accuracy: 20, acc_rank: 20, is_proprietary: false}
|
| 161 |
+
,{model: 'qwen3-next-80b-a3b-instruct', bt_rank: 15, win_rate: 41.5, accuracy: 34.14, acc_rank: 11, is_proprietary: false}
|
| 162 |
+
,{model: 'qwen2.5-14B-Instruct', bt_rank: 16, win_rate: 40.8, accuracy: 26.13, acc_rank: 16, is_proprietary: false}
|
| 163 |
+
,{model: 'gemini2.5-flash-lite', bt_rank: 17, win_rate: 37.4, accuracy: 25.52, acc_rank: 18, is_proprietary: true}
|
| 164 |
+
,{model: 'qwen3-4B-Instruct-2507', bt_rank: 18, win_rate: 36.6, accuracy: 26.9, acc_rank: 15, is_proprietary: false}
|
| 165 |
+
,{model: 'qwen2.5-14B-Instruct-1M', bt_rank: 19, win_rate: 32.0, accuracy: 20, acc_rank: 21, is_proprietary: false}
|
| 166 |
+
,{model: 'llama3.3-70B', bt_rank: 20, win_rate: 28.1, accuracy: 22.65, acc_rank: 19, is_proprietary: false}
|
| 167 |
+
,{model: 'qwen2.5-7B-Instruct', bt_rank: 21, win_rate: 22.2, accuracy: 25.64, acc_rank: 17, is_proprietary: false}
|
|
|
|
|
|
|
|
|
|
| 168 |
]
|
| 169 |
},
|
|
|
|
|
|
|
| 170 |
turn: {
|
| 171 |
+
'mimic': [
|
| 172 |
+
{model: 'Claude4.5-Sonnet', median: 52, distribution: [0.0, 0.0, 1.0, 5.0, 31.0, 43.0, 13.0, 7.0, 0.0, 0.0]}
|
| 173 |
+
,{model: 'GPT5-mini', median: 39, distribution: [0.0, 0.0, 9.0, 42.0, 36.0, 12.0, 1.0, 0.0, 0.0, 0.0]}
|
| 174 |
+
,{model: 'GLM4.6', median: 39, distribution: [0.0, 6.3, 23.4, 20.7, 7.2, 13.5, 3.6, 6.3, 4.5, 14.4]}
|
| 175 |
+
,{model: 'DeepSeekV3.2', median: 33, distribution: [0.0, 2.0, 22.0, 60.0, 16.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 176 |
+
,{model: 'GPT5.2', median: 30, distribution: [0.0, 10.0, 36.0, 32.0, 12.0, 10.0, 0.0, 0.0, 0.0, 0.0]}
|
| 177 |
+
,{model: 'GPT5.1', median: 23, distribution: [1.5, 39.7, 29.4, 19.9, 9.6, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 178 |
+
,{model: 'Kimi-K2', median: 19, distribution: [0.0, 55.0, 44.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 179 |
+
,{model: 'MiniMax-M2', median: 18, distribution: [0.0, 70.0, 30.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 180 |
+
,{model: 'Qwen3-Next-80B-A3B', median: 17, distribution: [12.0, 52.0, 24.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 181 |
+
,{model: 'Qwen3-30B-A3B', median: 17, distribution: [12.0, 52.0, 24.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 182 |
+
,{model: 'Gemini3-Flash', median: 15, distribution: [7.0, 71.0, 22.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 183 |
+
,{model: 'Gemini2.5-Pro', median: 15, distribution: [10.6, 70.2, 19.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 184 |
+
,{model: 'Qwen2.5-72B', median: 11, distribution: [15.0, 85.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 185 |
+
,{model: 'Llama3.3-70B', median: 6, distribution: [99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 186 |
+
]
|
| 187 |
+
,'10k': [
|
| 188 |
+
{model: 'Claude4.5-Sonnet', median: 56, distribution: [0.0, 0.0, 1.0, 6.0, 13.0, 44.0, 27.0, 6.0, 3.0, 0.0]}
|
| 189 |
+
,{model: 'GLM4.6', median: 52, distribution: [0.0, 0.0, 3.8, 10.4, 27.4, 27.4, 18.9, 5.7, 4.7, 1.9]}
|
| 190 |
+
,{model: 'DeepSeekV3.2', median: 39, distribution: [0.0, 0.0, 11.0, 40.0, 37.0, 9.0, 3.0, 0.0, 0.0, 0.0]}
|
| 191 |
+
,{model: 'Kimi-K2', median: 24, distribution: [0.0, 29.0, 48.0, 21.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 192 |
+
,{model: 'GPT5.2', median: 20, distribution: [0.0, 43.0, 41.0, 12.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0]}
|
| 193 |
+
,{model: 'MiniMax-M2', median: 20, distribution: [0.0, 43.0, 48.0, 9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 194 |
+
,{model: 'GPT5.1', median: 17, distribution: [1.0, 69.0, 29.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 195 |
+
,{model: 'Gemini2.5-Pro', median: 15, distribution: [7.0, 73.0, 18.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 196 |
+
,{model: 'Gemini3-Flash', median: 13, distribution: [10.0, 82.0, 7.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 197 |
+
,{model: 'Qwen3-Next-80B-A3B', median: 8, distribution: [81.0, 19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 198 |
+
,{model: 'Qwen3-30B-A3B', median: 8, distribution: [81.0, 19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 199 |
+
,{model: 'Qwen2.5-72B', median: 7, distribution: [75.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 200 |
+
,{model: 'Llama3.3-70B', median: 1, distribution: [92.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 201 |
+
]
|
| 202 |
+
,'globem': [
|
| 203 |
+
{model: 'Claude4.5-Sonnet', median: 25, distribution: [0.0, 6.0, 87.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 204 |
+
,{model: 'Gemini3-Flash', median: 21, distribution: [2.0, 36.0, 58.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 205 |
+
,{model: 'GLM4.6', median: 21, distribution: [0.0, 23.0, 66.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 206 |
+
,{model: 'DeepSeekV3.2', median: 20, distribution: [0.0, 32.0, 68.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 207 |
+
,{model: 'GPT5-mini', median: 17, distribution: [2.0, 78.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 208 |
+
,{model: 'Kimi-K2', median: 17, distribution: [0.0, 82.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 209 |
+
,{model: 'MiniMax-M2', median: 17, distribution: [0.0, 80.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 210 |
+
,{model: 'GPT5.2', median: 15, distribution: [0.0, 92.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 211 |
+
,{model: 'Qwen2.5-72B', median: 14, distribution: [4.0, 78.0, 17.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 212 |
+
,{model: 'Gemini2.5-Pro', median: 12, distribution: [3.0, 94.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 213 |
+
,{model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0.0, 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 214 |
+
,{model: 'Qwen3-30B-A3B', median: 12, distribution: [0.0, 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 215 |
+
,{model: 'GPT5.1', median: 11, distribution: [30.0, 70.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
| 216 |
+
,{model: 'Llama3.3-70B', median: 6, distribution: [98.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
|
|
|
| 217 |
]
|
| 218 |
},
|
|
|
|
|
|
|
| 219 |
entropy: {
|
| 220 |
+
'mimic': {
|
| 221 |
+
'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]}
|
| 222 |
+
,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]}
|
| 223 |
+
,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]}
|
| 224 |
+
,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]}
|
| 225 |
+
,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]}
|
| 226 |
+
}
|
| 227 |
+
,'10k': {
|
| 228 |
+
'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]}
|
| 229 |
+
,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]}
|
| 230 |
+
,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]}
|
| 231 |
+
,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]}
|
| 232 |
+
,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]}
|
| 233 |
+
}
|
| 234 |
+
,'globem': {
|
| 235 |
+
'GPT-5.2': {entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88], coverage: [0.08, 0.1, 0.09, 0.07, 0.09, 0.11], accuracy: [30, 35, 40, 25, 32, 45]}
|
| 236 |
+
,'Claude-4.5-Sonnet': {entropy: [0.85, 0.88, 0.92, 0.8, 0.87, 0.78], coverage: [0.12, 0.14, 0.13, 0.1, 0.13, 0.09], accuracy: [45, 50, 55, 40, 48, 35]}
|
| 237 |
+
,'Gemini-3-Flash': {entropy: [0.7, 0.75, 0.68, 0.72, 0.8, 0.65], coverage: [0.06, 0.09, 0.07, 0.08, 0.1, 0.05], accuracy: [28, 32, 25, 30, 38, 22]}
|
| 238 |
+
,'GLM-4.6': {entropy: [0.78, 0.82, 0.75, 0.8, 0.88, 0.72], coverage: [0.09, 0.11, 0.08, 0.1, 0.13, 0.07], accuracy: [32, 40, 28, 35, 45, 25]}
|
| 239 |
+
,'DeepSeek-V3.2': {entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.9], coverage: [0.1, 0.12, 0.09, 0.14, 0.08, 0.15], accuracy: [38, 42, 32, 48, 28, 52]}
|
| 240 |
}
|
| 241 |
},
|
|
|
|
|
|
|
| 242 |
probing: {
|
| 243 |
byTurn: {
|
| 244 |
+
'mimic': {
|
| 245 |
+
'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.3, -4.21, -4.04, -3.87, -3.59, -3.62, -3.33, -3.4, -2.93, -3.21], sem: [0.25, 0.27, 0.32, 0.35, 0.35, 0.36, 0.34, 0.35, 0.32, 0.4]}
|
| 246 |
+
,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.51, -3.98, -3.68, -3.8, -3.26, -3.22, -3.12, -3.24, -3.08, -2.84], sem: [0.15, 0.21, 0.21, 0.23, 0.23, 0.21, 0.25, 0.25, 0.28, 0.08]}
|
| 247 |
+
,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.48, -3.25, -3.3, -2.74, -2.75, -2.73, -2.72, -2.67, -2.62, -2.25], sem: [0.04, 0.05, 0.04, 0.07, 0.06, 0.07, 0.07, 0.07, 0.06, 0.06]}
|
| 248 |
+
,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.94, -5.21, -5.51, -5.05, -4.96, -4.95, -4.75, -4.73, -4.6, -4.72], sem: [0.15, 0.18, 0.2, 0.18, 0.19, 0.19, 0.17, 0.18, 0.16, 0.18]}
|
| 249 |
+
,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-2.85, -2.86, -2.74, -2.65, -2.31, -2.14, -1.98, -2.03, -1.88, -1.82], sem: [0.1, 0.1, 0.11, 0.11, 0.11, 0.13, 0.13, 0.18, 0.17, 0.09]}
|
| 250 |
+
}
|
| 251 |
+
,'globem': {
|
| 252 |
+
'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.48, -5.83, -5.84, -5.91, -6.01, -6.03, -5.86, -5.73, -5.78, -5.73], sem: [0.24, 0.28, 0.31, 0.33, 0.33, 0.35, 0.33, 0.35, 0.35, 0.36]}
|
| 253 |
+
,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.68, -5.56, -5.65, -5.59, -5.59, -5.49, -5.54, -5.4, -5.57, -5.53], sem: [0.13, 0.18, 0.23, 0.23, 0.25, 0.25, 0.29, 0.32, 0.38, 0.46]}
|
| 254 |
+
,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.67, -4.16, -3.9, -3.76, -3.6, -3.47, -3.05, -2.99, -2.93, -2.78], sem: [0.08, 0.07, 0.06, 0.06, 0.07, 0.08, 0.07, 0.08, 0.08, 0.09]}
|
| 255 |
+
,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.28, -5.23, -5.2, -5.19, -5.2, -5.01, -5.21, -4.95, -4.93, -4.81], sem: [0.09, 0.09, 0.09, 0.08, 0.08, 0.08, 0.09, 0.09, 0.1, 0.1]}
|
| 256 |
+
,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46, -1.89], sem: [0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14, 0.1]}
|
| 257 |
+
}
|
| 258 |
+
,'10k': {
|
| 259 |
+
'Qwen2.5-32B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-6.59, -7.15, -6.99, -6.95, -6.82, -6.88, -6.71, -6.58, -6.67, -6.45], sem: [0.26, 0.28, 0.29, 0.3, 0.29, 0.29, 0.29, 0.32, 0.36, 0.41]}
|
| 260 |
+
,'Qwen2.5-72B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-5.51, -7.02, -6.45, -6.11, -5.98, -6.52, -7.02, -7.88, -8.05, -7.66], sem: [0.26, 0.34, 0.34, 0.36, 0.4, 0.53, 0.62, 0.71, 0.81, 0.92]}
|
| 261 |
+
,'Qwen3-4B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-4.68, -4.3, -3.57, -3.33, -3.27, -3.22, -3.06, -2.9, -2.75, -2.57], sem: [0.18, 0.17, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14]}
|
| 262 |
+
,'Qwen3-30B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.23, -3.31, -3.28, -3.16, -3.06, -2.97, -2.94, -2.87, -2.83, -2.73], sem: [0.17, 0.17, 0.17, 0.17, 0.17, 0.16, 0.17, 0.18, 0.18, 0.17]}
|
| 263 |
+
,'Qwen3-Next-80B-A3B': {turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-3.25, -3.42, -3.21, -2.94, -2.81, -2.75, -2.7, -2.65, -2.55, -2.45], sem: [0.16, 0.17, 0.17, 0.17, 0.16, 0.17, 0.16, 0.16, 0.16, 0.16]}
|
| 264 |
}
|
| 265 |
},
|
| 266 |
byProgress: {
|
| 267 |
+
'mimic': {
|
| 268 |
+
'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.3, -4.12, -3.73, -3.62, -3.36, -3.05, -2.94, -3.12, -4.6, -4.42], sem: [0.25, 0.21, 0.25, 0.36, 0.24, 0.25, 0.38, 0.45, 1.5, 0.1]}
|
| 269 |
+
,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.51, -3.98, -3.74, -3.26, -3.17, -3.24, -2.99, -2.53, -2.58, -2.42], sem: [0.15, 0.21, 0.16, 0.23, 0.17, 0.25, 0.18, 0.09, 0.09, 0.2]}
|
| 270 |
+
,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.37, -2.93, -2.71, -2.33, -1.99, -2.04, -1.57, -1.46, -1.48, -1.44], sem: [0.03, 0.04, 0.04, 0.04, 0.05, 0.08, 0.1, 0.05, 0.0, 0.01]}
|
| 271 |
+
,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.13, -4.72, -4.42, -4.17, -4.04, -3.9, -3.64, -3.45, -3.36, -3.17], sem: [0.08, 0.07, 0.07, 0.07, 0.07, 0.08, 0.1, 0.14, 0.15, 0.26]}
|
| 272 |
+
,'Qwen3-Next-80B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-2.85, -2.8, -2.65, -2.22, -1.98, -1.96, -1.79, -1.74, -1.83, -1.85], sem: [0.1, 0.07, 0.11, 0.09, 0.13, 0.12, 0.08, 0.16, 0.15, 0.39]}
|
| 273 |
+
}
|
| 274 |
+
,'globem': {
|
| 275 |
+
'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.66, -5.92, -5.88, -5.79, -5.79, -5.55, -5.47, -4.8, -3.55, -3.24], sem: [0.18, 0.19, 0.2, 0.21, 0.29, 0.29, 0.47, 0.63, 0.19, 0.47]}
|
| 276 |
+
,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.68, -5.56, -5.62, -5.59, -5.51, -5.4, -5.56, -5.03, -5.77, -7.71], sem: [0.13, 0.18, 0.16, 0.25, 0.19, 0.32, 0.29, 0.55, 0.83, 0.1]}
|
| 277 |
+
,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.42, -3.83, -3.38, -2.96, -2.71, -2.6, -2.46, -2.53, -2.63, -2.61], sem: [0.06, 0.04, 0.04, 0.05, 0.07, 0.08, 0.12, 0.14, 0.25, 0.04]}
|
| 278 |
+
,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.26, -5.2, -5.06, -4.82, -4.5, -4.51, -4.37, -4.1, -4.03, -3.74], sem: [0.06, 0.05, 0.05, 0.06, 0.07, 0.08, 0.1, 0.29, 0.25, 0.11]}
|
| 279 |
+
,'Qwen3-Next-80B-A3B': {progress: [10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46], sem: [0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14]}
|
| 280 |
+
}
|
| 281 |
+
,'10k': {
|
| 282 |
+
'Qwen2.5-32B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-6.59, -7.07, -6.89, -6.8, -6.58, -6.58, -6.76, -8.0, -8.59, -8.83], sem: [0.26, 0.2, 0.21, 0.2, 0.32, 0.27, 0.39, 0.57, 0.84, 1.12]}
|
| 283 |
+
,'Qwen2.5-72B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-5.51, -7.02, -6.28, -5.98, -6.52, -7.33, -8.05, -7.85, -8.41, -7.15], sem: [0.26, 0.34, 0.25, 0.4, 0.53, 0.47, 0.81, 0.79, 1.45, 1.26]}
|
| 284 |
+
,'Qwen3-4B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-4.49, -3.45, -3.19, -2.83, -2.5, -2.27, -2.31, -2.31, -2.35, -1.73], sem: [0.12, 0.1, 0.08, 0.1, 0.1, 0.11, 0.2, 0.29, 0.36, 0.03]}
|
| 285 |
+
,'Qwen3-30B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.14, -2.66, -2.29, -2.26, -1.97, -1.88, -1.52, -1.36, -1.61, -1.61], sem: [0.06, 0.06, 0.07, 0.1, 0.14, 0.18, 0.08, 0.02, 0.05, 0.08]}
|
| 286 |
+
,'Qwen3-Next-80B-A3B': {progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-3.34, -2.99, -2.7, -2.5, -2.43, -2.55, -2.18, -2.28, -2.19, -2.5], sem: [0.12, 0.1, 0.1, 0.11, 0.11, 0.15, 0.21, 0.22, 0.26, 0.38]}
|
| 287 |
}
|
| 288 |
}
|
| 289 |
},
|
|
|
|
|
|
|
| 290 |
probingColors: {
|
| 291 |
'Qwen2.5-32B': '#4A90D9',
|
| 292 |
'Qwen2.5-72B': '#1A5FB4',
|