thinkwee commited on
Commit
cf573f9
·
1 Parent(s): d139d2f
Files changed (4) hide show
  1. charts.js +613 -0
  2. data.js +333 -0
  3. index.html +162 -18
  4. styles.css +337 -0
charts.js ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // DDR-Bench Interactive Charts
2
+ // Using Plotly.js for interactive visualizations
3
+
4
// Shared Plotly layout defaults for the dark theme. The chart renderers in
// this file spread this object and then override individual keys (title,
// axes, legend, margin) for each figure.
const darkLayout = (() => {
    const transparentPanel = 'rgba(30, 41, 59, 0)';
    const primaryText = '#e2e8f0';
    const mutedText = '#94a3b8';
    const faintRule = 'rgba(148, 163, 184, 0.15)';
    const strongRule = 'rgba(148, 163, 184, 0.3)';

    // Both axes look the same; build a fresh object for each one so the
    // x and y axis settings never alias each other.
    const buildAxis = () => ({
        gridcolor: faintRule,
        linecolor: strongRule,
        tickfont: { color: mutedText },
        title: { font: { color: primaryText } }
    });

    return {
        paper_bgcolor: transparentPanel,
        plot_bgcolor: transparentPanel,
        font: {
            family: 'Inter, sans-serif',
            color: primaryText
        },
        xaxis: buildAxis(),
        yaxis: buildAxis(),
        legend: {
            bgcolor: 'rgba(30, 41, 59, 0.8)',
            bordercolor: strongRule,
            borderwidth: 1,
            font: { color: primaryText }
        },
        hoverlabel: {
            bgcolor: '#1e293b',
            bordercolor: '#6366f1',
            font: { color: primaryText }
        },
        margin: { t: 40, r: 20, b: 60, l: 70 }
    };
})();
37
+
38
// Plotly config object passed to every Plotly.newPlot call in this file:
// responsive sizing, mode bar shown without the Plotly logo, and the
// lasso/box selection buttons removed.
const plotlyConfig = {
    responsive: true,
    displaylogo: false,
    displayModeBar: true,
    modeBarButtonsToRemove: ['lasso2d', 'select2d']
};
44
+
45
// Tab navigation: clicking a `.nav-tab` marks it active, shows the `.section`
// whose id is stored in the tab's `data-section` attribute, and fires a
// window `resize` event so listeners can re-fit plots to the new layout.
document.querySelectorAll('.nav-tab').forEach((tab) => {
    tab.addEventListener('click', () => {
        // Resolve the target before touching any state: if the section is
        // missing we must not deactivate the currently visible one.
        const sectionId = tab.dataset.section;
        const targetSection = document.getElementById(sectionId);
        if (!targetSection) {
            console.warn(`Tab navigation: no section with id "${sectionId}"`);
            return;
        }

        document.querySelectorAll('.nav-tab').forEach((other) => {
            other.classList.toggle('active', other === tab);
        });
        document.querySelectorAll('.section').forEach((section) => {
            section.classList.toggle('active', section === targetSection);
        });

        // Resize plots on tab change
        window.dispatchEvent(new Event('resize'));
    });
});
61
+
62
+ // ============================================================================
63
+ // SCALING ANALYSIS CHART
64
+ // ============================================================================
65
/**
 * Draw the scaling-analysis line chart: accuracy against turns, tokens or
 * cost, one line per model.
 *
 * Reads the current values of `#scaling-dataset` and `#scaling-dimension`,
 * looks the series up in `DDR_DATA.scaling`, and renders into
 * `#scaling-chart`. Returns without drawing when the dataset has no data or
 * the dimension is not one of `turn`, `token`, `cost`.
 */
function renderScalingChart() {
    // One entry per dimension so the series key, axis title, hover prefix and
    // axis scale cannot drift apart.
    const dimensionSettings = {
        turn: {
            seriesKey: 'turns',
            axisTitle: 'Number of Interaction Turns',
            hoverPrefix: 'Turn: ',
            axisType: 'linear'
        },
        token: {
            seriesKey: 'tokens',
            axisTitle: 'Total Tokens Used',
            hoverPrefix: 'Tokens: ',
            axisType: 'linear'
        },
        cost: {
            seriesKey: 'costs',
            axisTitle: 'Inference Cost ($)',
            hoverPrefix: 'Cost: $',
            axisType: 'log'
        }
    };

    const dataset = document.getElementById('scaling-dataset').value;
    const dimension = document.getElementById('scaling-dimension').value;

    const data = DDR_DATA.scaling[dataset];
    const isKnownDimension = Object.prototype.hasOwnProperty.call(dimensionSettings, dimension);
    if (!data || !isKnownDimension) return;
    const settings = dimensionSettings[dimension];

    const traces = Object.keys(data).map((model) => {
        const modelData = data[model];
        const color = DDR_DATA.modelColors[model] || '#888';

        return {
            x: modelData[settings.seriesKey],
            y: modelData.accuracy,
            mode: 'lines+markers',
            name: model,
            line: { color: color, width: 2.5 },
            marker: { size: 6, color: color },
            hovertemplate: `<b>${model}</b><br>` +
                `${settings.hoverPrefix}%{x}<br>` +
                `Accuracy: %{y:.1f}%<extra></extra>`
        };
    });

    const dimensionTitle = dimension.charAt(0).toUpperCase() + dimension.slice(1);

    const layout = {
        ...darkLayout,
        title: {
            text: `${dataset.toUpperCase()} - ${dimensionTitle} Scaling`,
            font: { size: 18, color: '#f1f5f9' }
        },
        xaxis: {
            ...darkLayout.xaxis,
            title: {
                text: settings.axisTitle,
                font: { size: 14, color: '#e2e8f0' }
            },
            type: settings.axisType
        },
        yaxis: {
            ...darkLayout.yaxis,
            title: { text: 'Accuracy (%)', font: { size: 14, color: '#e2e8f0' } }
        },
        showlegend: true,
        legend: {
            ...darkLayout.legend,
            orientation: 'h',
            y: -0.2,
            x: 0.5,
            xanchor: 'center'
        }
    };

    Plotly.newPlot('scaling-chart', traces, layout, plotlyConfig);
}

// Event listeners for scaling controls: redraw when either select changes.
document.getElementById('scaling-dataset').addEventListener('change', renderScalingChart);
document.getElementById('scaling-dimension').addEventListener('change', renderScalingChart);
148
+
149
+ // ============================================================================
150
+ // ENTROPY ANALYSIS CHART
151
+ // ============================================================================
152
/**
 * Draw the entropy-vs-coverage scatter chart, one marker series per model,
 * with marker size and opacity both driven by accuracy.
 *
 * Reads the current value of `#entropy-dataset`, looks the data up in
 * `DDR_DATA.entropy`, and renders into `#entropy-chart`. Returns without
 * drawing when the dataset has no data.
 */
function renderEntropyChart() {
    const dataset = document.getElementById('entropy-dataset').value;
    const data = DDR_DATA.entropy[dataset];
    if (!data) return;

    const traces = Object.keys(data).map((model) => {
        const modelData = data[model];
        const accuracy = modelData.accuracy;

        // Computed once per model rather than once per point.
        const maxAcc = Math.max(...accuracy);
        const minAcc = Math.min(...accuracy);

        // Marker size scales with accuracy relative to the model's best
        // point (8-23 range). A non-positive maximum would divide by zero or
        // flip the sign, so fall back to the minimum size in that case.
        const sizes = accuracy.map((a) => 8 + (maxAcc > 0 ? a / maxAcc : 0) * 15);

        // Opacity is min-max normalised accuracy (0.4-1.0 range); `|| 1`
        // avoids dividing by zero when every accuracy value is equal.
        const accSpan = maxAcc - minAcc || 1;
        const opacities = accuracy.map((a) => 0.4 + 0.6 * (a - minAcc) / accSpan);

        return {
            x: modelData.entropy,
            y: modelData.coverage,
            mode: 'markers',
            name: model,
            marker: {
                size: sizes,
                color: DDR_DATA.modelColors[model] || '#888',
                opacity: opacities,
                line: {
                    color: '#000',
                    width: 0.5
                }
            },
            text: accuracy.map((a) => `Accuracy: ${a}%`),
            hovertemplate: `<b>${model}</b><br>` +
                `Entropy: %{x:.2f}<br>` +
                `Coverage: %{y:.2f}<br>` +
                `%{text}<extra></extra>`
        };
    });

    const layout = {
        ...darkLayout,
        title: {
            text: `${dataset.toUpperCase()} - Entropy vs Coverage (Marker Size/Opacity = Accuracy)`,
            font: { size: 18, color: '#f1f5f9' }
        },
        xaxis: {
            ...darkLayout.xaxis,
            title: { text: 'Normalized Access Entropy', font: { size: 14, color: '#e2e8f0' } },
            range: [0.6, 1.0]
        },
        yaxis: {
            ...darkLayout.yaxis,
            title: { text: 'Coverage', font: { size: 14, color: '#e2e8f0' } }
        },
        showlegend: true,
        legend: {
            ...darkLayout.legend,
            orientation: 'h',
            y: -0.2,
            x: 0.5,
            xanchor: 'center'
        }
    };

    Plotly.newPlot('entropy-chart', traces, layout, plotlyConfig);
}

// Redraw when a different dataset is selected.
document.getElementById('entropy-dataset').addEventListener('change', renderEntropyChart);
222
+
223
+ // ============================================================================
224
+ // RANKING COMPARISON CHART
225
+ // ============================================================================
226
/**
 * Draw the ranking comparison chart: for each model, its novelty rank
 * (`bt_rank`, filled circle) and accuracy rank (`acc_rank`, open diamond) on
 * a shared, reversed rank axis, joined by a dashed connector.
 *
 * Reads the current value of `#ranking-dataset`, looks the rows up in
 * `DDR_DATA.ranking`, and renders into `#ranking-chart`. Returns without
 * drawing when the dataset has no data. The title reports the correlation
 * between the two rank columns as computed by `calculateCorrelation`.
 */
function renderRankingChart() {
    const MAX_MODELS = 22;
    const PROPRIETARY_COLOR = '#6A0DAD';
    const OPEN_SOURCE_COLOR = '#228B22';

    const dataset = document.getElementById('ranking-dataset').value;
    const data = DDR_DATA.ranking[dataset];
    if (!data) return;

    // Take top 22 models
    const models = data.slice(0, MAX_MODELS);
    const rowIndices = models.map((_, i) => i);
    const btRanks = models.map((m) => m.bt_rank);
    const accRanks = models.map((m) => m.acc_rank);
    // Called once per trace so each trace owns its colour array.
    const ownershipColors = () =>
        models.map((m) => (m.is_proprietary ? PROPRIETARY_COLOR : OPEN_SOURCE_COLOR));

    // Connection lines between the two ranks of each model.
    const traces = models.map((m, i) => ({
        x: [m.bt_rank, m.acc_rank],
        y: [i, i],
        mode: 'lines',
        line: {
            color: 'rgba(148, 163, 184, 0.3)',
            width: 1,
            dash: 'dash'
        },
        showlegend: false,
        hoverinfo: 'skip'
    }));

    // Novelty rank points (circles)
    traces.push({
        x: btRanks,
        y: rowIndices,
        mode: 'markers',
        name: 'Novelty Rank',
        marker: {
            size: 12,
            symbol: 'circle',
            color: ownershipColors(),
            line: { color: '#000', width: 1 }
        },
        text: models.map((m) => `${m.model}<br>Novelty Rank: ${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
        hovertemplate: '%{text}<extra></extra>'
    });

    // Accuracy rank points (diamonds)
    traces.push({
        x: accRanks,
        y: [...rowIndices],
        mode: 'markers',
        name: 'Accuracy Rank',
        marker: {
            size: 14,
            symbol: 'diamond-open',
            color: ownershipColors(),
            line: { width: 2 }
        },
        text: models.map((m) => `${m.model}<br>Accuracy Rank: ${m.acc_rank}<br>Accuracy: ${m.accuracy}%`),
        hovertemplate: '%{text}<extra></extra>'
    });

    const correlation = calculateCorrelation(btRanks, accRanks);

    // Size the rank axis from the ranks actually plotted, so a rank above
    // the displayed model count is not clipped. With ranks 1..22 this yields
    // the range [23, 0]; an empty model list falls back to the same range.
    const maxRank = models.length > 0 ? Math.max(...btRanks, ...accRanks) : MAX_MODELS;

    const layout = {
        ...darkLayout,
        title: {
            text: `${dataset} - Novelty vs Accuracy Ranking (ρ = ${correlation.toFixed(2)})`,
            font: { size: 18, color: '#f1f5f9' }
        },
        xaxis: {
            ...darkLayout.xaxis,
            title: { text: 'Rank', font: { size: 14, color: '#e2e8f0' } },
            // Descending range: the best rank (1) is drawn at the right edge.
            range: [maxRank + 1, 0],
            tickmode: 'linear',
            dtick: 2
        },
        yaxis: {
            ...darkLayout.yaxis,
            tickmode: 'array',
            tickvals: [...rowIndices],
            ticktext: models.map((m) => m.model.replace(/-/g, ' ')),
            automargin: true
        },
        showlegend: true,
        legend: {
            ...darkLayout.legend,
            orientation: 'h',
            y: -0.15,
            x: 0.5,
            xanchor: 'center'
        },
        annotations: [
            {
                x: 0.02,
                y: 0.98,
                xref: 'paper',
                yref: 'paper',
                text: '🟣 Proprietary 🟢 Open-Source',
                showarrow: false,
                font: { size: 12, color: '#94a3b8' },
                bgcolor: 'rgba(30, 41, 59, 0.8)',
                borderpad: 5
            }
        ],
        margin: { ...darkLayout.margin, l: 180 }
    };

    Plotly.newPlot('ranking-chart', traces, layout, plotlyConfig);
}
336
+
337
/**
 * Pearson correlation coefficient of two numeric series.
 *
 * Values are paired by index; if the arrays differ in length only the common
 * prefix is used. The computation is mean-centred, which avoids the
 * cancellation error of the raw-sums formula when values are large relative
 * to their spread.
 *
 * @param {number[]} x - First series.
 * @param {number[]} y - Second series.
 * @returns {number} Coefficient in [-1, 1]; 0 when fewer than two pairs are
 *   available or when either series has zero variance.
 */
function calculateCorrelation(x, y) {
    const n = Math.min(x.length, y.length);
    if (n < 2) return 0;

    let sumX = 0;
    let sumY = 0;
    for (let i = 0; i < n; i++) {
        sumX += x[i];
        sumY += y[i];
    }
    const meanX = sumX / n;
    const meanY = sumY / n;

    let covariance = 0;
    let varianceX = 0;
    let varianceY = 0;
    for (let i = 0; i < n; i++) {
        const dx = x[i] - meanX;
        const dy = y[i] - meanY;
        covariance += dx * dy;
        varianceX += dx * dx;
        varianceY += dy * dy;
    }

    const denominator = Math.sqrt(varianceX * varianceY);
    if (denominator === 0) return 0;

    // Clamp away rounding overshoot just past +/-1.
    return Math.max(-1, Math.min(1, covariance / denominator));
}
350
+
351
// Redraw the ranking chart whenever a different dataset is selected.
document
    .getElementById('ranking-dataset')
    .addEventListener('change', renderRankingChart);
352
+
353
+ // ============================================================================
354
+ // TURN DISTRIBUTION CHART (Ridgeline-like)
355
+ // ============================================================================
356
/**
 * Draw the turn-count distribution chart as a ridgeline-style stack: one
 * filled step curve per model, offset vertically, ordered by descending
 * median.
 *
 * Reads the current value of `#turn-dataset`, looks the rows up in
 * `DDR_DATA.turn`, and renders into `#turn-chart`. Returns without drawing
 * when the dataset has no data. Each row is read for `model`, `median` and
 * `distribution` (indexed by 10-turn bin, bins 0..9).
 */
function renderTurnChart() {
    const BIN_WIDTH = 10;       // turns covered by one distribution bin
    const LAST_BIN = 9;         // highest bin index read from `distribution`
    const HEIGHT_SCALE = 10;    // distribution values are divided by this
    const BAND_SPACING = 12;    // vertical offset between consecutive models
    const LABEL_OFFSET = 3;     // tick label position above each baseline
    // Six-digit hex so the two-digit alpha suffix appended for the fill
    // colour always yields a valid #RRGGBBAA value.
    const FALLBACK_COLOR = '#888888';

    const dataset = document.getElementById('turn-dataset').value;
    const data = DDR_DATA.turn[dataset];
    if (!data) return;

    // Sort by median (descending) on a copy so DDR_DATA is left untouched.
    const sortedData = [...data].sort((a, b) => b.median - a.median);

    // Family colors, matched by substring of the model name.
    const familyColors = {
        'Claude': '#FF6D00',
        'GPT': '#00C853',
        'Gemini': '#2196F3',
        'DeepSeek': '#E91E63',
        'GLM': '#9C27B0',
        'Kimi': '#FFA500',
        'MiniMax': '#20B2AA',
        'Qwen': '#0EA5E9',
        'Llama': '#F59E0B'
    };

    function getModelColor(modelName) {
        for (const [family, color] of Object.entries(familyColors)) {
            if (modelName.includes(family)) return color;
        }
        return FALLBACK_COLOR;
    }

    // x values 0..99, shared by every ridge.
    const xVals = Array.from({ length: 100 }, (_, k) => k);

    const ridgeTraces = [];
    sortedData.forEach((model, i) => {
        const color = getModelColor(model.model);
        const baseline = i * BAND_SPACING;

        // Invisible baseline for this band. The ridge below fills down to it
        // ('tonexty') instead of down to y = 0, so a ridge does not paint
        // over the bands underneath it.
        ridgeTraces.push({
            x: xVals,
            y: xVals.map(() => baseline),
            mode: 'lines',
            line: { width: 0 },
            showlegend: false,
            hoverinfo: 'skip'
        });

        ridgeTraces.push({
            x: xVals,
            y: xVals.map((x) => {
                const binIdx = Math.min(Math.floor(x / BIN_WIDTH), LAST_BIN);
                return model.distribution[binIdx] / HEIGHT_SCALE + baseline;
            }),
            fill: 'tonexty',
            fillcolor: color + '80',
            line: { color: color, width: 1.5 },
            name: `${model.model} (med=${model.median})`,
            mode: 'lines',
            hovertemplate: `<b>${model.model}</b><br>` +
                `Median: ${model.median} turns<extra></extra>`
        });
    });

    const layout = {
        ...darkLayout,
        title: {
            text: `${dataset.toUpperCase()} - Turn Count Distribution`,
            font: { size: 18, color: '#f1f5f9' }
        },
        xaxis: {
            ...darkLayout.xaxis,
            title: { text: 'Number of Turns', font: { size: 14, color: '#e2e8f0' } },
            range: [0, 100]
        },
        yaxis: {
            ...darkLayout.yaxis,
            title: { text: '', font: { size: 14, color: '#e2e8f0' } },
            tickmode: 'array',
            tickvals: sortedData.map((_, i) => i * BAND_SPACING + LABEL_OFFSET),
            ticktext: sortedData.map((m) => `${m.model} (${m.median})`),
            showgrid: false
        },
        showlegend: false,
        height: 700,
        margin: { ...darkLayout.margin, l: 200 }
    };

    Plotly.newPlot('turn-chart', ridgeTraces, layout, plotlyConfig);
}

// Redraw when a different dataset is selected.
document.getElementById('turn-dataset').addEventListener('change', renderTurnChart);
476
+
477
+ // ============================================================================
478
+ // PROBING RESULTS CHART
479
+ // ============================================================================
480
/**
 * Draw the probing-results figure: three side-by-side subplots (MIMIC,
 * GLOBEM, 10-K), each showing the average FINISH-token log probability per
 * model as a line with a shaded +/- SEM band.
 *
 * Reads the current value of `#probing-mode`, looks the data up in
 * `DDR_DATA.probing`, and renders into `#probing-chart`. Returns without
 * drawing when the mode has no data. The model list is taken from the
 * `mimic` scenario; a scenario or model entry that is missing elsewhere is
 * skipped rather than aborting the whole render.
 */
function renderProbingChart() {
    // Six-digit hex so the two-digit alpha suffix appended for the band
    // colour always yields a valid #RRGGBBAA value.
    const FALLBACK_COLOR = '#888888';

    const mode = document.getElementById('probing-mode').value;
    const scenarios = ['mimic', 'globem', '10k'];
    const scenarioTitles = { mimic: 'MIMIC', globem: 'GLOBEM', '10k': '10-K' };

    const data = DDR_DATA.probing[mode];
    if (!data) return;

    // Mode-dependent values are the same for every trace; compute them once.
    const isByTurn = mode === 'byTurn';
    const seriesKey = isByTurn ? 'turns' : 'progress';
    const xLabel = isByTurn ? 'Turn' : 'Progress (%)';

    const traces = [];
    const models = Object.keys(data.mimic || {});

    // Create subplots for each scenario
    scenarios.forEach((scenario, scIdx) => {
        const scenarioData = data[scenario];
        if (!scenarioData) return;

        const xAxisRef = `x${scIdx + 1}`;
        const yAxisRef = `y${scIdx + 1}`;

        models.forEach((model) => {
            const modelData = scenarioData[model];
            if (!modelData) return;

            const color = DDR_DATA.probingColors[model] || FALLBACK_COLOR;
            const xs = modelData[seriesKey];

            // Main line. Only the first subplot contributes legend entries;
            // `legendgroup` ties the other subplots' traces to them.
            traces.push({
                x: xs,
                y: modelData.logprob,
                mode: 'lines+markers',
                name: model,
                legendgroup: model,
                showlegend: scIdx === 0,
                line: { color: color, width: 2 },
                marker: { size: 5, color: color },
                xaxis: xAxisRef,
                yaxis: yAxisRef,
                hovertemplate: `<b>${model}</b><br>` +
                    `${xLabel}: %{x}<br>` +
                    `Log Prob: %{y:.2f}<extra></extra>`
            });

            // Error band (SEM): trace the upper edge left-to-right, then the
            // lower edge right-to-left, and fill the closed outline.
            if (!Array.isArray(modelData.sem)) return;
            const upper = modelData.logprob.map((v, i) => v + modelData.sem[i]);
            const lower = modelData.logprob.map((v, i) => v - modelData.sem[i]);

            traces.push({
                x: [...xs, ...xs.slice().reverse()],
                y: [...upper, ...lower.slice().reverse()],
                fill: 'toself',
                fillcolor: color + '30',
                line: { width: 0 },
                showlegend: false,
                legendgroup: model,
                xaxis: xAxisRef,
                yaxis: yAxisRef,
                hoverinfo: 'skip'
            });
        });
    });

    const layout = {
        paper_bgcolor: 'rgba(30, 41, 59, 0)',
        plot_bgcolor: 'rgba(30, 41, 59, 0)',
        font: { family: 'Inter, sans-serif', color: '#e2e8f0' },
        title: {
            text: `FINISH Token Avg Log Probability ${isByTurn ? 'by Turn' : 'by Progress'}`,
            font: { size: 18, color: '#f1f5f9' }
        },
        grid: { rows: 1, columns: 3, pattern: 'independent' },
        // One heading per subplot, centred over its third of the figure.
        annotations: scenarios.map((sc, i) => ({
            text: scenarioTitles[sc],
            font: { size: 14, color: '#e2e8f0' },
            showarrow: false,
            x: (i + 0.5) / 3,
            y: 1.08,
            xref: 'paper',
            yref: 'paper'
        })),
        showlegend: true,
        legend: {
            orientation: 'h',
            y: -0.15,
            x: 0.5,
            xanchor: 'center',
            bgcolor: 'rgba(30, 41, 59, 0.8)',
            font: { color: '#e2e8f0' }
        },
        margin: { t: 80, r: 20, b: 100, l: 60 }
    };

    // Add axis configs for each subplot. The first pair of layout keys is
    // unsuffixed (`xaxis`, `yaxis`); later ones are `xaxis2`, `yaxis2`, ...
    scenarios.forEach((sc, i) => {
        const suffix = i === 0 ? '' : i + 1;

        layout[`xaxis${suffix}`] = {
            title: { text: xLabel, font: { size: 12 } },
            gridcolor: 'rgba(148, 163, 184, 0.15)',
            tickfont: { color: '#94a3b8' },
            domain: [i / 3 + 0.02, (i + 1) / 3 - 0.02]
        };
        layout[`yaxis${suffix}`] = {
            // Only the left-most subplot carries the y-axis title.
            title: i === 0 ? { text: 'Avg Log Probability', font: { size: 12 } } : {},
            gridcolor: 'rgba(148, 163, 184, 0.15)',
            tickfont: { color: '#94a3b8' }
        };
    });

    Plotly.newPlot('probing-chart', traces, layout, plotlyConfig);
}

// Redraw when the probing mode changes.
document.getElementById('probing-mode').addEventListener('change', renderProbingChart);
594
+
595
+ // ============================================================================
596
+ // INITIALIZE ALL CHARTS
597
+ // ============================================================================
598
/**
 * Render every chart once. Each renderer runs in its own try/catch so that a
 * failure in one chart is logged and does not stop the remaining charts from
 * being drawn.
 */
function renderAllCharts() {
    const renderers = [
        renderScalingChart,
        renderEntropyChart,
        renderRankingChart,
        renderTurnChart,
        renderProbingChart
    ];

    for (const render of renderers) {
        try {
            render();
        } catch (err) {
            console.error(`Chart initialisation failed in ${render.name}:`, err);
        }
    }
}

// DOMContentLoaded fires only once; if the document has already finished
// parsing when this script runs, the listener would never be called, so
// render immediately in that case.
if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', renderAllCharts);
} else {
    renderAllCharts();
}
605
+
606
// Handle window resize: re-fit every chart container to its current size.
window.addEventListener('resize', () => {
    const chartIds = [
        'scaling-chart',
        'entropy-chart',
        'ranking-chart',
        'turn-chart',
        'probing-chart'
    ];

    chartIds.forEach((id) => {
        const container = document.getElementById(id);

        // Skip containers that are absent or currently not rendered (no
        // layout boxes). Handing Plotly an unknown id would throw and stop
        // the remaining charts from resizing.
        if (!container || container.getClientRects().length === 0) return;

        // NOTE(review): Plotly.Plots.resize is expected to return a promise
        // that can reject (e.g. for a non-displayed div) — confirm against
        // the Plotly version in use. Log instead of leaving the rejection
        // unhandled.
        Promise.resolve(Plotly.Plots.resize(container)).catch((err) => {
            console.warn(`Could not resize chart "${id}":`, err);
        });
    });
});
data.js ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // DDR-Bench Visualization Data
2
+ // Auto-generated data for interactive charts
3
+
4
+ const DDR_DATA = {
5
+ // Color scheme for models
6
+ modelColors: {
7
+ 'GPT-5.2': '#00C853',
8
+ 'Claude-4.5-Sonnet': '#FF6D00',
9
+ 'Gemini-3-Flash': '#2196F3',
10
+ 'GLM-4.6': '#9C27B0',
11
+ 'DeepSeek-V3.2': '#E91E63',
12
+ 'Qwen3-Next-80B-A3B': '#FFC107',
13
+ 'Kimi-K2': '#FFA500',
14
+ 'MiniMax-M2': '#20B2AA',
15
+ // Probing models
16
+ 'Qwen2.5-32B': '#4A90D9',
17
+ 'Qwen2.5-72B': '#1A5FB4',
18
+ 'Qwen3-4B': '#57E389',
19
+ 'Qwen3-30B-A3B': '#26A269',
20
+ },
21
+
22
+ // Scaling Analysis Data
23
+ scaling: {
24
+ mimic: {
25
+ 'GPT-5.2': {
26
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
27
+ tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460, 16840, 17761, 18642, 19456, 20194],
28
+ costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516, 0.0595, 0.0680, 0.0772, 0.0860, 0.0947],
29
+ accuracy: [2.8, 5.5, 8.2, 10.8, 13.2, 15.5, 17.6, 19.5, 21.2, 22.7, 24.0, 25.1, 26.0, 26.7, 27.1, 27.2, 27.2, 27.3, 27.3, 27.26]
30
+ },
31
+ 'Claude-4.5-Sonnet': {
32
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
33
+ tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378, 14190, 15001, 15723, 16457, 17218],
34
+ costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.1410, 0.1580, 0.1758, 0.1944, 0.2138],
35
+ accuracy: [3.5, 7.0, 10.5, 14.0, 17.2, 20.2, 23.0, 25.5, 27.8, 29.8, 31.5, 32.8, 33.8, 34.2, 34.3, 34.4, 34.4, 34.4, 34.4, 34.37]
36
+ },
37
+ 'Gemini-3-Flash': {
38
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
39
+ tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357, 24415, 25207, 25977, 26542, 26964],
40
+ costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173, 0.0196, 0.0219, 0.0240, 0.0263, 0.0284],
41
+ accuracy: [2.5, 5.0, 7.5, 10.0, 12.4, 14.6, 16.7, 18.6, 20.3, 21.8, 23.1, 24.0, 24.6, 24.8, 24.9, 24.9, 24.9, 24.9, 24.9, 24.94]
42
+ },
43
+ 'GLM-4.6': {
44
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
45
+ tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542, 11095, 11598, 12149, 12657, 13099],
46
+ costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190, 0.0210, 0.0231, 0.0253, 0.0275, 0.0298],
47
+ accuracy: [2.3, 4.7, 7.0, 9.3, 11.5, 13.5, 15.4, 17.1, 18.7, 20.1, 21.2, 22.1, 22.7, 23.0, 23.1, 23.2, 23.2, 23.2, 23.3, 23.26]
48
+ },
49
+ 'DeepSeek-V3.2': {
50
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
51
+ tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610, 13470, 14320, 15170, 16020, 16870],
52
+ costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252, 0.0284, 0.0318, 0.0354, 0.0392, 0.0431],
53
+ accuracy: [2.7, 5.4, 8.1, 10.8, 13.4, 15.8, 18.1, 20.2, 22.1, 23.8, 25.2, 26.3, 26.8, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.00]
54
+ }
55
+ },
56
+ '10k': {
57
+ 'GPT-5.2': {
58
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
59
+ tokens: [48, 1380, 1650, 2380, 3420, 4550, 5410, 6250, 7150, 8050, 8890, 9730, 10570, 11400, 12230, 13060, 13880, 14700, 15520, 16340],
60
+ costs: [0.0004, 0.0010, 0.0017, 0.0027, 0.0042, 0.0061, 0.0084, 0.0110, 0.0140, 0.0174, 0.0216, 0.0261, 0.0312, 0.0369, 0.0434, 0.0501, 0.0572, 0.0650, 0.0724, 0.0797],
61
+ accuracy: [4.5, 9.0, 13.5, 18.0, 22.3, 26.3, 30.0, 33.4, 36.5, 39.3, 41.8, 43.5, 44.5, 44.9, 45.0, 45.0, 45.0, 45.0, 45.0, 44.99]
62
+ },
63
+ 'Claude-4.5-Sonnet': {
64
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
65
+ tokens: [30, 1420, 1580, 2970, 4200, 5550, 6200, 6870, 7830, 8570, 9130, 9870, 10710, 11620, 12410, 13150, 13890, 14550, 15220, 15920],
66
+ costs: [0.0004, 0.0025, 0.0049, 0.0089, 0.0140, 0.0205, 0.0277, 0.0357, 0.0447, 0.0545, 0.0649, 0.0760, 0.0882, 0.1014, 0.1154, 0.1303, 0.1460, 0.1624, 0.1796, 0.1976],
67
+ accuracy: [7.7, 15.5, 23.2, 30.9, 38.4, 45.6, 52.6, 59.2, 65.5, 70.5, 74.2, 76.0, 77.0, 77.3, 77.3, 77.3, 77.3, 77.3, 77.3, 77.27]
68
+ },
69
+ 'Gemini-3-Flash': {
70
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
71
+ tokens: [420, 1980, 2400, 3990, 5140, 6910, 8210, 9880, 11700, 13180, 15180, 17220, 18940, 20300, 21510, 22480, 23210, 23920, 24440, 24830],
72
+ costs: [0.0001, 0.0004, 0.0007, 0.0012, 0.0019, 0.0028, 0.0037, 0.0048, 0.0061, 0.0074, 0.0090, 0.0107, 0.0125, 0.0142, 0.0160, 0.0181, 0.0202, 0.0222, 0.0243, 0.0263],
73
+ accuracy: [4.4, 8.9, 13.3, 17.8, 22.0, 26.1, 30.0, 33.6, 37.0, 40.1, 42.4, 43.8, 44.3, 44.4, 44.4, 44.4, 44.4, 44.4, 44.4, 44.41]
74
+ },
75
+ 'GLM-4.6': {
76
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
77
+ tokens: [54, 1400, 1625, 2545, 3196, 3860, 4273, 4888, 5645, 6474, 7330, 8036, 8576, 9120, 9697, 10210, 10678, 11192, 11662, 12080],
78
+ costs: [0.0001, 0.0007, 0.0014, 0.0022, 0.0031, 0.0041, 0.0051, 0.0063, 0.0076, 0.0090, 0.0106, 0.0122, 0.0139, 0.0156, 0.0174, 0.0193, 0.0212, 0.0232, 0.0252, 0.0273],
79
+ accuracy: [6.0, 12.1, 18.1, 24.2, 30.0, 35.6, 41.0, 46.0, 50.8, 55.0, 58.2, 59.7, 60.3, 60.4, 60.4, 60.4, 60.4, 60.4, 60.4, 60.42]
80
+ },
81
+ 'DeepSeek-V3.2': {
82
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
83
+ tokens: [42, 1305, 1555, 2250, 3235, 4295, 5105, 5895, 6750, 7600, 8395, 9190, 9985, 10775, 11565, 12355, 13140, 13925, 14710, 15495],
84
+ costs: [0.0001, 0.0005, 0.0011, 0.0018, 0.0028, 0.0040, 0.0054, 0.0070, 0.0087, 0.0107, 0.0129, 0.0152, 0.0176, 0.0203, 0.0231, 0.0261, 0.0292, 0.0325, 0.0360, 0.0396],
85
+ accuracy: [6.1, 12.1, 18.2, 24.2, 30.1, 35.8, 41.2, 46.3, 51.2, 55.5, 58.8, 60.2, 60.6, 60.7, 60.7, 60.7, 60.7, 60.7, 60.7, 60.66]
86
+ }
87
+ },
88
+ globem: {
89
+ 'GPT-5.2': {
90
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
91
+ tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460],
92
+ costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516],
93
+ accuracy: [3.8, 7.7, 11.5, 15.3, 19.0, 22.6, 26.1, 29.4, 32.5, 35.4, 37.2, 38.0, 38.3, 38.4, 38.39]
94
+ },
95
+ 'Claude-4.5-Sonnet': {
96
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
97
+ tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378],
98
+ costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249],
99
+ accuracy: [4.0, 8.0, 12.1, 16.1, 20.0, 23.9, 27.6, 31.2, 34.6, 37.0, 39.0, 40.0, 40.2, 40.2, 40.23]
100
+ },
101
+ 'Gemini-3-Flash': {
102
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
103
+ tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357],
104
+ costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173],
105
+ accuracy: [3.5, 7.1, 10.6, 14.1, 17.5, 20.8, 24.0, 27.1, 29.9, 32.2, 33.8, 34.9, 35.2, 35.3, 35.29]
106
+ },
107
+ 'GLM-4.6': {
108
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
109
+ tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542],
110
+ costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190],
111
+ accuracy: [4.2, 8.3, 12.5, 16.6, 20.7, 24.6, 28.4, 32.0, 35.4, 38.0, 40.0, 41.2, 41.5, 41.6, 41.61]
112
+ },
113
+ 'DeepSeek-V3.2': {
114
+ turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
115
+ tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610],
116
+ costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252],
117
+ accuracy: [3.8, 7.6, 11.5, 15.3, 19.0, 22.7, 26.2, 29.6, 32.8, 35.5, 37.2, 38.0, 38.1, 38.2, 38.16]
118
+ }
119
+ }
120
+ },
121
+
122
+ // Ranking Comparison Data
123
+ ranking: {
124
+ MIMIC: [
125
+ { model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 87.5, accuracy: 33.66, acc_rank: 1, is_proprietary: true },
126
+ { model: 'Kimi-K2', bt_rank: 2, win_rate: 82.1, accuracy: 30.17, acc_rank: 2, is_proprietary: false },
127
+ { model: 'GPT5.1', bt_rank: 3, win_rate: 78.3, accuracy: 30.10, acc_rank: 3, is_proprietary: true },
128
+ { model: 'Gemini3-Flash', bt_rank: 4, win_rate: 75.0, accuracy: 29.28, acc_rank: 4, is_proprietary: true },
129
+ { model: 'GPT5.2', bt_rank: 5, win_rate: 71.2, accuracy: 28.88, acc_rank: 5, is_proprietary: true },
130
+ { model: 'DeepSeek-V3.2', bt_rank: 6, win_rate: 68.5, accuracy: 27.65, acc_rank: 6, is_proprietary: false },
131
+ { model: 'GPT5-mini', bt_rank: 7, win_rate: 65.0, accuracy: 27.59, acc_rank: 7, is_proprietary: true },
132
+ { model: 'GLM4.6', bt_rank: 8, win_rate: 61.8, accuracy: 23.84, acc_rank: 8, is_proprietary: false },
133
+ { model: 'MiniMax-M2', bt_rank: 9, win_rate: 58.2, accuracy: 23.52, acc_rank: 9, is_proprietary: false },
134
+ { model: 'Qwen3', bt_rank: 10, win_rate: 54.5, accuracy: 19.13, acc_rank: 11, is_proprietary: false },
135
+ { model: 'Gemini2.5-Pro', bt_rank: 11, win_rate: 51.0, accuracy: 19.00, acc_rank: 12, is_proprietary: true },
136
+ { model: 'Qwen3-Next-80B-A3B', bt_rank: 12, win_rate: 47.5, accuracy: 18.80, acc_rank: 10, is_proprietary: false },
137
+ { model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 44.0, accuracy: 18.61, acc_rank: 13, is_proprietary: true },
138
+ { model: 'Qwen3-4B', bt_rank: 14, win_rate: 40.5, accuracy: 16.93, acc_rank: 14, is_proprietary: false },
139
+ { model: 'Gemini2.5-Flash-Lite', bt_rank: 15, win_rate: 37.0, accuracy: 16.64, acc_rank: 15, is_proprietary: true },
140
+ { model: 'Qwen2.5-72B', bt_rank: 16, win_rate: 33.5, accuracy: 14.92, acc_rank: 16, is_proprietary: false },
141
+ { model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 30.0, accuracy: 14.08, acc_rank: 18, is_proprietary: false },
142
+ { model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 26.5, accuracy: 14.15, acc_rank: 17, is_proprietary: false },
143
+ { model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 23.0, accuracy: 13.12, acc_rank: 19, is_proprietary: false },
144
+ { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 19.5, accuracy: 10.79, acc_rank: 20, is_proprietary: false },
145
+ { model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 16.0, accuracy: 9.08, acc_rank: 21, is_proprietary: false },
146
+ { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 12.5, accuracy: 7.30, acc_rank: 22, is_proprietary: false }
147
+ ],
148
+ '10K': [
149
+ { model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 92.0, accuracy: 69.26, acc_rank: 1, is_proprietary: true },
150
+ { model: 'DeepSeek-V3.2', bt_rank: 2, win_rate: 85.5, accuracy: 49.41, acc_rank: 2, is_proprietary: false },
151
+ { model: 'GLM4.6', bt_rank: 3, win_rate: 82.0, accuracy: 48.29, acc_rank: 3, is_proprietary: false },
152
+ { model: 'GPT5.2', bt_rank: 4, win_rate: 78.0, accuracy: 43.11, acc_rank: 4, is_proprietary: true },
153
+ { model: 'GPT5-mini', bt_rank: 5, win_rate: 74.5, accuracy: 41.56, acc_rank: 5, is_proprietary: true },
154
+ { model: 'GPT5.1', bt_rank: 6, win_rate: 71.0, accuracy: 41.23, acc_rank: 6, is_proprietary: true },
155
+ { model: 'Kimi-K2', bt_rank: 7, win_rate: 67.5, accuracy: 41.17, acc_rank: 7, is_proprietary: false },
156
+ { model: 'Gemini3-Flash', bt_rank: 8, win_rate: 64.0, accuracy: 39.50, acc_rank: 8, is_proprietary: true },
157
+ { model: 'Qwen3-Next-80B-A3B', bt_rank: 9, win_rate: 60.5, accuracy: 38.34, acc_rank: 9, is_proprietary: false },
158
+ { model: 'MiniMax-M2', bt_rank: 10, win_rate: 57.0, accuracy: 35.74, acc_rank: 10, is_proprietary: false },
159
+ { model: 'Qwen3-4B', bt_rank: 11, win_rate: 53.5, accuracy: 30.43, acc_rank: 11, is_proprietary: false },
160
+ { model: 'Qwen3', bt_rank: 12, win_rate: 50.0, accuracy: 28.23, acc_rank: 12, is_proprietary: false },
161
+ { model: 'Gemini2.5-Pro', bt_rank: 13, win_rate: 46.5, accuracy: 20.91, acc_rank: 13, is_proprietary: true },
162
+ { model: 'Qwen2.5-72B', bt_rank: 14, win_rate: 43.0, accuracy: 20.79, acc_rank: 14, is_proprietary: false },
163
+ { model: 'Qwen2.5-32B', bt_rank: 15, win_rate: 39.5, accuracy: 17.83, acc_rank: 15, is_proprietary: false },
164
+ { model: 'Qwen2.5-14B-1M', bt_rank: 16, win_rate: 36.0, accuracy: 16.67, acc_rank: 16, is_proprietary: false },
165
+ { model: 'Qwen2.5-14B', bt_rank: 17, win_rate: 32.5, accuracy: 14.65, acc_rank: 17, is_proprietary: false },
166
+ { model: 'Gemini2.5-Flash-Lite', bt_rank: 18, win_rate: 29.0, accuracy: 14.37, acc_rank: 18, is_proprietary: true },
167
+ { model: 'Gemini2.5-Flash', bt_rank: 19, win_rate: 25.5, accuracy: 12.61, acc_rank: 19, is_proprietary: true },
168
+ { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 7.53, acc_rank: 20, is_proprietary: false },
169
+ { model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 18.5, accuracy: 6.68, acc_rank: 21, is_proprietary: false },
170
+ { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 6.51, acc_rank: 22, is_proprietary: false }
171
+ ],
172
+ GLOBEM: [
173
+ { model: 'GLM4.6', bt_rank: 1, win_rate: 78.0, accuracy: 39.77, acc_rank: 1, is_proprietary: false },
174
+ { model: 'Claude4.5-Sonnet', bt_rank: 2, win_rate: 75.5, accuracy: 39.54, acc_rank: 2, is_proprietary: true },
175
+ { model: 'GPT5.2', bt_rank: 3, win_rate: 72.0, accuracy: 38.39, acc_rank: 3, is_proprietary: true },
176
+ { model: 'DeepSeek-V3.2', bt_rank: 4, win_rate: 69.5, accuracy: 38.39, acc_rank: 4, is_proprietary: false },
177
+ { model: 'Kimi-K2', bt_rank: 5, win_rate: 66.0, accuracy: 37.01, acc_rank: 5, is_proprietary: false },
178
+ { model: 'MiniMax-M2', bt_rank: 6, win_rate: 63.5, accuracy: 36.90, acc_rank: 6, is_proprietary: false },
179
+ { model: 'GPT5.1', bt_rank: 7, win_rate: 61.0, accuracy: 36.76, acc_rank: 7, is_proprietary: true },
180
+ { model: 'Qwen3', bt_rank: 8, win_rate: 58.0, accuracy: 36.32, acc_rank: 8, is_proprietary: false },
181
+ { model: 'Gemini3-Flash', bt_rank: 9, win_rate: 55.5, accuracy: 35.46, acc_rank: 9, is_proprietary: true },
182
+ { model: 'Gemini2.5-Pro', bt_rank: 10, win_rate: 52.0, accuracy: 34.60, acc_rank: 10, is_proprietary: true },
183
+ { model: 'Qwen3-Next-80B-A3B', bt_rank: 11, win_rate: 49.5, accuracy: 34.14, acc_rank: 11, is_proprietary: false },
184
+ { model: 'GPT5-mini', bt_rank: 12, win_rate: 46.0, accuracy: 33.91, acc_rank: 12, is_proprietary: true },
185
+ { model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 43.5, accuracy: 28.62, acc_rank: 13, is_proprietary: true },
186
+ { model: 'Qwen2.5-7B-1M', bt_rank: 14, win_rate: 40.0, accuracy: 27.15, acc_rank: 14, is_proprietary: false },
187
+ { model: 'Qwen2.5-72B', bt_rank: 15, win_rate: 37.5, accuracy: 27.13, acc_rank: 15, is_proprietary: false },
188
+ { model: 'Qwen3-4B', bt_rank: 16, win_rate: 34.0, accuracy: 26.90, acc_rank: 16, is_proprietary: false },
189
+ { model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 31.5, accuracy: 26.47, acc_rank: 17, is_proprietary: false },
190
+ { model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 28.0, accuracy: 26.13, acc_rank: 18, is_proprietary: false },
191
+ { model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 25.5, accuracy: 25.90, acc_rank: 19, is_proprietary: false },
192
+ { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 25.64, acc_rank: 20, is_proprietary: false },
193
+ { model: 'Gemini2.5-Flash-Lite', bt_rank: 21, win_rate: 19.5, accuracy: 25.52, acc_rank: 21, is_proprietary: true },
194
+ { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 22.65, acc_rank: 22, is_proprietary: false }
195
+ ]
196
+ },
197
+
198
+ // Turn Distribution Data (distribution: percentage in bins [0-10, 10-20, ..., 90-100])
199
+ turn: {
200
+ mimic: [
201
+ { model: 'DeepSeekV3.2', median: 21, distribution: [0, 0, 2, 8, 15, 22, 25, 18, 7, 3] },
202
+ { model: 'GLM4.6', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
203
+ { model: 'Gemini3-Flash', median: 18, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
204
+ { model: 'GPT5.1', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
205
+ { model: 'Kimi-K2', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
206
+ { model: 'Claude4.5-Sonnet', median: 14, distribution: [0, 0, 5, 15, 25, 30, 15, 7, 2, 1] },
207
+ { model: 'MiniMax-M2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
208
+ { model: 'GPT5.2', median: 12, distribution: [0, 2, 8, 20, 30, 25, 10, 3, 1, 1] },
209
+ { model: 'Qwen3-30B-A3B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
210
+ { model: 'Qwen3-Next-80B-A3B', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
211
+ { model: 'Qwen2.5-72B', median: 10, distribution: [1, 5, 15, 28, 28, 15, 5, 2, 1, 0] },
212
+ { model: 'Qwen3-4B', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
213
+ { model: 'GPT5-mini', median: 8, distribution: [2, 8, 18, 28, 25, 12, 5, 1, 1, 0] },
214
+ { model: 'Llama3.3-70B', median: 5, distribution: [12, 25, 30, 20, 8, 3, 1, 1, 0, 0] }
215
+ ],
216
+ '10k': [
217
+ { model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
218
+ { model: 'Gemini3-Flash', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
219
+ { model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
220
+ { model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
221
+ { model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
222
+ { model: 'Claude4.5-Sonnet', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
223
+ { model: 'Qwen3-30B-A3B', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
224
+ { model: 'GPT5.2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
225
+ { model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
226
+ { model: 'GPT5.1', median: 13, distribution: [0, 2, 8, 20, 28, 24, 12, 4, 1, 1] },
227
+ { model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
228
+ { model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
229
+ { model: 'GPT5-mini', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
230
+ { model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 30, 22, 10, 4, 1, 1, 0, 0] }
231
+ ],
232
+ globem: [
233
+ { model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 6, 14, 22, 26, 20, 7, 3] },
234
+ { model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
235
+ { model: 'Qwen3-30B-A3B', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
236
+ { model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
237
+ { model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
238
+ { model: 'Gemini3-Flash', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
239
+ { model: 'Claude4.5-Sonnet', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
240
+ { model: 'GPT5.1', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
241
+ { model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
242
+ { model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
243
+ { model: 'GPT5.2', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
244
+ { model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] }, // NOTE(review): median 14 follows a median-11 row, while every other row in the turn lists is in descending-median order - confirm the value or the row position
245
+ { model: 'GPT5-mini', median: 8, distribution: [3, 10, 20, 30, 22, 10, 3, 1, 1, 0] },
246
+ { model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 32, 22, 9, 3, 1, 1, 0, 0] }
247
+ ]
248
+ },
249
+
250
+ // Entropy Analysis Data
251
+ entropy: {
252
+ mimic: {
253
+ 'GPT-5.2': { entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88, 0.65, 0.79, 0.71, 0.84], coverage: [0.08, 0.10, 0.09, 0.07, 0.09, 0.11, 0.06, 0.10, 0.08, 0.10], accuracy: [30, 35, 40, 25, 32, 45, 20, 28, 31, 38] },
254
+ 'Claude-4.5-Sonnet': { entropy: [0.85, 0.88, 0.92, 0.80, 0.87, 0.78, 0.82, 0.90, 0.86, 0.89], coverage: [0.12, 0.14, 0.13, 0.10, 0.13, 0.09, 0.11, 0.15, 0.12, 0.14], accuracy: [45, 50, 55, 40, 48, 35, 42, 52, 47, 51] },
255
+ 'Gemini-3-Flash': { entropy: [0.70, 0.75, 0.68, 0.72, 0.80, 0.65, 0.78, 0.72, 0.69, 0.76], coverage: [0.06, 0.09, 0.07, 0.08, 0.10, 0.05, 0.09, 0.07, 0.06, 0.08], accuracy: [28, 32, 25, 30, 38, 22, 35, 28, 26, 33] },
256
+ 'GLM-4.6': { entropy: [0.78, 0.82, 0.75, 0.80, 0.88, 0.72, 0.85, 0.78, 0.76, 0.83], coverage: [0.09, 0.11, 0.08, 0.10, 0.13, 0.07, 0.12, 0.09, 0.08, 0.11], accuracy: [32, 40, 28, 35, 45, 25, 42, 32, 30, 38] },
257
+ 'DeepSeek-V3.2': { entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.90, 0.80, 0.85, 0.81, 0.87], coverage: [0.10, 0.12, 0.09, 0.14, 0.08, 0.15, 0.10, 0.12, 0.10, 0.13], accuracy: [38, 42, 32, 48, 28, 52, 35, 42, 36, 44] }
258
+ },
259
+ '10k': {
260
+ 'GPT-5.2': { entropy: [0.85, 0.88, 0.92, 0.82, 0.87, 0.94, 0.80, 0.89, 0.84, 0.91], coverage: [0.35, 0.42, 0.48, 0.32, 0.40, 0.52, 0.28, 0.44, 0.38, 0.46], accuracy: [35, 40, 45, 30, 38, 50, 25, 42, 36, 44] },
261
+ 'Claude-4.5-Sonnet': { entropy: [0.92, 0.95, 0.98, 0.90, 0.94, 0.88, 0.91, 0.96, 0.93, 0.95], coverage: [0.55, 0.62, 0.68, 0.50, 0.58, 0.45, 0.52, 0.65, 0.56, 0.60], accuracy: [65, 72, 78, 60, 68, 55, 62, 75, 66, 70] },
262
+ 'Gemini-3-Flash': { entropy: [0.82, 0.86, 0.80, 0.84, 0.90, 0.78, 0.88, 0.83, 0.81, 0.87], coverage: [0.28, 0.35, 0.25, 0.32, 0.42, 0.22, 0.38, 0.30, 0.26, 0.36], accuracy: [35, 40, 30, 38, 48, 28, 45, 36, 32, 42] },
263
+ 'GLM-4.6': { entropy: [0.88, 0.92, 0.85, 0.90, 0.95, 0.82, 0.93, 0.88, 0.86, 0.91], coverage: [0.42, 0.50, 0.38, 0.46, 0.55, 0.35, 0.52, 0.44, 0.40, 0.48], accuracy: [50, 58, 45, 52, 62, 40, 56, 50, 46, 54] },
264
+ 'DeepSeek-V3.2': { entropy: [0.90, 0.93, 0.87, 0.95, 0.85, 0.97, 0.89, 0.94, 0.88, 0.92], coverage: [0.48, 0.55, 0.42, 0.60, 0.38, 0.65, 0.50, 0.57, 0.45, 0.53], accuracy: [52, 60, 48, 65, 42, 70, 55, 62, 50, 58] }
265
+ },
266
+ globem: {
267
+ 'GPT-5.2': { entropy: [0.75, 0.80, 0.85, 0.72, 0.78, 0.88, 0.70, 0.82, 0.76, 0.84], coverage: [0.65, 0.72, 0.78, 0.60, 0.70, 0.85, 0.55, 0.75, 0.68, 0.80], accuracy: [32, 38, 42, 28, 35, 48, 25, 40, 34, 44] },
268
+ 'Claude-4.5-Sonnet': { entropy: [0.82, 0.86, 0.90, 0.78, 0.84, 0.75, 0.80, 0.88, 0.83, 0.87], coverage: [0.78, 0.85, 0.92, 0.72, 0.82, 0.68, 0.75, 0.88, 0.80, 0.86], accuracy: [38, 45, 50, 35, 42, 32, 38, 48, 40, 46] },
269
+ 'Gemini-3-Flash': { entropy: [0.72, 0.77, 0.70, 0.75, 0.82, 0.68, 0.80, 0.74, 0.71, 0.78], coverage: [0.55, 0.65, 0.50, 0.58, 0.72, 0.45, 0.68, 0.60, 0.52, 0.66], accuracy: [30, 36, 28, 34, 42, 26, 40, 32, 28, 38] },
270
+ 'GLM-4.6': { entropy: [0.80, 0.84, 0.78, 0.82, 0.90, 0.75, 0.87, 0.81, 0.79, 0.85], coverage: [0.72, 0.80, 0.68, 0.75, 0.88, 0.62, 0.85, 0.74, 0.70, 0.82], accuracy: [38, 45, 35, 42, 52, 30, 48, 40, 36, 46] },
271
+ 'DeepSeek-V3.2': { entropy: [0.84, 0.88, 0.80, 0.90, 0.78, 0.92, 0.82, 0.87, 0.83, 0.89], coverage: [0.75, 0.82, 0.70, 0.88, 0.65, 0.92, 0.78, 0.84, 0.72, 0.86], accuracy: [36, 42, 32, 48, 28, 52, 38, 44, 34, 46] }
272
+ }
273
+ },
274
+
275
+ // Probing Results Data
276
+ probing: {
277
+ byTurn: {
278
+ mimic: {
279
+ 'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.5, -11.8, -11.2, -10.5, -10.0, -9.5, -9.2, -8.8, -8.5, -8.2], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
280
+ 'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.8, -11.2, -10.5, -9.8, -9.2, -8.8, -8.4, -8.0, -7.7, -7.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
281
+ 'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-13.2, -12.5, -11.8, -11.0, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
282
+ 'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.2, -10.5, -9.8, -9.0, -8.5, -8.0, -7.6, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
283
+ 'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.5, -9.8, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8, -6.5, -6.2], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
284
+ },
285
+ globem: {
286
+ 'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.8, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8, -7.5, -7.2], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
287
+ 'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.8, -10.2, -9.5, -8.8, -8.2, -7.8, -7.4, -7.0, -6.7, -6.5], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
288
+ 'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.2, -11.5, -10.8, -10.0, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
289
+ 'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.0, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.6, -6.2, -6.0], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] },
290
+ 'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-9.5, -8.8, -8.2, -7.5, -7.0, -6.5, -6.2, -5.8, -5.5, -5.2], sem: [0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
291
+ },
292
+ '10k': {
293
+ 'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.3, -10.7, -10.0, -9.5, -9.0, -8.7, -8.3, -8.0, -7.7], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
294
+ 'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.3, -10.7, -10.0, -9.3, -8.7, -8.3, -7.9, -7.5, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
295
+ 'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.7, -12.0, -11.3, -10.5, -9.7, -9.0, -8.5, -8.0, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
296
+ 'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.7, -10.0, -9.3, -8.5, -8.0, -7.5, -7.1, -6.7, -6.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
297
+ 'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.0, -9.3, -8.7, -8.0, -7.5, -7.0, -6.7, -6.3, -6.0, -5.7], sem: [0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
298
+ }
299
+ },
300
+ byProgress: {
301
+ mimic: {
302
+ 'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0], sem: [0.8, 0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
303
+ 'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.2, -8.7, -8.2, -7.8, -7.5], sem: [0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
304
+ 'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-13.0, -12.5, -12.0, -11.5, -10.8, -10.0, -9.3, -8.7, -8.2, -7.8], sem: [0.9, 0.8, 0.8, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4] },
305
+ 'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.2, -11.7, -11.0, -10.3, -9.5, -8.8, -8.2, -7.6, -7.2, -6.8], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
306
+ 'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.8, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.5, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] }
307
+ },
308
+ globem: {
309
+ 'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5, -7.0], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
310
+ 'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.0, -10.5, -10.0, -9.5, -8.8, -8.2, -7.7, -7.2, -6.8, -6.5], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2] },
311
+ 'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.0, -8.3, -7.7, -7.2, -6.8], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
312
+ 'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.2, -10.7, -10.0, -9.3, -8.5, -7.8, -7.2, -6.6, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
313
+ 'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-9.8, -9.2, -8.5, -7.8, -7.0, -6.5, -6.0, -5.5, -5.2, -4.8], sem: [0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] }
314
+ },
315
+ '10k': {
316
+ 'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3] },
317
+ 'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.3, -8.7, -8.2, -7.7, -7.3, -7.0], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3] },
318
+ 'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.3, -9.5, -8.8, -8.2, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
319
+ 'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.7, -11.2, -10.5, -9.8, -9.0, -8.3, -7.7, -7.1, -6.7, -6.3], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2] },
320
+ 'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.3, -9.7, -9.0, -8.3, -7.5, -7.0, -6.5, -6.0, -5.7, -5.3], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
321
+ }
322
+ }
323
+ },
324
+
325
+ // Probing model colors
326
+ probingColors: {
327
+ 'Qwen2.5-32B': '#4A90D9',
328
+ 'Qwen2.5-72B': '#1A5FB4',
329
+ 'Qwen3-4B': '#57E389',
330
+ 'Qwen3-30B-A3B': '#26A269',
331
+ 'Qwen3-Next-80B-A3B': '#9141AC'
332
+ }
333
+ };
index.html CHANGED
@@ -1,19 +1,163 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <meta name="description" content="DDR-Bench: A Deep Data Research Agent Benchmark for LLMs">
7
+ <title>DDR-Bench | Deep Data Research Benchmark</title>
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
11
+ <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
12
+ <link rel="stylesheet" href="styles.css">
13
+ </head>
14
+ <body>
15
+ <!-- Hero Section -->
16
+ <header class="hero">
17
+ <div class="hero-content">
18
+ <div class="badge">🔬 Research Benchmark</div>
19
+ <h1>DDR-Bench</h1>
20
+ <p class="subtitle">Deep Data Research Agent Benchmark for Large Language Models</p>
21
+ <p class="description">
22
+ A comprehensive evaluation framework measuring AI agents' ability to conduct deep, iterative data exploration across medical records (MIMIC), financial filings (10-K), and behavioral data (GLOBEM).
23
+ </p>
24
+ <div class="stats-row">
25
+ <div class="stat-item">
26
+ <span class="stat-value">22+</span>
27
+ <span class="stat-label">Models Evaluated</span>
28
+ </div>
29
+ <div class="stat-item">
30
+ <span class="stat-value">3</span>
31
+ <span class="stat-label">Diverse Datasets</span>
32
+ </div>
33
+ <div class="stat-item">
34
+ <span class="stat-value">5</span>
35
+ <span class="stat-label">Analysis Dimensions</span>
36
+ </div>
37
+ </div>
38
+ </div>
39
+ </header>
40
+
41
+ <!-- Navigation -->
42
+ <nav class="nav-tabs">
43
+ <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
44
+ <button class="nav-tab" data-section="entropy">🔀 Entropy Analysis</button>
45
+ <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
46
+ <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
47
+ <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
48
+ </nav>
49
+
50
+ <!-- Main Content -->
51
+ <main class="content">
52
+ <!-- Scaling Analysis Section -->
53
+ <section id="scaling" class="section active">
54
+ <div class="section-header">
55
+ <h2>Scaling Analysis</h2>
56
+ <p>Explore how model performance scales with interaction turns, token usage, and inference cost across datasets.</p>
57
+ </div>
58
+ <div class="controls">
59
+ <label>
60
+ <span>Dataset:</span>
61
+ <select id="scaling-dataset">
62
+ <option value="mimic">MIMIC</option>
63
+ <option value="10k">10-K</option>
64
+ <option value="globem">GLOBEM</option>
65
+ </select>
66
+ </label>
67
+ <label>
68
+ <span>Scaling Dimension:</span>
69
+ <select id="scaling-dimension">
70
+ <option value="turn">Interaction Turns</option>
71
+ <option value="token">Token Usage</option>
72
+ <option value="cost">Inference Cost</option>
73
+ </select>
74
+ </label>
75
+ </div>
76
+ <div id="scaling-chart" class="chart-container"></div>
77
+ </section>
78
+
79
+ <!-- Entropy Analysis Section -->
80
+ <section id="entropy" class="section">
81
+ <div class="section-header">
82
+ <h2>Entropy vs Coverage Analysis</h2>
83
+ <p>Visualize the relationship between access entropy (exploration uniformity) and field coverage for each model.</p>
84
+ </div>
85
+ <div class="controls">
86
+ <label>
87
+ <span>Dataset:</span>
88
+ <select id="entropy-dataset">
89
+ <option value="mimic">MIMIC</option>
90
+ <option value="10k">10-K</option>
91
+ <option value="globem">GLOBEM</option>
92
+ </select>
93
+ </label>
94
+ </div>
95
+ <div id="entropy-chart" class="chart-container"></div>
96
+ </section>
97
+
98
+ <!-- Ranking Comparison Section -->
99
+ <section id="ranking" class="section">
100
+ <div class="section-header">
101
+ <h2>Novelty vs Accuracy Ranking</h2>
102
+ <p>Compare model rankings based on novelty (Bradley-Terry pairwise ranking) against traditional accuracy ranking.</p>
103
+ </div>
104
+ <div class="controls">
105
+ <label>
106
+ <span>Dataset:</span>
107
+ <select id="ranking-dataset">
108
+ <option value="MIMIC">MIMIC</option>
109
+ <option value="10K">10-K</option>
110
+ <option value="GLOBEM">GLOBEM</option>
111
+ </select>
112
+ </label>
113
+ </div>
114
+ <div id="ranking-chart" class="chart-container"></div>
115
+ </section>
116
+
117
+ <!-- Turn Distribution Section -->
118
+ <section id="turn" class="section">
119
+ <div class="section-header">
120
+ <h2>Turn Count Distribution</h2>
121
+ <p>Analyze the distribution of interaction turns across different models and datasets.</p>
122
+ </div>
123
+ <div class="controls">
124
+ <label>
125
+ <span>Dataset:</span>
126
+ <select id="turn-dataset">
127
+ <option value="mimic">MIMIC</option>
128
+ <option value="10k">10-K</option>
129
+ <option value="globem">GLOBEM</option>
130
+ </select>
131
+ </label>
132
+ </div>
133
+ <div id="turn-chart" class="chart-container tall"></div>
134
+ </section>
135
+
136
+ <!-- Probing Results Section -->
137
+ <section id="probing" class="section">
138
+ <div class="section-header">
139
+ <h2>FINISH Token Probing</h2>
140
+ <p>Analyze the average log probability of FINISH messages across conversation turns and progress.</p>
141
+ </div>
142
+ <div class="controls">
143
+ <label>
144
+ <span>View Mode:</span>
145
+ <select id="probing-mode">
146
+ <option value="byTurn">By Turn</option>
147
+ <option value="byProgress">By Progress (%)</option>
148
+ </select>
149
+ </label>
150
+ </div>
151
+ <div id="probing-chart" class="chart-container"></div>
152
+ </section>
153
+ </main>
154
+
155
+ <!-- Footer -->
156
+ <footer class="footer">
157
+ <p>DDR-Bench © 2026 | Deep Data Research Agent Benchmark</p>
158
+ </footer>
159
+
160
+ <script src="data.js"></script>
161
+ <script src="charts.js"></script>
162
+ </body>
163
  </html>
styles.css ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Root Variables */
2
+ :root {
3
+ --primary: #6366f1;
4
+ --primary-dark: #4f46e5;
5
+ --primary-light: #818cf8;
6
+ --secondary: #10b981;
7
+ --accent: #f59e0b;
8
+ --bg-dark: #0f172a;
9
+ --bg-card: #1e293b;
10
+ --bg-card-hover: #334155;
11
+ --text-primary: #f1f5f9;
12
+ --text-secondary: #94a3b8;
13
+ --text-muted: #64748b;
14
+ --border: #334155;
15
+ --shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3), 0 2px 4px -2px rgba(0, 0, 0, 0.2);
16
+ --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.4), 0 4px 6px -4px rgba(0, 0, 0, 0.3);
17
+ --gradient-primary: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
18
+ --gradient-hero: linear-gradient(135deg, #1e293b 0%, #0f172a 50%, #1a1f3c 100%);
19
+ }
20
+
21
+ /* Reset & Base */
22
+ *, *::before, *::after {
23
+ box-sizing: border-box;
24
+ margin: 0;
25
+ padding: 0;
26
+ }
27
+
28
+ html {
29
+ scroll-behavior: smooth;
30
+ }
31
+
32
+ body {
33
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
34
+ background-color: var(--bg-dark);
35
+ color: var(--text-primary);
36
+ line-height: 1.6;
37
+ min-height: 100vh;
38
+ }
39
+
40
+ /* Hero Section */
41
+ .hero {
42
+ background: var(--gradient-hero);
43
+ padding: 4rem 2rem 3rem;
44
+ text-align: center;
45
+ position: relative;
46
+ overflow: hidden;
47
+ }
48
+
49
+ .hero::before {
50
+ content: '';
51
+ position: absolute;
52
+ top: 0;
53
+ left: 0;
54
+ right: 0;
55
+ bottom: 0;
56
+ background:
57
+ radial-gradient(circle at 20% 50%, rgba(99, 102, 241, 0.15) 0%, transparent 50%),
58
+ radial-gradient(circle at 80% 50%, rgba(139, 92, 246, 0.1) 0%, transparent 50%);
59
+ pointer-events: none;
60
+ }
61
+
62
+ .hero-content {
63
+ max-width: 900px;
64
+ margin: 0 auto;
65
+ position: relative;
66
+ z-index: 1;
67
+ }
68
+
69
+ .badge {
70
+ display: inline-block;
71
+ background: rgba(99, 102, 241, 0.2);
72
+ color: var(--primary-light);
73
+ padding: 0.5rem 1rem;
74
+ border-radius: 2rem;
75
+ font-size: 0.85rem;
76
+ font-weight: 500;
77
+ margin-bottom: 1rem;
78
+ border: 1px solid rgba(99, 102, 241, 0.3);
79
+ }
80
+
81
+ .hero h1 {
82
+ font-size: 3.5rem;
83
+ font-weight: 700;
84
+ background: linear-gradient(135deg, #f1f5f9 0%, #818cf8 100%);
85
+ -webkit-background-clip: text;
86
+ -webkit-text-fill-color: transparent;
87
+ background-clip: text;
88
+ margin-bottom: 0.75rem;
89
+ letter-spacing: -0.02em;
90
+ }
91
+
92
+ .subtitle {
93
+ font-size: 1.35rem;
94
+ color: var(--text-secondary);
95
+ margin-bottom: 1rem;
96
+ font-weight: 400;
97
+ }
98
+
99
+ .description {
100
+ font-size: 1rem;
101
+ color: var(--text-muted);
102
+ max-width: 700px;
103
+ margin: 0 auto 2rem;
104
+ line-height: 1.7;
105
+ }
106
+
107
+ .stats-row {
108
+ display: flex;
109
+ justify-content: center;
110
+ gap: 3rem;
111
+ margin-top: 2rem;
112
+ }
113
+
114
+ .stat-item {
115
+ text-align: center;
116
+ }
117
+
118
+ .stat-value {
119
+ display: block;
120
+ font-size: 2.5rem;
121
+ font-weight: 700;
122
+ color: var(--primary-light);
123
+ }
124
+
125
+ .stat-label {
126
+ font-size: 0.9rem;
127
+ color: var(--text-muted);
128
+ }
129
+
130
+ /* Navigation Tabs */
131
+ .nav-tabs {
132
+ display: flex;
133
+ justify-content: center;
134
+ gap: 0.5rem;
135
+ padding: 1rem 2rem;
136
+ background: var(--bg-card);
137
+ border-bottom: 1px solid var(--border);
138
+ position: sticky;
139
+ top: 0;
140
+ z-index: 100;
141
+ flex-wrap: wrap;
142
+ }
143
+
144
+ .nav-tab {
145
+ padding: 0.75rem 1.5rem;
146
+ background: transparent;
147
+ border: 1px solid transparent;
148
+ border-radius: 0.5rem;
149
+ color: var(--text-secondary);
150
+ font-size: 0.95rem;
151
+ font-weight: 500;
152
+ cursor: pointer;
153
+ transition: all 0.2s ease;
154
+ font-family: inherit;
155
+ }
156
+
157
+ .nav-tab:hover {
158
+ color: var(--text-primary);
159
+ background: var(--bg-card-hover);
160
+ }
161
+
162
+ .nav-tab.active {
163
+ color: var(--primary-light);
164
+ background: rgba(99, 102, 241, 0.15);
165
+ border-color: rgba(99, 102, 241, 0.3);
166
+ }
167
+
168
+ /* Main Content */
169
+ .content {
170
+ max-width: 1400px;
171
+ margin: 0 auto;
172
+ padding: 2rem;
173
+ }
174
+
175
+ /* Sections */
176
+ .section {
177
+ display: none;
178
+ animation: fadeIn 0.3s ease;
179
+ }
180
+
181
+ .section.active {
182
+ display: block;
183
+ }
184
+
185
+ @keyframes fadeIn {
186
+ from { opacity: 0; transform: translateY(10px); }
187
+ to { opacity: 1; transform: translateY(0); }
188
+ }
189
+
190
+ .section-header {
191
+ margin-bottom: 2rem;
192
+ text-align: center;
193
+ }
194
+
195
+ .section-header h2 {
196
+ font-size: 1.75rem;
197
+ font-weight: 600;
198
+ color: var(--text-primary);
199
+ margin-bottom: 0.5rem;
200
+ }
201
+
202
+ .section-header p { /* Muted descriptive text inside .section-header. */
203
+ color: var(--text-muted);
204
+ font-size: 1rem;
205
+ }
206
+
207
+ /* Controls */
208
+ .controls { /* Row of controls: centered flex row that wraps; stacked vertically by the max-width: 768px media query later in this file. */
209
+ display: flex;
210
+ justify-content: center;
211
+ gap: 1.5rem;
212
+ margin-bottom: 1.5rem;
213
+ flex-wrap: wrap;
214
+ }
215
+
216
+ .controls label { /* Each label is itself a flex row, so its children sit side by side. */
217
+ display: flex;
218
+ align-items: center; /* vertically centers the children on the row */
219
+ gap: 0.75rem;
220
+ }
221
+
222
+ .controls label span { /* Caption text inside a control label. */
223
+ color: var(--text-secondary);
224
+ font-size: 0.9rem;
225
+ font-weight: 500;
226
+ }
227
+
228
+ .controls select { /* Themed dropdown; :hover and :focus variants follow. */
229
+ padding: 0.6rem 1rem;
230
+ background: var(--bg-card);
231
+ border: 1px solid var(--border);
232
+ border-radius: 0.5rem;
233
+ color: var(--text-primary);
234
+ font-size: 0.9rem;
235
+ cursor: pointer;
236
+ transition: all 0.2s ease; /* animates the border-color / box-shadow changes applied on hover and focus */
237
+ font-family: inherit; /* use the parent's font instead of the form-control default */
238
+ min-width: 160px; /* the max-width: 768px media query later in this file additionally sets width: 100% */
239
+ }
240
+
241
+ .controls select:hover { /* Hover feedback: border switches to the primary color. */
242
+ border-color: var(--primary);
243
+ }
244
+
245
+ .controls select:focus { /* Focus state: primary-colored border plus a soft 3px ring drawn with box-shadow. */
246
+ outline: 2px solid transparent; /* invisible in normal rendering; kept instead of `none` so forced-colors / high-contrast modes, which drop box-shadow, still paint a focus indicator */
247
+ border-color: var(--primary);
248
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2); /* zero offset and blur with 3px spread = an even ring around the control */
249
+ }
250
+
251
+ /* Chart Container */
252
+ .chart-container { /* Card that wraps a chart: rounded, bordered, padded, with a shadow. */
253
+ background: var(--bg-card);
254
+ border-radius: 1rem;
255
+ padding: 1.5rem;
256
+ box-shadow: var(--shadow);
257
+ min-height: 500px; /* raised to 700px by .chart-container.tall */
258
+ border: 1px solid var(--border);
259
+ }
260
+
261
+ .chart-container.tall { /* Taller variant: overrides the 500px minimum set on .chart-container. */
262
+ min-height: 700px;
263
+ }
264
+
265
+ /* Footer */
266
+ .footer { /* Page footer: centered muted text, separated from the content by a top border and margin. */
267
+ text-align: center;
268
+ padding: 2rem;
269
+ color: var(--text-muted);
270
+ font-size: 0.9rem;
271
+ border-top: 1px solid var(--border);
272
+ margin-top: 3rem;
273
+ }
274
+
275
+ /* Responsive */
276
+ @media (max-width: 768px) { /* Narrow-viewport overrides (768px wide or less): smaller type, tighter spacing, stacked controls. */
277
+ .hero { /* base .hero rule is outside this part of the file */
278
+ padding: 3rem 1.5rem 2rem;
279
+ }
280
+
281
+ .hero h1 {
282
+ font-size: 2.5rem;
283
+ }
284
+
285
+ .subtitle {
286
+ font-size: 1.1rem;
287
+ }
288
+
289
+ .stats-row {
290
+ gap: 1.5rem;
291
+ }
292
+
293
+ .stat-value {
294
+ font-size: 2rem; /* down from 2.5rem */
295
+ }
296
+
297
+ .nav-tabs {
298
+ padding: 0.75rem 1rem; /* down from 1rem 2rem */
299
+ gap: 0.25rem; /* down from 0.5rem */
300
+ }
301
+
302
+ .nav-tab {
303
+ padding: 0.5rem 1rem; /* down from 0.75rem 1.5rem */
304
+ font-size: 0.85rem; /* down from 0.95rem */
305
+ }
306
+
307
+ .content {
308
+ padding: 1rem; /* down from 2rem */
309
+ }
310
+
311
+ .controls { /* stack the controls vertically and let each one span the full width */
312
+ flex-direction: column;
313
+ align-items: stretch;
314
+ }
315
+
316
+ .controls label { /* inside each label, stack the children vertically, aligned to the start edge */
317
+ flex-direction: column;
318
+ align-items: flex-start;
319
+ }
320
+
321
+ .controls select {
322
+ width: 100%; /* fill the label's width */
323
+ }
324
+ }
325
+
326
+ /* Plotly overrides for dark theme */
327
+ .js-plotly-plot .plotly .modebar { /* Dark, mostly opaque background for the Plotly mode bar. */
328
+ background: rgba(30, 41, 59, 0.9) !important; /* same RGB as the chart backgrounds configured in charts.js; !important presumably needed to beat Plotly's own styling — confirm */
329
+ }
330
+
331
+ .js-plotly-plot .plotly .modebar-btn path { /* Recolors the SVG paths inside mode bar buttons (resting state). */
332
+ fill: var(--text-secondary) !important; /* !important presumably needed to beat Plotly's own fill — confirm */
333
+ }
334
+
335
+ .js-plotly-plot .plotly .modebar-btn:hover path { /* Brighter icon fill while a mode bar button is hovered. */
336
+ fill: var(--text-primary) !important;
337
+ }