thinkwee committed on
Commit
daa6be0
·
1 Parent(s): f17ef98
Files changed (3) hide show
  1. charts.js +346 -470
  2. index.html +73 -73
  3. styles.css +135 -93
charts.js CHANGED
@@ -1,5 +1,5 @@
1
- // DDR-Bench Interactive Charts
2
- // Using Plotly.js for interactive visualizations
3
 
4
  // Common Plotly layout settings for dark theme
5
  const darkLayout = {
@@ -7,362 +7,298 @@ const darkLayout = {
7
  plot_bgcolor: 'rgba(30, 41, 59, 0)',
8
  font: {
9
  family: 'Inter, sans-serif',
10
- color: '#e2e8f0'
 
11
  },
12
  xaxis: {
13
- gridcolor: 'rgba(148, 163, 184, 0.15)',
14
- linecolor: 'rgba(148, 163, 184, 0.3)',
15
- tickfont: { color: '#94a3b8' },
16
- title: { font: { color: '#e2e8f0' } }
17
  },
18
  yaxis: {
19
- gridcolor: 'rgba(148, 163, 184, 0.15)',
20
- linecolor: 'rgba(148, 163, 184, 0.3)',
21
- tickfont: { color: '#94a3b8' },
22
- title: { font: { color: '#e2e8f0' } }
23
  },
24
  legend: {
25
- bgcolor: 'rgba(30, 41, 59, 0.8)',
26
- bordercolor: 'rgba(148, 163, 184, 0.3)',
27
  borderwidth: 1,
28
- font: { color: '#e2e8f0' }
 
 
 
 
29
  },
30
  hoverlabel: {
31
  bgcolor: '#1e293b',
32
  bordercolor: '#6366f1',
33
- font: { color: '#e2e8f0' }
34
  },
35
- margin: { t: 40, r: 20, b: 60, l: 70 }
36
  };
37
 
38
  const plotlyConfig = {
39
  displayModeBar: true,
40
  responsive: true,
41
- modeBarButtonsToRemove: ['lasso2d', 'select2d'],
42
  displaylogo: false
43
  };
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  // Tab Navigation
46
  document.querySelectorAll('.nav-tab').forEach(tab => {
47
  tab.addEventListener('click', () => {
48
- // Update active tab
49
  document.querySelectorAll('.nav-tab').forEach(t => t.classList.remove('active'));
50
  tab.classList.add('active');
51
 
52
- // Show corresponding section
53
  const sectionId = tab.dataset.section;
54
  document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
55
  document.getElementById(sectionId).classList.add('active');
56
 
57
  // Resize plots on tab change
58
- window.dispatchEvent(new Event('resize'));
59
  });
60
  });
61
 
62
  // ============================================================================
63
- // SCALING ANALYSIS CHART
64
  // ============================================================================
65
- function renderScalingChart() {
66
- const dataset = document.getElementById('scaling-dataset').value;
67
- const dimension = document.getElementById('scaling-dimension').value;
68
-
69
- const data = DDR_DATA.scaling[dataset];
70
- if (!data) return;
71
-
72
- const traces = [];
73
- const models = Object.keys(data);
74
-
75
- models.forEach(model => {
76
- const modelData = data[model];
77
- let xValues, xLabel;
78
-
79
- switch (dimension) {
80
- case 'turn':
81
- xValues = modelData.turns;
82
- xLabel = 'Number of Interaction Turns';
83
- break;
84
- case 'token':
85
- xValues = modelData.tokens;
86
- xLabel = 'Total Tokens Used';
87
- break;
88
- case 'cost':
89
- xValues = modelData.costs;
90
- xLabel = 'Inference Cost ($)';
91
- break;
92
- }
93
 
94
- traces.push({
95
- x: xValues,
96
- y: modelData.accuracy,
97
- mode: 'lines+markers',
98
- name: model,
99
- line: {
100
- color: DDR_DATA.modelColors[model] || '#888',
101
- width: 2.5
102
- },
103
- marker: {
104
- size: 6,
105
- color: DDR_DATA.modelColors[model] || '#888'
106
- },
107
- hovertemplate: `<b>${model}</b><br>` +
108
- `${dimension === 'cost' ? 'Cost: $' : dimension === 'token' ? 'Tokens: ' : 'Turn: '}%{x}<br>` +
109
- `Accuracy: %{y:.1f}%<extra></extra>`
 
 
 
 
 
 
 
 
 
110
  });
111
- });
112
 
113
- const layout = {
114
- ...darkLayout,
115
- title: {
116
- text: `${dataset.toUpperCase()} - ${dimension.charAt(0).toUpperCase() + dimension.slice(1)} Scaling`,
117
- font: { size: 18, color: '#f1f5f9' }
118
- },
119
- xaxis: {
120
- ...darkLayout.xaxis,
121
- title: {
122
- text: dimension === 'turn' ? 'Number of Interaction Turns' :
123
- dimension === 'token' ? 'Total Tokens Used' : 'Inference Cost ($)',
124
- font: { size: 14, color: '#e2e8f0' }
125
  },
126
- type: dimension === 'cost' ? 'log' : 'linear'
127
- },
128
- yaxis: {
129
- ...darkLayout.yaxis,
130
- title: { text: 'Accuracy (%)', font: { size: 14, color: '#e2e8f0' } }
131
- },
132
- showlegend: true,
133
- legend: {
134
- ...darkLayout.legend,
135
- orientation: 'h',
136
- y: -0.2,
137
- x: 0.5,
138
- xanchor: 'center'
139
- }
140
- };
141
 
142
- Plotly.newPlot('scaling-chart', traces, layout, plotlyConfig);
 
143
  }
144
 
145
- // Event listeners for scaling controls
146
- document.getElementById('scaling-dataset').addEventListener('change', renderScalingChart);
147
- document.getElementById('scaling-dimension').addEventListener('change', renderScalingChart);
148
-
149
- // ============================================================================
150
- // ENTROPY ANALYSIS CHART
151
- // ============================================================================
152
- function renderEntropyChart() {
153
- const dataset = document.getElementById('entropy-dataset').value;
154
- const data = DDR_DATA.entropy[dataset];
155
- if (!data) return;
156
 
157
- const traces = [];
158
- const models = Object.keys(data);
 
159
 
160
- models.forEach(model => {
161
- const modelData = data[model];
 
162
 
163
- // Normalize accuracy for marker size (10-30 range)
164
- const sizes = modelData.accuracy.map(a => 8 + (a / Math.max(...modelData.accuracy)) * 15);
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- // Normalize accuracy for opacity (0.4-1.0 range)
167
- const maxAcc = Math.max(...modelData.accuracy);
168
- const minAcc = Math.min(...modelData.accuracy);
169
- const opacities = modelData.accuracy.map(a => 0.4 + 0.6 * (a - minAcc) / (maxAcc - minAcc || 1));
170
 
171
- traces.push({
172
- x: modelData.entropy,
173
- y: modelData.coverage,
174
- mode: 'markers',
175
- name: model,
176
- marker: {
177
- size: sizes,
178
- color: DDR_DATA.modelColors[model] || '#888',
179
- opacity: opacities,
180
- line: {
181
- color: '#000',
182
- width: 0.5
183
  }
184
- },
185
- text: modelData.accuracy.map(a => `Accuracy: ${a}%`),
186
- hovertemplate: `<b>${model}</b><br>` +
187
- `Entropy: %{x:.2f}<br>` +
188
- `Coverage: %{y:.2f}<br>` +
189
- `%{text}<extra></extra>`
190
- });
191
- });
192
 
193
- const layout = {
194
- ...darkLayout,
195
- title: {
196
- text: `${dataset.toUpperCase()} - Entropy vs Coverage (Marker Size/Opacity = Accuracy)`,
197
- font: { size: 18, color: '#f1f5f9' }
198
- },
199
- xaxis: {
200
- ...darkLayout.xaxis,
201
- title: { text: 'Normalized Access Entropy', font: { size: 14, color: '#e2e8f0' } },
202
- range: [0.6, 1.0]
203
- },
204
- yaxis: {
205
- ...darkLayout.yaxis,
206
- title: { text: 'Coverage', font: { size: 14, color: '#e2e8f0' } }
207
- },
208
- showlegend: true,
209
- legend: {
210
- ...darkLayout.legend,
211
- orientation: 'h',
212
- y: -0.2,
213
- x: 0.5,
214
- xanchor: 'center'
215
- }
216
- };
217
 
218
- Plotly.newPlot('entropy-chart', traces, layout, plotlyConfig);
 
 
 
 
 
219
  }
220
 
221
- document.getElementById('entropy-dataset').addEventListener('change', renderEntropyChart);
 
 
 
 
 
 
 
 
 
 
222
 
223
  // ============================================================================
224
- // RANKING COMPARISON CHART
225
  // ============================================================================
226
- function renderRankingChart() {
227
- const dataset = document.getElementById('ranking-dataset').value;
228
- const data = DDR_DATA.ranking[dataset];
229
- if (!data) return;
230
-
231
- // Take top 22 models
232
- const models = data.slice(0, 22);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- // Create traces for novelty rank (circles) and accuracy rank (diamonds)
235
- const traces = [];
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- // Connection lines
238
- models.forEach((m, i) => {
239
  traces.push({
240
- x: [m.bt_rank, m.acc_rank],
241
- y: [i, i],
242
- mode: 'lines',
243
- line: {
244
- color: 'rgba(148, 163, 184, 0.3)',
245
- width: 1,
246
- dash: 'dash'
 
 
247
  },
248
- showlegend: false,
249
- hoverinfo: 'skip'
250
  });
251
- });
252
 
253
- // Novelty rank points (circles)
254
- traces.push({
255
- x: models.map(m => m.bt_rank),
256
- y: models.map((m, i) => i),
257
- mode: 'markers',
258
- name: 'Novelty Rank',
259
- marker: {
260
- size: 12,
261
- symbol: 'circle',
262
- color: models.map(m => m.is_proprietary ? '#6A0DAD' : '#228B22'),
263
- line: { color: '#000', width: 1 }
264
- },
265
- text: models.map(m => `${m.model}<br>Novelty Rank: ${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
266
- hovertemplate: '%{text}<extra></extra>'
267
- });
 
 
 
 
 
 
 
268
 
269
- // Accuracy rank points (diamonds)
270
- traces.push({
271
- x: models.map(m => m.acc_rank),
272
- y: models.map((m, i) => i),
273
- mode: 'markers',
274
- name: 'Accuracy Rank',
275
- marker: {
276
- size: 14,
277
- symbol: 'diamond-open',
278
- color: models.map(m => m.is_proprietary ? '#6A0DAD' : '#228B22'),
279
- line: { width: 2 }
280
- },
281
- text: models.map(m => `${m.model}<br>Accuracy Rank: ${m.acc_rank}<br>Accuracy: ${m.accuracy}%`),
282
- hovertemplate: '%{text}<extra></extra>'
283
  });
284
-
285
- // Calculate correlation
286
- const btRanks = models.map(m => m.bt_rank);
287
- const accRanks = models.map(m => m.acc_rank);
288
- const correlation = calculateCorrelation(btRanks, accRanks);
289
-
290
- const layout = {
291
- ...darkLayout,
292
- title: {
293
- text: `${dataset} - Novelty vs Accuracy Ranking (ρ = ${correlation.toFixed(2)})`,
294
- font: { size: 18, color: '#f1f5f9' }
295
- },
296
- xaxis: {
297
- ...darkLayout.xaxis,
298
- title: { text: 'Rank', font: { size: 14, color: '#e2e8f0' } },
299
- range: [23, 0],
300
- tickmode: 'linear',
301
- dtick: 2
302
- },
303
- yaxis: {
304
- ...darkLayout.yaxis,
305
- tickmode: 'array',
306
- tickvals: models.map((_, i) => i),
307
- ticktext: models.map(m => m.model.replace(/-/g, ' ')),
308
- automargin: true
309
- },
310
- showlegend: true,
311
- legend: {
312
- ...darkLayout.legend,
313
- orientation: 'h',
314
- y: -0.15,
315
- x: 0.5,
316
- xanchor: 'center'
317
- },
318
- annotations: [
319
- {
320
- x: 0.02,
321
- y: 0.98,
322
- xref: 'paper',
323
- yref: 'paper',
324
- text: '🟣 Proprietary 🟢 Open-Source',
325
- showarrow: false,
326
- font: { size: 12, color: '#94a3b8' },
327
- bgcolor: 'rgba(30, 41, 59, 0.8)',
328
- borderpad: 5
329
- }
330
- ],
331
- margin: { ...darkLayout.margin, l: 180 }
332
- };
333
-
334
- Plotly.newPlot('ranking-chart', traces, layout, plotlyConfig);
335
  }
336
 
337
- function calculateCorrelation(x, y) {
338
- const n = x.length;
339
- const sumX = x.reduce((a, b) => a + b, 0);
340
- const sumY = y.reduce((a, b) => a + b, 0);
341
- const sumXY = x.reduce((acc, xi, i) => acc + xi * y[i], 0);
342
- const sumX2 = x.reduce((acc, xi) => acc + xi * xi, 0);
343
- const sumY2 = y.reduce((acc, yi) => acc + yi * yi, 0);
344
-
345
- const numerator = n * sumXY - sumX * sumY;
346
- const denominator = Math.sqrt((n * sumX2 - sumX * sumX) * (n * sumY2 - sumY * sumY));
347
-
348
- return denominator !== 0 ? numerator / denominator : 0;
349
- }
350
-
351
- document.getElementById('ranking-dataset').addEventListener('change', renderRankingChart);
352
-
353
  // ============================================================================
354
- // TURN DISTRIBUTION CHART (Ridgeline-like)
355
  // ============================================================================
356
- function renderTurnChart() {
357
- const dataset = document.getElementById('turn-dataset').value;
358
- const data = DDR_DATA.turn[dataset];
359
- if (!data) return;
360
-
361
- // Sort by median (descending)
362
- const sortedData = [...data].sort((a, b) => b.median - a.median);
363
-
364
- const traces = [];
365
- const binLabels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'];
366
 
367
  // Family colors
368
  const familyColors = {
@@ -384,116 +320,73 @@ function renderTurnChart() {
384
  return '#888';
385
  }
386
 
387
- sortedData.forEach((model, i) => {
388
- const color = getModelColor(model.model);
 
389
 
390
- traces.push({
391
- x: model.distribution,
392
- y: binLabels,
393
- orientation: 'h',
394
- name: `${model.model} (med=${model.median})`,
395
- type: 'bar',
396
- marker: {
397
- color: color,
398
- opacity: 0.7
399
- },
400
- xaxis: `x${i + 1}`,
401
- yaxis: 'y',
402
- hovertemplate: `<b>${model.model}</b><br>` +
403
- `Turns: %{y}<br>` +
404
- `Sessions: %{x}%<extra></extra>`
405
- });
406
- });
407
 
408
- // Create subplot annotations
409
- const annotations = sortedData.map((model, i) => ({
410
- x: 0.5,
411
- y: i,
412
- xref: 'paper',
413
- yref: 'paper',
414
- text: `<b>${model.model}</b> (median: ${model.median})`,
415
- showarrow: false,
416
- font: { size: 11, color: '#e2e8f0' },
417
- xanchor: 'center'
418
- }));
419
-
420
- // Use a violin-like grouped bar approach instead
421
- const violinTraces = sortedData.map((model, i) => {
422
- const color = getModelColor(model.model);
423
- const cumsum = model.distribution.reduce((acc, v, idx) => {
424
- acc.push((acc[idx - 1] || 0) + v);
425
- return acc;
426
- }, []);
427
-
428
- // Create x values from 0 to 100
429
- const xVals = Array.from({ length: 100 }, (_, k) => k);
430
- const yVals = xVals.map(x => {
431
- const binIdx = Math.min(Math.floor(x / 10), 9);
432
- return model.distribution[binIdx] / 10; // Scale down
433
  });
434
 
435
- return {
436
- x: xVals,
437
- y: yVals.map(v => v + i * 12), // Stack vertically
438
- fill: 'tozeroy',
439
- fillcolor: color + '80',
440
- line: { color: color, width: 1.5 },
441
- name: `${model.model} (med=${model.median})`,
442
- mode: 'lines',
443
- hovertemplate: `<b>${model.model}</b><br>` +
444
- `Median: ${model.median} turns<extra></extra>`
 
 
 
 
445
  };
446
- });
447
-
448
- const layout = {
449
- ...darkLayout,
450
- title: {
451
- text: `${dataset.toUpperCase()} - Turn Count Distribution`,
452
- font: { size: 18, color: '#f1f5f9' }
453
- },
454
- xaxis: {
455
- ...darkLayout.xaxis,
456
- title: { text: 'Number of Turns', font: { size: 14, color: '#e2e8f0' } },
457
- range: [0, 100]
458
- },
459
- yaxis: {
460
- ...darkLayout.yaxis,
461
- title: { text: '', font: { size: 14, color: '#e2e8f0' } },
462
- tickmode: 'array',
463
- tickvals: sortedData.map((_, i) => i * 12 + 3),
464
- ticktext: sortedData.map(m => `${m.model} (${m.median})`),
465
- showgrid: false
466
- },
467
- showlegend: false,
468
- height: 700,
469
- margin: { ...darkLayout.margin, l: 200 }
470
- };
471
 
472
- Plotly.newPlot('turn-chart', violinTraces, layout, plotlyConfig);
 
473
  }
474
 
475
- document.getElementById('turn-dataset').addEventListener('change', renderTurnChart);
476
-
477
  // ============================================================================
478
- // PROBING RESULTS CHART
479
  // ============================================================================
480
- function renderProbingChart() {
481
- const mode = document.getElementById('probing-mode').value;
482
- const scenarios = ['mimic', 'globem', '10k'];
483
- const scenarioTitles = { mimic: 'MIMIC', globem: 'GLOBEM', '10k': '10-K' };
484
 
485
- const data = DDR_DATA.probing[mode];
486
- if (!data) return;
 
487
 
488
- const traces = [];
489
- const models = Object.keys(data.mimic);
 
490
 
491
- // Create subplots for each scenario
492
- scenarios.forEach((scenario, scIdx) => {
493
- const scenarioData = data[scenario];
494
 
495
  models.forEach(model => {
496
- const modelData = scenarioData[model];
497
  const xKey = mode === 'byTurn' ? 'turns' : 'progress';
498
  const xLabel = mode === 'byTurn' ? 'Turn' : 'Progress (%)';
499
 
@@ -503,111 +396,94 @@ function renderProbingChart() {
503
  y: modelData.logprob,
504
  mode: 'lines+markers',
505
  name: model,
506
- legendgroup: model,
507
- showlegend: scIdx === 0,
508
  line: {
509
- color: DDR_DATA.probingColors[model],
510
  width: 2
511
  },
512
  marker: {
513
- size: 5,
514
- color: DDR_DATA.probingColors[model]
515
  },
516
- xaxis: `x${scIdx + 1}`,
517
- yaxis: `y${scIdx + 1}`,
518
- hovertemplate: `<b>${model}</b><br>` +
519
- `${xLabel}: %{x}<br>` +
520
- `Log Prob: %{y:.2f}<extra></extra>`
521
  });
522
 
523
- // Error band (SEM)
524
- const upper = modelData.logprob.map((v, i) => v + modelData.sem[i]);
525
- const lower = modelData.logprob.map((v, i) => v - modelData.sem[i]);
526
-
527
- traces.push({
528
- x: [...modelData[xKey], ...modelData[xKey].slice().reverse()],
529
- y: [...upper, ...lower.slice().reverse()],
530
- fill: 'toself',
531
- fillcolor: DDR_DATA.probingColors[model] + '30',
532
- line: { width: 0 },
533
- showlegend: false,
534
- legendgroup: model,
535
- xaxis: `x${scIdx + 1}`,
536
- yaxis: `y${scIdx + 1}`,
537
- hoverinfo: 'skip'
538
- });
539
  });
540
- });
541
 
542
- const layout = {
543
- paper_bgcolor: 'rgba(30, 41, 59, 0)',
544
- plot_bgcolor: 'rgba(30, 41, 59, 0)',
545
- font: { family: 'Inter, sans-serif', color: '#e2e8f0' },
546
- title: {
547
- text: `FINISH Token Avg Log Probability ${mode === 'byTurn' ? 'by Turn' : 'by Progress'}`,
548
- font: { size: 18, color: '#f1f5f9' }
549
- },
550
- grid: { rows: 1, columns: 3, pattern: 'independent' },
551
- annotations: scenarios.map((sc, i) => ({
552
- text: scenarioTitles[sc],
553
- font: { size: 14, color: '#e2e8f0' },
554
- showarrow: false,
555
- x: (i + 0.5) / 3,
556
- y: 1.08,
557
- xref: 'paper',
558
- yref: 'paper'
559
- })),
560
- showlegend: true,
561
- legend: {
562
- orientation: 'h',
563
- y: -0.15,
564
- x: 0.5,
565
- xanchor: 'center',
566
- bgcolor: 'rgba(30, 41, 59, 0.8)',
567
- font: { color: '#e2e8f0' }
568
- },
569
- margin: { t: 80, r: 20, b: 100, l: 60 }
570
- };
571
-
572
- // Add axis configs for each subplot
573
- scenarios.forEach((sc, i) => {
574
- const xKey = `xaxis${i === 0 ? '' : i + 1}`;
575
- const yKey = `yaxis${i === 0 ? '' : i + 1}`;
576
-
577
- layout[xKey] = {
578
- title: { text: mode === 'byTurn' ? 'Turn' : 'Progress (%)', font: { size: 12 } },
579
- gridcolor: 'rgba(148, 163, 184, 0.15)',
580
- tickfont: { color: '#94a3b8' },
581
- domain: [i / 3 + 0.02, (i + 1) / 3 - 0.02]
582
- };
583
- layout[yKey] = {
584
- title: i === 0 ? { text: 'Avg Log Probability', font: { size: 12 } } : {},
585
- gridcolor: 'rgba(148, 163, 184, 0.15)',
586
- tickfont: { color: '#94a3b8' }
587
  };
588
- });
589
 
590
- Plotly.newPlot('probing-chart', traces, layout, plotlyConfig);
 
591
  }
592
 
593
- document.getElementById('probing-mode').addEventListener('change', renderProbingChart);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
 
595
  // ============================================================================
596
  // INITIALIZE ALL CHARTS
597
  // ============================================================================
598
  document.addEventListener('DOMContentLoaded', () => {
599
- renderScalingChart();
600
- renderEntropyChart();
601
- renderRankingChart();
602
- renderTurnChart();
603
- renderProbingChart();
604
  });
605
 
606
  // Handle window resize
 
607
  window.addEventListener('resize', () => {
608
- Plotly.Plots.resize('scaling-chart');
609
- Plotly.Plots.resize('entropy-chart');
610
- Plotly.Plots.resize('ranking-chart');
611
- Plotly.Plots.resize('turn-chart');
612
- Plotly.Plots.resize('probing-chart');
 
 
 
 
613
  });
 
1
+ // DDR-Bench Interactive Charts with Smooth Animations
2
+ // Using Plotly.js with animate for smooth transitions
3
 
4
  // Common Plotly layout settings for dark theme
5
  const darkLayout = {
 
7
  plot_bgcolor: 'rgba(30, 41, 59, 0)',
8
  font: {
9
  family: 'Inter, sans-serif',
10
+ color: '#e2e8f0',
11
+ size: 11
12
  },
13
  xaxis: {
14
+ gridcolor: 'rgba(148, 163, 184, 0.12)',
15
+ linecolor: 'rgba(148, 163, 184, 0.2)',
16
+ tickfont: { color: '#94a3b8', size: 10 },
17
+ title: { font: { color: '#e2e8f0', size: 11 } }
18
  },
19
  yaxis: {
20
+ gridcolor: 'rgba(148, 163, 184, 0.12)',
21
+ linecolor: 'rgba(148, 163, 184, 0.2)',
22
+ tickfont: { color: '#94a3b8', size: 10 },
23
+ title: { font: { color: '#e2e8f0', size: 11 } }
24
  },
25
  legend: {
26
+ bgcolor: 'rgba(30, 41, 59, 0.9)',
27
+ bordercolor: 'rgba(148, 163, 184, 0.2)',
28
  borderwidth: 1,
29
+ font: { color: '#e2e8f0', size: 10 },
30
+ orientation: 'h',
31
+ y: -0.2,
32
+ x: 0.5,
33
+ xanchor: 'center'
34
  },
35
  hoverlabel: {
36
  bgcolor: '#1e293b',
37
  bordercolor: '#6366f1',
38
+ font: { color: '#e2e8f0', size: 11 }
39
  },
40
+ margin: { t: 20, r: 15, b: 60, l: 50 }
41
  };
42
 
43
  const plotlyConfig = {
44
  displayModeBar: true,
45
  responsive: true,
46
+ modeBarButtonsToRemove: ['lasso2d', 'select2d', 'autoScale2d'],
47
  displaylogo: false
48
  };
49
 
50
+ // Animation settings for smooth transitions
51
+ const animationSettings = {
52
+ transition: {
53
+ duration: 500,
54
+ easing: 'cubic-in-out'
55
+ },
56
+ frame: {
57
+ duration: 500
58
+ }
59
+ };
60
+
61
+ // Current state
62
+ let currentScalingDim = 'turn';
63
+ let currentProbingMode = 'byTurn';
64
+
65
  // Tab Navigation
66
  document.querySelectorAll('.nav-tab').forEach(tab => {
67
  tab.addEventListener('click', () => {
 
68
  document.querySelectorAll('.nav-tab').forEach(t => t.classList.remove('active'));
69
  tab.classList.add('active');
70
 
 
71
  const sectionId = tab.dataset.section;
72
  document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
73
  document.getElementById(sectionId).classList.add('active');
74
 
75
  // Resize plots on tab change
76
+ setTimeout(() => window.dispatchEvent(new Event('resize')), 100);
77
  });
78
  });
79
 
80
  // ============================================================================
81
+ // SCALING ANALYSIS - 3 Charts with animated dimension switching
82
  // ============================================================================
83
+ function initScalingCharts() {
84
+ const scenarios = ['mimic', '10k', 'globem'];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ scenarios.forEach(scenario => {
87
+ const data = DDR_DATA.scaling[scenario];
88
+ if (!data) return;
89
+
90
+ const traces = [];
91
+ const models = Object.keys(data);
92
+
93
+ models.forEach(model => {
94
+ const modelData = data[model];
95
+
96
+ traces.push({
97
+ x: modelData.turns,
98
+ y: modelData.accuracy,
99
+ mode: 'lines+markers',
100
+ name: model,
101
+ line: {
102
+ color: DDR_DATA.modelColors[model] || '#888',
103
+ width: 2
104
+ },
105
+ marker: {
106
+ size: 5,
107
+ color: DDR_DATA.modelColors[model] || '#888'
108
+ },
109
+ hovertemplate: `<b>${model}</b><br>Turn: %{x}<br>Accuracy: %{y:.1f}%<extra></extra>`
110
+ });
111
  });
 
112
 
113
+ const layout = {
114
+ ...darkLayout,
115
+ xaxis: {
116
+ ...darkLayout.xaxis,
117
+ title: { text: 'Interaction Turns', font: { size: 11, color: '#e2e8f0' } }
 
 
 
 
 
 
 
118
  },
119
+ yaxis: {
120
+ ...darkLayout.yaxis,
121
+ title: { text: 'Accuracy (%)', font: { size: 11, color: '#e2e8f0' } }
122
+ },
123
+ showlegend: true
124
+ };
 
 
 
 
 
 
 
 
 
125
 
126
+ Plotly.newPlot(`scaling-${scenario}`, traces, layout, plotlyConfig);
127
+ });
128
  }
129
 
130
+ function updateScalingCharts(dimension) {
131
+ const scenarios = ['mimic', '10k', 'globem'];
132
+ const xLabels = {
133
+ 'turn': 'Interaction Turns',
134
+ 'token': 'Token Usage',
135
+ 'cost': 'Inference Cost ($)'
136
+ };
 
 
 
 
137
 
138
+ scenarios.forEach(scenario => {
139
+ const data = DDR_DATA.scaling[scenario];
140
+ if (!data) return;
141
 
142
+ const models = Object.keys(data);
143
+ const newX = [];
144
+ const newY = [];
145
 
146
+ models.forEach(model => {
147
+ const modelData = data[model];
148
+ let xValues;
149
+
150
+ switch (dimension) {
151
+ case 'turn':
152
+ xValues = modelData.turns;
153
+ break;
154
+ case 'token':
155
+ xValues = modelData.tokens;
156
+ break;
157
+ case 'cost':
158
+ xValues = modelData.costs;
159
+ break;
160
+ }
161
 
162
+ newX.push(xValues);
163
+ newY.push(modelData.accuracy);
164
+ });
 
165
 
166
+ // Animate the transition
167
+ Plotly.animate(`scaling-${scenario}`, {
168
+ data: newX.map((x, i) => ({ x, y: newY[i] })),
169
+ traces: models.map((_, i) => i),
170
+ layout: {
171
+ xaxis: {
172
+ title: { text: xLabels[dimension], font: { size: 11, color: '#e2e8f0' } },
173
+ type: dimension === 'cost' ? 'log' : 'linear'
 
 
 
 
174
  }
175
+ }
176
+ }, animationSettings);
 
 
 
 
 
 
177
 
178
+ // Update hover templates
179
+ const hoverLabels = {
180
+ 'turn': 'Turn',
181
+ 'token': 'Tokens',
182
+ 'cost': 'Cost: $'
183
+ };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
+ models.forEach((model, i) => {
186
+ Plotly.restyle(`scaling-${scenario}`, {
187
+ hovertemplate: `<b>${model}</b><br>${hoverLabels[dimension]}: %{x}<br>Accuracy: %{y:.1f}%<extra></extra>`
188
+ }, [i]);
189
+ });
190
+ });
191
  }
192
 
193
+ // Dimension toggle event listeners
194
+ document.querySelectorAll('.dim-btn:not(.probing-dim)').forEach(btn => {
195
+ btn.addEventListener('click', () => {
196
+ document.querySelectorAll('.dim-btn:not(.probing-dim)').forEach(b => b.classList.remove('active'));
197
+ btn.classList.add('active');
198
+
199
+ const dimension = btn.dataset.dim;
200
+ currentScalingDim = dimension;
201
+ updateScalingCharts(dimension);
202
+ });
203
+ });
204
 
205
  // ============================================================================
206
+ // RANKING COMPARISON - 3 Charts
207
  // ============================================================================
208
+ function initRankingCharts() {
209
+ const scenarios = [
210
+ { key: 'MIMIC', id: 'mimic' },
211
+ { key: '10K', id: '10k' },
212
+ { key: 'GLOBEM', id: 'globem' }
213
+ ];
214
+
215
+ scenarios.forEach(({ key, id }) => {
216
+ const data = DDR_DATA.ranking[key];
217
+ if (!data) return;
218
+
219
+ const models = data.slice(0, 15); // Top 15 models
220
+ const traces = [];
221
+
222
+ // Connection lines
223
+ models.forEach((m, i) => {
224
+ traces.push({
225
+ x: [m.bt_rank, m.acc_rank],
226
+ y: [i, i],
227
+ mode: 'lines',
228
+ line: {
229
+ color: 'rgba(148, 163, 184, 0.25)',
230
+ width: 1,
231
+ dash: 'dash'
232
+ },
233
+ showlegend: false,
234
+ hoverinfo: 'skip'
235
+ });
236
+ });
237
 
238
+ // Novelty rank points
239
+ traces.push({
240
+ x: models.map(m => m.bt_rank),
241
+ y: models.map((_, i) => i),
242
+ mode: 'markers',
243
+ name: 'Novelty',
244
+ marker: {
245
+ size: 10,
246
+ symbol: 'circle',
247
+ color: models.map(m => m.is_proprietary ? '#8B5CF6' : '#22C55E'),
248
+ line: { color: '#000', width: 0.5 }
249
+ },
250
+ text: models.map(m => `${m.model}<br>Novelty: #${m.bt_rank}<br>Win: ${m.win_rate}%`),
251
+ hovertemplate: '%{text}<extra></extra>'
252
+ });
253
 
254
+ // Accuracy rank points
 
255
  traces.push({
256
+ x: models.map(m => m.acc_rank),
257
+ y: models.map((_, i) => i),
258
+ mode: 'markers',
259
+ name: 'Accuracy',
260
+ marker: {
261
+ size: 12,
262
+ symbol: 'diamond-open',
263
+ color: models.map(m => m.is_proprietary ? '#8B5CF6' : '#22C55E'),
264
+ line: { width: 2 }
265
  },
266
+ text: models.map(m => `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`),
267
+ hovertemplate: '%{text}<extra></extra>'
268
  });
 
269
 
270
+ const layout = {
271
+ ...darkLayout,
272
+ xaxis: {
273
+ ...darkLayout.xaxis,
274
+ title: { text: 'Rank', font: { size: 11, color: '#e2e8f0' } },
275
+ range: [Math.max(...models.map(m => Math.max(m.bt_rank, m.acc_rank))) + 1, 0],
276
+ dtick: 2
277
+ },
278
+ yaxis: {
279
+ ...darkLayout.yaxis,
280
+ tickmode: 'array',
281
+ tickvals: models.map((_, i) => i),
282
+ ticktext: models.map(m => m.model.substring(0, 15)),
283
+ automargin: true
284
+ },
285
+ showlegend: true,
286
+ legend: {
287
+ ...darkLayout.legend,
288
+ y: -0.12
289
+ },
290
+ margin: { ...darkLayout.margin, l: 120 }
291
+ };
292
 
293
+ Plotly.newPlot(`ranking-${id}`, traces, layout, plotlyConfig);
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  }
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  // ============================================================================
298
+ // TURN DISTRIBUTION - 3 Charts (horizontal bar charts of median turns)
299
  // ============================================================================
300
+ function initTurnCharts() {
301
+ const scenarios = ['mimic', '10k', 'globem'];
 
 
 
 
 
 
 
 
302
 
303
  // Family colors
304
  const familyColors = {
 
320
  return '#888';
321
  }
322
 
323
+ scenarios.forEach(scenario => {
324
+ const data = DDR_DATA.turn[scenario];
325
+ if (!data) return;
326
 
327
+ const sortedData = [...data].sort((a, b) => a.median - b.median);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
+ const traces = sortedData.map((model, i) => {
330
+ const color = getModelColor(model.model);
331
+
332
+ return {
333
+ y: [model.model],
334
+ x: [model.median],
335
+ type: 'bar',
336
+ orientation: 'h',
337
+ name: model.model,
338
+ marker: {
339
+ color: color,
340
+ opacity: 0.8
341
+ },
342
+ text: [`${model.median}`],
343
+ textposition: 'outside',
344
+ textfont: { size: 9, color: '#94a3b8' },
345
+ hovertemplate: `<b>${model.model}</b><br>Median: ${model.median} turns<extra></extra>`,
346
+ showlegend: false
347
+ };
 
 
 
 
 
 
348
  });
349
 
350
+ const layout = {
351
+ ...darkLayout,
352
+ barmode: 'group',
353
+ xaxis: {
354
+ ...darkLayout.xaxis,
355
+ title: { text: 'Median Turns', font: { size: 11, color: '#e2e8f0' } },
356
+ range: [0, Math.max(...sortedData.map(d => d.median)) * 1.15]
357
+ },
358
+ yaxis: {
359
+ ...darkLayout.yaxis,
360
+ automargin: true,
361
+ tickfont: { size: 9 }
362
+ },
363
+ margin: { ...darkLayout.margin, l: 130 }
364
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
+ Plotly.newPlot(`turn-${scenario}`, traces, layout, plotlyConfig);
367
+ });
368
  }
369
 
 
 
370
  // ============================================================================
371
+ // PROBING RESULTS - 3 Charts with animated mode switching
372
  // ============================================================================
373
+ function initProbingCharts() {
374
+ renderProbingCharts('byTurn');
375
+ }
 
376
 
377
+ function renderProbingCharts(mode) {
378
+ const scenarios = ['mimic', 'globem', '10k'];
379
+ const scenarioIds = { 'mimic': 'mimic', 'globem': 'globem', '10k': '10k' };
380
 
381
+ scenarios.forEach(scenario => {
382
+ const data = DDR_DATA.probing[mode]?.[scenario];
383
+ if (!data) return;
384
 
385
+ const traces = [];
386
+ const models = Object.keys(data);
 
387
 
388
  models.forEach(model => {
389
+ const modelData = data[model];
390
  const xKey = mode === 'byTurn' ? 'turns' : 'progress';
391
  const xLabel = mode === 'byTurn' ? 'Turn' : 'Progress (%)';
392
 
 
396
  y: modelData.logprob,
397
  mode: 'lines+markers',
398
  name: model,
 
 
399
  line: {
400
+ color: DDR_DATA.probingColors[model] || '#888',
401
  width: 2
402
  },
403
  marker: {
404
+ size: 4,
405
+ color: DDR_DATA.probingColors[model] || '#888'
406
  },
407
+ hovertemplate: `<b>${model}</b><br>${xLabel}: %{x}<br>Log Prob: %{y:.2f}<extra></extra>`
 
 
 
 
408
  });
409
 
410
+ // Error band
411
+ if (modelData.sem) {
412
+ const upper = modelData.logprob.map((v, i) => v + modelData.sem[i]);
413
+ const lower = modelData.logprob.map((v, i) => v - modelData.sem[i]);
414
+
415
+ traces.push({
416
+ x: [...modelData[xKey], ...modelData[xKey].slice().reverse()],
417
+ y: [...upper, ...lower.slice().reverse()],
418
+ fill: 'toself',
419
+ fillcolor: (DDR_DATA.probingColors[model] || '#888') + '25',
420
+ line: { width: 0 },
421
+ showlegend: false,
422
+ hoverinfo: 'skip'
423
+ });
424
+ }
 
425
  });
 
426
 
427
+ const layout = {
428
+ ...darkLayout,
429
+ xaxis: {
430
+ ...darkLayout.xaxis,
431
+ title: { text: mode === 'byTurn' ? 'Turn' : 'Progress (%)', font: { size: 11, color: '#e2e8f0' } }
432
+ },
433
+ yaxis: {
434
+ ...darkLayout.yaxis,
435
+ title: { text: 'Avg Log Probability', font: { size: 11, color: '#e2e8f0' } }
436
+ },
437
+ showlegend: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  };
 
439
 
440
+ Plotly.newPlot(`probing-${scenarioIds[scenario]}`, traces, layout, plotlyConfig);
441
+ });
442
  }
443
 
444
+ // Probing dimension toggle
445
+ document.querySelectorAll('.probing-dim').forEach(btn => {
446
+ btn.addEventListener('click', () => {
447
+ document.querySelectorAll('.probing-dim').forEach(b => b.classList.remove('active'));
448
+ btn.classList.add('active');
449
+
450
+ const mode = btn.dataset.mode;
451
+ currentProbingMode = mode;
452
+
453
+ // Add updating class for visual feedback
454
+ ['mimic', 'globem', '10k'].forEach(s => {
455
+ document.getElementById(`probing-${s}`).classList.add('chart-updating');
456
+ });
457
+
458
+ setTimeout(() => {
459
+ renderProbingCharts(mode);
460
+ ['mimic', 'globem', '10k'].forEach(s => {
461
+ document.getElementById(`probing-${s}`).classList.remove('chart-updating');
462
+ });
463
+ }, 150);
464
+ });
465
+ });
466
 
467
  // ============================================================================
468
  // INITIALIZE ALL CHARTS
469
  // ============================================================================
470
  document.addEventListener('DOMContentLoaded', () => {
471
+ initScalingCharts();
472
+ initRankingCharts();
473
+ initTurnCharts();
474
+ initProbingCharts();
 
475
  });
476
 
477
  // Handle window resize
478
+ let resizeTimeout;
479
  window.addEventListener('resize', () => {
480
+ clearTimeout(resizeTimeout);
481
+ resizeTimeout = setTimeout(() => {
482
+ ['mimic', '10k', 'globem'].forEach(s => {
483
+ Plotly.Plots.resize(`scaling-${s}`);
484
+ Plotly.Plots.resize(`ranking-${s}`);
485
+ Plotly.Plots.resize(`turn-${s}`);
486
+ Plotly.Plots.resize(`probing-${s}`);
487
+ });
488
+ }, 100);
489
  });
index.html CHANGED
@@ -1,5 +1,6 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
@@ -11,6 +12,7 @@
11
  <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
12
  <link rel="stylesheet" href="styles.css">
13
  </head>
 
14
  <body>
15
  <!-- Hero Section -->
16
  <header class="hero">
@@ -19,7 +21,8 @@
19
  <h1>DDR-Bench</h1>
20
  <p class="subtitle">Deep Data Research Agent Benchmark for Large Language Models</p>
21
  <p class="description">
22
- A comprehensive evaluation framework measuring AI agents' ability to conduct deep, iterative data exploration across medical records (MIMIC), financial filings (10-K), and behavioral data (GLOBEM).
 
23
  </p>
24
  <div class="stats-row">
25
  <div class="stat-item">
@@ -41,7 +44,6 @@
41
  <!-- Navigation -->
42
  <nav class="nav-tabs">
43
  <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
44
- <button class="nav-tab" data-section="entropy">🔀 Entropy Analysis</button>
45
  <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
46
  <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
47
  <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
@@ -49,88 +51,77 @@
49
 
50
  <!-- Main Content -->
51
  <main class="content">
52
- <!-- Scaling Analysis Section -->
53
  <section id="scaling" class="section active">
54
  <div class="section-header">
55
  <h2>Scaling Analysis</h2>
56
- <p>Explore how model performance scales with interaction turns, token usage, and inference cost across datasets.</p>
57
- </div>
58
- <div class="controls">
59
- <label>
60
- <span>Dataset:</span>
61
- <select id="scaling-dataset">
62
- <option value="mimic">MIMIC</option>
63
- <option value="10k">10-K</option>
64
- <option value="globem">GLOBEM</option>
65
- </select>
66
- </label>
67
- <label>
68
- <span>Scaling Dimension:</span>
69
- <select id="scaling-dimension">
70
- <option value="turn">Interaction Turns</option>
71
- <option value="token">Token Usage</option>
72
- <option value="cost">Inference Cost</option>
73
- </select>
74
- </label>
75
  </div>
76
- <div id="scaling-chart" class="chart-container"></div>
77
- </section>
78
-
79
- <!-- Entropy Analysis Section -->
80
- <section id="entropy" class="section">
81
- <div class="section-header">
82
- <h2>Entropy vs Coverage Analysis</h2>
83
- <p>Visualize the relationship between access entropy (exploration uniformity) and field coverage for each model.</p>
84
  </div>
85
- <div class="controls">
86
- <label>
87
- <span>Dataset:</span>
88
- <select id="entropy-dataset">
89
- <option value="mimic">MIMIC</option>
90
- <option value="10k">10-K</option>
91
- <option value="globem">GLOBEM</option>
92
- </select>
93
- </label>
 
 
 
 
94
  </div>
95
- <div id="entropy-chart" class="chart-container"></div>
96
  </section>
97
 
98
- <!-- Ranking Comparison Section -->
99
  <section id="ranking" class="section">
100
  <div class="section-header">
101
  <h2>Novelty vs Accuracy Ranking</h2>
102
- <p>Compare model rankings based on novelty (Bradley-Terry pairwise ranking) against traditional accuracy ranking.</p>
 
103
  </div>
104
- <div class="controls">
105
- <label>
106
- <span>Dataset:</span>
107
- <select id="ranking-dataset">
108
- <option value="MIMIC">MIMIC</option>
109
- <option value="10K">10-K</option>
110
- <option value="GLOBEM">GLOBEM</option>
111
- </select>
112
- </label>
 
 
 
 
113
  </div>
114
- <div id="ranking-chart" class="chart-container"></div>
115
  </section>
116
 
117
- <!-- Turn Distribution Section -->
118
  <section id="turn" class="section">
119
  <div class="section-header">
120
  <h2>Turn Count Distribution</h2>
121
  <p>Analyze the distribution of interaction turns across different models and datasets.</p>
122
  </div>
123
- <div class="controls">
124
- <label>
125
- <span>Dataset:</span>
126
- <select id="turn-dataset">
127
- <option value="mimic">MIMIC</option>
128
- <option value="10k">10-K</option>
129
- <option value="globem">GLOBEM</option>
130
- </select>
131
- </label>
 
 
 
 
132
  </div>
133
- <div id="turn-chart" class="chart-container tall"></div>
134
  </section>
135
 
136
  <!-- Probing Results Section -->
@@ -139,16 +130,24 @@
139
  <h2>FINISH Token Probing</h2>
140
  <p>Analyze the average log probability of FINISH messages across conversation turns and progress.</p>
141
  </div>
142
- <div class="controls">
143
- <label>
144
- <span>View Mode:</span>
145
- <select id="probing-mode">
146
- <option value="byTurn">By Turn</option>
147
- <option value="byProgress">By Progress (%)</option>
148
- </select>
149
- </label>
 
 
 
 
 
 
 
 
 
150
  </div>
151
- <div id="probing-chart" class="chart-container"></div>
152
  </section>
153
  </main>
154
 
@@ -160,4 +159,5 @@
160
  <script src="data.js"></script>
161
  <script src="charts.js"></script>
162
  </body>
163
- </html>
 
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+
4
  <head>
5
  <meta charset="UTF-8">
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
12
  <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
13
  <link rel="stylesheet" href="styles.css">
14
  </head>
15
+
16
  <body>
17
  <!-- Hero Section -->
18
  <header class="hero">
 
21
  <h1>DDR-Bench</h1>
22
  <p class="subtitle">Deep Data Research Agent Benchmark for Large Language Models</p>
23
  <p class="description">
24
+ A comprehensive evaluation framework measuring AI agents' ability to conduct deep, iterative data
25
+ exploration across medical records (MIMIC), financial filings (10-K), and behavioral data (GLOBEM).
26
  </p>
27
  <div class="stats-row">
28
  <div class="stat-item">
 
44
  <!-- Navigation -->
45
  <nav class="nav-tabs">
46
  <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
 
47
  <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
48
  <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
49
  <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
 
51
 
52
  <!-- Main Content -->
53
  <main class="content">
54
+ <!-- Scaling Analysis Section - 3 charts side by side -->
55
  <section id="scaling" class="section active">
56
  <div class="section-header">
57
  <h2>Scaling Analysis</h2>
58
+ <p>Explore how model performance scales with interaction turns, token usage, and inference cost across
59
+ all datasets.</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  </div>
61
+ <div class="dimension-toggle">
62
+ <button class="dim-btn active" data-dim="turn">🔄 Interaction Turns</button>
63
+ <button class="dim-btn" data-dim="token">📊 Token Usage</button>
64
+ <button class="dim-btn" data-dim="cost">💰 Inference Cost</button>
 
 
 
 
65
  </div>
66
+ <div class="charts-grid three-col">
67
+ <div class="chart-card">
68
+ <h3>MIMIC</h3>
69
+ <div id="scaling-mimic" class="chart-container-sm"></div>
70
+ </div>
71
+ <div class="chart-card">
72
+ <h3>10-K</h3>
73
+ <div id="scaling-10k" class="chart-container-sm"></div>
74
+ </div>
75
+ <div class="chart-card">
76
+ <h3>GLOBEM</h3>
77
+ <div id="scaling-globem" class="chart-container-sm"></div>
78
+ </div>
79
  </div>
 
80
  </section>
81
 
82
+ <!-- Ranking Comparison Section - 3 charts -->
83
  <section id="ranking" class="section">
84
  <div class="section-header">
85
  <h2>Novelty vs Accuracy Ranking</h2>
86
+ <p>Compare model rankings based on Bradley-Terry pairwise ranking against traditional accuracy ranking.
87
+ </p>
88
  </div>
89
+ <div class="charts-grid three-col">
90
+ <div class="chart-card">
91
+ <h3>MIMIC</h3>
92
+ <div id="ranking-mimic" class="chart-container-tall"></div>
93
+ </div>
94
+ <div class="chart-card">
95
+ <h3>10-K</h3>
96
+ <div id="ranking-10k" class="chart-container-tall"></div>
97
+ </div>
98
+ <div class="chart-card">
99
+ <h3>GLOBEM</h3>
100
+ <div id="ranking-globem" class="chart-container-tall"></div>
101
+ </div>
102
  </div>
 
103
  </section>
104
 
105
+ <!-- Turn Distribution Section - 3 charts -->
106
  <section id="turn" class="section">
107
  <div class="section-header">
108
  <h2>Turn Count Distribution</h2>
109
  <p>Analyze the distribution of interaction turns across different models and datasets.</p>
110
  </div>
111
+ <div class="charts-grid three-col">
112
+ <div class="chart-card">
113
+ <h3>MIMIC</h3>
114
+ <div id="turn-mimic" class="chart-container-tall"></div>
115
+ </div>
116
+ <div class="chart-card">
117
+ <h3>10-K</h3>
118
+ <div id="turn-10k" class="chart-container-tall"></div>
119
+ </div>
120
+ <div class="chart-card">
121
+ <h3>GLOBEM</h3>
122
+ <div id="turn-globem" class="chart-container-tall"></div>
123
+ </div>
124
  </div>
 
125
  </section>
126
 
127
  <!-- Probing Results Section -->
 
130
  <h2>FINISH Token Probing</h2>
131
  <p>Analyze the average log probability of FINISH messages across conversation turns and progress.</p>
132
  </div>
133
+ <div class="dimension-toggle">
134
+ <button class="dim-btn probing-dim active" data-mode="byTurn">📊 By Turn</button>
135
+ <button class="dim-btn probing-dim" data-mode="byProgress">📈 By Progress (%)</button>
136
+ </div>
137
+ <div class="charts-grid three-col">
138
+ <div class="chart-card">
139
+ <h3>MIMIC</h3>
140
+ <div id="probing-mimic" class="chart-container-sm"></div>
141
+ </div>
142
+ <div class="chart-card">
143
+ <h3>GLOBEM</h3>
144
+ <div id="probing-globem" class="chart-container-sm"></div>
145
+ </div>
146
+ <div class="chart-card">
147
+ <h3>10-K</h3>
148
+ <div id="probing-10k" class="chart-container-sm"></div>
149
+ </div>
150
  </div>
 
151
  </section>
152
  </main>
153
 
 
159
  <script src="data.js"></script>
160
  <script src="charts.js"></script>
161
  </body>
162
+
163
+ </html>
styles.css CHANGED
@@ -19,7 +19,9 @@
19
  }
20
 
21
  /* Reset & Base */
22
- *, *::before, *::after {
 
 
23
  box-sizing: border-box;
24
  margin: 0;
25
  padding: 0;
@@ -40,7 +42,7 @@ body {
40
  /* Hero Section */
41
  .hero {
42
  background: var(--gradient-hero);
43
- padding: 4rem 2rem 3rem;
44
  text-align: center;
45
  position: relative;
46
  overflow: hidden;
@@ -53,7 +55,7 @@ body {
53
  left: 0;
54
  right: 0;
55
  bottom: 0;
56
- background:
57
  radial-gradient(circle at 20% 50%, rgba(99, 102, 241, 0.15) 0%, transparent 50%),
58
  radial-gradient(circle at 80% 50%, rgba(139, 92, 246, 0.1) 0%, transparent 50%);
59
  pointer-events: none;
@@ -70,45 +72,45 @@ body {
70
  display: inline-block;
71
  background: rgba(99, 102, 241, 0.2);
72
  color: var(--primary-light);
73
- padding: 0.5rem 1rem;
74
  border-radius: 2rem;
75
- font-size: 0.85rem;
76
  font-weight: 500;
77
- margin-bottom: 1rem;
78
  border: 1px solid rgba(99, 102, 241, 0.3);
79
  }
80
 
81
  .hero h1 {
82
- font-size: 3.5rem;
83
  font-weight: 700;
84
  background: linear-gradient(135deg, #f1f5f9 0%, #818cf8 100%);
85
  -webkit-background-clip: text;
86
  -webkit-text-fill-color: transparent;
87
  background-clip: text;
88
- margin-bottom: 0.75rem;
89
  letter-spacing: -0.02em;
90
  }
91
 
92
  .subtitle {
93
- font-size: 1.35rem;
94
  color: var(--text-secondary);
95
- margin-bottom: 1rem;
96
  font-weight: 400;
97
  }
98
 
99
  .description {
100
- font-size: 1rem;
101
  color: var(--text-muted);
102
  max-width: 700px;
103
- margin: 0 auto 2rem;
104
- line-height: 1.7;
105
  }
106
 
107
  .stats-row {
108
  display: flex;
109
  justify-content: center;
110
- gap: 3rem;
111
- margin-top: 2rem;
112
  }
113
 
114
  .stat-item {
@@ -117,13 +119,13 @@ body {
117
 
118
  .stat-value {
119
  display: block;
120
- font-size: 2.5rem;
121
  font-weight: 700;
122
  color: var(--primary-light);
123
  }
124
 
125
  .stat-label {
126
- font-size: 0.9rem;
127
  color: var(--text-muted);
128
  }
129
 
@@ -132,7 +134,7 @@ body {
132
  display: flex;
133
  justify-content: center;
134
  gap: 0.5rem;
135
- padding: 1rem 2rem;
136
  background: var(--bg-card);
137
  border-bottom: 1px solid var(--border);
138
  position: sticky;
@@ -142,12 +144,12 @@ body {
142
  }
143
 
144
  .nav-tab {
145
- padding: 0.75rem 1.5rem;
146
  background: transparent;
147
  border: 1px solid transparent;
148
  border-radius: 0.5rem;
149
  color: var(--text-secondary);
150
- font-size: 0.95rem;
151
  font-weight: 500;
152
  cursor: pointer;
153
  transition: all 0.2s ease;
@@ -167,9 +169,9 @@ body {
167
 
168
  /* Main Content */
169
  .content {
170
- max-width: 1400px;
171
  margin: 0 auto;
172
- padding: 2rem;
173
  }
174
 
175
  /* Sections */
@@ -183,143 +185,173 @@ body {
183
  }
184
 
185
  @keyframes fadeIn {
186
- from { opacity: 0; transform: translateY(10px); }
187
- to { opacity: 1; transform: translateY(0); }
 
 
 
 
 
 
 
188
  }
189
 
190
  .section-header {
191
- margin-bottom: 2rem;
192
  text-align: center;
193
  }
194
 
195
  .section-header h2 {
196
- font-size: 1.75rem;
197
  font-weight: 600;
198
  color: var(--text-primary);
199
- margin-bottom: 0.5rem;
200
  }
201
 
202
  .section-header p {
203
  color: var(--text-muted);
204
- font-size: 1rem;
205
  }
206
 
207
- /* Controls */
208
- .controls {
209
  display: flex;
210
  justify-content: center;
211
- gap: 1.5rem;
212
  margin-bottom: 1.5rem;
213
- flex-wrap: wrap;
214
  }
215
 
216
- .controls label {
217
- display: flex;
218
- align-items: center;
219
- gap: 0.75rem;
220
- }
221
-
222
- .controls label span {
223
  color: var(--text-secondary);
224
- font-size: 0.9rem;
225
  font-weight: 500;
 
 
 
226
  }
227
 
228
- .controls select {
229
- padding: 0.6rem 1rem;
230
- background: var(--bg-card);
231
- border: 1px solid var(--border);
232
- border-radius: 0.5rem;
233
  color: var(--text-primary);
234
- font-size: 0.9rem;
235
- cursor: pointer;
236
- transition: all 0.2s ease;
237
- font-family: inherit;
238
- min-width: 160px;
239
  }
240
 
241
- .controls select:hover {
242
- border-color: var(--primary);
 
 
 
243
  }
244
 
245
- .controls select:focus {
246
- outline: none;
247
- border-color: var(--primary);
248
- box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2);
249
  }
250
 
251
- /* Chart Container */
252
- .chart-container {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  background: var(--bg-card);
254
- border-radius: 1rem;
255
- padding: 1.5rem;
256
- box-shadow: var(--shadow);
257
- min-height: 500px;
258
  border: 1px solid var(--border);
 
 
 
 
 
 
 
 
 
 
 
259
  }
260
 
261
- .chart-container.tall {
262
- min-height: 700px;
 
 
 
 
 
 
 
263
  }
264
 
265
  /* Footer */
266
  .footer {
267
  text-align: center;
268
- padding: 2rem;
269
  color: var(--text-muted);
270
- font-size: 0.9rem;
271
  border-top: 1px solid var(--border);
272
- margin-top: 3rem;
273
  }
274
 
275
  /* Responsive */
276
  @media (max-width: 768px) {
277
  .hero {
278
- padding: 3rem 1.5rem 2rem;
279
  }
280
-
281
  .hero h1 {
282
- font-size: 2.5rem;
283
  }
284
-
285
  .subtitle {
286
- font-size: 1.1rem;
287
  }
288
-
289
  .stats-row {
290
  gap: 1.5rem;
291
  }
292
-
293
  .stat-value {
294
- font-size: 2rem;
295
  }
296
-
297
  .nav-tabs {
298
- padding: 0.75rem 1rem;
299
  gap: 0.25rem;
300
  }
301
-
302
  .nav-tab {
303
- padding: 0.5rem 1rem;
304
- font-size: 0.85rem;
305
  }
306
-
307
  .content {
308
  padding: 1rem;
309
  }
310
-
311
- .controls {
312
- flex-direction: column;
313
- align-items: stretch;
314
- }
315
-
316
- .controls label {
317
- flex-direction: column;
318
- align-items: flex-start;
319
  }
320
-
321
- .controls select {
322
- width: 100%;
 
323
  }
324
  }
325
 
@@ -335,3 +367,13 @@ body {
335
  .js-plotly-plot .plotly .modebar-btn:hover path {
336
  fill: var(--text-primary) !important;
337
  }
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
 
21
  /* Reset & Base */
22
+ *,
23
+ *::before,
24
+ *::after {
25
  box-sizing: border-box;
26
  margin: 0;
27
  padding: 0;
 
42
  /* Hero Section */
43
  .hero {
44
  background: var(--gradient-hero);
45
+ padding: 3rem 2rem 2rem;
46
  text-align: center;
47
  position: relative;
48
  overflow: hidden;
 
55
  left: 0;
56
  right: 0;
57
  bottom: 0;
58
+ background:
59
  radial-gradient(circle at 20% 50%, rgba(99, 102, 241, 0.15) 0%, transparent 50%),
60
  radial-gradient(circle at 80% 50%, rgba(139, 92, 246, 0.1) 0%, transparent 50%);
61
  pointer-events: none;
 
72
  display: inline-block;
73
  background: rgba(99, 102, 241, 0.2);
74
  color: var(--primary-light);
75
+ padding: 0.4rem 0.8rem;
76
  border-radius: 2rem;
77
+ font-size: 0.8rem;
78
  font-weight: 500;
79
+ margin-bottom: 0.75rem;
80
  border: 1px solid rgba(99, 102, 241, 0.3);
81
  }
82
 
83
  .hero h1 {
84
+ font-size: 3rem;
85
  font-weight: 700;
86
  background: linear-gradient(135deg, #f1f5f9 0%, #818cf8 100%);
87
  -webkit-background-clip: text;
88
  -webkit-text-fill-color: transparent;
89
  background-clip: text;
90
+ margin-bottom: 0.5rem;
91
  letter-spacing: -0.02em;
92
  }
93
 
94
  .subtitle {
95
+ font-size: 1.2rem;
96
  color: var(--text-secondary);
97
+ margin-bottom: 0.75rem;
98
  font-weight: 400;
99
  }
100
 
101
  .description {
102
+ font-size: 0.9rem;
103
  color: var(--text-muted);
104
  max-width: 700px;
105
+ margin: 0 auto 1.5rem;
106
+ line-height: 1.6;
107
  }
108
 
109
  .stats-row {
110
  display: flex;
111
  justify-content: center;
112
+ gap: 2.5rem;
113
+ margin-top: 1.5rem;
114
  }
115
 
116
  .stat-item {
 
119
 
120
  .stat-value {
121
  display: block;
122
+ font-size: 2rem;
123
  font-weight: 700;
124
  color: var(--primary-light);
125
  }
126
 
127
  .stat-label {
128
+ font-size: 0.8rem;
129
  color: var(--text-muted);
130
  }
131
 
 
134
  display: flex;
135
  justify-content: center;
136
  gap: 0.5rem;
137
+ padding: 0.75rem 2rem;
138
  background: var(--bg-card);
139
  border-bottom: 1px solid var(--border);
140
  position: sticky;
 
144
  }
145
 
146
  .nav-tab {
147
+ padding: 0.6rem 1.25rem;
148
  background: transparent;
149
  border: 1px solid transparent;
150
  border-radius: 0.5rem;
151
  color: var(--text-secondary);
152
+ font-size: 0.9rem;
153
  font-weight: 500;
154
  cursor: pointer;
155
  transition: all 0.2s ease;
 
169
 
170
  /* Main Content */
171
  .content {
172
+ max-width: 1600px;
173
  margin: 0 auto;
174
+ padding: 1.5rem;
175
  }
176
 
177
  /* Sections */
 
185
  }
186
 
187
  @keyframes fadeIn {
188
+ from {
189
+ opacity: 0;
190
+ transform: translateY(10px);
191
+ }
192
+
193
+ to {
194
+ opacity: 1;
195
+ transform: translateY(0);
196
+ }
197
  }
198
 
199
  .section-header {
200
+ margin-bottom: 1.5rem;
201
  text-align: center;
202
  }
203
 
204
  .section-header h2 {
205
+ font-size: 1.5rem;
206
  font-weight: 600;
207
  color: var(--text-primary);
208
+ margin-bottom: 0.4rem;
209
  }
210
 
211
  .section-header p {
212
  color: var(--text-muted);
213
+ font-size: 0.9rem;
214
  }
215
 
216
+ /* Dimension Toggle Buttons */
217
+ .dimension-toggle {
218
  display: flex;
219
  justify-content: center;
220
+ gap: 0.5rem;
221
  margin-bottom: 1.5rem;
 
222
  }
223
 
224
+ .dim-btn {
225
+ padding: 0.6rem 1.2rem;
226
+ background: var(--bg-card);
227
+ border: 1px solid var(--border);
228
+ border-radius: 2rem;
 
 
229
  color: var(--text-secondary);
230
+ font-size: 0.85rem;
231
  font-weight: 500;
232
+ cursor: pointer;
233
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
234
+ font-family: inherit;
235
  }
236
 
237
+ .dim-btn:hover {
238
+ background: var(--bg-card-hover);
 
 
 
239
  color: var(--text-primary);
240
+ transform: translateY(-1px);
 
 
 
 
241
  }
242
 
243
+ .dim-btn.active {
244
+ background: var(--gradient-primary);
245
+ color: white;
246
+ border-color: transparent;
247
+ box-shadow: 0 4px 12px rgba(99, 102, 241, 0.3);
248
  }
249
 
250
+ /* Charts Grid */
251
+ .charts-grid {
252
+ display: grid;
253
+ gap: 1rem;
254
  }
255
 
256
+ .charts-grid.three-col {
257
+ grid-template-columns: repeat(3, 1fr);
258
+ }
259
+
260
+ @media (max-width: 1200px) {
261
+ .charts-grid.three-col {
262
+ grid-template-columns: repeat(2, 1fr);
263
+ }
264
+ }
265
+
266
+ @media (max-width: 768px) {
267
+ .charts-grid.three-col {
268
+ grid-template-columns: 1fr;
269
+ }
270
+ }
271
+
272
+ /* Chart Card */
273
+ .chart-card {
274
  background: var(--bg-card);
275
+ border-radius: 0.75rem;
276
+ padding: 1rem;
 
 
277
  border: 1px solid var(--border);
278
+ box-shadow: var(--shadow);
279
+ }
280
+
281
+ .chart-card h3 {
282
+ font-size: 1rem;
283
+ font-weight: 600;
284
+ color: var(--text-primary);
285
+ margin-bottom: 0.75rem;
286
+ text-align: center;
287
+ padding-bottom: 0.5rem;
288
+ border-bottom: 1px solid var(--border);
289
  }
290
 
291
+ /* Chart Container */
292
+ .chart-container-sm {
293
+ height: 350px;
294
+ min-height: 300px;
295
+ }
296
+
297
+ .chart-container-tall {
298
+ height: 550px;
299
+ min-height: 500px;
300
  }
301
 
302
  /* Footer */
303
  .footer {
304
  text-align: center;
305
+ padding: 1.5rem;
306
  color: var(--text-muted);
307
+ font-size: 0.85rem;
308
  border-top: 1px solid var(--border);
309
+ margin-top: 2rem;
310
  }
311
 
312
  /* Responsive */
313
  @media (max-width: 768px) {
314
  .hero {
315
+ padding: 2rem 1rem 1.5rem;
316
  }
317
+
318
  .hero h1 {
319
+ font-size: 2rem;
320
  }
321
+
322
  .subtitle {
323
+ font-size: 1rem;
324
  }
325
+
326
  .stats-row {
327
  gap: 1.5rem;
328
  }
329
+
330
  .stat-value {
331
+ font-size: 1.5rem;
332
  }
333
+
334
  .nav-tabs {
335
+ padding: 0.5rem 0.75rem;
336
  gap: 0.25rem;
337
  }
338
+
339
  .nav-tab {
340
+ padding: 0.5rem 0.75rem;
341
+ font-size: 0.8rem;
342
  }
343
+
344
  .content {
345
  padding: 1rem;
346
  }
347
+
348
+ .dimension-toggle {
349
+ flex-wrap: wrap;
 
 
 
 
 
 
350
  }
351
+
352
+ .dim-btn {
353
+ padding: 0.5rem 1rem;
354
+ font-size: 0.8rem;
355
  }
356
  }
357
 
 
367
  .js-plotly-plot .plotly .modebar-btn:hover path {
368
  fill: var(--text-primary) !important;
369
  }
370
+
371
+ /* Smooth transitions for chart updates */
372
+ .chart-container-sm,
373
+ .chart-container-tall {
374
+ transition: opacity 0.2s ease;
375
+ }
376
+
377
+ .chart-updating {
378
+ opacity: 0.7;
379
+ }