thinkwee
committed on
Commit
·
41d056a
1
Parent(s):
5026fae
fix display
Browse files- charts.js +239 -81
- data.js +59 -1
- index.html +24 -4
- styles.css +16 -0
charts.js
CHANGED
|
@@ -377,9 +377,11 @@ document.querySelectorAll('.dim-btn:not(.probing-dim)').forEach(btn => {
|
|
| 377 |
});
|
| 378 |
|
| 379 |
// ============================================================================
|
| 380 |
-
// RANKING COMPARISON - 3 Charts
|
| 381 |
// ============================================================================
|
| 382 |
-
|
|
|
|
|
|
|
| 383 |
const scenarios = [
|
| 384 |
{ key: 'MIMIC', id: 'mimic' },
|
| 385 |
{ key: '10K', id: '10k' },
|
|
@@ -390,106 +392,162 @@ function initRankingCharts() {
|
|
| 390 |
const data = DDR_DATA.ranking[key];
|
| 391 |
if (!data) return;
|
| 392 |
|
| 393 |
-
const models = data.slice(0,
|
| 394 |
const traces = [];
|
| 395 |
|
| 396 |
-
//
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
traces.push({
|
| 399 |
-
x:
|
| 400 |
-
y:
|
| 401 |
-
mode: '
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
| 406 |
},
|
| 407 |
-
|
| 408 |
-
|
| 409 |
});
|
| 410 |
-
});
|
| 411 |
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
|
| 444 |
const layout = {
|
| 445 |
...darkLayout,
|
| 446 |
xaxis: {
|
| 447 |
...darkLayout.xaxis,
|
| 448 |
title: { text: 'Rank', font: { size: 11, color: '#e2e8f0' } },
|
| 449 |
-
range: [
|
| 450 |
dtick: 2
|
| 451 |
},
|
| 452 |
yaxis: {
|
| 453 |
...darkLayout.yaxis,
|
| 454 |
tickmode: 'array',
|
| 455 |
tickvals: models.map((_, i) => i),
|
| 456 |
-
ticktext: models.map(m => m.model.substring(0,
|
| 457 |
automargin: true
|
| 458 |
},
|
| 459 |
showlegend: true,
|
| 460 |
legend: {
|
| 461 |
...darkLayout.legend,
|
| 462 |
-
y: -0.
|
| 463 |
},
|
| 464 |
-
margin: { ...darkLayout.margin, l:
|
| 465 |
};
|
| 466 |
|
| 467 |
-
Plotly.
|
| 468 |
});
|
| 469 |
}
|
| 470 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
// ============================================================================
|
| 472 |
-
// TURN DISTRIBUTION - 3 Charts (
|
| 473 |
// ============================================================================
|
| 474 |
function initTurnCharts() {
|
| 475 |
const scenarios = ['mimic', '10k', 'globem'];
|
| 476 |
|
| 477 |
// Family colors
|
| 478 |
const familyColors = {
|
| 479 |
-
'
|
| 480 |
-
'
|
| 481 |
-
'
|
| 482 |
-
'
|
| 483 |
-
'
|
| 484 |
-
'
|
| 485 |
-
'
|
| 486 |
-
'
|
| 487 |
-
'
|
| 488 |
};
|
| 489 |
|
| 490 |
function getModelColor(modelName) {
|
|
|
|
| 491 |
for (const [family, color] of Object.entries(familyColors)) {
|
| 492 |
-
if (
|
| 493 |
}
|
| 494 |
return '#888';
|
| 495 |
}
|
|
@@ -498,43 +556,70 @@ function initTurnCharts() {
|
|
| 498 |
const data = DDR_DATA.turn[scenario];
|
| 499 |
if (!data) return;
|
| 500 |
|
| 501 |
-
|
|
|
|
| 502 |
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
const color = getModelColor(model.model);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
x:
|
| 509 |
-
|
| 510 |
-
|
|
|
|
|
|
|
|
|
|
| 511 |
name: model.model,
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
},
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
showlegend: false
|
| 521 |
-
};
|
| 522 |
});
|
| 523 |
|
| 524 |
const layout = {
|
| 525 |
...darkLayout,
|
| 526 |
-
barmode: 'group',
|
| 527 |
xaxis: {
|
| 528 |
...darkLayout.xaxis,
|
| 529 |
title: { text: 'Number of Turns', font: { size: 11, color: '#e2e8f0' } },
|
| 530 |
-
range: [0,
|
|
|
|
| 531 |
},
|
| 532 |
yaxis: {
|
| 533 |
...darkLayout.yaxis,
|
|
|
|
|
|
|
|
|
|
| 534 |
automargin: true,
|
| 535 |
-
|
| 536 |
},
|
| 537 |
-
margin: { ...darkLayout.margin, l:
|
|
|
|
| 538 |
};
|
| 539 |
|
| 540 |
Plotly.newPlot(`turn-${scenario}`, traces, layout, plotlyConfig);
|
|
@@ -638,6 +723,75 @@ document.querySelectorAll('.probing-dim').forEach(btn => {
|
|
| 638 |
});
|
| 639 |
});
|
| 640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
// ============================================================================
|
| 642 |
// INITIALIZE ALL CHARTS
|
| 643 |
// ============================================================================
|
|
@@ -645,6 +799,7 @@ document.addEventListener('DOMContentLoaded', () => {
|
|
| 645 |
initScalingCharts();
|
| 646 |
initRankingCharts();
|
| 647 |
initTurnCharts();
|
|
|
|
| 648 |
initProbingCharts();
|
| 649 |
});
|
| 650 |
|
|
@@ -659,5 +814,8 @@ window.addEventListener('resize', () => {
|
|
| 659 |
Plotly.Plots.resize(`turn-${s}`);
|
| 660 |
Plotly.Plots.resize(`probing-${s}`);
|
| 661 |
});
|
|
|
|
|
|
|
|
|
|
| 662 |
}, 100);
|
| 663 |
});
|
|
|
|
| 377 |
});
|
| 378 |
|
| 379 |
// ============================================================================
|
| 380 |
+
// RANKING COMPARISON - 3 Charts with animated mode switching
|
| 381 |
// ============================================================================
|
| 382 |
+
let currentRankingMode = 'comparison';
|
| 383 |
+
|
| 384 |
+
function renderRankingCharts(mode) {
|
| 385 |
const scenarios = [
|
| 386 |
{ key: 'MIMIC', id: 'mimic' },
|
| 387 |
{ key: '10K', id: '10k' },
|
|
|
|
| 392 |
const data = DDR_DATA.ranking[key];
|
| 393 |
if (!data) return;
|
| 394 |
|
| 395 |
+
const models = data.slice(0, 12); // Top 12 models for better fit
|
| 396 |
const traces = [];
|
| 397 |
|
| 398 |
+
// Get x-axis values based on mode
|
| 399 |
+
const getXValue = (m) => {
|
| 400 |
+
switch (mode) {
|
| 401 |
+
case 'novelty': return m.bt_rank;
|
| 402 |
+
case 'accuracy': return m.acc_rank;
|
| 403 |
+
default: return m.bt_rank; // For comparison, use bt_rank as base
|
| 404 |
+
}
|
| 405 |
+
};
|
| 406 |
+
|
| 407 |
+
if (mode === 'comparison') {
|
| 408 |
+
// Connection lines
|
| 409 |
+
models.forEach((m, i) => {
|
| 410 |
+
traces.push({
|
| 411 |
+
x: [m.bt_rank, m.acc_rank],
|
| 412 |
+
y: [i, i],
|
| 413 |
+
mode: 'lines',
|
| 414 |
+
line: {
|
| 415 |
+
color: 'rgba(148, 163, 184, 0.3)',
|
| 416 |
+
width: 1.5,
|
| 417 |
+
dash: 'dot'
|
| 418 |
+
},
|
| 419 |
+
showlegend: false,
|
| 420 |
+
hoverinfo: 'skip'
|
| 421 |
+
});
|
| 422 |
+
});
|
| 423 |
+
|
| 424 |
+
// Novelty rank points
|
| 425 |
traces.push({
|
| 426 |
+
x: models.map(m => m.bt_rank),
|
| 427 |
+
y: models.map((_, i) => i),
|
| 428 |
+
mode: 'markers',
|
| 429 |
+
name: 'Novelty Rank',
|
| 430 |
+
marker: {
|
| 431 |
+
size: 10,
|
| 432 |
+
symbol: 'circle',
|
| 433 |
+
color: '#8B5CF6',
|
| 434 |
+
line: { color: '#fff', width: 1 }
|
| 435 |
},
|
| 436 |
+
text: models.map(m => `${m.model}<br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
|
| 437 |
+
hovertemplate: '%{text}<extra></extra>'
|
| 438 |
});
|
|
|
|
| 439 |
|
| 440 |
+
// Accuracy rank points
|
| 441 |
+
traces.push({
|
| 442 |
+
x: models.map(m => m.acc_rank),
|
| 443 |
+
y: models.map((_, i) => i),
|
| 444 |
+
mode: 'markers',
|
| 445 |
+
name: 'Accuracy Rank',
|
| 446 |
+
marker: {
|
| 447 |
+
size: 10,
|
| 448 |
+
symbol: 'diamond',
|
| 449 |
+
color: '#22C55E',
|
| 450 |
+
line: { color: '#fff', width: 1 }
|
| 451 |
+
},
|
| 452 |
+
text: models.map(m => `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`),
|
| 453 |
+
hovertemplate: '%{text}<extra></extra>'
|
| 454 |
+
});
|
| 455 |
+
} else {
|
| 456 |
+
// Single mode - just points
|
| 457 |
+
const xVals = models.map(m => mode === 'novelty' ? m.bt_rank : m.acc_rank);
|
| 458 |
+
const color = mode === 'novelty' ? '#8B5CF6' : '#22C55E';
|
| 459 |
+
const label = mode === 'novelty' ? 'Novelty' : 'Accuracy';
|
| 460 |
|
| 461 |
+
traces.push({
|
| 462 |
+
x: xVals,
|
| 463 |
+
y: models.map((_, i) => i),
|
| 464 |
+
mode: 'markers',
|
| 465 |
+
name: label,
|
| 466 |
+
marker: {
|
| 467 |
+
size: 12,
|
| 468 |
+
symbol: 'circle',
|
| 469 |
+
color: color,
|
| 470 |
+
line: { color: '#fff', width: 1 }
|
| 471 |
+
},
|
| 472 |
+
text: models.map(m => {
|
| 473 |
+
if (mode === 'novelty') {
|
| 474 |
+
return `${m.model}<br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`;
|
| 475 |
+
} else {
|
| 476 |
+
return `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`;
|
| 477 |
+
}
|
| 478 |
+
}),
|
| 479 |
+
hovertemplate: '%{text}<extra></extra>'
|
| 480 |
+
});
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
const maxRank = Math.max(...models.map(m => Math.max(m.bt_rank, m.acc_rank)));
|
| 484 |
|
| 485 |
const layout = {
|
| 486 |
...darkLayout,
|
| 487 |
xaxis: {
|
| 488 |
...darkLayout.xaxis,
|
| 489 |
title: { text: 'Rank', font: { size: 11, color: '#e2e8f0' } },
|
| 490 |
+
range: [maxRank + 1, 0],
|
| 491 |
dtick: 2
|
| 492 |
},
|
| 493 |
yaxis: {
|
| 494 |
...darkLayout.yaxis,
|
| 495 |
tickmode: 'array',
|
| 496 |
tickvals: models.map((_, i) => i),
|
| 497 |
+
ticktext: models.map(m => m.model.length > 18 ? m.model.substring(0, 16) + '...' : m.model),
|
| 498 |
automargin: true
|
| 499 |
},
|
| 500 |
showlegend: true,
|
| 501 |
legend: {
|
| 502 |
...darkLayout.legend,
|
| 503 |
+
y: -0.15
|
| 504 |
},
|
| 505 |
+
margin: { ...darkLayout.margin, l: 130, b: 70 }
|
| 506 |
};
|
| 507 |
|
| 508 |
+
Plotly.react(`ranking-${id}`, traces, layout, plotlyConfig);
|
| 509 |
});
|
| 510 |
}
|
| 511 |
|
| 512 |
+
/**
 * Draw the ranking charts for the first time.
 *
 * Delegates to renderRankingCharts with the 'comparison' mode; later mode
 * changes are rendered by calling renderRankingCharts again with another mode.
 */
function initRankingCharts() {
    const startupMode = 'comparison';
    renderRankingCharts(startupMode);
}
|
| 515 |
+
|
| 516 |
+
// Ranking mode toggle event listener
|
| 517 |
+
// Ranking mode toggle: clicking a `.ranking-dim` button makes it the only
// button carrying the `active` class, records the mode named by its
// `data-mode` attribute in currentRankingMode, and redraws the ranking
// charts in that mode.
for (const toggle of document.querySelectorAll('.ranking-dim')) {
    toggle.addEventListener('click', () => {
        // Re-query on every click so the full current set of buttons is reset.
        for (const sibling of document.querySelectorAll('.ranking-dim')) {
            sibling.classList.remove('active');
        }
        toggle.classList.add('active');

        currentRankingMode = toggle.dataset.mode;
        renderRankingCharts(currentRankingMode);
    });
}
|
| 527 |
+
|
| 528 |
// ============================================================================
|
| 529 |
+
// TURN DISTRIBUTION - 3 Charts (Ridgeline style)
|
| 530 |
// ============================================================================
|
| 531 |
function initTurnCharts() {
|
| 532 |
const scenarios = ['mimic', '10k', 'globem'];
|
| 533 |
|
| 534 |
// Family colors
|
| 535 |
const familyColors = {
|
| 536 |
+
'claude': '#FF6D00',
|
| 537 |
+
'gpt': '#00C853',
|
| 538 |
+
'gemini': '#2196F3',
|
| 539 |
+
'deepseek': '#E91E63',
|
| 540 |
+
'glm': '#9C27B0',
|
| 541 |
+
'kimi': '#FFA500',
|
| 542 |
+
'minimax': '#20B2AA',
|
| 543 |
+
'qwen': '#0EA5E9',
|
| 544 |
+
'llama': '#F59E0B'
|
| 545 |
};
|
| 546 |
|
| 547 |
function getModelColor(modelName) {
|
| 548 |
+
const lower = modelName.toLowerCase();
|
| 549 |
for (const [family, color] of Object.entries(familyColors)) {
|
| 550 |
+
if (lower.includes(family)) return color;
|
| 551 |
}
|
| 552 |
return '#888';
|
| 553 |
}
|
|
|
|
| 556 |
const data = DDR_DATA.turn[scenario];
|
| 557 |
if (!data) return;
|
| 558 |
|
| 559 |
+
// Sort by median descending (highest median at top)
|
| 560 |
+
const sortedData = [...data].sort((a, b) => b.median - a.median);
|
| 561 |
|
| 562 |
+
// Limit to top 15 models for readability
|
| 563 |
+
const displayData = sortedData.slice(0, 15);
|
| 564 |
+
|
| 565 |
+
const traces = [];
|
| 566 |
+
const binLabels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'];
|
| 567 |
+
const binCenters = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95];
|
| 568 |
+
|
| 569 |
+
// Create ridgeline traces (area charts stacked vertically)
|
| 570 |
+
displayData.forEach((model, idx) => {
|
| 571 |
const color = getModelColor(model.model);
|
| 572 |
+
const yOffset = idx;
|
| 573 |
+
|
| 574 |
+
// Scale distribution to fit in the row (max height ~0.8)
|
| 575 |
+
const maxDist = Math.max(...model.distribution) || 1;
|
| 576 |
+
const scaledDist = model.distribution.map(d => d / maxDist * 0.7);
|
| 577 |
|
| 578 |
+
// Create filled area trace
|
| 579 |
+
traces.push({
|
| 580 |
+
x: binCenters,
|
| 581 |
+
y: scaledDist.map(d => yOffset + d),
|
| 582 |
+
mode: 'lines',
|
| 583 |
+
fill: 'toself',
|
| 584 |
+
fillcolor: color + '40', // 25% opacity
|
| 585 |
+
line: { color: color, width: 1.5 },
|
| 586 |
name: model.model,
|
| 587 |
+
text: model.distribution.map((d, i) =>
|
| 588 |
+
`${model.model}<br>${binLabels[i]} turns: ${d.toFixed(1)}%<br>Median: ${model.median}`
|
| 589 |
+
),
|
| 590 |
+
hovertemplate: '%{text}<extra></extra>',
|
| 591 |
+
showlegend: false
|
| 592 |
+
});
|
| 593 |
+
|
| 594 |
+
// Add baseline
|
| 595 |
+
traces.push({
|
| 596 |
+
x: [0, 100],
|
| 597 |
+
y: [yOffset, yOffset],
|
| 598 |
+
mode: 'lines',
|
| 599 |
+
line: { color: 'rgba(148, 163, 184, 0.2)', width: 0.5 },
|
| 600 |
+
hoverinfo: 'skip',
|
| 601 |
showlegend: false
|
| 602 |
+
});
|
| 603 |
});
|
| 604 |
|
| 605 |
const layout = {
|
| 606 |
...darkLayout,
|
|
|
|
| 607 |
xaxis: {
|
| 608 |
...darkLayout.xaxis,
|
| 609 |
title: { text: 'Number of Turns', font: { size: 11, color: '#e2e8f0' } },
|
| 610 |
+
range: [0, 100],
|
| 611 |
+
dtick: 20
|
| 612 |
},
|
| 613 |
yaxis: {
|
| 614 |
...darkLayout.yaxis,
|
| 615 |
+
tickmode: 'array',
|
| 616 |
+
tickvals: displayData.map((_, i) => i),
|
| 617 |
+
ticktext: displayData.map(m => m.model.length > 20 ? m.model.substring(0, 18) + '...' : m.model),
|
| 618 |
automargin: true,
|
| 619 |
+
range: [-0.5, displayData.length]
|
| 620 |
},
|
| 621 |
+
margin: { ...darkLayout.margin, l: 140 },
|
| 622 |
+
showlegend: false
|
| 623 |
};
|
| 624 |
|
| 625 |
Plotly.newPlot(`turn-${scenario}`, traces, layout, plotlyConfig);
|
|
|
|
| 723 |
});
|
| 724 |
});
|
| 725 |
|
| 726 |
+
// ============================================================================
|
| 727 |
+
// ERROR ANALYSIS - Hierarchical Bar Chart
|
| 728 |
+
// ============================================================================
|
| 729 |
+
/**
 * Render the error-analysis bar chart into the `#error-chart` element.
 *
 * Reads `DDR_DATA.error`, an array of records with `main_category`,
 * `subcategory`, `count`, `percentage` and `color` fields, and draws one bar
 * per record in array order. Each distinct main category gets a bold label
 * annotation centred over the span between its first and last record.
 *
 * Returns without plotting when there is no error data or when the page has
 * no `#error-chart` container. The resize handler already guards on the
 * container; doing the same here keeps a missing element from raising inside
 * Plotly.newPlot and stopping the chart initialisers that run after this one.
 */
function initErrorChart() {
    const data = DDR_DATA.error;
    if (!Array.isArray(data) || data.length === 0) return;
    if (!document.getElementById('error-chart')) return;

    // First and last bar index of every main category, used to centre its
    // label. A Map is used instead of a plain object so that category names
    // can never collide with inherited Object.prototype properties.
    const categorySpans = new Map();
    data.forEach((item, idx) => {
        const span = categorySpans.get(item.main_category);
        if (span) {
            span.end = idx;
        } else {
            categorySpans.set(item.main_category, { start: idx, end: idx });
        }
    });

    const traces = [{
        x: data.map(d => d.subcategory),
        y: data.map(d => d.percentage),
        type: 'bar',
        marker: {
            color: data.map(d => d.color),
            line: { color: '#fff', width: 0.5 }
        },
        text: data.map(d => `${d.percentage}%`),
        textposition: 'outside',
        textfont: { size: 11, color: '#e2e8f0' },
        hovertemplate: '<b>%{x}</b><br>%{y:.1f}%<br>Count: %{customdata}<extra></extra>',
        customdata: data.map(d => d.count),
        showlegend: false
    }];

    const maxPct = Math.max(...data.map(d => d.percentage));

    // One label per main category, placed above the tallest bar. The x value
    // is a bar position (index) on the categorical axis, so a fractional
    // midpoint sits between two bars.
    const annotations = [];
    categorySpans.forEach((span, catName) => {
        annotations.push({
            x: (span.start + span.end) / 2,
            y: maxPct * 1.15,
            text: `<b>${catName}</b>`,
            showarrow: false,
            font: { size: 10, color: '#e2e8f0' },
            xanchor: 'center',
            yanchor: 'bottom'
        });
    });

    const layout = {
        ...darkLayout,
        xaxis: {
            ...darkLayout.xaxis,
            tickangle: -30,
            tickfont: { size: 10, color: '#94a3b8' }
        },
        yaxis: {
            ...darkLayout.yaxis,
            title: { text: 'Percentage (%)', font: { size: 11, color: '#e2e8f0' } },
            // Headroom above the tallest bar for the category labels.
            range: [0, maxPct * 1.25]
        },
        annotations: annotations,
        margin: { t: 50, r: 20, b: 100, l: 50 }
    };

    Plotly.newPlot('error-chart', traces, layout, plotlyConfig);
}
|
| 794 |
+
|
| 795 |
// ============================================================================
|
| 796 |
// INITIALIZE ALL CHARTS
|
| 797 |
// ============================================================================
|
|
|
|
| 799 |
initScalingCharts();
|
| 800 |
initRankingCharts();
|
| 801 |
initTurnCharts();
|
| 802 |
+
initErrorChart();
|
| 803 |
initProbingCharts();
|
| 804 |
});
|
| 805 |
|
|
|
|
| 814 |
Plotly.Plots.resize(`turn-${s}`);
|
| 815 |
Plotly.Plots.resize(`probing-${s}`);
|
| 816 |
});
|
| 817 |
+
if (document.getElementById('error-chart')) {
|
| 818 |
+
Plotly.Plots.resize('error-chart');
|
| 819 |
+
}
|
| 820 |
}, 100);
|
| 821 |
});
|
data.js
CHANGED
|
@@ -3756,5 +3756,63 @@ const DDR_DATA = {
|
|
| 3756 |
"Qwen3-4B": "#57E389",
|
| 3757 |
"Qwen3-30B-A3B": "#26A269",
|
| 3758 |
"Qwen3-Next-80B-A3B": "#9141AC"
|
| 3759 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3760 |
};
|
|
|
|
| 3756 |
"Qwen3-4B": "#57E389",
|
| 3757 |
"Qwen3-30B-A3B": "#26A269",
|
| 3758 |
"Qwen3-Next-80B-A3B": "#9141AC"
|
| 3759 |
+
},
|
| 3760 |
+
error: [
|
| 3761 |
+
{
|
| 3762 |
+
"main_category": "Fail in Exploration",
|
| 3763 |
+
"subcategory": "Insufficient Breadth",
|
| 3764 |
+
"count": 64,
|
| 3765 |
+
"percentage": 31.1,
|
| 3766 |
+
"color": "#1565C0"
|
| 3767 |
+
},
|
| 3768 |
+
{
|
| 3769 |
+
"main_category": "Fail in Exploration",
|
| 3770 |
+
"subcategory": "Insufficient Depth",
|
| 3771 |
+
"count": 56,
|
| 3772 |
+
"percentage": 27.2,
|
| 3773 |
+
"color": "#42A5F5"
|
| 3774 |
+
},
|
| 3775 |
+
{
|
| 3776 |
+
"main_category": "Poor Data-to-Insight",
|
| 3777 |
+
"subcategory": "Insight Misinterpretation",
|
| 3778 |
+
"count": 19,
|
| 3779 |
+
"percentage": 9.2,
|
| 3780 |
+
"color": "#2E7D32"
|
| 3781 |
+
},
|
| 3782 |
+
{
|
| 3783 |
+
"main_category": "Poor Data-to-Insight",
|
| 3784 |
+
"subcategory": "Superficial Analysis",
|
| 3785 |
+
"count": 16,
|
| 3786 |
+
"percentage": 7.8,
|
| 3787 |
+
"color": "#43A047"
|
| 3788 |
+
},
|
| 3789 |
+
{
|
| 3790 |
+
"main_category": "Poor Data-to-Insight",
|
| 3791 |
+
"subcategory": "Over Reasoning",
|
| 3792 |
+
"count": 15,
|
| 3793 |
+
"percentage": 7.3,
|
| 3794 |
+
"color": "#81C784"
|
| 3795 |
+
},
|
| 3796 |
+
{
|
| 3797 |
+
"main_category": "Lost in Context",
|
| 3798 |
+
"subcategory": "Lost in Debugging",
|
| 3799 |
+
"count": 18,
|
| 3800 |
+
"percentage": 8.7,
|
| 3801 |
+
"color": "#C62828"
|
| 3802 |
+
},
|
| 3803 |
+
{
|
| 3804 |
+
"main_category": "Lost in Context",
|
| 3805 |
+
"subcategory": "Fail in Summarization",
|
| 3806 |
+
"count": 10,
|
| 3807 |
+
"percentage": 4.9,
|
| 3808 |
+
"color": "#E53935"
|
| 3809 |
+
},
|
| 3810 |
+
{
|
| 3811 |
+
"main_category": "Lost in Context",
|
| 3812 |
+
"subcategory": "Poor Instruction Following",
|
| 3813 |
+
"count": 8,
|
| 3814 |
+
"percentage": 3.9,
|
| 3815 |
+
"color": "#EF9A9A"
|
| 3816 |
+
}
|
| 3817 |
+
]
|
| 3818 |
};
|
index.html
CHANGED
|
@@ -46,6 +46,7 @@
|
|
| 46 |
<button class="nav-tab active" data-section="scaling">π Scaling Analysis</button>
|
| 47 |
<button class="nav-tab" data-section="ranking">π Ranking Comparison</button>
|
| 48 |
<button class="nav-tab" data-section="turn">π Turn Distribution</button>
|
|
|
|
| 49 |
<button class="nav-tab" data-section="probing">π Probing Results</button>
|
| 50 |
</nav>
|
| 51 |
|
|
@@ -79,29 +80,35 @@
|
|
| 79 |
</div>
|
| 80 |
</section>
|
| 81 |
|
| 82 |
-
<!-- Ranking Comparison Section - 3 charts -->
|
| 83 |
<section id="ranking" class="section">
|
| 84 |
<div class="section-header">
|
| 85 |
<h2>Novelty vs Accuracy Ranking</h2>
|
| 86 |
<p>Compare model rankings based on Bradley-Terry pairwise ranking against traditional accuracy ranking.
|
| 87 |
</p>
|
| 88 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
<div class="charts-grid three-col">
|
| 90 |
<div class="chart-card">
|
| 91 |
<h3>MIMIC</h3>
|
| 92 |
-
<div id="ranking-mimic" class="chart-container-
|
| 93 |
</div>
|
| 94 |
<div class="chart-card">
|
| 95 |
<h3>10-K</h3>
|
| 96 |
-
<div id="ranking-10k" class="chart-container-
|
| 97 |
</div>
|
| 98 |
<div class="chart-card">
|
| 99 |
<h3>GLOBEM</h3>
|
| 100 |
-
<div id="ranking-globem" class="chart-container-
|
| 101 |
</div>
|
| 102 |
</div>
|
| 103 |
</section>
|
| 104 |
|
|
|
|
| 105 |
<!-- Turn Distribution Section - 3 charts -->
|
| 106 |
<section id="turn" class="section">
|
| 107 |
<div class="section-header">
|
|
@@ -124,6 +131,19 @@
|
|
| 124 |
</div>
|
| 125 |
</section>
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
<!-- Probing Results Section -->
|
| 128 |
<section id="probing" class="section">
|
| 129 |
<div class="section-header">
|
|
|
|
| 46 |
<button class="nav-tab active" data-section="scaling">π Scaling Analysis</button>
|
| 47 |
<button class="nav-tab" data-section="ranking">π Ranking Comparison</button>
|
| 48 |
<button class="nav-tab" data-section="turn">π Turn Distribution</button>
|
| 49 |
+
<button class="nav-tab" data-section="error">β οΈ Error Analysis</button>
|
| 50 |
<button class="nav-tab" data-section="probing">π Probing Results</button>
|
| 51 |
</nav>
|
| 52 |
|
|
|
|
| 80 |
</div>
|
| 81 |
</section>
|
| 82 |
|
| 83 |
+
<!-- Ranking Comparison Section - 3 charts with toggle -->
|
| 84 |
<section id="ranking" class="section">
|
| 85 |
<div class="section-header">
|
| 86 |
<h2>Novelty vs Accuracy Ranking</h2>
|
| 87 |
<p>Compare model rankings based on Bradley-Terry pairwise ranking against traditional accuracy ranking.
|
| 88 |
</p>
|
| 89 |
</div>
|
| 90 |
+
<div class="dimension-toggle">
|
| 91 |
+
<button class="dim-btn ranking-dim active" data-mode="comparison">π Comparison View</button>
|
| 92 |
+
<button class="dim-btn ranking-dim" data-mode="novelty">π― Novelty Rank</button>
|
| 93 |
+
<button class="dim-btn ranking-dim" data-mode="accuracy">π Accuracy Rank</button>
|
| 94 |
+
</div>
|
| 95 |
<div class="charts-grid three-col">
|
| 96 |
<div class="chart-card">
|
| 97 |
<h3>MIMIC</h3>
|
| 98 |
+
<div id="ranking-mimic" class="chart-container-md"></div>
|
| 99 |
</div>
|
| 100 |
<div class="chart-card">
|
| 101 |
<h3>10-K</h3>
|
| 102 |
+
<div id="ranking-10k" class="chart-container-md"></div>
|
| 103 |
</div>
|
| 104 |
<div class="chart-card">
|
| 105 |
<h3>GLOBEM</h3>
|
| 106 |
+
<div id="ranking-globem" class="chart-container-md"></div>
|
| 107 |
</div>
|
| 108 |
</div>
|
| 109 |
</section>
|
| 110 |
|
| 111 |
+
|
| 112 |
<!-- Turn Distribution Section - 3 charts -->
|
| 113 |
<section id="turn" class="section">
|
| 114 |
<div class="section-header">
|
|
|
|
| 131 |
</div>
|
| 132 |
</section>
|
| 133 |
|
| 134 |
+
<!-- Error Analysis Section -->
|
| 135 |
+
<section id="error" class="section">
|
| 136 |
+
<div class="section-header">
|
| 137 |
+
<h2>Error Type Analysis</h2>
|
| 138 |
+
<p>Breakdown of error types encountered during agent interactions, grouped by main categories.</p>
|
| 139 |
+
</div>
|
| 140 |
+
<div class="charts-grid single">
|
| 141 |
+
<div class="chart-card wide">
|
| 142 |
+
<div id="error-chart" class="chart-container-md"></div>
|
| 143 |
+
</div>
|
| 144 |
+
</div>
|
| 145 |
+
</section>
|
| 146 |
+
|
| 147 |
<!-- Probing Results Section -->
|
| 148 |
<section id="probing" class="section">
|
| 149 |
<div class="section-header">
|
styles.css
CHANGED
|
@@ -294,11 +294,27 @@ body {
|
|
| 294 |
min-height: 300px;
|
| 295 |
}
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
.chart-container-tall {
|
| 298 |
height: 550px;
|
| 299 |
min-height: 500px;
|
| 300 |
}
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
/* Footer */
|
| 303 |
.footer {
|
| 304 |
text-align: center;
|
|
|
|
| 294 |
min-height: 300px;
|
| 295 |
}
|
| 296 |
|
| 297 |
+
.chart-container-md {
|
| 298 |
+
height: 450px;
|
| 299 |
+
min-height: 400px;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
.chart-container-tall {
|
| 303 |
height: 550px;
|
| 304 |
min-height: 500px;
|
| 305 |
}
|
| 306 |
|
| 307 |
+
/* Single chart grid */
|
| 308 |
+
.charts-grid.single {
|
| 309 |
+
grid-template-columns: 1fr;
|
| 310 |
+
max-width: 1000px;
|
| 311 |
+
margin: 0 auto;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
.chart-card.wide {
|
| 315 |
+
padding: 1.5rem;
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
/* Footer */
|
| 319 |
.footer {
|
| 320 |
text-align: center;
|