thinkwee
committed on
Commit
·
9200a73
1
Parent(s):
9c2d624
fix display
Browse files- charts.js +100 -85
- index.html +6 -8
charts.js
CHANGED
|
@@ -362,150 +362,165 @@ document.querySelectorAll('.dim-btn:not(.probing-dim)').forEach(btn => {
|
|
| 362 |
});
|
| 363 |
|
| 364 |
// ============================================================================
|
| 365 |
-
// RANKING COMPARISON -
|
| 366 |
// ============================================================================
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
-
function
|
| 370 |
const scenarios = [
|
| 371 |
{ key: 'MIMIC', id: 'mimic' },
|
| 372 |
{ key: '10K', id: '10k' },
|
| 373 |
{ key: 'GLOBEM', id: 'globem' }
|
| 374 |
];
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
scenarios.forEach(({ key, id }) => {
|
| 377 |
const rawData = DDR_DATA.ranking[key];
|
| 378 |
if (!rawData) return;
|
| 379 |
|
| 380 |
-
// Sort
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
}
|
| 385 |
-
sortedModels = [...rawData].sort((a, b) => a.acc_rank - b.acc_rank);
|
| 386 |
-
}
|
| 387 |
|
| 388 |
-
//
|
| 389 |
-
const models = sortedModels.slice(0, 12);
|
| 390 |
const traces = [];
|
|
|
|
| 391 |
|
| 392 |
-
//
|
| 393 |
-
const primaryColor = mode === 'novelty' ? '#8B5CF6' : '#22C55E';
|
| 394 |
-
const secondaryColor = mode === 'novelty' ? '#22C55E' : '#8B5CF6';
|
| 395 |
-
const primaryLabel = mode === 'novelty' ? 'Novelty Rank' : 'Accuracy Rank';
|
| 396 |
-
const secondaryLabel = mode === 'novelty' ? 'Accuracy Rank' : 'Novelty Rank';
|
| 397 |
-
|
| 398 |
-
// Connection lines (dashed) from primary to secondary
|
| 399 |
models.forEach((m, i) => {
|
| 400 |
-
const
|
| 401 |
-
const secondaryX = mode === 'novelty' ? m.acc_rank : m.bt_rank;
|
| 402 |
-
|
| 403 |
traces.push({
|
| 404 |
-
x: [
|
| 405 |
y: [i, i],
|
| 406 |
mode: 'lines',
|
| 407 |
line: {
|
| 408 |
-
color: 'rgba(
|
| 409 |
-
width: 1
|
| 410 |
-
dash: '
|
| 411 |
},
|
| 412 |
showlegend: false,
|
| 413 |
hoverinfo: 'skip'
|
| 414 |
});
|
| 415 |
});
|
| 416 |
|
| 417 |
-
//
|
|
|
|
| 418 |
traces.push({
|
| 419 |
-
x: models.map(m =>
|
| 420 |
y: models.map((_, i) => i),
|
| 421 |
mode: 'markers',
|
| 422 |
-
name:
|
| 423 |
marker: {
|
| 424 |
-
size:
|
| 425 |
-
symbol: '
|
| 426 |
-
color:
|
| 427 |
-
line: {
|
| 428 |
},
|
| 429 |
-
text: models.map(m => {
|
| 430 |
-
if (mode === 'novelty') {
|
| 431 |
-
return `<b>${m.model}</b><br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`;
|
| 432 |
-
} else {
|
| 433 |
-
return `<b>${m.model}</b><br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`;
|
| 434 |
-
}
|
| 435 |
-
}),
|
| 436 |
hovertemplate: '%{text}<extra></extra>'
|
| 437 |
});
|
| 438 |
|
| 439 |
-
//
|
|
|
|
| 440 |
traces.push({
|
| 441 |
-
x: models.map(m =>
|
| 442 |
y: models.map((_, i) => i),
|
| 443 |
mode: 'markers',
|
| 444 |
-
name:
|
| 445 |
marker: {
|
| 446 |
-
size:
|
| 447 |
-
symbol: '
|
| 448 |
-
color:
|
| 449 |
-
line: { width:
|
| 450 |
},
|
| 451 |
-
text: models.map(m => {
|
| 452 |
-
if (mode === 'novelty') {
|
| 453 |
-
return `<b>${m.model}</b><br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`;
|
| 454 |
-
} else {
|
| 455 |
-
return `<b>${m.model}</b><br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`;
|
| 456 |
-
}
|
| 457 |
-
}),
|
| 458 |
hovertemplate: '%{text}<extra></extra>'
|
| 459 |
});
|
| 460 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
const layout = {
|
| 462 |
...darkLayout,
|
| 463 |
xaxis: {
|
| 464 |
...darkLayout.xaxis,
|
| 465 |
-
title: { text: 'Rank', font: { size:
|
| 466 |
-
range: [
|
| 467 |
-
dtick:
|
| 468 |
-
tick0:
|
| 469 |
},
|
| 470 |
yaxis: {
|
| 471 |
...darkLayout.yaxis,
|
| 472 |
tickmode: 'array',
|
| 473 |
tickvals: models.map((_, i) => i),
|
| 474 |
-
ticktext: models.map(m =>
|
|
|
|
| 475 |
automargin: true,
|
| 476 |
range: [-0.5, models.length - 0.5]
|
| 477 |
},
|
| 478 |
-
showlegend:
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
y:
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
};
|
| 488 |
|
| 489 |
-
Plotly.
|
| 490 |
});
|
| 491 |
}
|
| 492 |
|
| 493 |
-
function initRankingCharts() {
|
| 494 |
-
renderRankingCharts('novelty');
|
| 495 |
-
}
|
| 496 |
-
|
| 497 |
-
// Ranking mode toggle event listener
|
| 498 |
-
document.querySelectorAll('.ranking-dim').forEach(btn => {
|
| 499 |
-
btn.addEventListener('click', () => {
|
| 500 |
-
document.querySelectorAll('.ranking-dim').forEach(b => b.classList.remove('active'));
|
| 501 |
-
btn.classList.add('active');
|
| 502 |
-
|
| 503 |
-
const mode = btn.dataset.mode;
|
| 504 |
-
currentRankingMode = mode;
|
| 505 |
-
renderRankingCharts(mode);
|
| 506 |
-
});
|
| 507 |
-
});
|
| 508 |
-
|
| 509 |
// ============================================================================
|
| 510 |
// TURN DISTRIBUTION - 3 Charts (Ridgeline style)
|
| 511 |
// ============================================================================
|
|
|
|
| 362 |
});
|
| 363 |
|
| 364 |
// ============================================================================
|
| 365 |
+
// RANKING COMPARISON - Matches Python create_rank_figure.py exactly
|
| 366 |
// ============================================================================
|
| 367 |
+
const RANKING_DISPLAY_NAMES = {
|
| 368 |
+
'run_api_deepseek_deepseek-chat': 'DeepSeek-V3.2',
|
| 369 |
+
'qwen3-next-80b-a3b-instruct': 'Qwen3-Next-80BA3B',
|
| 370 |
+
'qwen2.5-14B-Instruct-1M': 'Qwen2.5-14B-1M',
|
| 371 |
+
'qwen2.5-7B-Instruct-1M': 'Qwen2.5-7B-1M',
|
| 372 |
+
'qwen2.5-14B-Instruct': 'Qwen2.5-14B',
|
| 373 |
+
'qwen2.5-7B-Instruct': 'Qwen2.5-7B',
|
| 374 |
+
'qwen2.5-72B-Instruct': 'Qwen2.5-72B',
|
| 375 |
+
'qwen2.5-32b-instruct': 'Qwen2.5-32B',
|
| 376 |
+
'qwen3-4B-Instruct-2507': 'Qwen3-4B',
|
| 377 |
+
'gemini2.5-flash-lite': 'Gemini2.5-Flash-Lite',
|
| 378 |
+
'gemini2.5-flash': 'Gemini2.5-Flash',
|
| 379 |
+
'gemini2.5-pro': 'Gemini2.5-Pro',
|
| 380 |
+
'claude4.5-sonnet': 'Claude4.5-Sonnet',
|
| 381 |
+
'llama3.3-70B': 'Llama3.3-70B',
|
| 382 |
+
'minimax-m2': 'MiniMax-M2',
|
| 383 |
+
'gpt5mini': 'GPT-5-mini',
|
| 384 |
+
'gpt5-mini': 'GPT-5-mini',
|
| 385 |
+
'gpt5.1': 'GPT-5.1',
|
| 386 |
+
'gpt5.2': 'GPT-5.2',
|
| 387 |
+
'kimi-k2': 'Kimi-K2',
|
| 388 |
+
'glm4.6': 'GLM-4.6',
|
| 389 |
+
'qwen3': 'Qwen3-30B-A3B',
|
| 390 |
+
'gemini3-flash': 'Gemini3-Flash',
|
| 391 |
+
};
|
| 392 |
+
|
| 393 |
+
function getDisplayName(model) {
|
| 394 |
+
return RANKING_DISPLAY_NAMES[model] || model;
|
| 395 |
+
}
|
| 396 |
|
| 397 |
+
function initRankingCharts() {
|
| 398 |
const scenarios = [
|
| 399 |
{ key: 'MIMIC', id: 'mimic' },
|
| 400 |
{ key: '10K', id: '10k' },
|
| 401 |
{ key: 'GLOBEM', id: 'globem' }
|
| 402 |
];
|
| 403 |
|
| 404 |
+
// Colors matching Python script
|
| 405 |
+
const PROPRIETARY_COLOR = '#6A0DAD'; // Vivid purple
|
| 406 |
+
const OPENSOURCE_COLOR = '#228B22'; // Forest green
|
| 407 |
+
|
| 408 |
scenarios.forEach(({ key, id }) => {
|
| 409 |
const rawData = DDR_DATA.ranking[key];
|
| 410 |
if (!rawData) return;
|
| 411 |
|
| 412 |
+
// Sort by acc_rank (like Python: df.sort_values(['acc_rank', 'bt_rank']))
|
| 413 |
+
const sortedModels = [...rawData].sort((a, b) => {
|
| 414 |
+
if (a.acc_rank !== b.acc_rank) return a.acc_rank - b.acc_rank;
|
| 415 |
+
return a.bt_rank - b.bt_rank;
|
| 416 |
+
});
|
|
|
|
|
|
|
| 417 |
|
| 418 |
+
const models = sortedModels; // Use all models (up to 22)
|
|
|
|
| 419 |
const traces = [];
|
| 420 |
+
const topN = models.length;
|
| 421 |
|
| 422 |
+
// Connection lines (dashed black)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
models.forEach((m, i) => {
|
| 424 |
+
const accRankClipped = Math.min(m.acc_rank, topN + 1);
|
|
|
|
|
|
|
| 425 |
traces.push({
|
| 426 |
+
x: [m.bt_rank, accRankClipped],
|
| 427 |
y: [i, i],
|
| 428 |
mode: 'lines',
|
| 429 |
line: {
|
| 430 |
+
color: 'rgba(0, 0, 0, 0.3)',
|
| 431 |
+
width: 1,
|
| 432 |
+
dash: 'dash'
|
| 433 |
},
|
| 434 |
showlegend: false,
|
| 435 |
hoverinfo: 'skip'
|
| 436 |
});
|
| 437 |
});
|
| 438 |
|
| 439 |
+
// Accuracy rank points (hollow diamonds) - drawn first (lower z)
|
| 440 |
+
const accColors = models.map(m => m.is_proprietary ? PROPRIETARY_COLOR : OPENSOURCE_COLOR);
|
| 441 |
traces.push({
|
| 442 |
+
x: models.map(m => m.acc_rank),
|
| 443 |
y: models.map((_, i) => i),
|
| 444 |
mode: 'markers',
|
| 445 |
+
name: 'Accuracy Rank',
|
| 446 |
marker: {
|
| 447 |
+
size: 12,
|
| 448 |
+
symbol: 'diamond-open',
|
| 449 |
+
color: accColors,
|
| 450 |
+
line: { width: 2 }
|
| 451 |
},
|
| 452 |
+
text: models.map(m => `<b>${getDisplayName(m.model)}</b><br>Accuracy Rank: #${m.acc_rank}<br>Accuracy: ${m.accuracy}%`),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
hovertemplate: '%{text}<extra></extra>'
|
| 454 |
});
|
| 455 |
|
| 456 |
+
// Novelty rank points (filled circles) - drawn on top
|
| 457 |
+
const noveltyColors = models.map(m => m.is_proprietary ? PROPRIETARY_COLOR : OPENSOURCE_COLOR);
|
| 458 |
traces.push({
|
| 459 |
+
x: models.map(m => m.bt_rank),
|
| 460 |
y: models.map((_, i) => i),
|
| 461 |
mode: 'markers',
|
| 462 |
+
name: 'Novelty Rank',
|
| 463 |
marker: {
|
| 464 |
+
size: 10,
|
| 465 |
+
symbol: 'circle',
|
| 466 |
+
color: noveltyColors,
|
| 467 |
+
line: { color: '#000', width: 1 }
|
| 468 |
},
|
| 469 |
+
text: models.map(m => `<b>${getDisplayName(m.model)}</b><br>Novelty Rank: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
hovertemplate: '%{text}<extra></extra>'
|
| 471 |
});
|
| 472 |
|
| 473 |
+
// Calculate Spearman correlation
|
| 474 |
+
const btRanks = models.map(m => m.bt_rank);
|
| 475 |
+
const accRanks = models.map(m => m.acc_rank);
|
| 476 |
+
const n = btRanks.length;
|
| 477 |
+
const meanBt = btRanks.reduce((a, b) => a + b, 0) / n;
|
| 478 |
+
const meanAcc = accRanks.reduce((a, b) => a + b, 0) / n;
|
| 479 |
+
let num = 0, denBt = 0, denAcc = 0;
|
| 480 |
+
for (let i = 0; i < n; i++) {
|
| 481 |
+
num += (btRanks[i] - meanBt) * (accRanks[i] - meanAcc);
|
| 482 |
+
denBt += (btRanks[i] - meanBt) ** 2;
|
| 483 |
+
denAcc += (accRanks[i] - meanAcc) ** 2;
|
| 484 |
+
}
|
| 485 |
+
const rho = num / Math.sqrt(denBt * denAcc);
|
| 486 |
+
|
| 487 |
const layout = {
|
| 488 |
...darkLayout,
|
| 489 |
xaxis: {
|
| 490 |
...darkLayout.xaxis,
|
| 491 |
+
title: { text: 'Rank', font: { size: 10, color: '#e2e8f0' } },
|
| 492 |
+
range: [topN + 0.5, 0.5], // Inverted: high ranks left, 1 on right
|
| 493 |
+
dtick: 2,
|
| 494 |
+
tick0: 2
|
| 495 |
},
|
| 496 |
yaxis: {
|
| 497 |
...darkLayout.yaxis,
|
| 498 |
tickmode: 'array',
|
| 499 |
tickvals: models.map((_, i) => i),
|
| 500 |
+
ticktext: models.map(m => getDisplayName(m.model)),
|
| 501 |
+
tickfont: { size: 8, color: '#94a3b8' },
|
| 502 |
automargin: true,
|
| 503 |
range: [-0.5, models.length - 0.5]
|
| 504 |
},
|
| 505 |
+
showlegend: false,
|
| 506 |
+
annotations: [{
|
| 507 |
+
x: 0.02,
|
| 508 |
+
y: 0.98,
|
| 509 |
+
xref: 'paper',
|
| 510 |
+
yref: 'paper',
|
| 511 |
+
text: `ρ = ${rho.toFixed(2)}`,
|
| 512 |
+
showarrow: false,
|
| 513 |
+
font: { size: 11, color: '#94a3b8', family: 'Inter' },
|
| 514 |
+
bgcolor: 'rgba(30, 41, 59, 0.8)',
|
| 515 |
+
borderpad: 4
|
| 516 |
+
}],
|
| 517 |
+
margin: { t: 15, r: 10, b: 40, l: 110 }
|
| 518 |
};
|
| 519 |
|
| 520 |
+
Plotly.newPlot(`ranking-${id}`, traces, layout, plotlyConfig);
|
| 521 |
});
|
| 522 |
}
|
| 523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
// ============================================================================
|
| 525 |
// TURN DISTRIBUTION - 3 Charts (Ridgeline style)
|
| 526 |
// ============================================================================
|
index.html
CHANGED
|
@@ -75,24 +75,22 @@
|
|
| 75 |
<section id="ranking" class="section visible">
|
| 76 |
<div class="section-header">
|
| 77 |
<h2>π Ranking Comparison</h2>
|
| 78 |
-
<p>
|
| 79 |
-
|
| 80 |
-
<div class="dimension-toggle">
|
| 81 |
-
<button class="dim-btn ranking-dim active" data-mode="novelty">🎯 Novelty Rank</button>
|
| 82 |
-
<button class="dim-btn ranking-dim" data-mode="accuracy">π Accuracy Rank</button>
|
| 83 |
</div>
|
| 84 |
<div class="charts-grid three-col">
|
| 85 |
<div class="chart-card">
|
| 86 |
<h3>MIMIC</h3>
|
| 87 |
-
<div id="ranking-mimic" class="chart-container"></div>
|
| 88 |
</div>
|
| 89 |
<div class="chart-card">
|
| 90 |
<h3>10-K</h3>
|
| 91 |
-
<div id="ranking-10k" class="chart-container"></div>
|
| 92 |
</div>
|
| 93 |
<div class="chart-card">
|
| 94 |
<h3>GLOBEM</h3>
|
| 95 |
-
<div id="ranking-globem" class="chart-container"></div>
|
|
|
|
| 96 |
</div>
|
| 97 |
</div>
|
| 98 |
</section>
|
|
|
|
| 75 |
<section id="ranking" class="section visible">
|
| 76 |
<div class="section-header">
|
| 77 |
<h2>π Ranking Comparison</h2>
|
| 78 |
+
<p>Novelty (Bradley-Terry pairwise) vs Accuracy ranking. ● = Novelty Rank, ◇ = Accuracy Rank. Purple =
|
| 79 |
+
Proprietary, Green = Open-source.</p>
|
|
|
|
|
|
|
|
|
|
| 80 |
</div>
|
| 81 |
<div class="charts-grid three-col">
|
| 82 |
<div class="chart-card">
|
| 83 |
<h3>MIMIC</h3>
|
| 84 |
+
<div id="ranking-mimic" class="chart-container-tall"></div>
|
| 85 |
</div>
|
| 86 |
<div class="chart-card">
|
| 87 |
<h3>10-K</h3>
|
| 88 |
+
<div id="ranking-10k" class="chart-container-tall"></div>
|
| 89 |
</div>
|
| 90 |
<div class="chart-card">
|
| 91 |
<h3>GLOBEM</h3>
|
| 92 |
+
<div id="ranking-globem" class="chart-container-tall"></div>
|
| 93 |
+
|
| 94 |
</div>
|
| 95 |
</div>
|
| 96 |
</section>
|