base-eval / index.html
maxidl's picture
Upload index.html with huggingface_hub
6a172ea verified
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ellamind base-eval</title>
<script src="https://cdn.plot.ly/plotly-2.35.2.min.js" charset="utf-8"></script>
<script type="importmap">
{
"imports": {
"@huggingface/hub": "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.21.0/+esm"
}
}
</script>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
background: #f8f9fa;
color: #1a1a2e;
padding: 24px;
}
/* ── Page header ─────────────────────────────── */
.page-header {
text-align: center;
margin-bottom: 24px;
}
.page-header h1 {
font-size: 1.5rem;
font-weight: 600;
color: #1a1a2e;
}
.btn {
padding: 8px 16px;
border: 1px solid #dee2e6;
border-radius: 6px;
background: #fff;
font-size: 0.875rem;
color: #495057;
cursor: pointer;
transition: background 0.15s;
}
.btn:hover { background: #e9ecef; }
.btn-primary {
background: #4361ee;
color: #fff;
border-color: #4361ee;
}
.btn-primary:hover { background: #3a56d4; }
.btn-sm {
padding: 4px 10px;
font-size: 0.75rem;
}
.btn-danger { color: #e63946; border-color: #e6394640; }
.btn-danger:hover { background: #e6394610; }
/* ── Panels grid ────────────────────────────── */
#panels-container {
display: grid;
grid-template-columns: 1fr;
gap: 20px;
}
/* ── Panel ───────────────────────────────────── */
.panel {
background: #fff;
border: 1px solid #dee2e6;
border-radius: 8px;
overflow: hidden;
position: relative;
width: 1400px;
max-width: 100%;
margin: 0 auto;
}
.panel-toolbar {
display: flex;
align-items: center;
justify-content: flex-end;
gap: 6px;
padding: 6px 10px;
border-bottom: 1px solid #dee2e6;
background: #f8f9fa;
}
.panel-controls {
padding: 16px;
border-bottom: 1px solid #dee2e6;
}
.panel-controls.collapsed { display: none; }
.controls-row {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: flex-end;
}
.controls-row + .controls-row { margin-top: 12px; }
.control-group {
display: flex;
flex-direction: column;
gap: 4px;
}
.control-group label {
font-size: 0.7rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #6c757d;
}
select {
padding: 6px 10px;
border: 1px solid #dee2e6;
border-radius: 6px;
background: #fff;
font-size: 0.8rem;
color: #1a1a2e;
min-width: 160px;
cursor: pointer;
}
select:focus {
outline: none;
border-color: #4361ee;
box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15);
}
/* ── Models section ──────────────────────────── */
.models-section {
margin-top: 12px;
}
.models-header {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 8px;
}
.models-header span {
font-size: 0.7rem;
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.05em;
color: #6c757d;
}
.checkbox-grid {
display: flex;
flex-wrap: wrap;
gap: 6px 16px;
}
.checkbox-item {
display: flex;
align-items: center;
gap: 5px;
cursor: pointer;
font-size: 0.8rem;
}
.checkbox-item input[type="checkbox"] {
width: 14px;
height: 14px;
cursor: pointer;
accent-color: #4361ee;
}
.checkbox-item .color-dot {
cursor: pointer;
border: 1px solid transparent;
transition: border-color 0.15s;
}
.checkbox-item .color-dot:hover {
border-color: #888;
}
.checkbox-item .model-name.missing {
text-decoration: line-through;
opacity: 0.5;
cursor: help;
}
.checkbox-item .model-name.missing:hover {
opacity: 0.8;
}
.model-separator {
width: 100%;
border-top: 1px solid #eee;
margin: 4px 0;
}
/* ── Chart ───────────────────────────────────── */
.panel-chart-wrapper {
position: relative;
}
.panel-chart {
min-height: 100px;
overflow: hidden;
}
.title-hover-zone {
position: absolute;
top: 0;
left: 50px;
right: 50px;
height: 40px;
cursor: pointer;
z-index: 10;
display: flex;
align-items: center;
justify-content: center;
pointer-events: none;
}
.title-hover-zone > * {
pointer-events: auto;
}
.title-info-icon {
position: absolute;
top: 50%;
transform: translateY(-50%);
width: 18px;
height: 18px;
border-radius: 50%;
background: #e9ecef;
color: #495057;
font-size: 11px;
font-weight: 600;
display: flex;
align-items: center;
justify-content: center;
opacity: 0.6;
transition: opacity 0.15s;
}
.title-hover-zone:hover .title-info-icon {
opacity: 1;
}
/* ── Task quality stats ─────────────────────── */
.task-stats {
display: flex;
flex-direction: column;
padding: 6px 16px;
border-top: 1px solid #dee2e6;
background: #f8f9fa;
font-size: 0.72rem;
color: #495057;
gap: 3px;
}
.task-stats:empty { display: none; }
.stat-row {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 4px 12px;
}
.stat-stage {
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.04em;
color: #495057;
min-width: 52px;
font-size: 0.68rem;
}
.stat-item {
display: flex;
align-items: center;
gap: 3px;
}
.stat-label {
font-weight: 600;
text-transform: uppercase;
letter-spacing: 0.03em;
color: #6c757d;
font-size: 0.68rem;
}
.stat-value {
font-variant-numeric: tabular-nums;
}
.stat-value.good { color: #2a9d8f; }
.stat-value.ok { color: #e9c46a; }
.stat-value.bad { color: #e63946; }
.stat-help {
display: inline-block;
width: 14px;
height: 14px;
border-radius: 50%;
background: #e9ecef;
color: #6c757d;
font-size: 9px;
font-weight: 700;
text-align: center;
line-height: 14px;
cursor: help;
}
/* ── Resize handle ──────────────────────────── */
.panel-width-handle {
position: absolute;
top: 0;
right: -4px;
width: 8px;
height: 100%;
cursor: ew-resize;
z-index: 20;
transition: background-color 0.15s;
}
.panel-width-handle:hover,
.panel-width-handle.active {
background-color: #e9ecef;
}
.panel-resize-handle {
height: 6px;
cursor: ns-resize;
background: linear-gradient(to bottom, #dee2e6 1px, transparent 1px, transparent 3px, #dee2e6 3px);
background-size: 100% 4px;
background-position: center;
transition: background-color 0.15s;
}
.panel-resize-handle:hover,
.panel-resize-handle.active {
background-color: #e9ecef;
}
.loading {
display: flex;
align-items: center;
justify-content: center;
padding: 1rem 0;
color: #adb5bd;
font-size: 0.85rem;
}
/* ── Custom tooltip ──────────────────────────── */
.custom-tooltip {
position: fixed;
pointer-events: none;
background: rgba(0, 0, 0, 0.85);
color: #fff;
padding: 8px 12px 12px;
border-radius: 4px;
font-size: 11px;
line-height: 1.5;
z-index: 9999;
display: none;
white-space: nowrap;
}
.custom-tooltip.scrollable {
pointer-events: auto;
overflow-y: auto;
white-space: normal;
min-width: 200px;
max-width: 400px;
}
/* ── Merge dataset ───────────────────────────── */
.merge-dataset-row {
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
margin-bottom: 16px;
flex-wrap: wrap;
}
.merge-dataset-row input[type="text"] {
padding: 6px 10px;
border: 1px solid #dee2e6;
border-radius: 6px;
font-size: 0.8rem;
color: #1a1a2e;
width: 420px;
max-width: 60vw;
}
.merge-dataset-row input[type="text"]:focus,
.hf-auth-row input[type="password"]:focus {
outline: none;
border-color: #4361ee;
box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15);
}
.hf-auth-row input[type="password"] {
padding: 6px 10px;
border: 1px solid #dee2e6;
border-radius: 6px;
font-size: 0.8rem;
color: #1a1a2e;
width: 420px;
max-width: 50vw;
}
.hf-auth-row {
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
margin-bottom: 8px;
}
.hf-auth-row .hf-user {
font-size: 0.8rem;
color: #495057;
}
.hf-signin-img {
cursor: pointer;
height: 28px;
}
.hf-auth-row label {
font-size: 0.75rem;
color: #6c757d;
cursor: pointer;
display: flex;
align-items: center;
gap: 4px;
}
.hf-auth-row label input {
accent-color: #4361ee;
}
.merge-dataset-row .merge-status {
font-size: 0.75rem;
color: #6c757d;
}
.merge-dataset-row .merge-status.error {
color: #e63946;
}
.merged-tags {
display: flex;
flex-wrap: wrap;
gap: 6px;
justify-content: center;
margin-bottom: 12px;
}
.merged-tag {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 3px 10px;
background: #e9ecef;
border-radius: 12px;
font-size: 0.75rem;
color: #495057;
}
.merged-tag button {
background: none;
border: none;
cursor: pointer;
color: #e63946;
font-size: 0.85rem;
line-height: 1;
padding: 0 2px;
}
.merged-tag button:hover { color: #c5303c; }
/* ── Add panel button ────────────────────────── */
.add-panel-row {
display: flex;
justify-content: center;
padding: 20px;
}
/* ── Init loading ────────────────────────────── */
#init-loading {
display: flex;
align-items: center;
justify-content: center;
height: 300px;
color: #6c757d;
font-size: 1rem;
}
</style>
</head>
<body>
<div class="page-header">
<h1>ellamind base-eval</h1>
<p style="margin:4px 0 0;font-size:13px;color:#6c757d;">Benchmarks: <a href="https://github.com/ellamind/base-eval" target="_blank" rel="noopener" style="color:#4361ee;">ellamind/base-eval</a> · Data: <a href="https://huggingface.co/datasets/ellamind/eval-scores-ref" target="_blank" rel="noopener" style="color:#4361ee;">ellamind/eval-scores-ref</a></p>
</div>
<div id="init-loading">Initializing DuckDB...</div>
<!-- Hugging Face authentication row. The row and each child start hidden via inline display:none. -->
<div class="hf-auth-row" id="hf-auth-row" style="display:none">
  <img id="hf-signin-btn" class="hf-signin-img" src="https://huggingface.co/datasets/huggingface/badges/resolve/main/sign-in-with-huggingface-sm-dark.svg" alt="Sign in with Hugging Face" style="display:none">
  <label id="hf-private-label" style="display:none"><input type="checkbox" id="hf-private-toggle"> Include private repos</label>
  <span id="hf-user" class="hf-user" style="display:none"></span>
  <button type="button" class="btn btn-sm" id="hf-signout-btn" style="display:none">Sign out</button>
  <!-- A placeholder is not an accessible name, so the input also carries an aria-label. -->
  <input type="password" id="hf-token-input" placeholder="HF token (for private datasets)" aria-label="Hugging Face access token (for private datasets)" style="display:none">
</div>
<!-- Merge-dataset row: dataset path input, trigger button, and a status message slot. Starts hidden. -->
<div class="merge-dataset-row" id="merge-dataset-row" style="display:none">
  <!-- A placeholder is not an accessible name, so the input also carries an aria-label. -->
  <input type="text" id="merge-dataset-input" placeholder="HF dataset path, e.g. org/dataset-name or org/dataset-name/file.parquet" aria-label="Hugging Face dataset path to merge">
  <button type="button" class="btn btn-primary btn-sm" id="btn-merge-dataset">Merge Dataset</button>
  <!-- role="status" makes text written into this span a polite live-region announcement. -->
  <span class="merge-status" id="merge-status" role="status"></span>
</div>
<div class="merged-tags" id="merged-tags"></div>
<div id="panels-container"></div>
<div class="custom-tooltip" id="custom-tooltip"></div>
<div class="add-panel-row" id="add-panel-row" style="display:none">
<button class="btn btn-primary" id="btn-add-panel">+ Add Panel</button>
</div>
<script type="module">
import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.29.0/+esm';
import jsyaml from 'https://cdn.jsdelivr.net/npm/js-yaml@4.1.0/+esm';
import { oauthLoginUrl, oauthHandleRedirectIfPresent } from '@huggingface/hub';
// ── Globals ─────────────────────────────────────────────────
// Hugging Face access token; null until one is supplied.
let hfAccessToken = null; // set by OAuth or manual input
// DuckDB-WASM database handle and its open connection; both are assigned by initDuckDB().
let db = null;
let conn = null;
// NOTE(review): presumably the source of unique panel ids — the increment site is outside this view.
let panelCounter = 0;
// Live Panel instances keyed by panel id (Panel.remove() deletes its own entry).
const panels = new Map();
// Fallback palette for models without a config color
const COLOR_PALETTE = [
'#4361ee', '#e63946', '#2a9d8f', '#e9c46a', '#f4a261',
'#264653', '#7209b7', '#06d6a0', '#ef476f', '#ff6b6b',
'#48bfe3', '#d4a017', '#b5838d', '#588157', '#9d4edd',
'#f77f00', '#3a86a7', '#8338ec', '#ff006e', '#fb5607',
];
// Remote Parquet file with the reference scores; loadParquet() downloads it in full.
const PARQUET_URL = 'https://huggingface.co/datasets/ellamind/eval-scores-ref/resolve/main/scores.parquet';
// Shared model info (loaded once)
let ALL_MODELS = []; // [{model, model_display_name, is_checkpoint}]
// Model display name -> color string; rebuilt by loadModels().
let MODEL_COLORS = {};
let CONFIG = {}; // parsed config.yaml
// Merged datasets tracking
let mergedDatasets = []; // [{id, label, url}]
// NOTE(review): presumably feeds the `id` of mergedDatasets entries — usage is outside this view.
let mergeCounter = 0;
// ── DuckDB init ─────────────────────────────────────────────
/**
 * Start DuckDB-WASM in a web worker and open the shared connection.
 *
 * Side effects: assigns the module-level `db` (AsyncDuckDB instance) and
 * `conn` (connection returned by `db.connect()`).
 *
 * The worker script is loaded through a temporary blob URL that simply
 * `importScripts()` the bundle's main worker. The blob URL is revoked in a
 * `finally` block so it is released even when instantiation rejects.
 *
 * @returns {Promise<void>}
 */
async function initDuckDB() {
  const bundles = duckdb.getJsDelivrBundles();
  const bundle = await duckdb.selectBundle(bundles);
  const workerUrl = URL.createObjectURL(
    new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
  );
  try {
    const worker = new Worker(workerUrl);
    const logger = new duckdb.ConsoleLogger();
    db = new duckdb.AsyncDuckDB(logger, worker);
    await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
  } finally {
    // Always release the temporary URL, including on a failed instantiate().
    URL.revokeObjectURL(workerUrl);
  }
  conn = await db.connect();
}
/**
 * Download the reference scores Parquet file, register it with DuckDB under
 * the name 'scores.parquet', and expose it as the `scores` view.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the download responds with a non-2xx HTTP status, so
 *   an error page is never registered as if it were Parquet data.
 */
async function loadParquet() {
  const response = await fetch(PARQUET_URL);
  if (!response.ok) {
    throw new Error(`Failed to download ${PARQUET_URL}: HTTP ${response.status} ${response.statusText}`);
  }
  const buffer = new Uint8Array(await response.arrayBuffer());
  await db.registerFileBuffer('scores.parquet', buffer);
  await conn.query(`CREATE VIEW scores AS SELECT * FROM 'scores.parquet'`);
}
// ── SQL helpers ─────────────────────────────────────────────
/**
 * Run a SQL statement on the shared connection and return its rows as plain
 * objects (each result row converted with `toJSON()`).
 *
 * @param {string} sql
 * @returns {Promise<Object[]>}
 */
async function query(sql) {
  const table = await conn.query(sql);
  const toPlainObject = (row) => row.toJSON();
  return table.toArray().map(toPlainObject);
}
/** Escape a string for a single-quoted SQL literal by doubling every `'`. */
function esc(s) {
  return s.split("'").join("''");
}
/** Render an array of strings as a comma-separated list of quoted SQL literals (for `IN (...)`). */
function sqlIn(vals) {
  const quoteLiteral = (v) => "'" + esc(v) + "'";
  return vals.map(quoteLiteral).join(', ');
}
// ── Shared helpers ──────────────────────────────────────────
/**
 * Replace the options of a <select> element.
 *
 * @param {HTMLSelectElement} el - element whose options are rebuilt
 * @param {Array<string|{value: string, label: string}>} options - plain
 *   values (used as both value and text) or {value, label} pairs
 * @param {string} [selected] - value to select, applied only when it is
 *   truthy and present among `options`
 */
function populateSelect(el, options, selected) {
  const valueOf = (opt) => (typeof opt === 'object' ? opt.value : opt);

  el.innerHTML = '';
  for (const opt of options) {
    const node = document.createElement('option');
    const isPair = typeof opt === 'object';
    node.value = isPair ? opt.value : opt;
    node.textContent = isPair ? opt.label : opt;
    el.appendChild(node);
  }

  if (!selected) return;
  if (options.some((opt) => valueOf(opt) === selected)) {
    el.value = selected;
  }
}
/**
 * Format a count with a T/B/M/K suffix at the given number of decimals.
 * Values below 1000 are returned via `toString()` with no suffix.
 *
 * @param {number} value
 * @param {number} precision - digits passed to `toFixed`
 * @returns {string}
 */
function formatTokensSingle(value, precision) {
  const units = [
    [1e12, 'T'],
    [1e9, 'B'],
    [1e6, 'M'],
    [1e3, 'K'],
  ];
  for (const [scale, suffix] of units) {
    if (value >= scale) return (value / scale).toFixed(precision) + suffix;
  }
  return value.toString();
}
/**
 * Format one token count for display: 'N/A' for null/undefined/NaN, otherwise
 * a suffixed value with one decimal at the trillion scale and none below it.
 *
 * @param {number|null|undefined} value
 * @returns {string}
 */
function formatTokens(value) {
  const missing = value == null || isNaN(value);
  if (missing) return 'N/A';
  const precision = value >= 1e12 ? 1 : 0;
  return formatTokensSingle(value, precision);
}
/**
 * Format a list of token counts using the smallest precision (up to 3
 * decimals) at which all labels are distinct. The starting precision is 1
 * when the first value is at least 1e12, otherwise 0. If no precision yields
 * unique labels, the 3-decimal labels are returned anyway.
 *
 * @param {number[]} values
 * @returns {string[]}
 */
function formatTokensArray(values) {
  const MAX_PRECISION = 3;
  let precision = values[0] >= 1e12 ? 1 : 0;
  while (precision <= MAX_PRECISION) {
    const labels = values.map((v) => formatTokensSingle(v, precision));
    const allDistinct = new Set(labels).size === labels.length;
    if (allDistinct) return labels;
    precision += 1;
  }
  return values.map((v) => formatTokensSingle(v, MAX_PRECISION));
}
/**
 * Build a list of integer tick values covering [min, max].
 *
 * `min` is clamped to 0. The step is the "nice" value (1, 2, 5 or 10 times a
 * power of ten) closest to range / maxTicks. Ticks are rounded to integers;
 * when the step is below 1 several raw ticks round to the same integer, so
 * duplicates are dropped to keep the result strictly increasing.
 *
 * @param {number} min
 * @param {number} max
 * @param {number} [maxTicks=8] - target number of intervals
 * @returns {number[]} ascending tick values; `[min]` when the range is empty
 */
function niceTicks(min, max, maxTicks = 8) {
  min = Math.max(0, min);
  if (min >= max) return [min];
  const range = max - min;
  // Find a "nice" step size: 1, 2, 5 × 10^n
  const rawStep = range / maxTicks;
  const mag = Math.pow(10, Math.floor(Math.log10(rawStep)));
  const normalized = rawStep / mag;
  let step;
  if (normalized <= 1.5) step = 1 * mag;
  else if (normalized <= 3.5) step = 2 * mag;
  else if (normalized <= 7.5) step = 5 * mag;
  else step = 10 * mag;
  const start = Math.ceil(min / step) * step;
  const ticks = [];
  for (let v = start; v <= max; v += step) {
    const tick = Math.round(v);
    // Rounding can collapse neighbouring sub-integer ticks onto one value.
    if (ticks[ticks.length - 1] !== tick) ticks.push(tick);
  }
  // Always include min/max endpoints if not already close (and not already present)
  const lo = Math.round(min);
  const hi = Math.round(max);
  if ((ticks.length === 0 || ticks[0] - min > step * 0.3) && ticks[0] !== lo) ticks.unshift(lo);
  if (max - ticks[ticks.length - 1] > step * 0.3 && ticks[ticks.length - 1] !== hi) ticks.push(hi);
  return ticks;
}
/**
 * Bias-corrected exponential moving average.
 *
 * Each output is the running EMA divided by the accumulated weight, which
 * removes the start-up bias toward the zero initial state.
 *
 * @param {number[]} values
 * @param {number} alpha - smoothing factor; for alpha <= 0 the input array
 *   itself is returned unchanged
 * @returns {number[]}
 */
function exponentialMovingAverage(values, alpha) {
  if (alpha <= 0) return values;
  const smoothed = [];
  let running = 0;
  let weight = 0;
  for (const sample of values) {
    running = alpha * running + (1 - alpha) * sample;
    weight = alpha * weight + (1 - alpha);
    smoothed.push(running / weight);
  }
  return smoothed;
}
// ── Task quality metrics ─────────────────────────────────────
/**
 * Spearman rank correlation between two equal-length numeric arrays.
 *
 * Tied values receive their average rank, and the coefficient is computed as
 * the Pearson correlation of the two rank vectors. That is the definition
 * that stays correct with ties; the shortcut 1 - 6*sum(d^2) / (n*(n^2 - 1))
 * is only valid when every rank is distinct.
 *
 * @param {number[]} xs
 * @param {number[]} ys
 * @returns {number} value in [-1, 1]; NaN when there are fewer than 3 points,
 *   the lengths differ, or either array is constant (correlation undefined)
 */
function spearmanCorrelation(xs, ys) {
  const n = xs.length;
  if (n < 3 || ys.length !== n) return NaN;

  // Average ranks (1-based): a run of equal values shares the mean of its positions.
  function rankArray(arr) {
    const indexed = arr.map((v, i) => ({ v, i }));
    indexed.sort((a, b) => a.v - b.v);
    const ranks = new Array(n);
    let i = 0;
    while (i < n) {
      let j = i;
      while (j < n - 1 && indexed[j + 1].v === indexed[j].v) j++;
      const avgRank = (i + j) / 2 + 1;
      for (let k = i; k <= j; k++) ranks[indexed[k].i] = avgRank;
      i = j + 1;
    }
    return ranks;
  }

  const rx = rankArray(xs);
  const ry = rankArray(ys);

  // Average ranks always sum to n(n+1)/2, so both rank vectors share this mean.
  const meanRank = (n + 1) / 2;
  let cov = 0;
  let varX = 0;
  let varY = 0;
  for (let i = 0; i < n; i++) {
    const dx = rx[i] - meanRank;
    const dy = ry[i] - meanRank;
    cov += dx * dy;
    varX += dx * dx;
    varY += dy * dy;
  }
  const denom = Math.sqrt(varX * varY);
  if (!(denom > 0)) return NaN; // a constant series has no defined rank correlation
  // Clamp away floating-point overshoot just outside [-1, 1].
  return Math.max(-1, Math.min(1, cov / denom));
}
/**
 * Kendall's tau between two arrays: (concordant - discordant) / total pairs.
 * Pairs tied in either array count toward the total but toward neither side.
 *
 * @param {number[]} xs
 * @param {number[]} ys
 * @returns {number} NaN when fewer than 2 elements
 */
function kendallTau(xs, ys) {
  const n = xs.length;
  if (n < 2) return NaN;
  let net = 0; // concordant pairs minus discordant pairs
  for (let a = 0; a < n - 1; a++) {
    for (let b = a + 1; b < n; b++) {
      const agreement = (xs[a] - xs[b]) * (ys[a] - ys[b]);
      if (agreement > 0) net += 1;
      else if (agreement < 0) net -= 1;
    }
  }
  const totalPairs = n * (n - 1) / 2;
  return net / totalPairs;
}
// ── Benchmark Goodness Metrics ──────────────────────────────
/**
 * Mean of flip * Spearman(x, y) over the given per-model point slices.
 * Slices with fewer than 3 points, or whose correlation is NaN, are skipped.
 *
 * @param {Array<Array<{x: number, y: number}>>} slices
 * @param {number} flip - multiplier applied to each correlation
 * @returns {number} NaN when no slice contributes
 */
function _monotonicity(slices, flip) {
  const perModel = [];
  for (const pts of slices) {
    if (pts.length < 3) continue;
    const rho = flip * spearmanCorrelation(pts.map((p) => p.x), pts.map((p) => p.y));
    if (!isNaN(rho)) perModel.push(rho);
  }
  if (perModel.length === 0) return NaN;
  let total = 0;
  for (const rho of perModel) total += rho;
  return total / perModel.length;
}
/**
 * Relative improvement (best - first) / |first|, averaged across slices.
 * Slices need at least 2 points. When |first| is at most 0.01 the raw
 * difference is used instead of the ratio. "Best" is the minimum score when
 * `higherIsBetter === false`, otherwise the maximum.
 *
 * @param {Array<Array<{x: number, y: number}>>} slices
 * @param {boolean} higherIsBetter
 * @returns {number} NaN when no slice contributes
 */
function _signalStrength(slices, higherIsBetter) {
  const lowerWins = higherIsBetter === false;
  const gains = [];
  for (const pts of slices) {
    if (pts.length < 2) continue;
    const ys = pts.map((p) => p.y);
    const first = ys[0];
    const best = lowerWins ? Math.min(...ys) : Math.max(...ys);
    const raw = lowerWins ? first - best : best - first;
    const scale = Math.abs(first);
    const gain = scale > 0.01 ? raw / scale : raw;
    if (!isNaN(gain)) gains.push(gain);
  }
  if (gains.length === 0) return NaN;
  let total = 0;
  for (const g of gains) total += g;
  return total / gains.length;
}
/**
 * Median absolute deviation (MAD) of consecutive score differences, averaged
 * across slices. Slices need at least 3 points (i.e. at least 2 differences).
 *
 * The median of an even-length list is the mean of its two middle values.
 * Taking only the upper-middle element would double the MAD of a 3-point
 * slice, whose two diffs a and b should give |a - b| / 2.
 *
 * @param {Array<Array<{x: number, y: number}>>} slices
 * @returns {number} NaN when no slice contributes
 */
function _noise(slices) {
  // Median of an already ascending-sorted, non-empty numeric array.
  const medianOfSorted = (sortedAsc) => {
    const mid = Math.floor(sortedAsc.length / 2);
    return sortedAsc.length % 2 === 1
      ? sortedAsc[mid]
      : (sortedAsc[mid - 1] + sortedAsc[mid]) / 2;
  };
  const ascending = (a, b) => a - b;

  const vals = slices.map((pts) => {
    if (pts.length < 3) return NaN; // need at least 2 diffs for MAD
    const diffs = [];
    for (let i = 1; i < pts.length; i++) diffs.push(pts[i].y - pts[i - 1].y);
    const median = medianOfSorted([...diffs].sort(ascending));
    const absDev = diffs.map((d) => Math.abs(d - median));
    return medianOfSorted(absDev.sort(ascending));
  }).filter((v) => !isNaN(v));
  return vals.length > 0 ? vals.reduce((a, b) => a + b, 0) / vals.length : NaN;
}
/**
 * Average Kendall's tau of model rankings between consecutive x-values.
 *
 * For each pair of neighbouring x-values, only models that have a score at
 * BOTH steps are ranked. A model missing either score would otherwise feed
 * `undefined` into kendallTau, where it matches neither the concordant nor
 * the discordant case yet still counts as a pair, pulling tau toward zero.
 * Steps with fewer than 2 comparable models yield NaN and are skipped.
 *
 * @param {Array<[string, {points: Array<{x: number, y: number}>}]>} checkpointModels
 * @param {number[]} xValues - x-values in ascending order
 * @returns {number} NaN when there are fewer than 2 models, fewer than 2
 *   x-values, or no comparable step
 */
function _orderingConsistency(checkpointModels, xValues) {
  if (checkpointModels.length < 2 || xValues.length < 2) return NaN;
  // name -> { x: y }; plain-object keys keep the original string-coerced x lookup.
  const scoreLookup = {};
  for (const [name, d] of checkpointModels) {
    scoreLookup[name] = {};
    for (const p of d.points) scoreLookup[name][p.x] = p.y;
  }
  const isMissing = (v) => v == null || isNaN(v);
  const taus = [];
  for (let i = 0; i < xValues.length - 1; i++) {
    const x1 = xValues[i], x2 = xValues[i + 1];
    const s1 = [];
    const s2 = [];
    for (const [name] of checkpointModels) {
      const a = scoreLookup[name][x1];
      const b = scoreLookup[name][x2];
      if (isMissing(a) || isMissing(b)) continue;
      s1.push(a);
      s2.push(b);
    }
    const tau = kendallTau(s1, s2); // NaN when fewer than 2 models are comparable
    if (!isNaN(tau)) taus.push(tau);
  }
  return taus.length > 0 ? taus.reduce((a, b) => a + b, 0) / taus.length : NaN;
}
/**
 * Population standard deviation of model scores at the last entry of
 * `xValues`. Models with no point at that x are left out.
 *
 * @param {Array<[string, {points: Array<{x: number, y: number}>}]>} checkpointModels
 * @param {number[]} xValues
 * @returns {number} NaN when there are fewer than 2 models, no x-values, or
 *   fewer than 2 scores at the last x
 */
function _discrimination(checkpointModels, xValues) {
  if (checkpointModels.length < 2 || xValues.length === 0) return NaN;
  const lastX = xValues[xValues.length - 1];

  const scores = [];
  for (const [, d] of checkpointModels) {
    const hit = d.points.find((p) => p.x === lastX);
    const y = hit ? hit.y : NaN;
    if (!isNaN(y)) scores.push(y);
  }
  if (scores.length < 2) return NaN;

  let sum = 0;
  for (const s of scores) sum += s;
  const mean = sum / scores.length;

  let squared = 0;
  for (const s of scores) squared += (s - mean) ** 2;
  return Math.sqrt(squared / scores.length);
}
/**
 * Compute benchmark goodness metrics for three stages: overall, early, late.
 *
 * Only checkpoint models with at least 3 points that have a non-null x take
 * part. Per-model metrics (monotonicity, signal strength, noise) split each
 * model's own point list at its midpoint. Cross-model metrics (ordering,
 * discrimination) use the x-values shared by all models, falling back to
 * x-values present in at least 2 models when fewer than 2 are shared.
 * Points are used in the order given; they are not sorted here.
 *
 * @param {Object} byModel - { modelName: { points: [{x, y}], isCheckpoint } }
 * @param {boolean} higherIsBetter
 * @returns {Object} { overall: {...}, early: {...}, late: {...} } or null
 *   when no model qualifies
 */
function computeTaskQualityMetrics(byModel, higherIsBetter) {
  const flip = higherIsBetter === false ? -1 : 1;
  const ascending = (a, b) => a - b;

  // Keep checkpoint models that still have >= 3 points after dropping null x.
  const checkpointModels = [];
  for (const [name, d] of Object.entries(byModel)) {
    const points = d.points.filter((p) => p.x != null);
    if (d.isCheckpoint && points.length >= 3) {
      checkpointModels.push([name, { ...d, points }]);
    }
  }
  if (checkpointModels.length === 0) return null;

  // Slice each model's own points, so per-model metrics need no shared x-values.
  const perModelSlices = (stage) => checkpointModels.map(([, d]) => {
    const pts = d.points;
    const half = Math.floor(pts.length / 2);
    switch (stage) {
      case 'early': return pts.slice(0, Math.max(half, 1));
      case 'late': return pts.slice(half);
      default: return pts;
    }
  });

  // x-values for cross-model metrics: intersection first, then ">= 2 models" fallback.
  const xSets = checkpointModels.map(([, d]) => new Set(d.points.map((p) => p.x)));
  let commonXs = [...xSets[0]]
    .filter((x) => xSets.every((s) => s.has(x)))
    .sort(ascending);
  if (commonXs.length < 2) {
    const seenIn = {};
    for (const s of xSets) {
      for (const x of s) seenIn[x] = (seenIn[x] || 0) + 1;
    }
    commonXs = Object.entries(seenIn)
      .filter(([, count]) => count >= 2)
      .map(([x]) => Number(x))
      .sort(ascending);
  }

  const commonHalf = Math.floor(commonXs.length / 2);
  const crossXsByStage = {
    overall: commonXs,
    early: commonXs.slice(0, Math.max(commonHalf, 1)),
    late: commonXs.slice(commonHalf),
  };

  const metricsFor = (stage) => {
    const slices = perModelSlices(stage);
    const crossXs = crossXsByStage[stage];
    return {
      monotonicity: _monotonicity(slices, flip),
      signalStrength: _signalStrength(slices, higherIsBetter),
      noise: _noise(slices),
      orderingConsistency: _orderingConsistency(checkpointModels, crossXs),
      discrimination: _discrimination(checkpointModels, crossXs),
    };
  };

  return {
    overall: metricsFor('overall'),
    early: metricsFor('early'),
    late: metricsFor('late'),
  };
}
// Tooltip text for each metric key; renderTaskStats() shows it when the "?" badge is hovered.
const METRIC_HELP = {
monotonicity: 'Spearman rank correlation between training steps and score, averaged across models. Values near 1.0 mean scores consistently improve with training.',
signalStrength: 'Relative improvement: (best \u2212 first) / |first|, averaged across models. Shows how much the task is learned beyond its initial performance.',
noise: 'Median absolute deviation (MAD) of consecutive score differences, averaged across models. Lower = cleaner signal. Uses MAD to be robust to sudden jumps from data-mix changes.',
orderingConsistency: "Average Kendall\u2019s Tau of model rankings between consecutive checkpoint steps. High values mean stable model ordering.",
discrimination: 'Std of scores across models at the last checkpoint in this stage. Higher = task better separates model quality.',
};
// Display order and short labels of the metrics in each stats row.
// Keys match the properties returned per stage by computeTaskQualityMetrics().
const METRIC_ITEMS = [
{ key: 'monotonicity', label: 'Monotonicity' },
{ key: 'signalStrength', label: 'Signal Str.' },
{ key: 'noise', label: 'Noise' },
{ key: 'orderingConsistency', label: 'Ordering' },
{ key: 'discrimination', label: 'Discrim.' },
];
// Row heading shown for each stage key.
const STAGE_LABELS = { overall: 'Overall', early: 'Early', late: 'Late' };
/**
 * Map a metric value to a CSS quality class.
 *
 * @param {string} metric - metric key
 * @param {number|null|undefined} value
 * @returns {string} 'good', 'ok' or 'bad'; '' for missing/NaN values, for
 *   'noise', and for unknown metric keys
 */
function qualityClass(metric, value) {
  if (value == null || isNaN(value)) return '';

  // [minimum for 'good', minimum for 'ok'] per graded metric.
  let thresholds;
  switch (metric) {
    case 'monotonicity': thresholds = [0.7, 0.4]; break;
    case 'signalStrength': thresholds = [0.10, 0.03]; break;
    case 'orderingConsistency': thresholds = [0.6, 0.3]; break;
    case 'discrimination': thresholds = [0.03, 0.01]; break;
    default: return ''; // 'noise' and unknown keys are not graded
  }

  const [goodMin, okMin] = thresholds;
  if (value >= goodMin) return 'good';
  if (value >= okMin) return 'ok';
  return 'bad';
}
/**
 * Render the task-quality metrics into a stats container.
 *
 * The container is cleared first. With falsy `metrics` it is left empty.
 * Otherwise one row is added per stage (overall, early, late) that has data,
 * holding a label, a value ('N/A' when missing/NaN, else 3 decimals) with
 * its quality class, and a "?" badge that shows the metric's help text in
 * the shared #custom-tooltip element while hovered.
 *
 * @param {HTMLElement} statsEl
 * @param {Object|null} metrics - result of computeTaskQualityMetrics()
 */
function renderTaskStats(statsEl, metrics) {
  statsEl.textContent = '';
  if (!metrics) return;

  const tooltip = document.getElementById('custom-tooltip');

  // Build DOM safely: text goes in via textContent only.
  const makeSpan = (className, text) => {
    const node = document.createElement('span');
    node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };

  const showHelp = (anchor, key) => {
    // Leave the tooltip alone while it is in its "scrollable" state.
    if (tooltip.classList.contains('scrollable')) return;
    tooltip.textContent = METRIC_HELP[key];
    tooltip.style.display = 'block';
    tooltip._statTip = true;
    const rect = anchor.getBoundingClientRect();
    tooltip.style.left = rect.left + 'px';
    tooltip.style.top = (rect.top - tooltip.offsetHeight - 4) + 'px';
  };

  const hideHelp = () => {
    // Only hide a tooltip that this stats UI opened.
    if (tooltip._statTip) {
      tooltip.style.display = 'none';
      tooltip._statTip = false;
    }
  };

  const buildItem = ({ key, label }, stageData) => {
    const val = stageData[key];
    const missing = val == null || isNaN(val);
    const display = missing ? 'N/A' : val.toFixed(3);
    const cls = qualityClass(key, val);

    const item = makeSpan('stat-item');
    item.appendChild(makeSpan('stat-label', label + ':'));
    item.appendChild(makeSpan('stat-value' + (cls ? ' ' + cls : ''), display));

    const help = makeSpan('stat-help', '?');
    help.dataset.helpKey = key;
    help.addEventListener('mouseenter', () => showHelp(help, key));
    help.addEventListener('mouseleave', hideHelp);
    item.appendChild(help);
    return item;
  };

  for (const stage of ['overall', 'early', 'late']) {
    const data = metrics[stage];
    if (!data) continue;
    const row = document.createElement('div');
    row.className = 'stat-row';
    row.appendChild(makeSpan('stat-stage', STAGE_LABELS[stage]));
    for (const entry of METRIC_ITEMS) row.appendChild(buildItem(entry, data));
    statsEl.appendChild(row);
  }
}
/**
 * Best-effort load of `config.yaml` (relative to the page) into the
 * module-level CONFIG. A non-OK response leaves CONFIG untouched; a fetch or
 * parse error is logged as a warning and otherwise ignored.
 *
 * @returns {Promise<void>}
 */
async function loadConfig() {
  try {
    const resp = await fetch('config.yaml');
    if (!resp.ok) return;
    const yamlText = await resp.text();
    CONFIG = jsyaml.load(yamlText) || {};
  } catch (e) {
    console.warn('Could not load config.yaml, using defaults:', e);
  }
}
/**
 * Load the model list into ALL_MODELS and rebuild MODEL_COLORS.
 *
 * The query returns every checkpoint row plus those non-checkpoint rows whose
 * model never appears as a checkpoint, ordered with checkpoints first and
 * then by display name. Colors come from CONFIG.model_colors when an entry
 * exists for the display name; otherwise the next COLOR_PALETTE entry is
 * used, cycling when the palette runs out.
 *
 * @returns {Promise<void>}
 */
async function loadModels() {
  const sql = `
WITH raw AS (
SELECT DISTINCT model, model_display_name, is_checkpoint
FROM scores
),
ckpt_models AS (
SELECT model FROM raw WHERE is_checkpoint = true
)
SELECT r.model, r.model_display_name, r.is_checkpoint
FROM raw r
WHERE r.is_checkpoint = true
OR r.model NOT IN (SELECT model FROM ckpt_models)
ORDER BY r.is_checkpoint DESC, r.model_display_name
`;
  ALL_MODELS = await query(sql);

  // Assign colors: config overrides first, then fallback palette
  const configColors = CONFIG.model_colors || {};
  MODEL_COLORS = {};
  let paletteIdx = 0;
  for (const { model_display_name: name } of ALL_MODELS) {
    const override = configColors[name];
    if (override) {
      MODEL_COLORS[name] = override;
      continue;
    }
    // The palette index advances only when a fallback color is consumed.
    MODEL_COLORS[name] = COLOR_PALETTE[paletteIdx % COLOR_PALETTE.length];
    paletteIdx += 1;
  }
}
// ── Panel class ─────────────────────────────────────────────
class Panel {
constructor(id) {
this.id = id;
this.el = {};
this.collapsed = false;
this._zoomXRange = null;
this._zoomYRange = null;
this.build();
}
build() {
const container = document.getElementById('panels-container');
const panel = document.createElement('div');
panel.className = 'panel';
panel.id = `panel-${this.id}`;
panel.innerHTML = `
<div class="panel-toolbar">
<button class="btn btn-sm" id="ptoggle-${this.id}">Collapse</button>
<button class="btn btn-sm" id="pexport-png-${this.id}">PNG</button>
<button class="btn btn-sm" id="pexport-svg-${this.id}">SVG</button>
<button class="btn btn-sm btn-danger" id="premove-${this.id}">Remove</button>
</div>
<div class="panel-controls" id="pcontrols-${this.id}">
<div class="controls-row">
<div class="control-group">
<label>Eval Suite</label>
<select id="psuite-${this.id}"></select>
</div>
<div class="control-group">
<label>Task</label>
<select id="ptask-${this.id}"></select>
</div>
<div class="control-group">
<label>Metric</label>
<select id="pmetric-${this.id}"></select>
</div>
<div class="control-group">
<label>Smoothing: <span id="psmooth-val-${this.id}">0</span></label>
<input type="range" id="psmooth-${this.id}" min="0" max="0.99" step="0.01" value="0" style="width:120px;vertical-align:middle">
</div>
<div class="control-group">
<label>Chart Type</label>
<select id="pchart-type-${this.id}">
<option value="auto">Auto</option>
<option value="line" selected>Line</option>
<option value="bar">Bar</option>
</select>
</div>
<div class="control-group">
<label>X-Ticks</label>
<select id="pxticks-${this.id}">
<option value="4">4</option>
<option value="6">6</option>
<option value="8" selected>8</option>
<option value="12">12</option>
<option value="16">16</option>
<option value="24">24</option>
</select>
</div>
</div>
<div class="models-section">
<div class="models-header">
<span>Models</span>
<button class="btn btn-sm" id="pmodels-all-${this.id}">All</button>
<button class="btn btn-sm" id="pmodels-none-${this.id}">None</button>
<button class="btn btn-sm" id="pmodels-ckpt-${this.id}">Checkpoints</button>
<button class="btn btn-sm" id="pmodels-base-${this.id}">Baselines</button>
</div>
<div class="checkbox-grid" id="pmodels-${this.id}"></div>
</div>
</div>
<div class="panel-chart-wrapper">
<div class="title-hover-zone" id="ptitle-hover-${this.id}" style="display:none"></div>
<div class="panel-chart" id="pchart-${this.id}"></div>
</div>
<div class="task-stats" id="pstats-${this.id}"></div>
<div class="panel-resize-handle" id="presize-${this.id}"></div>
<div class="panel-width-handle" id="pwidth-${this.id}"></div>
`;
container.appendChild(panel);
// Cache refs
this.el.panel = panel;
this.el.controls = panel.querySelector(`#pcontrols-${this.id}`);
this.el.suite = panel.querySelector(`#psuite-${this.id}`);
this.el.task = panel.querySelector(`#ptask-${this.id}`);
this.el.metric = panel.querySelector(`#pmetric-${this.id}`);
this.el.smooth = panel.querySelector(`#psmooth-${this.id}`);
this.el.chartType = panel.querySelector(`#pchart-type-${this.id}`);
this.el.xTicks = panel.querySelector(`#pxticks-${this.id}`);
this.el.models = panel.querySelector(`#pmodels-${this.id}`);
this.el.chart = panel.querySelector(`#pchart-${this.id}`);
this.el.titleHover = panel.querySelector(`#ptitle-hover-${this.id}`);
this.el.stats = panel.querySelector(`#pstats-${this.id}`);
this.el.resize = panel.querySelector(`#presize-${this.id}`);
this.el.widthHandle = panel.querySelector(`#pwidth-${this.id}`);
this.chartHeight = null; // null = use default
// Events
panel.querySelector(`#ptoggle-${this.id}`).addEventListener('click', () => this.toggleControls());
panel.querySelector(`#premove-${this.id}`).addEventListener('click', () => this.remove());
panel.querySelector(`#pexport-png-${this.id}`).addEventListener('click', () => this.export('png'));
panel.querySelector(`#pexport-svg-${this.id}`).addEventListener('click', () => this.export('svg'));
this.el.suite.addEventListener('change', () => this.onSuiteChange());
this.el.task.addEventListener('change', () => this.onTaskChange());
this.el.metric.addEventListener('change', () => this.renderChart());
this.el.smooth.addEventListener('input', () => {
panel.querySelector(`#psmooth-val-${this.id}`).textContent = this.el.smooth.value;
this.renderChart();
});
this.el.chartType.addEventListener('change', () => this.renderChart());
this.el.xTicks.addEventListener('change', () => this.renderChart());
panel.querySelector(`#pmodels-all-${this.id}`).addEventListener('click', () => this.setModels(true));
panel.querySelector(`#pmodels-none-${this.id}`).addEventListener('click', () => this.setModels(false));
panel.querySelector(`#pmodels-ckpt-${this.id}`).addEventListener('click', () => this.setModelsByType(true));
panel.querySelector(`#pmodels-base-${this.id}`).addEventListener('click', () => this.setModelsByType(false));
// Resize handle drag
this.el.resize.addEventListener('mousedown', (e) => this.startResize(e));
this.el.widthHandle.addEventListener('mousedown', (e) => this.startWidthResize(e));
this.buildModelCheckboxes();
}
toggleControls() {
this.collapsed = !this.collapsed;
this.el.controls.classList.toggle('collapsed', this.collapsed);
this.el.panel.querySelector(`#ptoggle-${this.id}`).textContent =
this.collapsed ? 'Expand' : 'Collapse';
}
remove() {
this.el.panel.remove();
panels.delete(this.id);
}
buildModelCheckboxes() {
const container = this.el.models;
container.innerHTML = '';
let lastCkpt = null;
for (const m of ALL_MODELS) {
if (lastCkpt !== null && lastCkpt !== m.is_checkpoint) {
const sep = document.createElement('div');
sep.className = 'model-separator';
container.appendChild(sep);
}
lastCkpt = m.is_checkpoint;
const lbl = document.createElement('div');
lbl.className = 'checkbox-item';
const cb = document.createElement('input');
cb.type = 'checkbox';
cb.value = m.model_display_name;
const DEFAULT_MODELS = [
'SmolLM3 3B', 'Olmo 3 7B', 'Olmo 3 32B',
'Apertus 8B', 'Apertus 70B', 'Kimi K2',
'Nemotron 3 Nano 30B-A3B', 'Nemotron 3 Super 120B-A12B',
];
cb.checked = DEFAULT_MODELS.includes(m.model_display_name)
|| /^Qwen3\.5\b/.test(m.model_display_name);
cb.dataset.isCheckpoint = m.is_checkpoint;
cb.addEventListener('change', () => this.renderChart());
const dot = document.createElement('span');
dot.className = 'color-dot';
dot.dataset.model = m.model_display_name;
dot.style.cssText = `display:inline-block;width:9px;height:9px;border-radius:50%;background:${MODEL_COLORS[m.model_display_name]};position:relative`;
const colorInput = document.createElement('input');
colorInput.type = 'color';
colorInput.value = MODEL_COLORS[m.model_display_name];
colorInput.style.cssText = 'position:absolute;top:0;left:0;width:100%;height:100%;opacity:0;cursor:pointer;border:none;padding:0';
colorInput.addEventListener('click', (e) => e.stopPropagation());
colorInput.addEventListener('input', (e) => {
const newColor = e.target.value;
MODEL_COLORS[m.model_display_name] = newColor;
document.querySelectorAll(`.color-dot[data-model="${CSS.escape(m.model_display_name)}"]`).forEach(d => {
d.style.background = newColor;
});
panels.forEach((p) => p.renderChart());
});
dot.appendChild(colorInput);
const name = document.createElement('span');
name.className = 'model-name';
name.dataset.modelName = m.model_display_name;
name.textContent = ' ' + m.model_display_name;
if (!m.is_checkpoint) {
name.style.fontStyle = 'italic';
}
name.addEventListener('mouseenter', (e) => {
const tip = name.dataset.missingTip;
if (!tip) return;
const tooltip = document.getElementById('custom-tooltip');
if (tooltip.classList.contains('scrollable')) return;
tooltip.innerHTML = tip;
tooltip.style.display = 'block';
tooltip._modelTip = true;
const rect = name.getBoundingClientRect();
tooltip.style.left = (rect.left) + 'px';
tooltip.style.top = (rect.bottom + 4) + 'px';
});
name.addEventListener('mouseleave', () => {
const tooltip = document.getElementById('custom-tooltip');
if (tooltip._modelTip) {
tooltip.style.display = 'none';
tooltip._modelTip = false;
}
});
// Click on name or container (but not dot) toggles checkbox
name.addEventListener('click', () => { cb.checked = !cb.checked; cb.dispatchEvent(new Event('change')); });
lbl.addEventListener('click', (e) => {
if (e.target === lbl) { cb.checked = !cb.checked; cb.dispatchEvent(new Event('change')); }
});
lbl.append(cb, dot, name);
container.appendChild(lbl);
}
}
setModels(checked) {
this.el.models.querySelectorAll('input').forEach(cb => cb.checked = checked);
this.renderChart();
}
setModelsByType(isCheckpoint) {
this.el.models.querySelectorAll('input').forEach(cb => {
cb.checked = (cb.dataset.isCheckpoint === String(isCheckpoint));
});
this.renderChart();
}
getSelectedModels() {
return Array.from(this.el.models.querySelectorAll('input:checked')).map(cb => cb.value);
}
getSmoothing() {
return parseFloat(this.el.smooth.value) || 0;
}
getChartType() {
return this.el.chartType.value;
}
getMaxXTicks() {
return parseInt(this.el.xTicks.value, 10) || 8;
}
getSelectedTask() {
return this.el.task.value;
}
// ── Populate cascades ──────────────────────────────────────
async populateSuites(defaults) {
const rows = await query(`
SELECT DISTINCT task AS value, task_display_name AS label
FROM scores
WHERE task_type = 'eval_suite' AND task != 'test_fix'
ORDER BY task
`);
populateSelect(this.el.suite, rows, defaults?.suite);
await this.onSuiteChange(defaults);
}
async onSuiteChange(defaults) {
const suite = this.el.suite.value;
if (!suite) return;
// Parse subtask_tree JSON to find task groups and leaf benchmarks
// Merge all distinct trees since different models may have different subsets
const stRows = await query(`
SELECT DISTINCT subtask_tree FROM scores
WHERE task = '${esc(suite)}' AND subtask_tree IS NOT NULL
`);
let groupTasks = [];
let leafTasks = [];
if (stRows.length > 0) {
const merged = {};
for (const row of stRows) {
if (!row.subtask_tree) continue;
const tree = JSON.parse(row.subtask_tree);
for (const [key, children] of Object.entries(tree)) {
if (!merged[key]) merged[key] = new Set();
for (const c of children) merged[key].add(c);
}
}
const allChildren = new Set();
for (const children of Object.values(merged)) {
for (const c of children) allChildren.add(c);
}
// Groups are keys in the tree (they have children)
groupTasks = Object.keys(merged).sort();
// Leaves are children that don't appear as keys
leafTasks = [...allChildren].filter(t => !merged[t]).sort();
}
// Fetch display names for groups and leaves
const allTasks = [...groupTasks, ...leafTasks];
let rows = [];
if (allTasks.length > 0) {
rows = await query(`
SELECT DISTINCT task AS value, task_display_name AS label, task_type
FROM scores
WHERE task IN (${sqlIn(allTasks)})
ORDER BY task
`);
}
const groupRows = rows.filter(r => r.task_type === 'task_group');
const leafRows = rows.filter(r => r.task_type === 'benchmark');
const options = [
{ value: suite, label: `${suite} (eval suite)` },
...groupRows.map(r => ({ value: r.value, label: `${r.label} (group)` })),
...leafRows,
];
populateSelect(this.el.task, options, defaults?.task);
await this.onTaskChange(defaults);
}
async onTaskChange(defaults) {
const task = this.getSelectedTask();
if (!task) return;
const rows = await query(`
SELECT DISTINCT metric FROM scores WHERE task = '${esc(task)}' ORDER BY metric
`);
const prev = defaults?.metric || this.el.metric.value;
populateSelect(this.el.metric, rows.map(r => r.metric), prev);
if (defaults?.chartType) this.el.chartType.value = defaults.chartType;
await this.renderChart();
}
async updateMissingModels(task, metric) {
const nameEls = this.el.models.querySelectorAll('.model-name');
if (!task || !metric) {
nameEls.forEach(el => {
el.classList.remove('missing');
delete el.dataset.missingTip;
});
return;
}
const available = await query(`
SELECT DISTINCT model_display_name FROM scores
WHERE task = '${esc(task)}' AND metric = '${esc(metric)}'
`);
const availableSet = new Set(available.map(r => r.model_display_name));
nameEls.forEach(el => {
const modelName = el.dataset.modelName;
if (!availableSet.has(modelName)) {
el.classList.add('missing');
el.dataset.missingTip = `No scores for "${modelName}" on this task / metric`;
} else {
el.classList.remove('missing');
delete el.dataset.missingTip;
}
});
}
// ── Chart rendering ────────────────────────────────────────
async renderChart() {
const task = this.getSelectedTask();
const metric = this.el.metric.value;
const models = this.getSelectedModels();
await this.updateMissingModels(task, metric);
if (!task || !metric || models.length === 0) {
Plotly.react(this.el.chart, [], {
title: { text: '', font: { size: 14 } },
xaxis: { visible: false },
yaxis: { visible: false },
height: this.getChartHeight(600),
plot_bgcolor: '#fff', paper_bgcolor: '#fff',
});
this.el.stats.textContent = '';
return;
}
const rows = await query(`
SELECT model, model_display_name, tokens_trained, score, score_stderr,
is_checkpoint, higher_is_better, step
FROM scores
WHERE task = '${esc(task)}'
AND metric = '${esc(metric)}'
AND model_display_name IN (${sqlIn(models)})
ORDER BY model_display_name, tokens_trained
`);
if (rows.length === 0) {
this.el.chart.innerHTML = '<div class="loading">No data for this selection</div>';
return;
}
// Merge final checkpoints (step=null) into matching checkpoint series
const mergedRows = this.mergeFinalCheckpoints(rows);
// Determine chart type
const chartType = this.resolveChartType(mergedRows);
const higherIsBetter = mergedRows[0]?.higher_is_better;
// Fetch subtask tree JSON from the data (merge all variants)
let subtaskTree = null;
try {
const stRows = await query(`
SELECT DISTINCT subtask_tree FROM scores
WHERE task = '${esc(task)}' AND metric = '${esc(metric)}'
AND subtask_tree IS NOT NULL
`);
if (stRows.length > 0) {
const merged = {};
for (const row of stRows) {
if (!row.subtask_tree) continue;
const tree = JSON.parse(row.subtask_tree);
for (const [key, children] of Object.entries(tree)) {
if (!merged[key]) merged[key] = new Set();
for (const c of children) merged[key].add(c);
}
}
// Convert Sets back to arrays for downstream use
for (const key of Object.keys(merged)) {
merged[key] = [...merged[key]];
}
subtaskTree = merged;
}
} catch (e) {
// ignore
}
if (chartType === 'bar') {
this.drawBarChart(mergedRows, task, metric, higherIsBetter, subtaskTree);
} else {
this.drawLineChart(mergedRows, task, metric, higherIsBetter, subtaskTree);
}
}
mergeFinalCheckpoints(rows) {
// Final checkpoints have step=null. If a matching checkpoint series
// exists (same `model` id), append the final checkpoint to that series.
const regular = [];
const finals = [];
for (const r of rows) {
if (r.step === null || r.step === undefined) {
finals.push(r);
} else {
regular.push(r);
}
}
if (finals.length === 0) return rows;
// Map model id -> series model_display_name for checkpoint series
const modelToSeries = {};
for (const r of regular) {
if (r.is_checkpoint) {
modelToSeries[r.model] = r.model_display_name;
}
}
const result = [...regular];
for (const fc of finals) {
const seriesName = modelToSeries[fc.model];
if (seriesName) {
// Append to matching checkpoint series
result.push({ ...fc, model_display_name: seriesName, is_checkpoint: true });
} else {
// No matching series, keep as-is
result.push(fc);
}
}
return result;
}
resolveChartType(rows) {
const pref = this.getChartType();
if (pref !== 'auto') return pref;
// Auto-detect: if every model has <= 1 unique tokens_trained, use bar
const byModel = {};
for (const r of rows) {
if (!byModel[r.model_display_name]) byModel[r.model_display_name] = new Set();
if (r.tokens_trained != null) byModel[r.model_display_name].add(Number(r.tokens_trained));
}
const allSingle = Object.values(byModel).every(s => s.size <= 1);
return allSingle ? 'bar' : 'line';
}
formatChartTitle(task, metric, higherIsBetter) {
const arrow = higherIsBetter === true ? ' \u2191' : higherIsBetter === false ? ' \u2193' : '';
return `${task} \u2014 ${metric}${arrow}`;
}
renderSubtaskTree(map, keys, depth = 0) {
if (!keys || keys.length === 0) return '';
const indent = depth * 16;
return keys.map(key => {
const children = map[key];
let html = `<div style="padding-left:${indent}px">${key}</div>`;
if (children) {
html += this.renderSubtaskTree(map, children, depth + 1);
}
return html;
}).join('');
}
// Install the clickable "i" icon next to the chart title that opens a popup
// listing the subtask tree.
//
// `subtaskTree` is an object mapping a task name to an array of child names.
// When it is missing, not an object, or empty, the hover zone is hidden and
// nothing else is installed.
//
// The popup reuses the page-wide `#custom-tooltip` element. While open it
// carries the `scrollable` class and `_panelId` set to this panel's id.
// The three handlers are stored on `this` (`_titleClick`, `_titleScroll`,
// `_titleOutsideClick`) so that cleanupTooltip() can remove them again.
setupTitleTooltip(subtaskTree) {
const hoverZone = this.el.titleHover;
hoverZone.innerHTML = '';
if (!subtaskTree || typeof subtaskTree !== 'object' || Object.keys(subtaskTree).length === 0) {
hoverZone.style.display = 'none';
return;
}
hoverZone.style.display = '';
// Place the icon just past the right edge of the title text
const icon = document.createElement('span');
icon.className = 'title-info-icon';
icon.textContent = 'i';
hoverZone.appendChild(icon);
// `.gtitle` is looked up inside the chart element after it has been drawn
const titleEl = this.el.chart.querySelector('.gtitle');
if (titleEl) {
const wrapperRect = this.el.chart.closest('.panel-chart-wrapper').getBoundingClientRect();
const titleRect = titleEl.getBoundingClientRect();
icon.style.left = (titleRect.right - wrapperRect.left - 50 + 6) + 'px'; // 50 = hover zone left offset, 6 = gap
} else {
// No title element found: pin the icon to the right edge of the hover zone
icon.style.right = '0px';
}
const tooltip = document.getElementById('custom-tooltip');
// Find true roots: keys that never appear as a child value
const allChildren = new Set(Object.values(subtaskTree).flat());
const rootKeys = Object.keys(subtaskTree).filter(k => !allChildren.has(k));
// Popup markup is built once here and reused on every open
const html = this.renderSubtaskTree(subtaskTree, rootKeys);
// Centre the popup horizontally under the title (or under the chart's top
// area when there is no title element) and cap its height at the chart's
// bottom edge. Coordinates are viewport-relative (getBoundingClientRect).
const positionTooltip = () => {
const titleEl = this.el.chart.querySelector('.gtitle');
const chartRect = this.el.chart.getBoundingClientRect();
const tw = tooltip.offsetWidth;
let tipTop;
if (titleEl) {
const titleRect = titleEl.getBoundingClientRect();
const titleCenter = (titleRect.left + titleRect.right) / 2;
tooltip.style.left = (titleCenter - tw / 2) + 'px';
tipTop = titleRect.bottom + 4;
} else {
tooltip.style.left = (chartRect.left + chartRect.width / 2 - tw / 2) + 'px';
tipTop = chartRect.top + 40;
}
tooltip.style.top = tipTop + 'px';
tooltip.style.maxHeight = Math.max(0, chartRect.bottom - tipTop) + 'px';
};
// Click on the hover zone toggles the popup. The event argument is unused.
this._titleClick = (e) => {
// Toggle: if already visible for this panel, hide it
if (tooltip.style.display === 'block' && tooltip._panelId === this.id) {
tooltip.style.display = 'none';
tooltip.classList.remove('scrollable');
tooltip._panelId = null;
window.removeEventListener('scroll', this._titleScroll, true);
return;
}
tooltip.innerHTML = html;
tooltip.classList.add('scrollable');
tooltip.style.display = 'block';
tooltip._panelId = this.id;
// Position after the popup is displayed so offsetWidth reflects its content
positionTooltip();
// Capture phase (third argument true), matching the removeEventListener calls
window.addEventListener('scroll', this._titleScroll, true);
};
// Keep the open popup aligned with the title while scrolling.
// Assigned after _titleClick, but only read when a click actually happens.
this._titleScroll = () => {
if (tooltip.style.display === 'block' && tooltip._panelId === this.id) {
positionTooltip();
}
};
// A mousedown outside both the popup and the hover zone closes this panel's popup.
this._titleOutsideClick = (e) => {
if (tooltip._panelId !== this.id) return;
if (tooltip.contains(e.target) || hoverZone.contains(e.target)) return;
tooltip.style.display = 'none';
tooltip.classList.remove('scrollable');
tooltip._panelId = null;
window.removeEventListener('scroll', this._titleScroll, true);
};
hoverZone.addEventListener('click', this._titleClick);
// Registered on `document`: must be removed via cleanupTooltip() before this
// method runs again for the same panel.
document.addEventListener('mousedown', this._titleOutsideClick);
}
startResize(e) {
e.preventDefault();
const startY = e.clientY;
const startH = this.el.chart.offsetHeight;
this.el.resize.classList.add('active');
const onMove = (ev) => {
const delta = ev.clientY - startY;
const newH = Math.max(200, startH + delta);
this.chartHeight = newH;
Plotly.relayout(this.el.chart, { height: newH });
};
const onUp = () => {
this.el.resize.classList.remove('active');
document.removeEventListener('mousemove', onMove);
document.removeEventListener('mouseup', onUp);
};
document.addEventListener('mousemove', onMove);
document.addEventListener('mouseup', onUp);
}
startWidthResize(e) {
e.preventDefault();
const startX = e.clientX;
const startW = this.el.panel.offsetWidth;
this.el.widthHandle.classList.add('active');
const chart = this.el.chart;
// Freeze chart height so Plotly's responsive ResizeObserver
// cannot collapse it while the panel width is being dragged.
const lockedH = chart.offsetHeight;
chart.style.minHeight = lockedH + 'px';
const onMove = (ev) => {
const newW = Math.max(300, startW + ev.clientX - startX);
this.el.panel.style.width = newW + 'px';
};
const onUp = () => {
this.el.widthHandle.classList.remove('active');
document.removeEventListener('mousemove', onMove);
document.removeEventListener('mouseup', onUp);
chart.style.minHeight = '';
Plotly.purge(chart);
this.renderChart();
};
document.addEventListener('mousemove', onMove);
document.addEventListener('mouseup', onUp);
}
getChartHeight(fallback) {
return this.chartHeight || fallback;
}
cleanupTooltip() {
const tooltip = document.getElementById('custom-tooltip');
tooltip.style.display = 'none';
const chart = this.el.chart;
chart.removeAllListeners?.('plotly_hover');
chart.removeAllListeners?.('plotly_unhover');
if (this._tooltipMouseMove) {
chart.removeEventListener('mousemove', this._tooltipMouseMove);
this._tooltipMouseMove = null;
}
if (this._tooltipMouseLeave) {
chart.removeEventListener('mouseleave', this._tooltipMouseLeave);
this._tooltipMouseLeave = null;
}
// Clean up title click popup
if (this._titleClick) {
const hz = this.el.titleHover;
hz.removeEventListener('click', this._titleClick);
hz.style.display = 'none';
this._titleClick = null;
}
if (this._titleScroll) {
window.removeEventListener('scroll', this._titleScroll, true);
this._titleScroll = null;
}
if (this._titleOutsideClick) {
document.removeEventListener('mousedown', this._titleOutsideClick);
this._titleOutsideClick = null;
}
if (tooltip._panelId === this.id) {
tooltip.classList.remove('scrollable');
tooltip._panelId = null;
}
}
// Draw the selected rows as a line chart of score over tokens trained.
//
//   rows           - score rows (fields used here: model_display_name,
//                    is_checkpoint, tokens_trained, score)
//   task, metric   - used for the chart title
//   higherIsBetter - title arrow; also passed to computeTaskQualityMetrics
//   subtasks       - subtask tree handed to setupTitleTooltip
//
// Checkpoint series with more than one point that has a token count are drawn
// as smoothed lines with markers. Every other series is drawn as a dashed
// horizontal line at the score of its first point.
// A zoom range saved in `this._zoomXRange` / `this._zoomYRange` is re-applied,
// so the view survives re-renders until the user resets it.
drawLineChart(rows, task, metric, higherIsBetter, subtasks) {
this.cleanupTooltip();
// Smoothing parameter passed to exponentialMovingAverage (helper defined elsewhere)
const w = this.getSmoothing();
// Group by model
const byModel = {};
for (const r of rows) {
const name = r.model_display_name;
// isCheckpoint is taken from the first row seen for each display name
if (!byModel[name]) byModel[name] = { points: [], isCheckpoint: r.is_checkpoint };
byModel[name].points.push({ x: r.tokens_trained != null ? Number(r.tokens_trained) : null, y: r.score });
}
// Sort each series by x; points without a token count sort first
for (const d of Object.values(byModel)) d.points.sort((a, b) => (a.x ?? -Infinity) - (b.x ?? -Infinity));
// X range for baselines (only checkpoint series contribute)
let xMin = Infinity, xMax = -Infinity;
for (const d of Object.values(byModel)) {
if (d.isCheckpoint) {
for (const p of d.points) {
if (p.x != null) {
xMin = Math.min(xMin, p.x);
xMax = Math.max(xMax, p.x);
}
}
}
}
// No checkpoint point with a token count: fall back to the range [0, 1]
if (!isFinite(xMin)) { xMin = 0; xMax = 1; }
const traces = [];
for (const [name, d] of Object.entries(byModel)) {
const color = MODEL_COLORS[name] || '#999';
const validPoints = d.points.filter(p => p.x != null);
if (d.isCheckpoint && validPoints.length > 1) {
traces.push({
x: validPoints.map(p => p.x),
y: exponentialMovingAverage(validPoints.map(p => p.y), w),
name, mode: 'lines+markers',
line: { color, width: 2 }, marker: { size: 5 },
});
} else {
// Baseline: a constant line at the first point's score (after the sort above)
const score = d.points[0]?.y;
if (score != null) {
// Use many points so the tooltip is accessible along the whole line
const nPts = 50;
const xs = Array.from({ length: nPts }, (_, i) => xMin + (xMax - xMin) * i / (nPts - 1));
const ys = xs.map(() => score);
traces.push({
x: xs, y: ys,
name, mode: 'lines',
line: { color, width: 2, dash: 'dash' },
hoverinfo: 'name+y',
});
}
}
}
// Compute tick values from the data range (niceTicks is defined elsewhere)
const maxXTicks = this.getMaxXTicks();
const tickVals = niceTicks(xMin, xMax, maxXTicks);
Plotly.react(this.el.chart, traces, {
title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } },
hoverlabel: { namelength: -1 },
xaxis: {
title: { text: 'Tokens Trained', font: { size: 12 } },
// With a saved zoom, ticks are computed for the zoomed range instead of the data range
tickfont: { size: 10 }, tickvals: this._zoomXRange ? niceTicks(this._zoomXRange[0], this._zoomXRange[1], maxXTicks) : tickVals,
ticktext: formatTokensArray(this._zoomXRange ? niceTicks(this._zoomXRange[0], this._zoomXRange[1], maxXTicks) : tickVals),
gridcolor: '#e9ecef', zeroline: false,
...(this._zoomXRange ? { range: [...this._zoomXRange], autorange: false } : {}),
},
yaxis: {
title: { text: 'Score', font: { size: 12 } },
tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false,
...(this._zoomYRange ? { range: [...this._zoomYRange], autorange: false } : { autorange: true }),
},
legend: { orientation: 'h', yanchor: 'top', y: -0.15, x: 0, font: { size: 11 } },
margin: { t: 50, r: 20, b: 100, l: 50 },
plot_bgcolor: '#fff', paper_bgcolor: '#fff',
font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' },
height: this.getChartHeight(600),
}, { responsive: true });
// Recompute ticks on zoom/pan so they adapt to the visible range
this.el.chart.removeAllListeners?.('plotly_relayout');
// Guards against re-entry: the relayout issued below emits plotly_relayout again
let updatingTicks = false;
this.el.chart.on('plotly_relayout', (evt) => {
if (updatingTicks) return;
let tv;
const r0 = evt['xaxis.range[0]'];
const r1 = evt['xaxis.range[1]'];
if (r0 != null && r1 != null) {
// User zoomed/panned — save the range
this._zoomXRange = [r0, r1];
const yr0 = evt['yaxis.range[0]'];
const yr1 = evt['yaxis.range[1]'];
if (yr0 != null && yr1 != null) this._zoomYRange = [yr0, yr1];
tv = niceTicks(r0, r1, maxXTicks);
} else if (evt['xaxis.autorange']) {
// User double-clicked to reset — clear saved zoom
this._zoomXRange = null;
this._zoomYRange = null;
tv = niceTicks(xMin, xMax, maxXTicks);
}
if (tv) {
updatingTicks = true;
Plotly.relayout(this.el.chart, {
'xaxis.tickvals': tv,
'xaxis.ticktext': formatTokensArray(tv),
}).then(() => { updatingTicks = false; });
}
});
// Compute and display task quality metrics (both helpers are defined elsewhere)
const metrics = computeTaskQualityMetrics(byModel, higherIsBetter);
renderTaskStats(this.el.stats, metrics);
this.setupTitleTooltip(subtasks);
}
drawBarChart(rows, task, metric, higherIsBetter, subtasks) {
this.cleanupTooltip();
this.el.stats.innerHTML = '';
// For bar chart, use latest checkpoint per model
const byModel = {};
for (const r of rows) {
const name = r.model_display_name;
const tokens = r.tokens_trained != null ? Number(r.tokens_trained) : null;
if (!byModel[name] || (tokens != null && (byModel[name].tokens == null || tokens > byModel[name].tokens))) {
byModel[name] = { score: r.score, tokens, isCheckpoint: r.is_checkpoint };
}
}
// Sort by score
const sorted = Object.entries(byModel)
.sort((a, b) => higherIsBetter !== false ? b[1].score - a[1].score : a[1].score - b[1].score);
const names = sorted.map(([n]) => n);
const scores = sorted.map(([, d]) => d.score);
const colors = sorted.map(([n]) => MODEL_COLORS[n] || '#999');
const tokens = sorted.map(([, d]) => formatTokens(d.tokens));
const hovertext = sorted.map(([n, d]) =>
`${n}<br>Score: ${d.score.toFixed(4)}<br>Tokens: ${formatTokens(d.tokens)}`
);
// Annotations for tokens trained at the start of each bar
const annotations = names.map((name, i) => ({
x: 0,
y: name,
text: tokens[i],
hovertext: 'Tokens Trained',
xanchor: 'left',
yanchor: 'middle',
showarrow: false,
font: { size: 10, color: '#000' },
xshift: 4,
}));
Plotly.react(this.el.chart, [{
type: 'bar',
orientation: 'h',
y: names,
x: scores,
marker: { color: colors },
text: scores.map(s => s.toFixed(4)),
textposition: 'outside',
textfont: { size: 11 },
hoverinfo: 'none',
customdata: hovertext,
}], {
title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } },
hovermode: 'closest',
annotations,
xaxis: {
title: { text: 'Score', font: { size: 12 } },
tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false,
...(this._zoomXRange ? { range: [...this._zoomXRange], autorange: false } : {}),
},
yaxis: {
tickfont: { size: 11 }, automargin: true,
categoryorder: 'array', categoryarray: names.slice().reverse(),
},
margin: { t: 60, r: 80, b: 60, l: 10 },
plot_bgcolor: '#fff', paper_bgcolor: '#fff',
font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' },
height: this.getChartHeight(Math.max(400, names.length * 40 + 100)),
showlegend: false,
}, { responsive: true });
// Custom cursor-following tooltip
const tooltip = document.getElementById('custom-tooltip');
const chart = this.el.chart;
chart.on('plotly_hover', (data) => {
if (tooltip.classList.contains('scrollable')) return;
const pt = data.points[0];
tooltip.innerHTML = pt.customdata;
tooltip.style.display = 'block';
});
chart.on('plotly_unhover', () => {
if (tooltip.classList.contains('scrollable')) return;
tooltip.style.display = 'none';
});
this._tooltipMouseMove = (e) => {
if (tooltip.classList.contains('scrollable')) return;
if (tooltip.style.display === 'block') {
tooltip.style.left = (e.clientX + 12) + 'px';
tooltip.style.top = (e.clientY - 10) + 'px';
}
};
this._tooltipMouseLeave = () => {
if (tooltip.classList.contains('scrollable')) return;
tooltip.style.display = 'none';
};
chart.addEventListener('mousemove', this._tooltipMouseMove);
chart.addEventListener('mouseleave', this._tooltipMouseLeave);
this.setupTitleTooltip(subtasks);
}
export(format) {
const task = this.getSelectedTask();
const metric = this.el.metric.value;
let filename = `${task}_${metric}`.replace(/[^a-zA-Z0-9_-]/g, '_');
Plotly.downloadImage(this.el.chart, { format, scale: 3, filename });
}
}
// ── Panel management ────────────────────────────────────────
// Create a panel with the next free id, register it in `panels`, run its
// suite/task/metric cascade (optionally seeded with `defaults`) and return it.
async function addPanel(defaults) {
  const id = panelCounter++;
  const created = new Panel(id);
  panels.set(id, created);
  await created.populateSuites(defaults);
  return created;
}
// ── Merge dataset helpers ───────────────────────────────────
// Turn user input into the URL of a parquet file.
//
//   "http(s)://..."         -> returned unchanged (after trimming)
//   "org/dataset"           -> that HF dataset's `scores.parquet` on `main`
//   "org/dataset/some/path" -> that file inside the HF dataset on `main`
//   anything else           -> returned unchanged (after trimming)
//
// Empty path segments are ignored, so a trailing or leading slash
// ("org/dataset/") still resolves to the dataset's default file instead of a
// URL with an empty file path. The scheme check is case-insensitive.
function resolveParquetUrl(input) {
  input = input.trim();
  if (/^https?:\/\//i.test(input)) return input;
  const parts = input.split('/').filter(Boolean);
  if (parts.length === 2) {
    return `https://huggingface.co/datasets/${parts[0]}/${parts[1]}/resolve/main/scores.parquet`;
  }
  if (parts.length >= 3) {
    const org = parts[0], dataset = parts[1], filePath = parts.slice(2).join('/');
    return `https://huggingface.co/datasets/${org}/${dataset}/resolve/main/${filePath}`;
  }
  return input;
}
// Download a parquet file and append it to the `scores` view.
//
//   input - a URL, "org/dataset", or "org/dataset/path" (see resolveParquetUrl)
//
// On success the dataset is recorded in `mergedDatasets`, persisted to
// localStorage, and all panels are refreshed. Progress and errors are shown
// in `#merge-status`. Any failure is re-thrown so callers can react to it.
//
// Safeguards:
//   - The Hugging Face token is only attached to HTTPS requests for
//     huggingface.co (or a subdomain), so typing an arbitrary URL cannot leak
//     the token to another host.
//   - A dataset whose resolved URL is already merged is rejected; merging it
//     twice would duplicate every one of its rows in the view.
async function mergeDataset(input) {
  const statusEl = document.getElementById('merge-status');
  statusEl.className = 'merge-status';
  statusEl.textContent = 'Loading...';
  try {
    const url = resolveParquetUrl(input);
    if (mergedDatasets.some(d => d.url === url)) {
      throw new Error('This dataset is already merged.');
    }
    const id = mergeCounter++;
    const bufferName = `merged_${id}.parquet`;

    const fetchOpts = {};
    const token = hfAccessToken || document.getElementById('hf-token-input').value.trim();
    if (token) {
      // Resolve against the page URL so relative inputs parse as well.
      const { protocol, hostname } = new URL(url, window.location.href);
      const isHfHost = protocol === 'https:'
        && (hostname === 'huggingface.co' || hostname.endsWith('.huggingface.co'));
      if (isHfHost) {
        fetchOpts.headers = { 'Authorization': `Bearer ${token}` };
      }
    }

    const response = await fetch(url, fetchOpts);
    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    const buffer = new Uint8Array(await response.arrayBuffer());
    await db.registerFileBuffer(bufferName, buffer);

    // Rebuild the view as: base file + every dataset merged so far + the new one.
    const viewSql = [
      'CREATE OR REPLACE VIEW scores AS',
      "SELECT * FROM 'scores.parquet'",
      ...mergedDatasets.map(d => `UNION ALL SELECT * FROM '${d.bufferName}'`),
      `UNION ALL SELECT * FROM '${bufferName}'`,
    ].join('\n');
    await conn.query(viewSql);

    const label = input.trim().replace(/^https:\/\/huggingface\.co\/datasets\//, '');
    mergedDatasets.push({ id, label, url, bufferName, input: input.trim() });
    renderMergedTags();
    saveMergedToStorage();
    await refreshAfterMerge();
    statusEl.textContent = 'Merged successfully.';
    setTimeout(() => { statusEl.textContent = ''; }, 3000);
    document.getElementById('merge-dataset-input').value = '';
  } catch (err) {
    statusEl.className = 'merge-status error';
    statusEl.textContent = `Error: ${err.message}`;
    console.error('Merge failed:', err);
    throw err; // re-throw so callers (e.g. restore) can handle
  }
}
// Drop the merged dataset with the given id: forget it, rebuild the `scores`
// view from the base file plus the remaining merged files, then update the
// tag list, localStorage and all panels.
async function removeMergedDataset(id) {
  mergedDatasets = mergedDatasets.filter((entry) => entry.id !== id);

  const remainingUnions = mergedDatasets
    .map((entry) => `UNION ALL SELECT * FROM '${entry.bufferName}'`)
    .join('\n');
  const viewSql = [
    'CREATE OR REPLACE VIEW scores AS',
    "SELECT * FROM 'scores.parquet'",
    remainingUnions,
    '',
  ].join('\n');
  await conn.query(viewSql);

  renderMergedTags();
  saveMergedToStorage();
  await refreshAfterMerge();
}
// Persist the user's original inputs for all merged datasets under the
// localStorage key 'mergedDatasets'. Failures are ignored on purpose
// (storage may be full or unavailable).
function saveMergedToStorage() {
  try {
    const serialized = JSON.stringify(mergedDatasets.map(({ input }) => input));
    localStorage.setItem('mergedDatasets', serialized);
  } catch {
    /* storage full or unavailable */
  }
}
// Rebuild the list of merged-dataset tags in `#merged-tags`: one tag per
// dataset, showing its label and a "×" button that removes the dataset.
function renderMergedTags() {
  const container = document.getElementById('merged-tags');
  container.textContent = '';

  mergedDatasets.forEach((dataset) => {
    const labelSpan = document.createElement('span');
    labelSpan.textContent = dataset.label;

    const removeBtn = document.createElement('button');
    removeBtn.textContent = '\u00d7';
    removeBtn.title = 'Remove merged dataset';
    removeBtn.addEventListener('click', () => removeMergedDataset(dataset.id));

    const tag = document.createElement('span');
    tag.className = 'merged-tag';
    tag.append(labelSpan, removeBtn);
    container.appendChild(tag);
  });
}
// Reload the model list after the `scores` view changed and bring every panel
// up to date while keeping its current selections.
//
// For each panel the suite, task, metric and chart type are captured and
// passed back into the cascade. Model checkboxes are rebuilt; every model
// that existed before keeps its previous state, whether ticked or unticked.
// Only models that are new to the list get the default state chosen by
// buildModelCheckboxes. (Previously only ticked states were restored, so a
// default model the user had unticked was silently re-ticked.)
async function refreshAfterMerge() {
  await loadModels();
  const checkboxSelector = 'input[type="checkbox"]';
  for (const [, panel] of panels) {
    // Snapshot before the rebuild: model name -> was it ticked?
    const previousState = new Map();
    panel.el.models.querySelectorAll(checkboxSelector).forEach(cb => {
      previousState.set(cb.value, cb.checked);
    });
    const suite = panel.el.suite.value;
    const task = panel.el.task.value;
    const metric = panel.el.metric.value;
    const chartType = panel.el.chartType.value;

    panel.buildModelCheckboxes();
    panel.el.models.querySelectorAll(checkboxSelector).forEach(cb => {
      if (previousState.has(cb.value)) {
        cb.checked = previousState.get(cb.value);
      }
    });
    await panel.populateSuites({ suite, task, metric, chartType });
  }
}
// ── HF Auth ───────────────────────────────────────────────
// Show or hide the Hugging Face auth controls.
//
// Without `window.huggingface.variables.OAUTH_CLIENT_ID` the whole auth row is
// hidden. Otherwise the row is shown and, depending on whether `oauthResult`
// is truthy, either the signed-in view (user label + sign-out button) or the
// signed-out view (sign-in button + private-repos label) is displayed.
// The token input is hidden in both views.
function updateAuthUI(oauthResult) {
  const byId = (id) => document.getElementById(id);
  const setVisible = (el, visible) => { el.style.display = visible ? '' : 'none'; };

  const authRow = byId('hf-auth-row');
  const signinBtn = byId('hf-signin-btn');
  const signoutBtn = byId('hf-signout-btn');
  const userEl = byId('hf-user');
  const tokenInput = byId('hf-token-input');

  const oauthConfigured = Boolean(window.huggingface?.variables?.OAUTH_CLIENT_ID);
  if (!oauthConfigured) {
    // Not on HF Spaces — hide auth entirely
    setVisible(authRow, false);
    return;
  }

  const privateLabel = byId('hf-private-label');
  setVisible(authRow, true);

  const signedIn = Boolean(oauthResult);
  if (signedIn) {
    userEl.textContent = `Signed in as ${oauthResult.userInfo?.name || oauthResult.userInfo?.preferred_username || 'HF user'}`;
  }
  setVisible(signinBtn, !signedIn);
  setVisible(privateLabel, !signedIn);
  setVisible(tokenInput, false);
  setVisible(userEl, signedIn);
  setVisible(signoutBtn, signedIn);
}
// Handle of the pending auto-sign-out timer, or null when none is scheduled.
let _hfExpiryTimer = null;

// Sign out of Hugging Face: forget the stored OAuth result and the in-memory
// token, cancel a pending auto-sign-out timer, and switch the UI to the
// signed-out view.
function hfSignOut() {
  localStorage.removeItem('hf_oauth');
  hfAccessToken = null;

  const pendingTimer = _hfExpiryTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    _hfExpiryTimer = null;
  }

  updateAuthUI(null);
}
// Safety margin: a token counts as expired this long before its real expiry.
const HF_EXPIRY_BUFFER_MS = 5 * 60_000; // 5 minutes

// True when the OAuth result's `accessTokenExpiresAt` lies less than
// HF_EXPIRY_BUFFER_MS in the future (or in the past). A result without an
// expiry time is never considered expired.
function isHfTokenExpired(oauthResult) {
  const expiresAt = oauthResult?.accessTokenExpiresAt;
  if (!expiresAt) return false;
  const cutoff = new Date(expiresAt).getTime() - HF_EXPIRY_BUFFER_MS;
  return cutoff < Date.now();
}
// Schedule an automatic sign-out HF_EXPIRY_BUFFER_MS before the token in
// `oauthResult` expires. Any previously scheduled sign-out is cancelled.
// Signs out immediately when that moment has already passed, and does nothing
// when the result carries no expiry time.
//
// Timer delays are limited to a signed 32-bit number of milliseconds (about
// 24.8 days); a larger delay would fire almost immediately. For expiries
// further away the timer is armed for the maximum and this function
// re-schedules itself when it fires.
function scheduleHfExpiry(oauthResult) {
  if (_hfExpiryTimer) clearTimeout(_hfExpiryTimer);
  _hfExpiryTimer = null;
  if (!oauthResult?.accessTokenExpiresAt) return;
  const expiresAt = new Date(oauthResult.accessTokenExpiresAt).getTime();
  const ms = expiresAt - HF_EXPIRY_BUFFER_MS - Date.now();
  if (ms <= 0) {
    hfSignOut();
    return;
  }
  console.log(`HF token expires at ${new Date(expiresAt).toISOString()}, auto-signout in ${Math.round(ms / 60_000)}m`);
  const MAX_TIMEOUT_MS = 2 ** 31 - 1;
  if (ms > MAX_TIMEOUT_MS) {
    _hfExpiryTimer = setTimeout(() => scheduleHfExpiry(oauthResult), MAX_TIMEOUT_MS);
    return;
  }
  _hfExpiryTimer = setTimeout(() => {
    console.warn('HF OAuth token expired, signing out automatically.');
    hfSignOut();
  }, ms);
}
// Set up Hugging Face OAuth: pick up a stored or freshly redirected OAuth
// result, discard it if expired, activate and persist it otherwise, update
// the auth UI, and wire the sign-in / sign-out buttons.
async function initHfAuth() {
  // Previously stored result, if any (unparseable JSON counts as none)
  const readStoredResult = () => {
    const stored = localStorage.getItem('hf_oauth');
    if (!stored) return null;
    try {
      return JSON.parse(stored);
    } catch {
      return null;
    }
  };
  const storedResult = readStoredResult();

  // A result from an OAuth redirect takes priority over the stored one
  const redirectResult = await oauthHandleRedirectIfPresent();
  let oauthResult = redirectResult || storedResult;

  // Discard expired tokens instead of using them and getting 401s
  if (oauthResult && isHfTokenExpired(oauthResult)) {
    console.warn('HF OAuth token expired, discarding.');
    oauthResult = null;
    localStorage.removeItem('hf_oauth');
  }

  const accessToken = oauthResult?.accessToken;
  if (accessToken) {
    hfAccessToken = accessToken;
    localStorage.setItem('hf_oauth', JSON.stringify(oauthResult));
    scheduleHfExpiry(oauthResult);
  }
  updateAuthUI(oauthResult);

  // Sign in: the requested scopes depend on the "private" toggle
  const startSignIn = async () => {
    const includePrivate = document.getElementById('hf-private-toggle').checked;
    const scopes = includePrivate
      ? 'openid profile read-repos'
      : 'openid profile gated-repos';
    window.location.href = (await oauthLoginUrl({ scopes })) + '&prompt=consent';
  };
  document.getElementById('hf-signin-btn').addEventListener('click', startSignIn);

  // Sign out
  document.getElementById('hf-signout-btn').addEventListener('click', () => hfSignOut());
}
// ── Init ────────────────────────────────────────────────────
const elInitLoading = document.getElementById('init-loading');
const elAddPanelRow = document.getElementById('add-panel-row');

// Application start-up: load config, DuckDB, the base parquet data and the
// model list (reporting progress in `#init-loading`), reveal the controls,
// set up HF auth, re-merge datasets remembered in localStorage, and create
// the default panel.
//
// Any failure is shown in `#init-loading`. That element is hidden once
// loading has finished, so it is made visible again before the message is
// written; otherwise errors from the later steps (auth, restore, first panel)
// would be invisible. The message is inserted as text, never parsed as HTML.
async function init() {
  try {
    elInitLoading.textContent = 'Loading config...';
    await loadConfig();
    elInitLoading.textContent = 'Initializing DuckDB...';
    await initDuckDB();
    elInitLoading.textContent = 'Loading data from HuggingFace...';
    await loadParquet();
    elInitLoading.textContent = 'Loading models...';
    await loadModels();
    elInitLoading.style.display = 'none';
    elAddPanelRow.style.display = '';
    document.getElementById('merge-dataset-row').style.display = '';

    // ── HF Auth setup ──
    await initHfAuth();

    // Restore previously merged datasets. A dataset that fails to load is
    // skipped and dropped from storage; the user gets a summary afterwards.
    try {
      const saved = JSON.parse(localStorage.getItem('mergedDatasets') || '[]');
      const failed = [];
      for (const input of saved) {
        try {
          await mergeDataset(input);
        } catch (e) {
          console.warn(`Skipping saved dataset "${input}":`, e.message);
          failed.push(input);
        }
      }
      if (failed.length > 0) {
        // Remove datasets that can't be loaded (auth expired, deleted, etc.)
        const remaining = mergedDatasets.map(d => d.input);
        localStorage.setItem('mergedDatasets', JSON.stringify(remaining));
        const statusEl = document.getElementById('merge-status');
        statusEl.className = 'merge-status error';
        statusEl.textContent = `${failed.length} saved dataset(s) skipped (sign in to load private datasets)`;
      }
    } catch (e) { console.warn('Failed to restore merged datasets:', e); }

    // Create default panel
    await addPanel({ suite: 'eng_base_main', metric: 'acc_norm' });
  } catch (err) {
    const message = document.createElement('span');
    message.style.color = '#e63946';
    const hint = document.createElement('small');
    hint.textContent = 'Check browser console for details.';
    message.append(`Error: ${err?.message ?? String(err)}`, document.createElement('br'), hint);
    elInitLoading.replaceChildren(message);
    elInitLoading.style.display = '';
    console.error('Init failed:', err);
  }
}
// Page-level wiring: "add panel" button, "merge dataset" button and Enter key
// in the merge input, then start the app.
{
  // Merge the given raw input unless it is blank. Failures are already
  // reported in the UI by mergeDataset, so the rejection is swallowed here.
  const submitMerge = (rawInput) => {
    if (rawInput.trim()) mergeDataset(rawInput).catch(() => {});
  };

  document.getElementById('btn-add-panel').addEventListener('click', () => addPanel());
  document.getElementById('btn-merge-dataset').addEventListener('click', () => {
    submitMerge(document.getElementById('merge-dataset-input').value);
  });
  document.getElementById('merge-dataset-input').addEventListener('keydown', (e) => {
    if (e.key !== 'Enter') return;
    submitMerge(e.target.value);
  });
}
init();
</script>
</body>
</html>