finephrase / app /src /content /embeds /d3-benchmark-comparison.html
joelniklaus's picture
joelniklaus HF Staff
improved plots for mobile
7db0003
<!--
Reusable bar/line chart for benchmark comparisons.
Configuration via data-config attribute:
{
"datasets": { // required (unless using setups)
"raw_name": "Display Name", // shorthand: string = display name
"raw_name": { "display": "Name", "color": "#hex", "shaded": true, "baseline": true }
// full form: display is required, rest optional
},
"setups": { "Setup Label": { "datasets": {...} }, ... }, // optional, multi-setup mode with dropdown + average
"defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
"defaultView": "bar", // optional, "bar" | "line", default: "bar"
"defaultSetup": "average", // optional, setup name or "average", default: "average" when ≥2 setups and hideAverage is not set
"hideAverage": false, // optional, if true the "Average (all setups)" dropdown entry is not offered, default: false
"tokensPerStep": 2100000, // optional, default: 2.1e6
"runColumn": "runname", // optional, CSV column for series, default: "runname"
"stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
}
Per-dataset options (all optional except display):
display: Display name shown in legend, axes, and tooltips
color: Pinned hex color (otherwise auto-assigned from palette)
shaded: If true, bar gets a diagonal-stripe pattern (useful for aggregate baselines)
baseline: If true, rendered as a reference line (vertical in bar view, horizontal in line view)
instead of a regular bar/line. Not shown in the legend.
Data: uses benchmark-results.csv by default (one CSV with all runs).
Only rows matching keys in datasets are displayed.
Example usage in MDX:
<HtmlEmbed
src="d3-benchmark-comparison.html"
title="Baseline Comparison"
config={{
datasets: {
cosmopedia: "Cosmopedia",
dclm: { display: "Baseline (DCLM)", baseline: true },
nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true }
}
}}
/>
-->
<div class="d3-benchmark-comparison"></div>
<style>
.d3-benchmark-comparison { position: relative; }
.d3-benchmark-comparison .controls {
display: flex;
flex-wrap: wrap;
gap: 16px;
align-items: flex-end;
justify-content: center;
margin: 10px 0 0 0;
}
.d3-benchmark-comparison .controls .control-group {
display: flex;
flex-direction: column;
align-items: flex-start;
gap: 6px;
}
.d3-benchmark-comparison .controls label {
font-size: 12px;
font-weight: 700;
color: var(--text-color);
}
.d3-benchmark-comparison .controls select {
appearance: none;
-webkit-appearance: none;
-moz-appearance: none;
border: 1px solid var(--border-color);
border-radius: 8px;
padding: 6px 28px 6px 10px;
background-color: var(--surface-bg);
color: var(--text-color);
font-size: 13px;
line-height: 1.2;
background-image: url("data:image/svg+xml,%3Csvg width='12' height='8' viewBox='0 0 12 8' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1.41 1.59L6 6.17l4.59-4.58L12 3 6 9 0 3z' fill='%23999'/%3E%3C/svg%3E");
background-repeat: no-repeat;
background-position: right 8px center;
}
.d3-benchmark-comparison .controls select:focus-visible {
outline: 2px solid var(--primary-color);
outline-offset: 2px;
}
.d3-benchmark-comparison .legend {
display: flex;
flex-direction: column;
align-items: flex-start;
gap: 6px;
margin: 8px 0 0 0;
padding-bottom: 4px;
}
.d3-benchmark-comparison .legend .legend-title {
font-size: 12px;
font-weight: 700;
color: var(--text-color);
}
.d3-benchmark-comparison .legend .items {
display: flex;
flex-wrap: wrap;
gap: 8px 14px;
}
.d3-benchmark-comparison .legend .item {
display: inline-flex;
align-items: center;
gap: 6px;
white-space: nowrap;
font-size: 12px;
color: var(--text-color);
cursor: pointer;
}
.d3-benchmark-comparison .legend .item.ghost { opacity: .25; }
.d3-benchmark-comparison .legend .swatch {
width: 14px;
height: 14px;
border-radius: 3px;
border: 1px solid var(--border-color);
}
.d3-benchmark-comparison .bar.ghost { opacity: .25; }
.d3-benchmark-comparison .value-label.ghost { opacity: .25; }
.d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
.d3-benchmark-comparison .line-path.ghost { opacity: .15; }
.d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
.d3-benchmark-comparison .baseline.ghost { opacity: .1; }
.d3-benchmark-comparison .axes path { display: none; }
.d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
.d3-benchmark-comparison .axes text { fill: var(--tick-color); }
.d3-benchmark-comparison .grid line { stroke: var(--grid-color); }
.d3-benchmark-comparison .hover-line {
stroke: var(--text-color);
stroke-opacity: 0.25;
stroke-width: 1;
pointer-events: none;
}
.d3-benchmark-comparison .d3-tooltip {
position: absolute;
top: 0px;
left: 0px;
transform: translate(-9999px, -9999px);
pointer-events: none;
padding: 8px 10px;
border-radius: 8px;
font-size: 12px;
line-height: 1.35;
border: 1px solid var(--border-color);
background: var(--surface-bg);
color: var(--text-color);
box-shadow: 0 4px 24px rgba(0,0,0,.18);
opacity: 0;
transition: opacity .12s ease;
text-align: left;
z-index: 10;
}
.d3-benchmark-comparison .d3-tooltip .tip-dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 3px;
border: 1px solid var(--border-color);
margin-right: 6px;
vertical-align: middle;
}
@media (max-width: 640px) {
.d3-benchmark-comparison .controls {
flex-direction: column;
align-items: stretch;
gap: 10px;
}
.d3-benchmark-comparison .controls .control-group {
width: 100%;
}
.d3-benchmark-comparison .controls select {
width: 100%;
}
.d3-benchmark-comparison .legend .item {
white-space: normal;
align-items: flex-start;
line-height: 1.2;
}
.d3-benchmark-comparison .legend .swatch {
flex-shrink: 0;
margin-top: 1px;
}
}
</style>
<script>
(() => {
// Run `cb` as soon as a usable D3 build (one exposing `d3.select`) is on `window`.
// If D3 is missing, a single shared <script id="d3-cdn-script"> is injected (or reused when
// already present) and `cb` is deferred to that script's `load` event.
const ensureD3 = (cb) => {
  const d3IsUsable = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (d3IsUsable()) return cb();

  const SCRIPT_ID = 'd3-cdn-script';
  let tag = document.getElementById(SCRIPT_ID);
  if (!tag) {
    tag = document.createElement('script');
    tag.id = SCRIPT_ID;
    tag.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(tag);
  }

  // Only fire the callback when the loaded global really is D3.
  const fireWhenUsable = () => {
    if (d3IsUsable()) cb();
  };
  tag.addEventListener('load', fireWhenUsable, { once: true });
  if (window.d3) fireWhenUsable();
};
const bootstrap = () => {
// ─── LOCATE CONTAINER ───
// Preferred target: the element immediately preceding the running <script>.
// NOTE(review): document.currentScript is presumably null if this runs from a deferred
// callback (e.g. after the CDN script loads) — hence the fallback below; confirm against caller.
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
// Fallback: pick the LAST `.d3-benchmark-comparison` element that is not yet marked as mounted.
if (!(container && container.classList && container.classList.contains('d3-benchmark-comparison'))) {
const cs = Array.from(document.querySelectorAll('.d3-benchmark-comparison')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
container = cs[cs.length - 1] || null;
}
// Nothing to mount into: abort silently.
if (!container) return;
// data-mounted="true" guards against initialising the same container twice.
if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; }
// Keep any inline position already set; otherwise use 'relative' (the tooltip is absolutely positioned via CSS).
container.style.position = container.style.position || 'relative';
// ─── READ CONFIG ───
// Walk up from the container (inclusive) to the first element carrying a non-empty
// `data-config` attribute; `mountEl` ends up null when no such element exists.
let mountEl = container;
while (mountEl && !mountEl.getAttribute?.('data-config')) { mountEl = mountEl.parentElement; }
// `cfg` stays `{}` (all defaults) when the attribute is absent, blank, not a JSON object
// literal, or malformed. Malformed JSON is reported on the console instead of being
// swallowed silently, so a misconfigured embed can be diagnosed.
let cfg = {};
try {
  const raw = mountEl && mountEl.getAttribute ? mountEl.getAttribute('data-config') : null;
  if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
} catch (err) {
  if (typeof console !== 'undefined' && typeof console.warn === 'function') {
    console.warn('[d3-benchmark-comparison] Ignoring invalid data-config, using defaults:', err);
  }
}
// ─── NORMALIZE DATASETS CONFIG ───
// Accepts: { "key": "Name" } or { "key": { display, color, shaded, baseline } }
// Returns: { key: { display, color, shaded, baseline } } — each value is a fresh shallow
// copy, so callers may mutate the result without touching the config object.
// A missing or empty `display` falls back to the raw key: the renderers read
// `name.length` on the display name and would otherwise throw on `undefined`.
// A null/undefined `raw` yields `{}`.
function normalizeDatasets(raw) {
  const out = {};
  for (const [k, v] of Object.entries(raw || {})) {
    const opts = typeof v === 'string' ? { display: v } : { ...v };
    if (opts.display == null || opts.display === '') opts.display = k;
    out[k] = opts;
  }
  return out;
}
// ─── SETUP SUPPORT ───
// Multi-setup mode is active only when `cfg.setups` is an object with at least one key.
// An empty `setups: {}` is treated as "no setups" (falls back to `cfg.datasets`) instead of
// crashing on `SETUPS[undefined].datasets`.
const SETUPS = (cfg.setups && typeof cfg.setups === 'object' && Object.keys(cfg.setups).length) ? cfg.setups : null;
const setupNames = SETUPS ? Object.keys(SETUPS) : [];
// Pseudo-setup key used for the cross-setup average view (also its dropdown label).
const AVG_SETUP_KEY = 'Average (all setups)';
const HIDE_AVERAGE = !!cfg.hideAverage;
// The average view is only offered in the dropdown when there are ≥2 setups and it is not
// hidden. The same rule is applied here, so an explicit `defaultSetup: "average"` cannot
// select a view that the dropdown does not list; in that case the first setup is used.
const defaultSetupCfg = (setupNames.length >= 2 && !HIDE_AVERAGE)
  ? (cfg.defaultSetup || 'average')
  : (cfg.defaultSetup && cfg.defaultSetup !== 'average' ? cfg.defaultSetup : null);
// Unknown setup names fall back to the first configured setup.
let currentSetup = SETUPS ? (defaultSetupCfg === 'average' ? AVG_SETUP_KEY : (defaultSetupCfg && setupNames.includes(defaultSetupCfg) ? defaultSetupCfg : setupNames[0])) : null;
// Active dataset options keyed by raw run name. Starts empty for the average view
// (the averaged entries are assigned to `avgDatasets` elsewhere). `?.` tolerates a setup
// whose value is null/undefined.
let DATASETS = SETUPS ? (currentSetup === AVG_SETUP_KEY ? {} : normalizeDatasets(SETUPS[currentSetup]?.datasets)) : normalizeDatasets(cfg.datasets);
let avgDatasets = {};
let parsedData = [];
// CSV column names and conversion factor, each with a documented default.
const RUN_COL = cfg.runColumn || 'runname';
const STEP_COL = cfg.stepColumn || 'steps';
const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
const defaultView = cfg.defaultView || 'bar';
// Short random suffix used to build per-instance element ids (selects, SVG patterns).
const uid = Math.random().toString(36).slice(2, 8);
// ─── DATASET ACCESSORS ───
// All lookups go through the currently active DATASETS map, keyed by raw run name.

// Display label for a run; unknown runs are shown under their raw name.
function displayName(raw) {
  const entry = DATASETS[raw];
  if (!entry) return raw;
  return entry.display;
}
// True when the run is configured as a reference (baseline) line.
function isBaseline(raw) {
  const entry = DATASETS[raw];
  return Boolean(entry && entry.baseline);
}
// True when the run's bar should use the striped pattern fill.
function isShaded(raw) {
  const entry = DATASETS[raw];
  return Boolean(entry && entry.shaded);
}
// Configured color for the run, or a falsy value when none is pinned.
function pinnedColor(raw) {
  const entry = DATASETS[raw];
  return entry && entry.color;
}
// Per-instance SVG pattern id; every non-alphanumeric character of the run name becomes "_".
function stripePatternId(raw) {
  const safeName = raw.replace(/[^a-zA-Z0-9]/g, '_');
  return `stripe-${uid}-${safeName}`;
}
// Human-readable labels for known metric columns, keyed by the exact CSV column name.
// Columns not listed here are displayed under their raw column name (see metricName).
const METRIC_NAMES = {
// Aggregate scores
'agg_score_macro': 'Aggregate Score (Macro)',
'agg_score_micro': 'Aggregate Score (Micro)',
'agg_score_RC': 'Reading Comprehension',
'agg_score_GK': 'General Knowledge',
'agg_score_NLU': 'Natural Language Understanding',
'agg_score_MATH': 'Math',
'agg_score_TABLE': 'Table Understanding',
'agg_score_RES': 'Reasoning',
// Individual benchmarks
'lighteval|arc_cf:easy|3/prob_norm_token': 'ARC-Easy',
'lighteval|drop|3/prob_norm_token': 'DROP',
'lighteval|gsm8k|3/prob_norm_token': 'GSM8K',
'lighteval|hellaswag_cf|3/prob_norm_token': 'HellaSwag',
'lighteval|openbookqa_cf|3/prob_norm_token': 'OpenBookQA',
'lighteval|piqa_cf|3/prob_norm_token': 'PIQA',
'lighteval|squad_v2|3/prob_norm_token': 'SQuAD v2',
// NOTE(review): key says "treb_qa" but the label is "TriviaQA" — confirm this mapping is intended.
'lighteval|treb_qa|3/prob_norm_token': 'TriviaQA',
'lighteval|wikitablequestions|3/prob_norm_token': 'WikiTableQuestions',
'lighteval|winogrande_cf|3/prob_norm_token': 'Winogrande',
'lighteval|xcsqa_cf|3/prob_norm_token': 'XCSQA',
'lighteval|mmlu_redux_cf:_average|3/prob_norm_token': 'MMLU Redux'
};
// Tooltip: reuse an existing `.d3-tooltip` inside the container if there is one, otherwise
// create it. `tipInner` is the element whose innerHTML receives the tooltip markup; when a
// reused tooltip has no `.d3-tooltip__inner` child, the tooltip element itself is used.
let tip = container.querySelector('.d3-tooltip'), tipInner;
if (!tip) {
tip = document.createElement('div'); tip.className = 'd3-tooltip';
tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tip.appendChild(tipInner);
container.appendChild(tip);
} else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; }
// SVG: `gRoot` holds all chart marks (translated by the margins on each render),
// `defs` holds the stripe patterns used by shaded bars.
const svg = d3.select(container).append('svg').attr('width', '100%').style('display', 'block');
const gRoot = svg.append('g');
const defs = svg.append('defs');
// State (mutable, shared by the functions below)
let allData = []; // rows currently displayed (see filterData)
let metricKeys = []; // metric column names offered in the Metric select
let currentMetric = defaultMetric;
let currentView = defaultView; // 'bar' renders the bar chart; any other value renders the line chart
let colorMap = {}; // raw run name -> color, filled by initColors
let highlight = null; // display name of the hovered series, or null
// ─── HELPERS ───
// Label for a metric column; falls back to the column name itself.
function metricName(key) {
  const label = METRIC_NAMES[key];
  return label ? label : key;
}
// Training steps -> token count, using the configured tokens-per-step factor.
function stepsToTokens(step) {
  const tokens = step * TOKENS_PER_STEP;
  return tokens;
}
// Compact token count: one decimal with a B (≥1e9) or M (≥1e6) suffix, otherwise
// the plain number with thousands separators.
function formatTokens(tokens) {
  const units = [[1e9, 'B'], [1e6, 'M']];
  for (const [scale, suffix] of units) {
    if (tokens >= scale) return d3.format('.1f')(tokens / scale) + suffix;
  }
  return d3.format(',')(tokens);
}
// Compact step label. Steps ≥ 1000 are expressed in thousands with a "K" suffix and at most
// one decimal place (1000 -> "1K", 1500 -> "1.5K"); smaller steps are printed as-is.
// Keeping the decimal avoids mislabelling checkpoints that are not multiples of 1000
// (rounding to whole thousands would show step 1500 as "2K").
function formatStep(step) {
  if (step >= 1000) {
    // Round to the nearest 100 steps first, then scale to thousands: integer/10 prints
    // with no trailing ".0", so exact thousands stay "1K", "25K", ...
    return String(Math.round(step / 100) / 10) + 'K';
  }
  return String(step);
}
// Axis-tick form: "<tokens> (<steps>)".
function stepLabelShort(step) {
  const tokenPart = formatTokens(stepsToTokens(step));
  return tokenPart + ' (' + formatStep(step) + ')';
}
// Tooltip-header form: "<tokens> Tokens (<steps> Steps)".
function stepLabelLong(step) {
  const tokenPart = formatTokens(stepsToTokens(step));
  return tokenPart + ' Tokens (' + formatStep(step) + ' Steps)';
}
// Returns up to `n` categorical colors.
// Preference order:
//   1. window.ColorPalettes.getColors('categorical', n), when that hook exists and returns a
//      usable array;
//   2. d3.schemeTableau10, or a hard-coded copy of it, truncated to `n` entries.
// The hook's result is validated because the caller indexes it with `i % palette.length`:
// a non-array would throw there, and an empty array would leave every series uncolored.
// Fewer than `n` colors may be returned; callers are expected to cycle.
function getCategoricalColors(n) {
  try {
    if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
      const themed = window.ColorPalettes.getColors('categorical', n);
      if (Array.isArray(themed) && (themed.length > 0 || !(n > 0))) return themed;
    }
  } catch (_) {
    // Best effort: a failing palette hook must never break the chart; use the fallback below.
  }
  const fallback = d3.schemeTableau10 || ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ac'];
  return fallback.slice(0, n);
}
// Fill `colorMap` (raw run name -> color) for every run present in `allData`.
// No-op when the map already has entries; callers reset it to `{}` to force a rebuild.
// Runs with a pinned color keep it; the rest get palette colors in alphabetical order of
// their raw names, cycling when the palette is shorter than the list.
function initColors() {
  if (Object.keys(colorMap).length > 0) return;

  const runNames = Array.from(d3.group(allData, (row) => row[RUN_COL]).keys());
  runNames.sort();

  const autoColored = [];
  for (const raw of runNames) {
    const fixed = pinnedColor(raw);
    if (fixed) {
      colorMap[raw] = fixed;
    } else {
      autoColored.push(raw);
    }
  }

  const palette = getCategoricalColors(autoColored.length);
  for (let i = 0; i < autoColored.length; i += 1) {
    colorMap[autoColored[i]] = palette[i % palette.length];
  }
}
// ─── SETUP HELPERS ───
// Recompute `allData` from `parsedData`: keep only rows whose run name is a key of the
// active DATASETS map. With no configured datasets, every parsed row is shown.
// The `columns` property of the parsed data is carried over onto the result.
function filterData() {
  const wanted = new Set(Object.keys(DATASETS));
  if (wanted.size === 0) {
    allData = parsedData;
  } else {
    allData = parsedData.filter((row) => wanted.has(row[RUN_COL]));
  }
  allData.columns = parsedData.columns;
}
// Build synthetic "average across setups" rows.
//
// Datasets are matched across setups by display name. A display name is averaged only when
// it has at least `setupNames.length` dataset entries over all setups. For each such name
// and each step found in `rawData`, every non-run/non-step column is averaged over the
// matching rows (columns with no numeric value in any row become 0).
//
// rawData: array of CSV row objects, optionally with a `columns` array.
// Returns { data, datasets }:
//   data     — averaged rows; the run column holds a synthetic "__avg__<name>" key and the
//              step column holds the step as a string;
//   datasets — options for each synthetic key (display name plus the options that the first
//              setup declares for that display name).
// Returns empty results when fewer than two setups are configured.
function computeAverageData(rawData) {
  if (!SETUPS || setupNames.length < 2) return { data: [], datasets: {} };

  // Normalize each setup once; index 0 is also reused for the option merge below.
  const perSetup = setupNames.map(sName => normalizeDatasets((SETUPS[sName] || {}).datasets));

  const displayToRaws = {};
  for (const ds of perSetup) {
    for (const [raw, opts] of Object.entries(ds)) {
      if (!displayToRaws[opts.display]) displayToRaws[opts.display] = [];
      displayToRaws[opts.display].push(raw);
    }
  }
  const fullDisplay = Object.entries(displayToRaws)
    .filter(([, raws]) => raws.length >= setupNames.length);

  // Index rows by run + NUMERIC step. The lookup below uses numeric steps, so keying by the
  // raw CSV string would miss rows whose step is written as "1000.0" or " 1000".
  const rowKey = (run, step) => run + '|' + (+step);
  const byRunStep = {};
  for (const row of rawData) byRunStep[rowKey(row[RUN_COL], row[STEP_COL])] = row;

  const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
  const cols = rawData.columns || Object.keys(rawData[0] || {});
  // Empty or missing cells are "no value", not 0 (unary plus would coerce '' to 0 and
  // drag the average down).
  const hasValue = v => v != null && String(v).trim() !== '';

  const result = [];
  const dsMap = {};
  for (const [display, raws] of fullDisplay) {
    const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
    // Merge options from the first setup that has this display name
    const firstOpts = Object.values(perSetup[0]).find(o => o.display === display) || {};
    dsMap[avgRaw] = { display, ...firstOpts };
    for (const step of steps) {
      const rows = raws.map(r => byRunStep[rowKey(r, step)]).filter(Boolean);
      if (!rows.length) continue;
      const avgRow = { [RUN_COL]: avgRaw, [STEP_COL]: String(step) };
      for (const col of cols) {
        if (col === RUN_COL || col === STEP_COL) continue;
        const vals = rows.map(r => r[col]).filter(hasValue).map(v => +v).filter(v => !isNaN(v));
        avgRow[col] = vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : 0;
      }
      result.push(avgRow);
    }
  }
  return { data: result, datasets: dsMap };
}
// Activate the setup called `name` (or the average pseudo-setup) and redraw everything.
// After selecting the setup's datasets, baseline datasets declared in ANY setup are added
// as well, provided they are not already present and the parsed data contains rows for them.
function switchSetup(name) {
  currentSetup = name;
  DATASETS = name === AVG_SETUP_KEY
    ? { ...avgDatasets }
    : normalizeDatasets(SETUPS[name].datasets);

  // Re-add baselines from any setup
  setupNames.forEach((setupName) => {
    const candidates = normalizeDatasets(SETUPS[setupName].datasets);
    Object.keys(candidates).forEach((raw) => {
      const opts = candidates[raw];
      if (!opts.baseline) return;
      if (DATASETS[raw]) return;
      const hasRows = parsedData.some((row) => row[RUN_COL] === raw);
      if (hasRows) DATASETS[raw] = { ...opts };
    });
  });

  // Colors depend on the set of visible runs, so they are rebuilt from scratch.
  colorMap = {};
  filterData();
  initColors();
  render();
  buildLegend();
}
// Show the tooltip with the given HTML near container-relative point (x, y).
// The tooltip sits 12px right of the pointer, flips to the left when it would overflow the
// container's right edge, and is clamped so it stays inside the container horizontally.
// Vertically it is placed 20px above the pointer, never above the container's top.
function showTip(html, x, y) {
  tipInner.innerHTML = html;

  const tooltipWidth = tip.offsetWidth || 180;
  const hostWidth = container.clientWidth || 800;

  let leftPx;
  if (x + tooltipWidth + 20 > hostWidth) {
    leftPx = x - tooltipWidth - 12;
  } else {
    leftPx = x + 12;
  }
  const maxLeft = Math.max(0, hostWidth - tooltipWidth - 6);
  leftPx = Math.max(0, Math.min(leftPx, maxLeft));
  const topPx = Math.max(0, y - 20);

  tip.style.transform = 'translate(' + leftPx + 'px, ' + topPx + 'px)';
  tip.style.opacity = '1';
}
// Hide the tooltip: fade it out and move it far off-screen.
function hideTip() {
  Object.assign(tip.style, {
    opacity: '0',
    transform: 'translate(-9999px, -9999px)',
  });
}
// Apply the current `highlight` (a display name, or null) to the chart and legend:
// every mark or legend item that does NOT belong to the highlighted series gets the
// `ghost` class; with no highlight, `ghost` is removed everywhere.
function updateHighlight() {
  const isDimmed = (d) => highlight && d.name !== highlight;
  const markSelectors = [
    'rect.bar',
    'text.value-label',
    '.line-path',
    '.line-dot',
    '.baseline-vline',
    '.baseline-vlabel',
    '.baseline-hline',
    '.baseline-hlabel',
  ];
  for (const selector of markSelectors) {
    gRoot.selectAll(selector).classed('ghost', isDimmed);
  }

  const legendItems = container.querySelectorAll('.legend .item');
  legendItems.forEach((item) => {
    const itemName = item.getAttribute('data-name');
    item.classList.toggle('ghost', highlight && itemName !== highlight);
  });
}
// ─── AUTO-DETECT METRICS from CSV columns ───
// Returns the metric column names to offer, aggregate scores first (in a fixed order),
// followed by every other column whose value in the FIRST displayed row is numeric.
// The run column, the step column and "seed" are never treated as metrics.
// A missing `columns` array or an empty `allData` yields only the aggregates that are
// present (or an empty list) instead of throwing.
function detectMetrics(columns) {
  const skip = new Set([RUN_COL, STEP_COL, 'seed']);
  const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
  const cols = Array.isArray(columns) ? columns : [];
  const agg = aggOrder.filter(k => cols.includes(k));
  // With no rows, every lookup is undefined -> NaN -> no individual metric is detected.
  const sample = allData[0] || {};
  const ind = cols.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+sample[k]));
  return [...agg, ...ind];
}
// ─── BAR CHART ───
// Horizontal bar chart of the current metric at each run's highest step.
// - Non-baseline runs become bars, sorted by value (highest first).
// - Baseline runs become dashed vertical reference lines with a label above the plot.
// - Layout adapts to the container width (narrower than 640px counts as mobile).
// Reads: allData, currentMetric, colorMap, DATASETS (via accessors). Writes: svg size,
// gRoot/defs children, and `highlight` (from hover handlers).
// Note: render() empties gRoot and defs before calling this, so when invoked that way only
// the `enter` branches of the joins below actually run.
function renderBar() {
const width = container.clientWidth || 800;
const isMobile = width < 640;
const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
const grouped = d3.group(allData, d => d[RUN_COL]);
// One datum per run: the row at that run's maximum step.
const finalData = [];
for (const [raw, rows] of grouped) {
const maxStep = d3.max(rows, r => +r[STEP_COL]);
const row = rows.find(r => +r[STEP_COL] === maxStep);
if (row) finalData.push({ name: displayName(raw), rawName: raw, value: +row[currentMetric] });
}
finalData.sort((a, b) => b.value - a.value);
const barData = finalData.filter(d => !isBaseline(d.rawName));
const baselineData = finalData.filter(d => isBaseline(d.rawName));
// Left margin: estimated from the longest label (fixed px-per-character guess),
// with a minimum and a cap that both depend on the viewport class.
const maxLabelChars = d3.max(finalData, d => d.name.length) || 0;
const desiredLeft = Math.max(
isMobile ? 92 : 150,
Math.round(maxLabelChars * (isMobile ? 5.2 : 6.3))
);
const margin = {
// Extra top room when baseline labels are drawn above the plot area.
top: hasBaselines ? 20 : 12,
right: isMobile ? 40 : 56,
bottom: isMobile ? 30 : 32,
left: Math.min(desiredLeft, isMobile ? 126 : 220),
};
// Chart height grows with the number of bars (baselines do not add rows).
const barHeight = 28, barGap = 8;
const height = margin.top + margin.bottom + barData.length * (barHeight + barGap);
svg.attr('width', width).attr('height', height);
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
const innerWidth = width - margin.left - margin.right;
const innerHeight = height - margin.top - margin.bottom;
// x spans 0 .. 105% of the largest value (bars AND baselines) to leave room for value labels.
const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
// Bands are keyed by display name.
const y = d3.scaleBand().domain(barData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
// Grid
gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
g.selectAll('line').data(x.ticks(5)).join('line')
.attr('x1', d => x(d)).attr('x2', d => x(d)).attr('y1', 0).attr('y2', innerHeight);
});
// X axis
gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
.attr('transform', `translate(0,${innerHeight})`)
.call(d3.axisBottom(x).ticks(isMobile ? 4 : 5).tickFormat(d3.format('.2f')).tickSizeOuter(0))
.call(g => {
g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', isMobile ? '10px' : '11px');
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
});
// Y axis
gRoot.selectAll('.axis-y').data([0]).join('g').attr('class', 'axes axis-y')
.call(d3.axisLeft(y).tickSizeOuter(0))
.call(g => {
g.selectAll('text').attr('fill', 'var(--text-color)').style('font-size', isMobile ? '11px' : '12px').style('font-weight', '500');
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
});
// Stripe patterns for shaded bars
// One <pattern> per shaded run: a translucent fill of the run's color plus a solid
// stripe, rotated 45° to produce diagonal hatching.
barData.forEach(d => {
if (!isShaded(d.rawName)) return;
const c = colorMap[d.rawName] || '#999';
const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
.attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
pat.append('rect').attr('width', 6).attr('height', 6).attr('fill', c).attr('opacity', 0.35);
pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
});
// Fill for a bar: pattern reference when shaded, otherwise the run's color.
function barFill(d) {
if (isShaded(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
return colorMap[d.rawName] || 'var(--primary-color)';
}
// Bars
// Tooltip follows the pointer; coordinates are relative to the container.
const barTip = (ev, d) => {
const [mx, my] = d3.pointer(ev, container);
showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(3)}</strong>`, mx, my);
};
gRoot.selectAll('rect.bar').data(barData, d => d.name).join(
// New bars grow from zero width.
enter => enter.append('rect').attr('class', 'bar')
.attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
.attr('fill', d => barFill(d))
.attr('width', 0)
.on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
.on('mousemove', barTip)
.on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
.transition().duration(300).attr('width', d => Math.max(0, x(d.value))),
update => update
.on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
.on('mousemove', barTip)
.on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
.transition().duration(300)
.attr('y', d => y(d.name)).attr('height', y.bandwidth())
.attr('width', d => Math.max(0, x(d.value)))
.attr('fill', d => barFill(d)),
exit => exit.transition().duration(200).attr('width', 0).remove()
);
// Value labels
// Numeric value (3 decimals) printed just right of each bar's end.
gRoot.selectAll('text.value-label').data(barData, d => d.name).join(
enter => enter.append('text').attr('class', 'value-label')
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
.attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', isMobile ? 10 : 11)
.text(d => d.value.toFixed(3)),
update => update.transition().duration(300)
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
.text(d => d.value.toFixed(3)),
exit => exit.remove()
);
// Baseline vertical reference lines
gRoot.selectAll('.baseline-vline').data(baselineData, d => d.name).join(
enter => enter.append('line').attr('class', 'baseline-vline baseline')
.attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
.attr('y1', 0).attr('y2', innerHeight)
.attr('stroke', d => colorMap[d.rawName] || '#999')
.attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
update => update.transition().duration(300)
.attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
.attr('y1', 0).attr('y2', innerHeight)
.attr('stroke', d => colorMap[d.rawName] || '#999'),
exit => exit.remove()
);
// Baseline labels: "<name> (<value>)" centered on the line, drawn in the top margin.
gRoot.selectAll('.baseline-vlabel').data(baselineData, d => d.name).join(
enter => enter.append('text').attr('class', 'baseline-vlabel baseline')
.attr('x', d => x(d.value)).attr('y', -4)
.attr('text-anchor', 'middle').attr('fill', d => colorMap[d.rawName] || '#999')
.attr('font-size', isMobile ? 10 : 11).attr('font-weight', 600)
.text(d => `${d.name} (${d.value.toFixed(3)})`),
update => update.transition().duration(300)
.attr('x', d => x(d.value))
.text(d => `${d.name} (${d.value.toFixed(3)})`),
exit => exit.remove()
);
}
// ─── LINE CHART ───
// Line chart of the current metric over training steps.
// - Non-baseline runs become one line (plus dots) each.
// - Baseline runs become dashed horizontal reference lines at their last-step value.
// - A transparent overlay snaps the pointer to the nearest step and shows a tooltip
//   listing every series' value at that step.
// Reads: allData, currentMetric, colorMap, DATASETS (via accessors). Writes: svg size and
// gRoot children.
// Note: render() empties gRoot before calling this, so when invoked that way only the
// `enter` branches of the joins below actually run.
function renderLine() {
const width = container.clientWidth || 800;
const isMobile = width < 640;
const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
const margin = { top: 16, right: isMobile ? 18 : 50, bottom: isMobile ? 42 : 48, left: isMobile ? 46 : 60 };
// Height follows the width by a fixed ratio, with a per-viewport minimum.
const height = Math.max(isMobile ? 260 : 300, Math.round(width / (isMobile ? 1.95 : 2.5)));
svg.attr('width', width).attr('height', height);
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
const innerWidth = width - margin.left - margin.right;
const innerHeight = height - margin.top - margin.bottom;
// Build series
// Points are sorted by step; baselines additionally record the value of their last point.
const grouped = d3.group(allData, d => d[RUN_COL]);
const series = [];
const baselineSeries = [];
for (const [raw, rows] of grouped) {
const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
const entry = { name: displayName(raw), rawName: raw, values: pts };
if (isBaseline(raw)) {
entry.finalValue = pts[pts.length - 1].value;
baselineSeries.push(entry);
} else {
series.push(entry);
}
}
// x domain comes from the steps of non-baseline runs only.
const allSteps = Array.from(new Set(allData.filter(r => !isBaseline(r[RUN_COL])).map(r => +r[STEP_COL]))).sort((a, b) => a - b);
// y domain covers every plotted value: all points of regular series, only the final value of baselines.
const allValues = [...series, ...baselineSeries].flatMap(s => s.finalValue != null ? [s.finalValue] : s.values.map(v => v.value));
const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
// 8% padding above and below the data range.
const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
const y = d3.scaleLinear().domain([yMin - yPad, yMax + yPad]).range([innerHeight, 0]).nice();
// Grid
gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
g.selectAll('line').data(y.ticks(6)).join('line')
.attr('x1', 0).attr('x2', innerWidth).attr('y1', d => y(d)).attr('y2', d => y(d));
});
// X axis
// Mobile ticks show tokens only; wider layouts show "tokens (steps)".
gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
.attr('transform', `translate(0,${innerHeight})`)
.call(
d3.axisBottom(x)
.ticks(isMobile ? 4 : 6)
.tickFormat(d => isMobile ? formatTokens(stepsToTokens(d)) : stepLabelShort(d))
.tickSizeOuter(0)
)
.call(g => {
g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', isMobile ? '9px' : '10px');
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
});
// Y axis
gRoot.selectAll('.axis-y').data([0]).join('g').attr('class', 'axes axis-y')
.call(d3.axisLeft(y).ticks(isMobile ? 5 : 6).tickFormat(d3.format('.2f')).tickSizeOuter(0))
.call(g => {
g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', isMobile ? '10px' : '11px');
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
});
// Axis labels
gRoot.selectAll('.x-label').data([0]).join('text').attr('class', 'x-label')
.attr('x', innerWidth / 2).attr('y', innerHeight + (isMobile ? 32 : 38))
.attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', isMobile ? 11 : 12)
.text('Tokens (Steps)');
gRoot.selectAll('.y-label').data([0]).join('text').attr('class', 'y-label')
.attr('transform', 'rotate(-90)').attr('x', -innerHeight / 2).attr('y', isMobile ? -34 : -44)
.attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', isMobile ? 11 : 12)
.text(metricName(currentMetric));
// Baseline horizontal reference lines
gRoot.selectAll('.baseline-hline').data(baselineSeries, d => d.name).join(
enter => enter.append('line').attr('class', 'baseline-hline baseline')
.attr('x1', 0).attr('x2', innerWidth)
.attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
.attr('stroke', d => colorMap[d.rawName] || '#999')
.attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
update => update.transition().duration(300)
.attr('x1', 0).attr('x2', innerWidth)
.attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
.attr('stroke', d => colorMap[d.rawName] || '#999'),
exit => exit.remove()
);
// Baseline labels: "<name> (<value>)" at the left edge, just above the line.
gRoot.selectAll('.baseline-hlabel').data(baselineSeries, d => d.name).join(
enter => enter.append('text').attr('class', 'baseline-hlabel baseline')
.attr('x', 4).attr('y', d => y(d.finalValue) - 6)
.attr('text-anchor', 'start')
.attr('fill', d => colorMap[d.rawName] || '#999')
.attr('font-size', isMobile ? 9 : 10).attr('font-weight', 600)
.text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
update => update.transition().duration(300)
.attr('x', 4).attr('y', d => y(d.finalValue) - 6)
.text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
exit => exit.remove()
);
// Lines (non-baseline)
const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
gRoot.selectAll('.line-path').data(series, d => d.name).join(
enter => enter.append('path').attr('class', 'line-path')
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
.attr('d', d => line(d.values)),
update => update.transition().duration(300)
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
.attr('d', d => line(d.values)),
exit => exit.remove()
);
// Dots (non-baseline)
// One dot per (series, step); the join key combines display name and step.
const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
enter => enter.append('circle').attr('class', 'line-dot')
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
.attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
update => update.transition().duration(300)
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
exit => exit.remove()
);
// Hover overlay
// An invisible rect over the plot area captures pointer events; the vertical hover line
// stays hidden until the pointer moves over the overlay.
gRoot.selectAll('.hover-line').data([0]).join('line').attr('class', 'hover-line')
.attr('y1', 0).attr('y2', innerHeight).style('display', 'none');
gRoot.selectAll('.hover-overlay').data([0]).join('rect').attr('class', 'hover-overlay')
.attr('width', innerWidth).attr('height', innerHeight)
.attr('fill', 'none').attr('pointer-events', 'all')
.on('mousemove', (ev) => {
const [mx] = d3.pointer(ev, gRoot.node());
// Snap to the data step closest to the pointer's x position.
const nearest = allSteps.reduce((best, s) => Math.abs(s - x.invert(mx)) < Math.abs(best - x.invert(mx)) ? s : best, allSteps[0]);
gRoot.select('.hover-line').attr('x1', x(nearest)).attr('x2', x(nearest)).style('display', null);
// Only series that have a point exactly at the snapped step are listed...
const entries = series.map(s => {
const pt = s.values.find(v => v.step === nearest);
return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
}).filter(Boolean);
// ...while baselines are always listed, with their final value.
baselineSeries.forEach(s => {
entries.push({ name: s.name, rawName: s.rawName, value: s.finalValue });
});
entries.sort((a, b) => b.value - a.value);
let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
entries.forEach(e => {
html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(3)}</strong></div>`;
});
// Tooltip position is relative to the container, not the translated plot group.
const [cx, cy] = d3.pointer(ev, container);
showTip(html, cx, cy);
})
.on('mouseleave', () => {
gRoot.select('.hover-line').style('display', 'none');
hideTip();
});
}
// ─── RENDER ───
// Redraw the chart for the current view from scratch. Does nothing while there is no data.
// Every previous mark and pattern is discarded first, then the view-specific renderer runs
// ('bar' -> renderBar, anything else -> renderLine).
function render() {
  const hasRows = Boolean(allData.length);
  if (!hasRows) return;

  initColors();
  [gRoot, defs].forEach((layer) => {
    layer.selectAll('*').remove();
  });

  const draw = currentView === 'bar' ? renderBar : renderLine;
  draw();
}
// ─── UI ───
// Create the controls row (Setup — only in multi-setup mode —, View, Metric) and the empty
// legend shell, and append both to the container. The Metric select is created empty; its
// options are filled in separately. Element ids are suffixed with the per-instance `uid`.
function buildUI() {
  // Local DOM factories.
  const makeOption = (value, text, selected) => {
    const opt = document.createElement('option');
    opt.value = value;
    opt.textContent = text;
    if (selected) opt.selected = true;
    return opt;
  };
  const makeControl = (idPrefix, labelText) => {
    const group = document.createElement('div');
    group.className = 'control-group';
    const label = document.createElement('label');
    label.setAttribute('for', idPrefix + uid);
    label.textContent = labelText;
    const select = document.createElement('select');
    select.id = idPrefix + uid;
    group.appendChild(label);
    group.appendChild(select);
    return { group, select };
  };

  const controls = document.createElement('div');
  controls.className = 'controls';

  // Setup selector: one entry per setup, plus the average entry when it is available.
  if (SETUPS && setupNames.length > 0) {
    const setup = makeControl('setup-', 'Setup');
    for (const name of setupNames) {
      setup.select.appendChild(makeOption(name, name, name === currentSetup));
    }
    if (setupNames.length >= 2 && !HIDE_AVERAGE) {
      setup.select.appendChild(makeOption(AVG_SETUP_KEY, AVG_SETUP_KEY, currentSetup === AVG_SETUP_KEY));
    }
    setup.select.addEventListener('change', () => { switchSetup(setup.select.value); });
    controls.appendChild(setup.group);
  }

  // View selector: final-score bars vs. training-progression lines.
  const view = makeControl('view-', 'View');
  const viewChoices = [['bar', 'Final Score'], ['line', 'Training Progression']];
  for (const [value, text] of viewChoices) {
    view.select.appendChild(makeOption(value, text, value === currentView));
  }
  view.select.addEventListener('change', () => {
    currentView = view.select.value;
    render();
  });
  controls.appendChild(view.group);

  // Metric selector: options are added later.
  const metric = makeControl('metric-', 'Metric');
  controls.appendChild(metric.group);

  container.appendChild(controls);

  const legend = document.createElement('div');
  legend.className = 'legend';
  legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
  container.appendChild(legend);
}
// (Re)build the options of this instance's Metric select from `metricKeys`, grouped into
// "Aggregate Scores" (keys starting with "agg_score") and "Individual Benchmarks".
// Empty groups are not added. Does nothing when the select does not exist yet.
//
// Safe to call more than once: the change listener is attached only on the first call, so
// repeated calls do not stack listeners (and re-renders) on the same element.
// If the current metric is not among the detected metrics (e.g. the configured default
// column is absent from the CSV), the first detected metric becomes current so that the
// select and the chart cannot disagree.
function populateMetricSelect() {
  const sel = container.querySelector('#metric-' + uid);
  if (!sel) return;
  if (metricKeys.length && !metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
  sel.innerHTML = '';
  const aggGroup = document.createElement('optgroup'); aggGroup.label = 'Aggregate Scores';
  const indGroup = document.createElement('optgroup'); indGroup.label = 'Individual Benchmarks';
  metricKeys.forEach(key => {
    const opt = document.createElement('option'); opt.value = key; opt.textContent = metricName(key);
    if (key === currentMetric) opt.selected = true;
    if (key.startsWith('agg_score')) aggGroup.appendChild(opt); else indGroup.appendChild(opt);
  });
  if (aggGroup.children.length) sel.appendChild(aggGroup);
  if (indGroup.children.length) sel.appendChild(indGroup);
  // Marker attribute records that the handler is already bound to this element.
  if (sel.getAttribute('data-change-bound') !== 'true') {
    sel.addEventListener('change', () => { currentMetric = sel.value; render(); });
    sel.setAttribute('data-change-bound', 'true');
  }
}
/**
 * Rebuild the legend entries from the rows currently in `allData`.
 *
 * One entry is created per distinct RUN_COL value, excluding series flagged
 * as baselines. Entries are ordered by descending `defaultMetric` score at
 * each run's highest step (note: `defaultMetric`, not `currentMetric`).
 * Hovering an entry sets `highlight` to its display name and calls
 * updateHighlight(); leaving it clears the highlight.
 * Does nothing when the legend container is not present.
 */
function buildLegend() {
  const items = container.querySelector('.legend .items');
  if (!items) return;
  items.innerHTML = '';
  const grouped = d3.group(allData, d => d[RUN_COL]);
  const sorted = Array.from(grouped.entries())
    .map(([raw, rows]) => {
      const maxStep = d3.max(rows, r => +r[STEP_COL]);
      const row = rows.find(r => +r[STEP_COL] === maxStep);
      const score = row ? +row[defaultMetric] : 0;
      // A missing or non-numeric cell coerces to NaN, which would make the
      // comparator below inconsistent and the resulting order
      // implementation-defined. Fall back to 0, same as the "no row" case.
      return { raw, score: Number.isFinite(score) ? score : 0 };
    })
    .sort((a, b) => b.score - a.score)
    .map(d => d.raw);
  sorted.filter(raw => !isBaseline(raw)).forEach(raw => {
    const name = displayName(raw);
    const el = document.createElement('span');
    el.className = 'item';
    el.setAttribute('data-name', name);
    const sw = document.createElement('span');
    sw.className = 'swatch';
    // Solid series colour; shaded series get a diagonal stripe overlay on top.
    sw.style.background = colorMap[raw] || '#999';
    if (isShaded(raw)) {
      sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
    }
    const txt = document.createElement('span');
    txt.textContent = name;
    el.appendChild(sw);
    el.appendChild(txt);
    items.appendChild(el);
    el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
    el.addEventListener('mouseleave', () => { highlight = null; updateHighlight(); });
  });
}
// Build the controls and legend skeleton before any data is fetched: the
// loader below looks up the selects and legend container by querySelector.
buildUI();
// ─── DATA LOADING ───
// Resolve with the response text of the first path that can be fetched with
// an OK status. Paths are tried in order; a network error, a non-OK status or
// a failure while reading the body moves on to the next path. Rejects with
// 'CSV not found' once every path has been tried.
const fetchFirstAvailable = async (paths) => {
  for (const candidate of paths) {
    try {
      const response = await fetch(candidate, { cache: 'no-cache' });
      if (!response.ok) continue;
      return await response.text();
    } catch (_) {
      // Best-effort: ignore this candidate and fall through to the next one.
    }
  }
  throw new Error('CSV not found');
};
// Walk up from the chart container to the nearest element (itself included)
// that carries a non-empty data-datafiles attribute; ends as null if none does.
let dataMountEl = container;
while (dataMountEl) {
  if (dataMountEl.getAttribute?.('data-datafiles')) break;
  dataMountEl = dataMountEl.parentElement;
}

// The attribute holds either a JSON array (value starts with '[') or a single
// path string. A malformed JSON array is ignored and leaves providedData null.
let providedData = null;
try {
  const rawAttr = (dataMountEl && dataMountEl.getAttribute)
    ? dataMountEl.getAttribute('data-datafiles')
    : null;
  const trimmed = rawAttr ? rawAttr.trim() : '';
  if (trimmed) {
    providedData = trimmed.startsWith('[') ? JSON.parse(rawAttr) : trimmed;
  }
} catch (_) {}

// Bare file names (non-empty strings without any '/') are looked up under
// /data/; every other value is passed through unchanged.
const ensurePrefix = (p) => {
  const isBareName = typeof p === 'string' && p !== '' && !p.includes('/');
  return isBareName ? `/data/${p}` : p;
};

// Candidate CSV locations, in the order they will be tried.
const csvPaths = !providedData
  ? ['/data/benchmark-results.csv']
  : (Array.isArray(providedData) ? providedData : [providedData]).map(ensurePrefix);
// Load the CSV, optionally append the averaged rows, then draw the chart and
// legend and keep the chart in sync with size changes. Any failure along the
// way is shown as a "Data load error: ..." message inside the container.
(async () => {
  try {
    const csvText = await fetchFirstAvailable(csvPaths);
    const rows = d3.csvParse(csvText);
    parsedData = rows;

    // The average is only considered in multi-setup mode with at least two
    // setups, and only when it has not been hidden.
    const wantsAverage = SETUPS && setupNames.length >= 2 && !HIDE_AVERAGE;
    if (wantsAverage) {
      const averaged = computeAverageData(rows);
      avgDatasets = averaged.datasets;
      const hasAvgData = Object.values(avgDatasets).some(entry => !entry.baseline);
      if (hasAvgData) {
        // concat() returns a plain array, so the columns list that
        // d3.csvParse attached to `rows` has to be copied over explicitly.
        parsedData = rows.concat(averaged.data);
        parsedData.columns = rows.columns;
        if (currentSetup === AVG_SETUP_KEY) DATASETS = { ...avgDatasets };
      } else {
        // Only baseline entries (or none) in the average: remove its option
        // and, if it was the active setup, fall back to the first real setup.
        const setupSel = container.querySelector('#setup-' + uid);
        if (setupSel) {
          const avgOption = setupSel.querySelector(`option[value="${AVG_SETUP_KEY}"]`);
          if (avgOption) avgOption.remove();
        }
        if (currentSetup === AVG_SETUP_KEY) {
          currentSetup = setupNames[0];
          DATASETS = normalizeDatasets(SETUPS[currentSetup].datasets);
          if (setupSel) setupSel.value = currentSetup;
        }
      }
    }

    filterData();
    metricKeys = detectMetrics(allData.columns);
    // Fall back to the first detected metric when the current one is absent.
    if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
    populateMetricSelect();
    render();
    buildLegend();

    // Re-render on size changes: observe the container where ResizeObserver
    // exists, otherwise listen for window resize events.
    const redraw = () => render();
    if (window.ResizeObserver) {
      new ResizeObserver(redraw).observe(container);
    } else {
      window.addEventListener('resize', redraw);
    }
  } catch (err) {
    const detail = (err && err.message) ? err.message : err;
    const errorBox = document.createElement('pre');
    errorBox.textContent = 'Data load error: ' + detail;
    errorBox.style.color = 'var(--danger, #b00020)';
    errorBox.style.fontSize = '12px';
    container.appendChild(errorBox);
  }
})();
};
// Start right away when the document has finished parsing; otherwise wait
// for DOMContentLoaded (listener fires once) before calling ensureD3.
if (document.readyState !== 'loading') {
  ensureD3(bootstrap);
} else {
  document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true });
}
})();
</script>