finephrase / app /src /content /embeds /d3-benchmark-comparison.html
joelniklaus's picture
joelniklaus HF Staff
make plots look nicer
387754d
raw
history blame
37.2 kB
<!--
Reusable bar/line chart for benchmark comparisons.
Configuration via data-config attribute:
{
"datasetNames": { "raw_name": "Display Name", ... }, // required (unless using setups)
"setups": { "Setup Label": { "datasetNames": {...} }, ... }, // optional, multi-setup mode with dropdown + average
"pinnedColors": { "DCLM": "#333", "FineWeb-Edu (HQ)": "#86a1a9" }, // optional
"baselines": ["dclm", "fw_edu_hq"], // optional, raw keys for baseline datasets (dashed lines, striped bars). Default: ["dclm", "fw_edu_hq"]
"defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
"defaultView": "bar", // optional, "bar" | "line", default: "bar"
"tokensPerStep": 2100000, // optional, default: 2.1e6
"runColumn": "runname", // optional, CSV column for series, default: "runname"
"stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
}
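Data: loads /data/benchmark-results.csv by default (one CSV with all runs); override it with the data-datafiles attribute (a single path, or a JSON array of paths tried in order).
Only rows whose run column matches a key in datasetNames are displayed; if datasetNames is empty, every run in the CSV is shown.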
Example usage in MDX:
<HtmlEmbed
src="d3-benchmark-comparison.html"
title="Baseline Comparison"
config={{
datasetNames: {
cosmopedia: "Cosmopedia",
dclm: "DCLM",
fw_edu_hq: "FineWeb-Edu (HQ)"
}
}}
/>
-->
<div class="d3-benchmark-comparison"></div>
<style>
/* Root element. Positioned so the absolutely-positioned tooltip is laid out relative to it. */
.d3-benchmark-comparison { position: relative; }
/* Row of dropdown groups, centred horizontally and aligned on their bottom edge. */
.d3-benchmark-comparison .controls {
display: flex;
gap: 16px;
align-items: flex-end;
justify-content: center;
margin: 10px 0 0 0;
}
/* One label + select pair, stacked vertically. */
.d3-benchmark-comparison .controls .control-group {
display: flex;
flex-direction: column;
align-items: flex-start;
gap: 6px;
}
.d3-benchmark-comparison .controls label {
font-size: 12px;
font-weight: 700;
color: var(--text-color);
}
/* Native select chrome is removed; the arrow is redrawn with the inline SVG background below. */
.d3-benchmark-comparison .controls select {
appearance: none;
-webkit-appearance: none;
-moz-appearance: none;
border: 1px solid var(--border-color);
border-radius: 8px;
/* Extra right padding leaves room for the arrow icon. */
padding: 6px 28px 6px 10px;
background-color: var(--surface-bg);
color: var(--text-color);
font-size: 13px;
line-height: 1.2;
background-image: url("data:image/svg+xml,%3Csvg width='12' height='8' viewBox='0 0 12 8' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1.41 1.59L6 6.17l4.59-4.58L12 3 6 9 0 3z' fill='%23999'/%3E%3C/svg%3E");
background-repeat: no-repeat;
background-position: right 8px center;
}
/* Keyboard focus ring for the dropdowns. */
.d3-benchmark-comparison .controls select:focus-visible {
outline: 2px solid var(--primary-color);
outline-offset: 2px;
}
/* Legend block: a title followed by a wrapping row of items. */
.d3-benchmark-comparison .legend {
display: flex;
flex-direction: column;
align-items: flex-start;
gap: 6px;
margin: 8px 0 0 0;
padding-bottom: 4px;
}
.d3-benchmark-comparison .legend .legend-title {
font-size: 12px;
font-weight: 700;
color: var(--text-color);
}
/* Items wrap onto multiple lines: 8px between rows, 14px between columns. */
.d3-benchmark-comparison .legend .items {
display: flex;
flex-wrap: wrap;
gap: 8px 14px;
}
/* One legend entry: colour swatch plus series name, kept on a single line. */
.d3-benchmark-comparison .legend .item {
display: inline-flex;
align-items: center;
gap: 6px;
white-space: nowrap;
font-size: 12px;
color: var(--text-color);
cursor: pointer;
}
/* "ghost" is toggled by the script on entries that are not the currently highlighted series. */
.d3-benchmark-comparison .legend .item.ghost { opacity: .25; }
.d3-benchmark-comparison .legend .swatch {
width: 14px;
height: 14px;
border-radius: 3px;
border: 1px solid var(--border-color);
}
/* Bar view: de-emphasise bars and value labels that are not highlighted. */
.d3-benchmark-comparison .bar.ghost { opacity: .25; }
.d3-benchmark-comparison .value-label.ghost { opacity: .25; }
/* Line view: series paths are stroke-only. */
.d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
/* Baseline series are dashed and fainter; when ghosted they fade further than regular series. */
.d3-benchmark-comparison .line-path.baseline { stroke-dasharray: 6,4; opacity: 0.5; }
.d3-benchmark-comparison .line-path.baseline.ghost { opacity: .1; }
.d3-benchmark-comparison .line-path.ghost { opacity: .15; }
/* Data-point dots follow the same baseline/ghost opacity scheme as the paths. */
.d3-benchmark-comparison .line-dot.baseline { opacity: 0.5; }
.d3-benchmark-comparison .line-dot.baseline.ghost { opacity: .1; }
.d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
/* Axes: hide every <path> inside an axis group (presumably the d3 axis domain line); keep tick lines and labels. */
.d3-benchmark-comparison .axes path { display: none; }
.d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
.d3-benchmark-comparison .axes text { fill: var(--tick-color); }
.d3-benchmark-comparison .grid line { stroke: var(--grid-color); }
/* Vertical guide shown at the nearest step while hovering the line chart; it never intercepts pointer events. */
.d3-benchmark-comparison .hover-line {
stroke: var(--text-color);
stroke-opacity: 0.25;
stroke-width: 1;
pointer-events: none;
}
/* Tooltip: anchored at the container's top-left corner and moved with a transform set by the script.
   The default transform parks it far off-screen, and opacity 0 keeps it invisible until shown. */
.d3-benchmark-comparison .d3-tooltip {
position: absolute;
top: 0px;
left: 0px;
transform: translate(-9999px, -9999px);
/* The tooltip must not steal hover events from the chart underneath it. */
pointer-events: none;
padding: 8px 10px;
border-radius: 8px;
font-size: 12px;
line-height: 1.35;
border: 1px solid var(--border-color);
background: var(--surface-bg);
color: var(--text-color);
box-shadow: 0 4px 24px rgba(0,0,0,.18);
opacity: 0;
transition: opacity .12s ease;
text-align: left;
z-index: 10;
}
/* Small colour chip shown before each series name in the line-chart tooltip. */
.d3-benchmark-comparison .d3-tooltip .tip-dot {
display: inline-block;
width: 10px;
height: 10px;
border-radius: 3px;
border: 1px solid var(--border-color);
margin-right: 6px;
vertical-align: middle;
}
</style>
<script>
(() => {
// Runs `cb` once a usable D3 global exists, injecting the shared CDN <script> tag
// (id "d3-cdn-script") if no embed on the page has added it yet.
const ensureD3 = (cb) => {
  const d3Usable = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (d3Usable()) return cb();
  let tag = document.getElementById('d3-cdn-script');
  if (tag === null) {
    tag = document.createElement('script');
    tag.id = 'd3-cdn-script';
    tag.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(tag);
  }
  // The callback only fires if D3 really is usable at that point.
  const runWhenUsable = () => {
    if (d3Usable()) cb();
  };
  tag.addEventListener('load', runWhenUsable, { once: true });
  if (window.d3) runWhenUsable();
};
const bootstrap = () => {
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-benchmark-comparison'))) {
const cs = Array.from(document.querySelectorAll('.d3-benchmark-comparison')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
container = cs[cs.length - 1] || null;
}
if (!container) return;
if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; }
container.style.position = container.style.position || 'relative';
// ─── READ CONFIG ───
// Climb from the container to the nearest ancestor carrying a non-empty data-config attribute.
let mountEl = container;
while (mountEl && !mountEl.getAttribute?.('data-config')) { mountEl = mountEl.parentElement; }
let cfg = {};
try {
  const raw = mountEl && mountEl.getAttribute ? mountEl.getAttribute('data-config') : null;
  // Only values that look like a JSON object are parsed; anything else keeps the defaults.
  if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
} catch (err) {
  // Still best-effort (the chart falls back to defaults), but report the problem
  // so a malformed config is not silently ignored.
  console.warn('d3-benchmark-comparison: ignoring invalid data-config JSON', err);
}
// Configurable settings with defaults
// ─── SETUP SUPPORT ───
// Multi-setup mode is enabled only when `setups` has at least one entry; an empty
// object is treated like "no setups" instead of crashing on setupNames[0].
const SETUPS = (cfg.setups && typeof cfg.setups === 'object' && Object.keys(cfg.setups).length > 0) ? cfg.setups : null;
const setupNames = SETUPS ? Object.keys(SETUPS) : [];
let currentSetup = SETUPS ? setupNames[0] : null;
// Active raw-name -> display-name map. In setup mode it starts as a copy of the first
// setup's datasetNames (a setup without datasetNames yields an empty map).
let DATASET_NAMES = SETUPS ? { ...(SETUPS[setupNames[0]] || {}).datasetNames } : (cfg.datasetNames || {});
// Value of the extra "average" entry in the setup dropdown.
const AVG_SETUP_KEY = 'Average (all setups)';
let avgDatasetNames = {};
// All parsed CSV rows (plus synthetic average rows in setup mode), before filtering.
let parsedData = [];
const RUN_COL = cfg.runColumn || 'runname';
const STEP_COL = cfg.stepColumn || 'steps';
const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
const defaultView = cfg.defaultView || 'bar';
// Stable baseline colors (keyed by display name), merged with per-chart overrides
const PINNED_COLORS = Object.assign({ 'DCLM': '#333', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
// Unique ID suffix for multiple instances on same page
const uid = Math.random().toString(36).slice(2, 8);
// Baseline datasets (raw keys): dashed lines, striped bars, reduced opacity
const BASELINES = new Set(cfg.baselines || ['dclm', 'fw_edu_hq']);
// True when the raw run key is one of the configured baseline datasets.
function isBaseline(raw) {
  return BASELINES.has(raw);
}
// Per-instance id of the SVG stripe pattern for a run; non-alphanumerics become "_".
function stripePatternId(raw) {
  const safeKey = raw.replace(/[^a-zA-Z0-9]/g, '_');
  return ['stripe', uid, safeKey].join('-');
}
// Fill for a bar datum: the stripe pattern for baselines, otherwise the assigned
// series colour (falling back to the primary colour variable).
function barFill(d) {
  const key = d.rawName;
  if (!isBaseline(key)) return colorMap[key] || 'var(--primary-color)';
  return `url(#${stripePatternId(key)})`;
}
// Standard metric display names (shared across all CSVs from this benchmark suite)
// Maps a CSV column name to the label shown in the metric dropdown, tooltips and the
// line chart's y-axis title. Columns missing from this map are shown by their raw name.
const METRIC_NAMES = {
// Aggregate scores
'agg_score_macro': 'Aggregate Score (Macro)',
'agg_score_micro': 'Aggregate Score (Micro)',
'agg_score_RC': 'Reading Comprehension',
'agg_score_GK': 'General Knowledge',
'agg_score_NLU': 'Natural Language Understanding',
'agg_score_MATH': 'Math',
'agg_score_TABLE': 'Table Understanding',
'agg_score_RES': 'Reasoning',
// Individual benchmarks
'lighteval|arc_cf:easy|3/prob_norm_token': 'ARC-Easy',
'lighteval|drop|3/prob_norm_token': 'DROP',
'lighteval|gsm8k|3/prob_norm_token': 'GSM8K',
'lighteval|hellaswag_cf|3/prob_norm_token': 'HellaSwag',
'lighteval|openbookqa_cf|3/prob_norm_token': 'OpenBookQA',
'lighteval|piqa_cf|3/prob_norm_token': 'PIQA',
'lighteval|squad_v2|3/prob_norm_token': 'SQuAD v2',
// NOTE(review): the task key is "treb_qa" but the label says TriviaQA — confirm this label is intended.
'lighteval|treb_qa|3/prob_norm_token': 'TriviaQA',
'lighteval|wikitablequestions|3/prob_norm_token': 'WikiTableQuestions',
'lighteval|winogrande_cf|3/prob_norm_token': 'Winogrande',
'lighteval|xcsqa_cf|3/prob_norm_token': 'XCSQA',
'lighteval|mmlu_redux_cf:_average|3/prob_norm_token': 'MMLU Redux'
};
// Tooltip: reuse an existing .d3-tooltip inside the container, or create one.
// tipInner is the element whose HTML gets replaced when a tooltip is shown.
let tip = container.querySelector('.d3-tooltip');
let tipInner;
if (tip) {
  tipInner = tip.querySelector('.d3-tooltip__inner') || tip;
} else {
  tip = document.createElement('div');
  tip.className = 'd3-tooltip';
  tipInner = document.createElement('div');
  tipInner.className = 'd3-tooltip__inner';
  tip.appendChild(tipInner);
  container.appendChild(tip);
}
// SVG scaffold: a root group for all marks, followed by <defs> for stripe patterns.
const svg = d3.select(container).append('svg');
svg.attr('width', '100%');
svg.style('display', 'block');
const gRoot = svg.append('g');
const defs = svg.append('defs');
// Mutable chart state
let allData = [];                  // rows currently displayed (after filtering)
let metricKeys = [];               // auto-detected from CSV columns
let currentMetric = defaultMetric; // column plotted right now
let currentView = defaultView;     // 'bar' or anything else (treated as line)
let colorMap = {};                 // raw run key -> colour
let highlight = null;              // display name of the hovered series, or null
// ─── HELPERS ───
// Display label for a raw run key; unknown keys are shown as-is.
function displayName(raw) {
  const label = DATASET_NAMES[raw];
  return label ? label : raw;
}
// Display label for a metric column; unknown columns are shown as-is.
function metricName(key) {
  const label = METRIC_NAMES[key];
  return label ? label : key;
}
// Converts a training step count into a token count.
function stepsToTokens(step) {
  return TOKENS_PER_STEP * step;
}
// Compact token count: billions with 2 decimals, millions with 1 decimal,
// anything smaller as a comma-grouped number.
function formatTokens(tokens) {
  const [spec, divisor, suffix] =
    tokens >= 1e9 ? ['.2f', 1e9, 'B'] :
    tokens >= 1e6 ? ['.1f', 1e6, 'M'] :
    [',', 1, ''];
  return d3.format(spec)(tokens / divisor) + suffix;
}
// Compact step label: values from 1000 upward are shown in thousands with a "K" suffix.
// Up to one decimal is kept (trailing zeros trimmed), so 1500 reads "1.5K" rather than
// being rounded to the same "2K" label as 2000.
function formatStep(step) {
  if (step >= 1000) return d3.format('.1~f')(step / 1000) + 'K';
  return String(step);
}
// Axis tick label, e.g. "<tokens> (<steps>)".
function stepLabelShort(step) {
  const tokenText = formatTokens(stepsToTokens(step));
  return tokenText + ' (' + formatStep(step) + ')';
}
// Tooltip heading, e.g. "<tokens> Tokens (<steps> Steps)".
function stepLabelLong(step) {
  const tokenText = formatTokens(stepsToTokens(step));
  return tokenText + ' Tokens (' + formatStep(step) + ' Steps)';
}
// Returns a categorical palette for n series. Prefers the page-level
// window.ColorPalettes helper when present; any error there falls through to
// the first n colours of D3's Tableau10 scheme (or a hard-coded copy of it).
function getCategoricalColors(n) {
  try {
    const palettes = window.ColorPalettes;
    if (palettes && typeof palettes.getColors === 'function') {
      return palettes.getColors('categorical', n);
    }
  } catch (_) {
    // Best-effort: ignore and use the built-in fallback below.
  }
  const fallback = d3.schemeTableau10 || ['#4e79a7','#f28e2b','#e15759','#76b7b2','#59a14f','#edc948','#b07aa1','#ff9da7','#9c755f','#bab0ac'];
  return fallback.slice(0, n);
}
// Assigns a colour to every run in allData. It is a no-op once colorMap has
// entries, so callers reset colorMap to {} to force a re-assignment.
function initColors() {
  if (Object.keys(colorMap).length > 0) return;
  // Sorted raw keys keep the palette assignment deterministic.
  const runKeys = Array.from(d3.group(allData, row => row[RUN_COL]).keys()).sort();
  // Pinned colours (looked up by display name) are applied first; the rest are collected.
  const needsPalette = runKeys.filter(raw => {
    const pinned = PINNED_COLORS[displayName(raw)];
    if (pinned) colorMap[raw] = pinned;
    return !pinned;
  });
  // Remaining runs cycle through the categorical palette.
  const palette = getCategoricalColors(needsPalette.length);
  needsPalette.forEach((raw, idx) => {
    colorMap[raw] = palette[idx % palette.length];
  });
}
// ─── SETUP HELPERS ───
// Rebuilds allData from parsedData: keeps only rows whose run key is in
// DATASET_NAMES, or every row when no names are configured.
function filterData() {
  const wanted = new Set(Object.keys(DATASET_NAMES));
  allData = wanted.size === 0
    ? parsedData
    : parsedData.filter(row => wanted.has(row[RUN_COL]));
  // Carry the CSV header list along, as d3.csvParse exposes it on the array.
  allData.columns = parsedData.columns;
}
// Builds synthetic "average across setups" rows.
// For each display name that is listed at least once per setup, and for each step,
// every non run/step column is averaged over the matching runs' rows at that step.
// Returns { data: averaged rows, datasetNames: synthetic raw key -> display name }.
// With fewer than two setups there is nothing to average and both parts are empty.
function computeAverageData(rawData) {
  if (!SETUPS || setupNames.length < 2) return { data: [], datasetNames: {} };
  // Build mapping: displayName -> [rawName1, rawName2, ...]
  const displayToRaws = {};
  for (const sName of setupNames) {
    // Tolerate a setup entry that has no datasetNames map.
    const dn = (SETUPS[sName] && SETUPS[sName].datasetNames) || {};
    for (const [raw, display] of Object.entries(dn)) {
      if (!displayToRaws[display]) displayToRaws[display] = [];
      displayToRaws[display].push(raw);
    }
  }
  // Only average display names that have at least one raw key per setup
  const fullDisplay = Object.entries(displayToRaws)
    .filter(([, raws]) => raws.length >= setupNames.length);
  // Index raw data by runname+step for fast lookup. The step is normalised to a
  // number so CSV spellings such as "1000.0" still match the numeric step below.
  const byRunStep = {};
  for (const row of rawData) byRunStep[row[RUN_COL] + '|' + (+row[STEP_COL])] = row;
  const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
  const cols = rawData.columns || Object.keys(rawData[0] || {});
  // Empty cells are missing values; they must not be coerced to 0 and pulled into the mean.
  const isPresent = v => v !== undefined && v !== null && String(v).trim() !== '';
  const result = [];
  const dnMap = {};
  for (const [display, raws] of fullDisplay) {
    const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
    dnMap[avgRaw] = display;
    for (const step of steps) {
      const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
      if (!rows.length) continue;
      const avgRow = { [RUN_COL]: avgRaw, [STEP_COL]: String(step) };
      for (const col of cols) {
        if (col === RUN_COL || col === STEP_COL) continue;
        const vals = rows.map(r => r[col]).filter(isPresent).map(Number).filter(v => !isNaN(v));
        // Columns with no usable values keep the original 0 placeholder.
        avgRow[col] = vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : 0;
      }
      result.push(avgRow);
    }
  }
  return { data: result, datasetNames: dnMap };
}
// Activates a setup (or the synthetic average entry) chosen in the dropdown,
// then re-filters the data, re-assigns colours and redraws chart and legend.
function switchSetup(name) {
  currentSetup = name;
  const source = (name === AVG_SETUP_KEY) ? avgDatasetNames : SETUPS[name].datasetNames;
  DATASET_NAMES = { ...source };
  // Re-add baselines that may be shared across setups
  const baselineNames = cfg.baselines || ['dclm', 'fw_edu_hq'];
  for (const bRaw of baselineNames) {
    if (DATASET_NAMES[bRaw]) continue;
    if (!parsedData.some(row => row[RUN_COL] === bRaw)) continue;
    // Use the display name from the first setup that names this baseline, else the raw key.
    const owner = setupNames.find(sName => SETUPS[sName].datasetNames[bRaw]);
    DATASET_NAMES[bRaw] = owner !== undefined ? SETUPS[owner].datasetNames[bRaw] : bRaw;
  }
  // Clearing colorMap makes initColors() assign colours for the new series set.
  colorMap = {};
  filterData();
  initColors();
  render();
  buildLegend();
}
// Shows the tooltip with the given HTML near (x, y), in container coordinates.
// It sits to the right of the pointer unless that would overflow the container,
// in which case it flips to the left; it is never placed above the top edge.
function showTip(html, x, y) {
  tipInner.innerHTML = html;
  const tipWidth = tip.offsetWidth || 180;
  const boxWidth = container.clientWidth || 800;
  const overflowsRight = x + tipWidth + 20 > boxWidth;
  const left = overflowsRight ? x - tipWidth - 12 : x + 12;
  const top = Math.max(0, y - 20);
  tip.style.transform = 'translate(' + left + 'px, ' + top + 'px)';
  tip.style.opacity = '1';
}
// Hides the tooltip and parks it off-screen again.
function hideTip() {
  tip.style.opacity = '0';
  tip.style.transform = 'translate(-9999px, -9999px)';
}
// Applies the current `highlight` (a display name or null) to the chart and legend:
// everything that does not belong to the highlighted series gets the "ghost" class,
// and with no highlight the class is cleared everywhere.
function updateHighlight() {
  const dimmed = d => highlight && d.name !== highlight;
  ['rect.bar', 'text.value-label', '.line-path', '.line-dot'].forEach(selector => {
    gRoot.selectAll(selector).classed('ghost', dimmed);
  });
  const legendItems = container.querySelectorAll('.legend .item');
  legendItems.forEach(el => {
    el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
  });
}
// ─── AUTO-DETECT METRICS from CSV columns ───
// Returns the plottable metric columns: known aggregate scores first (in a fixed
// order), then every other column whose value in the first data row is numeric.
// The run, step and "seed" columns are never treated as metrics.
function detectMetrics(columns) {
  const cols = columns || [];
  const skip = new Set([RUN_COL, STEP_COL, 'seed']);
  // Ordered: aggregate first, then individual
  const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
  const agg = aggOrder.filter(k => cols.includes(k));
  // With no rows there is nothing to sample, so skip the numeric probe
  // instead of dereferencing allData[0].
  const sample = allData[0];
  const ind = sample
    ? cols.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+sample[k]))
    : [];
  return [...agg, ...ind];
}
// ─── BAR CHART ───
// Draws the "Final Score" view: one horizontal bar per run, showing the current
// metric at that run's highest step, sorted from best to worst. Baseline runs are
// filled with a stripe pattern. Hovering a bar highlights it and shows a tooltip.
function renderBar() {
  const width = container.clientWidth || 800;
  const margin = { top: 12, right: 56, bottom: 32, left: 190 };
  // Tooltip content is assigned through innerHTML, so names must be HTML-escaped
  // (otherwise a label such as "A & B <v2>" is mangled or parsed as markup).
  const esc = s => String(s).replace(/[&<>"']/g, c => ({ '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[c]));
  // One datum per run, taken from the row with the largest step.
  const grouped = d3.group(allData, d => d[RUN_COL]);
  const finalData = [];
  for (const [raw, rows] of grouped) {
    const maxStep = d3.max(rows, r => +r[STEP_COL]);
    const row = rows.find(r => +r[STEP_COL] === maxStep);
    if (row) finalData.push({ name: displayName(raw), rawName: raw, value: +row[currentMetric] });
  }
  finalData.sort((a, b) => b.value - a.value);
  // The chart height grows with the number of bars.
  const barHeight = 28, barGap = 8;
  const height = margin.top + margin.bottom + finalData.length * (barHeight + barGap);
  svg.attr('width', width).attr('height', height);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;
  // 5% headroom past the best score. A missing or zero maximum would give a
  // degenerate [0, 0] / [0, NaN] domain, so fall back to [0, 1] in that case.
  const rawMax = d3.max(finalData, d => d.value);
  const x = d3.scaleLinear().domain([0, rawMax ? rawMax * 1.05 : 1]).range([0, innerWidth]);
  const y = d3.scaleBand().domain(finalData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
  // Grid
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
    g.selectAll('line').data(x.ticks(5)).join('line')
      .attr('x1', d => x(d)).attr('x2', d => x(d)).attr('y1', 0).attr('y2', innerHeight);
  });
  // X axis
  gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(x).ticks(5).tickFormat(d3.format('.3f')).tickSizeOuter(0))
    .call(g => {
      g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '11px');
      g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    });
  // Y axis
  gRoot.selectAll('.axis-y').data([0]).join('g').attr('class', 'axes axis-y')
    .call(d3.axisLeft(y).tickSizeOuter(0))
    .call(g => {
      g.selectAll('text').attr('fill', 'var(--text-color)').style('font-size', '12px').style('font-weight', '500');
      g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    });
  // Stripe patterns for baseline bars (referenced by barFill via stripePatternId)
  finalData.forEach(d => {
    if (!isBaseline(d.rawName)) return;
    const c = colorMap[d.rawName] || '#999';
    const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
      .attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
    pat.append('rect').attr('width', 6).attr('height', 6).attr('fill', c).attr('opacity', 0.35);
    pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
  });
  // Tooltip follows the pointer, positioned in container coordinates.
  const barTip = (ev, d) => {
    const [mx, my] = d3.pointer(ev, container);
    showTip(`<strong>${esc(d.name)}</strong><br/>${esc(metricName(currentMetric))}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
  };
  // Bars (entering bars grow from zero width)
  gRoot.selectAll('rect.bar').data(finalData, d => d.name).join(
    enter => enter.append('rect').attr('class', 'bar')
      .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
      .attr('fill', d => barFill(d))
      .attr('width', 0)
      .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
      .on('mousemove', barTip)
      .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
      .transition().duration(300).attr('width', d => Math.max(0, x(d.value))),
    update => update
      .on('mouseenter', (ev, d) => { highlight = d.name; updateHighlight(); })
      .on('mousemove', barTip)
      .on('mouseleave', () => { hideTip(); highlight = null; updateHighlight(); })
      .transition().duration(300)
      .attr('y', d => y(d.name)).attr('height', y.bandwidth())
      .attr('width', d => Math.max(0, x(d.value)))
      .attr('fill', d => barFill(d)),
    exit => exit.transition().duration(200).attr('width', 0).remove()
  );
  // Value labels, placed just past the end of each bar
  gRoot.selectAll('text.value-label').data(finalData, d => d.name).join(
    enter => enter.append('text').attr('class', 'value-label')
      .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
      .attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
      .text(d => d.value.toFixed(4)),
    update => update.transition().duration(300)
      .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
      .text(d => d.value.toFixed(4)),
    exit => exit.remove()
  );
}
// ─── LINE CHART ───
// Draws the "Training Progression" view: one line (plus dots) per run, plotting the
// current metric against training step. Baseline runs get the "baseline" class.
// Hovering snaps a vertical guide to the nearest step and lists every series' value
// at that step in the tooltip, best first.
function renderLine() {
  const width = container.clientWidth || 800;
  const margin = { top: 16, right: 50, bottom: 48, left: 60 };
  // Height follows the width (2.5:1) but never drops below 300px.
  const height = Math.max(300, Math.round(width / 2.5));
  svg.attr('width', width).attr('height', height);
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;
  // Tooltip content is assigned through innerHTML, so interpolated text must be
  // HTML-escaped (otherwise a label such as "A & B <v2>" is mangled or parsed as markup).
  const esc = s => String(s).replace(/[&<>"']/g, c => ({ '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[c]));
  // Build series: one entry per run with its points sorted by step
  const grouped = d3.group(allData, d => d[RUN_COL]);
  const series = [];
  for (const [raw, rows] of grouped) {
    const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
    series.push({ name: displayName(raw), rawName: raw, values: pts });
  }
  const allSteps = Array.from(new Set(allData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
  const allValues = series.flatMap(s => s.values.map(v => v.value));
  const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
  // Pad the value range by 8% on both sides so lines do not touch the frame.
  const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
  const y = d3.scaleLinear().domain([yMin - yPad, yMax + yPad]).range([innerHeight, 0]).nice();
  // Grid
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
    g.selectAll('line').data(y.ticks(6)).join('line')
      .attr('x1', 0).attr('x2', innerWidth).attr('y1', d => y(d)).attr('y2', d => y(d));
  });
  // X axis
  gRoot.selectAll('.axis-x').data([0]).join('g').attr('class', 'axes axis-x')
    .attr('transform', `translate(0,${innerHeight})`)
    .call(d3.axisBottom(x).ticks(6).tickFormat(d => stepLabelShort(d)).tickSizeOuter(0))
    .call(g => {
      g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '10px');
      g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    });
  // Y axis
  gRoot.selectAll('.axis-y').data([0]).join('g').attr('class', 'axes axis-y')
    .call(d3.axisLeft(y).ticks(6).tickFormat(d3.format('.3f')).tickSizeOuter(0))
    .call(g => {
      g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '11px');
      g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    });
  // Axis labels
  gRoot.selectAll('.x-label').data([0]).join('text').attr('class', 'x-label')
    .attr('x', innerWidth / 2).attr('y', innerHeight + 38)
    .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
    .text('Tokens (Steps)');
  gRoot.selectAll('.y-label').data([0]).join('text').attr('class', 'y-label')
    .attr('transform', 'rotate(-90)').attr('x', -innerHeight / 2).attr('y', -44)
    .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
    .text(metricName(currentMetric));
  // Lines
  const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
  gRoot.selectAll('.line-path').data(series, d => d.name).join(
    enter => enter.append('path').attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
      .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
      .attr('d', d => line(d.values)),
    update => update.attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
      .transition().duration(300)
      .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
      .attr('d', d => line(d.values)),
    exit => exit.remove()
  );
  // Dots: one per (series, step) point
  const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
  gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
    enter => enter.append('circle').attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
      .attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
      .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
      .attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
    update => update.attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
      .transition().duration(300)
      .attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
      .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
    exit => exit.remove()
  );
  // Hover overlay: a transparent rect over the plot area captures pointer events
  gRoot.selectAll('.hover-line').data([0]).join('line').attr('class', 'hover-line')
    .attr('y1', 0).attr('y2', innerHeight).style('display', 'none');
  gRoot.selectAll('.hover-overlay').data([0]).join('rect').attr('class', 'hover-overlay')
    .attr('width', innerWidth).attr('height', innerHeight)
    .attr('fill', 'none').attr('pointer-events', 'all')
    .on('mousemove', (ev) => {
      const [mx] = d3.pointer(ev, gRoot.node());
      // Snap to the recorded step closest to the pointer (inverted once, not per comparison).
      const target = x.invert(mx);
      const nearest = allSteps.reduce((best, s) => Math.abs(s - target) < Math.abs(best - target) ? s : best, allSteps[0]);
      gRoot.select('.hover-line').attr('x1', x(nearest)).attr('x2', x(nearest)).style('display', null);
      // Series without a point at this step are left out of the tooltip.
      const entries = series.map(s => {
        const pt = s.values.find(v => v.step === nearest);
        return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
      }).filter(Boolean).sort((a, b) => b.value - a.value);
      let html = `<div style="font-weight:700;margin-bottom:4px;">${esc(stepLabelLong(nearest))}</div>`;
      entries.forEach(e => {
        // Same colour fallback as the line and dot marks, so the chip is never "undefined".
        const chip = colorMap[e.rawName] || 'var(--primary-color)';
        html += `<div><span class="tip-dot" style="background:${esc(chip)}"></span>${esc(e.name)}: <strong>${e.value.toFixed(4)}</strong></div>`;
      });
      const [cx, cy] = d3.pointer(ev, container);
      showTip(html, cx, cy);
    })
    .on('mouseleave', () => {
      gRoot.select('.hover-line').style('display', 'none');
      hideTip();
    });
}
// ─── RENDER ───
// Redraws the chart from scratch for the current view, metric and data.
function render() {
  // Clear first, even when there is nothing to draw: otherwise switching to a
  // setup with no matching rows would leave the previous chart on screen.
  gRoot.selectAll('*').remove();
  defs.selectAll('*').remove();
  if (!allData.length) {
    hideTip();
    svg.attr('height', 0);
    return;
  }
  initColors();
  if (currentView === 'bar') renderBar(); else renderLine();
}
// ─── UI ───
// Creates the control row (optional Setup dropdown, View dropdown, empty Metric
// dropdown) and the legend shell, and appends both to the container.
function buildUI() {
  // Builds a <div class="control-group"> holding a <label> and its <select>.
  const makeSelectGroup = (prefix, labelText) => {
    const id = prefix + '-' + uid;
    const group = document.createElement('div');
    group.className = 'control-group';
    const label = document.createElement('label');
    label.setAttribute('for', id);
    label.textContent = labelText;
    const select = document.createElement('select');
    select.id = id;
    group.appendChild(label);
    group.appendChild(select);
    return { group, select };
  };
  const addOption = (select, value, text, isSelected) => {
    const opt = document.createElement('option');
    opt.value = value;
    opt.textContent = text;
    if (isSelected) opt.selected = true;
    select.appendChild(opt);
  };

  const controls = document.createElement('div');
  controls.className = 'controls';

  // Setup selector (only shown when setups config is present)
  if (SETUPS && setupNames.length > 0) {
    const { group: setupGroup, select: setupSelect } = makeSelectGroup('setup', 'Setup');
    for (const name of setupNames) addOption(setupSelect, name, name, name === currentSetup);
    // The average entry only makes sense with two or more setups.
    if (setupNames.length >= 2) addOption(setupSelect, AVG_SETUP_KEY, AVG_SETUP_KEY, false);
    setupSelect.addEventListener('change', () => { switchSetup(setupSelect.value); });
    controls.appendChild(setupGroup);
  }

  // View toggle
  const { group: viewGroup, select: viewSelect } = makeSelectGroup('view', 'View');
  const viewChoices = [['bar', 'Final Score'], ['line', 'Training Progression']];
  for (const [val, text] of viewChoices) addOption(viewSelect, val, text, val === currentView);
  viewSelect.addEventListener('change', () => { currentView = viewSelect.value; render(); });
  controls.appendChild(viewGroup);

  // Metric select (populated after data load)
  const { group: metricGroup } = makeSelectGroup('metric', 'Metric');
  controls.appendChild(metricGroup);
  container.appendChild(controls);

  // Legend shell; the items are filled in by buildLegend()
  const legend = document.createElement('div');
  legend.className = 'legend';
  legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
  container.appendChild(legend);
}
// Fills the Metric dropdown from metricKeys, split into "Aggregate Scores" and
// "Individual Benchmarks" groups (empty groups are omitted), pre-selects the
// current metric and redraws the chart whenever the selection changes.
function populateMetricSelect() {
  const sel = container.querySelector('#metric-' + uid);
  if (!sel) return;
  sel.innerHTML = '';
  const makeGroup = (title) => {
    const group = document.createElement('optgroup');
    group.label = title;
    return group;
  };
  const aggGroup = makeGroup('Aggregate Scores');
  const indGroup = makeGroup('Individual Benchmarks');
  for (const key of metricKeys) {
    const opt = document.createElement('option');
    opt.value = key;
    opt.textContent = metricName(key);
    if (key === currentMetric) opt.selected = true;
    const target = key.startsWith('agg_score') ? aggGroup : indGroup;
    target.appendChild(opt);
  }
  [aggGroup, indGroup].forEach(group => {
    if (group.children.length) sel.appendChild(group);
  });
  sel.addEventListener('change', () => { currentMetric = sel.value; render(); });
}
// Rebuilds the legend entries for the runs in allData, ordered by final score
// (value at each run's highest step), best first. Hovering an entry highlights
// that series in the chart.
function buildLegend() {
  const items = container.querySelector('.legend .items');
  if (!items) return;
  items.innerHTML = '';
  // Rank by the configured default metric so the order stays stable while the user
  // switches metrics. If that column was not detected in the CSV, fall back to the
  // first detected metric instead of ranking on NaN.
  const sortMetric = metricKeys.includes(defaultMetric) ? defaultMetric : metricKeys[0];
  const grouped = d3.group(allData, d => d[RUN_COL]);
  const sorted = Array.from(grouped.entries())
    .map(([raw, rows]) => {
      const maxStep = d3.max(rows, r => +r[STEP_COL]);
      const row = rows.find(r => +r[STEP_COL] === maxStep);
      const score = row ? +row[sortMetric] : 0;
      // Non-numeric scores sort last, deterministically.
      return { raw, score: Number.isFinite(score) ? score : -Infinity };
    })
    // d3.descending stays well-defined when both scores are -Infinity.
    .sort((a, b) => d3.descending(a.score, b.score))
    .map(d => d.raw);
  sorted.forEach(raw => {
    const name = displayName(raw);
    const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
    const sw = document.createElement('span'); sw.className = 'swatch';
    const swColor = colorMap[raw] || '#999';
    sw.style.background = swColor;
    // Baselines get a striped overlay on the swatch, echoing the striped bars.
    if (isBaseline(raw)) sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
    const txt = document.createElement('span'); txt.textContent = name;
    el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
    el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
    el.addEventListener('mouseleave', () => { highlight = null; updateHighlight(); });
  });
}
buildUI();
// ─── DATA LOADING ───
// Tries each path in order and resolves with the body text of the first response
// whose status is OK. Network errors and non-OK responses move on to the next
// path; if none succeeds the promise rejects with "CSV not found".
const fetchFirstAvailable = async (paths) => {
  for (const url of paths) {
    let body = null;
    try {
      const res = await fetch(url, { cache: 'no-cache' });
      if (res.ok) body = await res.text();
    } catch (_) {
      body = null;
    }
    if (body !== null) return body;
  }
  throw new Error('CSV not found');
};
// Resolve the CSV path(s): climb to the nearest ancestor with a non-empty
// data-datafiles attribute, which holds either one path or a JSON array of paths.
let dataMountEl = container;
while (dataMountEl) {
  if (dataMountEl.getAttribute?.('data-datafiles')) break;
  dataMountEl = dataMountEl.parentElement;
}
let providedData = null;
try {
  const attr = (dataMountEl && dataMountEl.getAttribute) ? dataMountEl.getAttribute('data-datafiles') : null;
  const trimmed = attr ? attr.trim() : '';
  if (trimmed) providedData = trimmed.startsWith('[') ? JSON.parse(attr) : trimmed;
} catch (_) {}
// Bare file names (no slash) are looked up under /data/; anything else is used as given.
const ensurePrefix = (p) => {
  const isBareName = typeof p === 'string' && p && !p.includes('/');
  return isBareName ? '/data/' + p : p;
};
// Without a data-datafiles attribute, the default benchmark CSV is used.
const csvPaths = !providedData
  ? ['/data/benchmark-results.csv']
  : [].concat(providedData).map(p => ensurePrefix(p));
// Load the CSV, prepare the data, draw the first chart and keep it responsive.
// Any failure is shown inside the container as a small error message.
(async () => {
  try {
    const text = await fetchFirstAvailable(csvPaths);
    const parsed = d3.csvParse(text);
    parsedData = parsed;
    // Compute average data for setup mode
    if (SETUPS && setupNames.length >= 2) {
      const avg = computeAverageData(parsed);
      avgDatasetNames = avg.datasetNames;
      parsedData = parsed.concat(avg.data);
      // concat() returns a plain array, so re-attach the CSV header list.
      parsedData.columns = parsed.columns;
    }
    // Filter to only datasets with configured display names
    filterData();
    // Fail with a readable message instead of an opaque TypeError further down.
    if (!allData.length) throw new Error('no rows in the CSV match the configured dataset names');
    metricKeys = detectMetrics(allData.columns);
    if (!metricKeys.length) throw new Error('no numeric metric columns found in the CSV');
    // Ensure defaultMetric is valid; fall back to first available
    if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
    populateMetricSelect();
    render();
    buildLegend();
    // Only the width affects the layout. Re-rendering on height-only changes (for
    // example when the legend wraps, or after the chart sets its own height) would
    // just replay the bar animation, so those notifications are ignored.
    let lastWidth = container.clientWidth;
    const rerenderOnWidthChange = () => {
      const w = container.clientWidth;
      if (w === lastWidth) return;
      lastWidth = w;
      render();
    };
    if (window.ResizeObserver) { new ResizeObserver(rerenderOnWidthChange).observe(container); }
    else { window.addEventListener('resize', rerenderOnWidthChange); }
  } catch (e) {
    const pre = document.createElement('pre');
    pre.textContent = 'Data load error: ' + (e && e.message ? e.message : e);
    pre.style.color = 'var(--danger, #b00020)';
    pre.style.fontSize = '12px';
    container.appendChild(pre);
  }
})();
};
// Start-up: if the document is still being parsed, wait for DOMContentLoaded (once);
// otherwise start right away. Either way, bootstrap is passed to ensureD3 as its callback.
if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true }); }
else { ensureD3(bootstrap); }
})();
</script>