finephrase / app /src /content /embeds /d3-pipeline.html
joelniklaus's picture
joelniklaus HF Staff
add arrow back
0b70925
<div class="d3-pipeline"></div>
<style>
/* Root container: positioning context for the absolutely positioned tooltip. */
.d3-pipeline {
  position: relative;
  width: 100%;
  margin: 0;
  container-type: inline-size;
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
}

/* Node cards: slight brightening on hover. */
.d3-pipeline .node-group {
  cursor: default;
}
.d3-pipeline .node-card {
  transition: filter .15s ease;
}
.d3-pipeline .node-group:hover .node-card {
  filter: brightness(1.05);
}

/* SVG text styling for node and group labels. */
.d3-pipeline .node-title {
  font-weight: 700;
  fill: var(--text-color);
}
.d3-pipeline .node-subtitle {
  fill: var(--muted-color);
}
.d3-pipeline .group-label {
  font-weight: 700;
  fill: var(--muted-color);
  letter-spacing: 0.02em;
}

/* Connector paths between nodes. */
.d3-pipeline .edge-path {
  fill: none;
  stroke-linecap: round;
}

/* Tooltip: parked far off-screen and transparent until the script shows it. */
.d3-pipeline .d3-tooltip {
  position: absolute;
  top: 0;
  left: 0;
  transform: translate(-9999px, -9999px);
  pointer-events: none;
  padding: 8px 12px;
  border-radius: 8px;
  font-size: 12px;
  line-height: 1.4;
  border: 1px solid var(--border-color);
  background: var(--surface-bg);
  color: var(--text-color);
  box-shadow: 0 4px 20px rgba(0,0,0,.15);
  opacity: 0;
  transition: opacity .12s ease;
  max-width: 260px;
  z-index: 100;
}
.d3-pipeline .d3-tooltip strong {
  display: block;
  margin-bottom: 2px;
  font-size: 13px;
}
</style>
<script>
(() => {
/**
 * Runs `cb` once D3 is available on `window`.
 *
 * If D3 is already present, `cb` is invoked synchronously and its result is
 * returned. Otherwise a single shared <script id="d3-cdn-script"> pointing at
 * the jsDelivr build is created (or reused when another embed already added
 * it) and `cb` runs after that script's `load` event.
 *
 * A failed download is reported on the console instead of leaving the embed
 * silently blank.
 *
 * @param {Function} cb - Callback to run when `window.d3.select` is usable.
 */
const ensureD3 = (cb) => {
  const hasD3 = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (hasD3()) return cb();

  let s = document.getElementById('d3-cdn-script');
  if (!s) {
    s = document.createElement('script');
    s.id = 'd3-cdn-script';
    s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(s);
  }

  // Re-check on load: the script may have loaded something that is not D3.
  s.addEventListener('load', () => { if (hasD3()) cb(); }, { once: true });
  // Surface CDN/network failures; without this the diagram just never appears.
  s.addEventListener('error', () => {
    console.error(`d3-pipeline: failed to load D3 from ${s.src}`);
  }, { once: true });
};
const bootstrap = () => {
const scriptEl = document.currentScript;
let container = scriptEl ? scriptEl.previousElementSibling : null;
if (!(container && container.classList && container.classList.contains('d3-pipeline'))) {
const cs = Array.from(document.querySelectorAll('.d3-pipeline')).filter(el => !(el.dataset && el.dataset.mounted === 'true'));
container = cs[cs.length - 1] || null;
}
if (!container) return;
if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; }
container.style.position = container.style.position || 'relative';
const tip = document.createElement('div');
tip.className = 'd3-tooltip';
const tipInner = document.createElement('div');
tip.appendChild(tipInner);
container.appendChild(tip);
/**
 * Fills the tooltip with `html`, makes it visible and positions it next to
 * the pointer, using coordinates relative to the container's bounding box.
 *
 * @param {{clientX: number, clientY: number}} ev - Pointer event.
 * @param {string} html - Markup assigned to the tooltip's inner element.
 */
function showTip(ev, html) {
  tipInner.innerHTML = html;
  tip.style.opacity = '1';
  const bounds = container.getBoundingClientRect();
  // Offset slightly right of and above the pointer.
  const dx = ev.clientX - bounds.left + 14;
  const dy = ev.clientY - bounds.top - 10;
  tip.style.transform = `translate(${dx}px, ${dy}px)`;
}
/** Hides the tooltip: fades it out and parks it far outside the container. */
function hideTip() {
  const { style } = tip;
  style.opacity = '0';
  style.transform = 'translate(-9999px,-9999px)';
}
// Root SVG: full container width; its height is set by computeLayout.
const svg = d3.select(container)
  .append('svg')
  .attr('width', '100%')
  .style('display', 'block');

// Arrowhead marker referenced by the edge paths via url(#pl-arrow).
const defs = svg.append('defs');
const arrowMarker = defs.append('marker')
  .attr('id', 'pl-arrow')
  .attr('viewBox', '0 0 10 8')
  .attr('refX', 9)
  .attr('refY', 4)
  .attr('markerWidth', 7)
  .attr('markerHeight', 5.5)
  .attr('orient', 'auto');
arrowMarker.append('path').attr('d', 'M0,1 L8,4 L0,7 Z');

// Layers in paint order: group backgrounds, then edges, then node cards.
const gRoot = svg.append('g');
const gGroups = gRoot.append('g');
const gEdges = gRoot.append('g');
const gNodes = gRoot.append('g');
// Diagram nodes, declared as [id, label, sub, group, tip] rows and expanded
// into objects. `sub` is the optional second line on the card; `tip` is the
// tooltip body shown on hover.
const nodes = [
  ['hf_in', 'HF Hub Dataset', '', 'input', 'Source dataset from the Hugging Face Hub. Any split or config.'],
  ['read', 'Read', 'HuggingFaceDatasetReader', 'pipeline', 'Reads documents from the Hub and streams them into the pipeline.'],
  ['transform', 'Transform', 'InferenceRunner', 'pipeline', 'Orchestrates LLM inference: batching, retries, metric logging.'],
  ['write', 'Write', 'ParquetWriter', 'pipeline', 'Writes generated outputs as Parquet files with checkpointing.'],
  ['local', 'Local', 'single node, multi-GPU', 'execution', 'Run on a single machine with multiple workers for development.'],
  ['slurm', 'Slurm Cluster', 'multi-node, auto-scaling', 'execution', 'Distribute across nodes for large-scale production workloads.'],
  ['rollout', 'Custom Rollout', 'async callable', 'inference', 'Your rollout function: orchestrates one or many generate() calls.'],
  ['vllm', 'vLLM / SGLang', 'Server', 'inference', 'High-throughput inference engine with prefix caching and batching.'],
  ['hf_out', 'HF Hub Dataset', '', 'output', 'Generated dataset uploaded continuously to the Hugging Face Hub.'],
  ['card', 'Dataset Card', '+ Metrics', 'output', 'Auto-generated dataset card with throughput stats.'],
  ['monitor', 'Progress Monitor', '', 'output', 'Live progress bar and ETA on the dataset card during inference.'],
].map(([id, label, sub, group, tip]) => ({ id, label, sub, group, tip }));
// Group boxes drawn behind the nodes, declared as [id, label, icon] rows.
const groups = [
  ['input', 'Input', '📥'],
  ['pipeline', 'DataTrove Pipeline', '⚙️'],
  ['execution', 'Execution Mode', '🖥️'],
  ['inference', 'Inference Engine', '🚀'],
  ['output', 'Output', '📤'],
].map(([id, label, icon]) => ({ id, label, icon }));
// Directed connections between node ids, declared as [from, to, label?] rows.
// The `label` key is only present on rows that supply one.
const edges = [
  ['hf_in', 'read'],
  ['read', 'transform'],
  ['transform', 'write'],
  ['transform', 'rollout'],
  ['rollout', 'vllm'],
  ['vllm', 'transform', 'response'],
  ['write', 'hf_out'],
  ['write', 'card'],
  ['write', 'monitor'],
].map(([from, to, label]) => (label === undefined ? { from, to } : { from, to, label }));
/** @returns {boolean} True when <html> carries data-theme="dark". */
function isDark() {
  const theme = document.documentElement.getAttribute('data-theme');
  return theme === 'dark';
}
/**
 * Builds the palette for the current theme.
 *
 * @returns {Object} Fill/stroke colors for nodes, groups, the pipeline group,
 *   edges and the arrow marker, plus `primary`, taken from
 *   window.ColorPalettes.getPrimary() when that object exists and from a
 *   built-in per-theme default otherwise.
 */
function colors() {
  const dark = isDark();

  let primary;
  if (window.ColorPalettes) {
    primary = window.ColorPalettes.getPrimary();
  } else {
    primary = dark ? '#7c6ff7' : '#6366f1';
  }

  // Each row: [palette key, dark-theme value, light-theme value].
  const table = [
    ['nodeBg', 'rgba(255,255,255,0.055)', 'rgba(255,255,255,0.92)'],
    ['nodeBd', 'rgba(255,255,255,0.10)', 'rgba(0,0,0,0.09)'],
    ['groupBg', 'rgba(255,255,255,0.025)', 'rgba(0,0,0,0.022)'],
    ['groupBd', 'rgba(255,255,255,0.07)', 'rgba(0,0,0,0.055)'],
    ['pipeBg', 'rgba(99,102,241,0.055)', 'rgba(99,102,241,0.04)'],
    ['pipeBd', 'rgba(99,102,241,0.14)', 'rgba(99,102,241,0.11)'],
    ['edge', 'rgba(255,255,255,0.22)', 'rgba(0,0,0,0.18)'],
    ['arrow', 'rgba(255,255,255,0.30)', 'rgba(0,0,0,0.25)'],
  ];

  const palette = {};
  table.forEach(([key, onDark, onLight]) => {
    palette[key] = dark ? onDark : onLight;
  });
  palette.primary = primary;
  return palette;
}
/**
 * Computes positions for every node and group from the container's current
 * width and stores them on the shared objects as _x/_y/_w/_h/_r. Also sets
 * the SVG height to fit the result.
 *
 * Columns: left (execution + inference), center (input + pipeline), right
 * (output). All metrics are designed for an 820px width and scaled down
 * (never up) for narrower containers.
 *
 * @returns {number} The scale factor applied (at most 1).
 */
function computeLayout() {
  const width = container.clientWidth || 820;
  const scale = Math.min(1, width / 820);
  const px = (designValue) => Math.round(designValue * scale);

  const nodeW = px(200);
  const nodeH = px(60);
  const nodeR = px(10);
  const pad = px(10);       // padding inside a group box
  const groupR = px(10);    // group corner radius
  const labelH = px(22);    // space reserved for the group label
  const nodeGap = px(7);    // vertical gap between nodes of one group
  const colGap = px(70);    // horizontal gap between columns
  const rowGap = px(14);    // vertical gap between stacked groups
  const topMargin = px(4);

  // The three columns share one width.
  const colW = nodeW + pad * 2;
  const totalW = colW + colW + colW + colGap * 2;
  const offsetX = Math.max(0, (width - totalW) / 2);
  const leftX = offsetX;
  const centerX = offsetX + colW + colGap;
  const rightX = offsetX + colW + colGap + colW + colGap;

  const nodeById = (id) => nodes.find((n) => n.id === id);
  const groupById = (id) => groups.find((g) => g.id === id);
  // Height of a group box holding `count` vertically stacked nodes.
  const stackHeight = (count) => labelH + pad * 2 + count * nodeH + (count - 1) * nodeGap;

  // Positions a group box at (x, top) and stacks its nodes inside it.
  const placeGroup = (groupId, nodeIds, x, top) => {
    nodeIds.forEach((id, i) => {
      const n = nodeById(id);
      n._x = x + pad;
      n._y = top + labelH + pad + i * (nodeH + nodeGap);
      n._w = nodeW;
      n._h = nodeH;
      n._r = nodeR;
    });
    const g = groupById(groupId);
    g._x = x;
    g._y = top;
    g._w = colW;
    g._h = stackHeight(nodeIds.length);
    g._r = groupR;
    return g;
  };

  // Center column: input on top, pipeline below it.
  const inputGroup = placeGroup('input', ['hf_in'], centerX, topMargin);
  const pipeTop = topMargin + inputGroup._h + rowGap;
  placeGroup('pipeline', ['read', 'transform', 'write'], centerX, pipeTop);

  // Side columns are bottom-aligned with the write node (plus group padding).
  const writeNode = nodeById('write');
  const alignedBottom = writeNode._y + writeNode._h + pad;

  // Left column: inference at the bottom, execution stacked above it.
  const inferTop = alignedBottom - stackHeight(2);
  const execTop = inferTop - rowGap - stackHeight(2);
  placeGroup('execution', ['local', 'slurm'], leftX, execTop);
  placeGroup('inference', ['rollout', 'vllm'], leftX, inferTop);

  // Right column: output.
  const outTop = alignedBottom - stackHeight(3);
  placeGroup('output', ['hf_out', 'card', 'monitor'], rightX, outTop);

  // Shift everything down if anything ended up above the SVG's top edge.
  const tops = nodes.map((n) => n._y).concat(groups.map((g) => g._y));
  const minY = Math.min(...tops);
  if (minY < 0) {
    const shift = -minY + topMargin;
    nodes.forEach((n) => { n._y += shift; });
    groups.forEach((g) => { g._y += shift; });
  }

  const bottoms = nodes.map((n) => n._y + n._h + pad).concat(groups.map((g) => g._y + g._h));
  svg.attr('height', Math.max(...bottoms) + topMargin);
  return scale;
}
/**
 * Returns the midpoint of one side of a laid-out node.
 *
 * @param {{_x: number, _y: number, _w: number, _h: number}} n - Laid-out node.
 * @param {string} side - 'top', 'bottom', 'left' or 'right'.
 * @param {number} [offset] - Shift along the chosen side (x for top/bottom,
 *   y for left/right); a falsy value means no shift.
 * @returns {{x: number, y: number}|undefined} The anchor point, or undefined
 *   for an unrecognised side.
 */
function pt(n, side, offset) {
  const shift = offset || 0;
  switch (side) {
    case 'top':
      return { x: n._x + n._w / 2 + shift, y: n._y };
    case 'bottom':
      return { x: n._x + n._w / 2 + shift, y: n._y + n._h };
    case 'left':
      return { x: n._x, y: n._y + n._h / 2 + shift };
    case 'right':
      return { x: n._x + n._w, y: n._y + n._h / 2 + shift };
    default:
      return undefined;
  }
}
/**
 * SVG path data for a cubic Bezier from `a` to `b` that leaves and arrives
 * horizontally: both control points sit at the horizontal midpoint.
 *
 * @param {{x: number, y: number}} a - Start point.
 * @param {{x: number, y: number}} b - End point.
 * @returns {string} Value for a path's `d` attribute.
 */
function hBez(a, b) {
  const midX = (a.x + b.x) / 2;
  const segments = [
    `M${a.x},${a.y}`,
    `C${midX},${a.y}`,
    `${midX},${b.y}`,
    `${b.x},${b.y}`,
  ];
  return segments.join(' ');
}
/**
 * SVG path data for a cubic Bezier from `a` to `b` that leaves and arrives
 * vertically: both control points sit at the vertical midpoint.
 *
 * @param {{x: number, y: number}} a - Start point.
 * @param {{x: number, y: number}} b - End point.
 * @returns {string} Value for a path's `d` attribute.
 */
function vBez(a, b) {
  const midY = (a.y + b.y) / 2;
  const segments = [
    `M${a.x},${a.y}`,
    `C${a.x},${midY}`,
    `${b.x},${midY}`,
    `${b.x},${b.y}`,
  ];
  return segments.join(' ');
}
/**
 * Builds the SVG path data for one edge, choosing anchor sides per route.
 *
 * @param {{from: string, to: string}} e - Edge referencing node ids.
 * @returns {string} Path data, or '' when either endpoint id is unknown.
 */
function edgePath(e) {
  const src = nodes.find((n) => n.id === e.from);
  const dst = nodes.find((n) => n.id === e.to);
  if (!src || !dst) return '';

  // Vertical spread used to fan the three Write outputs along its right edge.
  const fan = Math.round(src._h * 0.28);

  switch (`${e.from}->${e.to}`) {
    // Straight drops within a column.
    case 'hf_in->read':
    case 'read->transform':
    case 'transform->write':
    case 'rollout->vllm':
      return vBez(pt(src, 'bottom'), pt(dst, 'top'));

    // Request leaves Transform on its left side toward the inference column.
    case 'transform->rollout':
      return hBez(pt(src, 'left'), pt(dst, 'right'));

    // Response re-enters Transform slightly below centre so it does not
    // overlap the outgoing request edge.
    case 'vllm->transform':
      return hBez(pt(src, 'right'), pt(dst, 'left', Math.round(dst._h * 0.2)));

    // Fan out from Write: upper, centre (default below) and lower anchor.
    case 'write->hf_out':
      return hBez(pt(src, 'right', -fan), pt(dst, 'left'));
    case 'write->monitor':
      return hBez(pt(src, 'right', fan), pt(dst, 'left'));

    // write->card and any unlisted edge: plain left-to-right connector.
    default:
      return hBez(pt(src, 'right'), pt(dst, 'left'));
  }
}
// Draws (or redraws) the whole diagram: recomputes the layout for the current
// container width, reads the palette for the current theme, then updates the
// group boxes, edges and node cards through keyed D3 data joins so existing
// SVG elements are reused across calls.
function render() {
const s = computeLayout();
const c = colors();
// Font sizes follow the layout scale but never drop below a fixed minimum.
const fs = Math.max(11, Math.round(13 * s));
const fsSub = Math.max(10, Math.round(11 * s));
const fsGrp = Math.max(10, Math.round(11 * s));
const fsIcon = Math.max(12, Math.round(14 * s));
// The arrowhead marker is created without a fill; colour it per theme here.
defs.select('#pl-arrow path').attr('fill', c.arrow);
// Groups: one <g class="grp"> per group (keyed by id) holding a background
// rect, an icon text and a label text.
const gSel = gGroups.selectAll('g.grp').data(groups, d => d.id);
const gE = gSel.enter().append('g').attr('class', 'grp');
gE.append('rect');
gE.append('text').attr('class', 'grp-icon');
gE.append('text').attr('class', 'group-label');
const gM = gE.merge(gSel);
// The 'pipeline' group gets its own tinted fill and border.
gM.select('rect')
.attr('x', d => d._x).attr('y', d => d._y)
.attr('width', d => d._w).attr('height', d => d._h)
.attr('rx', d => d._r).attr('ry', d => d._r)
.attr('fill', d => d.id === 'pipeline' ? c.pipeBg : c.groupBg)
.attr('stroke', d => d.id === 'pipeline' ? c.pipeBd : c.groupBd)
.attr('stroke-width', 1);
gM.select('.grp-icon')
.attr('x', d => d._x + Math.round(6 * s))
.attr('y', d => d._y + Math.round(15 * s))
.style('font-size', fsIcon + 'px')
.text(d => d.icon);
// The label starts to the right of the icon: icon font size plus a small gap.
gM.select('.group-label')
.attr('x', d => d._x + Math.round(6 * s) + fsIcon + Math.round(3 * s))
.attr('y', d => d._y + Math.round(15 * s))
.style('font-size', fsGrp + 'px')
.text(d => d.label);
gSel.exit().remove();
// Edges: one path per edge, keyed by the concatenated from/to ids. The
// marker is attached on enter only; geometry and stroke are refreshed on
// every call.
const eSel = gEdges.selectAll('path.edge-path').data(edges, d => d.from + d.to);
eSel.enter().append('path').attr('class', 'edge-path')
.attr('marker-end', 'url(#pl-arrow)')
.merge(eSel)
.attr('d', edgePath)
.attr('stroke', c.edge)
.attr('stroke-width', Math.max(1.5, 1.8 * s));
eSel.exit().remove();
// Nodes: one translated <g class="node-group"> per node (keyed by id) with a
// card rect, a title and a subtitle; child coordinates are relative to the
// node's own origin.
const nSel = gNodes.selectAll('g.node-group').data(nodes, d => d.id);
const nE = nSel.enter().append('g').attr('class', 'node-group');
nE.append('rect').attr('class', 'node-card');
nE.append('text').attr('class', 'node-title');
nE.append('text').attr('class', 'node-subtitle');
const nM = nE.merge(nSel);
nM.attr('transform', d => `translate(${d._x},${d._y})`);
nM.select('.node-card')
.attr('width', d => d._w).attr('height', d => d._h)
.attr('rx', d => d._r).attr('ry', d => d._r)
.attr('fill', c.nodeBg).attr('stroke', c.nodeBd).attr('stroke-width', 1);
// The title sits at 38% of the card height when a subtitle follows, and is
// vertically centred otherwise.
nM.select('.node-title')
.attr('x', d => d._w / 2).attr('y', d => d.sub ? d._h * 0.38 : d._h / 2)
.attr('text-anchor', 'middle').attr('dominant-baseline', 'middle')
.style('font-size', fs + 'px').text(d => d.label);
nM.select('.node-subtitle')
.attr('x', d => d._w / 2).attr('y', d => d._h * 0.68)
.attr('text-anchor', 'middle').attr('dominant-baseline', 'middle')
.style('font-size', fsSub + 'px').text(d => d.sub || '');
// Tooltip: shown on enter, follows the pointer on move, hidden on leave.
// Nodes without a `tip` show nothing. These handlers are assigned on every call.
nM.on('mouseenter', (ev, d) => { if (d.tip) showTip(ev, `<strong>${d.label}</strong>${d.tip}`); })
.on('mousemove', (ev, d) => { if (d.tip) showTip(ev, `<strong>${d.label}</strong>${d.tip}`); })
.on('mouseleave', hideTip);
nSel.exit().remove();
}
// Initial draw.
render();

// Redraw whenever the container changes size; without ResizeObserver support
// fall back to the window's resize event.
if (window.ResizeObserver) {
  const resizeWatcher = new ResizeObserver(() => render());
  resizeWatcher.observe(container);
} else {
  window.addEventListener('resize', render);
}

// Redraw when the page theme changes (data-theme attribute on <html>).
const themeWatcher = new MutationObserver(() => render());
themeWatcher.observe(document.documentElement, { attributes: true, attributeFilter: ['data-theme'] });
};
// Start immediately when the document is already parsed; otherwise wait for
// DOMContentLoaded. Either way D3 is ensured before bootstrap runs.
const start = () => ensureD3(bootstrap);
if (document.readyState !== 'loading') {
  start();
} else {
  document.addEventListener('DOMContentLoaded', () => start(), { once: true });
}
})();
</script>