Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
<!-- Mount point for the pipeline diagram: the script below appends an <svg> and a tooltip <div> into it. -->
<div class="d3-pipeline"></div>
<style>
  /* Container: positioning context for the absolutely positioned tooltip. */
  .d3-pipeline {
    position: relative;
    width: 100%;
    margin: 0;
    container-type: inline-size;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
  }

  /* Node cards. */
  .d3-pipeline .node-group {
    cursor: default;
  }
  .d3-pipeline .node-card {
    transition: filter .15s ease;
  }
  .d3-pipeline .node-group:hover .node-card {
    filter: brightness(1.05);
  }
  .d3-pipeline .node-title {
    font-weight: 700;
    fill: var(--text-color);
  }
  .d3-pipeline .node-subtitle {
    fill: var(--muted-color);
  }

  /* Group captions and connecting edges. */
  .d3-pipeline .group-label {
    font-weight: 700;
    fill: var(--muted-color);
    letter-spacing: 0.02em;
  }
  .d3-pipeline .edge-path {
    fill: none;
    stroke-linecap: round;
  }

  /* Tooltip: parked far off-screen and transparent until the script moves it next to the pointer. */
  .d3-pipeline .d3-tooltip {
    position: absolute;
    top: 0;
    left: 0;
    transform: translate(-9999px, -9999px);
    pointer-events: none;
    padding: 8px 12px;
    border-radius: 8px;
    font-size: 12px;
    line-height: 1.4;
    border: 1px solid var(--border-color);
    background: var(--surface-bg);
    color: var(--text-color);
    box-shadow: 0 4px 20px rgba(0,0,0,.15);
    opacity: 0;
    transition: opacity .12s ease;
    max-width: 260px;
    z-index: 100;
  }
  .d3-pipeline .d3-tooltip strong {
    display: block;
    margin-bottom: 2px;
    font-size: 13px;
  }
</style>
| <script> | |
| (() => { | |
/**
 * Run `cb` once the global D3 library is usable, loading it from the CDN if needed.
 *
 * When `window.d3.select` already exists, `cb` is invoked synchronously and its
 * result is returned. Otherwise a shared `<script id="d3-cdn-script">` element is
 * reused (or created and appended to `<head>`) and `cb` runs from that element's
 * `load` event, provided D3 is actually available at that point.
 *
 * @param {Function} cb - Called with no arguments once D3 is available.
 * @returns {*} The result of `cb()` when D3 was already loaded; otherwise undefined.
 */
const ensureD3 = (cb) => {
  const hasD3 = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (hasD3()) return cb();

  let script = document.getElementById('d3-cdn-script');
  if (!script) {
    script = document.createElement('script');
    script.id = 'd3-cdn-script';
    script.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(script);
  }

  script.addEventListener('load', () => { if (hasD3()) cb(); }, { once: true });
  // Without this, a failed download leaves the diagram blank with no hint why.
  script.addEventListener('error', () => {
    console.error(`d3-pipeline: failed to load D3 from ${script.src}`);
  }, { once: true });
};
| const bootstrap = () => { | |
| const scriptEl = document.currentScript; | |
| let container = scriptEl ? scriptEl.previousElementSibling : null; | |
| if (!(container && container.classList && container.classList.contains('d3-pipeline'))) { | |
| const cs = Array.from(document.querySelectorAll('.d3-pipeline')).filter(el => !(el.dataset && el.dataset.mounted === 'true')); | |
| container = cs[cs.length - 1] || null; | |
| } | |
| if (!container) return; | |
| if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; } | |
| container.style.position = container.style.position || 'relative'; | |
| const tip = document.createElement('div'); | |
| tip.className = 'd3-tooltip'; | |
| const tipInner = document.createElement('div'); | |
| tip.appendChild(tipInner); | |
| container.appendChild(tip); | |
/**
 * Show the tooltip next to the pointer.
 *
 * @param {MouseEvent} ev - Pointer event; its clientX/clientY are read.
 * @param {string} html - Markup assigned to the tooltip's inner element via innerHTML.
 */
function showTip(ev, html) {
  tipInner.innerHTML = html;
  tip.style.opacity = '1';
  // Convert viewport coordinates into container-relative ones, nudged off the cursor.
  const { left, top } = container.getBoundingClientRect();
  const offsetX = ev.clientX - left + 14;
  const offsetY = ev.clientY - top - 10;
  tip.style.transform = `translate(${offsetX}px, ${offsetY}px)`;
}
/** Make the tooltip transparent and park it far off-screen. */
function hideTip() {
  const { style } = tip;
  style.opacity = '0';
  style.transform = 'translate(-9999px,-9999px)';
}
// Root <svg>: full container width; its height is assigned later by computeLayout().
const svg = d3.select(container)
  .append('svg')
  .attr('width', '100%')
  .style('display', 'block');

// Arrowhead marker referenced by every edge through url(#pl-arrow).
const defs = svg.append('defs');
const arrowMarker = defs.append('marker')
  .attr('id', 'pl-arrow')
  .attr('viewBox', '0 0 10 8')
  .attr('refX', 9)
  .attr('refY', 4)
  .attr('markerWidth', 7)
  .attr('markerHeight', 5.5)
  .attr('orient', 'auto');
arrowMarker.append('path').attr('d', 'M0,1 L8,4 L0,7 Z');

// Layers, appended back-to-front: group boxes, then edges, then node cards.
const gRoot = svg.append('g');
const gGroups = gRoot.append('g');
const gEdges = gRoot.append('g');
const gNodes = gRoot.append('g');
// Diagram nodes as rows of [id, label, sub, group, tip]; expanded into objects below.
const nodes = [
  ['hf_in', 'HF Hub Dataset', '', 'input', 'Source dataset from the Hugging Face Hub. Any split or config.'],
  ['read', 'Read', 'HuggingFaceDatasetReader', 'pipeline', 'Reads documents from the Hub and streams them into the pipeline.'],
  ['transform', 'Transform', 'InferenceRunner', 'pipeline', 'Orchestrates LLM inference: batching, retries, metric logging.'],
  ['write', 'Write', 'ParquetWriter', 'pipeline', 'Writes generated outputs as Parquet files with checkpointing.'],
  ['local', 'Local', 'single node, multi-GPU', 'execution', 'Run on a single machine with multiple workers for development.'],
  ['slurm', 'Slurm Cluster', 'multi-node, auto-scaling', 'execution', 'Distribute across nodes for large-scale production workloads.'],
  ['rollout', 'Custom Rollout', 'async callable', 'inference', 'Your rollout function: orchestrates one or many generate() calls.'],
  ['vllm', 'vLLM / SGLang', 'Server', 'inference', 'High-throughput inference engine with prefix caching and batching.'],
  ['hf_out', 'HF Hub Dataset', '', 'output', 'Generated dataset uploaded continuously to the Hugging Face Hub.'],
  ['card', 'Dataset Card', '+ Metrics', 'output', 'Auto-generated dataset card with throughput stats.'],
  ['monitor', 'Progress Monitor', '', 'output', 'Live progress bar and ETA on the dataset card during inference.'],
].map(([id, label, sub, group, tip]) => ({ id, label, sub, group, tip }));

// Visual groups (boxes drawn behind the nodes) as rows of [id, label, icon].
const groups = [
  ['input', 'Input', '📥'],
  ['pipeline', 'DataTrove Pipeline', '⚙️'],
  ['execution', 'Execution Mode', '🖥️'],
  ['inference', 'Inference Engine', '🚀'],
  ['output', 'Output', '📤'],
].map(([id, label, icon]) => ({ id, label, icon }));

// Directed edges as rows of [from, to, label?]; `label` is only set when a row provides one.
const edges = [
  ['hf_in', 'read'],
  ['read', 'transform'],
  ['transform', 'write'],
  ['transform', 'rollout'],
  ['rollout', 'vllm'],
  ['vllm', 'transform', 'response'],
  ['write', 'hf_out'],
  ['write', 'card'],
  ['write', 'monitor'],
].map(([from, to, label]) => (label === undefined ? { from, to } : { from, to, label }));
/** @returns {boolean} True when the root element carries data-theme="dark". */
function isDark() {
  const theme = document.documentElement.getAttribute('data-theme');
  return theme === 'dark';
}
/**
 * Build the colour set for the current theme.
 *
 * `primary` comes from `window.ColorPalettes.getPrimary()` when that function is
 * available; otherwise a built-in default for the active theme is used.
 *
 * @returns {{nodeBg: string, nodeBd: string, groupBg: string, groupBd: string,
 *            pipeBg: string, pipeBd: string, edge: string, arrow: string, primary: *}}
 */
function colors() {
  const dk = isDark();
  const pick = (dark, light) => (dk ? dark : light);

  // Guard the method as well as the object: a ColorPalettes global without a
  // callable getPrimary would otherwise throw and abort the whole render.
  const palettes = window.ColorPalettes;
  const primary = palettes && typeof palettes.getPrimary === 'function'
    ? palettes.getPrimary()
    : pick('#7c6ff7', '#6366f1');

  return {
    nodeBg: pick('rgba(255,255,255,0.055)', 'rgba(255,255,255,0.92)'),
    nodeBd: pick('rgba(255,255,255,0.10)', 'rgba(0,0,0,0.09)'),
    groupBg: pick('rgba(255,255,255,0.025)', 'rgba(0,0,0,0.022)'),
    groupBd: pick('rgba(255,255,255,0.07)', 'rgba(0,0,0,0.055)'),
    pipeBg: pick('rgba(99,102,241,0.055)', 'rgba(99,102,241,0.04)'),
    pipeBd: pick('rgba(99,102,241,0.14)', 'rgba(99,102,241,0.11)'),
    edge: pick('rgba(255,255,255,0.22)', 'rgba(0,0,0,0.18)'),
    arrow: pick('rgba(255,255,255,0.30)', 'rgba(0,0,0,0.25)'),
    primary,
  };
}
/**
 * Compute layout positions for the current container width.
 *
 * Writes `_x`, `_y`, `_w`, `_h` and `_r` onto every entry of `nodes` and `groups`,
 * sets the height of the <svg>, and returns the scale factor that was used
 * (1 at 820px or wider, proportionally smaller below that).
 *
 * @returns {number} Scale factor in (0, 1].
 */
function computeLayout() {
  const W = container.clientWidth || 820;
  const s = Math.min(1, W / 820);
  const px = (v) => Math.round(v * s);

  const nw = px(200);  // node width
  const nh = px(60);   // node height
  const nr = px(10);   // node corner radius
  const gp = px(10);   // group padding
  const gr = px(10);   // group corner radius
  const glh = px(22);  // group label height
  const ng = px(7);    // node gap within group
  const cg = px(70);   // column gap
  const rg = px(14);   // row gap between groups

  // Three equally wide columns: left (execution + inference),
  // center (input + pipeline), right (output), centered in the container.
  const colW = nw + gp * 2;
  const totalW = colW + colW + colW + cg * 2;
  const offsetX = Math.max(0, (W - totalW) / 2);
  const leftX = offsetX;
  const centerX = offsetX + colW + cg;
  const rightX = offsetX + colW + cg + colW + cg;

  const byId = (list, id) => list.find((item) => item.id === id);

  // Height of a group box holding `count` vertically stacked nodes.
  const stackHeight = (count) => glh + gp * 2 + count * nh + (count - 1) * ng;

  // Position a group box at (x, top) and stack its nodes inside it.
  const placeGroup = (groupId, nodeIds, x, top) => {
    nodeIds.forEach((id, i) => {
      const n = byId(nodes, id);
      n._x = x + gp;
      n._y = top + glh + gp + i * (nh + ng);
      n._w = nw;
      n._h = nh;
      n._r = nr;
    });
    const g = byId(groups, groupId);
    g._x = x;
    g._y = top;
    g._w = colW;
    g._h = stackHeight(nodeIds.length);
    g._r = gr;
    return g;
  };

  // -- Center column: Input (1 node) on top, Pipeline (3 nodes) below it.
  const firstTop = px(4);
  const inputGroup = placeGroup('input', ['hf_in'], centerX, firstTop);
  const pipeTop = firstTop + inputGroup._h + rg;
  placeGroup('pipeline', ['read', 'transform', 'write'], centerX, pipeTop);

  // Side columns are anchored so their lowest group ends one group-padding
  // below the bottom edge of the Write node.
  const writeNode = byId(nodes, 'write');
  const alignedBottom = writeNode._y + writeNode._h + gp;

  // -- Left column: Execution stacked above Inference.
  const execIds = ['local', 'slurm'];
  const inferIds = ['rollout', 'vllm'];
  const inferTop = alignedBottom - stackHeight(inferIds.length);
  const execTop = inferTop - rg - stackHeight(execIds.length);
  placeGroup('execution', execIds, leftX, execTop);
  placeGroup('inference', inferIds, leftX, inferTop);

  // -- Right column: Output.
  const outIds = ['hf_out', 'card', 'monitor'];
  const outTop = alignedBottom - stackHeight(outIds.length);
  placeGroup('output', outIds, rightX, outTop);

  // Shift everything down if the side columns rose above y = 0, so nothing is clipped.
  const minY = Math.min(...[...nodes, ...groups].map((item) => item._y));
  if (minY < 0) {
    const shift = -minY + px(4);
    nodes.forEach((n) => { n._y += shift; });
    groups.forEach((g) => { g._y += shift; });
  }

  const nodeBottoms = nodes.map((n) => n._y + n._h + gp);
  const groupBottoms = groups.map((g) => g._y + g._h);
  const maxY = Math.max(...nodeBottoms, ...groupBottoms);
  svg.attr('height', maxY + px(4));
  return s;
}
/**
 * Anchor point on one side of a laid-out node.
 *
 * @param {{_x: number, _y: number, _w: number, _h: number}} n - Node with layout fields.
 * @param {string} side - 'top', 'bottom', 'left' or 'right'.
 * @param {number} [offset] - Shift along the chosen side (x for top/bottom, y for left/right).
 * @returns {{x: number, y: number}|undefined} The point, or undefined for any other `side`.
 */
function pt(n, side, offset) {
  const shift = offset || 0;
  switch (side) {
    case 'top':
      return { x: n._x + n._w / 2 + shift, y: n._y };
    case 'bottom':
      return { x: n._x + n._w / 2 + shift, y: n._y + n._h };
    case 'left':
      return { x: n._x, y: n._y + n._h / 2 + shift };
    case 'right':
      return { x: n._x + n._w, y: n._y + n._h / 2 + shift };
    default:
      return undefined;
  }
}
/**
 * SVG path for a cubic Bezier from `a` to `b` whose control points sit at the
 * horizontal midpoint, so the curve leaves and arrives horizontally.
 *
 * @param {{x: number, y: number}} a - Start point.
 * @param {{x: number, y: number}} b - End point.
 * @returns {string} Path data for the `d` attribute.
 */
function hBez(a, b) {
  const midX = (a.x + b.x) / 2;
  const start = `M${a.x},${a.y}`;
  const curve = `C${midX},${a.y} ${midX},${b.y} ${b.x},${b.y}`;
  return `${start} ${curve}`;
}
/**
 * SVG path for a cubic Bezier from `a` to `b` whose control points sit at the
 * vertical midpoint, so the curve leaves and arrives vertically.
 *
 * @param {{x: number, y: number}} a - Start point.
 * @param {{x: number, y: number}} b - End point.
 * @returns {string} Path data for the `d` attribute.
 */
function vBez(a, b) {
  const midY = (a.y + b.y) / 2;
  const start = `M${a.x},${a.y}`;
  const curve = `C${a.x},${midY} ${b.x},${midY} ${b.x},${b.y}`;
  return `${start} ${curve}`;
}
/**
 * Path data for one edge, routed according to which pair of nodes it connects.
 *
 * @param {{from: string, to: string}} e - Edge referencing node ids.
 * @returns {string} SVG path data, or '' when either endpoint is not in `nodes`.
 */
function edgePath(e) {
  const src = nodes.find((n) => n.id === e.from);
  const dst = nodes.find((n) => n.id === e.to);
  if (!src || !dst) return '';

  const connects = (from, to) => e.from === from && e.to === to;

  // Edges that run straight down: bottom of the source to top of the target.
  const runsDownward = connects('hf_in', 'read')
    || connects('read', 'transform')
    || connects('transform', 'write')
    || connects('rollout', 'vllm');
  if (runsDownward) {
    return vBez(pt(src, 'bottom'), pt(dst, 'top'));
  }

  // Request into the left column leaves Transform on its left side.
  if (connects('transform', 'rollout')) {
    return hBez(pt(src, 'left'), pt(dst, 'right'));
  }

  // Response comes back slightly below center so it does not overlap the request edge.
  if (connects('vllm', 'transform')) {
    const landing = pt(dst, 'left', Math.round(dst._h * 0.2));
    return hBez(pt(src, 'right'), landing);
  }

  // Fan out from Write: top/center/bottom of its right edge. Every other edge
  // uses the plain right-to-left curve, which is the zero-offset case.
  let startOffset = 0;
  if (e.from === 'write') {
    const spread = Math.round(src._h * 0.28);
    if (e.to === 'hf_out') startOffset = -spread;
    else if (e.to === 'monitor') startOffset = spread;
  }
  return hBez(pt(src, 'right', startOffset), pt(dst, 'left'));
}
/**
 * Draw (or redraw) the whole diagram: recompute the layout, read the theme
 * colours, then update group boxes, edges and node cards through D3 data joins.
 * Safe to call repeatedly; existing elements are updated in place.
 */
function render() {
  const s = computeLayout();
  const c = colors();
  const px = (v) => Math.round(v * s);

  // Font sizes scale with the layout but never drop below a readable minimum.
  const fs = Math.max(11, px(13));
  const fsSub = Math.max(10, px(11));
  const fsGrp = Math.max(10, px(11));
  const fsIcon = Math.max(12, px(14));

  defs.select('#pl-arrow path').attr('fill', c.arrow);

  // Groups: background box, icon and caption.
  const drawGroups = () => {
    const join = gGroups.selectAll('g.grp').data(groups, (d) => d.id);
    const entered = join.enter().append('g').attr('class', 'grp');
    entered.append('rect');
    entered.append('text').attr('class', 'grp-icon');
    entered.append('text').attr('class', 'group-label');

    const all = entered.merge(join);
    const isPipeline = (d) => d.id === 'pipeline';
    all.select('rect')
      .attr('x', (d) => d._x)
      .attr('y', (d) => d._y)
      .attr('width', (d) => d._w)
      .attr('height', (d) => d._h)
      .attr('rx', (d) => d._r)
      .attr('ry', (d) => d._r)
      .attr('fill', (d) => (isPipeline(d) ? c.pipeBg : c.groupBg))
      .attr('stroke', (d) => (isPipeline(d) ? c.pipeBd : c.groupBd))
      .attr('stroke-width', 1);
    all.select('.grp-icon')
      .attr('x', (d) => d._x + px(6))
      .attr('y', (d) => d._y + px(15))
      .style('font-size', fsIcon + 'px')
      .text((d) => d.icon);
    all.select('.group-label')
      .attr('x', (d) => d._x + px(6) + fsIcon + px(3))
      .attr('y', (d) => d._y + px(15))
      .style('font-size', fsGrp + 'px')
      .text((d) => d.label);

    join.exit().remove();
  };

  // Edges: one arrow-tipped path per entry of `edges`.
  const drawEdges = () => {
    const join = gEdges.selectAll('path.edge-path').data(edges, (d) => d.from + d.to);
    join.enter()
      .append('path')
      .attr('class', 'edge-path')
      .attr('marker-end', 'url(#pl-arrow)')
      .merge(join)
      .attr('d', edgePath)
      .attr('stroke', c.edge)
      .attr('stroke-width', Math.max(1.5, 1.8 * s));
    join.exit().remove();
  };

  // Nodes: card, title, optional subtitle and hover tooltip.
  const drawNodes = () => {
    const join = gNodes.selectAll('g.node-group').data(nodes, (d) => d.id);
    const entered = join.enter().append('g').attr('class', 'node-group');
    entered.append('rect').attr('class', 'node-card');
    entered.append('text').attr('class', 'node-title');
    entered.append('text').attr('class', 'node-subtitle');

    const all = entered.merge(join);
    all.attr('transform', (d) => `translate(${d._x},${d._y})`);
    all.select('.node-card')
      .attr('width', (d) => d._w)
      .attr('height', (d) => d._h)
      .attr('rx', (d) => d._r)
      .attr('ry', (d) => d._r)
      .attr('fill', c.nodeBg)
      .attr('stroke', c.nodeBd)
      .attr('stroke-width', 1);
    all.select('.node-title')
      .attr('x', (d) => d._w / 2)
      // Titles move up to make room when a subtitle is present.
      .attr('y', (d) => (d.sub ? d._h * 0.38 : d._h / 2))
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'middle')
      .style('font-size', fs + 'px')
      .text((d) => d.label);
    all.select('.node-subtitle')
      .attr('x', (d) => d._w / 2)
      .attr('y', (d) => d._h * 0.68)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'middle')
      .style('font-size', fsSub + 'px')
      .text((d) => d.sub || '');

    // The same handler serves enter and move so the tooltip follows the pointer.
    const onHover = (ev, d) => {
      if (d.tip) showTip(ev, `<strong>${d.label}</strong>${d.tip}`);
    };
    all.on('mouseenter', onHover)
      .on('mousemove', onHover)
      .on('mouseleave', hideTip);

    join.exit().remove();
  };

  drawGroups();
  drawEdges();
  drawNodes();
}
// Initial draw.
render();

// Redraw whenever the container changes size; fall back to window resize
// events where ResizeObserver is unavailable.
if (window.ResizeObserver) {
  const resizeWatcher = new ResizeObserver(() => render());
  resizeWatcher.observe(container);
} else {
  window.addEventListener('resize', render);
}

// Redraw when the root element's data-theme attribute changes, so colours follow the theme.
const themeWatcher = new MutationObserver(() => render());
themeWatcher.observe(document.documentElement, {
  attributes: true,
  attributeFilter: ['data-theme'],
});
| }; | |
// Start once the DOM is parsed: wait for DOMContentLoaded while the document is
// still loading, otherwise begin immediately.
const start = () => ensureD3(bootstrap);
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', start, { once: true });
} else {
  start();
}
| })(); | |
| </script> | |