finephrase

Running on CPU Upgrade

App Files Files Community

joelniklaus HF Staff committed on Feb 20

Commit

2cd9a9d

1 Parent(s): 3e302cb

add leandro's visualization

Browse files

Files changed (2) hide show

app/src/content/chapters/infrastructure.mdx +8 -0
app/src/content/embeds/inference-throughput.html +383 -0

app/src/content/chapters/infrastructure.mdx CHANGED Viewed

@@ -443,3 +443,11 @@ python examples/inference/benchmark/generate_data.py \
 ```
 With a trillion-parameter model you won't be generating billions of tokens per hour, but you don't need to. A few thousand high-quality reasoning traces from a frontier model can be worth more than millions of tokens from a smaller one.

 ```
 With a trillion-parameter model you won't be generating billions of tokens per hour, but you don't need to. A few thousand high-quality reasoning traces from a frontier model can be worth more than millions of tokens from a smaller one.
+To get an intuition for what these throughput numbers feel like, <FigRef target="inference-throughput" /> lets you pick a model and scale up the number of GPUs. Each page represents roughly 500 tokens of generated text. At high enough throughput, pages roll up into books (200 pages each).
+<HtmlEmbed
+  id="inference-throughput"
+  src="inference-throughput.html"
+  caption="Interactive GPU throughput simulator. Select a model and adjust the number of H100 GPUs to see how fast pages (500 tokens each) or books (200 pages each) are generated."
+/>

app/src/content/embeds/inference-throughput.html ADDED Viewed

	@@ -0,0 +1,383 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<title>GPU Page Generator</title>
+<style>
+  * { margin: 0; padding: 0; box-sizing: border-box; }
+  body { background: #fff; font-family: system-ui, sans-serif; display: flex; flex-direction: column; align-items: center; padding: 30px; }
+  h2 { margin-bottom: 4px; }
+  .subtitle { color: #666; margin-bottom: 20px; font-size: 14px; }
+  .controls { display: flex; gap: 24px; align-items: flex-end; margin-bottom: 16px; flex-wrap: wrap; justify-content: center; }
+  .control-group { display: flex; flex-direction: column; gap: 4px; }
+  .control-group label { font-size: 12px; font-weight: 600; color: #444; text-transform: uppercase; letter-spacing: 0.5px; }
+  select, input[type=range] { font-size: 14px; font-family: inherit; }
+  select { padding: 6px 10px; border: 2px solid #000; background: #fff; cursor: pointer; }
+  input[type=range] { width: 200px; accent-color: #2e5f7e; }
+  .gpu-count { font-size: 14px; font-weight: 700; color: #2e5f7e; min-width: 60px; text-align: right; font-variant-numeric: tabular-nums; }
+  canvas { border: 2px solid #000; background: #fafafa; }
+  .metrics { display: flex; gap: 32px; margin-top: 14px; flex-wrap: wrap; justify-content: center; }
+  .metric { text-align: center; }
+  .metric .value { font-size: 28px; font-weight: 700; color: #2e5f7e; font-variant-numeric: tabular-nums; }
+  .metric .label { font-size: 11px; color: #888; text-transform: uppercase; letter-spacing: 0.5px; }
+</style>
+</head>
+<body>
+<h2>GPU Page Generator</h2>
+<div class="subtitle">1 page ≈ 500 tokens ≈ 1,800 characters</div>
+<div class="controls">
+  <div class="control-group">
+    <label for="model">Model</label>
+    <select id="model">
+      <option value="45540">SmolLM2-135M (45,540 tps/gpu)</option>
+      <option value="8086">Qwen3-4B (8,086 tps/gpu)</option>
+      <option value="6443">Qwen3-8B (6,443 tps/gpu)</option>
+      <option value="6117">GPT-OSS-120B (6,117 tps/gpu)</option>
+      <option value="1724">Gemma-3-27B (1,724 tps/gpu)</option>
+    </select>
+  </div>
+  <div class="control-group">
+    <label for="gpus">GPUs</label>
+    <div style="display:flex;align-items:center;gap:8px;">
+      <input type="range" id="gpus" min="0" max="10" step="0.01" value="0">
+      <span class="gpu-count" id="gpuLabel">1</span>
+    </div>
+  </div>
+</div>
+<canvas id="c" width="1000" height="400"></canvas>
+<div class="metrics">
+  <div class="metric">
+    <div class="value" id="pps">0</div>
+    <div class="label" id="ppsLabel">Pages / second</div>
+  </div>
+  <div class="metric">
+    <div class="value" id="tps">0</div>
+    <div class="label">Tokens / second</div>
+  </div>
+  <div class="metric">
+    <div class="value" id="total">0</div>
+    <div class="label" id="totalLabel">Pages generated</div>
+  </div>
+</div>
+<script>
+  const canvas = document.getElementById('c');
+  const ctx = canvas.getContext('2d');
+  const W = 1000, H = 400;
+  const TOKENS_PER_PAGE = 500;
+  const GPU_AREA_X = 10, GPU_AREA_Y = 10, GPU_AREA_W = 200, GPU_AREA_H = H - 20;
+  const FADE_START = W - 120;
+  function getSpawnX() {
+    return GPU_AREA_X + GPU_AREA_W + 15;
+  }
+  const modelSelect = document.getElementById('model');
+  const gpuSlider = document.getElementById('gpus');
+  const gpuLabel = document.getElementById('gpuLabel');
+  const ppsEl = document.getElementById('pps');
+  const ppsLabelEl = document.getElementById('ppsLabel');
+  const tpsEl = document.getElementById('tps');
+  const totalEl = document.getElementById('total');
+  const totalLabelEl = document.getElementById('totalLabel');
+  const PAGES_PER_BOOK = 200; // ~100K tokens per book
+  let pages = [];
+  let totalPages = 0;
+  let spawnAccum = 0;
+  let lastTime = performance.now();
+  // Page colors — slight variations
+  const pageColors = ['#f5f0e8', '#f0ece4', '#ebe7df', '#f2ede5', '#e8e4dc'];
+  function getGpuCount() {
+    // Logarithmic slider: 0-10 maps to 1-1024
+    return Math.round(Math.pow(2, gpuSlider.value));
+  }
+  function getTps() {
+    return parseInt(modelSelect.value) * getGpuCount();
+  }
+  function getPps() {
+    return getTps() / TOKENS_PER_PAGE;
+  }
+  gpuSlider.addEventListener('input', () => {
+    gpuLabel.textContent = getGpuCount().toLocaleString();
+  });
+  // Draws one GPU icon per simulated GPU inside the GPU area at the left of the
+  // canvas. Icons are placed on a cols x rows grid centered in that area, and
+  // the amount of detail per icon (fan, blades, heatsink lines, pins) depends
+  // on how wide each icon ends up being.
+  function drawGPUs() {
+    const count = getGpuCount();
+    const ax = GPU_AREA_X, ay = GPU_AREA_Y, aw = GPU_AREA_W, ah = GPU_AREA_H;
+    // Compute grid layout: find cols/rows that fit the area
+    // Aim for roughly square-ish cells that fill the area
+    // Fixed lookup: each branch picks a power-of-two grid with cols * rows >= count
+    // (the last branch, 32 x 32, holds up to 1024 icons).
+    let cols, rows;
+    if (count === 1) { cols = 1; rows = 1; }
+    else if (count === 2) { cols = 2; rows = 1; }
+    else if (count <= 4) { cols = 2; rows = 2; }
+    else if (count <= 8) { cols = 4; rows = 2; }
+    else if (count <= 16) { cols = 4; rows = 4; }
+    else if (count <= 32) { cols = 8; rows = 4; }
+    else if (count <= 64) { cols = 8; rows = 8; }
+    else if (count <= 128) { cols = 16; rows = 8; }
+    else if (count <= 256) { cols = 16; rows = 16; }
+    else if (count <= 512) { cols = 32; rows = 16; }
+    else { cols = 32; rows = 32; }
+    // Spacing between icons shrinks as the grid gets denser.
+    const gap = count <= 16 ? 3 : count <= 64 ? 2 : 1;
+    const cellW = (aw - gap * (cols - 1)) / cols;
+    const cellH = (ah - gap * (rows - 1)) / rows;
+    // Fit the icon into the cell along its limiting dimension while keeping
+    // width / height = 0.6.
+    const gw = Math.min(cellW, cellH * 0.6); // GPU aspect ratio ~0.6
+    const gh = gw / 0.6;
+    // Center the grid
+    const totalW = cols * gw + (cols - 1) * gap;
+    const totalH = rows * gh + (rows - 1) * gap;
+    const offX = ax + (aw - totalW) / 2;
+    const offY = ay + (ah - totalH) / 2;
+    // Base rotation angle shared by all fans: grows with elapsed time, scaled
+    // by pages/second (capped at 100).
+    const fanSpeed = (performance.now() / 200) * Math.min(getPps(), 100);
+    for (let i = 0; i < count; i++) {
+      // Row-major placement: index -> (col, row) -> top-left pixel of the icon.
+      const col = i % cols;
+      const row = Math.floor(i / cols);
+      const x = offX + col * (gw + gap);
+      const y = offY + row * (gh + gap);
+      // GPU body
+      ctx.fillStyle = '#2a2a2a';
+      ctx.fillRect(x, y, gw, gh);
+      ctx.strokeStyle = '#000';
+      ctx.lineWidth = count <= 16 ? 1.5 : 0.5;
+      ctx.strokeRect(x, y, gw, gh);
+      // Detail levels by icon width: >= 12 fan + heatsink, >= 20 fan blades,
+      // >= 30 pins; narrower icons only get a small dot.
+      if (gw >= 12) {
+        // Fan area
+        const fanSize = gw * 0.7;
+        const fanX = x + (gw - fanSize) / 2;
+        const fanY = y + gw * 0.08;
+        ctx.fillStyle = '#1a1a1a';
+        ctx.fillRect(fanX, fanY, fanSize, fanSize);
+        // Fan circle + blades
+        const fcx = fanX + fanSize / 2;
+        const fcy = fanY + fanSize / 2;
+        const fr = fanSize / 2 - 2;
+        ctx.beginPath();
+        ctx.arc(fcx, fcy, fr, 0, Math.PI * 2);
+        ctx.fillStyle = '#333';
+        ctx.fill();
+        if (gw >= 20) {
+          // Fan blades
+          const bladeR = fr - 2;
+          // save/restore keeps the translate/rotate local to this fan.
+          ctx.save();
+          ctx.translate(fcx, fcy);
+          ctx.rotate(fanSpeed + i * 0.5); // offset per GPU
+          const bladeCount = gw >= 40 ? 7 : 5;
+          for (let b = 0; b < bladeCount; b++) {
+            // Rotations accumulate, so each blade is one step further around.
+            ctx.rotate(Math.PI * 2 / bladeCount);
+            ctx.beginPath();
+            ctx.moveTo(0, 0);
+            ctx.quadraticCurveTo(bladeR * 0.5, bladeR * 0.3, bladeR * 0.85, 0);
+            ctx.quadraticCurveTo(bladeR * 0.5, -bladeR * 0.3, 0, 0);
+            ctx.fillStyle = '#555';
+            ctx.fill();
+          }
+          ctx.restore();
+        }
+        // Heatsink lines below fan
+        const heatY = fanY + fanSize + 2;
+        const heatH = gh - (heatY - y) - gw * 0.15;
+        if (heatH > 4) {
+          const lineH = Math.max(1, Math.min(4, heatH / 6));
+          const lineGap = lineH * 1.5;
+          // Alternate two greys while stepping down the heatsink region.
+          for (let li = 0; li * lineGap < heatH; li++) {
+            ctx.fillStyle = li % 2 === 0 ? '#444' : '#383838';
+            ctx.fillRect(x + gw * 0.1, heatY + li * lineGap, gw * 0.8, lineH);
+          }
+        }
+        // Gold pins at bottom
+        if (gw >= 30) {
+          ctx.fillStyle = '#c4a020';
+          const pinW = Math.max(1, gw * 0.06);
+          const pinCount = Math.floor(gw * 0.7 / (pinW * 2));
+          const pinStart = x + (gw - pinCount * pinW * 2) / 2;
+          // Pins start at y + gh, i.e. directly below the GPU body.
+          for (let p = 0; p < pinCount; p++) {
+            ctx.fillRect(pinStart + p * pinW * 2, y + gh, pinW, Math.max(2, gw * 0.08));
+          }
+        }
+      } else {
+        // Too small for detail — just a green LED dot
+        ctx.fillStyle = '#8a8';
+        const dotR = Math.max(0.5, gw * 0.1);
+        ctx.fillRect(x + gw/2 - dotR, y + gh * 0.8, dotR * 2, dotR * 2);
+      }
+    }
+  }
+  // Book cover colors
+  const bookColors = ['#8b4513','#2e4057','#6b3a3a','#3a5a3a','#4a3a6b','#5a3a2e','#2e3a5a','#6b5a3a'];
+  function drawItem(p) {
+    const alpha = p.x > FADE_START ? 1 - (p.x - FADE_START) / (W - FADE_START) : 1;
+    ctx.globalAlpha = alpha;
+    if (p.type === 'book') {
+      const bw = 16, bh = 20;
+      // Shadow
+      ctx.fillStyle = 'rgba(0,0,0,0.12)';
+      ctx.fillRect(p.x + 2, p.y + 2, bw, bh);
+      // Cover
+      ctx.fillStyle = p.color;
+      ctx.fillRect(p.x, p.y, bw, bh);
+      // Spine
+      ctx.fillStyle = 'rgba(0,0,0,0.25)';
+      ctx.fillRect(p.x, p.y, 3, bh);
+      // Page edges (right side lighter stripe)
+      ctx.fillStyle = '#f0ece4';
+      ctx.fillRect(p.x + bw - 2, p.y + 1, 2, bh - 2);
+      // Title lines on cover
+      ctx.fillStyle = 'rgba(255,255,255,0.3)';
+      ctx.fillRect(p.x + 5, p.y + 5, 8, 1.5);
+      ctx.fillRect(p.x + 5, p.y + 8, 6, 1.5);
+      // Border
+      ctx.strokeStyle = '#333';
+      ctx.lineWidth = 0.5;
+      ctx.strokeRect(p.x, p.y, bw, bh);
+    } else {
+      const pw = 14, ph = 18;
+      // Shadow
+      ctx.fillStyle = 'rgba(0,0,0,0.08)';
+      ctx.fillRect(p.x + 2, p.y + 2, pw, ph);
+      // Page body
+      ctx.fillStyle = p.color;
+      ctx.fillRect(p.x, p.y, pw, ph);
+      ctx.strokeStyle = '#aaa';
+      ctx.lineWidth = 0.5;
+      ctx.strokeRect(p.x, p.y, pw, ph);
+      // Text lines
+      ctx.fillStyle = '#ccc';
+      for (let i = 0; i < 4; i++) {
+        const lw = 6 + Math.sin(p.id + i) * 4;
+        ctx.fillRect(p.x + 2, p.y + 3 + i * 3, lw, 1);
+      }
+    }
+    ctx.globalAlpha = 1;
+  }
+  function isBookMode() {
+    return getPps() / PAGES_PER_BOOK >= 1;
+  }
+  function spawnItem() {
+    const yMin = 30, yMax = H - 60;
+    const y = yMin + Math.random() * (yMax - yMin);
+    const bookMode = isBookMode();
+    // Use effective rate for speed calculation
+    const rate = bookMode ? getPps() / PAGES_PER_BOOK : getPps();
+    const baseSpeed = 2 + Math.log10(Math.max(1, rate)) * 2;
+    const speed = baseSpeed + Math.random() * baseSpeed;
+    pages.push({
+      x: getSpawnX() + Math.random() * 20,
+      y,
+      speed,
+      type: bookMode ? 'book' : 'page',
+      color: bookMode
+        ? bookColors[Math.floor(Math.random() * bookColors.length)]
+        : pageColors[Math.floor(Math.random() * pageColors.length)],
+      id: totalPages
+    });
+    totalPages += bookMode ? PAGES_PER_BOOK : 1;
+  }
+  function formatNum(n) {
+    if (n >= 1_000_000_000) return (n / 1_000_000_000).toFixed(1) + 'B';
+    if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
+    if (n >= 1_000) return (n / 1_000).toFixed(1) + 'K';
+    return n.toFixed(0);
+  }
+  function frame(now) {
+    const dt = Math.min((now - lastTime) / 1000, 0.1); // cap at 100ms
+    lastTime = now;
+    ctx.clearRect(0, 0, W, H);
+    // Background gradient hint
+    ctx.fillStyle = '#fafafa';
+    ctx.fillRect(0, 0, W, H);
+    // Arrow flow hint (subtle)
+    ctx.strokeStyle = '#e0e0e0';
+    ctx.lineWidth = 1;
+    ctx.setLineDash([4, 8]);
+    for (let y = 60; y < H - 30; y += 40) {
+      ctx.beginPath();
+      ctx.moveTo(getSpawnX(), y);
+      ctx.lineTo(W - 20, y);
+      ctx.stroke();
+    }
+    ctx.setLineDash([]);
+    // Spawn items — books when fast enough, pages otherwise
+    const pps = getPps();
+    const bookMode = isBookMode();
+    const spawnRate = bookMode ? pps / PAGES_PER_BOOK : pps;
+    spawnAccum += spawnRate * dt;
+    while (spawnAccum >= 1) {
+      spawnItem();
+      spawnAccum -= 1;
+    }
+    // Update and draw items
+    for (let i = pages.length - 1; i >= 0; i--) {
+      pages[i].x += pages[i].speed;
+      if (pages[i].x > W + 40) {
+        pages.splice(i, 1);
+        continue;
+      }
+      drawItem(pages[i]);
+    }
+    // Draw GPUs on top
+    drawGPUs();
+    // Update metrics — switch to books when throughput is high enough
+    const totalTps = getTps();
+    const bps = pps / PAGES_PER_BOOK;
+    if (bps >= 1) {
+      ppsEl.textContent = formatNum(bps);
+      ppsLabelEl.textContent = 'Books / second';
+    } else {
+      ppsEl.textContent = formatNum(pps);
+      ppsLabelEl.textContent = 'Pages / second';
+    }
+    tpsEl.textContent = formatNum(totalTps);
+    const totalBooks = totalPages / PAGES_PER_BOOK;
+    if (totalBooks >= 1) {
+      totalEl.textContent = formatNum(totalBooks);
+      totalLabelEl.textContent = 'Books generated';
+    } else {
+      totalEl.textContent = formatNum(totalPages);
+      totalLabelEl.textContent = 'Pages generated';
+    }
+    requestAnimationFrame(frame);
+  }
+  gpuLabel.textContent = getGpuCount().toLocaleString();
+  requestAnimationFrame(frame);
+</script>
+</body>
+</html>