joelniklaus HF Staff committed on
Commit
2cd9a9d
·
1 Parent(s): 3e302cb

add leandro's visualization

Browse files
app/src/content/chapters/infrastructure.mdx CHANGED
@@ -443,3 +443,11 @@ python examples/inference/benchmark/generate_data.py \
443
  ```
444
 
445
  With a trillion-parameter model you won't be generating billions of tokens per hour, but you don't need to. A few thousand high-quality reasoning traces from a frontier model can be worth more than millions of tokens from a smaller one.
 
 
 
 
 
 
 
 
 
443
  ```
444
 
445
  With a trillion-parameter model you won't be generating billions of tokens per hour, but you don't need to. A few thousand high-quality reasoning traces from a frontier model can be worth more than millions of tokens from a smaller one.
446
+
447
+ To get an intuition for what these throughput numbers feel like, <FigRef target="inference-throughput" /> lets you pick a model and scale up the number of GPUs. Each page represents roughly 500 tokens of generated text. At high enough throughput, pages roll up into books (200 pages each).
448
+
449
+ <HtmlEmbed
450
+ id="inference-throughput"
451
+ src="inference-throughput.html"
452
+ caption="Interactive GPU throughput simulator. Select a model and adjust the number of H100 GPUs to see how fast pages (500 tokens each) or books (200 pages each) are generated."
453
+ />
app/src/content/embeds/inference-throughput.html ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Demo 6: GPU Page Generator</title>
6
+ <style>
7
+ * { margin: 0; padding: 0; box-sizing: border-box; }
8
+ body { background: #fff; font-family: system-ui, sans-serif; display: flex; flex-direction: column; align-items: center; padding: 30px; }
9
+ h2 { margin-bottom: 4px; }
10
+ .subtitle { color: #666; margin-bottom: 20px; font-size: 14px; }
11
+ .controls { display: flex; gap: 24px; align-items: flex-end; margin-bottom: 16px; flex-wrap: wrap; justify-content: center; }
12
+ .control-group { display: flex; flex-direction: column; gap: 4px; }
13
+ .control-group label { font-size: 12px; font-weight: 600; color: #444; text-transform: uppercase; letter-spacing: 0.5px; }
14
+ select, input[type=range] { font-size: 14px; font-family: inherit; }
15
+ select { padding: 6px 10px; border: 2px solid #000; background: #fff; cursor: pointer; }
16
+ input[type=range] { width: 200px; accent-color: #2e5f7e; }
17
+ .gpu-count { font-size: 14px; font-weight: 700; color: #2e5f7e; min-width: 60px; text-align: right; font-variant-numeric: tabular-nums; }
18
+ canvas { border: 2px solid #000; background: #fafafa; }
19
+ .metrics { display: flex; gap: 32px; margin-top: 14px; flex-wrap: wrap; justify-content: center; }
20
+ .metric { text-align: center; }
21
+ .metric .value { font-size: 28px; font-weight: 700; color: #2e5f7e; font-variant-numeric: tabular-nums; }
22
+ .metric .label { font-size: 11px; color: #888; text-transform: uppercase; letter-spacing: 0.5px; }
23
+ </style>
24
+ </head>
25
+ <body>
26
+ <h2>GPU Page Generator</h2>
27
+ <div class="subtitle">1 page ≈ 500 tokens ≈ 1,800 characters</div>
28
+
29
+ <div class="controls">
30
+ <div class="control-group">
31
+ <label>Model</label>
32
+ <select id="model">
33
+ <option value="45540">SmolLM2-135M (45,540 tps/gpu)</option>
34
+ <option value="8086">Qwen3-4B (8,086 tps/gpu)</option>
35
+ <option value="6443">Qwen3-8B (6,443 tps/gpu)</option>
36
+ <option value="6117">GPT-OSS-120B (6,117 tps/gpu)</option>
37
+ <option value="1724">Gemma-3-27B (1,724 tps/gpu)</option>
38
+ </select>
39
+ </div>
40
+ <div class="control-group">
41
+ <label>GPUs</label>
42
+ <div style="display:flex;align-items:center;gap:8px;">
43
+ <input type="range" id="gpus" min="0" max="10" step="0.01" value="0">
44
+ <span class="gpu-count" id="gpuLabel">1</span>
45
+ </div>
46
+ </div>
47
+ </div>
48
+
49
+ <canvas id="c" width="1000" height="400"></canvas>
50
+
51
+ <div class="metrics">
52
+ <div class="metric">
53
+ <div class="value" id="pps">0</div>
54
+ <div class="label" id="ppsLabel">Pages / second</div>
55
+ </div>
56
+ <div class="metric">
57
+ <div class="value" id="tps">0</div>
58
+ <div class="label">Tokens / second</div>
59
+ </div>
60
+ <div class="metric">
61
+ <div class="value" id="total">0</div>
62
+ <div class="label" id="totalLabel">Pages generated</div>
63
+ </div>
64
+ </div>
65
+
66
+ <script>
67
+ const canvas = document.getElementById('c');
68
+ const ctx = canvas.getContext('2d');
69
+ const W = 1000, H = 400;
70
+
71
+ const TOKENS_PER_PAGE = 500;
72
+ const GPU_AREA_X = 10, GPU_AREA_Y = 10, GPU_AREA_W = 200, GPU_AREA_H = H - 20;
73
+ const FADE_START = W - 120;
74
+
75
/**
 * X coordinate at which new pages/books enter the canvas:
 * 15px to the right of the GPU area's right edge.
 *
 * @returns {number} Spawn position in canvas pixels.
 */
function getSpawnX() {
  const gpuAreaRight = GPU_AREA_X + GPU_AREA_W;
  return gpuAreaRight + 15;
}
78
+
79
+ const modelSelect = document.getElementById('model');
80
+ const gpuSlider = document.getElementById('gpus');
81
+ const gpuLabel = document.getElementById('gpuLabel');
82
+ const ppsEl = document.getElementById('pps');
83
+ const ppsLabelEl = document.getElementById('ppsLabel');
84
+ const tpsEl = document.getElementById('tps');
85
+ const totalEl = document.getElementById('total');
86
+ const totalLabelEl = document.getElementById('totalLabel');
87
+
88
+ const PAGES_PER_BOOK = 200; // ~100K tokens per book
89
+
90
+ let pages = [];
91
+ let totalPages = 0;
92
+ let spawnAccum = 0;
93
+ let lastTime = performance.now();
94
+
95
+ // Page colors — slight variations
96
+ const pageColors = ['#f5f0e8', '#f0ece4', '#ebe7df', '#f2ede5', '#e8e4dc'];
97
+
98
/**
 * Number of GPUs currently selected on the slider.
 *
 * The slider is logarithmic: its 0-10 range maps to 2^0 .. 2^10,
 * i.e. 1-1024 GPUs, rounded to the nearest whole GPU.
 *
 * @returns {number} Whole GPU count.
 */
function getGpuCount() {
  const exponent = Number(gpuSlider.value);
  return Math.round(2 ** exponent);
}
102
+
103
/**
 * Aggregate throughput for the selected model across all selected GPUs.
 *
 * The selected <option>'s value holds the per-GPU tokens/second figure as a
 * decimal string; it is parsed with an explicit radix of 10.
 *
 * @returns {number} Tokens per second (per-GPU rate times GPU count).
 */
function getTps() {
  const perGpuTps = Number.parseInt(modelSelect.value, 10);
  return perGpuTps * getGpuCount();
}
106
+
107
/**
 * Pages generated per second at the current throughput.
 *
 * @returns {number} Tokens per second divided by TOKENS_PER_PAGE.
 */
function getPps() {
  const tokensPerSecond = getTps();
  return tokensPerSecond / TOKENS_PER_PAGE;
}
110
+
111
+ gpuSlider.addEventListener('input', () => {
112
+ gpuLabel.textContent = getGpuCount().toLocaleString();
113
+ });
114
+
115
// Grid shape by GPU count: the first entry whose `upTo` covers the count wins.
const GPU_GRID_STEPS = [
  { upTo: 1, cols: 1, rows: 1 },
  { upTo: 2, cols: 2, rows: 1 },
  { upTo: 4, cols: 2, rows: 2 },
  { upTo: 8, cols: 4, rows: 2 },
  { upTo: 16, cols: 4, rows: 4 },
  { upTo: 32, cols: 8, rows: 4 },
  { upTo: 64, cols: 8, rows: 8 },
  { upTo: 128, cols: 16, rows: 8 },
  { upTo: 256, cols: 16, rows: 16 },
  { upTo: 512, cols: 32, rows: 16 },
];
// Shape used when the count is larger than every `upTo` above.
const GPU_GRID_FALLBACK = { cols: 32, rows: 32 };

/**
 * Draws one GPU icon per selected GPU, laid out as a centered grid inside
 * the GPU area (GPU_AREA_X/Y/W/H).
 *
 * Each icon keeps a 0.6 width:height ratio. The amount of detail depends on
 * the icon width: below 12px only a small LED dot is drawn; from 12px a fan
 * housing, fan circle and heatsink stripes; from 20px rotating fan blades;
 * from 30px a row of gold pins under the card.
 */
function drawGPUs() {
  const gpuTotal = getGpuCount();
  const grid = GPU_GRID_STEPS.find((step) => gpuTotal <= step.upTo) ?? GPU_GRID_FALLBACK;
  const { cols, rows } = grid;

  // Tighter spacing as the grid gets denser.
  let gap;
  if (gpuTotal <= 16) {
    gap = 3;
  } else if (gpuTotal <= 64) {
    gap = 2;
  } else {
    gap = 1;
  }

  // Largest icon that fits a cell while keeping the 0.6 aspect ratio.
  const cellW = (GPU_AREA_W - gap * (cols - 1)) / cols;
  const cellH = (GPU_AREA_H - gap * (rows - 1)) / rows;
  const gw = Math.min(cellW, cellH * 0.6);
  const gh = gw / 0.6;

  // Center the whole grid inside the GPU area.
  const gridW = cols * gw + (cols - 1) * gap;
  const gridH = rows * gh + (rows - 1) * gap;
  const originX = GPU_AREA_X + (GPU_AREA_W - gridW) / 2;
  const originY = GPU_AREA_Y + (GPU_AREA_H - gridH) / 2;

  // Shared blade angle: elapsed time scaled by pages/second (capped at 100).
  const fanAngle = (performance.now() / 200) * Math.min(getPps(), 100);
  const outlineWidth = gpuTotal <= 16 ? 1.5 : 0.5;

  for (let idx = 0; idx < gpuTotal; idx += 1) {
    const x = originX + (idx % cols) * (gw + gap);
    const y = originY + Math.floor(idx / cols) * (gh + gap);

    // Card body with outline.
    ctx.fillStyle = '#2a2a2a';
    ctx.fillRect(x, y, gw, gh);
    ctx.strokeStyle = '#000';
    ctx.lineWidth = outlineWidth;
    ctx.strokeRect(x, y, gw, gh);

    if (gw < 12) {
      // Too small for detail: just an LED dot near the bottom.
      ctx.fillStyle = '#8a8';
      const dotR = Math.max(0.5, gw * 0.1);
      ctx.fillRect(x + gw / 2 - dotR, y + gh * 0.8, dotR * 2, dotR * 2);
      continue;
    }

    // Square fan housing near the top of the card.
    const fanSize = gw * 0.7;
    const fanX = x + (gw - fanSize) / 2;
    const fanY = y + gw * 0.08;
    ctx.fillStyle = '#1a1a1a';
    ctx.fillRect(fanX, fanY, fanSize, fanSize);

    // Fan circle inside the housing.
    const hubX = fanX + fanSize / 2;
    const hubY = fanY + fanSize / 2;
    const hubR = fanSize / 2 - 2;
    ctx.beginPath();
    ctx.arc(hubX, hubY, hubR, 0, Math.PI * 2);
    ctx.fillStyle = '#333';
    ctx.fill();

    if (gw >= 20) {
      // Blades, drawn in a coordinate system centered on the fan.
      const bladeR = hubR - 2;
      const bladeCount = gw >= 40 ? 7 : 5;
      const bladeStep = (Math.PI * 2) / bladeCount;
      ctx.save();
      ctx.translate(hubX, hubY);
      ctx.rotate(fanAngle + idx * 0.5); // per-GPU phase offset
      ctx.fillStyle = '#555';
      for (let blade = 0; blade < bladeCount; blade += 1) {
        ctx.rotate(bladeStep);
        ctx.beginPath();
        ctx.moveTo(0, 0);
        ctx.quadraticCurveTo(bladeR * 0.5, bladeR * 0.3, bladeR * 0.85, 0);
        ctx.quadraticCurveTo(bladeR * 0.5, -bladeR * 0.3, 0, 0);
        ctx.fill();
      }
      ctx.restore();
    }

    // Heatsink stripes between the fan and the bottom margin.
    const heatY = fanY + fanSize + 2;
    const heatH = gh - (heatY - y) - gw * 0.15;
    if (heatH > 4) {
      const stripeH = Math.max(1, Math.min(4, heatH / 6));
      const stripePitch = stripeH * 1.5;
      for (let s = 0; s * stripePitch < heatH; s += 1) {
        ctx.fillStyle = s % 2 === 0 ? '#444' : '#383838';
        ctx.fillRect(x + gw * 0.1, heatY + s * stripePitch, gw * 0.8, stripeH);
      }
    }

    // Gold pins just below the card's bottom edge.
    if (gw >= 30) {
      ctx.fillStyle = '#c4a020';
      const pinW = Math.max(1, gw * 0.06);
      const pinPitch = pinW * 2;
      const pinCount = Math.floor(gw * 0.7 / pinPitch);
      const pinStart = x + (gw - pinCount * pinPitch) / 2;
      const pinH = Math.max(2, gw * 0.08);
      for (let pin = 0; pin < pinCount; pin += 1) {
        ctx.fillRect(pinStart + pin * pinPitch, y + gh, pinW, pinH);
      }
    }
  }
}
226
+
227
+ // Book cover colors
228
+ const bookColors = ['#8b4513','#2e4057','#6b3a3a','#3a5a3a','#4a3a6b','#5a3a2e','#2e3a5a','#6b5a3a'];
229
+
230
/**
 * Draws a 16x20 book icon with its top-left corner at (item.x, item.y).
 * Layers, back to front: drop shadow, cover, spine, page-edge stripe,
 * two title lines, outline.
 */
function drawBook(item) {
  const bookW = 16;
  const bookH = 20;
  const { x, y } = item;

  ctx.fillStyle = 'rgba(0,0,0,0.12)'; // shadow
  ctx.fillRect(x + 2, y + 2, bookW, bookH);

  ctx.fillStyle = item.color; // cover
  ctx.fillRect(x, y, bookW, bookH);

  ctx.fillStyle = 'rgba(0,0,0,0.25)'; // spine
  ctx.fillRect(x, y, 3, bookH);

  ctx.fillStyle = '#f0ece4'; // page edges on the right side
  ctx.fillRect(x + bookW - 2, y + 1, 2, bookH - 2);

  ctx.fillStyle = 'rgba(255,255,255,0.3)'; // title lines
  ctx.fillRect(x + 5, y + 5, 8, 1.5);
  ctx.fillRect(x + 5, y + 8, 6, 1.5);

  ctx.strokeStyle = '#333'; // outline
  ctx.lineWidth = 0.5;
  ctx.strokeRect(x, y, bookW, bookH);
}

/**
 * Draws a 14x18 page icon with its top-left corner at (item.x, item.y):
 * drop shadow, page body with outline, and four "text" lines whose lengths
 * are derived from item.id so they stay stable from frame to frame.
 */
function drawPage(item) {
  const pageW = 14;
  const pageH = 18;
  const { x, y } = item;

  ctx.fillStyle = 'rgba(0,0,0,0.08)'; // shadow
  ctx.fillRect(x + 2, y + 2, pageW, pageH);

  ctx.fillStyle = item.color; // page body
  ctx.fillRect(x, y, pageW, pageH);
  ctx.strokeStyle = '#aaa';
  ctx.lineWidth = 0.5;
  ctx.strokeRect(x, y, pageW, pageH);

  ctx.fillStyle = '#ccc'; // text lines
  for (let line = 0; line < 4; line += 1) {
    const lineLen = 6 + Math.sin(item.id + line) * 4;
    ctx.fillRect(x + 2, y + 3 + line * 3, lineLen, 1);
  }
}

/**
 * Draws one flying item (book or page) and fades it out as it nears the
 * right edge of the canvas.
 *
 * @param {{x: number, y: number, type: string, color: string, id: number}} p
 *   Item to draw; `type === 'book'` selects the book icon, anything else the
 *   page icon.
 */
function drawItem(p) {
  // Opacity falls linearly from 1 at FADE_START to 0 at W. Beyond W the
  // computed value is negative; such items are already outside the
  // W-pixel-wide drawing area.
  let opacity = 1;
  if (p.x > FADE_START) {
    opacity = 1 - (p.x - FADE_START) / (W - FADE_START);
  }
  ctx.globalAlpha = opacity;

  if (p.type === 'book') {
    drawBook(p);
  } else {
    drawPage(p);
  }

  // Restore full opacity for whatever is drawn next.
  ctx.globalAlpha = 1;
}
277
+
278
/**
 * Whether throughput is high enough to show books instead of single pages,
 * i.e. at least one full book (PAGES_PER_BOOK pages) is produced per second.
 *
 * @returns {boolean}
 */
function isBookMode() {
  const booksPerSecond = getPps() / PAGES_PER_BOOK;
  return booksPerSecond >= 1;
}
281
+
282
/**
 * Creates one new item (a book in book mode, otherwise a page), appends it
 * to `pages`, and adds the pages it represents to `totalPages`.
 *
 * The item starts just right of the GPU area at a random height. Its speed
 * grows with the logarithm of the current item rate and is randomized
 * between 1x and 2x of that base speed.
 */
function spawnItem() {
  const top = 30;
  const bottom = H - 60;
  const y = top + Math.random() * (bottom - top);

  const asBook = isBookMode();

  // Effective spawn rate: books/second in book mode, pages/second otherwise.
  let rate = getPps();
  if (asBook) {
    rate /= PAGES_PER_BOOK;
  }
  const baseSpeed = 2 + Math.log10(Math.max(1, rate)) * 2;
  const speed = baseSpeed + Math.random() * baseSpeed;

  const x = getSpawnX() + Math.random() * 20;
  const palette = asBook ? bookColors : pageColors;
  const color = palette[Math.floor(Math.random() * palette.length)];

  pages.push({
    x,
    y,
    speed,
    type: asBook ? 'book' : 'page',
    color,
    id: totalPages,
  });

  totalPages += asBook ? PAGES_PER_BOOK : 1;
}
302
+
303
/**
 * Formats a number compactly for the metric read-outs.
 *
 * Values that round to less than 1,000 are shown as whole numbers; larger
 * values get one decimal and a K / M / B suffix.
 *
 * The unit is chosen *after* rounding, so a value just below a boundary is
 * promoted to the next unit instead of overflowing the current one
 * (999999 -> "1.0M" rather than "1000.0K", 999.6 -> "1.0K" rather than
 * "1000"). Values beyond the billions stay in "B".
 *
 * @param {number} n - Value to format.
 * @returns {string} Compact representation, e.g. "42", "1.5K", "2.5M", "3.0B".
 */
function formatNum(n) {
  if (Number(n.toFixed(0)) < 1_000) {
    return n.toFixed(0);
  }

  const suffixes = ['K', 'M', 'B'];
  let scaled = n / 1_000;
  let unit = 0;
  // Move up a unit while the rounded display value would read 1000.0 or more.
  while (unit < suffixes.length - 1 && Number(scaled.toFixed(1)) >= 1_000) {
    scaled /= 1_000;
    unit += 1;
  }
  return `${scaled.toFixed(1)}${suffixes[unit]}`;
}
309
+
310
/**
 * One animation step; re-schedules itself with requestAnimationFrame.
 *
 * Per frame it clears the canvas, draws the dashed flow guides, spawns
 * items at the current rate, advances and draws the existing items, draws
 * the GPUs on top, and refreshes the three metric read-outs.
 *
 * Item movement is scaled by the elapsed time, so on-screen speed does not
 * depend on the display refresh rate: `speed` is treated as pixels per
 * 1/60 s, which matches the previous per-frame movement on a 60 Hz display.
 *
 * @param {number} now - Timestamp passed in by requestAnimationFrame (ms).
 */
function frame(now) {
  // Elapsed seconds, capped at 100ms so a long pause does not cause a burst.
  // Also floored at 0 in case the first timestamp is earlier than the
  // performance.now() reading taken at load (NOTE(review): defensive).
  const dt = Math.min(Math.max((now - lastTime) / 1000, 0), 0.1);
  lastTime = now;

  ctx.clearRect(0, 0, W, H);

  // Solid background fill
  ctx.fillStyle = '#fafafa';
  ctx.fillRect(0, 0, W, H);

  // Arrow flow hint (subtle): dashed horizontal guide lines
  ctx.strokeStyle = '#e0e0e0';
  ctx.lineWidth = 1;
  ctx.setLineDash([4, 8]);
  for (let y = 60; y < H - 30; y += 40) {
    ctx.beginPath();
    ctx.moveTo(getSpawnX(), y);
    ctx.lineTo(W - 20, y);
    ctx.stroke();
  }
  ctx.setLineDash([]);

  // Spawn items — books when fast enough, pages otherwise
  const pps = getPps();
  const bps = pps / PAGES_PER_BOOK;
  const spawnRate = isBookMode() ? bps : pps;

  // Carry fractional items over to the next frame.
  spawnAccum += spawnRate * dt;
  while (spawnAccum >= 1) {
    spawnItem();
    spawnAccum -= 1;
  }

  // Update and draw items. Iterate backwards so splice() does not skip any.
  const frameScale = dt * 60; // 1.0 for a 60 Hz frame
  for (let i = pages.length - 1; i >= 0; i--) {
    const item = pages[i];
    item.x += item.speed * frameScale;
    if (item.x > W + 40) {
      pages.splice(i, 1);
      continue;
    }
    drawItem(item);
  }

  // Draw GPUs on top
  drawGPUs();

  // Update metrics — switch to books when throughput is high enough
  if (bps >= 1) {
    ppsEl.textContent = formatNum(bps);
    ppsLabelEl.textContent = 'Books / second';
  } else {
    ppsEl.textContent = formatNum(pps);
    ppsLabelEl.textContent = 'Pages / second';
  }
  tpsEl.textContent = formatNum(getTps());

  const totalBooks = totalPages / PAGES_PER_BOOK;
  if (totalBooks >= 1) {
    totalEl.textContent = formatNum(totalBooks);
    totalLabelEl.textContent = 'Books generated';
  } else {
    totalEl.textContent = formatNum(totalPages);
    totalLabelEl.textContent = 'Pages generated';
  }

  requestAnimationFrame(frame);
}
378
+
379
+ gpuLabel.textContent = getGpuCount().toLocaleString();
380
+ requestAnimationFrame(frame);
381
+ </script>
382
+ </body>
383
+ </html>