Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>LLM Inference — Pipeline Simulator</title> | |
| <style> | |
| :root { | |
| --bg: #0d1117; | |
| --surface: #161b22; | |
| --border: #30363d; | |
| --text: #c9d1d9; | |
| --dim: #8b949e; | |
| --accent: #58a6ff; | |
| --cpu: #f0883e; | |
| --ram: #3fb950; | |
| --gpu: #bc8cff; | |
| --vram: #f778ba; | |
| --prefill: #da3633; | |
| --decode: #d29922; | |
| --active: #58a6ff; | |
| } | |
| * { margin: 0; padding: 0; box-sizing: border-box; } | |
| body { | |
| background: var(--bg); | |
| color: var(--text); | |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', monospace; | |
| display: flex; justify-content: center; align-items: center; | |
| min-height: 100vh; padding: 2rem; | |
| } | |
| .sim { | |
| max-width: 820px; width: 100%; | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 12px; padding: 2rem; | |
| } | |
| h1 { font-size: 1.25rem; font-weight: 600; margin-bottom: 0.25rem; color: var(--text); } | |
| .subtitle { font-size: 0.8rem; color: var(--dim); margin-bottom: 1.5rem; } | |
| /* Pipeline */ | |
| .pipeline { | |
| display: flex; align-items: center; justify-content: center; | |
| gap: 0; margin-bottom: 1.5rem; flex-wrap: wrap; | |
| } | |
| .stage { | |
| padding: 0.6rem 1rem; border-radius: 8px; | |
| border: 2px solid var(--border); background: var(--bg); | |
| font-size: 0.85rem; font-weight: 600; | |
| text-align: center; min-width: 80px; | |
| transition: all 0.3s ease; | |
| position: relative; z-index: 2; | |
| } | |
| .stage.active { | |
| border-color: var(--active); | |
| background: #1a2332; | |
| box-shadow: 0 0 16px rgba(88,166,255,0.25); | |
| color: var(--active); | |
| } | |
| .arrow { | |
| width: 32px; height: 2px; background: var(--border); | |
| margin: 0 4px; flex-shrink: 0; position: relative; z-index: 1; | |
| } | |
| .arrow::after { | |
| content: '▸'; position: absolute; right: -6px; top: -8px; | |
| font-size: 0.7rem; color: var(--border); | |
| } | |
| /* Decode loop indicator */ | |
| .decode-loop { | |
| display: flex; align-items: center; gap: 6px; | |
| margin-top: 0.3rem; justify-content: center; | |
| font-size: 0.7rem; color: var(--dim); | |
| } | |
| .loop-arrow { | |
| width: 20px; height: 2px; background: var(--dim); | |
| } | |
| .loop-count { | |
| padding: 2px 8px; border: 1px solid var(--border); | |
| border-radius: 4px; font-size: 0.7rem; | |
| } | |
| .loop-count.active { | |
| border-color: var(--decode); color: var(--decode); | |
| background: rgba(210,153,34,0.1); | |
| } | |
| /* Resources */ | |
| .resources { | |
| display: grid; grid-template-columns: repeat(4, 1fr); | |
| gap: 1rem; margin-bottom: 1.5rem; | |
| } | |
| .res { | |
| border: 2px solid var(--border); border-radius: 8px; | |
| padding: 0.6rem 0.4rem; background: var(--bg); text-align: center; | |
| transition: all 0.3s ease; | |
| } | |
| .res-label { | |
| font-size: 0.7rem; text-transform: uppercase; | |
| letter-spacing: 0.05em; margin-bottom: 0.15rem; | |
| } | |
| .res-status { font-size: 0.65rem; color: var(--dim); } | |
| .res.cpu .res-label { color: var(--cpu); } | |
| .res.ram .res-label { color: var(--ram); } | |
| .res.gpu .res-label { color: var(--gpu); } | |
| .res.vram .res-label { color: var(--vram); } | |
| .res.active.cpu { border-color: var(--cpu); box-shadow: 0 0 10px rgba(240,136,62,0.25); } | |
| .res.active.ram { border-color: var(--ram); box-shadow: 0 0 10px rgba(63,185,80,0.25); } | |
| .res.active.gpu { border-color: var(--gpu); box-shadow: 0 0 10px rgba(188,140,255,0.25); } | |
| .res.active.vram { border-color: var(--vram); box-shadow: 0 0 10px rgba(247,120,186,0.25); } | |
| .res.active .res-status { color: inherit; } | |
| /* Token display */ | |
| .token-area { | |
| background: var(--bg); border: 1px solid var(--border); | |
| border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem; | |
| min-height: 80px; | |
| } | |
| .token-label { | |
| font-size: 0.65rem; text-transform: uppercase; | |
| letter-spacing: 0.05em; color: var(--dim); margin-bottom: 0.5rem; | |
| } | |
| .token-row { | |
| display: flex; gap: 6px; flex-wrap: wrap; align-items: center; | |
| } | |
| .token { | |
| padding: 4px 10px; border-radius: 6px; font-size: 0.8rem; | |
| border: 1px solid var(--border); background: var(--surface); | |
| font-family: monospace; transition: all 0.3s ease; | |
| } | |
| .token.prefill { border-color: var(--prefill); background: rgba(218,54,51,0.1); } | |
| .token.decode { border-color: var(--decode); background: rgba(210,153,34,0.1); } | |
| .token.new { border-color: var(--active); background: rgba(88,166,255,0.1); animation: pulse 0.6s ease; } | |
| .token.cached { border-color: var(--ram); opacity: 0.6; font-size: 0.7rem; } | |
| @keyframes pulse { | |
| 0%,100% { transform: scale(1); } | |
| 50% { transform: scale(1.08); } | |
| } | |
| /* KV cache */ | |
| .kvcache { | |
| background: var(--bg); border: 1px solid var(--border); | |
| border-radius: 8px; padding: 0.75rem 1rem; margin-bottom: 1.5rem; | |
| display: flex; align-items: center; gap: 0.75rem; | |
| } | |
| .kvcache-label { | |
| font-size: 0.65rem; text-transform: uppercase; | |
| letter-spacing: 0.05em; color: var(--dim); white-space: nowrap; | |
| } | |
| .kvcache-entries { | |
| display: flex; gap: 4px; flex-wrap: wrap; | |
| } | |
| .kv-cell { | |
| width: 14px; height: 14px; border-radius: 3px; | |
| border: 1px solid var(--border); background: var(--surface); | |
| transition: all 0.3s ease; | |
| } | |
| .kv-cell.filled { background: var(--vram); border-color: var(--vram); } | |
| /* Status */ | |
| .status { | |
| font-size: 0.8rem; color: var(--text); margin-bottom: 1rem; | |
| text-align: center; min-height: 1.2em; | |
| } | |
| .status .highlight { color: var(--active); } | |
| /* Controls */ | |
| .controls { | |
| display: flex; gap: 0.75rem; justify-content: center; | |
| } | |
| button { | |
| padding: 0.5rem 1.25rem; border-radius: 6px; border: 1px solid var(--border); | |
| background: var(--bg); color: var(--text); font-size: 0.8rem; | |
| cursor: pointer; font-family: inherit; transition: all 0.2s; | |
| } | |
| button:hover { border-color: var(--dim); background: #1c2128; } | |
| button.primary { | |
| border-color: var(--accent); color: var(--accent); | |
| background: rgba(88,166,255,0.1); | |
| } | |
| button.primary:hover { background: rgba(88,166,255,0.2); } | |
| button:disabled { opacity: 0.4; cursor: not-allowed; } | |
| /* Step indicator dots */ | |
| .steps { | |
| display: flex; gap: 6px; justify-content: center; margin-bottom: 1rem; | |
| } | |
| .step-dot { | |
| width: 8px; height: 8px; border-radius: 50%; | |
| background: var(--border); transition: background 0.3s; | |
| } | |
| .step-dot.done { background: var(--accent); } | |
| .step-dot.current { background: var(--accent); box-shadow: 0 0 6px var(--accent); } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="sim"> | |
| <h1>LLM Inference Pipeline</h1> | |
| <p class="subtitle">Prompt: <strong>"The cat sat"</strong> — watch how it flows through the system</p> | |
| <!-- Step dots --> | |
| <div class="steps" id="stepDots"> | |
| <div class="step-dot"></div> | |
| <div class="step-dot"></div> | |
| <div class="step-dot"></div> | |
| <div class="step-dot"></div> | |
| <div class="step-dot"></div> | |
| </div> | |
| <!-- Pipeline stages --> | |
| <div class="pipeline"> | |
| <div class="stage" id="s_input">📥 Input</div> | |
| <div class="arrow"></div> | |
| <div class="stage" id="s_tokenize">🔤 Tokenize</div> | |
| <div class="arrow"></div> | |
| <div class="stage" id="s_prefill">⚡ Prefill</div> | |
| <div class="arrow"></div> | |
| <div class="stage" id="s_decode">🔁 Decode</div> | |
| <div class="arrow"></div> | |
| <div class="stage" id="s_output">📤 Output</div> | |
| </div> | |
| <div class="decode-loop" id="loopInfo"> | |
| <span class="loop-arrow"></span> | |
| <span>loop</span> | |
| <span class="loop-count" id="loopCount">0 / 3</span> | |
| </div> | |
| <!-- Resources --> | |
| <div class="resources"> | |
| <div class="res cpu" id="resCpu"> | |
| <div class="res-label">CPU</div> | |
| <div class="res-status">idle</div> | |
| </div> | |
| <div class="res ram" id="resRam"> | |
| <div class="res-label">RAM</div> | |
| <div class="res-status">idle</div> | |
| </div> | |
| <div class="res gpu" id="resGpu"> | |
| <div class="res-label">GPU Compute</div> | |
| <div class="res-status">idle</div> | |
| </div> | |
| <div class="res vram" id="resVram"> | |
| <div class="res-label">VRAM / KV Cache</div> | |
| <div class="res-status">idle</div> | |
| </div> | |
| </div> | |
| <!-- Tokens --> | |
| <div class="token-area"> | |
| <div class="token-label">Tokens</div> | |
| <div class="token-row" id="tokenRow"> | |
| <span style="color:var(--dim); font-size:0.8rem;">Waiting for input...</span> | |
| </div> | |
| </div> | |
| <!-- KV Cache --> | |
| <div class="kvcache"> | |
| <span class="kvcache-label">KV Cache entries:</span> | |
| <span class="kvcache-entries" id="kvEntries"> | |
| <span style="color:var(--dim); font-size:0.75rem;">empty</span> | |
| </span> | |
| </div> | |
| <!-- Status --> | |
| <div class="status" id="status">Click <span class="highlight">Auto</span> to run the full pipeline, or use <span class="highlight">Forward</span> / <span class="highlight">Back</span> to walk through one stage at a time.</div> | |
| <!-- Controls --> | |
| <div class="controls"> | |
| <button class="primary" id="btnAuto" onclick="toggleAuto()">Auto ▶</button> | |
| <button id="btnBack" onclick="back()">◀ Back</button> | |
| <button id="btnStep" onclick="step()">Forward ▶</button> | |
| <button id="btnReset" onclick="reset()">↺ Reset</button> | |
| </div> | |
| </div> | |
| <script> | |
// --- State ---
// Prompt tokens: `id` is the number rendered in the token chips, `text` the piece it stands for.
const TOKENS = [
  { id: 576, text: 'The' },
  { id: 3797, text: ' cat' },
  { id: 7236, text: ' sat' },
];
// Tokens the simulated model emits, in generation order.
const OUTPUT = [
  { id: 389, text: ' on' },
  { id: 278, text: ' the' },
  { id: 3098, text: ' mat' },
];
// Index of the final step. Step numbering:
//   0: idle, 1: input, 2: tokenize, 3: prefill,
//   4 .. 3 + OUTPUT.length: one decode step per output token,
//   4 + OUTPUT.length: detokenize
// Derived from OUTPUT so it cannot drift if the output list changes.
const TOTAL_STEPS = 4 + OUTPUT.length;
// Index of the step currently shown.
let currentStep = 0;
// setTimeout handle while auto-play is running, otherwise null.
let autoTimer = null;
// Written by reset(); no code visible in this file reads it.
let decodeIdx = 0;
// --- DOM refs ---
// Each element the simulator touches is looked up once and stored under its own id.
const els = Object.fromEntries(
  [
    's_input',
    's_tokenize',
    's_prefill',
    's_decode',
    's_output',
    'resCpu',
    'resRam',
    'resGpu',
    'resVram',
    'tokenRow',
    'kvEntries',
    'status',
    'loopCount',
    'loopInfo',
    'btnAuto',
    'btnStep',
    'btnBack',
  ].map((id) => [id, document.getElementById(id)]),
);
// The step dots are kept as the child collection of their container, not as a single element.
els.stepDots = document.getElementById('stepDots').children;
/**
 * Strip the `active` class from every entry of `els` that has a `classList`.
 * Entries without one (for example the `stepDots` collection) are skipped.
 */
function clearActive() {
  for (const node of Object.values(els)) {
    if (!node || !node.classList) continue;
    node.classList.remove('active');
  }
}
/**
 * Highlight the listed resource panels and mark every other panel idle.
 * @param {string[]} names - Keys of the panels to activate: 'cpu', 'ram', 'gpu', 'vram'.
 */
function setActiveResources(names) {
  const panels = [
    ['cpu', els.resCpu],
    ['ram', els.resRam],
    ['gpu', els.resGpu],
    ['vram', els.resVram],
  ];
  panels.forEach(([key, panel]) => {
    const isOn = names.includes(key);
    panel.classList.toggle('active', isOn);
    panel.querySelector('.res-status').textContent = isOn ? 'active' : 'idle';
  });
}
/**
 * Replace the status line's content.
 * @param {string} msg - Markup assigned to the element's innerHTML.
 */
function setStatus(msg) {
  const { status } = els;
  status.innerHTML = msg;
}
/**
 * Replace the content of the token row.
 * @param {string} html - Markup assigned to the element's innerHTML.
 */
function setTokens(html) {
  const { tokenRow } = els;
  tokenRow.innerHTML = html;
}
/**
 * Replace the content of the KV-cache entries area.
 * @param {string} html - Markup assigned to the element's innerHTML.
 */
function setKV(html) {
  const { kvEntries } = els;
  kvEntries.innerHTML = html;
}
/**
 * Update the step-indicator dots.
 * Dots with an index below `n` get `done`, the dot at index `n` gets `current`,
 * and every other dot has both classes removed.
 * @param {number} n - Index of the dot to mark as current.
 */
function setDots(n) {
  Array.from(els.stepDots).forEach((dot, position) => {
    dot.classList.toggle('done', position < n);
    dot.classList.toggle('current', position === n);
  });
}
| // --- Step implementations --- | |
/**
 * Render step 0 (idle): nothing highlighted, placeholder token/KV text,
 * the intro status message, a dimmed loop indicator and a zeroed loop counter.
 */
function doIdle() {
  clearActive();
  setActiveResources([]);
  setTokens('<span style="color:var(--dim); font-size:0.8rem;">Waiting for input...</span>');
  setKV('<span style="color:var(--dim); font-size:0.75rem;">empty</span>');
  setStatus('Click <span class="highlight">Auto</span> to run the full pipeline, or use <span class="highlight">Forward</span> / <span class="highlight">Back</span> to walk through one stage at a time.');
  els.loopInfo.style.opacity = '0.3';
  // doDecodeStep() tints the loop arrow through an inline style. Clear it here
  // so returning to idle restores the stylesheet colour instead of keeping the tint.
  els.loopInfo.querySelector('.loop-arrow').style.background = '';
  els.loopCount.textContent = `0 / ${OUTPUT.length}`;
  els.loopCount.classList.remove('active');
  setDots(0);
}
/**
 * Render step 1 (input): highlight the Input stage and show the raw prompt text.
 * The KV cache display stays empty and the loop indicator stays dimmed.
 */
function doInput() {
  clearActive();
  els.s_input.classList.add('active');
  // The narration below states that CPU and RAM both hold the string, so both panels light up.
  setActiveResources(['cpu', 'ram']);
  // Build the prompt from the token table so the display cannot drift from the data.
  const prompt = TOKENS.map((t) => t.text).join('');
  setTokens(`<span style="color:var(--text); font-size:1rem; font-style:italic;">"${prompt}"</span>`);
  setKV('<span style="color:var(--dim); font-size:0.75rem;">empty</span>');
  setStatus('📥 <span class="highlight">Input</span> — user submits the prompt text.<br><span style="color:var(--dim); font-size:0.75rem;">CPU &amp; RAM hold the raw string. GPU is idle.</span>');
  els.loopInfo.style.opacity = '0.3';
  setDots(1);
}
/**
 * Render step 2 (tokenize): highlight the Tokenize stage and show one chip per
 * prompt token ID. The KV cache display stays empty.
 */
function doTokenize() {
  clearActive();
  els.s_tokenize.classList.add('active');
  setActiveResources(['cpu', 'ram']);
  // Derive both the prompt and the ID list from TOKENS so the narration always matches the chips.
  const prompt = TOKENS.map((t) => t.text).join('');
  const ids = TOKENS.map((t) => t.id);
  const chips = ids
    .map((id) => `<span class="token" style="border-color:var(--cpu);">${id}</span>`)
    .join('');
  setTokens(`${chips}<span style="color:var(--dim); margin-left:6px; font-size:0.7rem;">← token IDs</span>`);
  setKV('<span style="color:var(--dim); font-size:0.75rem;">empty</span>');
  setStatus(`🔤 <span class="highlight">Tokenize</span> — CPU converts text to token IDs: "${prompt}" → [${ids.join(', ')}]<br><span style="color:var(--dim); font-size:0.75rem;">This is a fast CPU-bound step. Tokenizer runs on CPU.</span>`);
  els.loopInfo.style.opacity = '0.3';
  setDots(2);
}
/**
 * Render step 3 (prefill): highlight the Prefill stage, show every prompt token
 * next to the first output token, and fill one KV-cache cell per prompt token.
 */
function doPrefill() {
  clearActive();
  els.s_prefill.classList.add('active');
  setActiveResources(['gpu', 'vram']);

  const firstOut = OUTPUT[0];
  const promptChips = TOKENS
    .map((t) => `<span class="token prefill">${t.id}<br><small>${t.text}</small></span>`)
    .join('');
  setTokens(
    `${promptChips}<span style="font-size:1.5rem; margin:0 4px;">→</span>` +
      `<span class="token new">${firstOut.id}<br><small>${firstOut.text}</small></span>`,
  );

  // One filled cell per prompt token; the count in the caption comes from TOKENS.
  const kvCells = TOKENS
    .map((_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`)
    .join('');
  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${TOKENS.length} entries built</span>`);

  setStatus(`⚡ <span class="highlight">Prefill</span> — all ${TOKENS.length} tokens run through the model <em>simultaneously</em>.<br><span style="color:var(--dim); font-size:0.75rem;">GPU compute spikes. KV cache is built (stored in VRAM). First output token appears: "${firstOut.text}".<br><strong>Compute-bound:</strong> GPU is doing heavy matrix math.</span>`);

  els.loopInfo.style.opacity = '0.3';
  // This step can be reached by going Back from a decode step, which tints the
  // loop arrow through an inline style; clear it so the tint does not linger.
  els.loopInfo.querySelector('.loop-arrow').style.background = '';
  els.loopCount.textContent = `1 / ${OUTPUT.length}`;
  setDots(3);
}
/**
 * Render one decode step: highlight the Decode stage, show all earlier tokens
 * as cached chips plus the token being generated, and draw the KV cache.
 * @param {number} idx - Zero-based index into OUTPUT of the token being generated.
 */
function doDecodeStep(idx) {
  clearActive();
  els.s_decode.classList.add('active');
  // During decode, GPU is busy but more moderate (memory-bound)
  setActiveResources(['gpu', 'vram']);

  const current = OUTPUT[idx];
  // Prompt tokens followed by the output tokens generated before this step.
  const cachedChips = [...TOKENS, ...OUTPUT.slice(0, idx)]
    .map((t) => `<span class="token cached">${t.id}</span>`)
    .join('');
  setTokens(
    `${cachedChips}<span class="token new">${current.id}<br><small>${current.text}</small></span>` +
      '<span style="color:var(--dim); margin-left:8px; font-size:0.7rem;">← reads KV cache</span>',
  );

  // KV cache grows by one cell per completed decode step.
  const totalKV = TOKENS.length + idx;
  const kvCells = Array.from(
    { length: totalKV },
    (_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`,
  ).join('');
  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${totalKV} entries — growing each step</span>`);

  const tokenNum = idx + 1;
  setStatus(`🔁 <span class="highlight">Decode step ${tokenNum}/${OUTPUT.length}</span> — generating token: <span style="color:var(--active);">"${current.text.trim()}"</span><br><span style="color:var(--dim); font-size:0.75rem;">Only the NEW token needs fresh GPU compute. All previous tokens reuse their KV cache entries.<br><strong>Memory-bound:</strong> GPU spends most time reading the growing KV cache from VRAM.</span>`);

  els.loopInfo.style.opacity = '1';
  els.loopCount.textContent = `${tokenNum} / ${OUTPUT.length}`;
  els.loopCount.classList.add('active');
  els.loopInfo.querySelector('.loop-arrow').style.background = 'var(--decode)';
  // NOTE(review): for idx 0 this passes 3, the same value doPrefill() passes — confirm that is intended.
  setDots(3 + idx);
}
/**
 * Render the final step (detokenize/output): highlight the Output stage, show
 * every token ID as a cached chip followed by the reassembled text, and draw
 * the KV cache with one cell per prompt and output token.
 */
function doDetokenize() {
  clearActive();
  els.s_output.classList.add('active');
  setActiveResources(['cpu']);

  // Derive the final text and the entry count from the token tables
  // so they always agree with the chips drawn above them.
  const allTokens = [...TOKENS, ...OUTPUT];
  const finalText = allTokens.map((t) => t.text).join('');

  const chips = allTokens
    .map((t) => `<span class="token cached">${t.id}</span>`)
    .join('');
  setTokens(
    `${chips}<span style="font-size:1.5rem; margin:0 4px;">→</span>` +
      `<span style="color:var(--ram); font-size:1rem; font-style:italic;">"${finalText}"</span>`,
  );

  // Final KV cache
  const kvCells = allTokens
    .map((_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`)
    .join('');
  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${allTokens.length} total entries (freed after request)</span>`);

  setStatus(`📤 <span class="highlight">Detokenize &amp; Output</span> — CPU converts token IDs back to text.<br><span style="color:var(--dim); font-size:0.75rem;">Final: "${finalText}" — KV cache is freed, GPU goes idle.</span>`);

  els.loopInfo.style.opacity = '0.3';
  // The preceding decode step tinted the loop arrow through an inline style; clear it.
  els.loopInfo.querySelector('.loop-arrow').style.background = '';
  setDots(6);
}
| // --- Step state machine --- | |
/**
 * Advance one step (Forward button). Does nothing while auto-play is running.
 * Step order: idle, input, tokenize, prefill, one decode step per OUTPUT token, detokenize.
 * Disables Forward once the last step is showing and refreshes the Back button.
 */
function step() {
  // Manual stepping and auto-play must not interleave.
  if (autoTimer) return;

  const decodeSteps = OUTPUT.map((_, i) => () => doDecodeStep(i));
  const sequence = [doIdle, doInput, doTokenize, doPrefill, ...decodeSteps, doDetokenize];
  const lastIndex = sequence.length - 1;

  if (currentStep < lastIndex) {
    currentStep += 1;
    sequence[currentStep]();
  }
  if (currentStep >= lastIndex) {
    els.btnStep.disabled = true;
  }
  els.btnBack.disabled = currentStep <= 0;
}
/**
 * Go back one step (Back button). Does nothing while auto-play is running or
 * when already on the idle step. Re-enables Forward and refreshes Back.
 */
function back() {
  if (autoTimer || currentStep <= 0) return;

  const decodeSteps = OUTPUT.map((_, i) => () => doDecodeStep(i));
  const sequence = [doIdle, doInput, doTokenize, doPrefill, ...decodeSteps, doDetokenize];

  currentStep -= 1;
  sequence[currentStep]();

  const { btnStep, btnBack } = els;
  btnStep.disabled = false;
  btnBack.disabled = currentStep <= 0;
}
/**
 * Return the simulator to the idle step: stop auto-play, zero the counters,
 * redraw the idle view, enable Forward and disable Back.
 */
function reset() {
  // Stop auto-play before touching the counters; stopAuto() reads currentStep.
  stopAuto();
  currentStep = 0;
  decodeIdx = 0;
  doIdle();

  const { btnStep, btnBack } = els;
  btnStep.disabled = false;
  btnBack.disabled = true;
}
| // --- Auto play --- | |
/**
 * Start or stop auto-play (Auto button).
 * If a run is in progress it is stopped. Otherwise the simulator is reset and
 * every step is shown in order on a timer, with Forward/Back disabled meanwhile.
 */
function toggleAuto() {
  if (autoTimer) {
    stopAuto();
    return;
  }
  reset();
  els.btnAuto.textContent = '⏸ Stop';
  els.btnAuto.classList.remove('primary');
  els.btnStep.disabled = true;
  els.btnBack.disabled = true;

  const actions = [doIdle, doInput, doTokenize, doPrefill, ...OUTPUT.map((_, i) => () => doDecodeStep(i)), doDetokenize];
  // Positions in `actions` that get a longer pause so the viewer can read them.
  const prefillIndex = 3;
  const firstDecodeIndex = 4;
  const lastDecodeIndex = firstDecodeIndex + OUTPUT.length - 1;

  let next = 0;
  function tick() {
    if (next >= actions.length) {
      stopAuto();
      return;
    }
    const shown = next;
    actions[shown]();
    currentStep = shown;
    next += 1;

    // Choose the pause from the step that was just shown. The previous check was
    // written against the already-incremented counter and stopped one short, so
    // the final decode step only received the default pause.
    let delay = 1200;
    if (shown === prefillIndex) {
      delay = 1500; // prefill — let it sink in
    } else if (shown >= firstDecodeIndex && shown <= lastDecodeIndex) {
      delay = 1800; // decode steps — slower
    }
    autoTimer = setTimeout(tick, delay);
  }

  // Small initial delay so viewer sees the fresh start
  autoTimer = setTimeout(tick, 400);
}
/**
 * Stop auto-play: cancel the pending timer, restore the Auto button, and
 * re-enable Forward (unless the last step is showing) and Back (unless idle).
 */
function stopAuto() {
  if (autoTimer) clearTimeout(autoTimer);
  autoTimer = null;
  // Restore the same label the button has in the page markup ("Auto" then the arrow).
  els.btnAuto.textContent = 'Auto ▶';
  els.btnAuto.classList.add('primary');
  // TOTAL_STEPS is the index of the last step; Forward stays disabled once it is reached.
  if (currentStep < TOTAL_STEPS) els.btnStep.disabled = false;
  els.btnBack.disabled = currentStep <= 0;
}
| // --- Init --- | |
| doIdle(); | |
| </script> | |
| </body> | |
| </html> | |