| <!doctype html> |
| <html lang="en"> |
| <head> |
| <!-- The encoding declaration stays first so it is seen before any text content. --> |
| <meta charset="utf-8"> |
| <!-- Page identity: the tab title and the summary used by search engines and link previews. --> |
| <title>tiny_vllm — Suraj Sharan</title> |
| <meta name="description" content="Minimal continuous-batching LLM engine: paged KV-cache, prefix caching, chunked prefill, SSE streaming. Live visualization of engine internals."> |
| <!-- Lay the page out at device width on small screens; zooming is left enabled. --> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <!-- Open connections to the two font origins early; the crossorigin hint matches CORS-mode font file requests. --> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <!-- Web fonts (Inter and JetBrains Mono) first, then the page's own stylesheet. --> |
| <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap"> |
| <link rel="stylesheet" href="style.css"> |
| </head> |
| <body data-hf-space="https://encoder-tiny-vllm.hf.space"> |
|
|
| <!-- Top bar: breadcrumb trail back to the portfolio on the left, external links on the right. --> |
| <nav class="topbar"> |
| <div class="crumb"> |
| <a href="https://surajsharan.github.io/">suraj sharan</a> |
| <!-- The slash separators are purely visual, so they are hidden from assistive technology. --> |
| <span class="sep" aria-hidden="true">/</span> |
| <a href="https://surajsharan.github.io/#projects">projects</a> |
| <span class="sep" aria-hidden="true">/</span> |
| <!-- The last crumb is this page; aria-current lets screen readers announce it as the current location. --> |
| <span aria-current="page">tiny_vllm</span> |
| </div> |
| <div class="right"> |
| <!-- Hidden placeholder link with no destination yet; presumably app.js sets its href (possibly from the data-hf-space attribute on body) and reveals it — confirm in app.js. --> |
| <a id="hf-live" href="#" target="_blank" rel="noopener" class="live-link" style="display:none">try live ↗</a> |
| <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github ↗</a> |
| <a href="https://surajsharan.github.io/" title="Back to portfolio">← back</a> |
| </div> |
| </nav> |
|
|
| <!-- Hero: page title, one-paragraph summary of the engine, and a status row showing the connection state and model. --> |
| <header class="hero"> |
| <h1>tiny_vllm<span class="muted"> --watch</span></h1> |
| <p class="subtitle">A minimal continuous-batching LLM engine — paged KV-cache, automatic prefix caching, chunked prefill, SSE streaming. Built to be read end-to-end. Below is a live visualization of the scheduler and memory pool.</p> |
| <div class="status"> |
| <!-- role="status" turns the badge into a polite live region, so a later change of its text is announced to screen-reader users; the text is presumably rewritten by app.js when the connection state changes — confirm. --> |
| <span id="connection" class="badge offline" role="status">connecting…</span> |
| <!-- Empty at load; presumably filled with the model name by app.js — confirm. --> |
| <span id="model" class="muted"></span> |
| </div> |
| </header> |
|
|
| <!-- Page-level banner: empty and hidden (inline display:none) at load; nothing in this markup fills or shows it, presumably app.js does — confirm. --> |
| <div class="banner" id="banner" style="display:none"></div> |
|
|
| <!-- Prompt composer: prompt text, sampling parameters, send buttons, replay controls, and a help panel for running the server locally. --> |
| <section class="prompt-box"> |
| <!-- A placeholder is not a label (it disappears once text is present), so aria-label gives the field its accessible name. --> |
| <textarea id="prompt" rows="2" placeholder="Type a prompt and press Send (or Cmd/Ctrl+Enter)…" aria-label="Prompt">Explain paged attention in two sentences.</textarea> |
| <div class="controls"> |
| <label>max_tokens <input id="max_tokens" type="number" value="64" min="1" max="2048"></label> |
| <label>temperature <input id="temperature" type="number" value="0.7" step="0.1" min="0" max="2"></label> |
| <label>top_p <input id="top_p" type="number" value="0.9" step="0.05" min="0" max="1"></label> |
| <!-- Every button states type="button": these run in-page actions and must not turn into submit buttons if a form is ever placed around them. --> |
| <button id="send" type="button">Send</button> |
| <button id="send-twice" type="button" title="Submit the same prompt twice — second should hit prefix cache">Send ×2 (prefix demo)</button> |
| <!-- Replay controls start hidden through inline display:none; presumably app.js reveals them for recorded sessions — confirm. --> |
| <span class="replay-controls"> |
| <!-- A title attribute is not reliably exposed as a name, so the same text is repeated in aria-label. --> |
| <select id="speed" style="display:none" title="Replay speed" aria-label="Replay speed"> |
| <option value="0.5">0.5×</option> |
| <option value="1" selected>1×</option> |
| <option value="2">2×</option> |
| <option value="4">4×</option> |
| <option value="8">8×</option> |
| </select> |
| <button id="play-pause" type="button" style="display:none" class="ghost">Pause</button> |
| <button id="restart" type="button" style="display:none" class="ghost">Restart</button> |
| </span> |
| </div> |
| <!-- Help panel, hidden at load by the hidden attribute; nothing in this markup shows it, presumably app.js does — confirm. --> |
| <div id="run-locally" class="run-locally" hidden> |
| <div class="rl-head">This is a <b>recorded</b> session — to send live prompts, run the server locally.</div> |
| <pre><code>git clone https://github.com/surajsharan/tiny_vLLM |
| cd tiny_vLLM && pip install -r requirements.txt |
| python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct |
| # then open http://localhost:8000</code></pre> |
| <div class="rl-foot"> |
| <button id="rl-copy" type="button" class="ghost">Copy command</button> |
| <!-- Hidden placeholder link with no destination yet; presumably app.js sets its href and reveals it when a hosted demo is configured — confirm. --> |
| <a id="rl-hf" class="rl-cta" href="#" target="_blank" rel="noopener" style="display:none">Or use the hosted live demo ↗</a> |
| <button id="rl-close" type="button" class="ghost">Dismiss</button> |
| </div> |
| </div> |
| </section> |
|
|
| <!-- Live engine view in three cards: block pool, scheduler, sequences. Containers carrying an id start empty or zeroed; presumably app.js fills them — confirm. --> |
| <main> |
| <!-- Card 1: block pool grid, a summary slot in the heading, and a legend explaining each block state. --> |
| <section class="card"> |
| <h2>Block pool <span class="muted" id="pool-summary"></span></h2> |
| <div class="block-pool" id="block-pool"></div> |
| <div class="legend"> |
| <span class="legend-item"><span class="swatch swatch-free"></span>free</span> |
| <span class="legend-item"><span class="swatch swatch-cached"></span>cached (evictable)</span> |
| <span class="legend-item"><span class="swatch swatch-used"></span>in use</span> |
| <span class="legend-item"><span class="swatch swatch-shared"></span>shared (refcount>1)</span> |
| <span class="legend-item"><span class="swatch swatch-hashed-edge"></span>hashed (border)</span> |
| </div> |
| </section> |
| <!-- Card 2: scheduler counters (one label/value pair per stat, each value addressable by id) followed by the step log. --> |
| <section class="card"> |
| <h2>Scheduler <span class="muted" id="sched-step"></span></h2> |
| <div class="stats"> |
| <div class="stat"><div class="stat-label">tokens / step</div><div class="stat-value" id="stat-tokens">0</div></div> |
| <div class="stat"><div class="stat-label">prefill / decode</div><div class="stat-value" id="stat-pfdec">0 / 0</div></div> |
| <div class="stat"><div class="stat-label">step (ms)</div><div class="stat-value" id="stat-ms">0</div></div> |
| <div class="stat"><div class="stat-label">prefix cache</div><div class="stat-value" id="stat-cache">—</div></div> |
| <div class="stat"><div class="stat-label">free blocks</div><div class="stat-value" id="stat-free">0</div></div> |
| <div class="stat"><div class="stat-label">preemptions</div><div class="stat-value" id="stat-pre">0</div></div> |
| </div> |
| <h3>step log</h3> |
| <pre class="log" id="log"></pre> |
| </section> |
| <!-- Card 3: per-sequence output; the extra grow class is a styling hook defined in style.css. --> |
| <section class="card grow"> |
| <h2>Sequences</h2> |
| <div id="seqs"></div> |
| </section> |
| </main> |
|
|
| <footer> <!-- Page footer: names the /engine/events endpoint and links to the source repository on GitHub, opened in a new tab. --> |
| Subscribed to <code>/engine/events</code> · Source on <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github.com/surajsharan/tiny_vLLM</a> |
| </footer> |
|
|
| <script src="app.js"></script> <!-- Page script, loaded without async or defer as the last element of body, so every element above is already parsed when it runs. --> |
| </body> |
| </html> |
|
|