File size: 5,726 Bytes
c32c359
 
 
 
 
9dfe9bd
 
 
 
 
 
c32c359
8fa0f9d
9dfe9bd
 
 
 
 
 
 
 
 
 
8fa0f9d
9dfe9bd
 
 
 
 
 
 
 
c32c359
39fa862
c32c359
 
 
 
39fa862
 
c32c359
 
 
 
 
 
 
 
39fa862
 
 
 
 
 
 
 
 
 
 
 
c32c359
4fa42d8
 
 
 
 
 
 
 
 
8fa0f9d
4fa42d8
 
 
c32c359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9dfe9bd
c32c359
 
9dfe9bd
c32c359
9dfe9bd
c32c359
 
 
 
 
 
 
 
 
 
 
 
4fa42d8
c32c359
 
9dfe9bd
c32c359
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
<!doctype html>
<html lang="en">
<head>
  <!-- Encoding and viewport are declared before any other metadata. -->
  <meta charset="utf-8">
  <meta
    name="viewport"
    content="width=device-width, initial-scale=1">

  <!-- Page identity: tab title and search/share description. -->
  <title>tiny_vllm — Suraj Sharan</title>
  <meta
    name="description"
    content="Minimal continuous-batching LLM engine: paged KV-cache, prefix caching, chunked prefill, SSE streaming. Live visualization of engine internals.">

  <!-- Web fonts: open connections to both Google Fonts origins early, then request Inter and JetBrains Mono. -->
  <link
    rel="preconnect"
    href="https://fonts.googleapis.com">
  <link
    rel="preconnect"
    href="https://fonts.gstatic.com"
    crossorigin>
  <link
    rel="stylesheet"
    href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap">

  <!-- Page stylesheet, loaded after the font stylesheet. -->
  <link
    rel="stylesheet"
    href="style.css">
</head>
<body data-hf-space="https://encoder-tiny-vllm.hf.space">

<!-- Top bar: breadcrumb trail on the left, external links on the right. -->
<nav class="topbar">
  <!-- Breadcrumb: portfolio home / projects / this page (current page is plain text, not a link). -->
  <div class="crumb">
    <a href="https://surajsharan.github.io/">suraj sharan</a>
    <span class="sep">/</span>
    <a href="https://surajsharan.github.io/#projects">projects</a>
    <span class="sep">/</span>
    <span>tiny_vllm</span>
  </div>
  <div class="right">
    <!-- Starts hidden with a placeholder href; presumably the script fills it in from the body's data-hf-space value (confirm in app.js). -->
    <a
      class="live-link"
      id="hf-live"
      href="#"
      target="_blank"
      rel="noopener"
      style="display:none">try live ↗</a>
    <a
      href="https://github.com/surajsharan/tiny_vLLM"
      target="_blank"
      rel="noopener">github ↗</a>
    <a
      href="https://surajsharan.github.io/"
      title="Back to portfolio">← back</a>
  </div>
</nav>

<!-- Hero: page heading, one-paragraph summary, and the connection/model status row. -->
<header class="hero">
  <h1>tiny_vllm<span class="muted"> --watch</span></h1>
  <p class="subtitle">A minimal continuous-batching LLM engine — paged KV-cache, automatic prefix caching, chunked prefill, SSE streaming. Built to be read end-to-end. Below is a live visualization of the scheduler and memory pool.</p>
  <div class="status">
    <!-- Initial state is "connecting…" with the offline badge style; the model name slot starts empty. -->
    <span class="badge offline" id="connection">connecting…</span>
    <span class="muted" id="model"></span>
  </div>
</header>

<!-- Page-level banner: empty and hidden in the static markup. -->
<div class="banner" id="banner" style="display:none"></div>

<!-- Prompt composer: prompt text, sampling parameters, send buttons, replay controls, and the "run locally" panel. -->
<section class="prompt-box">
  <!-- The placeholder is only a hint and vanishes once text is present, so the textarea also carries an explicit accessible name. -->
  <textarea id="prompt" rows="2" placeholder="Type a prompt and press Send (or Cmd/Ctrl+Enter)…" aria-label="Prompt">Explain paged attention in two sentences.</textarea>
  <div class="controls">
    <!-- Sampling parameters; each input is named by its wrapping label. -->
    <label>max_tokens <input id="max_tokens" type="number" value="64" min="1" max="2048"></label>
    <label>temperature <input id="temperature" type="number" value="0.7" step="0.1" min="0" max="2"></label>
    <label>top_p <input id="top_p" type="number" value="0.9" step="0.05" min="0" max="1"></label>
    <!-- Explicit type="button": these run in-page actions and must never behave as submit buttons. -->
    <button id="send" type="button">Send</button>
    <button id="send-twice" type="button" title="Submit the same prompt twice — second should hit prefix cache">Send ×2 (prefix demo)</button>

    <!-- Replay controls start hidden via inline display:none; presumably the script reveals them for recorded sessions (confirm in app.js). -->
    <!-- Every speed option carries visible text so the menu never shows blank rows; 1× is the default. -->
    <span class="replay-controls">
      <select id="speed" style="display:none" title="Replay speed" aria-label="Replay speed">
        <option value="0.5">0.5×</option>
        <option value="1" selected>1×</option>
        <option value="2">2×</option>
        <option value="4">4×</option>
        <option value="8">8×</option>
      </select>
      <button id="play-pause" type="button" style="display:none" class="ghost">Pause</button>
      <button id="restart" type="button" style="display:none" class="ghost">Restart</button>
    </span>
  </div>

  <!-- Instructions for running the server locally; starts hidden via the hidden attribute. -->
  <div id="run-locally" class="run-locally" hidden>
    <div class="rl-head">This is a <b>recorded</b> session — to send live prompts, run the server locally.</div>
    <pre><code>git clone https://github.com/surajsharan/tiny_vLLM
cd tiny_vLLM &amp;&amp; pip install -r requirements.txt
python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct
# then open http://localhost:8000</code></pre>
    <div class="rl-foot">
      <button id="rl-copy" type="button" class="ghost">Copy command</button>
      <!-- Hosted-demo link: hidden with a placeholder href in the static markup. -->
      <a id="rl-hf" class="rl-cta" href="#" target="_blank" rel="noopener" style="display:none">Or use the hosted live demo ↗</a>
      <button id="rl-close" type="button" class="ghost">Dismiss</button>
    </div>
  </div>
</section>

<main>
  <!-- Block pool card: a summary slot in the heading, an empty grid container, and a static colour legend. -->
  <section class="card">
    <h2>Block pool <span class="muted" id="pool-summary"></span></h2>
    <!-- Empty in the static markup; presumably filled with one cell per block by the script (confirm in app.js). -->
    <div id="block-pool" class="block-pool"></div>
    <!-- Legend: one swatch per block state. Each swatch and its text are deliberately written with no whitespace between them. -->
    <div class="legend">
      <span class="legend-item"><span class="swatch swatch-free"></span>free</span>
      <span class="legend-item"><span class="swatch swatch-cached"></span>cached (evictable)</span>
      <span class="legend-item"><span class="swatch swatch-used"></span>in use</span>
      <span class="legend-item"><span class="swatch swatch-shared"></span>shared (refcount&gt;1)</span>
      <span class="legend-item"><span class="swatch swatch-hashed-edge"></span>hashed (border)</span>
    </div>
  </section>

  <!-- Scheduler card: a step slot in the heading, six stat tiles, and a step log. -->
  <section class="card">
    <h2>Scheduler <span class="muted" id="sched-step"></span></h2>
    <!-- Every stat value ships with a placeholder so no tile renders blank before the first update. -->
    <div class="stats">
      <div class="stat"><div class="stat-label">tokens / step</div><div class="stat-value" id="stat-tokens">0</div></div>
      <div class="stat"><div class="stat-label">prefill / decode</div><div class="stat-value" id="stat-pfdec">0 / 0</div></div>
      <div class="stat"><div class="stat-label">step (ms)</div><div class="stat-value" id="stat-ms">0</div></div>
      <!-- Neutral dash placeholder: the format of the prefix-cache value is not visible in this file. -->
      <div class="stat"><div class="stat-label">prefix cache</div><div class="stat-value" id="stat-cache">—</div></div>
      <div class="stat"><div class="stat-label">free blocks</div><div class="stat-value" id="stat-free">0</div></div>
      <div class="stat"><div class="stat-label">preemptions</div><div class="stat-value" id="stat-pre">0</div></div>
    </div>
    <h3>step log</h3>
    <!-- Preformatted log area, empty in the static markup. -->
    <pre id="log" class="log"></pre>
  </section>

  <!-- Sequences card: heading plus an empty container; presumably populated per sequence by the script (confirm in app.js). -->
  <section class="card grow">
    <h2>Sequences</h2>
    <div id="seqs"></div>
  </section>
</main>

<!-- Footer: names the event endpoint the page subscribes to and links to the source repository. -->
<footer>
  Subscribed to <code>/engine/events</code> · Source on <a
    href="https://github.com/surajsharan/tiny_vLLM"
    target="_blank"
    rel="noopener">github.com/surajsharan/tiny_vLLM</a>
</footer>

<!-- Page script: a classic (non-deferred) script placed last in the body, so every element above is already parsed when it runs. -->
<script src="app.js"></script>
</body>
</html>