# tiny-vllm/scripts/make_demo_recording.py
# Last commit 39fa862 (enCoder): Add GitHub Pages demo and recording functionality
"""Fabricate a representative events.jsonl for the GH Pages demo.

We don't need a model or torch here — we just synthesize a sequence of engine
events that exercises:

  * a prompt being admitted (request event + first step admitting it)
  * chunked prefill across two steps
  * decoding several tokens
  * a second request that hits the prefix cache on identical prefix tokens
  * a finished sequence and a freed-but-cached block

The output is read by the replay mode of `web/app.js`.

Run:

    python scripts/make_demo_recording.py > web/events.jsonl
"""
from __future__ import annotations
import json
import sys
from copy import deepcopy
# Geometry of the simulated block pool and the model name reported in snapshots.
NUM_BLOCKS = 24
BLOCK_SIZE = 8
MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

# Engine configuration embedded as-is in every snapshot payload.
# Keyword order here is the key order that ends up in the JSON output.
CONFIG = dict(
    model=MODEL,
    block_size=BLOCK_SIZE,
    num_blocks=NUM_BLOCKS,
    max_num_seqs=8,
    max_num_batched_tokens=32,
    prefix_caching=True,
)

# Mutable simulation state, advanced by hand as the scripted session unfolds.
# Per-block reference counts (reported as "ref_counts"; 0 counts as free).
ref = [0 for _ in range(NUM_BLOCKS)]
# Per-block flag; each truthy entry is reported as one cached entry.
hashed = [False for _ in range(NUM_BLOCKS)]
# Prefix-cache counters reported in the block-pool snapshot.
hits = 0
lookups = 0
# Every recorded event, in emission order.
events: list[dict] = []
# Current step index and simulated clock; both are stamped onto each event.
step = 0
ts = 0.0
def pool_snapshot() -> dict:
    """Return a JSON-ready description of the simulated block pool right now.

    Reads the module-level ``ref`` and ``hashed`` lists plus the ``hits`` and
    ``lookups`` counters. A block counts as free when its reference count is 0,
    and every truthy ``hashed`` flag counts as one cached entry. Both lists are
    copied so that later mutation of the live state cannot alter a snapshot
    that has already been recorded.
    """
    free_blocks = ref.count(0)
    cached_entries = sum(bool(flag) for flag in hashed)
    snapshot = {
        "num_blocks": NUM_BLOCKS,
        "block_size": BLOCK_SIZE,
        "num_free_blocks": free_blocks,
        "num_cached_entries": cached_entries,
        "prefix_cache_hits": hits,
        "prefix_cache_lookups": lookups,
        "ref_counts": ref[:],
        "hashed": hashed[:],
    }
    return snapshot
def seq_view(rid, sid, status, prompt_len, num_gen, num_computed, num_cached, block_table) -> dict:
    """Build the JSON-ready view of one sequence as it appears in a snapshot.

    Args:
        rid: Request id, stored under ``"request_id"``.
        sid: Sequence id, stored under ``"seq_id"``.
        status: Status string, stored unchanged.
        prompt_len: Stored under ``"prompt_len"``.
        num_gen: Stored under ``"num_generated"``.
        num_computed: Stored under ``"num_computed_tokens"``.
        num_cached: Stored under ``"num_cached_prefix_tokens"``.
        block_table: Iterable of block indices; copied into a new list so the
            caller may keep mutating its own table.

    Returns:
        A new dict with the eight fields above, in a fixed key order.
    """
    view = dict(seq_id=sid, request_id=rid, status=status)
    view["prompt_len"] = prompt_len
    view["num_generated"] = num_gen
    view["num_computed_tokens"] = num_computed
    view["num_cached_prefix_tokens"] = num_cached
    view["block_table"] = [blk for blk in block_table]
    return view
def emit(ev_type: str, payload: dict) -> None:
    """Append one event record to the module-level ``events`` list.

    The record carries ``ev_type``, the current module-level ``step``, the
    module-level clock ``ts`` rounded to three decimals, and ``payload`` itself
    (stored by reference, not copied).
    """
    record = {
        "type": ev_type,
        "step": step,
        "timestamp": round(ts, 3),
        "payload": payload,
    }
    events.append(record)
def emit_snapshot(running, waiting) -> None:
    """Record a standalone ``"snapshot"`` event for the current step.

    The payload bundles the current ``step``, the shared ``CONFIG`` dict, a
    fresh ``pool_snapshot()`` and the given ``running`` / ``waiting`` lists of
    sequence views (stored as passed in, without copying).
    """
    payload = dict(
        step=step,
        config=CONFIG,
        block_pool=pool_snapshot(),
        running=running,
        waiting=waiting,
    )
    emit("snapshot", payload)
def emit_step(dur_ms, n_tok, n_pf, n_dec, deltas, running, waiting,
              admitted=None, finished=None, preempted=None):
    """Record a ``"step"`` event together with the snapshot taken after it.

    Args:
        dur_ms: Stored as ``"duration_ms"``.
        n_tok: Stored as ``"num_tokens"``.
        n_pf: Stored as ``"num_prefill_seqs"``.
        n_dec: Stored as ``"num_decode_seqs"``; ``"num_seqs"`` is
            ``n_pf + n_dec``.
        deltas: Per-request output deltas for this step.
        running: Sequence views placed under ``snapshot["running"]``.
        waiting: Sequence views placed under ``snapshot["waiting"]``.
        admitted: Stored as ``"newly_admitted"``.
        finished: Stored as ``"finished"``.
        preempted: Stored as ``"preempted"``.

    ``deltas``, ``admitted``, ``finished`` and ``preempted`` are replaced by a
    new empty list whenever they are falsy (``None`` or empty).
    """
    snapshot = {
        "step": step,
        "config": CONFIG,
        "block_pool": pool_snapshot(),
        "running": running,
        "waiting": waiting,
    }
    payload = {
        "duration_ms": dur_ms,
        "num_tokens": n_tok,
        "num_seqs": n_pf + n_dec,
        "num_prefill_seqs": n_pf,
        "num_decode_seqs": n_dec,
    }
    # Insertion order below fixes the key order of the emitted JSON.
    optional_lists = (
        ("deltas", deltas),
        ("newly_admitted", admitted),
        ("finished", finished),
        ("preempted", preempted),
    )
    for key, value in optional_lists:
        payload[key] = value if value else []
    payload["snapshot"] = snapshot
    emit("step", payload)
# ---------- script the session ----------
# Step 0: empty pool, nothing running and nothing waiting.
emit_snapshot(running=[], waiting=[])
ts += 0.4

# --- request A admitted with a 20-token prompt --------------------------
RID_A, SID_A = "demo-aaaa1111", 1
PROMPT_A = "Explain paged attention in two sentences. Then explain prefix caching."

step = 1
ts += 0.1
emit("request", dict(request_id=RID_A, seq_id=SID_A, prompt=PROMPT_A,
                     prompt_len=20, max_tokens=24))

# Step 1: chunked prefill, first chunk (16 tokens → blocks 0 and 1, both
# referenced once and hashed).
ref[0:2] = [1, 1]
hashed[0:2] = [True, True]
ts += 0.05
running_A_pf1 = [seq_view(RID_A, SID_A, "prefilling", 20, 0, 16, 0, [0, 1])]
# A sequence only moves from waiting to running at the end of its prefill, so
# while A is still prefilling the snapshot lists it under "waiting" and leaves
# "running" empty.
emit_step(280, 16, 1, 0, [], [], running_A_pf1, admitted=[SID_A])
ts += 0.3

# Step 2: finish prefill (4 more tokens). Block 2 holds those last 4 tokens and
# is therefore only partially filled.
step = 2
ref[2] = 1
ts += 0.05
running_A = [seq_view(RID_A, SID_A, "running", 20, 0, 20, 0, [0, 1, 2])]
emit_step(180, 4, 1, 0, [], running_A, [])
ts += 0.2
# Steps 3-7: A decodes 5 tokens, one per step. Each step appends one slot;
# block 2 fills at token 24, then block 3 starts.
TOKS_A = [" Paged", " attention", " splits", " keys", " and"]
for i, tok in enumerate(TOKS_A):
    step += 1
    ts += 0.12
    n_gen = i + 1
    # Prompt (20) plus everything generated so far; this is the value reported
    # as num_computed_tokens and the one that drives block allocation/hashing.
    total_tokens = 20 + n_gen
    # The block table grows only once a token lands past the 24-token boundary.
    bt = [0, 1, 2]
    if total_tokens > 24:
        bt = [0, 1, 2, 3]
        ref[3] = 1
    # Block 2 is hashed the moment it becomes full (24 tokens computed).
    if total_tokens >= 24 and not hashed[2]:
        hashed[2] = True
    running_A = [seq_view(RID_A, SID_A, "running", 20, n_gen, total_tokens, 0, bt)]
    delta = {"request_id": RID_A, "new_text": tok, "finished": False, "finish_reason": None}
    emit_step(95, 1, 0, 1, [delta], running_A, [])
# --- request B admitted, identical prefix → cache hit -------------------
step += 1
RID_B = "demo-bbbb2222"
SID_B = 2
PROMPT_B = PROMPT_A  # same text → prefix cache hit
ts += 0.6
emit("request", {"request_id": RID_B, "seq_id": SID_B, "prompt": PROMPT_B,
                 "prompt_len": 20, "max_tokens": 24})
# NOTE(review): `step` is bumped once before the request event above and once
# more below, so B's request is stamped one step before the step that admits
# it (A's request and its admitting step share step 1). Confirm this gap is
# intended.

step += 1
ts += 0.05
# Prefix-cache lookups: B's prompt has 2 full blocks (16 tokens / block_size 8).
# Both hit because A already hashed blocks 0 and 1 (and still holds them), so
# B simply takes one more reference on each.
lookups += 2
hits += 2
ref[0] += 1
ref[1] += 1
# B's last 4 prompt tokens are not cached; they go into a fresh block, 4.
ref[4] = 1
running_B_pf = [seq_view(RID_B, SID_B, "prefilling", 20, 0, 16, 16, [0, 1, 4])]

# A keeps decoding while B is admitted. `a_gen` counts every token A has
# emitted so far, so each view agrees with the deltas carried by its own event.
step_A_n_gen = len(TOKS_A)
a_gen = step_A_n_gen + 1  # " values" is A's 6th generated token
a_bt = [0, 1, 2, 3]
running_A_now = [seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0, a_bt)]
# NOTE(review): num_tokens is 5 both here and in the next step, although B's
# view only advances from 16 to 20 computed tokens in the next one. Confirm
# which of the two steps should account for B's 4 uncached prompt tokens.
emit_step(140, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " values", "finished": False, "finish_reason": None}],
          [running_A_now[0]], [running_B_pf[0]], admitted=[SID_B])
ts += 0.12

# Step: B finishes prefill (4 tokens), A decodes one more.
step += 1
a_gen += 1  # " into" is A's 7th generated token
running_A_now = [seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0, a_bt)]
running_B = [seq_view(RID_B, SID_B, "running", 20, 0, 20, 16, [0, 1, 4])]
emit_step(110, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " into", "finished": False, "finish_reason": None}],
          [running_A_now[0], running_B[0]], [])
ts += 0.12

# Steps: both decode in lock-step for a few rounds.
A_tokens = [" small", ",", " fixed", "-size", " blocks", " stored", " in"]
B_tokens = [" Prefix", " caching", " reuses", " those", " blocks", " across", " requests"]
b_gen = 0
b_bt = [0, 1, 4]
for a_tok, b_tok in zip(A_tokens, B_tokens):
    step += 1
    ts += 0.10
    a_gen += 1
    b_gen += 1
    a_total = 20 + a_gen
    b_total = 20 + b_gen
    # A: block 3 covers tokens 24..31. It is hashed once full, and the first
    # token past 32 needs a fifth block; block 6 is the lowest one still free
    # (0-3 belong to A, 4-5 to B).
    if a_total >= 32 and not hashed[3]:
        hashed[3] = True
    if a_total > 32 and ref[6] == 0:
        ref[6] = 1
        a_bt = [0, 1, 2, 3, 6]
    # B: block 4 covers tokens 16..23. It is hashed once full, and the first
    # token past 24 goes into block 5.
    if b_total >= 24 and not hashed[4]:
        hashed[4] = True
    if b_total > 24 and ref[5] == 0:
        ref[5] = 1
        b_bt = [0, 1, 4, 5]
    seq_a = seq_view(RID_A, SID_A, "running", 20, a_gen, a_total, 0, a_bt)
    seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, b_total, 16, b_bt)
    emit_step(105, 2, 0, 2,
              [{"request_id": RID_A, "new_text": a_tok, "finished": False, "finish_reason": None},
               {"request_id": RID_B, "new_text": b_tok, "finished": False, "finish_reason": None}],
              [seq_a, seq_b], [])

# A finishes (hits EOS or max_tokens).
step += 1
ts += 0.10
# A releases its blocks. Blocks 0 and 1 are still held by B, so their
# reference counts only drop to 1.
ref[0] -= 1
ref[1] -= 1
# Blocks 2 and 3 were A-only and both hashed when they filled, so they become
# freed-but-cached. Block 6 was A-only and never filled, so it is freed
# without a hash.
ref[2] = 0
ref[3] = 0
ref[6] = 0
# Only A decodes in this step, so B's view keeps the counters it reached at
# the end of the lock-step rounds.
seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, 20 + b_gen, 16, b_bt)
emit_step(85, 1, 0, 1,
          [{"request_id": RID_A, "new_text": " the GPU.", "finished": True, "finish_reason": "stop"}],
          [seq_b], [], finished=[RID_A])

# B finishes too.
step += 1
ts += 0.10
ref[0] -= 1
ref[1] -= 1
ref[4] = 0
ref[5] = 0
# Block 5 is only partially filled when B stops; it is still marked hashed,
# presumably to make the final cache view richer for the demo.
hashed[5] = True
emit_step(80, 1, 0, 1,
          [{"request_id": RID_B, "new_text": " for the same prompts.", "finished": True, "finish_reason": "stop"}],
          [], [], finished=[RID_B])
# ---------- emit ----------
# Write the recording as JSON Lines: one compact JSON object per event, in the
# order the events were recorded.
out = sys.stdout
for ev in events:
    line = json.dumps(ev, separators=(",", ":"))
    out.write(f"{line}\n")