Spaces:

enCoder
/

tiny-vllm

Running

App Files Files Community

tiny-vllm / scripts /make_demo_recording.py

enCoder

Add GitHub Pages demo and recording functionality

39fa862 12 days ago

raw

history blame contribute delete

8.69 kB

	"""Fabricate a representative events.jsonl for the GH Pages demo.

	We don't need a model or torch here — we just synthesize a sequence of engine
	events that exercises:

	* a prompt being admitted (request event + first step admitting it)
	* chunked prefill across two steps
	* decoding several tokens
	* a second request that hits the prefix cache on identical prefix tokens
	* a finished sequence and a freed-but-cached block

	The output is read by `web/app.js`'s replay mode.

	Run:
	python scripts/make_demo_recording.py > web/events.jsonl
	"""
	from __future__ import annotations

	import json
	import sys
	from copy import deepcopy

	NUM_BLOCKS = 24
	BLOCK_SIZE = 8
	MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

	CONFIG = {
	"model": MODEL,
	"block_size": BLOCK_SIZE,
	"num_blocks": NUM_BLOCKS,
	"max_num_seqs": 8,
	"max_num_batched_tokens": 32,
	"prefix_caching": True,
	}

	# State we evolve.
	ref = [0] * NUM_BLOCKS
	hashed = [False] * NUM_BLOCKS
	lookups = 0
	hits = 0

	events: list[dict] = []
	ts = 0.0
	step = 0


	def pool_snapshot() -> dict:
	return {
	"num_blocks": NUM_BLOCKS,
	"block_size": BLOCK_SIZE,
	"num_free_blocks": sum(1 for r in ref if r == 0),
	"num_cached_entries": sum(1 for h in hashed if h),
	"prefix_cache_hits": hits,
	"prefix_cache_lookups": lookups,
	"ref_counts": list(ref),
	"hashed": list(hashed),
	}


	def seq_view(rid, sid, status, prompt_len, num_gen, num_computed, num_cached, block_table) -> dict:
	return {
	"seq_id": sid, "request_id": rid, "status": status,
	"prompt_len": prompt_len, "num_generated": num_gen,
	"num_computed_tokens": num_computed,
	"num_cached_prefix_tokens": num_cached,
	"block_table": list(block_table),
	}


	def emit(ev_type: str, payload: dict) -> None:
	global ts
	events.append({"type": ev_type, "step": step, "timestamp": round(ts, 3), "payload": payload})


	def emit_snapshot(running, waiting) -> None:
	emit("snapshot", {
	"step": step,
	"config": CONFIG,
	"block_pool": pool_snapshot(),
	"running": running,
	"waiting": waiting,
	})


	def emit_step(dur_ms, n_tok, n_pf, n_dec, deltas, running, waiting,
	admitted=None, finished=None, preempted=None):
	emit("step", {
	"duration_ms": dur_ms,
	"num_tokens": n_tok,
	"num_seqs": n_pf + n_dec,
	"num_prefill_seqs": n_pf,
	"num_decode_seqs": n_dec,
	"deltas": deltas or [],
	"newly_admitted": admitted or [],
	"finished": finished or [],
	"preempted": preempted or [],
	"snapshot": {
	"step": step,
	"config": CONFIG,
	"block_pool": pool_snapshot(),
	"running": running,
	"waiting": waiting,
	},
	})


	# ---------- script the session ----------

	# Initial empty snapshot.
	emit_snapshot(running=[], waiting=[])
	ts += 0.4

	# --- request A admitted with a 20-token prompt --------------------------
	RID_A = "demo-aaaa1111"
	SID_A = 1
	PROMPT_A = "Explain paged attention in two sentences. Then explain prefix caching."

	step = 1
	ts += 0.1
	emit("request", {"request_id": RID_A, "seq_id": SID_A, "prompt": PROMPT_A,
	"prompt_len": 20, "max_tokens": 24})

	# Step 1: chunked prefill, first chunk (16 tokens → 2 blocks).
	ref[0] = ref[1] = 1
	hashed[0] = hashed[1] = True
	ts += 0.05
	running_A_pf1 = [seq_view(RID_A, SID_A, "prefilling", 20, 0, 16, 0, [0, 1])]
	emit_step(280, 16, 1, 0, [], [], [running_A_pf1[0]], admitted=[SID_A])
	# (waiting moved to running at end of prefill; here still prefilling, so it
	# stays in waiting in our scheduler; emit fix:)
	events[-1]["payload"]["snapshot"]["running"] = []
	events[-1]["payload"]["snapshot"]["waiting"] = running_A_pf1
	ts += 0.3

	# Step 2: finish prefill (4 more tokens). Need 1 more block.
	step = 2
	ref[2] = 1 # block 2 holds last 4 tokens (partial)
	ts += 0.05
	running_A = [seq_view(RID_A, SID_A, "running", 20, 0, 20, 0, [0, 1, 2])]
	emit_step(180, 4, 1, 0, [], running_A, [])
	ts += 0.2

	# Steps 3-7: A decodes 5 tokens. Each step appends one slot; block 2 fills
	# at token 24, then block 3 starts.
	TOKS_A = [" Paged", " attention", " splits", " keys", " and"]
	for i, tok in enumerate(TOKS_A):
	step += 1
	ts += 0.12
	n_gen = i + 1
	n_comp = 20 + i # after fwd: num_computed_tokens increments by 1 each decode
	# Block table grows when crossing boundary.
	bt = [0, 1, 2]
	if 20 + n_gen > 24:
	bt = [0, 1, 2, 3]
	ref[3] = 1
	# Maybe hash block 2 when it just filled (after sampling tok at pos 23 → 24 tokens computed).
	if 20 + n_gen >= 24 and not hashed[2]:
	hashed[2] = True
	running_A = [seq_view(RID_A, SID_A, "running", 20, n_gen, 20 + n_gen,
	0, bt)]
	emit_step(95, 1, 0, 1,
	[{"request_id": RID_A, "new_text": tok, "finished": False, "finish_reason": None}],
	running_A, [])

	# --- request B admitted, identical prefix → cache hit -------------------
	step += 1
	RID_B = "demo-bbbb2222"
	SID_B = 2
	PROMPT_B = PROMPT_A # same → prefix cache hit
	ts += 0.6
	emit("request", {"request_id": RID_B, "seq_id": SID_B, "prompt": PROMPT_B,
	"prompt_len": 20, "max_tokens": 24})

	step += 1
	ts += 0.05
	# Hit lookups: 2 full blocks of B's prompt (block_size=8 → 16/8=2 blocks).
	# Both hit because A's first 2 blocks are hashed (and currently in use).
	lookups += 2
	hits += 2
	ref[0] += 1
	ref[1] += 1
	# B needs a 3rd block (partial, 4 tokens). Fresh: block 4.
	ref[4] = 1
	running_B_pf = [seq_view(RID_B, SID_B, "prefilling", 20, 0, 16, 16, [0, 1, 4])]
	# A is still running and decoding in this step.
	step_A_n_gen = len(TOKS_A)
	running_A_now = [seq_view(RID_A, SID_A, "running", 20, step_A_n_gen,
	20 + step_A_n_gen, 0, [0, 1, 2, 3])]
	emit_step(140, 5, 1, 1,
	[{"request_id": RID_A, "new_text": " values", "finished": False, "finish_reason": None}],
	[running_A_now[0]], [running_B_pf[0]], admitted=[SID_B])
	ts += 0.12

	# Step: B finishes prefill (4 tokens), A decodes one more.
	step += 1
	running_A_now = [seq_view(RID_A, SID_A, "running", 20, step_A_n_gen + 1,
	21 + step_A_n_gen, 0, [0, 1, 2, 3])]
	running_B = [seq_view(RID_B, SID_B, "running", 20, 0, 20, 16, [0, 1, 4])]
	emit_step(110, 5, 1, 1,
	[{"request_id": RID_A, "new_text": " into", "finished": False, "finish_reason": None}],
	[running_A_now[0], running_B[0]], [])
	ts += 0.12

	# Steps: both decode in lock-step for a few rounds.
	A_tokens = [" small", ",", " fixed", "-size", " blocks", " stored", " in"]
	B_tokens = [" Prefix", " caching", " reuses", " those", " blocks", " across", " requests"]
	for i in range(7):
	step += 1
	ts += 0.10
	a_gen = step_A_n_gen + 2 + i
	b_gen = i + 1
	a_bt = [0, 1, 2, 3]
	b_bt = [0, 1, 4]
	# B crosses block boundary at b_gen such that 20 + b_gen > 24 → needs block 5.
	if 20 + b_gen > 24 and ref[5] == 0:
	ref[5] = 1
	if 20 + b_gen > 24:
	b_bt = [0, 1, 4, 5]
	if 20 + b_gen >= 24 and not hashed[4]:
	hashed[4] = True
	seq_a = seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0, a_bt)
	seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, 20 + b_gen, 16, b_bt)
	emit_step(105, 2, 0, 2,
	[{"request_id": RID_A, "new_text": A_tokens[i], "finished": False, "finish_reason": None},
	{"request_id": RID_B, "new_text": B_tokens[i], "finished": False, "finish_reason": None}],
	[seq_a, seq_b], [])

	# A finishes (hits EOS or max_tokens).
	step += 1
	ts += 0.10
	ref[0] -= 1 # A releases its blocks; block 0,1 still held by B so refcount drops to 1
	ref[1] -= 1
	ref[2] = 0 # block 2 (A only) → goes to cached free list (hashed)
	ref[3] = 0 # block 3 (A only) → if hashed, cached; else uncached free
	# Block 2 was hashed earlier; block 3 we never hashed (partial in A's decode). For demo,
	# hash block 3 to make the cache view richer.
	hashed[3] = True
	seq_b = seq_view(RID_B, SID_B, "running", 20, 8, 28, 16, [0, 1, 4, 5])
	emit_step(85, 1, 0, 1,
	[{"request_id": RID_A, "new_text": " the GPU.", "finished": True, "finish_reason": "stop"}],
	[seq_b], [], finished=[RID_A])

	# B finishes too.
	step += 1
	ts += 0.10
	ref[0] -= 1
	ref[1] -= 1
	ref[4] = 0
	ref[5] = 0
	hashed[5] = True
	emit_step(80, 1, 0, 1,
	[{"request_id": RID_B, "new_text": " for the same prompts.", "finished": True, "finish_reason": "stop"}],
	[], [], finished=[RID_B])


	# ---------- emit ----------

	out = sys.stdout
	for ev in events:
	out.write(json.dumps(ev, separators=(",", ":")))
	out.write("\n")