"""Fabricate a representative events.jsonl for the GH Pages demo.

We don't need a model or torch here — we just synthesize a sequence of engine
events that exercises:

* a prompt being admitted (request event + first step admitting it)
* chunked prefill across two steps
* decoding several tokens
* a second request that hits the prefix cache on identical prefix tokens
* a finished sequence and a freed-but-cached block

The output is read by `web/app.js`'s replay mode.

Run:
    python scripts/make_demo_recording.py > web/events.jsonl
"""
| from __future__ import annotations |
|
|
| import json |
| import sys |
| from copy import deepcopy |
|
|
# Static engine parameters; CONFIG is embedded in every snapshot payload.
NUM_BLOCKS = 24
BLOCK_SIZE = 8
MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

CONFIG = dict(
    model=MODEL,
    block_size=BLOCK_SIZE,
    num_blocks=NUM_BLOCKS,
    max_num_seqs=8,
    max_num_batched_tokens=32,
    prefix_caching=True,
)

# Simulated block pool, indexed by block id. `ref` holds per-block reference
# counts (0 is counted as free by pool_snapshot); `hashed` flags blocks that
# count as prefix-cache entries.
ref: list[int] = [0 for _ in range(NUM_BLOCKS)]
hashed: list[bool] = [False for _ in range(NUM_BLOCKS)]
# Running prefix-cache statistics reported in each pool snapshot.
lookups = hits = 0

# Recording state: the event log plus the clock and step counter that emit()
# stamps onto each event.
events: list[dict] = []
ts = 0.0
step = 0
|
|
|
|
def pool_snapshot() -> dict:
    """Summarize the simulated block pool as a JSON-serializable dict.

    Reads the module-level ``ref``, ``hashed``, ``hits`` and ``lookups``.
    The two per-block lists are copied so that later mutations of the pool
    do not alter a snapshot that has already been recorded.
    """
    free_blocks = ref.count(0)
    cached_entries = len([flag for flag in hashed if flag])
    summary = {"num_blocks": NUM_BLOCKS, "block_size": BLOCK_SIZE}
    summary["num_free_blocks"] = free_blocks
    summary["num_cached_entries"] = cached_entries
    summary["prefix_cache_hits"] = hits
    summary["prefix_cache_lookups"] = lookups
    summary["ref_counts"] = ref[:]
    summary["hashed"] = hashed[:]
    return summary
|
|
|
|
def seq_view(rid, sid, status, prompt_len, num_gen, num_computed, num_cached, block_table) -> dict:
    """Build the per-sequence record embedded in snapshots.

    Note the key order differs from the argument order: ``seq_id`` comes
    before ``request_id``. ``block_table`` is copied, so the caller may keep
    mutating its own list afterwards.
    """
    field_names = (
        "seq_id",
        "request_id",
        "status",
        "prompt_len",
        "num_generated",
        "num_computed_tokens",
        "num_cached_prefix_tokens",
        "block_table",
    )
    field_values = (
        sid,
        rid,
        status,
        prompt_len,
        num_gen,
        num_computed,
        num_cached,
        [*block_table],
    )
    return dict(zip(field_names, field_values))
|
|
|
|
def emit(ev_type: str, payload: dict) -> None:
    """Append one event to the module-level ``events`` log.

    The record is stamped with the current module-level ``step`` and with
    ``ts`` rounded to three decimals. Both are only read here; callers
    advance them between calls.
    """
    record = {
        "type": ev_type,
        "step": step,
        "timestamp": round(ts, 3),
        "payload": payload,
    }
    events.append(record)
|
|
|
|
def emit_snapshot(running, waiting) -> None:
    """Record a standalone ``snapshot`` event for the current step.

    ``running`` and ``waiting`` are the sequence records to report; they are
    stored as given (not copied) next to the shared CONFIG and a fresh
    block-pool summary.
    """
    payload = {"step": step, "config": CONFIG}
    payload["block_pool"] = pool_snapshot()
    payload["running"] = running
    payload["waiting"] = waiting
    emit("snapshot", payload)
|
|
|
|
def emit_step(dur_ms, n_tok, n_pf, n_dec, deltas, running, waiting,
              admitted=None, finished=None, preempted=None):
    """Record a ``step`` event together with its embedded snapshot.

    ``dur_ms`` is the reported step duration, ``n_tok`` the number of tokens
    in the step, and ``n_pf`` / ``n_dec`` the prefill and decode sequence
    counts (their sum is reported as ``num_seqs``). ``deltas`` is the list
    of per-request output deltas. ``running`` and ``waiting`` are stored as
    given in the embedded snapshot. Falsy list arguments (``None`` or empty)
    are reported as a new empty list.
    """
    state_after = {
        "step": step,
        "config": CONFIG,
        "block_pool": pool_snapshot(),
        "running": running,
        "waiting": waiting,
    }
    payload = {
        "duration_ms": dur_ms,
        "num_tokens": n_tok,
        "num_seqs": n_pf + n_dec,
        "num_prefill_seqs": n_pf,
        "num_decode_seqs": n_dec,
    }
    payload["deltas"] = deltas if deltas else []
    payload["newly_admitted"] = admitted if admitted else []
    payload["finished"] = finished if finished else []
    payload["preempted"] = preempted if preempted else []
    payload["snapshot"] = state_after
    emit("step", payload)
|
|
|
|
| |
|
|
| |
# --- Step 0: idle engine, empty pool ----------------------------------------
emit_snapshot(running=[], waiting=[])
ts += 0.4

# --- Request A arrives --------------------------------------------------------
RID_A = "demo-aaaa1111"
SID_A = 1
PROMPT_A = "Explain paged attention in two sentences. Then explain prefix caching."

step = 1
ts += 0.1
emit("request", {"request_id": RID_A, "seq_id": SID_A, "prompt": PROMPT_A,
                 "prompt_len": 20, "max_tokens": 24})

# --- Step 1: first prefill chunk ----------------------------------------------
# 16 of the 20 prompt tokens are computed, which exactly fills blocks 0 and 1
# (BLOCK_SIZE == 8), so both are allocated and marked as cache entries.
ref[0] = ref[1] = 1
hashed[0] = hashed[1] = True
ts += 0.05
running_A_pf1 = [seq_view(RID_A, SID_A, "prefilling", 20, 0, 16, 0, [0, 1])]
# The half-prefilled sequence is reported under `waiting`. The list is passed
# directly; the earlier version passed an equivalent list and then patched
# events[-1] with the very same values, which was a no-op.
emit_step(280, 16, 1, 0, [], [], running_A_pf1, admitted=[SID_A])
ts += 0.3

# --- Step 2: second prefill chunk ---------------------------------------------
# The remaining 4 prompt tokens land in a newly allocated block 2.
step = 2
ref[2] = 1
ts += 0.05
running_A = [seq_view(RID_A, SID_A, "running", 20, 0, 20, 0, [0, 1, 2])]
emit_step(180, 4, 1, 0, [], running_A, [])
ts += 0.2
|
|
| |
| |
# --- Steps 3..7: request A decodes one token per step -------------------------
TOKS_A = [" Paged", " attention", " splits", " keys", " and"]
for n_gen, tok in enumerate(TOKS_A, start=1):
    step += 1
    ts += 0.12
    total_tokens = 20 + n_gen  # 20 prompt tokens + tokens generated so far

    # Block 2 covers token positions 16..23. The 25th token needs block 3.
    bt = [0, 1, 2]
    if total_tokens > 24:
        bt = [0, 1, 2, 3]
        ref[3] = 1

    # Block 2 becomes a cache entry as soon as it is completely filled.
    if total_tokens >= 24 and not hashed[2]:
        hashed[2] = True

    running_A = [seq_view(RID_A, SID_A, "running", 20, n_gen, total_tokens,
                          0, bt)]
    emit_step(95, 1, 0, 1,
              [{"request_id": RID_A, "new_text": tok, "finished": False, "finish_reason": None}],
              running_A, [])
|
|
| |
# --- Request B arrives with the same prompt as A ------------------------------
# NOTE(review): unlike request A, this request event gets a step number of its
# own, so no "step" event carries that number — confirm this gap is intended.
step += 1
RID_B = "demo-bbbb2222"
SID_B = 2
PROMPT_B = PROMPT_A
ts += 0.6
emit("request", {"request_id": RID_B, "seq_id": SID_B, "prompt": PROMPT_B,
                 "prompt_len": 20, "max_tokens": 24})

# --- B is admitted and hits the prefix cache ----------------------------------
step += 1
ts += 0.05
# Both full prompt blocks (0 and 1) are found in the cache: two lookups, two
# hits, and one extra reference on each shared block.
lookups += 2
hits += 2
ref[0] += 1
ref[1] += 1
# Block 4 is B's private block for the prompt tail (tokens 16..19).
ref[4] = 1
running_B_pf = [seq_view(RID_B, SID_B, "prefilling", 20, 0, 16, 16, [0, 1, 4])]

# A emits " values" in this very step, so its post-step count is one more than
# the tokens produced by the decode loop above. (Previously this was
# len(TOKS_A), which left A's counters one token behind its emitted text for
# the rest of the recording.)
step_A_n_gen = len(TOKS_A) + 1
running_A_now = [seq_view(RID_A, SID_A, "running", 20, step_A_n_gen,
                          20 + step_A_n_gen, 0, [0, 1, 2, 3])]
emit_step(140, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " values", "finished": False, "finish_reason": None}],
          [running_A_now[0]], [running_B_pf[0]], admitted=[SID_B])
ts += 0.12

# --- B finishes its prompt while A keeps decoding -----------------------------
step += 1
running_A_now = [seq_view(RID_A, SID_A, "running", 20, step_A_n_gen + 1,
                          21 + step_A_n_gen, 0, [0, 1, 2, 3])]
running_B = [seq_view(RID_B, SID_B, "running", 20, 0, 20, 16, [0, 1, 4])]
emit_step(110, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " into", "finished": False, "finish_reason": None}],
          [running_A_now[0], running_B[0]], [])
ts += 0.12
|
|
| |
# --- A and B decode side by side, one token each per step ---------------------
A_tokens = [" small", ",", " fixed", "-size", " blocks", " stored", " in"]
B_tokens = [" Prefix", " caching", " reuses", " those", " blocks", " across", " requests"]
a_bt = [0, 1, 2, 3]
b_bt = [0, 1, 4]
for i, (a_tok, b_tok) in enumerate(zip(A_tokens, B_tokens)):
    step += 1
    ts += 0.10
    a_gen = step_A_n_gen + 2 + i
    b_gen = i + 1
    a_total = 20 + a_gen
    b_total = 20 + b_gen

    # A: block 3 covers token positions 24..31. It becomes a cache entry once
    # full, and the 33rd token needs a fifth block. Block 5 belongs to B by
    # then, so A takes block 6. (Previously A's table stayed at four blocks,
    # i.e. 32 slots, while its token count went past 32.)
    if a_total >= 32 and not hashed[3]:
        hashed[3] = True
    if a_total > 32 and 6 not in a_bt:
        ref[6] = 1
        a_bt.append(6)

    # B: block 4 covers positions 16..23 and block 5 takes over from token 25.
    if b_total >= 24 and not hashed[4]:
        hashed[4] = True
    if b_total > 24 and 5 not in b_bt:
        ref[5] = 1
        b_bt.append(5)

    seq_a = seq_view(RID_A, SID_A, "running", 20, a_gen, a_total, 0, a_bt)
    seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, b_total, 16, b_bt)
    emit_step(105, 2, 0, 2,
              [{"request_id": RID_A, "new_text": a_tok, "finished": False, "finish_reason": None},
               {"request_id": RID_B, "new_text": b_tok, "finished": False, "finish_reason": None}],
              [seq_a, seq_b], [])

# --- A finishes ----------------------------------------------------------------
step += 1
ts += 0.10
# Drop A's references: blocks 0 and 1 stay alive through B, while A's private
# blocks become free. The full ones (2 and 3) remain hashed, i.e.
# freed-but-cached; the partially filled block 6 was never hashed.
ref[0] -= 1
ref[1] -= 1
ref[2] = 0
ref[3] = 0
ref[6] = 0
# Only A decodes in this step (one token, one decode sequence, one delta), so
# B is reported with the counters it had at the end of the loop above.
# (Previously B jumped to 8 generated tokens here without emitting any text.)
seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, 20 + b_gen, 16, b_bt)
emit_step(85, 1, 0, 1,
          [{"request_id": RID_A, "new_text": " the GPU.", "finished": True, "finish_reason": "stop"}],
          [seq_b], [], finished=[RID_A])

# --- B finishes ----------------------------------------------------------------
step += 1
ts += 0.10
ref[0] -= 1
ref[1] -= 1
ref[4] = 0
ref[5] = 0
# Block 5 is only partially filled when B stops, so it is freed without being
# marked as a cache entry: everywhere else a block is hashed only once full.
emit_step(80, 1, 0, 1,
          [{"request_id": RID_B, "new_text": " for the same prompts.", "finished": True, "finish_reason": "stop"}],
          [], [], finished=[RID_B])
|
|
|
|
| |
|
|
# Serialize the recording as JSON Lines on stdout: one compact object per line.
out = sys.stdout
out.writelines(
    json.dumps(ev, separators=(",", ":")) + "\n"
    for ev in events
)
|
|