Spaces:

enCoder
/

tiny-vllm

Running

File size: 8,688 Bytes

39fa862

"""Fabricate a representative events.jsonl for the GH Pages demo.

We don't need a model or torch here — we just synthesize a sequence of engine
events that exercises:

  * a prompt being admitted (request event + first step admitting it)
  * chunked prefill across two steps
  * decoding several tokens
  * a second request that hits the prefix cache on identical prefix tokens
  * a finished sequence and a freed-but-cached block

The output is read by `web/app.js`'s replay mode.

Run:
    python scripts/make_demo_recording.py > web/events.jsonl
"""
from __future__ import annotations

import json
import sys
from copy import deepcopy

# Geometry of the simulated KV-cache pool and the model name shown in the UI.
NUM_BLOCKS = 24
BLOCK_SIZE = 8
MODEL = "Qwen/Qwen2.5-0.5B-Instruct"

# Engine configuration embedded verbatim in every snapshot payload.
CONFIG = dict(
    model=MODEL,
    block_size=BLOCK_SIZE,
    num_blocks=NUM_BLOCKS,
    max_num_seqs=8,
    max_num_batched_tokens=32,
    prefix_caching=True,
)

# Mutable simulation state, evolved by the scripted session below.
# Per-block reference count and "has a prefix hash" flag, indexed by block id.
ref = [0 for _ in range(NUM_BLOCKS)]
hashed = [False for _ in range(NUM_BLOCKS)]
# Running prefix-cache statistics.
lookups = hits = 0

# Recorded events plus the current simulated clock and step counter.
events: list[dict] = []
ts = 0.0
step = 0


def pool_snapshot() -> dict:
    """Return the current block-pool state as a JSON-serialisable dict.

    Reads the module-level ``ref``, ``hashed``, ``hits`` and ``lookups``.
    The two per-block lists are copied, so later mutation of the module
    state does not alter a snapshot that has already been recorded.
    """
    free_blocks = ref.count(0)
    cached_entries = sum(bool(flag) for flag in hashed)
    snapshot = {
        "num_blocks": NUM_BLOCKS,
        "block_size": BLOCK_SIZE,
        "num_free_blocks": free_blocks,
        "num_cached_entries": cached_entries,
        "prefix_cache_hits": hits,
        "prefix_cache_lookups": lookups,
    }
    snapshot["ref_counts"] = ref[:]
    snapshot["hashed"] = hashed[:]
    return snapshot


def seq_view(rid, sid, status, prompt_len, num_gen, num_computed, num_cached, block_table) -> dict:
    """Build the dict describing one sequence inside a snapshot.

    Args:
        rid: Request id the sequence belongs to.
        sid: Sequence id.
        status: Status label stored verbatim (the script uses "prefilling"
            and "running").
        prompt_len: Number of prompt tokens.
        num_gen: Number of tokens generated so far.
        num_computed: Value stored as ``num_computed_tokens``.
        num_cached: Value stored as ``num_cached_prefix_tokens``.
        block_table: Iterable of block ids; copied into a new list.

    Returns:
        A new dict with a fixed key order, safe to serialise with ``json``.
    """
    return dict(
        seq_id=sid,
        request_id=rid,
        status=status,
        prompt_len=prompt_len,
        num_generated=num_gen,
        num_computed_tokens=num_computed,
        num_cached_prefix_tokens=num_cached,
        block_table=[*block_table],
    )


def emit(ev_type: str, payload: dict) -> None:
    """Append one event record to the module-level ``events`` list.

    The record is stamped with the current module-level ``step`` and with
    ``ts`` rounded to three decimal places. ``payload`` is stored by
    reference, not copied.
    """
    record = {
        "type": ev_type,
        "step": step,
        "timestamp": round(ts, 3),
        "payload": payload,
    }
    events.append(record)


def emit_snapshot(running, waiting) -> None:
    """Record a standalone "snapshot" event for the current step.

    Args:
        running: List of sequence views currently running.
        waiting: List of sequence views currently waiting.
    """
    payload = {"step": step, "config": CONFIG}
    payload["block_pool"] = pool_snapshot()
    payload["running"] = running
    payload["waiting"] = waiting
    emit("snapshot", payload)


def emit_step(dur_ms, n_tok, n_pf, n_dec, deltas, running, waiting,
              admitted=None, finished=None, preempted=None):
    """Record a "step" event with scheduling stats and an embedded snapshot.

    Args:
        dur_ms: Step duration stored as ``duration_ms``.
        n_tok: Number of tokens processed in the step.
        n_pf: Number of prefill sequences in the batch.
        n_dec: Number of decode sequences in the batch.
        deltas: Per-request output deltas; a falsy value becomes ``[]``.
        running: Sequence views in the running queue after the step.
        waiting: Sequence views in the waiting queue after the step.
        admitted: Optional list stored as ``newly_admitted``.
        finished: Optional list stored as ``finished``.
        preempted: Optional list stored as ``preempted``.
    """
    # The pool state is captured at call time, so callers must update
    # ref/hashed/hits/lookups before calling.
    state_after = {
        "step": step,
        "config": CONFIG,
        "block_pool": pool_snapshot(),
        "running": running,
        "waiting": waiting,
    }
    payload = {
        "duration_ms": dur_ms,
        "num_tokens": n_tok,
        "num_seqs": n_pf + n_dec,
        "num_prefill_seqs": n_pf,
        "num_decode_seqs": n_dec,
        "deltas": deltas if deltas else [],
        "newly_admitted": admitted if admitted else [],
        "finished": finished if finished else [],
        "preempted": preempted if preempted else [],
        "snapshot": state_after,
    }
    emit("step", payload)


# ---------- script the session ----------

# Initial empty snapshot, recorded while `step` is still 0 and no blocks are
# in use.
emit_snapshot(running=[], waiting=[])
ts += 0.4

# --- request A admitted with a 20-token prompt --------------------------
RID_A = "demo-aaaa1111"
SID_A = 1
PROMPT_A = "Explain paged attention in two sentences. Then explain prefix caching."

# The request event carries the same step number as the step that admits it.
# `prompt_len` is a scripted value; it is not derived from PROMPT_A here.
step = 1
ts += 0.1
emit("request", {"request_id": RID_A, "seq_id": SID_A, "prompt": PROMPT_A,
                 "prompt_len": 20, "max_tokens": 24})

# Step 1: chunked prefill, first chunk (16 tokens → 2 blocks).
# Both blocks are completely filled, so they are marked hashed right away.
ref[0] = ref[1] = 1
hashed[0] = hashed[1] = True
ts += 0.05
running_A_pf1 = [seq_view(RID_A, SID_A, "prefilling", 20, 0, 16, 0, [0, 1])]
emit_step(280, 16, 1, 0, [], [], [running_A_pf1[0]], admitted=[SID_A])
# A sequence that is still prefilling is reported in `waiting`; it moves to
# `running` only once prefill completes. The two assignments below re-apply
# that placement to the event just recorded. They set the same values that
# were already passed to emit_step above (empty running, A in waiting).
events[-1]["payload"]["snapshot"]["running"] = []
events[-1]["payload"]["snapshot"]["waiting"] = running_A_pf1
ts += 0.3

# Step 2: finish prefill (4 more tokens). Need 1 more block.
step = 2
ref[2] = 1  # block 2 holds last 4 tokens (partial), so it is not hashed yet
ts += 0.05
running_A = [seq_view(RID_A, SID_A, "running", 20, 0, 20, 0, [0, 1, 2])]
emit_step(180, 4, 1, 0, [], running_A, [])
ts += 0.2

# Steps 3-7: A decodes 5 tokens. Each step appends one slot; block 2 fills
# at token 24, then block 3 starts.
TOKS_A = [" Paged", " attention", " splits", " keys", " and"]
for n_gen, tok in enumerate(TOKS_A, start=1):
    step += 1
    ts += 0.12
    # Tokens held by A after this step: the 20-token prompt plus everything
    # generated so far. This is also what the view reports as
    # num_computed_tokens.
    total = 20 + n_gen
    # Block table grows when crossing the 24-token (3-block) boundary.
    bt = [0, 1, 2]
    if total > 24:
        bt = [0, 1, 2, 3]
        ref[3] = 1
    # Hash block 2 on the step that fills it (24 tokens held).
    if total >= 24 and not hashed[2]:
        hashed[2] = True
    running_A = [seq_view(RID_A, SID_A, "running", 20, n_gen, total, 0, bt)]
    emit_step(95, 1, 0, 1,
              [{"request_id": RID_A, "new_text": tok, "finished": False, "finish_reason": None}],
              running_A, [])

# --- request B admitted, identical prefix → cache hit -------------------
RID_B = "demo-bbbb2222"
SID_B = 2
PROMPT_B = PROMPT_A   # same → prefix cache hit

# Physical blocks each sequence grows into, in logical order. Blocks 0 and 1
# hold the shared prompt prefix. Block 6 is A's fifth block, needed as soon as
# A holds more than 4 * BLOCK_SIZE = 32 tokens.
A_BLOCKS = [0, 1, 2, 3, 6]
B_BLOCKS = [0, 1, 4, 5]


def _block_table(blocks: list[int], total_tokens: int) -> list[int]:
    """Return the block table covering ``total_tokens`` and update the pool.

    Args:
        blocks: The sequence's physical block ids in logical order.
        total_tokens: Tokens the sequence holds after the current step.

    Returns:
        A new list with the first ``ceil(total_tokens / BLOCK_SIZE)`` ids.

    Side effects: a block in the table whose ref count is 0 is claimed
    (set to 1), and every completely filled block is marked hashed. This is
    the same rule the earlier decode loop applies to block 2.
    """
    num_needed = -(-total_tokens // BLOCK_SIZE)  # ceiling division
    table = blocks[:num_needed]
    for blk in table:
        if ref[blk] == 0:
            ref[blk] = 1
    for blk in blocks[:total_tokens // BLOCK_SIZE]:
        hashed[blk] = True
    return table


# The request event shares its step number with the step that admits it,
# exactly as for request A, so `step` is advanced only once here.
step += 1
ts += 0.6
emit("request", {"request_id": RID_B, "seq_id": SID_B, "prompt": PROMPT_B,
                 "prompt_len": 20, "max_tokens": 24})

ts += 0.05
# Hit lookups: 2 full blocks of B's prompt (block_size=8 → 16/8=2 blocks).
# Both hit because A's first 2 blocks are hashed (and currently in use).
lookups += 2
hits += 2
ref[0] += 1
ref[1] += 1
# B needs a 3rd block (partial, 4 tokens). Fresh: block 4.
ref[4] = 1
running_B_pf = [seq_view(RID_B, SID_B, "prefilling", 20, 0, 16, 16, [0, 1, 4])]
# A is still running and decodes " values" in this step, so its counters
# advance by one past the five tokens produced by the earlier decode loop.
step_A_n_gen = len(TOKS_A)
a_gen = step_A_n_gen + 1
running_A_now = [seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0,
                          _block_table(A_BLOCKS, 20 + a_gen))]
# NOTE(review): num_tokens is 5 both here and in the next step, although B's
# view only reaches 16 (all cached) computed tokens in this one. Left as
# scripted; confirm the intended token accounting against the real engine.
emit_step(140, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " values", "finished": False, "finish_reason": None}],
          [running_A_now[0]], [running_B_pf[0]], admitted=[SID_B])
ts += 0.12

# Step: B finishes prefill (4 tokens), A decodes one more.
step += 1
a_gen = step_A_n_gen + 2
running_A_now = [seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0,
                          _block_table(A_BLOCKS, 20 + a_gen))]
running_B = [seq_view(RID_B, SID_B, "running", 20, 0, 20, 16,
                      _block_table(B_BLOCKS, 20))]
emit_step(110, 5, 1, 1,
          [{"request_id": RID_A, "new_text": " into", "finished": False, "finish_reason": None}],
          [running_A_now[0], running_B[0]], [])
ts += 0.12

# Steps: both decode in lock-step for a few rounds.
# A fills block 3 at 32 tokens and spills into block 6 at 33.
# B fills block 4 at 24 tokens and spills into block 5 at 25.
A_tokens = [" small", ",", " fixed", "-size", " blocks", " stored", " in"]
B_tokens = [" Prefix", " caching", " reuses", " those", " blocks", " across", " requests"]
for i, (tok_a, tok_b) in enumerate(zip(A_tokens, B_tokens)):
    step += 1
    ts += 0.10
    a_gen = step_A_n_gen + 3 + i   # A already generated 5 + " values" + " into"
    b_gen = i + 1
    a_bt = _block_table(A_BLOCKS, 20 + a_gen)
    b_bt = _block_table(B_BLOCKS, 20 + b_gen)
    seq_a = seq_view(RID_A, SID_A, "running", 20, a_gen, 20 + a_gen, 0, a_bt)
    seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, 20 + b_gen, 16, b_bt)
    emit_step(105, 2, 0, 2,
              [{"request_id": RID_A, "new_text": tok_a, "finished": False, "finish_reason": None},
               {"request_id": RID_B, "new_text": tok_b, "finished": False, "finish_reason": None}],
              [seq_a, seq_b], [])

# A finishes (hits EOS or max_tokens).
step += 1
ts += 0.10
# A releases every block in its table. Blocks 0 and 1 are still held by B, so
# their ref count drops to 1. Blocks 2 and 3 are full and hashed, so they
# become freed-but-cached. Block 6 is partial and unhashed, so it is simply
# freed.
for blk in seq_a["block_table"]:
    ref[blk] -= 1
# B emits no token in this step, so its view keeps the counters it had at the
# end of the lock-step loop.
seq_b = seq_view(RID_B, SID_B, "running", 20, b_gen, 20 + b_gen, 16,
                 seq_b["block_table"])
emit_step(85, 1, 0, 1,
          [{"request_id": RID_A, "new_text": " the GPU.", "finished": True, "finish_reason": "stop"}],
          [seq_b], [], finished=[RID_A])

# B finishes too.
step += 1
ts += 0.10
for blk in seq_b["block_table"]:
    ref[blk] -= 1
# NOTE(review): block 5 is only partially filled when B finishes. It is still
# marked hashed, as the original script did, presumably to make the final
# cache view richer for the demo.
hashed[5] = True
emit_step(80, 1, 0, 1,
          [{"request_id": RID_B, "new_text": " for the same prompts.", "finished": True, "finish_reason": "stop"}],
          [], [], finished=[RID_B])


# ---------- emit ----------

# Write one compact JSON object per line (JSONL) to stdout.
out = sys.stdout
out.writelines(
    json.dumps(ev, separators=(",", ":")) + "\n"
    for ev in events
)