tiny-vllm / examples /smoke_client.py
enCoder's picture
minimal continuous-batching LLM engine
c32c359
"""Fire concurrent prompts at a running tiny_vllm server.

Run the server first:

    python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct

Then, in another shell:

    python examples/smoke_client.py
"""
from __future__ import annotations
import argparse
import asyncio
import json
import time
import httpx
# Prompts fired concurrently in every round by main(); the first entry is
# also the one that --prefix-demo sends repeatedly.
PROMPTS = [
    "Write a haiku about paged attention.",
    "Explain GQA in one paragraph.",
    "What is continuous batching, briefly?",
    "List three uses of prefix caching.",
]
async def one(client: httpx.AsyncClient, prompt: str, idx: int) -> tuple[str, float]:
    """Stream a single completion from ``POST /generate`` and time it.

    The response is consumed as a server-sent-event style stream: only lines
    carrying a ``data:`` field are looked at.  Each payload is either the
    literal ``[DONE]`` sentinel or a JSON object whose optional ``"text"``
    value is appended to the output and whose truthy ``"finished"`` value
    ends the stream.

    Args:
        client: HTTP client whose ``stream()`` is used to issue the request.
        prompt: Prompt text sent in the request body.
        idx: Label used only to tag the two progress lines printed to stdout.

    Returns:
        ``(text, seconds)`` -- the concatenated generated text and the
        wall-clock duration of the request measured with ``time.monotonic``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status, and
        ``json.JSONDecodeError`` if a ``data:`` payload is not valid JSON.
    """
    t0 = time.monotonic()
    print(f"[{idx}] >> {prompt!r}")
    text_parts: list[str] = []
    async with client.stream(
        "POST", "/generate",
        json={"prompt": prompt, "max_tokens": 48, "temperature": 0.7, "top_p": 0.9, "stream": True},
        # No client-side timeout -- presumably so a slow generation is not
        # cut off mid-stream.
        timeout=None,
    ) as resp:
        resp.raise_for_status()
        async for raw in resp.aiter_lines():
            # Skip anything that is not a data field (blank keep-alives,
            # ":" comments, other SSE fields).
            if not raw.startswith("data:"):
                continue
            # The single space after "data:" is optional in the event-stream
            # format; strip at most one so "data:{...}" is not silently lost.
            data = raw[5:]
            if data.startswith(" "):
                data = data[1:]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            if chunk.get("text"):
                text_parts.append(chunk["text"])
            if chunk.get("finished"):
                break
    dt = time.monotonic() - t0
    text = "".join(text_parts)
    print(f"[{idx}] << ({dt:.2f}s) {text}")
    return text, dt
async def main() -> None:
    """Parse command-line flags and drive the smoke test.

    Flags:
        --base-url     Server address handed to ``httpx.AsyncClient``.
        --rounds       How many times the whole ``PROMPTS`` list is fired.
        --prefix-demo  Send the first prompt three times, one after another,
                       instead of running the concurrent rounds.

    In the default mode every prompt of a round is issued concurrently via
    ``asyncio.gather``; request ids keep increasing across rounds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default="http://127.0.0.1:8000")
    parser.add_argument("--rounds", type=int, default=1)
    parser.add_argument(
        "--prefix-demo",
        action="store_true",
        help="send same prompt 3x to show prefix cache speedup",
    )
    args = parser.parse_args()

    async with httpx.AsyncClient(base_url=args.base_url) as client:
        if args.prefix_demo:
            # Sequential on purpose: each request completes before the next
            # identical one is sent.
            repeated = PROMPTS[0]
            for attempt in range(3):
                await one(client, repeated, attempt)
            return

        for round_no in range(args.rounds):
            # Offset the ids so they stay unique from one round to the next.
            first_id = round_no * len(PROMPTS)
            await asyncio.gather(
                *(
                    one(client, text, first_id + pos)
                    for pos, text in enumerate(PROMPTS)
                )
            )


if __name__ == "__main__":
    asyncio.run(main())