"""Fire concurrent prompts at a running tiny_vllm server.

Run the server first:
    python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct

Then in another shell:
    python examples/smoke_client.py
"""
| from __future__ import annotations |
|
|
| import argparse |
| import asyncio |
| import json |
| import time |
|
|
| import httpx |
|
|
|
|
# Prompts sent to the server. A normal round issues all of them concurrently;
# --prefix-demo resends only the first one.
PROMPTS = [
    "Write a haiku about paged attention.",
    "Explain GQA in one paragraph.",
    "What is continuous batching, briefly?",
    "List three uses of prefix caching.",
]
|
|
|
|
async def one(client: httpx.AsyncClient, prompt: str, idx: int) -> tuple[str, float]:
    """Stream one completion from ``/generate`` and return its text and latency.

    Posts ``prompt`` with fixed sampling settings and ``stream=True``, then
    reads the response line by line, collecting the ``text`` field of every
    JSON chunk carried on a ``data:`` line. Reading stops at a ``[DONE]``
    sentinel, at a chunk whose ``finished`` field is truthy, or when the
    response body ends, whichever comes first.

    Args:
        client: HTTP client used to issue the request. The request path is
            relative, so the client needs a base URL.
        prompt: Prompt text to send.
        idx: Label printed with the request and response lines so interleaved
            output from concurrent calls can be told apart.

    Returns:
        ``(text, seconds)``: the concatenated generated text and the elapsed
        time measured with ``time.monotonic``.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        json.JSONDecodeError: If a ``data:`` payload is not valid JSON.
    """
    started = time.monotonic()
    print(f"[{idx}] >> {prompt!r}")

    payload = {
        "prompt": prompt,
        "max_tokens": 48,
        "temperature": 0.7,
        "top_p": 0.9,
        "stream": True,
    }
    pieces: list[str] = []
    # timeout=None turns off the client's timeouts for this request, so a slow
    # generation is never cut off (a hung server will also block forever).
    async with client.stream("POST", "/generate", json=payload, timeout=None) as resp:
        resp.raise_for_status()
        async for line in resp.aiter_lines():
            # Skip anything that is not a data field (blank separators,
            # comments, other event-stream fields).
            if not line.startswith("data:"):
                continue
            # The event-stream format allows, but does not require, a single
            # space after the colon; accept both "data: x" and "data:x".
            data = line[5:]
            if data.startswith(" "):
                data = data[1:]
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            if chunk.get("text"):
                pieces.append(chunk["text"])
            if chunk.get("finished"):
                break

    elapsed = time.monotonic() - started
    text = "".join(pieces)
    print(f"[{idx}] << ({elapsed:.2f}s) {text}")
    return text, elapsed
|
|
|
|
async def main() -> None:
    """Parse command-line options and send prompts to the server.

    Options:
        --base-url: Root URL of the server (default ``http://127.0.0.1:8000``).
        --rounds: Number of times the full ``PROMPTS`` batch is sent
            (default 1). Within a round all prompts are in flight at once;
            rounds run one after another.
        --prefix-demo: Send ``PROMPTS[0]`` three times, one request at a
            time, then exit. ``--rounds`` is ignored in this mode.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default="http://127.0.0.1:8000")
    parser.add_argument("--rounds", type=int, default=1)
    parser.add_argument(
        "--prefix-demo",
        action="store_true",
        help="send same prompt 3x to show prefix cache speedup",
    )
    args = parser.parse_args()

    async with httpx.AsyncClient(base_url=args.base_url) as client:
        if args.prefix_demo:
            # Requests are awaited one by one, presumably so each repeat can
            # benefit from whatever the previous one left cached on the server;
            # the timings printed by one() are what the user compares.
            prompt = PROMPTS[0]
            for attempt in range(3):
                await one(client, prompt, attempt)
            return

        batch_size = len(PROMPTS)
        for round_no in range(args.rounds):
            # Offset the labels by the round so every request in the run gets
            # a distinct index in the printed output.
            first_idx = round_no * batch_size
            await asyncio.gather(
                *(
                    one(client, prompt, first_idx + offset)
                    for offset, prompt in enumerate(PROMPTS)
                )
            )
|
|
|
|
if __name__ == "__main__":
    # Start the event loop only when run as a script, not when imported.
    asyncio.run(main())
|
|