Spaces:

enCoder
/

tiny-vllm

Running

App Files Files Community

tiny-vllm / examples /smoke_client.py

enCoder

minimal continuous-batching LLM engine

c32c359 12 days ago

raw

history blame contribute delete

2.2 kB

	"""Fire concurrent prompts at a running tiny_vllm server.

	Run the server first:
	python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct

	Then in another shell:
	python examples/smoke_client.py
	"""
	from __future__ import annotations

	import argparse
	import asyncio
	import json
	import time

	import httpx


	# Prompts used by main(): every entry is sent concurrently in each round, and
	# the first entry alone is repeated three times when --prefix-demo is given.
	PROMPTS = [
	    "Write a haiku about paged attention.",
	    "Explain GQA in one paragraph.",
	    "What is continuous batching, briefly?",
	    "List three uses of prefix caching.",
	]


	async def one(client: httpx.AsyncClient, prompt: str, idx: int) -> tuple[str, float]:
	    """Stream a single completion for ``prompt`` and return ``(text, seconds)``.

	    Sends a streaming POST to ``/generate`` and reads the response line by
	    line. Lines that do not begin with ``"data: "`` are skipped. The rest of
	    each matching line is either the literal ``[DONE]``, which ends the read,
	    or a JSON object: a truthy ``text`` value is appended to the output and a
	    truthy ``finished`` value ends the read.

	    Args:
	        client: HTTP client used to open the streaming request.
	        prompt: Prompt text placed in the request body.
	        idx: Label printed in front of the request and response log lines.

	    Returns:
	        The concatenated generated text and the elapsed time in seconds,
	        measured with ``time.monotonic`` from just before the request is
	        logged until the stream has been consumed.

	    Raises:
	        Propagates whatever ``resp.raise_for_status()`` raises, and any
	        ``json.loads`` error for a payload that is not valid JSON.
	    """
	    started = time.monotonic()
	    print(f"[{idx}] >> {prompt!r}")

	    marker = "data: "
	    request_body = {
	        "prompt": prompt,
	        "max_tokens": 48,
	        "temperature": 0.7,
	        "top_p": 0.9,
	        "stream": True,
	    }
	    pieces: list[str] = []

	    # timeout=None is forwarded as-is; presumably this keeps a slow
	    # generation from being cut off client-side (see httpx timeout docs).
	    stream_ctx = client.stream("POST", "/generate", json=request_body, timeout=None)
	    async with stream_ctx as resp:
	        resp.raise_for_status()
	        async for line in resp.aiter_lines():
	            if not line.startswith(marker):
	                continue
	            payload = line[len(marker):]
	            if payload == "[DONE]":
	                break
	            event = json.loads(payload)
	            piece = event.get("text")
	            if piece:
	                pieces.append(piece)
	            if event.get("finished"):
	                break

	    elapsed = time.monotonic() - started
	    text = "".join(pieces)
	    print(f"[{idx}] << ({elapsed:.2f}s) {text}")
	    return text, elapsed


	async def main() -> None:
	    """Parse command-line options and drive the smoke test.

	    Options:
	        --base-url: server address handed to the HTTP client
	            (default ``http://127.0.0.1:8000``).
	        --rounds: how many times the full prompt list is sent (default 1).
	        --prefix-demo: send the first prompt three times, one after another,
	            instead of running the rounds.

	    In the normal mode every prompt of a round is issued concurrently and
	    the round is awaited as a whole before the next one starts. Request
	    labels keep increasing across rounds.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--base-url", default="http://127.0.0.1:8000")
	    parser.add_argument("--rounds", type=int, default=1)
	    parser.add_argument(
	        "--prefix-demo",
	        action="store_true",
	        help="send same prompt 3x to show prefix cache speedup",
	    )
	    opts = parser.parse_args()

	    async with httpx.AsyncClient(base_url=opts.base_url) as client:
	        if not opts.prefix_demo:
	            for round_no in range(opts.rounds):
	                # Offset the labels so each round continues the numbering.
	                offset = round_no * len(PROMPTS)
	                pending = [
	                    one(client, text, offset + pos)
	                    for pos, text in enumerate(PROMPTS)
	                ]
	                await asyncio.gather(*pending)
	        else:
	            repeated = PROMPTS[0]
	            # Sequential on purpose: each request finishes before the next starts.
	            for attempt in range(3):
	                await one(client, repeated, attempt)


	# Run the client only when this file is executed as a script.
	if __name__ == "__main__":
	    asyncio.run(main())