Spaces:

BeardedAmbivert
/

inference-server

Sleeping

Aditya Kulkarni

feat: add dynamic batching and benchmark script

046db3f 3 months ago

2.88 kB

	"""Benchmark script: compare sequential vs concurrent request throughput.

	Usage:
	    1. Start the server first: uv run uvicorn app.main:app
	    2. Then run the benchmark: uv run python scripts/bench.py

	This script sends N requests both sequentially and concurrently.
	Sequential = no batching benefit (one at a time).
	Concurrent = requests arrive together, batcher can group them.
	Compare the two to measure batching impact.
	"""

	import asyncio
	import time
	import httpx
	import math

	# Endpoint that every benchmark request is POSTed to.
	URL = "http://localhost:8000/predict"
	# JSON body sent with each request.
	PAYLOAD = {"texts": ["hello world", "benchmark test"]}
	# Number of requests issued in each mode (sequential and concurrent).
	N_REQUESTS = 20


	async def send_one(client: httpx.AsyncClient) -> float:
	    """Send a single POST request and return its latency in seconds.

	    Args:
	        client: Open ``httpx.AsyncClient`` used to issue the request.

	    Returns:
	        Seconds elapsed between calling ``client.post`` and its return.

	    Raises:
	        httpx.HTTPStatusError: If the server replies with an error status,
	            so failed requests are never reported as valid measurements.
	    """
	    # perf_counter() is monotonic and high-resolution; time.time() can jump
	    # when the system clock is adjusted and is coarse on some platforms.
	    started = time.perf_counter()
	    response = await client.post(URL, json=PAYLOAD)
	    elapsed = time.perf_counter() - started
	    # Checked after the clock stops so validation does not inflate latency.
	    response.raise_for_status()
	    return elapsed


	async def run_sequential(n: int) -> dict:
	    """Send n requests one after another and summarize the timings.

	    Args:
	        n: Number of requests to send.

	    Returns:
	        A dict with ``total_time`` (wall-clock seconds for the whole run),
	        ``avg_latency`` (mean per-request seconds), ``rps`` (requests per
	        second) and ``latencies`` (per-request seconds, in send order).
	        ``avg_latency`` and ``rps`` are 0.0 when nothing was sent.
	    """
	    async with httpx.AsyncClient() as client:
	        # Wall-clock total, measured the same way as run_concurrent, so the
	        # two totals (and the speedup derived from them) are comparable.
	        started = time.perf_counter()
	        # Each await completes before the next request is issued.
	        latencies = [await send_one(client) for _ in range(n)]
	        total_time = time.perf_counter() - started

	    return {
	        "total_time": total_time,
	        # Guard the divisions so n == 0 does not raise ZeroDivisionError.
	        "avg_latency": sum(latencies) / len(latencies) if latencies else 0.0,
	        "rps": len(latencies) / total_time if latencies and total_time > 0 else 0.0,
	        "latencies": latencies,
	    }


	async def run_concurrent(n: int) -> dict:
	    """Send n requests all at once using asyncio.gather and summarize timings.

	    Args:
	        n: Number of requests to send.

	    Returns:
	        A dict with ``total_time`` (wall-clock seconds for the whole batch),
	        ``avg_latency`` (mean per-request seconds), ``rps`` (requests per
	        second) and ``latencies`` (per-request seconds, in submission order).
	        ``avg_latency`` and ``rps`` are 0.0 when nothing was sent.
	    """
	    async with httpx.AsyncClient() as client:
	        # perf_counter() is monotonic, so the measured span cannot be skewed
	        # by system clock adjustments the way time.time() can.
	        started = time.perf_counter()
	        pending = [send_one(client) for _ in range(n)]
	        latencies = list(await asyncio.gather(*pending))
	        total_time = time.perf_counter() - started

	    return {
	        "total_time": total_time,
	        # Guard the divisions so n == 0 does not raise ZeroDivisionError.
	        "avg_latency": sum(latencies) / len(latencies) if latencies else 0.0,
	        "rps": len(latencies) / total_time if latencies and total_time > 0 else 0.0,
	        "latencies": latencies,
	    }


	def percentile(latencies: list[float], p: int) -> float:
	    """Return the p-th percentile of latencies using the nearest-rank method.

	    Args:
	        latencies: Non-empty list of measurements; it is not modified.
	        p: Percentile to compute, from 0 to 100 inclusive.

	    Returns:
	        The smallest value such that at least p percent of the measurements
	        are less than or equal to it; the minimum when p is 0.

	    Raises:
	        ValueError: If latencies is empty or p is outside 0..100.
	    """
	    if not latencies:
	        raise ValueError("percentile() requires at least one latency")
	    if not 0 <= p <= 100:
	        raise ValueError(f"p must be between 0 and 100, got {p}")

	    ordered = sorted(latencies)
	    count = len(ordered)
	    # Multiply before dividing: (p / 100) * count can overshoot an exact
	    # integer (0.07 * 100 == 7.000000000000001), which would make ceil()
	    # select one rank too high.
	    rank = math.ceil(p * count / 100)
	    # Ranks are 1-based; p == 0 yields rank 0, which must map to the minimum
	    # instead of wrapping around to the last element through index -1.
	    return ordered[max(rank, 1) - 1]


	async def main():
	    """Benchmark both request modes and print a side-by-side summary table."""
	    print(f"Benchmarking with {N_REQUESTS} requests...\n")

	    # The two modes run back to back against the same endpoint.
	    print("Running sequential...")
	    sequential = await run_sequential(N_REQUESTS)
	    print("Running concurrent...")
	    concurrent = await run_concurrent(N_REQUESTS)

	    header = f"{'Mode':<12} {'Total (s)':>10} {'Avg (s)':>10} {'p50 (s)':>10} {'p95 (s)':>10} {'p99 (s)':>10} {'RPS':>8}"
	    divider = "-" * len(header)
	    print(f"\n{header}")
	    print(divider)

	    # One table row per mode, in the order the modes were run.
	    results = {"Sequential": sequential, "Concurrent": concurrent}
	    for label, r in results.items():
	        lats = r["latencies"]
	        row = (
	            f"{label:<12} {r['total_time']:>10.3f} {r['avg_latency']:>10.3f} "
	            f"{percentile(lats, 50):>10.3f} {percentile(lats, 95):>10.3f} "
	            f"{percentile(lats, 99):>10.3f} {r['rps']:>8.1f}"
	        )
	        print(row)

	    # Speedup is the sequential total time divided by the concurrent one.
	    speedup = sequential["total_time"] / concurrent["total_time"]
	    print(f"\nSpeedup: {speedup:.2f}x")



	# Run the benchmark only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    asyncio.run(main())