# Aditya Kulkarni
# feat: add dynamic batching and benchmark script
# 046db3f
"""Benchmark script: compare sequential vs concurrent request throughput.

Usage:
    Start the server first:  uv run uvicorn app.main:app
    Then run the benchmark:  uv run python scripts/bench.py

This script sends N requests both sequentially and concurrently:
    Sequential = no batching benefit (one request at a time).
    Concurrent = requests arrive together, so the batcher can group them.

Compare the two results to measure the impact of batching.
"""
import asyncio
import time
import httpx
import math
# Endpoint that every benchmark request is POSTed to.
URL = "http://localhost:8000/predict"
# JSON body sent with each request (presumably matches the /predict input
# schema -- confirm against the server).
PAYLOAD = {"texts": ["hello world", "benchmark test"]}
# Number of requests issued in each of the two benchmark modes.
N_REQUESTS = 20
async def send_one(client: httpx.AsyncClient) -> float:
    """Send a single POST request and return its latency in seconds.

    Args:
        client: Open ``httpx.AsyncClient`` used to issue the request.

    Returns:
        Seconds elapsed between issuing the request and receiving the
        response.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted and is coarse on some platforms.
    started = time.perf_counter()
    response = await client.post(URL, json=PAYLOAD)
    elapsed = time.perf_counter() - started
    # An error response must not be counted as a valid latency sample.
    response.raise_for_status()
    return elapsed
async def run_sequential(n: int) -> dict:
    """Send ``n`` requests one after another over a single client.

    Args:
        n: Number of requests to send; must be at least 1.

    Returns:
        A dict with ``total_time`` (wall-clock seconds for the whole run),
        ``avg_latency`` (mean per-request seconds), ``rps`` (requests per
        second) and ``latencies`` (per-request seconds, in send order).

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"n must be a positive integer, got {n}")

    async with httpx.AsyncClient() as client:
        # Time the whole loop with a wall clock, the same way run_concurrent
        # does, so the two total_time values are directly comparable.
        start = time.perf_counter()
        latencies = []
        for _ in range(n):
            latencies.append(await send_one(client))
        total_time = time.perf_counter() - start

    return {
        "total_time": total_time,
        "avg_latency": sum(latencies) / n,
        "rps": n / total_time,
        "latencies": latencies,
    }
async def run_concurrent(n: int) -> dict:
    """Send ``n`` requests all at once using ``asyncio.gather``.

    Args:
        n: Number of requests to send; must be at least 1.

    Returns:
        A dict with ``total_time`` (wall-clock seconds until every request
        has completed), ``avg_latency`` (mean per-request seconds), ``rps``
        (requests per second) and ``latencies`` (per-request seconds, in the
        order the requests were created).

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"n must be a positive integer, got {n}")

    async with httpx.AsyncClient() as client:
        # perf_counter() is monotonic, so the measurement cannot be skewed
        # by system clock adjustments during the run.
        start = time.perf_counter()
        latencies = list(
            await asyncio.gather(*(send_one(client) for _ in range(n)))
        )
        total_time = time.perf_counter() - start

    return {
        "total_time": total_time,
        "avg_latency": sum(latencies) / n,
        "rps": n / total_time,
        "latencies": latencies,
    }
def percentile(latencies: list[float], p: int) -> float:
    """Return the p-th percentile of ``latencies`` (nearest-rank method).

    Args:
        latencies: Samples to summarise; need not be sorted and is not
            modified.
        p: Percentile to compute, between 0 and 100 inclusive.

    Returns:
        The smallest sample such that at least ``p`` percent of the samples
        are less than or equal to it; ``p=0`` yields the minimum.

    Raises:
        ValueError: If ``latencies`` is empty or ``p`` is outside 0..100.
    """
    if not latencies:
        raise ValueError("cannot compute a percentile of an empty list")
    if not 0 <= p <= 100:
        raise ValueError(f"p must be between 0 and 100, got {p}")

    ordered = sorted(latencies)
    # Multiply before dividing: (p / 100) * n picks up float error
    # (0.07 * 100 == 7.000000000000001), which ceil() would turn into an
    # off-by-one rank.
    rank = math.ceil(p * len(ordered) / 100)
    # rank is 1-based; clamp it so p=0 selects the minimum instead of
    # wrapping around to index -1 (the maximum).
    return ordered[max(rank, 1) - 1]
async def main():
    """Execute the sequential and concurrent runs, then print a results table."""
    print(f"Benchmarking with {N_REQUESTS} requests...\n")

    print("Running sequential...")
    sequential_stats = await run_sequential(N_REQUESTS)
    print("Running concurrent...")
    concurrent_stats = await run_concurrent(N_REQUESTS)

    # Column widths in the header mirror the widths used for each row below.
    header = f"{'Mode':<12} {'Total (s)':>10} {'Avg (s)':>10} {'p50 (s)':>10} {'p95 (s)':>10} {'p99 (s)':>10} {'RPS':>8}"
    print(f"\n{header}")
    print("-" * len(header))

    results = {"Sequential": sequential_stats, "Concurrent": concurrent_stats}
    for mode, stats in results.items():
        samples = stats["latencies"]
        p50, p95, p99 = (percentile(samples, q) for q in (50, 95, 99))
        print(
            f"{mode:<12} {stats['total_time']:>10.3f} {stats['avg_latency']:>10.3f} "
            f"{p50:>10.3f} {p95:>10.3f} "
            f"{p99:>10.3f} {stats['rps']:>8.1f}"
        )

    # How many times faster the concurrent run finished than the sequential one.
    ratio = sequential_stats["total_time"] / concurrent_stats["total_time"]
    print(f"\nSpeedup: {ratio:.2f}x")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())