Spaces:
Sleeping
Sleeping
File size: 2,880 Bytes
"""Benchmark script: compare sequential vs concurrent request throughput.

Usage:
    Start the server first: uv run uvicorn app.main:app
    Then run: uv run python scripts/bench.py

This script sends N requests both sequentially and concurrently.
    Sequential = no batching benefit (one at a time).
    Concurrent = requests arrive together, so the batcher can group them.

Compare the two to measure batching impact.
"""
import asyncio
import time
import httpx
import math
# Endpoint that send_one() POSTs to.
URL = "http://localhost:8000/predict"
# JSON body sent unchanged with every benchmark request.
PAYLOAD = {"texts": ["hello world", "benchmark test"]}
# Number of requests main() passes to each benchmark mode.
N_REQUESTS = 20
async def send_one(client: httpx.AsyncClient) -> float:
    """Send a single POST request and return its latency in seconds.

    Args:
        client: Open ``httpx.AsyncClient`` used to issue the request.

    Returns:
        Seconds elapsed between issuing the POST of ``PAYLOAD`` to ``URL``
        and the awaited call returning.

    Note:
        The response status is not inspected, so an error response is
        timed exactly like a successful one.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock, which can be adjusted while a request is in flight
    # and would then yield a skewed (even negative) interval.
    started = time.perf_counter()
    await client.post(URL, json=PAYLOAD)
    return time.perf_counter() - started
async def run_sequential(n: int) -> dict:
    """Send n requests one after another and summarize the timings.

    Args:
        n: Number of requests to send. Values <= 0 send nothing.

    Returns:
        A dict with:
            "total_time": wall-clock seconds for the whole run,
            "avg_latency": mean per-request latency (0.0 if no requests),
            "rps": completed requests per second of total_time
                (0.0 if total_time is zero),
            "latencies": list of per-request latencies in seconds.
    """
    async with httpx.AsyncClient() as client:
        # Time the whole run on the monotonic clock, the same way
        # run_concurrent() does, so the two totals are directly comparable.
        started = time.perf_counter()
        # Awaiting inside the comprehension keeps the requests strictly
        # one at a time: the next one starts only after the previous returns.
        latencies = [await send_one(client) for _ in range(n)]
        total_time = time.perf_counter() - started

    count = len(latencies)
    return {
        "total_time": total_time,
        # Guard both divisions so n <= 0 yields zeros instead of raising
        # ZeroDivisionError.
        "avg_latency": sum(latencies) / count if count else 0.0,
        "rps": count / total_time if total_time > 0 else 0.0,
        "latencies": latencies,
    }
async def run_concurrent(n: int) -> dict:
    """Send n requests all at once using asyncio.gather and summarize them.

    Args:
        n: Number of requests to send. Values <= 0 send nothing.

    Returns:
        A dict with:
            "total_time": wall-clock seconds until every request finished,
            "avg_latency": mean per-request latency (0.0 if no requests),
            "rps": completed requests per second of total_time
                (0.0 if total_time is zero),
            "latencies": list of per-request latencies in seconds.
    """
    async with httpx.AsyncClient() as client:
        # Monotonic clock: immune to wall-clock adjustments during the run.
        started = time.perf_counter()
        # gather() schedules every send_one() coroutine before awaiting any
        # of them, so all requests are in flight together on one client.
        latencies = list(
            await asyncio.gather(*(send_one(client) for _ in range(n)))
        )
        total_time = time.perf_counter() - started

    count = len(latencies)
    return {
        "total_time": total_time,
        # Guard both divisions so n <= 0 yields zeros instead of raising
        # ZeroDivisionError.
        "avg_latency": sum(latencies) / count if count else 0.0,
        "rps": count / total_time if total_time > 0 else 0.0,
        "latencies": latencies,
    }
def percentile(latencies: list[float], p: int) -> float:
    """Return the p-th percentile of latencies using the nearest-rank method.

    The result is always one of the input samples: the value at 1-based
    rank ``ceil(p / 100 * len(latencies))`` of the sorted data.

    Args:
        latencies: Samples to rank; the list itself is not modified.
        p: Percentile to compute, expected in the range 0..100. Values
            at or below 0 yield the smallest sample.

    Returns:
        The sample at the requested percentile.

    Raises:
        IndexError: If latencies is empty, or p is large enough that the
            rank exceeds the number of samples.
    """
    ordered = sorted(latencies)
    # Multiply before dividing: for integer p the product is exact, whereas
    # (p / 100) * len can overshoot an integer (0.07 * 100 evaluates to
    # 7.000000000000001) and make ceil() select the next sample up.
    rank = math.ceil(p * len(ordered) / 100)
    # Ranks are 1-based; clamp to 1 so p == 0 returns the minimum instead
    # of wrapping around to index -1, which is the maximum.
    return ordered[max(rank, 1) - 1]
async def main():
    """Benchmark both modes and print a comparison table plus the speedup.

    Sends N_REQUESTS requests sequentially, then the same number
    concurrently, and prints one row per mode with total time, average
    latency, p50/p95/p99 latency and requests per second. The final line
    is the ratio of sequential to concurrent total time.
    """
    print(f"Benchmarking with {N_REQUESTS} requests...\n")

    # The sequential run completes before the concurrent one begins.
    print("Running sequential...")
    sequential = await run_sequential(N_REQUESTS)
    print("Running concurrent...")
    concurrent = await run_concurrent(N_REQUESTS)

    header = f"{'Mode':<12} {'Total (s)':>10} {'Avg (s)':>10} {'p50 (s)':>10} {'p95 (s)':>10} {'p99 (s)':>10} {'RPS':>8}"
    print(f"\n{header}")
    print("-" * len(header))

    results = {"Sequential": sequential, "Concurrent": concurrent}
    for label, r in results.items():
        # Same evaluation order as the table columns: p50, p95, p99.
        p50, p95, p99 = [percentile(r["latencies"], q) for q in (50, 95, 99)]
        print(
            f"{label:<12} {r['total_time']:>10.3f} {r['avg_latency']:>10.3f} "
            f"{p50:>10.3f} {p95:>10.3f} "
            f"{p99:>10.3f} {r['rps']:>8.1f}"
        )

    print(f"\nSpeedup: {sequential['total_time'] / concurrent['total_time']:.2f}x")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    # asyncio.run() drives the main() coroutine to completion.
    asyncio.run(main())
|