Spaces:

BeardedAmbivert
/

inference-server

Sleeping

File size: 2,880 Bytes

046db3f

"""Benchmark script: compare sequential vs concurrent request throughput.

Usage:
    Start the server first: uv run uvicorn app.main:app
    Then run: uv run python scripts/bench.py

This script sends N requests both sequentially and concurrently.
Sequential = no batching benefit (one at a time).
Concurrent = requests arrive together, batcher can group them.
Compare the two to measure batching impact.
"""

import asyncio
import time
import httpx
import math

URL = "http://localhost:8000/predict"
PAYLOAD = {"texts": ["hello world", "benchmark test"]}
N_REQUESTS = 20


async def send_one(client: httpx.AsyncClient) -> float:
    """Send a single POST request, return the response time in seconds."""
    start_time = time.time()
    await client.post(URL, json=PAYLOAD)
    elapsed_time = time.time() - start_time
    return elapsed_time


async def run_sequential(n: int) -> dict:
    """Send n requests one after another."""
    async with httpx.AsyncClient() as client:
        latencies = []
        for _ in range(n):
            latencies.append(await send_one(client))

    total_time = sum(latencies)
    return {
        "total_time": total_time,
        "avg_latency": total_time / n,
        "rps": n / total_time,
        "latencies": latencies,
    }


async def run_concurrent(n: int) -> dict:
    """Send n requests all at once using asyncio.gather."""
    async with httpx.AsyncClient() as client:
        start = time.time()
        latencies = list(await asyncio.gather(*(send_one(client) for _ in range(n))))
        total_time = time.time() - start

    return {
        "total_time": total_time,
        "avg_latency": sum(latencies) / n,
        "rps": n / total_time,
        "latencies": latencies,
    }


def percentile(latencies: list[float], p: int) -> float:
    """Calculate the p-th percentile from a list of latencies."""
    return sorted(latencies)[math.ceil((p/100) * len(latencies)) - 1]


async def main():
    """Run both benchmarks and print comparison."""
    print(f"Benchmarking with {N_REQUESTS} requests...\n")

    print("Running sequential...")
    seq = await run_sequential(N_REQUESTS)

    print("Running concurrent...")
    con = await run_concurrent(N_REQUESTS)

    header = f"{'Mode':<12} {'Total (s)':>10} {'Avg (s)':>10} {'p50 (s)':>10} {'p95 (s)':>10} {'p99 (s)':>10} {'RPS':>8}"
    print(f"\n{header}")
    print("-" * len(header))

    for label, r in [("Sequential", seq), ("Concurrent", con)]:
        lats = r["latencies"]
        print(
            f"{label:<12} {r['total_time']:>10.3f} {r['avg_latency']:>10.3f} "
            f"{percentile(lats, 50):>10.3f} {percentile(lats, 95):>10.3f} "
            f"{percentile(lats, 99):>10.3f} {r['rps']:>8.1f}"
        )

    speedup = seq["total_time"] / con["total_time"]
    print(f"\nSpeedup: {speedup:.2f}x")



if __name__ == "__main__":
    asyncio.run(main())