File size: 2,497 Bytes
3c0d3e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import concurrent.futures as cf
import json
import statistics
import time
from pathlib import Path

import requests


def call(url: str) -> tuple[float, int, float]:
    """Send one inference request to *url* and report its timing.

    POSTs ``{"return_boxes": True}`` as JSON with a 30-second timeout and
    reads the first entry of the ``"results"`` list in the JSON response.

    Returns:
        A ``(latency_ms, batch_size, infer_ms)`` tuple: the measured
        wall-clock time of the POST in milliseconds, the entry's
        ``"batch_size"`` as an int (0 when absent), and the entry's
        ``"infer_us"`` divided by 1000 (0.0 when absent).

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    began = time.perf_counter()
    response = requests.post(url, json={"return_boxes": True}, timeout=30)
    response.raise_for_status()
    # The clock is stopped before the body is decoded, so JSON parsing time
    # is not counted as request latency.
    latency_ms = 1000.0 * (time.perf_counter() - began)
    first = response.json()["results"][0]
    batch_size = int(first.get("batch_size", 0))
    infer_ms = float(first.get("infer_us", 0.0)) / 1000.0
    return latency_ms, batch_size, infer_ms


def _nearest_rank(ordered: list[float], percent: int) -> float:
    """Return the *percent*-th percentile of the ascending list *ordered*.

    Uses the nearest-rank definition: the smallest sample such that at least
    *percent* percent of all samples are less than or equal to it.
    """
    # Integer ceiling division keeps the rank exact; flooring here would
    # under-report the tail whenever len * percent is not a multiple of 100.
    rank = -(-len(ordered) * percent // 100)
    return ordered[max(rank, 1) - 1]


def run(url: str, concurrency: int, requests_count: int, warmup: int) -> dict:
    """Benchmark *url* with concurrent requests and summarise the results.

    First issues *warmup* sequential calls whose results are discarded, then
    submits *requests_count* calls to a thread pool with *concurrency*
    workers and collects each call's latency, reported batch size and
    reported inference time.

    Returns:
        A dict with the run parameters (``server``, ``concurrency``,
        ``requests``, ``pages``), the wall-clock ``elapsed_s`` of the
        measured phase, the throughput ``pages_per_s``, latency statistics
        in milliseconds (``p50_ms``, ``p95_ms``, ``p99_ms``, ``min_ms``,
        ``max_ms``) and the means ``avg_observed_batch`` and
        ``avg_engine_infer_ms_per_batch``.

    Raises:
        ValueError: If *requests_count* is less than 1, since no statistics
            can be computed from zero samples.
        Exception: Whatever an individual call raises is re-raised when its
            result is collected.
    """
    # Fail before any traffic is sent rather than after the warmup with an
    # opaque empty-data error from the statistics code.
    if requests_count < 1:
        raise ValueError(f"requests_count must be at least 1, got {requests_count}")

    for _ in range(warmup):
        call(url)

    latencies = []
    batch_sizes = []
    infer_ms = []
    start = time.perf_counter()
    with cf.ThreadPoolExecutor(max_workers=concurrency) as ex:
        futs = [ex.submit(call, url) for _ in range(requests_count)]
        for fut in cf.as_completed(futs):
            lat, batch_size, infer = fut.result()
            latencies.append(lat)
            batch_sizes.append(batch_size)
            infer_ms.append(infer)
    elapsed = time.perf_counter() - start

    # Percentile, min and max lookups below all rely on ascending order.
    latencies.sort()
    return {
        "server": "rust_dynamic_batcher",
        "concurrency": concurrency,
        "requests": requests_count,
        "pages": requests_count,
        "elapsed_s": elapsed,
        "pages_per_s": requests_count / elapsed,
        "p50_ms": statistics.median(latencies),
        "p95_ms": _nearest_rank(latencies, 95),
        "p99_ms": _nearest_rank(latencies, 99),
        "min_ms": latencies[0],
        "max_ms": latencies[-1],
        "avg_observed_batch": sum(batch_sizes) / len(batch_sizes),
        "avg_engine_infer_ms_per_batch": sum(infer_ms) / len(infer_ms),
    }


def main() -> None:
    """Parse command-line options, run the benchmark and report the summary.

    The summary is printed to stdout as indented JSON. When ``--output`` is
    given, the same JSON followed by a newline is also written to that path
    as UTF-8, creating missing parent directories first.
    """
    parser = argparse.ArgumentParser()
    # Registered in this order so the generated help lists them the same way.
    options = (
        ("--url", {"default": "http://localhost:18082/v1/infer"}),
        ("--concurrency", {"type": int, "required": True}),
        ("--requests", {"type": int, "default": 200}),
        ("--warmup", {"type": int, "default": 10}),
        ("--output", {"type": Path}),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    opts = parser.parse_args()

    summary = run(opts.url, opts.concurrency, opts.requests, opts.warmup)
    rendered = json.dumps(summary, indent=2)
    print(rendered)

    if not opts.output:
        return
    opts.output.parent.mkdir(parents=True, exist_ok=True)
    opts.output.write_text(rendered + "\n", encoding="utf-8")


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()