"""Lightweight local API load baseline using FastAPI TestClient.""" from __future__ import annotations import statistics import time from concurrent.futures import ThreadPoolExecutor from fastapi.testclient import TestClient from src.api import main as api_main from src.api.main import app from src.core.rag_orchestrator import QueryResponse def _fake_run(_req): return QueryResponse( query="q", provider="ollama", model="qwen2.5:7b", answer="ok", processing_time_ms=15.0, ) def _single_call(client: TestClient) -> float: t0 = time.perf_counter() resp = client.post("/query", json={"query": "benchmark"}) resp.raise_for_status() return (time.perf_counter() - t0) * 1000.0 def main() -> None: api_main._orchestrator.run = _fake_run client = TestClient(app) total_requests = 200 concurrency = 20 samples = [] with ThreadPoolExecutor(max_workers=concurrency) as pool: futures = [pool.submit(_single_call, client) for _ in range(total_requests)] for f in futures: samples.append(f.result()) p95 = statistics.quantiles(samples, n=100)[94] avg = statistics.mean(samples) print(f"requests={total_requests}") print(f"concurrency={concurrency}") print(f"latency_ms_avg={avg:.2f}") print(f"latency_ms_p95={p95:.2f}") if __name__ == "__main__": main()