doc-ingestion / scripts /perf_baseline.py
Vamshi Pokala
feat: add API orchestration and citation-aware RAG flow
0d03152
"""Lightweight local API load baseline using FastAPI TestClient."""
from __future__ import annotations
import statistics
import time
from concurrent.futures import ThreadPoolExecutor
from fastapi.testclient import TestClient
from src.api import main as api_main
from src.api.main import app
from src.core.rag_orchestrator import QueryResponse
def _fake_run(_req):
return QueryResponse(
query="q",
provider="ollama",
model="qwen2.5:7b",
answer="ok",
processing_time_ms=15.0,
)
def _single_call(client: TestClient) -> float:
t0 = time.perf_counter()
resp = client.post("/query", json={"query": "benchmark"})
resp.raise_for_status()
return (time.perf_counter() - t0) * 1000.0
def main() -> None:
api_main._orchestrator.run = _fake_run
client = TestClient(app)
total_requests = 200
concurrency = 20
samples = []
with ThreadPoolExecutor(max_workers=concurrency) as pool:
futures = [pool.submit(_single_call, client) for _ in range(total_requests)]
for f in futures:
samples.append(f.result())
p95 = statistics.quantiles(samples, n=100)[94]
avg = statistics.mean(samples)
print(f"requests={total_requests}")
print(f"concurrency={concurrency}")
print(f"latency_ms_avg={avg:.2f}")
print(f"latency_ms_p95={p95:.2f}")
if __name__ == "__main__":
main()