NOT-OMEGA commited on
Commit
a1175b8
Β·
verified Β·
1 Parent(s): 8cbe83e

Update NanoMind/load_test.py

Browse files
Files changed (1) hide show
  1. NanoMind/load_test.py +488 -180
NanoMind/load_test.py CHANGED
@@ -1,101 +1,140 @@
1
  """
2
- load_test.py β€” NanoMind Concurrency & Latency Benchmark
3
- ========================================================
4
- Measures what actually matters for a serving system:
5
- - TTFT (time-to-first-token) under load
6
- - End-to-end latency distribution (P50/P95/P99)
7
- - Queue wait time (time from request send to first token)
8
- - Throughput (requests/sec, tokens/sec)
9
- - Rejection rate under overload
 
 
 
 
 
 
 
 
10
 
11
  Usage:
12
- # Basic test β€” 8 concurrent, 32 total
13
- python load_test.py
14
 
15
- # Stress test
16
- python load_test.py --concurrency 16 --requests 64
17
 
18
- # Test against HF Space
19
- python load_test.py --api https://your-space.hf.space --concurrency 4
20
 
21
- # Install deps first:
22
- # pip install aiohttp
 
 
 
 
 
 
23
  """
24
 
25
  import argparse
26
  import asyncio
27
  import json
28
  import random
29
- import statistics
30
  import sys
31
  import time
 
32
  from dataclasses import dataclass, field
33
  from typing import Optional
34
 
35
  try:
36
  import aiohttp
37
  except ImportError:
38
- print("Missing dep: pip install aiohttp")
39
  sys.exit(1)
40
 
41
  # ─────────────────────────────────────────────────────────────
42
- # Test prompts β€” variety of lengths
43
  # ─────────────────────────────────────────────────────────────
44
- PROMPTS_SHORT = [
45
- "What is the capital of Japan?",
46
- "What is 2 + 2?",
47
- "Who wrote Romeo and Juliet?",
48
- "What is the boiling point of water?",
49
- "How many days in a week?",
50
- ]
51
- PROMPTS_MEDIUM = [
52
- "Explain machine learning in simple terms.",
53
- "What are 3 benefits of regular exercise?",
54
- "What is a neural network and how does it learn?",
55
- "Write a Python function that reverses a string.",
56
- "Describe the water cycle briefly.",
57
- ]
58
- PROMPTS_LONG = [
59
- (
60
- "Please explain the transformer architecture in natural language processing, "
61
- "covering attention mechanisms, positional encoding, and how BERT and GPT differ."
62
- ),
63
- (
64
- "Give me a detailed explanation of how the internet works, from DNS lookup "
65
- "to HTTP request handling to rendering in a browser."
66
- ),
67
- ]
68
- ALL_PROMPTS = PROMPTS_SHORT + PROMPTS_MEDIUM + PROMPTS_LONG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  # ─────────────────────────────────────────────────────────────
72
  # Result dataclass
73
  # ─────────────────────────────────────────────────────────────
 
74
  @dataclass
75
- class RequestResult:
76
- status: int
77
- prompt: str
78
- ttft_ms: Optional[float] = None
79
- e2e_ms: Optional[float] = None
80
- tps: Optional[float] = None
81
- queue_wait_ms: Optional[float] = None
82
- tokens_out: int = 0
83
- error: Optional[str] = None
 
84
 
85
 
86
  # ─────────────────────────────────────────────────────────────
87
- # Single async request
88
  # ─────────────────────────────────────────────────────────────
 
89
  async def do_request(
90
  session: aiohttp.ClientSession,
91
  api_url: str,
92
  prompt: str,
 
93
  max_new: int = 80,
94
  temperature: float = 0.7,
95
  top_k: int = 40,
96
- ) -> RequestResult:
 
 
 
97
  payload = {
98
  "message": prompt,
 
99
  "max_new_tokens": max_new,
100
  "temperature": temperature,
101
  "top_k": top_k,
@@ -103,23 +142,24 @@ async def do_request(
103
  t0 = time.perf_counter()
104
  ttft = None
105
  toks = 0
 
 
106
 
107
  try:
108
  async with session.post(
109
  f"{api_url}/chat",
110
  json=payload,
111
- timeout=aiohttp.ClientTimeout(total=60),
112
  ) as resp:
113
  if resp.status == 429:
114
- return RequestResult(status=429, prompt=prompt, error="rejected")
 
 
 
 
115
  if resp.status != 200:
116
- return RequestResult(
117
- status=resp.status, prompt=prompt,
118
- error=f"HTTP {resp.status}"
119
- )
120
-
121
- queue_wait = None
122
- final_tps = None
123
 
124
  async for raw_line in resp.content:
125
  line = raw_line.decode("utf-8", errors="replace").strip()
@@ -133,35 +173,42 @@ async def do_request(
133
  except json.JSONDecodeError:
134
  continue
135
 
136
- if chunk.get("type") == "token":
 
137
  if ttft is None:
138
  ttft = (time.perf_counter() - t0) * 1000
139
  toks += 1
140
-
141
- elif chunk.get("type") == "done":
142
- final_tps = chunk.get("tps")
143
- queue_wait = chunk.get("queue_wait_ms")
144
 
145
  e2e = (time.perf_counter() - t0) * 1000
146
- return RequestResult(
147
  status=200,
148
- prompt=prompt,
149
  ttft_ms=round(ttft or 0, 1),
150
  e2e_ms=round(e2e, 1),
151
- tps=final_tps,
152
- queue_wait_ms=queue_wait,
153
  tokens_out=toks,
 
154
  )
155
 
156
  except asyncio.TimeoutError:
157
- return RequestResult(status=408, prompt=prompt, error="timeout")
 
 
 
 
158
  except Exception as exc:
159
- return RequestResult(status=0, prompt=prompt, error=str(exc))
 
160
 
161
 
162
  # ─────────────────────────────────────────────────────────────
163
- # Percentile helper
164
  # ─────────────────────────────────────────────────────────────
 
165
  def pct(data: list, p: int) -> float:
166
  if not data:
167
  return 0.0
@@ -169,156 +216,417 @@ def pct(data: list, p: int) -> float:
169
  i = min(len(s) - 1, int(p / 100 * len(s)))
170
  return round(s[i], 1)
171
 
 
 
172
 
173
- # ─────────────────────────────────────────────────────────────
174
- # One concurrency level run
175
- # ─────────────────────────────────────────────────────────────
176
- async def run_at_concurrency(
177
- api_url: str,
178
- concurrency: int,
179
- total_requests: int,
180
- max_new: int,
181
- seed: int = 42,
182
- ) -> list[RequestResult]:
183
- random.seed(seed)
184
- prompts = [random.choice(ALL_PROMPTS) for _ in range(total_requests)]
185
- sem = asyncio.Semaphore(concurrency)
186
- results = []
187
-
188
- async def bounded(prompt):
189
- async with sem:
190
- async with aiohttp.ClientSession() as s:
191
- r = await do_request(s, api_url, prompt, max_new=max_new)
192
- results.append(r)
193
-
194
- tasks = [asyncio.create_task(bounded(p)) for p in prompts]
195
- await asyncio.gather(*tasks)
196
- return results
197
 
198
 
199
  # ─────────────────────────────────────────────────────────────
200
- # Print summary table
201
  # ─────────────────────────────────────────────────────────────
202
- def print_summary(results: list[RequestResult], concurrency: int, elapsed_s: float):
 
 
 
 
 
 
203
  ok = [r for r in results if r.status == 200]
204
  rej = [r for r in results if r.status == 429]
205
- err = [r for r in results if r.status not in (200, 429)]
206
-
207
- ttfts = [r.ttft_ms for r in ok if r.ttft_ms is not None]
208
- e2es = [r.e2e_ms for r in ok if r.e2e_ms is not None]
209
- tps_l = [r.tps for r in ok if r.tps is not None]
210
- qwts = [r.queue_wait_ms for r in ok if r.queue_wait_ms is not None]
 
 
211
  t_out = sum(r.tokens_out for r in ok)
212
 
213
- n = len(results)
214
- W = 57
 
 
 
 
 
 
 
 
 
215
  print()
216
  print("=" * W)
217
- print(f" NanoMind Load Test β€” concurrency={concurrency} requests={n}")
218
  print("=" * W)
219
- print(f" Wall time : {elapsed_s:.1f}s")
220
- print(f" QPS : {n / elapsed_s:.1f}")
221
- print(f" Tokens/s : {t_out / elapsed_s:.1f} ({t_out} total)")
 
 
 
 
222
  print()
223
- print(f" Success : {len(ok)}/{n} ({100*len(ok)/max(n,1):.0f}%)")
224
- print(f" Rejected (429) : {len(rej)}")
225
- print(f" Errors : {len(err)}")
226
- if err:
227
- for r in err[:3]:
228
- print(f" ↳ {r.error}")
229
- print()
230
- print(f" β”Œ{'─'*15}┬{'─'*8}┬{'─'*8}┬{'─'*8}┐")
231
- print(f" β”‚ {'Metric':<13} β”‚ {'p50':>6} β”‚ {'p95':>6} β”‚ {'p99':>6} β”‚")
232
- print(f" β”œ{'─'*15}β”Ό{'─'*8}β”Ό{'─'*8}β”Ό{'─'*8}─")
233
-
234
- def row(label, data, unit="ms"):
235
- p50 = f"{pct(data, 50):.0f}{unit}"
236
- p95 = f"{pct(data, 95):.0f}{unit}"
237
- p99 = f"{pct(data, 99):.0f}{unit}"
238
- print(f" β”‚ {label:<13} β”‚ {p50:>6} β”‚ {p95:>6} β”‚ {p99:>6} β”‚")
239
-
240
- row("TTFT", ttfts)
241
- row("E2E", e2es)
242
- row("Queue wait", qwts)
243
- row("TPS", tps_l, unit="")
244
- print(f" β””{'─'*15}β”΄{'─'*8}β”΄{'─'*8}β”΄{'─'*8}β”˜")
245
 
246
  # Verdict
247
  print()
 
 
 
248
  if not ok:
249
- print(" ⚠️ No successful requests β€” check server is running")
250
  else:
251
- p95_ttft = pct(ttfts, 95)
252
- if p95_ttft < 1000:
253
- print(f" βœ… TTFT p95 = {p95_ttft:.0f}ms (good, < 1s)")
254
- elif p95_ttft < 3000:
255
- print(f" ⚠️ TTFT p95 = {p95_ttft:.0f}ms (ok, but check queue depth)")
256
- else:
257
- print(f" ❌ TTFT p95 = {p95_ttft:.0f}ms (bad β€” request serialization or overload)")
258
-
259
- if len(rej) / max(n, 1) > 0.1:
260
- print(f" ⚠️ {100*len(rej)/n:.0f}% rejected β€” server saturated, raise MAX_INFLIGHT or add engines")
 
 
 
 
 
 
 
 
 
 
261
 
262
  print("=" * W)
263
- print()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
 
266
  # ─────────────────────────────────────────────────────────────
267
- # Sweep mode β€” test at 1x, 2x, 4x, 8x concurrency
268
  # ─────────────────────────────────────────────────────────────
269
- async def run_sweep(api_url: str, max_new: int):
270
- print("\nπŸ”¬ Concurrency sweep: 1 β†’ 2 β†’ 4 β†’ 8")
271
- for c in (1, 2, 4, 8):
272
- n = c * 4
273
- t0 = time.perf_counter()
274
- r = await run_at_concurrency(api_url, c, n, max_new)
275
- print_summary(r, c, time.perf_counter() - t0)
276
- await asyncio.sleep(1) # brief cool-down between levels
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
- # ─────────────────────────────────────────────────────────────
280
- # Health check
281
- # ─────────────────────────────────────────────────────────────
282
- async def check_health(api_url: str) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  try:
284
  async with aiohttp.ClientSession() as s:
285
- async with s.get(f"{api_url}/health", timeout=aiohttp.ClientTimeout(total=10)) as r:
286
- data = await r.json()
287
- ready = data.get("engines_ready", 0)
288
- print(f"[health] status={data.get('status')} engines_ready={ready} "
289
- f"inflight={data.get('inflight_requests', '?')}")
 
 
 
 
 
 
 
290
  return ready > 0
291
  except Exception as e:
292
- print(f"[health] Cannot reach {api_url}: {e}")
293
  return False
294
 
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  # ─────────────────────────────────────────────────────────────
297
  # Main
298
  # ─────────────────────────────────────────────────────────────
 
299
  async def main():
300
- parser = argparse.ArgumentParser(description="NanoMind Load Test")
301
- parser.add_argument("--api", default="http://localhost:7860")
302
- parser.add_argument("--concurrency", type=int, default=8)
303
- parser.add_argument("--requests", type=int, default=32)
304
- parser.add_argument("--max-new", type=int, default=80)
 
 
 
 
 
 
305
  parser.add_argument("--sweep", action="store_true",
306
- help="Run concurrency sweep (1/2/4/8 concurrent)")
 
 
 
 
 
 
307
  args = parser.parse_args()
308
 
309
- print(f"\nNanoMind Load Test β†’ {args.api}")
310
- if not await check_health(args.api):
311
- print("Server not ready β€” aborting")
 
 
 
 
 
 
 
 
312
  return
313
 
314
- if args.sweep:
 
 
 
315
  await run_sweep(args.api, args.max_new)
 
 
 
 
 
316
  else:
317
- t0 = time.perf_counter()
318
- results = await run_at_concurrency(
319
  args.api, args.concurrency, args.requests, args.max_new
320
  )
321
- print_summary(results, args.concurrency, time.perf_counter() - t0)
322
 
323
 
324
  if __name__ == "__main__":
 
1
  """
2
+ load_test.py β€” NanoMind Production Load Test v2.0
3
+ ==================================================
4
+ Google-level stress testing: latency distribution, throughput,
5
+ concurrency scaling, cache speedup, rejection behavior, and
6
+ stability under sustained load.
7
+
8
+ Metrics collected (resume-ready):
9
+ - TTFT p50 / p95 / p99 (time-to-first-token)
10
+ - E2E latency p50 / p95 / p99 (full response)
11
+ - Queue wait p50 / p95 / p99 (scheduler overhead)
12
+ - tok/s p50 / p95 / p99 (throughput per request)
13
+ - Total system tok/s (aggregate throughput)
14
+ - QPS (requests per second)
15
+ - 429 rejection rate (backpressure behavior)
16
+ - KV cache speedup (warm vs cold TTFT ratio)
17
+ - Error rate (stability signal)
18
 
19
  Usage:
20
+ pip install aiohttp
 
21
 
22
+ # Full Google-level benchmark (recommended)
23
+ python load_test.py --api https://not-omega-nanomind.hf.space --full
24
 
25
+ # Quick concurrency sweep
26
+ python load_test.py --api https://not-omega-nanomind.hf.space --sweep
27
 
28
+ # Sustained load (stability test)
29
+ python load_test.py --api https://not-omega-nanomind.hf.space --sustained --duration 60
30
+
31
+ # Single concurrency level
32
+ python load_test.py --api https://not-omega-nanomind.hf.space --concurrency 8 --requests 40
33
+
34
+ # Local server
35
+ python load_test.py --full
36
  """
37
 
38
  import argparse
39
  import asyncio
40
  import json
41
  import random
 
42
  import sys
43
  import time
44
+ import uuid
45
  from dataclasses import dataclass, field
46
  from typing import Optional
47
 
48
  try:
49
  import aiohttp
50
  except ImportError:
51
+ print("❌ Missing: pip install aiohttp")
52
  sys.exit(1)
53
 
54
  # ─────────────────────────────────────────────────────────────
55
+ # Prompts β€” categorized by input token length
56
  # ─────────────────────────────────────────────────────────────
57
+
58
+ PROMPTS = {
59
+ "short": [
60
+ "What is the capital of Japan?",
61
+ "What is 2 + 2?",
62
+ "Who wrote Romeo and Juliet?",
63
+ "What is H2O?",
64
+ "How many days in a week?",
65
+ "What is the speed of light?",
66
+ "What planet is closest to the Sun?",
67
+ "What is the chemical symbol for gold?",
68
+ ],
69
+ "medium": [
70
+ "Explain machine learning in simple terms.",
71
+ "What are 3 benefits of regular exercise?",
72
+ "Write a Python function that reverses a string.",
73
+ "Describe the water cycle briefly.",
74
+ "What is a neural network and how does it learn?",
75
+ "Explain what an API is and give an example.",
76
+ "What is the difference between RAM and storage?",
77
+ "How does HTTPS encryption work?",
78
+ ],
79
+ "long": [
80
+ (
81
+ "Explain the transformer architecture in NLP, covering attention "
82
+ "mechanisms, positional encoding, and how BERT differs from GPT."
83
+ ),
84
+ (
85
+ "Give a detailed explanation of how the internet works, from DNS "
86
+ "resolution to HTTP request handling to rendering in a browser."
87
+ ),
88
+ (
89
+ "Describe the key differences between supervised, unsupervised, and "
90
+ "reinforcement learning with concrete examples for each."
91
+ ),
92
+ (
93
+ "Explain continuous batching in LLM inference systems, why it matters "
94
+ "for throughput, and what tradeoffs it introduces on CPU vs GPU."
95
+ ),
96
+ ],
97
+ }
98
+
99
+ ALL_PROMPTS = PROMPTS["short"] + PROMPTS["medium"] + PROMPTS["long"]
100
 
101
 
102
  # ─────────────────────────────────────────────────────────────
103
  # Result dataclass
104
  # ─────────────────────────────────────────────────────────────
105
+
106
  @dataclass
107
+ class Result:
108
+ status: int
109
+ prompt_len: str = "medium"
110
+ ttft_ms: float = 0.0
111
+ e2e_ms: float = 0.0
112
+ tps: float = 0.0
113
+ queue_wait_ms: float = 0.0
114
+ tokens_out: int = 0
115
+ error: str = ""
116
+ session_id: str = ""
117
 
118
 
119
  # ─────────────────────────────────────────────────────────────
120
+ # Core request function
121
  # ─────────────────────────────────────────────────────────────
122
+
123
  async def do_request(
124
  session: aiohttp.ClientSession,
125
  api_url: str,
126
  prompt: str,
127
+ prompt_len: str = "medium",
128
  max_new: int = 80,
129
  temperature: float = 0.7,
130
  top_k: int = 40,
131
+ session_id: Optional[str] = None,
132
+ timeout: float = 90.0,
133
+ ) -> Result:
134
+ sid = session_id or str(uuid.uuid4())
135
  payload = {
136
  "message": prompt,
137
+ "session_id": sid,
138
  "max_new_tokens": max_new,
139
  "temperature": temperature,
140
  "top_k": top_k,
 
142
  t0 = time.perf_counter()
143
  ttft = None
144
  toks = 0
145
+ queue_wait = 0.0
146
+ final_tps = 0.0
147
 
148
  try:
149
  async with session.post(
150
  f"{api_url}/chat",
151
  json=payload,
152
+ timeout=aiohttp.ClientTimeout(total=timeout),
153
  ) as resp:
154
  if resp.status == 429:
155
+ return Result(status=429, prompt_len=prompt_len,
156
+ error="rejected_429", session_id=sid)
157
+ if resp.status == 503:
158
+ return Result(status=503, prompt_len=prompt_len,
159
+ error="engine_loading", session_id=sid)
160
  if resp.status != 200:
161
+ return Result(status=resp.status, prompt_len=prompt_len,
162
+ error=f"http_{resp.status}", session_id=sid)
 
 
 
 
 
163
 
164
  async for raw_line in resp.content:
165
  line = raw_line.decode("utf-8", errors="replace").strip()
 
173
  except json.JSONDecodeError:
174
  continue
175
 
176
+ ctype = chunk.get("type")
177
+ if ctype == "token":
178
  if ttft is None:
179
  ttft = (time.perf_counter() - t0) * 1000
180
  toks += 1
181
+ elif ctype == "done":
182
+ final_tps = chunk.get("tps", 0.0)
183
+ queue_wait = chunk.get("queue_wait_ms", 0.0)
 
184
 
185
  e2e = (time.perf_counter() - t0) * 1000
186
+ return Result(
187
  status=200,
188
+ prompt_len=prompt_len,
189
  ttft_ms=round(ttft or 0, 1),
190
  e2e_ms=round(e2e, 1),
191
+ tps=round(final_tps, 1),
192
+ queue_wait_ms=round(queue_wait, 1),
193
  tokens_out=toks,
194
+ session_id=sid,
195
  )
196
 
197
  except asyncio.TimeoutError:
198
+ return Result(status=408, prompt_len=prompt_len,
199
+ error="timeout", session_id=sid)
200
+ except aiohttp.ClientConnectorError:
201
+ return Result(status=0, prompt_len=prompt_len,
202
+ error="connection_refused", session_id=sid)
203
  except Exception as exc:
204
+ return Result(status=0, prompt_len=prompt_len,
205
+ error=str(exc)[:80], session_id=sid)
206
 
207
 
208
  # ─────────────────────────────────────────────────────────────
209
+ # Statistics helpers
210
  # ─────────────────────────────────────────────────────────────
211
+
212
  def pct(data: list, p: int) -> float:
213
  if not data:
214
  return 0.0
 
216
  i = min(len(s) - 1, int(p / 100 * len(s)))
217
  return round(s[i], 1)
218
 
219
+ def mean(data: list) -> float:
220
+ return round(sum(data) / len(data), 1) if data else 0.0
221
 
222
+ def pct_row(label: str, data: list, unit: str = "ms") -> str:
223
+ if not data:
224
+ return f" β”‚ {label:<16} β”‚ {'β€”':>7} β”‚ {'β€”':>7} β”‚ {'β€”':>7} β”‚ {'β€”':>7} β”‚"
225
+ return (
226
+ f" β”‚ {label:<16} β”‚ {f'{pct(data,50):.0f}{unit}':>7} β”‚"
227
+ f" {f'{pct(data,95):.0f}{unit}':>7} β”‚"
228
+ f" {f'{pct(data,99):.0f}{unit}':>7} β”‚"
229
+ f" {f'{mean(data):.0f}{unit}':>7} β”‚"
230
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
 
233
  # ─────────────────────────────────────────────────────────────
234
+ # Print summary
235
  # ─────────────────────────────────────────────────────────────
236
+
237
+ def print_summary(
238
+ results: list,
239
+ concurrency: int,
240
+ elapsed_s: float,
241
+ label: str = "",
242
+ ) -> dict:
243
  ok = [r for r in results if r.status == 200]
244
  rej = [r for r in results if r.status == 429]
245
+ tmo = [r for r in results if r.status == 408]
246
+ err = [r for r in results if r.status not in (200, 429, 408)]
247
+ n = len(results)
248
+
249
+ ttfts = [r.ttft_ms for r in ok if r.ttft_ms > 0]
250
+ e2es = [r.e2e_ms for r in ok if r.e2e_ms > 0]
251
+ tps_l = [r.tps for r in ok if r.tps > 0]
252
+ qwts = [r.queue_wait_ms for r in ok if r.queue_wait_ms > 0]
253
  t_out = sum(r.tokens_out for r in ok)
254
 
255
+ qps = round(n / max(elapsed_s, 0.1), 2)
256
+ sys_tps = round(t_out / max(elapsed_s, 0.1), 1)
257
+ success_rt = round(100 * len(ok) / max(n, 1), 1)
258
+ rej_rt = round(100 * len(rej) / max(n, 1), 1)
259
+ err_rt = round(100 * (len(tmo) + len(err)) / max(n, 1), 1)
260
+
261
+ W = 65
262
+ hdr = f" concurrency={concurrency} requests={n}"
263
+ if label:
264
+ hdr += f" [{label}]"
265
+
266
  print()
267
  print("=" * W)
268
+ print(f" NanoMind Load Test{hdr}")
269
  print("=" * W)
270
+ print(f" Wall time : {elapsed_s:.1f}s")
271
+ print(f" QPS : {qps}")
272
+ print(f" System tok/s : {sys_tps} ({t_out} tokens total)")
273
+ print(f" Success : {len(ok)}/{n} ({success_rt}%)")
274
+ print(f" Rejected (429) : {len(rej)} ({rej_rt}%)")
275
+ print(f" Timeouts : {len(tmo)}")
276
+ print(f" Other errors : {len(err)}")
277
  print()
278
+ print(f" β”Œ{'─'*18}┬{'─'*9}┬{'─'*9}┬{'─'*9}┬{'─'*9}┐")
279
+ print(f" β”‚ {'Metric':<16} β”‚ {'p50':>7} β”‚ {'p95':>7} β”‚ {'p99':>7} β”‚ {'avg':>7} β”‚")
280
+ print(f" β”œ{'─'*18}β”Ό{'─'*9}β”Ό{'─'*9}β”Ό{'─'*9}β”Ό{'─'*9}─")
281
+ print(pct_row("TTFT", ttfts))
282
+ print(pct_row("E2E latency", e2es))
283
+ print(pct_row("Queue wait", qwts))
284
+ print(pct_row("TPS / req", tps_l, unit=""))
285
+ print(f" β””{'─'*18}β”΄{'─'*9}β”΄{'─'*9}β”΄{'─'*9}β”΄{'─'*9}β”˜")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  # Verdict
288
  print()
289
+ p95_ttft = pct(ttfts, 95)
290
+ p95_e2e = pct(e2es, 95)
291
+ avg_tps = mean(tps_l)
292
  if not ok:
293
+ print(" ⚠️ No successful requests β€” check server is running.")
294
  else:
295
+ verdict = []
296
+ verdict.append(
297
+ f" {'βœ…' if p95_ttft < 1500 else '⚠️ ' if p95_ttft < 4000 else '❌'}"
298
+ f" TTFT p95 = {p95_ttft:.0f}ms"
299
+ + (" (good)" if p95_ttft < 1500 else
300
+ " (ok)" if p95_ttft < 4000 else
301
+ " (bad β€” serialization or overload)")
302
+ )
303
+ verdict.append(
304
+ f" {'βœ…' if avg_tps > 25 else '⚠️ ' if avg_tps > 15 else '❌'}"
305
+ f" Avg TPS = {avg_tps:.0f} tok/s"
306
+ )
307
+ if rej_rt > 20:
308
+ verdict.append(f" ⚠️ {rej_rt:.0f}% rejected β€” raise MAX_INFLIGHT or add engines")
309
+ elif rej_rt > 0:
310
+ verdict.append(f" ℹ️ {rej_rt:.0f}% rejected β€” backpressure working correctly")
311
+ if err_rt > 5:
312
+ verdict.append(f" ❌ {err_rt:.0f}% error rate β€” investigate stability")
313
+ for v in verdict:
314
+ print(v)
315
 
316
  print("=" * W)
317
+
318
+ return {
319
+ "concurrency": concurrency,
320
+ "n": n,
321
+ "ok": len(ok),
322
+ "qps": qps,
323
+ "sys_tps": sys_tps,
324
+ "ttft_p50": pct(ttfts, 50),
325
+ "ttft_p95": pct(ttfts, 95),
326
+ "ttft_p99": pct(ttfts, 99),
327
+ "e2e_p50": pct(e2es, 50),
328
+ "e2e_p95": pct(e2es, 95),
329
+ "tps_avg": avg_tps,
330
+ "tps_p50": pct(tps_l, 50),
331
+ "rej_pct": rej_rt,
332
+ "success_pct": success_rt,
333
+ "t_total": t_out,
334
+ }
335
 
336
 
337
  # ─────────────────────────────────────────────────────────────
338
+ # Test runners
339
  # ─────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
340
 
341
+ async def run_batch(
342
+ api_url: str,
343
+ concurrency: int,
344
+ total_requests: int,
345
+ max_new: int = 80,
346
+ prompt_mix: str = "all",
347
+ reuse_session: bool = False,
348
+ seed: int = 42,
349
+ ) -> list:
350
+ """Run `total_requests` requests with `concurrency` in-flight at once."""
351
+ rng = random.Random(seed)
352
+ if prompt_mix == "short":
353
+ pool = PROMPTS["short"]
354
+ elif prompt_mix == "medium":
355
+ pool = PROMPTS["medium"]
356
+ elif prompt_mix == "long":
357
+ pool = PROMPTS["long"]
358
+ else:
359
+ pool = ALL_PROMPTS
360
 
361
+ shared_sid = str(uuid.uuid4()) if reuse_session else None
362
+ prompts = [(rng.choice(pool), prompt_mix) for _ in range(total_requests)]
363
+ results = []
364
+ sem = asyncio.Semaphore(concurrency)
365
+
366
+ async def bounded(prompt: str, cat: str):
367
+ async with sem:
368
+ async with aiohttp.ClientSession() as s:
369
+ r = await do_request(
370
+ s, api_url, prompt, cat, max_new,
371
+ session_id=shared_sid,
372
+ )
373
+ results.append(r)
374
+
375
+ await asyncio.gather(*[asyncio.create_task(bounded(p, c)) for p, c in prompts])
376
+ return results
377
+
378
+
379
+ async def run_kv_cache_test(api_url: str) -> dict:
380
+ """
381
+ Measure KV cache speedup:
382
+ Turn 1 (cold): full system prompt + first message
383
+ Turn 2 (warm): same session, follow-up question
384
+ Returns speedup ratio.
385
+ """
386
+ print("\n πŸ“Š KV Cache Speedup Test...")
387
+ sid = str(uuid.uuid4())
388
+
389
+ cold_times = []
390
+ warm_times = []
391
+
392
+ for _ in range(5):
393
+ sid = str(uuid.uuid4())
394
+ async with aiohttp.ClientSession() as s:
395
+ # Cold
396
+ r = await do_request(
397
+ s, api_url, "What is machine learning?",
398
+ max_new=60, temperature=0.1, session_id=sid
399
+ )
400
+ if r.status == 200:
401
+ cold_times.append(r.ttft_ms)
402
+
403
+ # Warm (same session, follow-up)
404
+ r2 = await do_request(
405
+ s, api_url, "Give me one more example.",
406
+ max_new=60, temperature=0.1, session_id=sid
407
+ )
408
+ if r2.status == 200:
409
+ warm_times.append(r2.ttft_ms)
410
+
411
+ cold_avg = mean(cold_times)
412
+ warm_avg = mean(warm_times)
413
+ speedup = round(cold_avg / warm_avg, 2) if warm_avg > 0 else 0
414
+
415
+ print(f" Cold TTFT avg : {cold_avg:.0f}ms (n={len(cold_times)})")
416
+ print(f" Warm TTFT avg : {warm_avg:.0f}ms (n={len(warm_times)})")
417
+ print(f" KV speedup : {speedup}x {'βœ…' if speedup > 1.2 else 'β€”'}")
418
+
419
+ return {"cold_ttft_ms": cold_avg, "warm_ttft_ms": warm_avg, "speedup_x": speedup}
420
+
421
+
422
+ async def run_health_check(api_url: str) -> bool:
423
  try:
424
  async with aiohttp.ClientSession() as s:
425
+ async with s.get(
426
+ f"{api_url}/health",
427
+ timeout=aiohttp.ClientTimeout(total=15)
428
+ ) as r:
429
+ d = await r.json()
430
+ ready = d.get("engines_ready", 0)
431
+ print(f" status : {d.get('status', '?')}")
432
+ print(f" engines_ready : {ready}")
433
+ print(f" active_sessions: {d.get('active_sessions', '?')}")
434
+ print(f" inflight : {d.get('inflight_requests', '?')}")
435
+ print(f" ram_mb : {d.get('process_ram_mb', '?')}")
436
+ print(f" uptime_s : {d.get('uptime_seconds', '?')}")
437
  return ready > 0
438
  except Exception as e:
439
+ print(f" ❌ Cannot reach {api_url}: {e}")
440
  return False
441
 
442
 
443
+ async def run_sustained(api_url: str, duration_s: int, concurrency: int, max_new: int):
444
+ """
445
+ Fire requests continuously for `duration_s` seconds.
446
+ Measures stability: does TPS degrade? Do errors increase over time?
447
+ """
448
+ print(f"\n ⏱ Sustained load: {duration_s}s @ concurrency={concurrency}")
449
+ results = []
450
+ t_start = time.perf_counter()
451
+ rng = random.Random(99)
452
+ sem = asyncio.Semaphore(concurrency)
453
+ stop = asyncio.Event()
454
+
455
+ async def worker():
456
+ while not stop.is_set():
457
+ prompt = rng.choice(ALL_PROMPTS)
458
+ async with sem:
459
+ if stop.is_set():
460
+ return
461
+ async with aiohttp.ClientSession() as s:
462
+ r = await do_request(s, api_url, prompt, max_new=max_new)
463
+ results.append((time.perf_counter() - t_start, r))
464
+
465
+ tasks = [asyncio.create_task(worker()) for _ in range(concurrency * 2)]
466
+ await asyncio.sleep(duration_s)
467
+ stop.set()
468
+ await asyncio.gather(*tasks, return_exceptions=True)
469
+
470
+ elapsed = time.perf_counter() - t_start
471
+
472
+ # Split into 3 thirds to check for degradation
473
+ third = elapsed / 3
474
+ t1 = [r for ts, r in results if ts < third]
475
+ t2 = [r for ts, r in results if third <= ts < 2*third]
476
+ t3 = [r for ts, r in results if ts >= 2*third]
477
+
478
+ def ok_tps(batch):
479
+ ok = [r for r in batch if r.status == 200 and r.tps > 0]
480
+ return mean([r.tps for r in ok])
481
+
482
+ all_r = [r for _, r in results]
483
+ print(f" Total requests : {len(all_r)}")
484
+ print(f" T1 avg TPS : {ok_tps(t1):.1f} (first {duration_s//3}s)")
485
+ print(f" T2 avg TPS : {ok_tps(t2):.1f} (middle {duration_s//3}s)")
486
+ print(f" T3 avg TPS : {ok_tps(t3):.1f} (last {duration_s//3}s)")
487
+ degradation = "βœ… stable" if abs(ok_tps(t1) - ok_tps(t3)) < 5 else "⚠️ degrading"
488
+ print(f" Stability : {degradation}")
489
+
490
+ return all_r
491
+
492
+
493
+ # ─────────────────────────────────────────────────────────────
494
+ # Sweep
495
+ # ─────────────────────────────────────────────────────────────
496
+
497
+ async def run_sweep(api_url: str, max_new: int):
498
+ print("\nπŸ”¬ Concurrency Sweep: 1 β†’ 2 β†’ 4 β†’ 8 β†’ 16")
499
+ sweep_results = []
500
+ for c in (1, 2, 4, 8, 16):
501
+ n = max(c * 4, 8)
502
+ t0 = time.perf_counter()
503
+ r = await run_batch(api_url, c, n, max_new)
504
+ sr = print_summary(r, c, time.perf_counter() - t0, label=f"sweep c={c}")
505
+ sweep_results.append(sr)
506
+ await asyncio.sleep(2)
507
+
508
+ # Scaling table
509
+ print("\n πŸ“ˆ Scaling Summary (TTFT p95)")
510
+ print(f" {'Concurrency':>12} β”‚ {'TTFT p95':>9} β”‚ {'TPS avg':>8} β”‚ {'QPS':>6} β”‚ {'SysTPS':>7}")
511
+ print(f" {'─'*12}─┼─{'─'*9}─┼─{'─'*8}─┼─{'─'*6}─┼─{'─'*7}")
512
+ for sr in sweep_results:
513
+ print(
514
+ f" {sr['concurrency']:>12} β”‚ {sr['ttft_p95']:>8.0f}ms β”‚"
515
+ f" {sr['tps_avg']:>7.1f} β”‚ {sr['qps']:>6.1f} β”‚ {sr['sys_tps']:>7.1f}"
516
+ )
517
+
518
+
519
+ # ─────────────────────────────────────────────────────────────
520
+ # Full Google-level benchmark
521
+ # ─────────────────────────────────────────────────────────────
522
+
523
+ async def run_full(api_url: str, max_new: int):
524
+ """
525
+ Full benchmark suite β€” produces all numbers needed for resume.
526
+ """
527
+ all_stats = {}
528
+
529
+ # 1. Baseline single request
530
+ print("\n[1/6] Baseline β€” Single request (warmup)")
531
+ r = await run_batch(api_url, 1, 5, max_new, seed=1)
532
+ print_summary(r, 1, sum(x.e2e_ms for x in r if x.status==200)/1000, label="baseline")
533
+
534
+ # 2. Concurrency scaling
535
+ print("\n[2/6] Concurrency scaling sweep")
536
+ await run_sweep(api_url, max_new)
537
+
538
+ # 3. KV cache speedup
539
+ print("\n[3/6] KV cache speedup")
540
+ kv = await run_kv_cache_test(api_url)
541
+ all_stats["kv_cache"] = kv
542
+
543
+ # 4. Input length sensitivity (short vs long prompts)
544
+ print("\n[4/6] Input length sensitivity")
545
+ for mix, label in [("short", "short prompts"), ("long", "long prompts")]:
546
+ t0 = time.perf_counter()
547
+ r = await run_batch(api_url, 4, 12, max_new, prompt_mix=mix)
548
+ print_summary(r, 4, time.perf_counter()-t0, label=label)
549
+
550
+ # 5. Backpressure test (hammer with 32 concurrent)
551
+ print("\n[5/6] Backpressure / overload test (concurrency=32)")
552
+ t0 = time.perf_counter()
553
+ r = await run_batch(api_url, 32, 48, max_new)
554
+ print_summary(r, 32, time.perf_counter()-t0, label="overload")
555
+
556
+ # 6. Sustained stability (30s)
557
+ print("\n[6/6] Sustained stability (30s)")
558
+ sr = await run_sustained(api_url, 30, 4, max_new)
559
+ t0 = 30.0
560
+ print_summary(sr, 4, t0, label="sustained-30s")
561
+
562
+ # Final resume numbers
563
+ print("\n" + "=" * 65)
564
+ print(" πŸ“‹ RESUME-READY NUMBERS")
565
+ print("=" * 65)
566
+ kv = all_stats.get("kv_cache", {})
567
+ print(f" KV cache speedup : {kv.get('speedup_x', '?')}x")
568
+ print(f" Cold TTFT : {kv.get('cold_ttft_ms', '?'):.0f}ms")
569
+ print(f" Warm TTFT : {kv.get('warm_ttft_ms', '?'):.0f}ms")
570
+ print()
571
+ print(" Run --sweep for full TTFT/TPS scaling table.")
572
+ print("=" * 65)
573
+
574
+
575
  # ─────────────────────────────────────────────────────────────
576
  # Main
577
  # ─────────────────────────────────────────────────────────────
578
+
579
  async def main():
580
+ parser = argparse.ArgumentParser(
581
+ description="NanoMind Load Test v2.0 β€” Google-level benchmark"
582
+ )
583
+ parser.add_argument("--api", default="http://localhost:7860",
584
+ help="API base URL")
585
+ parser.add_argument("--concurrency", type=int, default=8,
586
+ help="Concurrent requests")
587
+ parser.add_argument("--requests", type=int, default=32,
588
+ help="Total requests")
589
+ parser.add_argument("--max-new", type=int, default=80,
590
+ help="Max new tokens per request")
591
  parser.add_argument("--sweep", action="store_true",
592
+ help="Concurrency sweep 1β†’2β†’4β†’8β†’16")
593
+ parser.add_argument("--full", action="store_true",
594
+ help="Full Google-level benchmark suite")
595
+ parser.add_argument("--sustained", action="store_true",
596
+ help="Sustained load stability test")
597
+ parser.add_argument("--duration", type=int, default=60,
598
+ help="Sustained test duration in seconds")
599
  args = parser.parse_args()
600
 
601
+ print(f"\n{'='*65}")
602
+ print(f" NanoMind Load Test v2.0")
603
+ print(f" Target: {args.api}")
604
+ print(f"{'='*65}")
605
+
606
+ # Health check
607
+ print("\nπŸ₯ Health Check")
608
+ ready = await run_health_check(args.api)
609
+ if not ready:
610
+ print("\n❌ Server not ready. Aborting.")
611
+ print(" If using HF Spaces, wait 60s for engine startup.")
612
  return
613
 
614
+ if args.full:
615
+ await run_full(args.api, args.max_new)
616
+
617
+ elif args.sweep:
618
  await run_sweep(args.api, args.max_new)
619
+
620
+ elif args.sustained:
621
+ r = await run_sustained(args.api, args.duration, args.concurrency, args.max_new)
622
+ print_summary(r, args.concurrency, float(args.duration), label="sustained")
623
+
624
  else:
625
+ t0 = time.perf_counter()
626
+ r = await run_batch(
627
  args.api, args.concurrency, args.requests, args.max_new
628
  )
629
+ print_summary(r, args.concurrency, time.perf_counter()-t0)
630
 
631
 
632
  if __name__ == "__main__":