polats committed on
Commit
bd4a81a
·
1 Parent(s): 2f7e532

Add /persona/selftest to measure pure generation tok/s inside the Space

Browse files
Files changed (1) hide show
  1. app.py +17 -0
app.py CHANGED
@@ -228,6 +228,23 @@ def persona_status():
228
  return llm.status()
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
232
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
233
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
 
228
  return llm.status()
229
 
230
 
231
@fastapi_app.get("/persona/selftest")
def persona_selftest():
    """Measure generation speed inside the Space (no proxy, no lock race).

    Runs one short, fixed chat completion through ``llm.stream_chat`` and
    counts the items the stream yields. The timer is started before the
    call, so the reported time includes any work done before the first
    item arrives, not only the steady-state generation loop.

    Returns:
        On success, a dict with ``tokens`` (number of yielded items),
        ``seconds`` (elapsed time, rounded to 2 decimals) and
        ``tok_per_sec`` (``None`` when no measurable time elapsed), merged
        with the fields of ``llm.status()``.
        On failure, a dict with ``error`` (the exception message) plus the
        ``tokens`` and ``seconds`` accumulated up to the failure.
    """
    import time

    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock and can jump if the system clock is adjusted, which
    # would skew (or even negate) a short benchmark interval.
    started = time.perf_counter()
    n = 0
    try:
        # NOTE(review): assumes each yielded item corresponds to one
        # token -- confirm against llm.stream_chat.
        for _ in llm.stream_chat(
            "You are terse.",
            "Count from one to twenty.",
            max_tokens=24,
            temperature=0.1,
        ):
            n += 1
    except Exception as e:
        # Diagnostic endpoint: report the failure in the response body,
        # together with the partial measurements, instead of raising.
        return {
            "error": str(e),
            "tokens": n,
            "seconds": round(time.perf_counter() - started, 2),
        }
    elapsed = time.perf_counter() - started
    return {
        "tokens": n,
        "seconds": round(elapsed, 2),
        # Guard against a zero interval so the division cannot fail.
        "tok_per_sec": round(n / elapsed, 2) if elapsed > 0 else None,
        # Status fields are merged last, so a same-named key coming from
        # llm.status() would take precedence over the values above.
        **llm.status(),
    }
246
+
247
+
248
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
249
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
250
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE