Spaces:
Running
Running
Add /persona/selftest to measure pure generation tok/s inside the Space
Browse files
app.py
CHANGED
|
@@ -228,6 +228,23 @@ def persona_status():
|
|
| 228 |
return llm.status()
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 232 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 233 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|
|
|
|
| 228 |
return llm.status()
|
| 229 |
|
| 230 |
|
| 231 |
+
@fastapi_app.get("/persona/selftest")
def persona_selftest():
    """Measure generation speed inside the Space (no proxy, no lock race).

    Streams one short, fixed prompt through ``llm.stream_chat`` and counts
    the items it yields, timing the whole loop from just before the call
    until the stream is exhausted.

    Returns:
        On success, a dict with ``tokens`` (number of streamed items),
        ``seconds`` (elapsed time, rounded to 2 places) and ``tok_per_sec``
        (``None`` when the elapsed time is zero), merged with the mapping
        returned by ``llm.status()``. The status mapping is unpacked last,
        so its keys win on any name collision.
        On failure, a dict with ``error`` (the exception message) plus the
        ``tokens`` and ``seconds`` accumulated before the failure.
    """
    import time

    # perf_counter() is monotonic and high-resolution. time.time() follows
    # the wall clock, which can be stepped (e.g. by NTP) mid-measurement and
    # would then skew or even negate the reported duration.
    t0 = time.perf_counter()
    n = 0
    try:
        # NOTE(review): each yielded item is counted as one token; this
        # presumably matches one token per streamed chunk -- confirm against
        # llm.stream_chat.
        for _ in llm.stream_chat("You are terse.", "Count from one to twenty.",
                                 max_tokens=24, temperature=0.1):
            n += 1
    except Exception as e:
        # Diagnostic endpoint: report the failure and the partial progress
        # in the response body instead of raising.
        return {"error": str(e), "tokens": n,
                "seconds": round(time.perf_counter() - t0, 2)}
    s = time.perf_counter() - t0
    return {"tokens": n, "seconds": round(s, 2),
            "tok_per_sec": round(n / s, 2) if s else None, **llm.status()}
|
| 246 |
+
|
| 247 |
+
|
| 248 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 249 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 250 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|