Spaces:
Running
Running
Persona: cap n_threads=2 (cpu_count over-reports host cores in a container → llama.cpp thrashing/50x slowdown); smaller ctx; /persona/status diag; lower token cap
Browse files
app.py
CHANGED
|
@@ -223,6 +223,11 @@ def _sse(event, data):
|
|
| 223 |
return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
|
| 224 |
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 227 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 228 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|
|
@@ -246,7 +251,7 @@ async def persona_generate_stream(request: Request):
|
|
| 246 |
try:
|
| 247 |
for chunk in llm.stream_chat(
|
| 248 |
prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
|
| 249 |
-
max_tokens=
|
| 250 |
):
|
| 251 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
| 252 |
except Exception as e: # LlmUnavailable or runtime error
|
|
|
|
| 223 |
return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
|
| 224 |
|
| 225 |
|
| 226 |
+
@fastapi_app.get("/persona/status")
def persona_status():
    """Serve the persona backend diagnostics produced by ``llm.status()``."""
    diagnostics = llm.status()
    return diagnostics
|
| 229 |
+
|
| 230 |
+
|
| 231 |
# Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
|
| 232 |
# unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
|
| 233 |
# blocking llama.cpp generator runs in a worker thread bridged to this async SSE
|
|
|
|
| 251 |
try:
|
| 252 |
for chunk in llm.stream_chat(
|
| 253 |
prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
|
| 254 |
+
max_tokens=160, temperature=0.8, should_stop=stop.is_set,
|
| 255 |
):
|
| 256 |
loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
|
| 257 |
except Exception as e: # LlmUnavailable or runtime error
|
llm.py
CHANGED
|
@@ -25,8 +25,11 @@ API_KEY = os.environ.get("TINY_LLM_API_KEY", "")
|
|
| 25 |
MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
|
| 26 |
HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
|
| 27 |
HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
|
| 28 |
-
N_CTX = int(os.environ.get("TINY_LLM_N_CTX", "
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# A label for the `model` SSE event / UI — not used to route requests.
|
| 32 |
MODEL_ID = (
|
|
@@ -50,6 +53,20 @@ def model_id():
|
|
| 50 |
return MODEL_ID or "tiny-llm"
|
| 51 |
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
def _get_local():
|
| 54 |
"""Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
|
| 55 |
first-time download doesn't make concurrent requests time out — they just wait
|
|
|
|
| 25 |
MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
|
| 26 |
HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
|
| 27 |
HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
|
| 28 |
+
# Context window size. `or` (rather than a .get() default) so that a variable that
# is set but empty falls back to the default instead of crashing int("") at import,
# matching how N_THREADS is parsed below.
N_CTX = int(os.environ.get("TINY_LLM_N_CTX") or 2048)
# DEFAULT 2, not os.cpu_count(): in a container cpu_count() reports the HOST's cores
# (often 16-32), but an HF free Space only has ~2 vCPU. Over-subscribing threads makes
# llama.cpp thrash and run ~50x too slow. Override with TINY_LLM_N_THREADS on bigger HW.
N_THREADS = int(os.environ.get("TINY_LLM_N_THREADS") or 2)
|
| 33 |
|
| 34 |
# A label for the `model` SSE event / UI — not used to route requests.
|
| 35 |
MODEL_ID = (
|
|
|
|
| 53 |
return MODEL_ID or "tiny-llm"
|
| 54 |
|
| 55 |
|
| 56 |
+
def status():
    """Return a diagnostic snapshot of the persona backend.

    The dict reports the serving mode ("external" when BASE_URL is set, otherwise
    "in-space"), the model label, whether the local model object has been loaded,
    the recorded load error, the configured thread count and context size, the
    CPU count as seen by ``os.cpu_count()``, and the base URL (None when unset).
    """
    mode = "external" if BASE_URL else "in-space"
    return {
        "mode": mode,
        "model": model_id(),
        "loaded": _llm is not None,
        "load_error": _load_error,
        "n_threads": N_THREADS,
        "n_ctx": N_CTX,
        # Reported next to n_threads so the two can be compared when diagnosing.
        "cpu_count": os.cpu_count(),
        "base_url": BASE_URL or None,
    }
|
| 68 |
+
|
| 69 |
+
|
| 70 |
def _get_local():
|
| 71 |
"""Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
|
| 72 |
first-time download doesn't make concurrent requests time out — they just wait
|