Spaces:

build-small-hackathon
/

tiny-army

Running

polats committed 5 days ago

Commit

2f7e532

1 Parent(s): f85d7c3

Persona: cap n_threads=2 (cpu_count over-reports host cores in a container → llama.cpp thrashing/50x slowdown); smaller ctx; /persona/status diag; lower token cap

Files changed (2) hide show

app.py CHANGED Viewed

@@ -223,6 +223,11 @@ def _sse(event, data):
     return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
 # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
 # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
 # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
@@ -246,7 +251,7 @@ async def persona_generate_stream(request: Request):
             try:
                 for chunk in llm.stream_chat(
                     prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
-                    max_tokens=256, temperature=0.8, should_stop=stop.is_set,
                 ):
                     loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
             except Exception as e:  # LlmUnavailable or runtime error

     return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
+@fastapi_app.get("/persona/status")
+def persona_status():
+    return llm.status()
 # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
 # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
 # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
             try:
                 for chunk in llm.stream_chat(
                     prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
+                    max_tokens=160, temperature=0.8, should_stop=stop.is_set,
                 ):
                     loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
             except Exception as e:  # LlmUnavailable or runtime error

llm.py CHANGED Viewed

@@ -25,8 +25,11 @@ API_KEY = os.environ.get("TINY_LLM_API_KEY", "")
 MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
 HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
 HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
-N_CTX = int(os.environ.get("TINY_LLM_N_CTX", "4096"))
-N_THREADS = int(os.environ.get("TINY_LLM_N_THREADS", str(os.cpu_count() or 2)))
 # A label for the `model` SSE event / UI — not used to route requests.
 MODEL_ID = (
@@ -50,6 +53,20 @@ def model_id():
     return MODEL_ID or "tiny-llm"
 def _get_local():
     """Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
     first-time download doesn't make concurrent requests time out — they just wait

 MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
 HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
 HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
+N_CTX = int(os.environ.get("TINY_LLM_N_CTX", "2048"))
+# DEFAULT 2, not os.cpu_count(): in a container cpu_count() reports the HOST's cores
+# (often 16-32), but an HF free Space only has ~2 vCPU. Over-subscribing threads makes
+# llama.cpp thrash and run ~50x too slow. Override with TINY_LLM_N_THREADS on bigger HW.
+N_THREADS = int(os.environ.get("TINY_LLM_N_THREADS") or 2)
 # A label for the `model` SSE event / UI — not used to route requests.
 MODEL_ID = (
     return MODEL_ID or "tiny-llm"
+def status():
+    """Diagnostics for the persona backend (model load state + thread/CPU info)."""
+    return {
+        "mode": "external" if BASE_URL else "in-space",
+        "model": model_id(),
+        "loaded": _llm is not None,
+        "load_error": _load_error,
+        "n_threads": N_THREADS,
+        "n_ctx": N_CTX,
+        "cpu_count": os.cpu_count(),
+        "base_url": BASE_URL or None,
+    }
 def _get_local():
     """Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
     first-time download doesn't make concurrent requests time out — they just wait