polats committed on
Commit
2f7e532
·
1 Parent(s): f85d7c3

Persona: cap n_threads=2 (cpu_count over-reports host cores in a container → llama.cpp thrashing/50x slowdown); smaller ctx; /persona/status diag; lower token cap

Browse files
Files changed (2) hide show
  1. app.py +6 -1
  2. llm.py +19 -2
app.py CHANGED
@@ -223,6 +223,11 @@ def _sse(event, data):
223
  return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
224
 
225
 
 
 
 
 
 
226
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
227
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
228
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
@@ -246,7 +251,7 @@ async def persona_generate_stream(request: Request):
246
  try:
247
  for chunk in llm.stream_chat(
248
  prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
249
- max_tokens=256, temperature=0.8, should_stop=stop.is_set,
250
  ):
251
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
252
  except Exception as e: # LlmUnavailable or runtime error
 
223
  return f"event: {event}\ndata: {_json.dumps(data)}\n\n"
224
 
225
 
226
+ @fastapi_app.get("/persona/status")
227
+ def persona_status():
228
+ return llm.status()
229
+
230
+
231
  # Persona generation, woid-protocol-compatible so web/personaStream.js consumes it
232
  # unchanged: emits `model` → `delta`* → `persona-done` → `done` (or `error`). The
233
  # blocking llama.cpp generator runs in a worker thread bridged to this async SSE
 
251
  try:
252
  for chunk in llm.stream_chat(
253
  prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
254
+ max_tokens=160, temperature=0.8, should_stop=stop.is_set,
255
  ):
256
  loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
257
  except Exception as e: # LlmUnavailable or runtime error
llm.py CHANGED
@@ -25,8 +25,11 @@ API_KEY = os.environ.get("TINY_LLM_API_KEY", "")
25
  MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
26
  HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
27
  HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
28
- N_CTX = int(os.environ.get("TINY_LLM_N_CTX", "4096"))
29
- N_THREADS = int(os.environ.get("TINY_LLM_N_THREADS", str(os.cpu_count() or 2)))
 
 
 
30
 
31
  # A label for the `model` SSE event / UI — not used to route requests.
32
  MODEL_ID = (
@@ -50,6 +53,20 @@ def model_id():
50
  return MODEL_ID or "tiny-llm"
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def _get_local():
54
  """Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
55
  first-time download doesn't make concurrent requests time out — they just wait
 
25
  MODEL_PATH = os.environ.get("TINY_LLM_MODEL_PATH", "")
26
  HF_REPO = os.environ.get("TINY_LLM_HF_REPO", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
27
  HF_FILE = os.environ.get("TINY_LLM_HF_FILE", "*q4_k_m.gguf")
28
+ N_CTX = int(os.environ.get("TINY_LLM_N_CTX", "2048"))
29
+ # DEFAULT 2, not os.cpu_count(): in a container cpu_count() reports the HOST's cores
30
+ # (often 16-32), but an HF free Space only has ~2 vCPU. Over-subscribing threads makes
31
+ # llama.cpp thrash and run ~50x too slow. Override with TINY_LLM_N_THREADS on bigger HW.
32
+ N_THREADS = int(os.environ.get("TINY_LLM_N_THREADS") or 2)
33
 
34
  # A label for the `model` SSE event / UI — not used to route requests.
35
  MODEL_ID = (
 
53
  return MODEL_ID or "tiny-llm"
54
 
55
 
56
+ def status():
57
+ """Diagnostics for the persona backend (model load state + thread/CPU info)."""
58
+ return {
59
+ "mode": "external" if BASE_URL else "in-space",
60
+ "model": model_id(),
61
+ "loaded": _llm is not None,
62
+ "load_error": _load_error,
63
+ "n_threads": N_THREADS,
64
+ "n_ctx": N_CTX,
65
+ "cpu_count": os.cpu_count(),
66
+ "base_url": BASE_URL or None,
67
+ }
68
+
69
+
70
  def _get_local():
71
  """Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
72
  first-time download doesn't make concurrent requests time out — they just wait