Spaces:

build-small-hackathon
/

tiny-army

Running

polats committed 6 days ago

Commit

1df0cfb

1 Parent(s): 67f4321

Persona endpoint: stop generation on client disconnect, fail-fast lock, lower token cap (prevents abandoned-gen lock pile-up)

Files changed (2) hide show

app.py CHANGED Viewed

@@ -234,6 +234,8 @@ async def persona_generate_stream(request: Request):
     seed = body.get("seed", "")
     unit_class = body.get("class") or body.get("unitClass") or ""
     async def gen():
         yield _sse("model", {"model": llm.model_id()})
         loop = asyncio.get_running_loop()
@@ -244,7 +246,7 @@ async def persona_generate_stream(request: Request):
             try:
                 for chunk in llm.stream_chat(
                     prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
-                    max_tokens=400, temperature=0.8,
                 ):
                     loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
             except Exception as e:  # LlmUnavailable or runtime error
@@ -254,15 +256,18 @@ async def persona_generate_stream(request: Request):
         threading.Thread(target=worker, daemon=True).start()
         raw_parts = []
-        while True:
-            kind, val = await q.get()
-            if kind is DONE:
-                break
-            if kind == "error":
-                yield _sse("error", {"error": val})
-                return
-            raw_parts.append(val)
-            yield _sse("delta", {"content": val})
         try:
             p = persona_parse.parse_persona_json("".join(raw_parts))

     seed = body.get("seed", "")
     unit_class = body.get("class") or body.get("unitClass") or ""
+    stop = threading.Event()  # set when the client disconnects → worker stops, lock frees
     async def gen():
         yield _sse("model", {"model": llm.model_id()})
         loop = asyncio.get_running_loop()
             try:
                 for chunk in llm.stream_chat(
                     prompts.PERSONA_SYSTEM, prompts.persona_user_prompt(unit_class, seed),
+                    max_tokens=256, temperature=0.8, should_stop=stop.is_set,
                 ):
                     loop.call_soon_threadsafe(q.put_nowait, ("delta", chunk))
             except Exception as e:  # LlmUnavailable or runtime error
         threading.Thread(target=worker, daemon=True).start()
         raw_parts = []
+        try:
+            while True:
+                kind, val = await q.get()
+                if kind is DONE:
+                    break
+                if kind == "error":
+                    yield _sse("error", {"error": val})
+                    return
+                raw_parts.append(val)
+                yield _sse("delta", {"content": val})
+        finally:
+            stop.set()  # client gone or stream done → release the model
         try:
             p = persona_parse.parse_persona_json("".join(raw_parts))

llm.py CHANGED Viewed

@@ -108,9 +108,18 @@ def _stream_local(system, user, max_tokens, temperature):
             yield delta
-def stream_chat(system, user, max_tokens=400, temperature=0.8):
-    """Yield text chunks from the configured backend. Serialized by a module lock.
-    Raises LlmUnavailable if no backend is available."""
-    with _lock:
         gen = _stream_external if BASE_URL else _stream_local
-        yield from gen(system, user, max_tokens, temperature)

             yield delta
+def stream_chat(system, user, max_tokens=400, temperature=0.8, should_stop=None):
+    """Yield text chunks from the configured backend. Serialized by a module lock so
+    one CPU model never decodes two requests at once. `should_stop()` is polled each
+    chunk so an abandoned request (client gone) stops promptly and frees the lock.
+    Raises LlmUnavailable if no backend is available or the model is busy."""
+    if not _lock.acquire(timeout=2):
+        raise LlmUnavailable("the model is busy with another request — try again in a moment")
+    try:
         gen = _stream_external if BASE_URL else _stream_local
+        for chunk in gen(system, user, max_tokens, temperature):
+            if should_stop and should_stop():
+                break
+            yield chunk
+    finally:
+        _lock.release()