Spaces:

build-small-hackathon
/

tiny-army

Running

App Files Community

polats committed 6 days ago

Commit

f85d7c3

1 Parent(s): 1df0cfb

Persona: load model under its own lock + prewarm in background (cold-start download no longer makes requests 'busy')

Browse files

Files changed (2) hide show

app.py +1 -0
llm.py +39 -12

app.py CHANGED Viewed

@@ -290,6 +290,7 @@ app = gr.mount_gradio_app(fastapi_app, demo, path="/", head=HEAD, theme=gr.theme
 if __name__ == "__main__":
     # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
     # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
     uvicorn.run(app, host="0.0.0.0", port=7860,

 if __name__ == "__main__":
+    llm.prewarm()  # load the GGUF in the background so the first request is warm
     # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
     # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
     uvicorn.run(app, host="0.0.0.0", port=7860,

llm.py CHANGED Viewed

@@ -36,7 +36,8 @@ MODEL_ID = (
     or HF_REPO.split("/")[-1]
 )
-_lock = threading.Lock()
 _llm = None
 _load_error = None
@@ -50,22 +51,44 @@ def model_id():
 def _get_local():
     global _llm, _load_error
     if _llm is not None:
         return _llm
     if _load_error is not None:
         raise LlmUnavailable(_load_error)
-    try:
-        from llama_cpp import Llama
-        common = dict(n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
-        if MODEL_PATH:
-            _llm = Llama(model_path=MODEL_PATH, **common)
-        else:  # pulls + caches the GGUF from Hugging Face on first use
-            _llm = Llama.from_pretrained(repo_id=HF_REPO, filename=HF_FILE, **common)
-        return _llm
-    except Exception as e:  # import / download / OOM / bad file
-        _load_error = f"{type(e).__name__}: {e}"
-        raise LlmUnavailable(_load_error)
 def _stream_external(system, user, max_tokens, temperature):
@@ -113,6 +136,10 @@ def stream_chat(system, user, max_tokens=400, temperature=0.8, should_stop=None)
     one CPU model never decodes two requests at once. `should_stop()` is polled each
     chunk so an abandoned request (client gone) stops promptly and frees the lock.
     Raises LlmUnavailable if no backend is available or the model is busy."""
     if not _lock.acquire(timeout=2):
         raise LlmUnavailable("the model is busy with another request — try again in a moment")
     try:

     or HF_REPO.split("/")[-1]
 )
+_lock = threading.Lock()        # serializes GENERATION (one CPU model at a time)
+_load_lock = threading.Lock()   # serializes the one-time model LOAD (the slow download)
 _llm = None
 _load_error = None
 def _get_local():
+    """Load the GGUF once. Uses its OWN lock (not the generation lock) so the slow
+    first-time download doesn't make concurrent requests time out — they just wait
+    here until the model is ready."""
     global _llm, _load_error
     if _llm is not None:
         return _llm
     if _load_error is not None:
         raise LlmUnavailable(_load_error)
+    with _load_lock:
+        if _llm is not None:
+            return _llm
+        if _load_error is not None:
+            raise LlmUnavailable(_load_error)
+        try:
+            from llama_cpp import Llama
+            common = dict(n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
+            if MODEL_PATH:
+                _llm = Llama(model_path=MODEL_PATH, **common)
+            else:  # pulls + caches the GGUF from Hugging Face on first use
+                _llm = Llama.from_pretrained(repo_id=HF_REPO, filename=HF_FILE, **common)
+            return _llm
+        except Exception as e:  # import / download / OOM / bad file
+            _load_error = f"{type(e).__name__}: {e}"
+            raise LlmUnavailable(_load_error)
+def prewarm():
+    """Kick off the model load in the background so the app starts immediately and the
+    download happens before the first user request (best-effort)."""
+    if BASE_URL:
+        return  # external endpoint — nothing to load locally
+    def _bg():
+        try:
+            _get_local()
+        except Exception:
+            pass  # _load_error is recorded; callers fall back to the stub
+    threading.Thread(target=_bg, daemon=True).start()
 def _stream_external(system, user, max_tokens, temperature):
     one CPU model never decodes two requests at once. `should_stop()` is polled each
     chunk so an abandoned request (client gone) stops promptly and frees the lock.
     Raises LlmUnavailable if no backend is available or the model is busy."""
+    # Ensure the model is loaded FIRST (its own lock; the slow download must not count
+    # against the short generation-lock timeout below).
+    if not BASE_URL:
+        _get_local()
     if not _lock.acquire(timeout=2):
         raise LlmUnavailable("the model is busy with another request — try again in a moment")
     try: