Spaces:
Running
Running
Persona: load model under its own lock + prewarm in background (cold-start download no longer makes requests 'busy')
Browse files
app.py
CHANGED
|
@@ -290,6 +290,7 @@ app = gr.mount_gradio_app(fastapi_app, demo, path="/", head=HEAD, theme=gr.theme
|
|
| 290 |
|
| 291 |
|
| 292 |
if __name__ == "__main__":
|
|
|
|
| 293 |
# proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
|
| 294 |
# from HF's edge, so it generates https (not http) asset URLs behind the proxy.
|
| 295 |
uvicorn.run(app, host="0.0.0.0", port=7860,
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
if __name__ == "__main__":
|
| 293 |
+
llm.prewarm() # load the GGUF in the background so the first request is warm
|
| 294 |
# proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
|
| 295 |
# from HF's edge, so it generates https (not http) asset URLs behind the proxy.
|
| 296 |
uvicorn.run(app, host="0.0.0.0", port=7860,
|
llm.py
CHANGED
|
@@ -36,7 +36,8 @@ MODEL_ID = (
|
|
| 36 |
or HF_REPO.split("/")[-1]
|
| 37 |
)
|
| 38 |
|
| 39 |
-
_lock = threading.Lock()
|
|
|
|
| 40 |
_llm = None
|
| 41 |
_load_error = None
|
| 42 |
|
|
@@ -50,22 +51,44 @@ def model_id():
|
|
| 50 |
|
| 51 |
|
| 52 |
def _get_local():
|
|
|
|
|
|
|
|
|
|
| 53 |
global _llm, _load_error
|
| 54 |
if _llm is not None:
|
| 55 |
return _llm
|
| 56 |
if _load_error is not None:
|
| 57 |
raise LlmUnavailable(_load_error)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
if
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
def _stream_external(system, user, max_tokens, temperature):
|
|
@@ -113,6 +136,10 @@ def stream_chat(system, user, max_tokens=400, temperature=0.8, should_stop=None)
|
|
| 113 |
one CPU model never decodes two requests at once. `should_stop()` is polled each
|
| 114 |
chunk so an abandoned request (client gone) stops promptly and frees the lock.
|
| 115 |
Raises LlmUnavailable if no backend is available or the model is busy."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
if not _lock.acquire(timeout=2):
|
| 117 |
raise LlmUnavailable("the model is busy with another request — try again in a moment")
|
| 118 |
try:
|
|
|
|
| 36 |
or HF_REPO.split("/")[-1]
|
| 37 |
)
|
| 38 |
|
| 39 |
+
_lock = threading.Lock() # serializes GENERATION (one CPU model at a time)
|
| 40 |
+
_load_lock = threading.Lock() # serializes the one-time model LOAD (the slow download)
|
| 41 |
_llm = None
|
| 42 |
_load_error = None
|
| 43 |
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def _get_local():
    """Return the process-wide llama.cpp model, loading it on first use.

    The load is guarded by ``_load_lock`` rather than the generation lock, so a slow
    first-time download does not make concurrent requests time out — they block
    here until the load has finished.

    Both outcomes are cached in module globals: a loaded model in ``_llm`` and a
    failure message in ``_load_error``. Once a failure is recorded, later calls
    raise immediately and the load is not retried.

    Returns:
        The object built by ``Llama(...)`` (when ``MODEL_PATH`` is set) or by
        ``Llama.from_pretrained(...)`` otherwise.

    Raises:
        LlmUnavailable: if this or an earlier load attempt failed. When the
            failure happens in this call, the underlying exception is attached
            as ``__cause__``.
    """
    global _llm, _load_error
    # Fast path: no lock needed once an outcome has been recorded.
    if _llm is not None:
        return _llm
    if _load_error is not None:
        raise LlmUnavailable(_load_error)
    with _load_lock:
        # Re-check under the lock: another thread may have finished (or failed)
        # the load while this one was waiting to acquire it.
        if _llm is not None:
            return _llm
        if _load_error is not None:
            raise LlmUnavailable(_load_error)
        try:
            # Imported here so that an import failure is recorded as a load error.
            from llama_cpp import Llama

            common = dict(n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
            if MODEL_PATH:
                _llm = Llama(model_path=MODEL_PATH, **common)
            else:  # pulls + caches the GGUF from Hugging Face on first use
                _llm = Llama.from_pretrained(repo_id=HF_REPO, filename=HF_FILE, **common)
            return _llm
        except Exception as e:  # import / download / OOM / bad file
            _load_error = f"{type(e).__name__}: {e}"
            # Chain explicitly so the traceback marks the original error as the cause.
            raise LlmUnavailable(_load_error) from e
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def prewarm():
    """Start loading the local model on a daemon thread and return at once.

    Best-effort: nothing is started when ``BASE_URL`` is set (an external
    endpoint is in use, so there is nothing to load locally), and any exception
    raised by the load is swallowed inside the background thread.
    """

    def _bg():
        try:
            _get_local()
        except Exception:
            # Deliberately ignored: _get_local() has already stored the failure
            # in _load_error, so it is not lost.
            pass

    if not BASE_URL:
        worker = threading.Thread(target=_bg, daemon=True)
        worker.start()
|
| 92 |
|
| 93 |
|
| 94 |
def _stream_external(system, user, max_tokens, temperature):
|
|
|
|
| 136 |
one CPU model never decodes two requests at once. `should_stop()` is polled each
|
| 137 |
chunk so an abandoned request (client gone) stops promptly and frees the lock.
|
| 138 |
Raises LlmUnavailable if no backend is available or the model is busy."""
|
| 139 |
+
# Ensure the model is loaded FIRST (its own lock; the slow download must not count
|
| 140 |
+
# against the short generation-lock timeout below).
|
| 141 |
+
if not BASE_URL:
|
| 142 |
+
_get_local()
|
| 143 |
if not _lock.acquire(timeout=2):
|
| 144 |
raise LlmUnavailable("the model is busy with another request — try again in a moment")
|
| 145 |
try:
|