polats committed on
Commit
f85d7c3
·
1 Parent(s): 1df0cfb

Persona: load model under its own lock + prewarm in background (cold-start download no longer makes requests 'busy')

Browse files
Files changed (2) hide show
  1. app.py +1 -0
  2. llm.py +39 -12
app.py CHANGED
@@ -290,6 +290,7 @@ app = gr.mount_gradio_app(fastapi_app, demo, path="/", head=HEAD, theme=gr.theme
290
 
291
 
292
  if __name__ == "__main__":
 
293
  # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
294
  # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
295
  uvicorn.run(app, host="0.0.0.0", port=7860,
 
290
 
291
 
292
  if __name__ == "__main__":
293
+ llm.prewarm() # load the GGUF in the background so the first request is warm
294
  # proxy_headers + trusting forwarded IPs lets Gradio honour X-Forwarded-Proto
295
  # from HF's edge, so it generates https (not http) asset URLs behind the proxy.
296
  uvicorn.run(app, host="0.0.0.0", port=7860,
llm.py CHANGED
@@ -36,7 +36,8 @@ MODEL_ID = (
36
  or HF_REPO.split("/")[-1]
37
  )
38
 
39
- _lock = threading.Lock()
 
40
  _llm = None
41
  _load_error = None
42
 
@@ -50,22 +51,44 @@ def model_id():
50
 
51
 
52
  def _get_local():
 
 
 
53
  global _llm, _load_error
54
  if _llm is not None:
55
  return _llm
56
  if _load_error is not None:
57
  raise LlmUnavailable(_load_error)
58
- try:
59
- from llama_cpp import Llama
60
- common = dict(n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
61
- if MODEL_PATH:
62
- _llm = Llama(model_path=MODEL_PATH, **common)
63
- else: # pulls + caches the GGUF from Hugging Face on first use
64
- _llm = Llama.from_pretrained(repo_id=HF_REPO, filename=HF_FILE, **common)
65
- return _llm
66
- except Exception as e: # import / download / OOM / bad file
67
- _load_error = f"{type(e).__name__}: {e}"
68
- raise LlmUnavailable(_load_error)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  def _stream_external(system, user, max_tokens, temperature):
@@ -113,6 +136,10 @@ def stream_chat(system, user, max_tokens=400, temperature=0.8, should_stop=None)
113
  one CPU model never decodes two requests at once. `should_stop()` is polled each
114
  chunk so an abandoned request (client gone) stops promptly and frees the lock.
115
  Raises LlmUnavailable if no backend is available or the model is busy."""
 
 
 
 
116
  if not _lock.acquire(timeout=2):
117
  raise LlmUnavailable("the model is busy with another request — try again in a moment")
118
  try:
 
36
  or HF_REPO.split("/")[-1]
37
  )
38
 
39
+ _lock = threading.Lock() # serializes GENERATION (one CPU model at a time)
40
+ _load_lock = threading.Lock() # serializes the one-time model LOAD (the slow download)
41
  _llm = None
42
  _load_error = None
43
 
 
51
 
52
 
53
def _get_local():
    """Return the shared llama.cpp model, loading the GGUF on first use.

    The load runs under ``_load_lock`` rather than the generation lock
    ``_lock``, so callers that arrive while the model is still loading block
    here until it is ready instead of running into the short generation-lock
    timeout.

    The outcome is cached at module level: a successful load is kept in
    ``_llm``; a failed load is kept as a message in ``_load_error`` and every
    later call re-raises it without retrying.

    Returns:
        The loaded ``llama_cpp.Llama`` instance.

    Raises:
        LlmUnavailable: if importing ``llama_cpp``, fetching the file, or
            constructing the model failed (now or on an earlier call). The
            triggering exception is attached as ``__cause__``.
    """
    global _llm, _load_error
    # Fast path: once the outcome is known there is no need to take the lock.
    if _llm is not None:
        return _llm
    if _load_error is not None:
        raise LlmUnavailable(_load_error)
    with _load_lock:
        # Re-check: another thread may have finished (or failed) the load
        # while this one was waiting for the lock.
        if _llm is not None:
            return _llm
        if _load_error is not None:
            raise LlmUnavailable(_load_error)
        try:
            from llama_cpp import Llama
            common = dict(n_ctx=N_CTX, n_threads=N_THREADS, verbose=False)
            if MODEL_PATH:
                _llm = Llama(model_path=MODEL_PATH, **common)
            else:  # pulls + caches the GGUF from Hugging Face on first use
                _llm = Llama.from_pretrained(repo_id=HF_REPO, filename=HF_FILE, **common)
        except Exception as e:  # import / download / OOM / bad file
            _load_error = f"{type(e).__name__}: {e}"
            # Chain explicitly so the root cause is reported as the direct
            # cause of LlmUnavailable in tracebacks.
            raise LlmUnavailable(_load_error) from e
        return _llm
78
+
79
+
80
def prewarm():
    """Start loading the local model on a background daemon thread.

    Returns immediately so the app can start while the model download/load
    proceeds. Does nothing when ``BASE_URL`` is set: an external endpoint is
    in use and there is nothing to load locally. Best-effort: any exception
    raised by the load is swallowed inside the worker.
    """

    def _warm_quietly():
        try:
            _get_local()
        except Exception:
            # _load_error is recorded by _get_local; callers fall back to the stub.
            pass

    if not BASE_URL:
        worker = threading.Thread(target=_warm_quietly, daemon=True)
        worker.start()
92
 
93
 
94
  def _stream_external(system, user, max_tokens, temperature):
 
136
  one CPU model never decodes two requests at once. `should_stop()` is polled each
137
  chunk so an abandoned request (client gone) stops promptly and frees the lock.
138
  Raises LlmUnavailable if no backend is available or the model is busy."""
139
+ # Ensure the model is loaded FIRST (its own lock; the slow download must not count
140
+ # against the short generation-lock timeout below).
141
+ if not BASE_URL:
142
+ _get_local()
143
  if not _lock.acquire(timeout=2):
144
  raise LlmUnavailable("the model is busy with another request — try again in a moment")
145
  try: