celik-muhammed committed on
Commit
fbe25b6
·
verified ·
1 Parent(s): 7f60b12

Upload 3 files

Browse files
Files changed (1) hide show
  1. app.py +179 -35
app.py CHANGED
@@ -73,7 +73,7 @@
73
  # * Blocking asyncio event loop with synchronous inference.
74
  # * Concurrent model.cuda() / model.cpu() without a lock.
75
  #
76
- # ASSEMBLY DIAGRAM (v2.1.0)
77
  # ─────────────────────────
78
  #
79
  # HuggingFace Spaces
@@ -662,36 +662,124 @@ logger.info("Validation helpers initialized successfully.")
662
 
663
  _MODEL_LOCK: Final[threading.Lock] = threading.Lock()
664
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
666
  # ─────────────────────────────────────────────────────────────────────────────
667
  # Model loading
668
  # ─────────────────────────────────────────────────────────────────────────────
669
- # Tokenizer remains CPU-only throughout the Space lifetime.
670
- # Model loads on CPU here; moved to GPU only inside @spaces.GPU.
 
 
 
671
  # Never call .to("cuda") or device_map="auto" at module level —
672
  # CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
673
 
674
- logger.info("Loading tokenizer for MODEL_ID=%s", MODEL_ID)
 
675
 
676
- _tokenizer = AutoTokenizer.from_pretrained(
677
- MODEL_ID,
678
- )
679
 
680
- logger.info("Tokenizer loaded successfully.")
 
 
681
 
682
- logger.info(
683
- "Loading model on CPU "
684
- "(low_cpu_mem_usage=True, torch_dtype=bfloat16)..."
685
- )
686
 
687
- _model = AutoModelForCausalLM.from_pretrained(
688
- MODEL_ID,
689
- torch_dtype=torch.bfloat16,
690
- low_cpu_mem_usage=True,
691
- device_map="cpu",
692
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
 
694
- logger.info("Model loaded on CPU successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
 
696
 
697
  # ─────────────────────────────────────────────────────────────────────────────
@@ -751,6 +839,12 @@ def _generate(
751
  Notes
752
  -----
753
  Developer note
 
 
 
 
 
 
754
  GPU is acquired automatically by ``@spaces.GPU``.
755
 
756
  ``_MODEL_LOCK`` is held for the entire inference duration
@@ -783,6 +877,16 @@ def _generate(
783
  Do not call this function directly from async code.
784
  Use ``_generate_async`` from FastAPI routes.
785
  """
 
 
 
 
 
 
 
 
 
 
786
  validated_messages = _validate_messages(messages)
787
  max_new_tokens = _clamp_max_tokens(max_new_tokens)
788
 
@@ -936,6 +1040,12 @@ async def _generate_async(
936
  Notes
937
  -----
938
  Developer note
 
 
 
 
 
 
939
  Offloads the synchronous ``_generate`` call to a thread via
940
  ``asyncio.to_thread`` so the asyncio event loop is not blocked
941
  during GPU inference.
@@ -943,6 +1053,11 @@ async def _generate_async(
943
  Must NOT be called from Gradio event handlers — use ``_generate``
944
  directly from Gradio since it runs in its own thread pool.
945
  """
 
 
 
 
 
946
  return await asyncio.to_thread(
947
  _generate,
948
  messages,
@@ -1154,9 +1269,9 @@ def _build_completion_response(
1154
  current ``_generate`` implementation does not expose partial
1155
  stop conditions. Extend this if streaming or early stopping
1156
  is added.
1157
- ``system_fingerprint`` is derived from the model ID slug to
1158
- satisfy OpenAI SDK response parsing without exposing internal
1159
- infrastructure details.
1160
 
1161
  User note
1162
  The returned dict is compatible with OpenAI Python SDK
@@ -1167,21 +1282,12 @@ def _build_completion_response(
1167
  .. [1] OpenAI API reference: Chat completions object
1168
  https://platform.openai.com/docs/api-reference/chat/object
1169
  """
1170
- # Derive a deterministic, URL-safe fingerprint from the model ID.
1171
- _model_slug = (
1172
- model_id
1173
- .lower()
1174
- .replace("/", "-")
1175
- .replace(".", "-")
1176
- .replace("_", "-")
1177
- )
1178
-
1179
  return {
1180
  "id": f"chatcmpl-{uuid.uuid4().hex}",
1181
  "object": "chat.completion",
1182
  "created": int(time.time()),
1183
  "model": model_id,
1184
- "system_fingerprint": f"fp-{_model_slug}",
1185
  "choices": [
1186
  {
1187
  "index": 0,
@@ -1403,6 +1509,12 @@ def _gradio_respond(
1403
  Notes
1404
  -----
1405
  Developer note
 
 
 
 
 
 
1406
  Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
1407
  Must NOT call ``_generate_async`` (async) because Gradio
1408
  dispatches event handlers via its own thread pool, completely
@@ -1420,6 +1532,11 @@ def _gradio_respond(
1420
  "Message must be a non-empty string."
1421
  )
1422
 
 
 
 
 
 
1423
  messages = _normalize_gradio_history(history)
1424
 
1425
  messages.append(
@@ -1588,7 +1705,7 @@ async def health() -> JSONResponse:
1588
  Examples
1589
  --------
1590
  >>> # curl http://localhost:7860/health
1591
- ... # {"status": "ok", "model": "...", "version": "2.1.0"}
1592
  """
1593
  logger.info("GET /health")
1594
 
@@ -1596,7 +1713,8 @@ async def health() -> JSONResponse:
1596
  content={
1597
  "status": "ok",
1598
  "model": MODEL_ID,
1599
- "version": "2.1.0",
 
1600
  },
1601
  status_code=200,
1602
  )
@@ -1642,6 +1760,8 @@ async def chat_completions( # noqa: PLR0911
1642
  3. Extract ``messages``, ``max_tokens``, ``temperature``,
1643
  ``top_p``, and ``model`` fields.
1644
  4. Validate with field-specific validators (400 guard).
 
 
1645
  5. Count prompt tokens on CPU (no GPU needed).
1646
  6. Dispatch to ``_generate_async`` which offloads to
1647
  ``@spaces.GPU`` via ``asyncio.to_thread``.
@@ -1771,6 +1891,29 @@ async def chat_completions( # noqa: PLR0911
1771
  top_p,
1772
  )
1773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1774
  # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
1775
 
1776
  prompt_tokens: int = _count_prompt_tokens(messages)
@@ -1866,13 +2009,14 @@ logger.info(
1866
 
1867
  logger.info(
1868
  "scikit-plots ai-model Space initialized successfully.\n"
1869
- " version : 2.1.0\n"
1870
  " model : %s\n"
1871
  " CORS : %s\n"
1872
  " max_body : %s bytes\n"
1873
  " ASGI root : Gradio (ZeroGPU-compatible)\n"
1874
  " routes : GET /health | POST /v1/chat/completions\n"
1875
  " test UI : / (root, developer only)",
 
1876
  MODEL_ID,
1877
  CORS_ORIGINS,
1878
  MAX_BODY_BYTES,
 
73
  # * Blocking asyncio event loop with synchronous inference.
74
  # * Concurrent model.cuda() / model.cpu() without a lock.
75
  #
76
+ # ASSEMBLY DIAGRAM (v2.2.0)
77
  # ─────────────────────────
78
  #
79
  # HuggingFace Spaces
 
662
 
663
  _MODEL_LOCK: Final[threading.Lock] = threading.Lock()
664
 
665
+ # ─────────────────────────────────────────────────────────────────────────────
666
+ # Initialization lock and readiness event
667
+ # ─────────────────────────────────────────────────────────────────────────────
668
+ # _INIT_LOCK — guards the one-time model initialisation inside
669
+ # _ensure_model_loaded(). Held only during CPU-side
670
+ # loading, never inside @spaces.GPU. Separate from
671
+ # _MODEL_LOCK which serialises GPU device transitions.
672
+ # The two locks have strictly disjoint scopes and are
673
+ # never held simultaneously: no deadlock risk.
674
+ #
675
+ # _model_is_loaded — threading.Event set exactly once after a successful
676
+ # load. Provides a lock-free fast path on every
677
+ # subsequent call to _ensure_model_loaded() and
678
+ # exposes model readiness in /health.
679
+
680
+ _INIT_LOCK: Final[threading.Lock] = threading.Lock()
681
+ _model_is_loaded: Final[threading.Event] = threading.Event()
682
+
683
 
684
  # ─────────────────────────────────────────────────────────────────────────────
685
  # Model loading
686
  # ─────────────────────────────────────────────────────────────────────────────
687
+ # Both are None at module import; loaded exactly once on the first
688
+ # inference request via _ensure_model_loaded(). This prevents OOM when
689
+ # Gradio 6.x or ZeroGPU session management spawns a secondary Python
690
+ # worker process: a process that receives no inference request never loads
691
+ # the model and therefore stays well within the 16 GB RAM hard limit.
692
  # Never call .to("cuda") or device_map="auto" at module level —
693
  # CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
694
 
695
+ _tokenizer: AutoTokenizer | None = None
696
+ _model: AutoModelForCausalLM | None = None
697
 
 
 
 
698
 
699
+ def _ensure_model_loaded() -> None:
700
+ """
701
+ Load tokenizer and model exactly once; no-op on subsequent calls.
702
 
703
+ Uses double-checked locking (``_INIT_LOCK``) to guarantee that
704
+ tokenizer and model loading occur at most once across all threads in
705
+ the process. After the first successful load, all subsequent calls
706
+ return immediately via a lock-free check on ``_model_is_loaded``.
707
 
708
+ Returns
709
+ -------
710
+ None
711
+
712
+ Raises
713
+ ------
714
+ RuntimeError
715
+ If ``AutoTokenizer.from_pretrained`` or
716
+ ``AutoModelForCausalLM.from_pretrained`` raises. If the
717
+ tokenizer loads but the model fails, ``_model_is_loaded`` is
718
+ never set so the next call retries the full sequence from
719
+ the tokenizer step.
720
+
721
+ Notes
722
+ -----
723
+ Developer note
724
+ Must be called by callers **before** the ``@spaces.GPU`` scope
725
+ so that model loading (a CPU-only operation) does not consume
726
+ ZeroGPU GPU quota.
727
+
728
+ From sync callers (e.g. ``_gradio_respond``): call directly.
729
+
730
+ From async callers (e.g. ``_generate_async``,
731
+ ``chat_completions``): call via
732
+ ``await asyncio.to_thread(_ensure_model_loaded)`` to prevent
733
+ blocking the asyncio event loop during the first load.
734
+
735
+ Lock scope: ``_INIT_LOCK`` is held only during CPU-side loading,
736
+ never inside ``@spaces.GPU``. ``_MODEL_LOCK`` serialises GPU
737
+ device transitions inside ``_generate``. The two locks have
738
+ strictly disjoint scopes — no deadlock risk.
739
+
740
+ User note
741
+ The first inference request after a cold start may take
742
+ 30–120 seconds while the model downloads and loads to CPU.
743
+ Subsequent requests within the same active session complete in
744
+ seconds.
745
+ """
746
+ # Fast path — lock-free check on the threading.Event.
747
+ if _model_is_loaded.is_set():
748
+ return
749
+
750
+ with _INIT_LOCK:
751
+ # Double-checked locking: re-test inside the mutex in case
752
+ # another thread completed loading between the fast-path check
753
+ # above and lock acquisition.
754
+ if _model_is_loaded.is_set():
755
+ return
756
+
757
+ global _tokenizer, _model # noqa: PLW0603
758
 
759
+ logger.info("Loading tokenizer for MODEL_ID=%s", MODEL_ID)
760
+
761
+ _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
762
+
763
+ logger.info("Tokenizer loaded successfully.")
764
+
765
+ logger.info(
766
+ "Loading model on CPU "
767
+ "(low_cpu_mem_usage=True, torch_dtype=bfloat16)..."
768
+ )
769
+
770
+ _model = AutoModelForCausalLM.from_pretrained(
771
+ MODEL_ID,
772
+ torch_dtype=torch.bfloat16,
773
+ low_cpu_mem_usage=True,
774
+ device_map="cpu",
775
+ )
776
+
777
+ logger.info("Model loaded on CPU successfully.")
778
+
779
+ # Set the event last, only after both loads succeed.
780
+ # Any exception above leaves _model_is_loaded unset so the
781
+ # next request retries the full load sequence.
782
+ _model_is_loaded.set()
783
 
784
 
785
  # ─────────────────────────────────────────────────────────────────────────────
 
839
  Notes
840
  -----
841
  Developer note
842
+ Callers must invoke ``_ensure_model_loaded()`` **before** the
843
+ ``@spaces.GPU`` scope. A guard at the start of this function
844
+ raises ``RuntimeError`` immediately if ``_tokenizer`` or
845
+ ``_model`` is ``None``, producing a clear programming-error
846
+ message instead of a cryptic ``AttributeError`` on ``None``.
847
+
848
  GPU is acquired automatically by ``@spaces.GPU``.
849
 
850
  ``_MODEL_LOCK`` is held for the entire inference duration
 
877
  Do not call this function directly from async code.
878
  Use ``_generate_async`` from FastAPI routes.
879
  """
880
+ # Guard: callers must invoke _ensure_model_loaded() before the
881
+ # @spaces.GPU scope. This check makes the contract explicit and
882
+ # produces a clear RuntimeError instead of an AttributeError on None.
883
+ if _tokenizer is None or _model is None:
884
+ raise RuntimeError(
885
+ "_ensure_model_loaded() must be called by the caller "
886
+ "before entering the @spaces.GPU scope. "
887
+ "This is a programming error, not a user error."
888
+ )
889
+
890
  validated_messages = _validate_messages(messages)
891
  max_new_tokens = _clamp_max_tokens(max_new_tokens)
892
 
 
1040
  Notes
1041
  -----
1042
  Developer note
1043
+ Calls ``_ensure_model_loaded()`` via ``asyncio.to_thread``
1044
+ before dispatching ``_generate``, so the CPU-only model load
1045
+ does not block the asyncio event loop and does not consume
1046
+ ZeroGPU GPU quota. Subsequent calls hit the lock-free fast
1047
+ path (``_model_is_loaded.is_set()``) and return immediately.
1048
+
1049
  Offloads the synchronous ``_generate`` call to a thread via
1050
  ``asyncio.to_thread`` so the asyncio event loop is not blocked
1051
  during GPU inference.
 
1053
  Must NOT be called from Gradio event handlers — use ``_generate``
1054
  directly from Gradio since it runs in its own thread pool.
1055
  """
1056
+ # Load tokenizer and model on first call only (CPU-only operation).
1057
+ # Called before asyncio.to_thread(_generate) so loading completes
1058
+ # before @spaces.GPU activates — ZeroGPU GPU quota is not consumed.
1059
+ await asyncio.to_thread(_ensure_model_loaded)
1060
+
1061
  return await asyncio.to_thread(
1062
  _generate,
1063
  messages,
 
1269
  current ``_generate`` implementation does not expose partial
1270
  stop conditions. Extend this if streaming or early stopping
1271
  is added.
1272
+ ``system_fingerprint`` uses the pre-computed module-level
1273
+ constant ``_SYSTEM_FINGERPRINT`` (derived from ``MODEL_ID`` at
1274
+ import time) to avoid repeated string transformation per call.
1275
 
1276
  User note
1277
  The returned dict is compatible with OpenAI Python SDK
 
1282
  .. [1] OpenAI API reference: Chat completions object
1283
  https://platform.openai.com/docs/api-reference/chat/object
1284
  """
 
 
 
 
 
 
 
 
 
1285
  return {
1286
  "id": f"chatcmpl-{uuid.uuid4().hex}",
1287
  "object": "chat.completion",
1288
  "created": int(time.time()),
1289
  "model": model_id,
1290
+ "system_fingerprint": _SYSTEM_FINGERPRINT,
1291
  "choices": [
1292
  {
1293
  "index": 0,
 
1509
  Notes
1510
  -----
1511
  Developer note
1512
+ Calls ``_ensure_model_loaded()`` directly (synchronous) before
1513
+ ``_generate`` so the CPU-only model load does not consume
1514
+ ZeroGPU GPU quota. This is correct: Gradio dispatches event
1515
+ handlers in its own thread pool, so calling a blocking function
1516
+ here does not block the asyncio event loop.
1517
+
1518
  Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
1519
  Must NOT call ``_generate_async`` (async) because Gradio
1520
  dispatches event handlers via its own thread pool, completely
 
1532
  "Message must be a non-empty string."
1533
  )
1534
 
1535
+ # Load tokenizer and model on first call only (CPU-only operation).
1536
+ # Called before _generate/@spaces.GPU so loading does not consume
1537
+ # ZeroGPU GPU quota. Gradio's thread pool makes this blocking call safe.
1538
+ _ensure_model_loaded()
1539
+
1540
  messages = _normalize_gradio_history(history)
1541
 
1542
  messages.append(
 
1705
  Examples
1706
  --------
1707
  >>> # curl http://localhost:7860/health
1708
+ ... # {"status": "ok", "model": "...", "version": "2.2.0", "model_ready": true}
1709
  """
1710
  logger.info("GET /health")
1711
 
 
1713
  content={
1714
  "status": "ok",
1715
  "model": MODEL_ID,
1716
+ "version": _VERSION,
1717
+ "model_ready": _model_is_loaded.is_set(),
1718
  },
1719
  status_code=200,
1720
  )
 
1760
  3. Extract ``messages``, ``max_tokens``, ``temperature``,
1761
  ``top_p``, and ``model`` fields.
1762
  4. Validate with field-specific validators (400 guard).
1763
+ 4b. Lazy model load — ``_ensure_model_loaded()`` via
1764
+ ``asyncio.to_thread`` (500 on failure).
1765
  5. Count prompt tokens on CPU (no GPU needed).
1766
  6. Dispatch to ``_generate_async`` which offloads to
1767
  ``@spaces.GPU`` via ``asyncio.to_thread``.
 
1891
  top_p,
1892
  )
1893
 
1894
+ # ── 4b. Lazy model loading (CPU only, before GPU dispatch) ───────────────
1895
+ # _ensure_model_loaded() must complete before _count_prompt_tokens
1896
+ # (which needs _tokenizer) and before the @spaces.GPU scope inside
1897
+ # _generate. asyncio.to_thread prevents blocking the event loop on
1898
+ # the first load (which may download ~14 GB from HuggingFace).
1899
+
1900
+ try:
1901
+ await asyncio.to_thread(_ensure_model_loaded)
1902
+ except Exception: # noqa: BLE001
1903
+ logger.exception(
1904
+ "Model loading failed | request_id=%s",
1905
+ request_id,
1906
+ )
1907
+ return _error_response(
1908
+ message=(
1909
+ "Model loading failed. "
1910
+ "Please retry in a few minutes."
1911
+ ),
1912
+ error_type="server_error",
1913
+ code="model_load_error",
1914
+ status_code=500,
1915
+ )
1916
+
1917
  # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
1918
 
1919
  prompt_tokens: int = _count_prompt_tokens(messages)
 
2009
 
2010
  logger.info(
2011
  "scikit-plots ai-model Space initialized successfully.\n"
2012
+ " version : %s\n"
2013
  " model : %s\n"
2014
  " CORS : %s\n"
2015
  " max_body : %s bytes\n"
2016
  " ASGI root : Gradio (ZeroGPU-compatible)\n"
2017
  " routes : GET /health | POST /v1/chat/completions\n"
2018
  " test UI : / (root, developer only)",
2019
+ _VERSION,
2020
  MODEL_ID,
2021
  CORS_ORIGINS,
2022
  MAX_BODY_BYTES,