Spaces:

scikit-plots
/

ai-model

Running

App Files Files Community

celik-muhammed committed 3 days ago

Commit

fbe25b6

verified ·

1 Parent(s): 7f60b12

Upload 3 files

Browse files

Files changed (1) hide show

app.py +179 -35

app.py CHANGED Viewed

@@ -73,7 +73,7 @@
 #   * Blocking asyncio event loop with synchronous inference.
 #   * Concurrent model.cuda() / model.cpu() without a lock.
 #
-# ASSEMBLY DIAGRAM (v2.1.0)
 # ─────────────────────────
 #
 #   HuggingFace Spaces
@@ -662,36 +662,124 @@ logger.info("Validation helpers initialized successfully.")
 _MODEL_LOCK: Final[threading.Lock] = threading.Lock()
 # ─────────────────────────────────────────────────────────────────────────────
 # Model loading
 # ─────────────────────────────────────────────────────────────────────────────
-# Tokenizer remains CPU-only throughout the Space lifetime.
-# Model loads on CPU here; moved to GPU only inside @spaces.GPU.
 # Never call .to("cuda") or device_map="auto" at module level —
 # CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
-logger.info("Loading tokenizer for MODEL_ID=%s", MODEL_ID)
-_tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_ID,
-)
-logger.info("Tokenizer loaded successfully.")
-logger.info(
-    "Loading model on CPU "
-    "(low_cpu_mem_usage=True, torch_dtype=bfloat16)..."
-)
-_model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    device_map="cpu",
-)
-logger.info("Model loaded on CPU successfully.")
 # ─────────────────────────────────────────────────────────────────────────────
@@ -751,6 +839,12 @@ def _generate(
     Notes
     -----
     Developer note
         GPU is acquired automatically by ``@spaces.GPU``.
         ``_MODEL_LOCK`` is held for the entire inference duration
@@ -783,6 +877,16 @@ def _generate(
         Do not call this function directly from async code.
         Use ``_generate_async`` from FastAPI routes.
     """
     validated_messages = _validate_messages(messages)
     max_new_tokens = _clamp_max_tokens(max_new_tokens)
@@ -936,6 +1040,12 @@ async def _generate_async(
     Notes
     -----
     Developer note
         Offloads the synchronous ``_generate`` call to a thread via
         ``asyncio.to_thread`` so the asyncio event loop is not blocked
         during GPU inference.
@@ -943,6 +1053,11 @@ async def _generate_async(
         Must NOT be called from Gradio event handlers — use ``_generate``
         directly from Gradio since it runs in its own thread pool.
     """
     return await asyncio.to_thread(
         _generate,
         messages,
@@ -1154,9 +1269,9 @@ def _build_completion_response(
         current ``_generate`` implementation does not expose partial
         stop conditions. Extend this if streaming or early stopping
         is added.
-        ``system_fingerprint`` is derived from the model ID slug to
-        satisfy OpenAI SDK response parsing without exposing internal
-        infrastructure details.
     User note
         The returned dict is compatible with OpenAI Python SDK
@@ -1167,21 +1282,12 @@ def _build_completion_response(
     .. [1] OpenAI API reference: Chat completions object
            https://platform.openai.com/docs/api-reference/chat/object
     """
-    # Derive a deterministic, URL-safe fingerprint from the model ID.
-    _model_slug = (
-        model_id
-        .lower()
-        .replace("/", "-")
-        .replace(".", "-")
-        .replace("_", "-")
-    )
     return {
         "id": f"chatcmpl-{uuid.uuid4().hex}",
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model_id,
-        "system_fingerprint": f"fp-{_model_slug}",
         "choices": [
             {
                 "index": 0,
@@ -1403,6 +1509,12 @@ def _gradio_respond(
     Notes
     -----
     Developer note
         Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
         Must NOT call ``_generate_async`` (async) because Gradio
         dispatches event handlers via its own thread pool, completely
@@ -1420,6 +1532,11 @@ def _gradio_respond(
             "Message must be a non-empty string."
         )
     messages = _normalize_gradio_history(history)
     messages.append(
@@ -1588,7 +1705,7 @@ async def health() -> JSONResponse:
     Examples
     --------
     >>> # curl http://localhost:7860/health
-    ... # {"status": "ok", "model": "...", "version": "2.1.0"}
     """
     logger.info("GET /health")
@@ -1596,7 +1713,8 @@ async def health() -> JSONResponse:
         content={
             "status": "ok",
             "model": MODEL_ID,
-            "version": "2.1.0",
         },
         status_code=200,
     )
@@ -1642,6 +1760,8 @@ async def chat_completions(  # noqa: PLR0911
         3. Extract ``messages``, ``max_tokens``, ``temperature``,
            ``top_p``, and ``model`` fields.
         4. Validate with field-specific validators (400 guard).
         5. Count prompt tokens on CPU (no GPU needed).
         6. Dispatch to ``_generate_async`` which offloads to
            ``@spaces.GPU`` via ``asyncio.to_thread``.
@@ -1771,6 +1891,29 @@ async def chat_completions(  # noqa: PLR0911
         top_p,
     )
     # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
     prompt_tokens: int = _count_prompt_tokens(messages)
@@ -1866,13 +2009,14 @@ logger.info(
 logger.info(
     "scikit-plots ai-model Space initialized successfully.\n"
-    "  version   : 2.1.0\n"
     "  model     : %s\n"
     "  CORS      : %s\n"
     "  max_body  : %s bytes\n"
     "  ASGI root : Gradio (ZeroGPU-compatible)\n"
     "  routes    : GET /health | POST /v1/chat/completions\n"
     "  test UI   : / (root, developer only)",
     MODEL_ID,
     CORS_ORIGINS,
     MAX_BODY_BYTES,

 #   * Blocking asyncio event loop with synchronous inference.
 #   * Concurrent model.cuda() / model.cpu() without a lock.
 #
+# ASSEMBLY DIAGRAM (v2.2.0)
 # ─────────────────────────
 #
 #   HuggingFace Spaces
 _MODEL_LOCK: Final[threading.Lock] = threading.Lock()
+# ─────────────────────────────────────────────────────────────────────────────
+# Initialization lock and readiness event
+# ─────────────────────────────────────────────────────────────────────────────
+# _INIT_LOCK       — guards the one-time model initialisation inside
+#                    _ensure_model_loaded().  Held only during CPU-side
+#                    loading, never inside @spaces.GPU.  Separate from
+#                    _MODEL_LOCK which serialises GPU device transitions.
+#                    The two locks have strictly disjoint scopes and are
+#                    never held simultaneously: no deadlock risk.
+#
+# _model_is_loaded — threading.Event set exactly once after a successful
+#                    load.  Provides a lock-free fast path on every
+#                    subsequent call to _ensure_model_loaded() and
+#                    exposes model readiness in /health.
+_INIT_LOCK: Final[threading.Lock] = threading.Lock()
+_model_is_loaded: Final[threading.Event] = threading.Event()
 # ─────────────────────────────────────────────────────────────────────────────
 # Model loading
 # ─────────────────────────────────────────────────────────────────────────────
+# _tokenizer and _model are both None at module import; they are loaded
+# exactly once, on the first inference request, via _ensure_model_loaded().
+# This prevents OOM when Gradio 6.x or ZeroGPU session management spawns a
+# secondary Python worker process: a process that receives no inference
+# request never loads the model and therefore stays well within the 16 GB
+# RAM hard limit.
 # Never call .to("cuda") or device_map="auto" at module level —
 # CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
+_tokenizer: AutoTokenizer | None = None
+_model: AutoModelForCausalLM | None = None
+def _ensure_model_loaded() -> None:
+    """
+    Load tokenizer and model exactly once; no-op on subsequent calls.
+    Uses double-checked locking on ``_INIT_LOCK``: a lock-free test of
+    ``_model_is_loaded`` first, then a second test under the lock, so
+    that loading runs at most once across all threads in the process.
+    Both objects are loaded into locals and published to the module
+    globals together, only after both loads succeed, so a failed load
+    never leaves ``_tokenizer`` set while ``_model`` is still ``None``.
+    Returns
+    -------
+    None
+    Raises
+    ------
+    Exception
+        Whatever ``AutoTokenizer.from_pretrained`` or
+        ``AutoModelForCausalLM.from_pretrained`` raises propagates
+        unchanged (it is not wrapped).  In that case
+        ``_model_is_loaded`` is never set and both globals keep their
+        previous value, so the next call retries the full sequence
+        from the tokenizer step.
+    Notes
+    -----
+    Developer note
+        Must be called by callers **before** the ``@spaces.GPU`` scope
+        so that model loading (a CPU-only operation) does not consume
+        ZeroGPU GPU quota.
+        From sync callers (e.g. ``_gradio_respond``): call directly.
+        From async callers (e.g. ``_generate_async``,
+        ``chat_completions``): call via
+        ``await asyncio.to_thread(_ensure_model_loaded)`` to prevent
+        blocking the asyncio event loop during the first load.
+        Lock scope: ``_INIT_LOCK`` is held only during CPU-side loading,
+        never inside ``@spaces.GPU``.  ``_MODEL_LOCK`` serialises GPU
+        device transitions inside ``_generate``.  The two locks have
+        strictly disjoint scopes — no deadlock risk.
+    User note
+        The first inference request after a cold start may take
+        30–120 seconds while the model downloads and loads to CPU.
+        Subsequent requests within the same active session complete in
+        seconds.
+    """
+    # Declared at function top so the rebinding of module state is
+    # visible before any control flow, not buried inside the lock body.
+    global _tokenizer, _model  # noqa: PLW0603
+    # Fast path — lock-free check on the threading.Event.
+    if _model_is_loaded.is_set():
+        return
+    with _INIT_LOCK:
+        # Double-checked locking: re-test inside the mutex in case
+        # another thread completed loading between the fast-path check
+        # above and lock acquisition.
+        if _model_is_loaded.is_set():
+            return
+        logger.info("Loading tokenizer for MODEL_ID=%s", MODEL_ID)
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        logger.info("Tokenizer loaded successfully.")
+        logger.info(
+            "Loading model on CPU "
+            "(low_cpu_mem_usage=True, torch_dtype=bfloat16)..."
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            device_map="cpu",
+        )
+        logger.info("Model loaded on CPU successfully.")
+        # Publish both globals together, only after both loads succeeded.
+        # An exception above exits before this point, leaving the globals
+        # and _model_is_loaded untouched so the next request retries the
+        # full load sequence.
+        _tokenizer = tokenizer
+        _model = model
+        # Set the event last: the fast path must never observe the event
+        # set while either global is still unassigned.
+        _model_is_loaded.set()
 # ─────────────────────────────────────────────────────────────────────────────
     Notes
     -----
     Developer note
+        Callers must invoke ``_ensure_model_loaded()`` **before** the
+        ``@spaces.GPU`` scope.  A guard at the start of this function
+        raises ``RuntimeError`` immediately if ``_tokenizer`` or
+        ``_model`` is ``None``, producing a clear programming-error
+        message instead of a cryptic ``AttributeError`` on ``None``.
         GPU is acquired automatically by ``@spaces.GPU``.
         ``_MODEL_LOCK`` is held for the entire inference duration
         Do not call this function directly from async code.
         Use ``_generate_async`` from FastAPI routes.
     """
+    # Guard: callers must invoke _ensure_model_loaded() before the
+    # @spaces.GPU scope.  This check makes the contract explicit and
+    # produces a clear RuntimeError instead of an AttributeError on None.
+    if _tokenizer is None or _model is None:
+        raise RuntimeError(
+            "_ensure_model_loaded() must be called by the caller "
+            "before entering the @spaces.GPU scope. "
+            "This is a programming error, not a user error."
+        )
     validated_messages = _validate_messages(messages)
     max_new_tokens = _clamp_max_tokens(max_new_tokens)
     Notes
     -----
     Developer note
+        Calls ``_ensure_model_loaded()`` via ``asyncio.to_thread``
+        before dispatching ``_generate``, so the CPU-only model load
+        does not block the asyncio event loop and does not consume
+        ZeroGPU GPU quota.  Subsequent calls hit the lock-free fast
+        path (``_model_is_loaded.is_set()``) and return immediately.
         Offloads the synchronous ``_generate`` call to a thread via
         ``asyncio.to_thread`` so the asyncio event loop is not blocked
         during GPU inference.
         Must NOT be called from Gradio event handlers — use ``_generate``
         directly from Gradio since it runs in its own thread pool.
     """
+    # Load tokenizer and model on first call only (CPU-only operation).
+    # Called before asyncio.to_thread(_generate) so loading completes
+    # before @spaces.GPU activates — ZeroGPU GPU quota is not consumed.
+    await asyncio.to_thread(_ensure_model_loaded)
     return await asyncio.to_thread(
         _generate,
         messages,
         current ``_generate`` implementation does not expose partial
         stop conditions. Extend this if streaming or early stopping
         is added.
+        ``system_fingerprint`` uses the pre-computed module-level
+        constant ``_SYSTEM_FINGERPRINT`` (derived from ``MODEL_ID`` at
+        import time) to avoid repeated string transformation per call.
     User note
         The returned dict is compatible with OpenAI Python SDK
     .. [1] OpenAI API reference: Chat completions object
            https://platform.openai.com/docs/api-reference/chat/object
     """
     return {
         "id": f"chatcmpl-{uuid.uuid4().hex}",
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model_id,
+        "system_fingerprint": _SYSTEM_FINGERPRINT,
         "choices": [
             {
                 "index": 0,
     Notes
     -----
     Developer note
+        Calls ``_ensure_model_loaded()`` directly (synchronous) before
+        ``_generate`` so the CPU-only model load does not consume
+        ZeroGPU GPU quota.  This is correct: Gradio dispatches event
+        handlers in its own thread pool, so calling a blocking function
+        here does not block the asyncio event loop.
         Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
         Must NOT call ``_generate_async`` (async) because Gradio
         dispatches event handlers via its own thread pool, completely
             "Message must be a non-empty string."
         )
+    # Load tokenizer and model on first call only (CPU-only operation).
+    # Called before _generate/@spaces.GPU so loading does not consume
+    # ZeroGPU GPU quota.  Gradio's thread pool makes this blocking call safe.
+    _ensure_model_loaded()
     messages = _normalize_gradio_history(history)
     messages.append(
     Examples
     --------
     >>> # curl http://localhost:7860/health
+    ... # {"status": "ok", "model": "...", "version": "2.2.0", "model_ready": true}
     """
     logger.info("GET /health")
         content={
             "status": "ok",
             "model": MODEL_ID,
+            "version": _VERSION,
+            "model_ready": _model_is_loaded.is_set(),
         },
         status_code=200,
     )
         3. Extract ``messages``, ``max_tokens``, ``temperature``,
            ``top_p``, and ``model`` fields.
         4. Validate with field-specific validators (400 guard).
+        4b. Lazy model load — ``_ensure_model_loaded()`` via
+            ``asyncio.to_thread`` (500 on failure).
         5. Count prompt tokens on CPU (no GPU needed).
         6. Dispatch to ``_generate_async`` which offloads to
            ``@spaces.GPU`` via ``asyncio.to_thread``.
         top_p,
     )
+    # ── 4b. Lazy model loading (CPU only, before GPU dispatch) ───────────────
+    # _ensure_model_loaded() must complete before _count_prompt_tokens
+    # (which needs _tokenizer) and before the @spaces.GPU scope inside
+    # _generate.  asyncio.to_thread prevents blocking the event loop on
+    # the first load (which may download ~14 GB from HuggingFace).
+    try:
+        await asyncio.to_thread(_ensure_model_loaded)
+    except Exception:  # noqa: BLE001
+        logger.exception(
+            "Model loading failed | request_id=%s",
+            request_id,
+        )
+        return _error_response(
+            message=(
+                "Model loading failed. "
+                "Please retry in a few minutes."
+            ),
+            error_type="server_error",
+            code="model_load_error",
+            status_code=500,
+        )
     # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
     prompt_tokens: int = _count_prompt_tokens(messages)
 logger.info(
     "scikit-plots ai-model Space initialized successfully.\n"
+    "  version   : %s\n"
     "  model     : %s\n"
     "  CORS      : %s\n"
     "  max_body  : %s bytes\n"
     "  ASGI root : Gradio (ZeroGPU-compatible)\n"
     "  routes    : GET /health | POST /v1/chat/completions\n"
     "  test UI   : / (root, developer only)",
+    _VERSION,
     MODEL_ID,
     CORS_ORIGINS,
     MAX_BODY_BYTES,