Spaces:

scikit-plots
/

ai-model

Running

App Files Files Community

celik-muhammed committed 3 days ago

Commit

4ee3014

verified ·

1 Parent(s): 385004a

Upload 3 files

Browse files

Files changed (1) hide show

app.py +398 -93

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# scikit-plots/ai-model  ·  app.py  v2.0.0
 #
 # PURPOSE
 # ───────
@@ -25,23 +25,38 @@
 # + Long-term maintainability
 # + Minimal hidden behavior
 #
 # CRITICAL: ZEROGPU ARCHITECTURE REQUIREMENT
 # ──────────────────────────────────────────
 # On HuggingFace Spaces with sdk: gradio, ZeroGPU hooks attach to the
 # Gradio server lifecycle. Gradio MUST be the ASGI root application.
 #
-# CORRECT (v2.0.0):
 #   Gradio (gr.Blocks) is the ASGI root.
 #   REST routes are registered on Gradio's internal FastAPI instance
 #   via gradio.routes.App.create_app(demo).
 #   @spaces.GPU is active on _generate.
 #
-# WRONG (v1.x.x):
 #   FastAPI was the ASGI root; Gradio was a child via gr.mount_gradio_app.
 #   @spaces.GPU was commented out.
 #   ZeroGPU hooks never activated.
 #
-# CRITICAL: ZERO-GPU MODEL PATTERN
 # ────────────────────────────────
 # CORRECT:
 #   * Tokenizer loaded on CPU at import time.
@@ -49,14 +64,16 @@
 #   * Model moved to GPU ONLY inside @spaces.GPU scope.
 #   * Model returned to CPU in finally block after inference.
 #   * torch.cuda.empty_cache() called after every request.
 #
 # WRONG:
 #   * pipeline(... device_map="auto") at module level.
 #   * model.to("cuda") outside @spaces.GPU scope.
 #   * Holding GPU between requests.
 #   * Blocking asyncio event loop with synchronous inference.
 #
-# ASSEMBLY DIAGRAM (v2.0.0)
 # ─────────────────────────
 #
 #   HuggingFace Spaces
@@ -77,6 +94,49 @@
 # MAX_BODY_BYTES
 #     Maximum accepted request size.
 #
 # SPDX-License-Identifier: BSD-3-Clause
 # Authors: The scikit-plots developers
@@ -98,6 +158,10 @@ Developer note
     HuggingFace Spaces exports the ``app`` variable. It must be the
     Gradio-rooted ASGI application returned by ``App.create_app``.
 User note
     The Gradio UI at ``/`` is for manual testing only.
     Production traffic routes through the proxy Space.
@@ -109,9 +173,10 @@ import asyncio
 import json
 import logging
 import os
 import time
 import uuid
-from typing import Final
 import gradio as gr  # type: ignore[]
 import spaces  # type: ignore[]  # ZeroGPU — must be imported before torch
@@ -231,6 +296,10 @@ _MAX_NEW_TOKENS_FLOOR: Final[int] = 1
 _MAX_NEW_TOKENS_CEIL: Final[int] = 4096
 _MAX_NEW_TOKENS_DEFAULT: Final[int] = 512
 DEFAULT_MAX_BODY_BYTES: Final[int] = 10 * 1024 * 1024
 MAX_BODY_BYTES: Final[int] = _safe_int(
@@ -295,15 +364,12 @@ def _clamp_max_tokens(
         parsed = int(value)
     except (TypeError, ValueError) as exc:
         raise ValueError(
-            f"max_tokens must be integer, got {value!r}"
         ) from exc
     return max(
         _MAX_NEW_TOKENS_FLOOR,
-        min(
-            parsed,
-            _MAX_NEW_TOKENS_CEIL,
-        ),
     )
@@ -369,9 +435,119 @@ def _validate_messages(
     return validated
 logger.info("Validation helpers initialized successfully.")
 # ─────────────────────────────────────────────────────────────────────────────
 # Model loading
 # ─────────────────────────────────────────────────────────────────────────────
@@ -411,6 +587,8 @@ logger.info("Model loaded on CPU successfully.")
 # * GPU exists only inside @spaces.GPU scope.
 # * Model moved CPU → GPU at entry; GPU → CPU in finally.
 # * VRAM fully released after every request.
 # * This function is called from both:
 #     - Gradio event handlers (direct sync call via _gradio_respond)
 #     - FastAPI route handlers (via asyncio.to_thread in _generate_async)
@@ -420,6 +598,8 @@ logger.info("Model loaded on CPU successfully.")
 def _generate(
     messages: list[dict[str, str]],
     max_new_tokens: int = _MAX_NEW_TOKENS_DEFAULT,
 ) -> str:
     """
     Run generation using ZeroGPU.
@@ -432,6 +612,14 @@ def _generate(
     max_new_tokens : int, default=512
         Maximum generated tokens.
     Returns
     -------
     str
@@ -443,24 +631,36 @@ def _generate(
         On invalid inputs or missing chat template.
     RuntimeError
-        On inference failure.
     Notes
     -----
     Developer note
         GPU is acquired automatically by ``@spaces.GPU``.
-        Model is explicitly moved CPU → GPU → CPU during each request
-        to avoid persistent VRAM ownership between requests.
         ``finally`` block ensures CPU return and cache clear even if
-        inference raises. The inner ``try`` around ``_model.cpu()``
-        guarantees ``torch.cuda.empty_cache()`` runs regardless of
-        whether the CPU move itself fails.
-        This function is intentionally synchronous. Async routes call
         it via ``_generate_async`` which wraps it with
-        ``asyncio.to_thread``. Gradio event handlers call it directly
         because Gradio dispatches handlers in its own thread pool,
         outside the asyncio event loop.
@@ -469,7 +669,6 @@ def _generate(
         Use ``_generate_async`` from FastAPI routes.
     """
     validated_messages = _validate_messages(messages)
     max_new_tokens = _clamp_max_tokens(max_new_tokens)
     if not getattr(_tokenizer, "chat_template", None):
@@ -481,69 +680,99 @@ def _generate(
     logger.info(
         "GPU generation starting | "
         "messages=%d | "
-        "max_new_tokens=%d",
         len(validated_messages),
         max_new_tokens,
     )
-    try:
-        logger.info("Moving model to GPU...")
-        _model.cuda()
-        input_ids = _tokenizer.apply_chat_template(
-            validated_messages,
-            add_generation_prompt=True,
-            return_tensors="pt",
-        )
-        input_ids = input_ids.cuda()
-        logger.info("Generation started.")
-        with torch.no_grad():
-            output_ids = _model.generate(
-                input_ids,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=0.7,
-                pad_token_id=_tokenizer.eos_token_id,
             )
-        new_token_ids = output_ids[0][input_ids.shape[-1]:]
-        decoded = _tokenizer.decode(
-            new_token_ids,
-            skip_special_tokens=True,
-        )
-        logger.info("Generation completed successfully.")
-        return decoded
-    except ValueError:
-        raise
-    except Exception as exc:
-        logger.exception("Inference failure.")
-        raise RuntimeError(
-            f"Inference failed: {exc}"
-        ) from exc
-    finally:
-        logger.info(
-            "Returning model to CPU "
-            "and clearing CUDA cache..."
-        )
-        try:
-            _model.cpu()
         finally:
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-        logger.info("GPU resources released.")
 # ─────────────────────────────────────────────────────────────────────────────
@@ -564,6 +793,8 @@ def _generate(
 async def _generate_async(
     messages: list[dict[str, str]],
     max_new_tokens: int,
 ) -> str:
     """
     Async wrapper for GPU generation.
@@ -576,6 +807,12 @@ async def _generate_async(
     max_new_tokens : int
         Generation token limit.
     Returns
     -------
     str
@@ -595,6 +832,8 @@ async def _generate_async(
         _generate,
         messages,
         max_new_tokens,
     )
@@ -651,7 +890,7 @@ async def _read_bounded_body(
 def _parse_request_body(
     raw: bytes,
-) -> dict:
     """
     Decode and parse a UTF-8 JSON request body.
@@ -662,7 +901,7 @@ def _parse_request_body(
     Returns
     -------
-    dict
         Parsed JSON payload.
     Raises
@@ -767,7 +1006,7 @@ def _build_completion_response(
     model_id: str,
     prompt_tokens: int,
     completion_tokens: int,
-) -> dict:
     """
     Build an OpenAI-compatible chat completion response payload.
@@ -787,7 +1026,7 @@ def _build_completion_response(
     Returns
     -------
-    dict
         OpenAI-compatible ``chat.completion`` object.
     Notes
@@ -800,6 +1039,9 @@ def _build_completion_response(
         current ``_generate`` implementation does not expose partial
         stop conditions. Extend this if streaming or early stopping
         is added.
     User note
         The returned dict is compatible with OpenAI Python SDK
@@ -810,11 +1052,21 @@ def _build_completion_response(
     .. [1] OpenAI API reference: Chat completions object
            https://platform.openai.com/docs/api-reference/chat/object
     """
     return {
         "id": f"chatcmpl-{uuid.uuid4().hex}",
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model_id,
         "choices": [
             {
                 "index": 0,
@@ -995,6 +1247,8 @@ def _gradio_respond(
     message: str,
     history: list,
     max_new_tokens: int,
 ) -> str:
     """
     Gradio ``ChatInterface`` event handler.
@@ -1011,6 +1265,12 @@ def _gradio_respond(
     max_new_tokens : int
         Maximum tokens to generate, sourced from the UI slider.
     Returns
     -------
     str
@@ -1022,7 +1282,8 @@ def _gradio_respond(
         If ``message`` is empty after stripping.
     RuntimeError
-        Propagated from ``_generate`` on inference failure.
     Notes
     -----
@@ -1056,24 +1317,30 @@ def _gradio_respond(
     logger.info(
         "Gradio inference | "
         "history_turns=%d | "
-        "max_new_tokens=%d",
         len(messages) - 1,
         max_new_tokens,
     )
     return _generate(
         messages,
         max_new_tokens,
     )
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
 # ─────────────────────────────────────────────────────────────────────────────
-# v2.0.0: Gradio is the ASGI ROOT — not a child sub-app mounted on FastAPI.
 # This is required for ZeroGPU to activate on HuggingFace Spaces.
 #
-# The Gradio UI is now served at / (root) instead of /ui.
 # Custom REST routes are added to Gradio's internal FastAPI instance below.
 _UI_WARNING = """\
@@ -1103,10 +1370,26 @@ with _gradio_ui:
                 step=1,
                 label="max_tokens",
                 info=(
-                    f"Range: {_MAX_NEW_TOKENS_FLOOR}-{_MAX_NEW_TOKENS_CEIL}. "
                     f"Default: {_MAX_NEW_TOKENS_DEFAULT}."
                 ),
             ),
         ],
         additional_inputs_accordion="Generation settings",
     )
@@ -1120,14 +1403,8 @@ logger.info(
 # ─────────────────────────────────────────────────────────────────────────────
 # App assembly — HuggingFace Space export
 # ─────────────────────────────────────────────────────────────────────────────
-# v2.0.0 ARCHITECTURE CHANGE:
-#
-# v1.x.x (BROKEN):
-#   _api = FastAPI()                          ← FastAPI was ASGI root
-#   app  = gr.mount_gradio_app(_api, ui, "/ui")  ← Gradio was child
-#   ZeroGPU never activated — wrong root.
 #
-# v2.0.0 (CORRECT):
 #   app = _GradioApp.create_app(_gradio_ui)   ← Gradio is ASGI root
 #   @app.get/post(...)                        ← routes on Gradio's FastAPI
 #   ZeroGPU activates correctly.
@@ -1157,6 +1434,7 @@ app.add_middleware(
     allow_headers=[
         "Content-Type",
     ],
 )
 logger.info(
@@ -1195,7 +1473,7 @@ async def health() -> JSONResponse:
     Examples
     --------
     >>> # curl http://localhost:7860/health
-    ... # {"status": "ok", "model": "...", "version": "2.0.0"}
     """
     logger.info("GET /health")
@@ -1203,7 +1481,7 @@ async def health() -> JSONResponse:
         content={
             "status": "ok",
             "model": MODEL_ID,
-            "version": "2.0.0",
         },
         status_code=200,
     )
@@ -1229,15 +1507,15 @@ async def chat_completions(  # noqa: PLR0911
     Returns
     -------
     JSONResponse
-        HTTP 200 with an OpenAI-compatible completion payload.
-    Raises
-    ------
-    JSONResponse
         HTTP 413 if the body exceeds ``MAX_BODY_BYTES``.
-        HTTP 400 if the body is not valid UTF-8 JSON.
-        HTTP 400 if ``messages`` or ``max_tokens`` fail validation.
-        HTTP 500 on inference failure.
     Notes
     -----
@@ -1246,9 +1524,9 @@ async def chat_completions(  # noqa: PLR0911
         1. Read and bound-check raw body bytes (413 guard).
         2. Decode and parse JSON (400 guard).
-        3. Extract ``messages`` and ``max_tokens`` fields.
-        4. Validate with ``_validate_messages`` and
-           ``_clamp_max_tokens`` (400 guard).
         5. Count prompt tokens on CPU (no GPU needed).
         6. Dispatch to ``_generate_async`` which offloads to
            ``@spaces.GPU`` via ``asyncio.to_thread``.
@@ -1261,6 +1539,10 @@ async def chat_completions(  # noqa: PLR0911
         * ``RuntimeError`` → 500 (wrapped inference failure from ``_generate``)
         * ``Exception``   → 500 (unexpected catch-all, never leaks internals)
     User note
         Compatible with the OpenAI Python SDK:
@@ -1274,6 +1556,8 @@ async def chat_completions(  # noqa: PLR0911
             response = client.chat.completions.create(
                 model="any",
                 messages=[{"role": "user", "content": "Hello"}],
             )
     """
     request_id = uuid.uuid4().hex
@@ -1324,12 +1608,25 @@ async def chat_completions(  # noqa: PLR0911
         "max_tokens",
         _MAX_NEW_TOKENS_DEFAULT,
     )
     # ── 4. Input validation ───────────────────────────────────────────────────
     try:
         messages = _validate_messages(messages_raw)
         max_new_tokens = _clamp_max_tokens(max_tokens_raw)
     except ValueError as exc:
         logger.warning(
             "Validation error | request_id=%s | error=%s",
@@ -1346,11 +1643,17 @@ async def chat_completions(  # noqa: PLR0911
     logger.info(
         "Dispatching inference | "
         "request_id=%s | "
         "messages=%d | "
-        "max_new_tokens=%d",
         request_id,
         len(messages),
         max_new_tokens,
     )
     # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
@@ -1363,6 +1666,8 @@ async def chat_completions(  # noqa: PLR0911
         content = await _generate_async(
             messages,
             max_new_tokens,
         )
     except ValueError as exc:
@@ -1446,7 +1751,7 @@ logger.info(
 logger.info(
     "scikit-plots ai-model Space initialized successfully.\n"
-    "  version   : 2.0.0\n"
     "  model     : %s\n"
     "  CORS      : %s\n"
     "  max_body  : %s bytes\n"

+# scikit-plots/ai-model  ·  app.py  v2.1.0
 #
 # PURPOSE
 # ───────
 # + Long-term maintainability
 # + Minimal hidden behavior
 #
+# CRITICAL: SINGLE-WORKER REQUIREMENT
+# ────────────────────────────────────
+# This Space MUST run with a single uvicorn worker.
+# The model (7B params, bfloat16) consumes ~14 GB of RAM on CPU.
+# The ZeroGPU hard RAM limit is 16 GB.
+#
+# Two workers × 14 GB = 28 GB → OOM → the OS kills the second process
+# with a clean exit code (0), which HuggingFace reports as "runtime error".
+#
+# HuggingFace Spaces with sdk: gradio default to a single worker, which
+# is the correct configuration. If you observe two initialization sequences
+# in the container log, verify that no external launcher is adding workers
+# (e.g., GRADIO_NUM_WORKERS, uvicorn --workers) and file a HuggingFace
+# support ticket if the double-start persists.
+#
 # CRITICAL: ZEROGPU ARCHITECTURE REQUIREMENT
 # ──────────────────────────────────────────
 # On HuggingFace Spaces with sdk: gradio, ZeroGPU hooks attach to the
 # Gradio server lifecycle. Gradio MUST be the ASGI root application.
 #
+# CORRECT (v2.x):
 #   Gradio (gr.Blocks) is the ASGI root.
 #   REST routes are registered on Gradio's internal FastAPI instance
 #   via gradio.routes.App.create_app(demo).
 #   @spaces.GPU is active on _generate.
 #
+# WRONG (v1.x):
 #   FastAPI was the ASGI root; Gradio was a child via gr.mount_gradio_app.
 #   @spaces.GPU was commented out.
 #   ZeroGPU hooks never activated.
 #
+# CRITICAL: ZEROGPU MODEL PATTERN
 # ────────────────────────────────
 # CORRECT:
 #   * Tokenizer loaded on CPU at import time.
 #   * Model moved to GPU ONLY inside @spaces.GPU scope.
 #   * Model returned to CPU in finally block after inference.
 #   * torch.cuda.empty_cache() called after every request.
+#   * _MODEL_LOCK serialises all model device transitions.
 #
 # WRONG:
 #   * pipeline(... device_map="auto") at module level.
 #   * model.to("cuda") outside @spaces.GPU scope.
 #   * Holding GPU between requests.
 #   * Blocking asyncio event loop with synchronous inference.
+#   * Concurrent model.cuda() / model.cpu() without a lock.
 #
+# ASSEMBLY DIAGRAM (v2.1.0)
 # ─────────────────────────
 #
 #   HuggingFace Spaces
 # MAX_BODY_BYTES
 #     Maximum accepted request size.
 #
+# CHANGES v2.0.0 → v2.1.0
+# ─────────────────────────
+# [CRITICAL] Add _MODEL_LOCK (threading.Lock) to serialise all model
+#            device transitions (cuda/cpu) across concurrent inference
+#            calls. Without this, concurrent @spaces.GPU activations
+#            can corrupt model device state.
+#
+# [CRITICAL] Explicit GPU tensor cleanup (del input_ids, output_ids,
+#            new_token_ids) in the success path before _model.cpu() and
+#            torch.cuda.empty_cache(). Ensures VRAM is fully released
+#            before the ZeroGPU scope exits.
+#
+# [HIGH]     Add `except RuntimeError: raise` to _generate exception
+#            chain so that RuntimeErrors (including the empty-response
+#            guard below) are not accidentally double-wrapped.
+#
+# [HIGH]     Guard against empty model output: raise RuntimeError if
+#            the decoded string is empty after skip_special_tokens.
+#
+# [MEDIUM]   temperature and top_p are now configurable from the
+#            request body (REST) and from sliders (Gradio UI).
+#            Defaults: temperature=0.7, top_p=1.0.
+#            temperature=0.0 → greedy decoding (do_sample=False).
+#
+# [MEDIUM]   Log the requested model field from the request body for
+#            proxy-routing diagnostics.
+#
+# [MEDIUM]   Fix chat_completions docstring: JSONResponse error cases
+#            moved from the incorrect Raises section to Notes, because
+#            they are returned values, not raised exceptions.
+#
+# [LOW]      _parse_request_body and _build_completion_response now
+#            carry precise dict[str, Any] return type annotations.
+#
+# [LOW]      system_fingerprint field added to completion response for
+#            improved OpenAI SDK compatibility.
+#
+# [LOW]      Explicit allow_credentials=False in CORS middleware.
+#
+# [DOC]      Prominent single-worker warning added to module header
+#            (see above) explaining the double-startup / exit-0 OOM
+#            failure mode observed in the container log.
+#
 # SPDX-License-Identifier: BSD-3-Clause
 # Authors: The scikit-plots developers
     HuggingFace Spaces exports the ``app`` variable. It must be the
     Gradio-rooted ASGI application returned by ``App.create_app``.
+    ``_MODEL_LOCK`` serialises all calls to ``_model.cuda()`` and
+    ``_model.cpu()``.  A single ``_model`` object must not have its
+    device changed by two threads simultaneously.
 User note
     The Gradio UI at ``/`` is for manual testing only.
     Production traffic routes through the proxy Space.
 import json
 import logging
 import os
+import threading
 import time
 import uuid
+from typing import Any, Final
 import gradio as gr  # type: ignore[]
 import spaces  # type: ignore[]  # ZeroGPU — must be imported before torch
 _MAX_NEW_TOKENS_CEIL: Final[int] = 4096
 _MAX_NEW_TOKENS_DEFAULT: Final[int] = 512
+# Generation defaults. top_p matches the OpenAI API default (1.0);
+# temperature (0.7) is lower than the OpenAI API default of 1.0.
+_DEFAULT_TEMPERATURE: Final[float] = 0.7
+_DEFAULT_TOP_P: Final[float] = 1.0
 DEFAULT_MAX_BODY_BYTES: Final[int] = 10 * 1024 * 1024
 MAX_BODY_BYTES: Final[int] = _safe_int(
         parsed = int(value)
     except (TypeError, ValueError) as exc:
         raise ValueError(
+            f"max_tokens must be an integer, got {value!r}"
         ) from exc
     return max(
         _MAX_NEW_TOKENS_FLOOR,
+        min(parsed, _MAX_NEW_TOKENS_CEIL),
     )
     return validated
+def _validate_temperature(
+    value: object,
+) -> float:
+    """
+    Validate and return a generation temperature value.
+    Parameters
+    ----------
+    value : object
+        Candidate temperature.
+    Returns
+    -------
+    float
+        Validated temperature in [0.0, 2.0].
+    Raises
+    ------
+    ValueError
+        If conversion fails or value is out of range.
+    Notes
+    -----
+    Developer note
+        ``temperature=0.0`` selects greedy decoding (``do_sample=False``).
+        The upper bound 2.0 matches the OpenAI API specification.
+    References
+    ----------
+    .. [1] OpenAI API reference: temperature parameter
+           https://platform.openai.com/docs/api-reference/chat/create#temperature
+    """
+    try:
+        parsed = float(value)
+    except (TypeError, ValueError) as exc:
+        raise ValueError(
+            f"temperature must be a number, got {value!r}"
+        ) from exc
+    if not (0.0 <= parsed <= 2.0):
+        raise ValueError(
+            f"temperature must be in [0.0, 2.0], got {parsed!r}"
+        )
+    return parsed
+def _validate_top_p(
+    value: object,
+) -> float:
+    """
+    Validate and return a nucleus-sampling top_p value.
+    Parameters
+    ----------
+    value : object
+        Candidate top_p.
+    Returns
+    -------
+    float
+        Validated top_p in (0.0, 1.0].
+    Raises
+    ------
+    ValueError
+        If conversion fails or value is out of range.
+    Notes
+    -----
+    Developer note
+        ``top_p=1.0`` effectively disables nucleus sampling.
+        OpenAI recommends altering temperature or top_p but not both.
+    References
+    ----------
+    .. [1] OpenAI API reference: top_p parameter
+           https://platform.openai.com/docs/api-reference/chat/create#top_p
+    """
+    try:
+        parsed = float(value)
+    except (TypeError, ValueError) as exc:
+        raise ValueError(
+            f"top_p must be a number, got {value!r}"
+        ) from exc
+    if not (0.0 < parsed <= 1.0):
+        raise ValueError(
+            f"top_p must be in (0.0, 1.0], got {parsed!r}"
+        )
+    return parsed
 logger.info("Validation helpers initialized successfully.")
+# ─────────────────────────────────────────────────────────────────────────────
+# Model lock
+# ─────────────────────────────────────────────────────────────────────────────
+# Serialises all _model.cuda() / _model.cpu() transitions.
+#
+# A single _model object must not be moved to different devices by two
+# threads simultaneously. @spaces.GPU does not prevent concurrent calls
+# by itself (the Gradio queue or multiple in-flight async requests can
+# dispatch _generate from multiple threads at the same time).
+#
+# Holding _MODEL_LOCK for the duration of the entire inference (cuda →
+# generate → cpu) is correct and safe: we are single-model, single-GPU.
+_MODEL_LOCK: Final[threading.Lock] = threading.Lock()
 # ─────────────────────────────────────────────────────────────────────────────
 # Model loading
 # ─────────────────────────────────────────────────────────────────────────────
 # * GPU exists only inside @spaces.GPU scope.
 # * Model moved CPU → GPU at entry; GPU → CPU in finally.
 # * VRAM fully released after every request.
+# * _MODEL_LOCK held for the full duration of the inference to prevent
+#   concurrent device transitions on the shared _model object.
 # * This function is called from both:
 #     - Gradio event handlers (direct sync call via _gradio_respond)
 #     - FastAPI route handlers (via asyncio.to_thread in _generate_async)
 def _generate(
     messages: list[dict[str, str]],
     max_new_tokens: int = _MAX_NEW_TOKENS_DEFAULT,
+    temperature: float = _DEFAULT_TEMPERATURE,
+    top_p: float = _DEFAULT_TOP_P,
 ) -> str:
     """
     Run generation using ZeroGPU.
     max_new_tokens : int, default=512
         Maximum generated tokens.
+    temperature : float, default=0.7
+        Sampling temperature in [0.0, 2.0].
+        ``0.0`` selects greedy decoding (do_sample=False).
+    top_p : float, default=1.0
+        Nucleus sampling cutoff in (0.0, 1.0].
+        ``1.0`` disables nucleus sampling.
     Returns
     -------
     str
         On invalid inputs or missing chat template.
     RuntimeError
+        On inference failure or empty model output.
     Notes
     -----
     Developer note
         GPU is acquired automatically by ``@spaces.GPU``.
+        ``_MODEL_LOCK`` is held for the entire inference duration
+        (cuda → generate → cpu) to prevent concurrent threads from
+        issuing conflicting device transitions on the shared ``_model``
+        object.  ZeroGPU + Gradio queue can dispatch this function from
+        multiple threads simultaneously; the lock serialises them.
+        GPU tensors (``input_ids``, ``output_ids``, ``new_token_ids``)
+        are explicitly deleted in the success path before ``_model.cpu()``
+        and ``torch.cuda.empty_cache()``.  This ensures VRAM is fully
+        reclaimed before the ``@spaces.GPU`` scope exits.  On the
+        exception path, any tensors that were assigned before the error
+        remain alive until the function exits (acceptable: ZeroGPU
+        releases all GPU memory at ``@spaces.GPU`` scope exit).
         ``finally`` block ensures CPU return and cache clear even if
+        inference raises.  The inner ``try/except`` around ``_model.cpu()``
+        logs and absorbs a potential CPU-move failure so that the
+        original inference exception is not masked; it still calls
+        ``torch.cuda.empty_cache()`` via its own nested ``finally``.
+        This function is intentionally synchronous.  Async routes call
         it via ``_generate_async`` which wraps it with
+        ``asyncio.to_thread``.  Gradio event handlers call it directly
         because Gradio dispatches handlers in its own thread pool,
         outside the asyncio event loop.
         Use ``_generate_async`` from FastAPI routes.
     """
     validated_messages = _validate_messages(messages)
     max_new_tokens = _clamp_max_tokens(max_new_tokens)
     if not getattr(_tokenizer, "chat_template", None):
     logger.info(
         "GPU generation starting | "
         "messages=%d | "
+        "max_new_tokens=%d | "
+        "temperature=%.2f | "
+        "top_p=%.2f",
         len(validated_messages),
         max_new_tokens,
+        temperature,
+        top_p,
     )
+    with _MODEL_LOCK:
+        try:
+            logger.info("Moving model to GPU...")
+            _model.cuda()
+            input_ids = _tokenizer.apply_chat_template(
+                validated_messages,
+                add_generation_prompt=True,
+                return_tensors="pt",
+            )
+            input_ids = input_ids.cuda()
+            logger.info("Generation started.")
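+            # Build generation kwargs.
+            # temperature=0.0 → no sampling kwargs are passed, so decoding
+            #   falls back to the model's generation_config (greedy only if
+            #   that config does not itself enable do_sample).
+            # temperature>0.0 → sampling; top_p applied only when < 1.0.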
+            generate_kwargs: dict[str, Any] = {
+                "max_new_tokens": max_new_tokens,
+                "pad_token_id": _tokenizer.eos_token_id,
+            }
+            if temperature > 0.0:
+                generate_kwargs["do_sample"] = True
+                generate_kwargs["temperature"] = temperature
+                if top_p < 1.0:
+                    generate_kwargs["top_p"] = top_p
+            with torch.no_grad():
+                output_ids = _model.generate(
+                    input_ids,
+                    **generate_kwargs,
+                )
+            new_token_ids = output_ids[0][input_ids.shape[-1]:]
+            decoded = _tokenizer.decode(
+                new_token_ids,
+                skip_special_tokens=True,
             )
+            # Release GPU tensors before CPU move and cache clear.
+            # new_token_ids is a view of output_ids; deleting both here
+            # drops all references, freeing the underlying CUDA storage.
+            del input_ids, output_ids, new_token_ids
+            if not decoded.strip():
+                raise RuntimeError(
+                    "Model returned an empty response. "
+                    "Retry or reduce prompt length."
+                )
+            logger.info("Generation completed successfully.")
+            return decoded
+        except ValueError:
+            raise
+        except RuntimeError:
+            raise
+        except Exception as exc:
+            logger.exception("Inference failure.")
+            raise RuntimeError(
+                f"Inference failed: {exc}"
+            ) from exc
         finally:
+            logger.info(
+                "Returning model to CPU "
+                "and clearing CUDA cache..."
+            )
+            try:
+                _model.cpu()
+            except Exception:  # noqa: BLE001
+                logger.exception(
+                    "Failed to move model back to CPU. "
+                    "VRAM may not be fully released."
+                )
+            finally:
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            logger.info("GPU resources released.")
 # ─────────────────────────────────────────────────────────────────────────────
 async def _generate_async(
     messages: list[dict[str, str]],
     max_new_tokens: int,
+    temperature: float = _DEFAULT_TEMPERATURE,
+    top_p: float = _DEFAULT_TOP_P,
 ) -> str:
     """
     Async wrapper for GPU generation.
     max_new_tokens : int
         Generation token limit.
+    temperature : float, default=0.7
+        Sampling temperature forwarded to ``_generate``.
+    top_p : float, default=1.0
+        Nucleus sampling cutoff forwarded to ``_generate``.
     Returns
     -------
     str
         _generate,
         messages,
         max_new_tokens,
+        temperature,
+        top_p,
     )
 def _parse_request_body(
     raw: bytes,
+) -> dict[str, Any]:
     """
     Decode and parse a UTF-8 JSON request body.
     Returns
     -------
+    dict[str, Any]
         Parsed JSON payload.
     Raises
     model_id: str,
     prompt_tokens: int,
     completion_tokens: int,
+) -> dict[str, Any]:
     """
     Build an OpenAI-compatible chat completion response payload.
     Returns
     -------
+    dict[str, Any]
         OpenAI-compatible ``chat.completion`` object.
     Notes
         current ``_generate`` implementation does not expose partial
         stop conditions. Extend this if streaming or early stopping
         is added.
+        ``system_fingerprint`` is derived from the model ID slug to
+        satisfy OpenAI SDK response parsing without exposing internal
+        infrastructure details.
     User note
         The returned dict is compatible with OpenAI Python SDK
     .. [1] OpenAI API reference: Chat completions object
            https://platform.openai.com/docs/api-reference/chat/object
     """
+    # Derive a deterministic, URL-safe fingerprint from the model ID.
+    _model_slug = (
+        model_id
+        .lower()
+        .replace("/", "-")
+        .replace(".", "-")
+        .replace("_", "-")
+    )
     return {
         "id": f"chatcmpl-{uuid.uuid4().hex}",
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model_id,
+        "system_fingerprint": f"fp-{_model_slug}",
         "choices": [
             {
                 "index": 0,
     message: str,
     history: list,
     max_new_tokens: int,
+    temperature: float,
+    top_p: float,
 ) -> str:
     """
     Gradio ``ChatInterface`` event handler.
     max_new_tokens : int
         Maximum tokens to generate, sourced from the UI slider.
+    temperature : float
+        Sampling temperature sourced from the UI slider.
+    top_p : float
+        Nucleus sampling cutoff sourced from the UI slider.
     Returns
     -------
     str
         If ``message`` is empty after stripping.
     RuntimeError
+        Propagated from ``_generate`` on inference failure or empty
+        model output.
     Notes
     -----
     logger.info(
         "Gradio inference | "
         "history_turns=%d | "
+        "max_new_tokens=%d | "
+        "temperature=%.2f | "
+        "top_p=%.2f",
         len(messages) - 1,
         max_new_tokens,
+        temperature,
+        top_p,
     )
     return _generate(
         messages,
         max_new_tokens,
+        temperature,
+        top_p,
     )
 # ─────────────────────────────────────────────────────────────────────────────
 # Gradio UI
 # ─────────────────────────────────────────────────────────────────────────────
+# v2.x: Gradio is the ASGI ROOT — not a child sub-app mounted on FastAPI.
 # This is required for ZeroGPU to activate on HuggingFace Spaces.
 #
+# The Gradio UI is served at / (root).
 # Custom REST routes are added to Gradio's internal FastAPI instance below.
 _UI_WARNING = """\
                 step=1,
                 label="max_tokens",
                 info=(
+                    f"Range: {_MAX_NEW_TOKENS_FLOOR}–{_MAX_NEW_TOKENS_CEIL}. "
                     f"Default: {_MAX_NEW_TOKENS_DEFAULT}."
                 ),
             ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=2.0,
+                value=_DEFAULT_TEMPERATURE,
+                step=0.05,
+                label="temperature",
+                info="0.0 = greedy, 0.7 = default, 2.0 = very random.",
+            ),
+            gr.Slider(
+                minimum=0.01,
+                maximum=1.0,
+                value=_DEFAULT_TOP_P,
+                step=0.01,
+                label="top_p",
+                info="Nucleus sampling cutoff. 1.0 = disabled.",
+            ),
         ],
         additional_inputs_accordion="Generation settings",
     )
 # ─────────────────────────────────────────────────────────────────────────────
 # App assembly — HuggingFace Space export
 # ─────────────────────────────────────────────────────────────────────────────
+# v2.x ARCHITECTURE:
 #
 #   app = _GradioApp.create_app(_gradio_ui)   ← Gradio is ASGI root
 #   @app.get/post(...)                        ← routes on Gradio's FastAPI
 #   ZeroGPU activates correctly.
     allow_headers=[
         "Content-Type",
     ],
+    allow_credentials=False,  # This Space does not use credential-bearing requests.
 )
 logger.info(
     Examples
     --------
     >>> # curl http://localhost:7860/health
+    ... # {"status": "ok", "model": "...", "version": "2.1.0"}
     """
     logger.info("GET /health")
         content={
             "status": "ok",
             "model": MODEL_ID,
+            "version": "2.1.0",
         },
         status_code=200,
     )
     Returns
     -------
     JSONResponse
+        HTTP 200 with an OpenAI-compatible completion payload on success.
         HTTP 413 if the body exceeds ``MAX_BODY_BYTES``.
+        HTTP 400 if the body is not valid UTF-8 JSON, or if
+        ``messages``, ``max_tokens``, ``temperature``, or ``top_p``
+        fail validation.
+        HTTP 500 on inference failure or unexpected server error.
     Notes
     -----
         1. Read and bound-check raw body bytes (413 guard).
         2. Decode and parse JSON (400 guard).
+        3. Extract ``messages``, ``max_tokens``, ``temperature``,
+           ``top_p``, and ``model`` fields.
+        4. Validate with field-specific validators (400 guard).
         5. Count prompt tokens on CPU (no GPU needed).
         6. Dispatch to ``_generate_async`` which offloads to
            ``@spaces.GPU`` via ``asyncio.to_thread``.
         * ``RuntimeError`` → 500 (wrapped inference failure from ``_generate``)
         * ``Exception``   → 500 (unexpected catch-all, never leaks internals)
+        The requested ``model`` field is logged for proxy-routing
+        diagnostics but does not affect which model is used; this Space
+        always serves ``MODEL_ID``.
     User note
         Compatible with the OpenAI Python SDK:
             response = client.chat.completions.create(
                 model="any",
                 messages=[{"role": "user", "content": "Hello"}],
+                temperature=0.7,
+                top_p=1.0,
             )
     """
     request_id = uuid.uuid4().hex
         "max_tokens",
         _MAX_NEW_TOKENS_DEFAULT,
     )
+    temperature_raw: object = payload.get(
+        "temperature",
+        _DEFAULT_TEMPERATURE,
+    )
+    top_p_raw: object = payload.get(
+        "top_p",
+        _DEFAULT_TOP_P,
+    )
+    # Log requested model for proxy-routing diagnostics only.
+    # This Space always serves MODEL_ID regardless of the field value.
+    model_requested: object = payload.get("model", MODEL_ID)
     # ── 4. Input validation ───────────────────────────────────────────────────
     try:
         messages = _validate_messages(messages_raw)
         max_new_tokens = _clamp_max_tokens(max_tokens_raw)
+        temperature = _validate_temperature(temperature_raw)
+        top_p = _validate_top_p(top_p_raw)
     except ValueError as exc:
         logger.warning(
             "Validation error | request_id=%s | error=%s",
     logger.info(
         "Dispatching inference | "
         "request_id=%s | "
+        "model_requested=%s | "
         "messages=%d | "
+        "max_new_tokens=%d | "
+        "temperature=%.2f | "
+        "top_p=%.2f",
         request_id,
+        model_requested,
         len(messages),
         max_new_tokens,
+        temperature,
+        top_p,
     )
     # ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
         content = await _generate_async(
             messages,
             max_new_tokens,
+            temperature,
+            top_p,
         )
     except ValueError as exc:
 logger.info(
     "scikit-plots ai-model Space initialized successfully.\n"
+    "  version   : 2.1.0\n"
     "  model     : %s\n"
     "  CORS      : %s\n"
     "  max_body  : %s bytes\n"