Spaces:
Running
Running
Upload 3 files
Browse files
app.py
CHANGED
|
@@ -73,7 +73,7 @@
|
|
| 73 |
# * Blocking asyncio event loop with synchronous inference.
|
| 74 |
# * Concurrent model.cuda() / model.cpu() without a lock.
|
| 75 |
#
|
| 76 |
-
# ASSEMBLY DIAGRAM (v2.
|
| 77 |
# ─────────────────────────
|
| 78 |
#
|
| 79 |
# HuggingFace Spaces
|
|
@@ -662,36 +662,124 @@ logger.info("Validation helpers initialized successfully.")
|
|
| 662 |
|
| 663 |
_MODEL_LOCK: Final[threading.Lock] = threading.Lock()
|
| 664 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
|
| 666 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 667 |
# Model loading
|
| 668 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 669 |
-
#
|
| 670 |
-
#
|
|
|
|
|
|
|
|
|
|
| 671 |
# Never call .to("cuda") or device_map="auto" at module level —
|
| 672 |
# CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
|
| 673 |
|
| 674 |
-
|
|
|
|
| 675 |
|
| 676 |
-
_tokenizer = AutoTokenizer.from_pretrained(
|
| 677 |
-
MODEL_ID,
|
| 678 |
-
)
|
| 679 |
|
| 680 |
-
|
|
|
|
|
|
|
| 681 |
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
|
| 694 |
-
logger.info("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
|
| 697 |
# ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -751,6 +839,12 @@ def _generate(
|
|
| 751 |
Notes
|
| 752 |
-----
|
| 753 |
Developer note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
GPU is acquired automatically by ``@spaces.GPU``.
|
| 755 |
|
| 756 |
``_MODEL_LOCK`` is held for the entire inference duration
|
|
@@ -783,6 +877,16 @@ def _generate(
|
|
| 783 |
Do not call this function directly from async code.
|
| 784 |
Use ``_generate_async`` from FastAPI routes.
|
| 785 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
validated_messages = _validate_messages(messages)
|
| 787 |
max_new_tokens = _clamp_max_tokens(max_new_tokens)
|
| 788 |
|
|
@@ -936,6 +1040,12 @@ async def _generate_async(
|
|
| 936 |
Notes
|
| 937 |
-----
|
| 938 |
Developer note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
Offloads the synchronous ``_generate`` call to a thread via
|
| 940 |
``asyncio.to_thread`` so the asyncio event loop is not blocked
|
| 941 |
during GPU inference.
|
|
@@ -943,6 +1053,11 @@ async def _generate_async(
|
|
| 943 |
Must NOT be called from Gradio event handlers — use ``_generate``
|
| 944 |
directly from Gradio since it runs in its own thread pool.
|
| 945 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
return await asyncio.to_thread(
|
| 947 |
_generate,
|
| 948 |
messages,
|
|
@@ -1154,9 +1269,9 @@ def _build_completion_response(
|
|
| 1154 |
current ``_generate`` implementation does not expose partial
|
| 1155 |
stop conditions. Extend this if streaming or early stopping
|
| 1156 |
is added.
|
| 1157 |
-
``system_fingerprint``
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
|
| 1161 |
User note
|
| 1162 |
The returned dict is compatible with OpenAI Python SDK
|
|
@@ -1167,21 +1282,12 @@ def _build_completion_response(
|
|
| 1167 |
.. [1] OpenAI API reference: Chat completions object
|
| 1168 |
https://platform.openai.com/docs/api-reference/chat/object
|
| 1169 |
"""
|
| 1170 |
-
# Derive a deterministic, URL-safe fingerprint from the model ID.
|
| 1171 |
-
_model_slug = (
|
| 1172 |
-
model_id
|
| 1173 |
-
.lower()
|
| 1174 |
-
.replace("/", "-")
|
| 1175 |
-
.replace(".", "-")
|
| 1176 |
-
.replace("_", "-")
|
| 1177 |
-
)
|
| 1178 |
-
|
| 1179 |
return {
|
| 1180 |
"id": f"chatcmpl-{uuid.uuid4().hex}",
|
| 1181 |
"object": "chat.completion",
|
| 1182 |
"created": int(time.time()),
|
| 1183 |
"model": model_id,
|
| 1184 |
-
"system_fingerprint":
|
| 1185 |
"choices": [
|
| 1186 |
{
|
| 1187 |
"index": 0,
|
|
@@ -1403,6 +1509,12 @@ def _gradio_respond(
|
|
| 1403 |
Notes
|
| 1404 |
-----
|
| 1405 |
Developer note
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1406 |
Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
|
| 1407 |
Must NOT call ``_generate_async`` (async) because Gradio
|
| 1408 |
dispatches event handlers via its own thread pool, completely
|
|
@@ -1420,6 +1532,11 @@ def _gradio_respond(
|
|
| 1420 |
"Message must be a non-empty string."
|
| 1421 |
)
|
| 1422 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1423 |
messages = _normalize_gradio_history(history)
|
| 1424 |
|
| 1425 |
messages.append(
|
|
@@ -1588,7 +1705,7 @@ async def health() -> JSONResponse:
|
|
| 1588 |
Examples
|
| 1589 |
--------
|
| 1590 |
>>> # curl http://localhost:7860/health
|
| 1591 |
-
... # {"status": "ok", "model": "...", "version": "2.
|
| 1592 |
"""
|
| 1593 |
logger.info("GET /health")
|
| 1594 |
|
|
@@ -1596,7 +1713,8 @@ async def health() -> JSONResponse:
|
|
| 1596 |
content={
|
| 1597 |
"status": "ok",
|
| 1598 |
"model": MODEL_ID,
|
| 1599 |
-
"version":
|
|
|
|
| 1600 |
},
|
| 1601 |
status_code=200,
|
| 1602 |
)
|
|
@@ -1642,6 +1760,8 @@ async def chat_completions( # noqa: PLR0911
|
|
| 1642 |
3. Extract ``messages``, ``max_tokens``, ``temperature``,
|
| 1643 |
``top_p``, and ``model`` fields.
|
| 1644 |
4. Validate with field-specific validators (400 guard).
|
|
|
|
|
|
|
| 1645 |
5. Count prompt tokens on CPU (no GPU needed).
|
| 1646 |
6. Dispatch to ``_generate_async`` which offloads to
|
| 1647 |
``@spaces.GPU`` via ``asyncio.to_thread``.
|
|
@@ -1771,6 +1891,29 @@ async def chat_completions( # noqa: PLR0911
|
|
| 1771 |
top_p,
|
| 1772 |
)
|
| 1773 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1774 |
# ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
|
| 1775 |
|
| 1776 |
prompt_tokens: int = _count_prompt_tokens(messages)
|
|
@@ -1866,13 +2009,14 @@ logger.info(
|
|
| 1866 |
|
| 1867 |
logger.info(
|
| 1868 |
"scikit-plots ai-model Space initialized successfully.\n"
|
| 1869 |
-
" version :
|
| 1870 |
" model : %s\n"
|
| 1871 |
" CORS : %s\n"
|
| 1872 |
" max_body : %s bytes\n"
|
| 1873 |
" ASGI root : Gradio (ZeroGPU-compatible)\n"
|
| 1874 |
" routes : GET /health | POST /v1/chat/completions\n"
|
| 1875 |
" test UI : / (root, developer only)",
|
|
|
|
| 1876 |
MODEL_ID,
|
| 1877 |
CORS_ORIGINS,
|
| 1878 |
MAX_BODY_BYTES,
|
|
|
|
| 73 |
# * Blocking asyncio event loop with synchronous inference.
|
| 74 |
# * Concurrent model.cuda() / model.cpu() without a lock.
|
| 75 |
#
|
| 76 |
+
# ASSEMBLY DIAGRAM (v2.2.0)
|
| 77 |
# ─────────────────────────
|
| 78 |
#
|
| 79 |
# HuggingFace Spaces
|
|
|
|
| 662 |
|
| 663 |
_MODEL_LOCK: Final[threading.Lock] = threading.Lock()
|
| 664 |
|
| 665 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 666 |
+
# Initialization lock and readiness event
|
| 667 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 668 |
+
# _INIT_LOCK — guards the one-time model initialisation inside
|
| 669 |
+
# _ensure_model_loaded(). Held only during CPU-side
|
| 670 |
+
# loading, never inside @spaces.GPU. Separate from
|
| 671 |
+
# _MODEL_LOCK which serialises GPU device transitions.
|
| 672 |
+
# The two locks have strictly disjoint scopes and are
|
| 673 |
+
# never held simultaneously: no deadlock risk.
|
| 674 |
+
#
|
| 675 |
+
# _model_is_loaded — threading.Event set exactly once after a successful
|
| 676 |
+
# load. Provides a lock-free fast path on every
|
| 677 |
+
# subsequent call to _ensure_model_loaded() and
|
| 678 |
+
# exposes model readiness in /health.
|
| 679 |
+
|
| 680 |
+
_INIT_LOCK: Final[threading.Lock] = threading.Lock()
|
| 681 |
+
_model_is_loaded: Final[threading.Event] = threading.Event()
|
| 682 |
+
|
| 683 |
|
| 684 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 685 |
# Model loading
|
| 686 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 687 |
+
# Both are None at module import; loaded exactly once on the first
|
| 688 |
+
# inference request via _ensure_model_loaded(). This prevents OOM when
|
| 689 |
+
# Gradio 6.x or ZeroGPU session management spawns a secondary Python
|
| 690 |
+
# worker process: a process that receives no inference request never loads
|
| 691 |
+
# the model and therefore stays well within the 16 GB RAM hard limit.
|
| 692 |
# Never call .to("cuda") or device_map="auto" at module level —
|
| 693 |
# CUDA is not available outside @spaces.GPU on ZeroGPU Spaces.
|
| 694 |
|
| 695 |
+
_tokenizer: AutoTokenizer | None = None
|
| 696 |
+
_model: AutoModelForCausalLM | None = None
|
| 697 |
|
|
|
|
|
|
|
|
|
|
| 698 |
|
| 699 |
+
def _ensure_model_loaded() -> None:
|
| 700 |
+
"""
|
| 701 |
+
Load tokenizer and model exactly once; no-op on subsequent calls.
|
| 702 |
|
| 703 |
+
Uses double-checked locking (``_INIT_LOCK``) to guarantee that
|
| 704 |
+
tokenizer and model loading occur at most once across all threads in
|
| 705 |
+
the process. After the first successful load, all subsequent calls
|
| 706 |
+
return immediately via a lock-free check on ``_model_is_loaded``.
|
| 707 |
|
| 708 |
+
Returns
|
| 709 |
+
-------
|
| 710 |
+
None
|
| 711 |
+
|
| 712 |
+
Raises
|
| 713 |
+
------
|
| 714 |
+
RuntimeError
|
| 715 |
+
If ``AutoTokenizer.from_pretrained`` or
|
| 716 |
+
``AutoModelForCausalLM.from_pretrained`` raises. If the
|
| 717 |
+
tokenizer loads but the model fails, ``_model_is_loaded`` is
|
| 718 |
+
never set so the next call retries the full sequence from
|
| 719 |
+
the tokenizer step.
|
| 720 |
+
|
| 721 |
+
Notes
|
| 722 |
+
-----
|
| 723 |
+
Developer note
|
| 724 |
+
Must be called by callers **before** the ``@spaces.GPU`` scope
|
| 725 |
+
so that model loading (a CPU-only operation) does not consume
|
| 726 |
+
ZeroGPU GPU quota.
|
| 727 |
+
|
| 728 |
+
From sync callers (e.g. ``_gradio_respond``): call directly.
|
| 729 |
+
|
| 730 |
+
From async callers (e.g. ``_generate_async``,
|
| 731 |
+
``chat_completions``): call via
|
| 732 |
+
``await asyncio.to_thread(_ensure_model_loaded)`` to prevent
|
| 733 |
+
blocking the asyncio event loop during the first load.
|
| 734 |
+
|
| 735 |
+
Lock scope: ``_INIT_LOCK`` is held only during CPU-side loading,
|
| 736 |
+
never inside ``@spaces.GPU``. ``_MODEL_LOCK`` serialises GPU
|
| 737 |
+
device transitions inside ``_generate``. The two locks have
|
| 738 |
+
strictly disjoint scopes — no deadlock risk.
|
| 739 |
+
|
| 740 |
+
User note
|
| 741 |
+
The first inference request after a cold start may take
|
| 742 |
+
30–120 seconds while the model downloads and loads to CPU.
|
| 743 |
+
Subsequent requests within the same active session complete in
|
| 744 |
+
seconds.
|
| 745 |
+
"""
|
| 746 |
+
# Fast path β lock-free check on the threading.Event.
|
| 747 |
+
if _model_is_loaded.is_set():
|
| 748 |
+
return
|
| 749 |
+
|
| 750 |
+
with _INIT_LOCK:
|
| 751 |
+
# Double-checked locking: re-test inside the mutex in case
|
| 752 |
+
# another thread completed loading between the fast-path check
|
| 753 |
+
# above and lock acquisition.
|
| 754 |
+
if _model_is_loaded.is_set():
|
| 755 |
+
return
|
| 756 |
+
|
| 757 |
+
global _tokenizer, _model # noqa: PLW0603
|
| 758 |
|
| 759 |
+
logger.info("Loading tokenizer for MODEL_ID=%s", MODEL_ID)
|
| 760 |
+
|
| 761 |
+
_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 762 |
+
|
| 763 |
+
logger.info("Tokenizer loaded successfully.")
|
| 764 |
+
|
| 765 |
+
logger.info(
|
| 766 |
+
"Loading model on CPU "
|
| 767 |
+
"(low_cpu_mem_usage=True, torch_dtype=bfloat16)..."
|
| 768 |
+
)
|
| 769 |
+
|
| 770 |
+
_model = AutoModelForCausalLM.from_pretrained(
|
| 771 |
+
MODEL_ID,
|
| 772 |
+
torch_dtype=torch.bfloat16,
|
| 773 |
+
low_cpu_mem_usage=True,
|
| 774 |
+
device_map="cpu",
|
| 775 |
+
)
|
| 776 |
+
|
| 777 |
+
logger.info("Model loaded on CPU successfully.")
|
| 778 |
+
|
| 779 |
+
# Set the event last, only after both loads succeed.
|
| 780 |
+
# Any exception above leaves _model_is_loaded unset so the
|
| 781 |
+
# next request retries the full load sequence.
|
| 782 |
+
_model_is_loaded.set()
|
| 783 |
|
| 784 |
|
| 785 |
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 839 |
Notes
|
| 840 |
-----
|
| 841 |
Developer note
|
| 842 |
+
Callers must invoke ``_ensure_model_loaded()`` **before** the
|
| 843 |
+
``@spaces.GPU`` scope. A guard at the start of this function
|
| 844 |
+
raises ``RuntimeError`` immediately if ``_tokenizer`` or
|
| 845 |
+
``_model`` is ``None``, producing a clear programming-error
|
| 846 |
+
message instead of a cryptic ``AttributeError`` on ``None``.
|
| 847 |
+
|
| 848 |
GPU is acquired automatically by ``@spaces.GPU``.
|
| 849 |
|
| 850 |
``_MODEL_LOCK`` is held for the entire inference duration
|
|
|
|
| 877 |
Do not call this function directly from async code.
|
| 878 |
Use ``_generate_async`` from FastAPI routes.
|
| 879 |
"""
|
| 880 |
+
# Guard: callers must invoke _ensure_model_loaded() before the
|
| 881 |
+
# @spaces.GPU scope. This check makes the contract explicit and
|
| 882 |
+
# produces a clear RuntimeError instead of an AttributeError on None.
|
| 883 |
+
if _tokenizer is None or _model is None:
|
| 884 |
+
raise RuntimeError(
|
| 885 |
+
"_ensure_model_loaded() must be called by the caller "
|
| 886 |
+
"before entering the @spaces.GPU scope. "
|
| 887 |
+
"This is a programming error, not a user error."
|
| 888 |
+
)
|
| 889 |
+
|
| 890 |
validated_messages = _validate_messages(messages)
|
| 891 |
max_new_tokens = _clamp_max_tokens(max_new_tokens)
|
| 892 |
|
|
|
|
| 1040 |
Notes
|
| 1041 |
-----
|
| 1042 |
Developer note
|
| 1043 |
+
Calls ``_ensure_model_loaded()`` via ``asyncio.to_thread``
|
| 1044 |
+
before dispatching ``_generate``, so the CPU-only model load
|
| 1045 |
+
does not block the asyncio event loop and does not consume
|
| 1046 |
+
ZeroGPU GPU quota. Subsequent calls hit the lock-free fast
|
| 1047 |
+
path (``_model_is_loaded.is_set()``) and return immediately.
|
| 1048 |
+
|
| 1049 |
Offloads the synchronous ``_generate`` call to a thread via
|
| 1050 |
``asyncio.to_thread`` so the asyncio event loop is not blocked
|
| 1051 |
during GPU inference.
|
|
|
|
| 1053 |
Must NOT be called from Gradio event handlers — use ``_generate``
|
| 1054 |
directly from Gradio since it runs in its own thread pool.
|
| 1055 |
"""
|
| 1056 |
+
# Load tokenizer and model on first call only (CPU-only operation).
|
| 1057 |
+
# Called before asyncio.to_thread(_generate) so loading completes
|
| 1058 |
+
# before @spaces.GPU activates — ZeroGPU GPU quota is not consumed.
|
| 1059 |
+
await asyncio.to_thread(_ensure_model_loaded)
|
| 1060 |
+
|
| 1061 |
return await asyncio.to_thread(
|
| 1062 |
_generate,
|
| 1063 |
messages,
|
|
|
|
| 1269 |
current ``_generate`` implementation does not expose partial
|
| 1270 |
stop conditions. Extend this if streaming or early stopping
|
| 1271 |
is added.
|
| 1272 |
+
``system_fingerprint`` uses the pre-computed module-level
|
| 1273 |
+
constant ``_SYSTEM_FINGERPRINT`` (derived from ``MODEL_ID`` at
|
| 1274 |
+
import time) to avoid repeated string transformation per call.
|
| 1275 |
|
| 1276 |
User note
|
| 1277 |
The returned dict is compatible with OpenAI Python SDK
|
|
|
|
| 1282 |
.. [1] OpenAI API reference: Chat completions object
|
| 1283 |
https://platform.openai.com/docs/api-reference/chat/object
|
| 1284 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1285 |
return {
|
| 1286 |
"id": f"chatcmpl-{uuid.uuid4().hex}",
|
| 1287 |
"object": "chat.completion",
|
| 1288 |
"created": int(time.time()),
|
| 1289 |
"model": model_id,
|
| 1290 |
+
"system_fingerprint": _SYSTEM_FINGERPRINT,
|
| 1291 |
"choices": [
|
| 1292 |
{
|
| 1293 |
"index": 0,
|
|
|
|
| 1509 |
Notes
|
| 1510 |
-----
|
| 1511 |
Developer note
|
| 1512 |
+
Calls ``_ensure_model_loaded()`` directly (synchronous) before
|
| 1513 |
+
``_generate`` so the CPU-only model load does not consume
|
| 1514 |
+
ZeroGPU GPU quota. This is correct: Gradio dispatches event
|
| 1515 |
+
handlers in its own thread pool, so calling a blocking function
|
| 1516 |
+
here does not block the asyncio event loop.
|
| 1517 |
+
|
| 1518 |
Calls ``_generate`` (sync, ``@spaces.GPU``) directly.
|
| 1519 |
Must NOT call ``_generate_async`` (async) because Gradio
|
| 1520 |
dispatches event handlers via its own thread pool, completely
|
|
|
|
| 1532 |
"Message must be a non-empty string."
|
| 1533 |
)
|
| 1534 |
|
| 1535 |
+
# Load tokenizer and model on first call only (CPU-only operation).
|
| 1536 |
+
# Called before _generate/@spaces.GPU so loading does not consume
|
| 1537 |
+
# ZeroGPU GPU quota. Gradio's thread pool makes this blocking call safe.
|
| 1538 |
+
_ensure_model_loaded()
|
| 1539 |
+
|
| 1540 |
messages = _normalize_gradio_history(history)
|
| 1541 |
|
| 1542 |
messages.append(
|
|
|
|
| 1705 |
Examples
|
| 1706 |
--------
|
| 1707 |
>>> # curl http://localhost:7860/health
|
| 1708 |
+
... # {"status": "ok", "model": "...", "version": "2.2.0", "model_ready": true}
|
| 1709 |
"""
|
| 1710 |
logger.info("GET /health")
|
| 1711 |
|
|
|
|
| 1713 |
content={
|
| 1714 |
"status": "ok",
|
| 1715 |
"model": MODEL_ID,
|
| 1716 |
+
"version": _VERSION,
|
| 1717 |
+
"model_ready": _model_is_loaded.is_set(),
|
| 1718 |
},
|
| 1719 |
status_code=200,
|
| 1720 |
)
|
|
|
|
| 1760 |
3. Extract ``messages``, ``max_tokens``, ``temperature``,
|
| 1761 |
``top_p``, and ``model`` fields.
|
| 1762 |
4. Validate with field-specific validators (400 guard).
|
| 1763 |
+
4b. Lazy model load — ``_ensure_model_loaded()`` via
|
| 1764 |
+
``asyncio.to_thread`` (500 on failure).
|
| 1765 |
5. Count prompt tokens on CPU (no GPU needed).
|
| 1766 |
6. Dispatch to ``_generate_async`` which offloads to
|
| 1767 |
``@spaces.GPU`` via ``asyncio.to_thread``.
|
|
|
|
| 1891 |
top_p,
|
| 1892 |
)
|
| 1893 |
|
| 1894 |
+
# ── 4b. Lazy model loading (CPU only, before GPU dispatch) ───────────────
|
| 1895 |
+
# _ensure_model_loaded() must complete before _count_prompt_tokens
|
| 1896 |
+
# (which needs _tokenizer) and before the @spaces.GPU scope inside
|
| 1897 |
+
# _generate. asyncio.to_thread prevents blocking the event loop on
|
| 1898 |
+
# the first load (which may download ~14 GB from HuggingFace).
|
| 1899 |
+
|
| 1900 |
+
try:
|
| 1901 |
+
await asyncio.to_thread(_ensure_model_loaded)
|
| 1902 |
+
except Exception: # noqa: BLE001
|
| 1903 |
+
logger.exception(
|
| 1904 |
+
"Model loading failed | request_id=%s",
|
| 1905 |
+
request_id,
|
| 1906 |
+
)
|
| 1907 |
+
return _error_response(
|
| 1908 |
+
message=(
|
| 1909 |
+
"Model loading failed. "
|
| 1910 |
+
"Please retry in a few minutes."
|
| 1911 |
+
),
|
| 1912 |
+
error_type="server_error",
|
| 1913 |
+
code="model_load_error",
|
| 1914 |
+
status_code=500,
|
| 1915 |
+
)
|
| 1916 |
+
|
| 1917 |
# ── 5. Prompt token count (CPU, pre-dispatch) ─────────────────────────────
|
| 1918 |
|
| 1919 |
prompt_tokens: int = _count_prompt_tokens(messages)
|
|
|
|
| 2009 |
|
| 2010 |
logger.info(
|
| 2011 |
"scikit-plots ai-model Space initialized successfully.\n"
|
| 2012 |
+
" version : %s\n"
|
| 2013 |
" model : %s\n"
|
| 2014 |
" CORS : %s\n"
|
| 2015 |
" max_body : %s bytes\n"
|
| 2016 |
" ASGI root : Gradio (ZeroGPU-compatible)\n"
|
| 2017 |
" routes : GET /health | POST /v1/chat/completions\n"
|
| 2018 |
" test UI : / (root, developer only)",
|
| 2019 |
+
_VERSION,
|
| 2020 |
MODEL_ID,
|
| 2021 |
CORS_ORIGINS,
|
| 2022 |
MAX_BODY_BYTES,
|