Spaces:
Running on Zero
Fix race condition where concurrent quota exhaustion permanently breaks GPU
GPU quota state was a process-global variable shared across request threads.
When an unlogged user exhausted ZeroGPU quota, a concurrent request's
reset_quota_flag() could clear the flag before the CPU fallback path checked
it, causing model.to("cuda") outside a GPU context — permanently poisoning
CUDA init for all users until space restart.
Replace globals with threading.local() for per-request isolation and add
RuntimeError safety net in ensure_models_on_gpu to prevent CUDA init from
ever escaping. Also add row_group_size to parquet writes.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- src/core/usage_logger.py +6 -1
- src/core/zero_gpu.py +20 -24
- src/segmenter/segmenter_model.py +18 -13
src/core/usage_logger.py
CHANGED
|
@@ -183,7 +183,12 @@ if _HAS_DEPS:
|
|
| 183 |
try:
|
| 184 |
import tempfile
|
| 185 |
archive = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
|
| 186 |
-
pq.write_table(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
self.api.upload_file(
|
| 188 |
repo_id=self.repo_id,
|
| 189 |
repo_type=self.repo_type,
|
|
|
|
| 183 |
try:
|
| 184 |
import tempfile
|
| 185 |
archive = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
|
| 186 |
+
pq.write_table(
|
| 187 |
+
table,
|
| 188 |
+
archive.name,
|
| 189 |
+
row_group_size=1,
|
| 190 |
+
write_page_index=True,
|
| 191 |
+
)
|
| 192 |
self.api.upload_file(
|
| 193 |
repo_id=self.repo_id,
|
| 194 |
repo_type=self.repo_type,
|
src/core/zero_gpu.py
CHANGED
|
@@ -4,6 +4,7 @@ local or non-ZeroGPU environments.
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import re
|
|
|
|
| 7 |
from typing import Callable, TypeVar
|
| 8 |
from functools import wraps
|
| 9 |
|
|
@@ -12,10 +13,8 @@ T = TypeVar("T", bound=Callable)
|
|
| 12 |
# Default values in case the spaces package is unavailable (e.g., local runs).
|
| 13 |
ZERO_GPU_AVAILABLE = False
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
_quota_reset_time = None # e.g. "13:53:59"
|
| 18 |
-
_user_forced_cpu = False
|
| 19 |
|
| 20 |
try:
|
| 21 |
import spaces # type: ignore
|
|
@@ -39,32 +38,30 @@ except Exception:
|
|
| 39 |
|
| 40 |
|
| 41 |
def is_quota_exhausted() -> bool:
|
| 42 |
-
"""Check if GPU quota has been exhausted this
|
| 43 |
-
return
|
| 44 |
|
| 45 |
|
| 46 |
def is_user_forced_cpu() -> bool:
|
| 47 |
-
"""Check if the user manually selected CPU mode."""
|
| 48 |
-
return
|
| 49 |
|
| 50 |
|
| 51 |
def get_quota_reset_time() -> str | None:
|
| 52 |
"""Return the quota reset time string (e.g. '13:53:59'), or None."""
|
| 53 |
-
return
|
| 54 |
|
| 55 |
|
| 56 |
def reset_quota_flag():
|
| 57 |
-
"""Reset the quota exhausted flag
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
_user_forced_cpu = False
|
| 62 |
|
| 63 |
|
| 64 |
def force_cpu_mode():
|
| 65 |
-
"""Force
|
| 66 |
-
|
| 67 |
-
_user_forced_cpu = True
|
| 68 |
_move_models_to_cpu()
|
| 69 |
|
| 70 |
|
|
@@ -100,15 +97,13 @@ def gpu_with_fallback(duration=60):
|
|
| 100 |
|
| 101 |
@wraps(func)
|
| 102 |
def wrapper(*args, **kwargs):
|
| 103 |
-
global _gpu_quota_exhausted, _quota_reset_time
|
| 104 |
-
|
| 105 |
# If user explicitly chose CPU mode, skip GPU entirely
|
| 106 |
-
if
|
| 107 |
print("[CPU] User selected CPU mode")
|
| 108 |
return func(*args, **kwargs)
|
| 109 |
|
| 110 |
# If quota already exhausted, go straight to CPU
|
| 111 |
-
if
|
| 112 |
print("[GPU] Quota exhausted, using CPU fallback")
|
| 113 |
_move_models_to_cpu()
|
| 114 |
return func(*args, **kwargs)
|
|
@@ -124,15 +119,16 @@ def gpu_with_fallback(duration=60):
|
|
| 124 |
|
| 125 |
if is_quota_error:
|
| 126 |
print(f"[GPU] Quota exceeded, falling back to CPU: {e}")
|
| 127 |
-
|
| 128 |
# Parse reset time from message like "Try again in 13:53:59"
|
| 129 |
match = re.search(r'Try again in (\d+:\d{2}:\d{2})', str(e))
|
| 130 |
if match:
|
| 131 |
-
|
| 132 |
# Show immediate toast notification
|
| 133 |
try:
|
| 134 |
import gradio as gr
|
| 135 |
-
|
|
|
|
| 136 |
gr.Warning(f"GPU quota reached — switching to CPU (slower).{reset_msg}")
|
| 137 |
except Exception:
|
| 138 |
pass # Not in a Gradio context (e.g., CLI usage)
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import re
|
| 7 |
+
import threading
|
| 8 |
from typing import Callable, TypeVar
|
| 9 |
from functools import wraps
|
| 10 |
|
|
|
|
| 13 |
# Default values in case the spaces package is unavailable (e.g., local runs).
|
| 14 |
ZERO_GPU_AVAILABLE = False
|
| 15 |
|
| 16 |
+
# Per-thread (per-request) GPU state so concurrent requests don't interfere
|
| 17 |
+
_request_state = threading.local()
|
|
|
|
|
|
|
| 18 |
|
| 19 |
try:
|
| 20 |
import spaces # type: ignore
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def is_quota_exhausted() -> bool:
|
| 41 |
+
"""Check if GPU quota has been exhausted for this request's thread."""
|
| 42 |
+
return getattr(_request_state, 'gpu_quota_exhausted', False)
|
| 43 |
|
| 44 |
|
| 45 |
def is_user_forced_cpu() -> bool:
|
| 46 |
+
"""Check if the user manually selected CPU mode for this request."""
|
| 47 |
+
return getattr(_request_state, 'user_forced_cpu', False)
|
| 48 |
|
| 49 |
|
| 50 |
def get_quota_reset_time() -> str | None:
|
| 51 |
"""Return the quota reset time string (e.g. '13:53:59'), or None."""
|
| 52 |
+
return getattr(_request_state, 'quota_reset_time', None)
|
| 53 |
|
| 54 |
|
| 55 |
def reset_quota_flag():
|
| 56 |
+
"""Reset the quota exhausted flag for this request's thread."""
|
| 57 |
+
_request_state.gpu_quota_exhausted = False
|
| 58 |
+
_request_state.quota_reset_time = None
|
| 59 |
+
_request_state.user_forced_cpu = False
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def force_cpu_mode():
|
| 63 |
+
"""Force GPU-decorated functions to skip GPU and run on CPU for this request."""
|
| 64 |
+
_request_state.user_forced_cpu = True
|
|
|
|
| 65 |
_move_models_to_cpu()
|
| 66 |
|
| 67 |
|
|
|
|
| 97 |
|
| 98 |
@wraps(func)
|
| 99 |
def wrapper(*args, **kwargs):
|
|
|
|
|
|
|
| 100 |
# If user explicitly chose CPU mode, skip GPU entirely
|
| 101 |
+
if is_user_forced_cpu():
|
| 102 |
print("[CPU] User selected CPU mode")
|
| 103 |
return func(*args, **kwargs)
|
| 104 |
|
| 105 |
# If quota already exhausted, go straight to CPU
|
| 106 |
+
if is_quota_exhausted():
|
| 107 |
print("[GPU] Quota exhausted, using CPU fallback")
|
| 108 |
_move_models_to_cpu()
|
| 109 |
return func(*args, **kwargs)
|
|
|
|
| 119 |
|
| 120 |
if is_quota_error:
|
| 121 |
print(f"[GPU] Quota exceeded, falling back to CPU: {e}")
|
| 122 |
+
_request_state.gpu_quota_exhausted = True
|
| 123 |
# Parse reset time from message like "Try again in 13:53:59"
|
| 124 |
match = re.search(r'Try again in (\d+:\d{2}:\d{2})', str(e))
|
| 125 |
if match:
|
| 126 |
+
_request_state.quota_reset_time = match.group(1)
|
| 127 |
# Show immediate toast notification
|
| 128 |
try:
|
| 129 |
import gradio as gr
|
| 130 |
+
reset_time = get_quota_reset_time()
|
| 131 |
+
reset_msg = f" Resets in {reset_time}." if reset_time else ""
|
| 132 |
gr.Warning(f"GPU quota reached — switching to CPU (slower).{reset_msg}")
|
| 133 |
except Exception:
|
| 134 |
pass # Not in a Gradio context (e.g., CLI usage)
|
src/segmenter/segmenter_model.py
CHANGED
|
@@ -74,19 +74,24 @@ def ensure_models_on_gpu(asr_model_name=None):
|
|
| 74 |
dtype = _TORCH_DTYPE
|
| 75 |
move_start = time.time()
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
return time.time() - move_start
|
| 92 |
|
|
|
|
| 74 |
dtype = _TORCH_DTYPE
|
| 75 |
move_start = time.time()
|
| 76 |
|
| 77 |
+
try:
|
| 78 |
+
# Move segmenter to GPU
|
| 79 |
+
if _segmenter_cache["loaded"] and _segmenter_cache["model"] is not None:
|
| 80 |
+
model = _segmenter_cache["model"]
|
| 81 |
+
if next(model.parameters()).device.type != "cuda":
|
| 82 |
+
print("[GPU] Moving segmenter to CUDA...")
|
| 83 |
+
model.to(device, dtype=dtype)
|
| 84 |
+
_segmenter_cache["model"] = model
|
| 85 |
+
_segmenter_cache["device"] = "cuda"
|
| 86 |
+
print("[GPU] Segmenter on CUDA")
|
| 87 |
+
|
| 88 |
+
# Move phoneme ASR to GPU (only the requested model)
|
| 89 |
+
if asr_model_name is not None:
|
| 90 |
+
move_phoneme_asr_to_gpu(asr_model_name)
|
| 91 |
+
except RuntimeError as e:
|
| 92 |
+
# Prevent CUDA init outside GPU context from poisoning the process
|
| 93 |
+
print(f"[GPU] CUDA move failed, staying on CPU: {e}")
|
| 94 |
+
return 0.0
|
| 95 |
|
| 96 |
return time.time() - move_start
|
| 97 |
|