Spaces:
Running on Zero
Fix SDK worker_init CUDA poisoning: reset immediately instead of 300s cooldown
Browse files

When ZeroGPU SDK's worker_init fails (quota exhaustion causes torch._C._cuda_init()
to fail), it poisons torch.cuda._initialized at the process level. The SDK wraps
this as gradio.Error(title="ZeroGPU worker error") with the original CUDA message
stripped, bypassing our pattern matching. Now detect these SDK worker errors via
e.title and reset CUDA state immediately — no lock is held and no CUDA context is
active at worker_init time, so other users can retry GPU fresh without waiting 300s.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- src/core/zero_gpu.py +22 -0
src/core/zero_gpu.py
CHANGED
|
@@ -366,6 +366,13 @@ def gpu_with_fallback(duration=60):
|
|
| 366 |
err_lower = str(e).lower()
|
| 367 |
is_cuda_error = any(p in err_lower for p in _CUDA_ERROR_PATTERNS)
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
if is_cuda_error:
|
| 370 |
print(f"[GPU] CUDA error, falling back to CPU: {e}")
|
| 371 |
_mark_cuda_unhealthy()
|
|
@@ -377,6 +384,21 @@ def gpu_with_fallback(duration=60):
|
|
| 377 |
pass
|
| 378 |
return func(*args, **kwargs)
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
is_timeout = (
|
| 381 |
'timeout' in err_lower
|
| 382 |
or 'duration' in err_lower
|
|
|
|
| 366 |
err_lower = str(e).lower()
|
| 367 |
is_cuda_error = any(p in err_lower for p in _CUDA_ERROR_PATTERNS)
|
| 368 |
|
| 369 |
+
# SDK wraps worker_init failures as gradio.Error(title="ZeroGPU worker error")
|
| 370 |
+
# with message = just the exception class name. Original CUDA message is lost.
|
| 371 |
+
is_sdk_worker_error = False
|
| 372 |
+
if not is_cuda_error:
|
| 373 |
+
err_title = getattr(e, 'title', '') or ''
|
| 374 |
+
is_sdk_worker_error = 'worker' in err_title.lower() and 'error' in err_title.lower()
|
| 375 |
+
|
| 376 |
if is_cuda_error:
|
| 377 |
print(f"[GPU] CUDA error, falling back to CPU: {e}")
|
| 378 |
_mark_cuda_unhealthy()
|
|
|
|
| 384 |
pass
|
| 385 |
return func(*args, **kwargs)
|
| 386 |
|
| 387 |
+
if is_sdk_worker_error:
|
| 388 |
+
# worker_init failed (torch._C._cuda_init() poisoned the process).
|
| 389 |
+
# Reset immediately — no lock is held, no CUDA context is active.
|
| 390 |
+
# Other users can retry GPU fresh; their worker_init gets a new GPU.
|
| 391 |
+
# Do NOT call _mark_cuda_unhealthy() — that blocks ALL users for 300s.
|
| 392 |
+
print(f"[GPU] SDK worker error, resetting CUDA state: {e}")
|
| 393 |
+
_try_reset_cuda_state()
|
| 394 |
+
_request_state.gpu_quota_exhausted = True
|
| 395 |
+
try:
|
| 396 |
+
import gradio as gr
|
| 397 |
+
gr.Warning("GPU temporarily unavailable — using CPU (slower).")
|
| 398 |
+
except Exception:
|
| 399 |
+
pass
|
| 400 |
+
return func(*args, **kwargs)
|
| 401 |
+
|
| 402 |
is_timeout = (
|
| 403 |
'timeout' in err_lower
|
| 404 |
or 'duration' in err_lower
|