hetchyy Claude Opus 4.6 committed on
Commit
6984b50
·
1 Parent(s): a6f747e

Fix SDK worker_init CUDA poisoning: reset immediately instead of 300s cooldown

Browse files

When ZeroGPU SDK's worker_init fails (quota exhaustion causes torch._C._cuda_init()
to fail), it poisons torch.cuda._initialized at the process level. The SDK wraps
this as gradio.Error(title="ZeroGPU worker error") with the original CUDA message
stripped, bypassing our pattern matching. Now detect these SDK worker errors via
e.title and reset CUDA state immediately — no lock is held and no CUDA context is
active at worker_init time, so other users can retry GPU fresh without waiting 300s.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/core/zero_gpu.py +22 -0
src/core/zero_gpu.py CHANGED
@@ -366,6 +366,13 @@ def gpu_with_fallback(duration=60):
366
  err_lower = str(e).lower()
367
  is_cuda_error = any(p in err_lower for p in _CUDA_ERROR_PATTERNS)
368
 
 
 
 
 
 
 
 
369
  if is_cuda_error:
370
  print(f"[GPU] CUDA error, falling back to CPU: {e}")
371
  _mark_cuda_unhealthy()
@@ -377,6 +384,21 @@ def gpu_with_fallback(duration=60):
377
  pass
378
  return func(*args, **kwargs)
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  is_timeout = (
381
  'timeout' in err_lower
382
  or 'duration' in err_lower
 
366
  err_lower = str(e).lower()
367
  is_cuda_error = any(p in err_lower for p in _CUDA_ERROR_PATTERNS)
368
 
369
+ # SDK wraps worker_init failures as gradio.Error(title="ZeroGPU worker error")
370
+ # with message = just the exception class name. Original CUDA message is lost.
371
+ is_sdk_worker_error = False
372
+ if not is_cuda_error:
373
+ err_title = getattr(e, 'title', '') or ''
374
+ is_sdk_worker_error = 'worker' in err_title.lower() and 'error' in err_title.lower()
375
+
376
  if is_cuda_error:
377
  print(f"[GPU] CUDA error, falling back to CPU: {e}")
378
  _mark_cuda_unhealthy()
 
384
  pass
385
  return func(*args, **kwargs)
386
 
387
+ if is_sdk_worker_error:
388
+ # worker_init failed (torch._C._cuda_init() poisoned the process).
389
+ # Reset immediately — no lock is held, no CUDA context is active.
390
+ # Other users can retry GPU fresh; their worker_init gets a new GPU.
391
+ # Do NOT call _mark_cuda_unhealthy() — that blocks ALL users for 300s.
392
+ print(f"[GPU] SDK worker error, resetting CUDA state: {e}")
393
+ _try_reset_cuda_state()
394
+ _request_state.gpu_quota_exhausted = True
395
+ try:
396
+ import gradio as gr
397
+ gr.Warning("GPU temporarily unavailable — using CPU (slower).")
398
+ except Exception:
399
+ pass
400
+ return func(*args, **kwargs)
401
+
402
  is_timeout = (
403
  'timeout' in err_lower
404
  or 'duration' in err_lower