hetchyy Claude Opus 4.6 committed on
Commit
d2b3d8c
·
1 Parent(s): 10281c4

Catch SDK-wrapped GPU errors that bypass CUDA error detection

Browse files

The ZeroGPU SDK catches CUDA errors internally and re-raises its own
exception type (e.g. with the message "GPU task aborted"), which didn't
match our CUDA error patterns, so the bare `raise` propagated the error
to the UI. Now:
- Add "gpu task aborted" to CUDA error patterns
- Log exception type/message for all GPU errors (aids debugging)
- Replace bare `raise` with catch-all CPU fallback for unrecognized errors
- Only timeouts still re-raise; everything else falls back to CPU

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. .gitignore +1 -0
  2. src/core/zero_gpu.py +14 -1
.gitignore CHANGED
@@ -50,5 +50,6 @@ models/
50
  captures/
51
 
52
  docs/api.md
 
53
  scripts/
54
  tests/
 
50
  captures/
51
 
52
  docs/api.md
53
+ docs/usage-logging.md
54
  scripts/
55
  tests/
src/core/zero_gpu.py CHANGED
@@ -55,6 +55,7 @@ _CUDA_ERROR_PATTERNS = (
55
  "cuda error", "cuda out of memory",
56
  "cuda driver", "cuda runtime",
57
  "device-side assert", "cublas", "cudnn error", "nccl",
 
58
  )
59
 
60
  try:
@@ -310,6 +311,7 @@ def gpu_with_fallback(duration=60):
310
  try:
311
  return gpu_func(*args, **kwargs)
312
  except Exception as e:
 
313
  # ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded"
314
  is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
315
  if not is_quota_error:
@@ -359,7 +361,18 @@ def gpu_with_fallback(duration=60):
359
  )
360
  if is_timeout:
361
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
362
- raise
 
 
 
 
 
 
 
 
 
 
 
363
 
364
  return wrapper
365
  return decorator
 
55
  "cuda error", "cuda out of memory",
56
  "cuda driver", "cuda runtime",
57
  "device-side assert", "cublas", "cudnn error", "nccl",
58
+ "gpu task aborted", # ZeroGPU SDK wraps CUDA errors with this message
59
  )
60
 
61
  try:
 
311
  try:
312
  return gpu_func(*args, **kwargs)
313
  except Exception as e:
314
+ print(f"[GPU] gpu_func error: {type(e).__name__}: {e}")
315
  # ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded"
316
  is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
317
  if not is_quota_error:
 
361
  )
362
  if is_timeout:
363
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
364
+ raise
365
+
366
+ # Unrecognized GPU/SDK error — fall back to CPU rather than crash.
367
+ # Don't mark CUDA unhealthy (might be transient SDK issue).
368
+ print(f"[GPU] Unrecognized GPU error, falling back to CPU: {type(e).__name__}: {e}")
369
+ _request_state.gpu_quota_exhausted = True
370
+ try:
371
+ import gradio as gr
372
+ gr.Warning("GPU error — using CPU (slower).")
373
+ except Exception:
374
+ pass
375
+ return func(*args, **kwargs)
376
 
377
  return wrapper
378
  return decorator