Spaces:
Running on Zero
Catch SDK-wrapped GPU errors that bypass CUDA error detection
Browse files
The ZeroGPU SDK catches CUDA errors internally and re-raises its own
exception type (e.g. "GPU task aborted"), which didn't match our error
patterns, causing the bare `raise` to propagate to the UI. Now:
- Add "gpu task aborted" to CUDA error patterns
- Log exception type/message for all GPU errors (aids debugging)
- Replace bare `raise` with catch-all CPU fallback for unrecognized errors
- Only timeouts still re-raise; everything else falls back to CPU
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- .gitignore +1 -0
- src/core/zero_gpu.py +14 -1
.gitignore
CHANGED
|
@@ -50,5 +50,6 @@ models/
|
|
| 50 |
captures/
|
| 51 |
|
| 52 |
docs/api.md
|
|
|
|
| 53 |
scripts/
|
| 54 |
tests/
|
|
|
|
| 50 |
captures/
|
| 51 |
|
| 52 |
docs/api.md
|
| 53 |
+
docs/usage-logging.md
|
| 54 |
scripts/
|
| 55 |
tests/
|
src/core/zero_gpu.py
CHANGED
|
@@ -55,6 +55,7 @@ _CUDA_ERROR_PATTERNS = (
|
|
| 55 |
"cuda error", "cuda out of memory",
|
| 56 |
"cuda driver", "cuda runtime",
|
| 57 |
"device-side assert", "cublas", "cudnn error", "nccl",
|
|
|
|
| 58 |
)
|
| 59 |
|
| 60 |
try:
|
|
@@ -310,6 +311,7 @@ def gpu_with_fallback(duration=60):
|
|
| 310 |
try:
|
| 311 |
return gpu_func(*args, **kwargs)
|
| 312 |
except Exception as e:
|
|
|
|
| 313 |
# ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded"
|
| 314 |
is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
|
| 315 |
if not is_quota_error:
|
|
@@ -359,7 +361,18 @@ def gpu_with_fallback(duration=60):
|
|
| 359 |
)
|
| 360 |
if is_timeout:
|
| 361 |
print(f"[GPU] Timeout error in {func.__name__}: {e}")
|
| 362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
| 364 |
return wrapper
|
| 365 |
return decorator
|
|
|
|
| 55 |
"cuda error", "cuda out of memory",
|
| 56 |
"cuda driver", "cuda runtime",
|
| 57 |
"device-side assert", "cublas", "cudnn error", "nccl",
|
| 58 |
+
"gpu task aborted", # ZeroGPU SDK wraps CUDA errors with this message
|
| 59 |
)
|
| 60 |
|
| 61 |
try:
|
|
|
|
| 311 |
try:
|
| 312 |
return gpu_func(*args, **kwargs)
|
| 313 |
except Exception as e:
|
| 314 |
+
print(f"[GPU] gpu_func error: {type(e).__name__}: {e}")
|
| 315 |
# ZeroGPU raises gradio.Error with title="ZeroGPU quota exceeded"
|
| 316 |
is_quota_error = getattr(e, 'title', '') == "ZeroGPU quota exceeded"
|
| 317 |
if not is_quota_error:
|
|
|
|
| 361 |
)
|
| 362 |
if is_timeout:
|
| 363 |
print(f"[GPU] Timeout error in {func.__name__}: {e}")
|
| 364 |
+
raise
|
| 365 |
+
|
| 366 |
+
# Unrecognized GPU/SDK error — fall back to CPU rather than crash.
|
| 367 |
+
# Don't mark CUDA unhealthy (might be transient SDK issue).
|
| 368 |
+
print(f"[GPU] Unrecognized GPU error, falling back to CPU: {type(e).__name__}: {e}")
|
| 369 |
+
_request_state.gpu_quota_exhausted = True
|
| 370 |
+
try:
|
| 371 |
+
import gradio as gr
|
| 372 |
+
gr.Warning("GPU error — using CPU (slower).")
|
| 373 |
+
except Exception:
|
| 374 |
+
pass
|
| 375 |
+
return func(*args, **kwargs)
|
| 376 |
|
| 377 |
return wrapper
|
| 378 |
return decorator
|