Spaces:
Running on Zero
Running on Zero
Reset torch.cuda._initialized on GPU errors instead of nuking global model caches
Browse files
Replace _invalidate_all_models() (which disrupted concurrent GPU users)
with torch.cuda._initialized = False to unpoison CUDA for the next user,
plus _models_stale flag so stale models drain safely inside the next GPU
lease. Also tune anchor voting params and increase MFA timeout.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- config.py +3 -3
- src/core/zero_gpu.py +15 -25
config.py
CHANGED
|
@@ -120,8 +120,8 @@ AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti" # Hub repo for compiled model cach
|
|
| 120 |
|
| 121 |
ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
|
| 122 |
ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
|
| 123 |
-
ANCHOR_RUN_TRIM_RATIO = 0.
|
| 124 |
-
ANCHOR_TOP_CANDIDATES =
|
| 125 |
|
| 126 |
# Edit operation costs (Levenshtein hyperparameters)
|
| 127 |
COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
|
|
@@ -188,7 +188,7 @@ UNDERSEG_MIN_DURATION = 15 # Duration gate (seconds)
|
|
| 188 |
# =============================================================================
|
| 189 |
|
| 190 |
MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
|
| 191 |
-
MFA_TIMEOUT =
|
| 192 |
|
| 193 |
# =============================================================================
|
| 194 |
# Usage logging (pushed to HF Hub via ParquetScheduler)
|
|
|
|
| 120 |
|
| 121 |
ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
|
| 122 |
ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
|
| 123 |
+
ANCHOR_RUN_TRIM_RATIO = 0.2 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
|
| 124 |
+
ANCHOR_TOP_CANDIDATES = 20 # Evaluate top N surahs by total weight for contiguous run comparison
|
| 125 |
|
| 126 |
# Edit operation costs (Levenshtein hyperparameters)
|
| 127 |
COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
|
|
|
|
| 188 |
# =============================================================================
|
| 189 |
|
| 190 |
MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
|
| 191 |
+
MFA_TIMEOUT = 180
|
| 192 |
|
| 193 |
# =============================================================================
|
| 194 |
# Usage logging (pushed to HF Hub via ParquetScheduler)
|
src/core/zero_gpu.py
CHANGED
|
@@ -149,28 +149,6 @@ def _drain_stale_models():
|
|
| 149 |
print("[GPU CLEANUP] Drained stale models from previous lease")
|
| 150 |
|
| 151 |
|
| 152 |
-
# =========================================================================
|
| 153 |
-
# Model cache invalidation (for GPU error recovery)
|
| 154 |
-
# =========================================================================
|
| 155 |
-
|
| 156 |
-
def _invalidate_all_models():
|
| 157 |
-
"""Drop all cached models so next load creates fresh CPU copies.
|
| 158 |
-
|
| 159 |
-
Called after GPU errors — cached models may hold dead CUDA tensor
|
| 160 |
-
references that would crash if accessed.
|
| 161 |
-
"""
|
| 162 |
-
try:
|
| 163 |
-
from ..segmenter.segmenter_model import invalidate_segmenter_cache
|
| 164 |
-
from ..alignment.phoneme_asr import invalidate_asr_cache
|
| 165 |
-
invalidate_segmenter_cache()
|
| 166 |
-
invalidate_asr_cache()
|
| 167 |
-
import gc
|
| 168 |
-
gc.collect()
|
| 169 |
-
print("[GPU] Invalidated all model caches for CPU fallback")
|
| 170 |
-
except Exception as e:
|
| 171 |
-
print(f"[GPU] Cache invalidation error (non-fatal): {e}")
|
| 172 |
-
|
| 173 |
-
|
| 174 |
# =========================================================================
|
| 175 |
# GPU decorator with fallback
|
| 176 |
# =========================================================================
|
|
@@ -273,11 +251,23 @@ def gpu_with_fallback(duration=60):
|
|
| 273 |
print(f"[GPU] Timeout error in {func.__name__}: {e}")
|
| 274 |
raise
|
| 275 |
|
| 276 |
-
# ANY other GPU error →
|
| 277 |
-
#
|
| 278 |
print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
|
| 279 |
_request_state.gpu_quota_exhausted = True
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
try:
|
| 282 |
import gradio as gr
|
| 283 |
gr.Warning("GPU error — processing on CPU (slower).")
|
|
|
|
| 149 |
print("[GPU CLEANUP] Drained stale models from previous lease")
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
# =========================================================================
|
| 153 |
# GPU decorator with fallback
|
| 154 |
# =========================================================================
|
|
|
|
| 251 |
print(f"[GPU] Timeout error in {func.__name__}: {e}")
|
| 252 |
raise
|
| 253 |
|
| 254 |
+
# ANY other GPU error → reset CUDA flag so next user gets
|
| 255 |
+
# clean GPU, and fall back to CPU for THIS request only.
|
| 256 |
print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
|
| 257 |
_request_state.gpu_quota_exhausted = True
|
| 258 |
+
|
| 259 |
+
# Unpoison CUDA for next GPU user — just a Python bool, not a CUDA op.
|
| 260 |
+
try:
|
| 261 |
+
import torch
|
| 262 |
+
torch.cuda._initialized = False
|
| 263 |
+
except Exception:
|
| 264 |
+
pass
|
| 265 |
+
|
| 266 |
+
# Mark models stale so next GPU lease drains them safely.
|
| 267 |
+
global _models_stale
|
| 268 |
+
with _lease_lock:
|
| 269 |
+
_models_stale = True
|
| 270 |
+
|
| 271 |
try:
|
| 272 |
import gradio as gr
|
| 273 |
gr.Warning("GPU error — processing on CPU (slower).")
|