hetchyy Claude Opus 4.6 committed on
Commit
93e5e86
·
1 Parent(s): 3f00433

Reset torch.cuda._initialized on GPU errors instead of nuking global model caches

Browse files

Replace _invalidate_all_models() (which disrupted concurrent GPU users)
with torch.cuda._initialized = False to unpoison CUDA for the next user,
plus _models_stale flag so stale models drain safely inside the next GPU
lease. Also tune anchor voting params and increase MFA timeout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. config.py +3 -3
  2. src/core/zero_gpu.py +15 -25
config.py CHANGED
@@ -120,8 +120,8 @@ AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti" # Hub repo for compiled model cach
120
 
121
  ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
122
  ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
123
- ANCHOR_RUN_TRIM_RATIO = 0.15 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
124
- ANCHOR_TOP_CANDIDATES = 10 # Evaluate top N surahs by total weight for contiguous run comparison
125
 
126
  # Edit operation costs (Levenshtein hyperparameters)
127
  COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
@@ -188,7 +188,7 @@ UNDERSEG_MIN_DURATION = 15 # Duration gate (seconds)
188
  # =============================================================================
189
 
190
  MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
191
- MFA_TIMEOUT = 120
192
 
193
  # =============================================================================
194
  # Usage logging (pushed to HF Hub via ParquetScheduler)
 
120
 
121
  ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
122
  ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
123
+ ANCHOR_RUN_TRIM_RATIO = 0.2 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
124
+ ANCHOR_TOP_CANDIDATES = 20 # Evaluate top N surahs by total weight for contiguous run comparison
125
 
126
  # Edit operation costs (Levenshtein hyperparameters)
127
  COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
 
188
  # =============================================================================
189
 
190
  MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
191
+ MFA_TIMEOUT = 180
192
 
193
  # =============================================================================
194
  # Usage logging (pushed to HF Hub via ParquetScheduler)
src/core/zero_gpu.py CHANGED
@@ -149,28 +149,6 @@ def _drain_stale_models():
149
  print("[GPU CLEANUP] Drained stale models from previous lease")
150
 
151
 
152
- # =========================================================================
153
- # Model cache invalidation (for GPU error recovery)
154
- # =========================================================================
155
-
156
- def _invalidate_all_models():
157
- """Drop all cached models so next load creates fresh CPU copies.
158
-
159
- Called after GPU errors — cached models may hold dead CUDA tensor
160
- references that would crash if accessed.
161
- """
162
- try:
163
- from ..segmenter.segmenter_model import invalidate_segmenter_cache
164
- from ..alignment.phoneme_asr import invalidate_asr_cache
165
- invalidate_segmenter_cache()
166
- invalidate_asr_cache()
167
- import gc
168
- gc.collect()
169
- print("[GPU] Invalidated all model caches for CPU fallback")
170
- except Exception as e:
171
- print(f"[GPU] Cache invalidation error (non-fatal): {e}")
172
-
173
-
174
  # =========================================================================
175
  # GPU decorator with fallback
176
  # =========================================================================
@@ -273,11 +251,23 @@ def gpu_with_fallback(duration=60):
273
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
274
  raise
275
 
276
- # ANY other GPU error → flag as exhausted so downstream
277
- # code reloads models fresh on CPU instead of touching CUDA.
278
  print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
279
  _request_state.gpu_quota_exhausted = True
280
- _invalidate_all_models()
 
 
 
 
 
 
 
 
 
 
 
 
281
  try:
282
  import gradio as gr
283
  gr.Warning("GPU error — processing on CPU (slower).")
 
149
  print("[GPU CLEANUP] Drained stale models from previous lease")
150
 
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  # =========================================================================
153
  # GPU decorator with fallback
154
  # =========================================================================
 
251
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
252
  raise
253
 
254
+ # ANY other GPU error → reset CUDA flag so next user gets
255
+ # clean GPU, and fall back to CPU for THIS request only.
256
  print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
257
  _request_state.gpu_quota_exhausted = True
258
+
259
+ # Unpoison CUDA for next GPU user — just a Python bool, not a CUDA op.
260
+ try:
261
+ import torch
262
+ torch.cuda._initialized = False
263
+ except Exception:
264
+ pass
265
+
266
+ # Mark models stale so next GPU lease drains them safely.
267
+ global _models_stale
268
+ with _lease_lock:
269
+ _models_stale = True
270
+
271
  try:
272
  import gradio as gr
273
  gr.Warning("GPU error — processing on CPU (slower).")