hetchyy Claude Opus 4.6 committed on
Commit
93e5e86
·
1 Parent(s): 3f00433

Reset torch.cuda._initialized on GPU errors instead of nuking global model caches

Browse files

Replace _invalidate_all_models() (which disrupted concurrent GPU users)
with torch.cuda._initialized = False to unpoison CUDA for the next user,
plus _models_stale flag so stale models drain safely inside the next GPU
lease. Also tune anchor voting params and increase MFA timeout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. config.py +3 -3
  2. src/core/zero_gpu.py +15 -25
config.py CHANGED
@@ -120,8 +120,8 @@ AOTI_HUB_REPO = "hetchyy/quran-aligner-aoti" # Hub repo for compiled model cach
120
 
121
  ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
122
  ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
123
- ANCHOR_RUN_TRIM_RATIO = 0.15 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
124
- ANCHOR_TOP_CANDIDATES = 10 # Evaluate top N surahs by total weight for contiguous run comparison
125
 
126
  # Edit operation costs (Levenshtein hyperparameters)
127
  COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
@@ -188,7 +188,7 @@ UNDERSEG_MIN_DURATION = 15 # Duration gate (seconds)
188
  # =============================================================================
189
 
190
  MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
191
- MFA_TIMEOUT = 120
192
 
193
  # =============================================================================
194
  # Usage logging (pushed to HF Hub via ParquetScheduler)
 
120
 
121
  ANCHOR_SEGMENTS = 5 # N-gram voting uses first N Quran segments
122
  ANCHOR_RARITY_WEIGHTING = True # Weight votes by 1/count (rarity); False = equal weight
123
+ ANCHOR_RUN_TRIM_RATIO = 0.2 # Trim leading/trailing ayahs whose weight < ratio * max weight in run
124
+ ANCHOR_TOP_CANDIDATES = 20 # Evaluate top N surahs by total weight for contiguous run comparison
125
 
126
  # Edit operation costs (Levenshtein hyperparameters)
127
  COST_SUBSTITUTION = 1.0 # Default phoneme substitution cost
 
188
  # =============================================================================
189
 
190
  MFA_SPACE_URL = "https://hetchyy-quran-phoneme-mfa.hf.space"
191
+ MFA_TIMEOUT = 180
192
 
193
  # =============================================================================
194
  # Usage logging (pushed to HF Hub via ParquetScheduler)
src/core/zero_gpu.py CHANGED
@@ -149,28 +149,6 @@ def _drain_stale_models():
149
  print("[GPU CLEANUP] Drained stale models from previous lease")
150
 
151
 
152
- # =========================================================================
153
- # Model cache invalidation (for GPU error recovery)
154
- # =========================================================================
155
-
156
- def _invalidate_all_models():
157
- """Drop all cached models so next load creates fresh CPU copies.
158
-
159
- Called after GPU errors — cached models may hold dead CUDA tensor
160
- references that would crash if accessed.
161
- """
162
- try:
163
- from ..segmenter.segmenter_model import invalidate_segmenter_cache
164
- from ..alignment.phoneme_asr import invalidate_asr_cache
165
- invalidate_segmenter_cache()
166
- invalidate_asr_cache()
167
- import gc
168
- gc.collect()
169
- print("[GPU] Invalidated all model caches for CPU fallback")
170
- except Exception as e:
171
- print(f"[GPU] Cache invalidation error (non-fatal): {e}")
172
-
173
-
174
  # =========================================================================
175
  # GPU decorator with fallback
176
  # =========================================================================
@@ -273,11 +251,23 @@ def gpu_with_fallback(duration=60):
273
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
274
  raise
275
 
276
- # ANY other GPU error → flag as exhausted so downstream
277
- # code reloads models fresh on CPU instead of touching CUDA.
278
  print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
279
  _request_state.gpu_quota_exhausted = True
280
- _invalidate_all_models()
 
 
 
 
 
 
 
 
 
 
 
 
281
  try:
282
  import gradio as gr
283
  gr.Warning("GPU error — processing on CPU (slower).")
 
149
  print("[GPU CLEANUP] Drained stale models from previous lease")
150
 
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  # =========================================================================
153
  # GPU decorator with fallback
154
  # =========================================================================
 
251
  print(f"[GPU] Timeout error in {func.__name__}: {e}")
252
  raise
253
 
254
+ # ANY other GPU error → reset CUDA flag so next user gets
255
+ # clean GPU, and fall back to CPU for THIS request only.
256
  print(f"[GPU] GPU error, falling back to CPU: {type(e).__name__}: {e}")
257
  _request_state.gpu_quota_exhausted = True
258
+
259
+ # Unpoison CUDA for next GPU user — just a Python bool, not a CUDA op.
260
+ try:
261
+ import torch
262
+ torch.cuda._initialized = False
263
+ except Exception:
264
+ pass
265
+
266
+ # Mark models stale so next GPU lease drains them safely.
267
+ global _models_stale
268
+ with _lease_lock:
269
+ _models_stale = True
270
+
271
  try:
272
  import gradio as gr
273
  gr.Warning("GPU error — processing on CPU (slower).")