Peter Michael Gits, Claude committed on
Commit 812a43d · 1 Parent(s): 049566b

Fix CUDA memory issues and libgomp warning


v1.3.11 - Critical memory management fixes:
1. Added PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce GPU memory fragmentation
2. Added torch.cuda.empty_cache() calls to free GPU memory
3. Added CPU fallback for Moshi model if CUDA OOM occurs
4. Added GPU memory logging for debugging
5. Fixed OMP_NUM_THREADS with quotes: ENV OMP_NUM_THREADS="1"
6. Disabled flash attention to reduce memory usage

This should resolve both the CUDA out-of-memory errors and the libgomp warning; a sketch of the combined pattern follows the commit message.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
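
Taken together, items 1-6 amount to a standard PyTorch memory-hygiene sequence. A minimal sketch of that sequence, assuming PyTorch 2.1+ (where expandable_segments is supported); the report_gpu_memory helper is illustrative and not part of this commit:

import os

# PYTORCH_CUDA_ALLOC_CONF must be set before torch initializes CUDA,
# which is why the commit exports it in the Dockerfile rather than in app.py.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("OMP_NUM_THREADS", "1")  # a plain integer keeps libgomp quiet

import torch

def report_gpu_memory(tag: str) -> None:
    """Illustrative helper (not in the commit): log allocated vs. reserved GPU memory."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"[{tag}] allocated={allocated:.2f} GB, reserved={reserved:.2f} GB")

if torch.cuda.is_available():
    torch.backends.cuda.enable_flash_sdp(False)  # fall back to the memory-efficient SDP backend
    torch.cuda.empty_cache()  # release cached, unused blocks back to the driver
    report_gpu_memory("before model load")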

Files changed (2)
  1. Dockerfile +4 -2
  2. app.py +28 -4
Dockerfile CHANGED
@@ -43,8 +43,10 @@ RUN chown -R appuser:appuser /app
 # Switch back to non-root user for running the app
 USER appuser
 
-# Set environment variables to fix OpenMP and caching issues
-ENV OMP_NUM_THREADS=1
+# Set environment variables to fix OpenMP, CUDA memory, and caching issues
+ENV OMP_NUM_THREADS="1"
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ENV CUDA_LAUNCH_BLOCKING=0
 ENV HF_HOME=/app/hf_cache
 ENV HUGGINGFACE_HUB_CACHE=/app/hf_cache
 ENV TRANSFORMERS_CACHE=/app/hf_cache
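
The quoting fix matters because libgomp validates OMP_NUM_THREADS strictly and typically prints "libgomp: Invalid value for environment variable OMP_NUM_THREADS" when the value it sees is not a plain positive integer. A hypothetical in-container sanity check (not part of this commit):

import os

# Hypothetical sanity check: libgomp rejects OMP_NUM_THREADS values that are
# not plain positive integers (stray whitespace is enough to trip the warning).
val = os.environ.get("OMP_NUM_THREADS", "")
if not (val.isdigit() and int(val) > 0):
    raise SystemExit(f"OMP_NUM_THREADS is malformed: {val!r}")
print(f"OMP_NUM_THREADS OK: {val!r}")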
app.py CHANGED
@@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 
 # Version tracking
-VERSION = "1.3.10"
+VERSION = "1.3.11"
 COMMIT_SHA = "TBD"
 
 # Configure logging
@@ -42,6 +42,13 @@ async def load_moshi_models():
     logger.info(f"Using device: {device}")
     logger.info(f"Cache directory: {os.environ.get('HF_HOME', 'default')}")
 
+    # Clear GPU memory and set memory management
+    if device == "cuda":
+        torch.cuda.empty_cache()
+        # Disable flash attention so PyTorch falls back to the memory-efficient SDP backend
+        torch.backends.cuda.enable_flash_sdp(False)
+        logger.info(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
     try:
         from huggingface_hub import hf_hub_download
         from moshi.models import loaders, LMGen
@@ -53,12 +60,29 @@
         mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
         logger.info("✅ Mimi loaded successfully")
 
+        # Clear cache after Mimi loading
+        if device == "cuda":
+            torch.cuda.empty_cache()
+            logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
         # Load Moshi (language model)
         logger.info("Loading Moshi language model...")
         moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
-        moshi = loaders.get_moshi_lm(moshi_weight, device=device)
-        lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-        logger.info("✅ Moshi loaded successfully")
+
+        # Try loading on the requested device first, falling back to CPU on OOM
+        try:
+            moshi = loaders.get_moshi_lm(moshi_weight, device=device)
+            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+            logger.info("✅ Moshi loaded successfully")
+        except RuntimeError as cuda_error:
+            if "CUDA out of memory" in str(cuda_error):
+                logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
+                device = "cpu"
+                moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
+                lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                logger.info("✅ Moshi loaded successfully on CPU (fallback)")
+            else:
+                raise
 
         logger.info("🎉 All Moshi models loaded successfully!")
         return True
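
For reuse elsewhere, the string-matched RuntimeError check above can be tightened: PyTorch 1.13+ raises torch.cuda.OutOfMemoryError, a RuntimeError subclass, so a helper can catch it directly. A hypothetical sketch (load_with_cpu_fallback is not part of this commit):

import logging
import torch

logger = logging.getLogger(__name__)

def load_with_cpu_fallback(load_fn, device: str):
    """Hypothetical helper: build a model with load_fn(device), retrying on CPU after CUDA OOM."""
    try:
        return load_fn(device), device
    except torch.cuda.OutOfMemoryError as err:  # RuntimeError subclass, PyTorch >= 1.13
        logger.warning("CUDA out of memory, retrying on CPU: %s", err)
        torch.cuda.empty_cache()  # release the partial allocation before the retry
        return load_fn("cpu"), "cpu"

# Usage mirroring the diff (names as in the commit):
# moshi, device = load_with_cpu_fallback(
#     lambda d: loaders.get_moshi_lm(moshi_weight, device=d), device)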