Peter Michael Gits (Claude) committed
Commit · 812a43d
Parent(s): 049566b
Fix CUDA memory issues and libgomp warning
v1.3.11 - Critical memory management fixes:
1. Added PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce allocator fragmentation
2. Added torch.cuda.empty_cache() calls to free cached GPU memory between model loads
3. Added a CPU fallback for the Moshi model if CUDA OOM occurs
4. Added GPU memory logging for debugging
5. Fixed OMP_NUM_THREADS with quotes: ENV OMP_NUM_THREADS="1"
6. Disabled flash attention to reduce memory usage
This should resolve both the CUDA out-of-memory errors and the libgomp warning; a sketch of the combined runtime pattern follows the commit message.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
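Taken together, the pieces above form one runtime pattern. A minimal sketch (not part of the commit) assuming PyTorch 2.x with CUDA available; the logger setup here is illustrative, and in the Space the allocator config comes from the Dockerfile ENV rather than from Python:

```python
# Sketch of the combined memory-management pattern from this commit.
# The allocator config must be set before torch initializes CUDA.
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import logging
import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    torch.cuda.empty_cache()                     # drop cached allocator blocks before loading models
    torch.backends.cuda.enable_flash_sdp(False)  # turn off flash-attention SDP kernels
    logger.info(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
```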
- Dockerfile +4 -2
- app.py +28 -4
Dockerfile CHANGED
@@ -43,8 +43,10 @@ RUN chown -R appuser:appuser /app
 # Switch back to non-root user for running the app
 USER appuser
 
-# Set environment variables to fix OpenMP and caching issues
-ENV OMP_NUM_THREADS=1
+# Set environment variables to fix OpenMP, CUDA memory, and caching issues
+ENV OMP_NUM_THREADS="1"
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ENV CUDA_LAUNCH_BLOCKING=0
 ENV HF_HOME=/app/hf_cache
 ENV HUGGINGFACE_HUB_CACHE=/app/hf_cache
 ENV TRANSFORMERS_CACHE=/app/hf_cache
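PYTORCH_CUDA_ALLOC_CONF is read by PyTorch's CUDA caching allocator when CUDA is first initialized, so setting it via ENV guarantees it is in place before any import. A hypothetical startup check (not part of the commit) to confirm the Dockerfile values actually reach the process:

```python
# Hypothetical sanity check: print the Dockerfile ENV values as the app sees them.
import os

for var in ("OMP_NUM_THREADS", "PYTORCH_CUDA_ALLOC_CONF", "CUDA_LAUNCH_BLOCKING"):
    print(f"{var}={os.environ.get(var, '<unset>')}")
```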
app.py CHANGED
@@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 
 # Version tracking
-VERSION = "1.3.
+VERSION = "1.3.11"
 COMMIT_SHA = "TBD"
 
 # Configure logging
@@ -42,6 +42,13 @@ async def load_moshi_models():
     logger.info(f"Using device: {device}")
     logger.info(f"Cache directory: {os.environ.get('HF_HOME', 'default')}")
 
+    # Clear GPU memory and set memory management
+    if device == "cuda":
+        torch.cuda.empty_cache()
+        # Enable memory efficient attention
+        torch.backends.cuda.enable_flash_sdp(False)
+        logger.info(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
     try:
         from huggingface_hub import hf_hub_download
         from moshi.models import loaders, LMGen
@@ -53,12 +60,29 @@ async def load_moshi_models():
         mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
         logger.info("✅ Mimi loaded successfully")
 
+        # Clear cache after Mimi loading
+        if device == "cuda":
+            torch.cuda.empty_cache()
+            logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
         # Load Moshi (language model)
         logger.info("Loading Moshi language model...")
         moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
-
-
-
+
+        # Try loading with memory-efficient settings
+        try:
+            moshi = loaders.get_moshi_lm(moshi_weight, device=device)
+            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+            logger.info("✅ Moshi loaded successfully")
+        except RuntimeError as cuda_error:
+            if "CUDA out of memory" in str(cuda_error):
+                logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
+                device = "cpu"
+                moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
+                lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                logger.info("✅ Moshi loaded successfully on CPU (fallback)")
+            else:
+                raise
 
         logger.info("🎉 All Moshi models loaded successfully!")
         return True
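The OOM handling added above can be read as a general pattern: attempt the load on the requested device, and only if the error message indicates CUDA OOM, retry on CPU. A hedged sketch of that pattern factored into a standalone helper; `load_fn` is a hypothetical callable (for example `loaders.get_moshi_lm` bound to a checkpoint path) and is not an API introduced by this commit:

```python
# Sketch of the CUDA-OOM-to-CPU fallback pattern used in load_moshi_models().
import logging
import torch

logger = logging.getLogger(__name__)

def load_with_cpu_fallback(load_fn, device: str):
    """Try load_fn on the requested device; fall back to CPU on CUDA OOM."""
    try:
        return load_fn(device=device), device
    except RuntimeError as err:
        if "CUDA out of memory" not in str(err):
            raise
        logger.warning(f"CUDA out of memory, falling back to CPU: {err}")
        torch.cuda.empty_cache()  # release whatever the failed attempt left behind
        return load_fn(device="cpu"), "cpu"
```

Usage would look like `moshi, device = load_with_cpu_fallback(lambda device: loaders.get_moshi_lm(moshi_weight, device=device), device)`, which keeps the fallback logic out of the main loading flow.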