Peter Michael Gits, Claude committed on
Commit 812a43d · 1 Parent(s): 049566b

Fix CUDA memory issues and libgomp warning


v1.3.11 - Critical memory management fixes:
1. Added PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce GPU memory fragmentation
2. Added torch.cuda.empty_cache() calls to free GPU memory
3. Added CPU fallback for Moshi model if CUDA OOM occurs
4. Added GPU memory logging for debugging
5. Fixed OMP_NUM_THREADS with quotes: ENV OMP_NUM_THREADS="1"
6. Disabled flash attention to reduce memory usage

This should resolve both the CUDA out-of-memory errors and the libgomp warning; a sketch of the combined pattern follows the commit message.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
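
Taken together, items 1-6 amount to a standard PyTorch memory-hygiene sequence. A minimal sketch of that sequence, assuming PyTorch 2.1+ (where expandable_segments is supported); the report_gpu_memory helper is illustrative and not part of this commit:

import os

# PYTORCH_CUDA_ALLOC_CONF must be set before torch initializes CUDA,
# which is why the commit exports it in the Dockerfile rather than in app.py.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
os.environ.setdefault("OMP_NUM_THREADS", "1")  # a plain integer keeps libgomp quiet

import torch

def report_gpu_memory(tag: str) -> None:
    """Illustrative helper (not in the commit): log allocated vs. reserved GPU memory."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"[{tag}] allocated={allocated:.2f} GB, reserved={reserved:.2f} GB")

if torch.cuda.is_available():
    torch.backends.cuda.enable_flash_sdp(False)  # fall back to the memory-efficient SDP backend
    torch.cuda.empty_cache()  # release cached, unused blocks back to the driver
    report_gpu_memory("before model load")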

Files changed (2)
  1. Dockerfile +4 -2
  2. app.py +28 -4
Dockerfile CHANGED
@@ -43,8 +43,10 @@ RUN chown -R appuser:appuser /app
 # Switch back to non-root user for running the app
 USER appuser
 
-# Set environment variables to fix OpenMP and caching issues
-ENV OMP_NUM_THREADS=1
+# Set environment variables to fix OpenMP, CUDA memory, and caching issues
+ENV OMP_NUM_THREADS="1"
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ENV CUDA_LAUNCH_BLOCKING=0
 ENV HF_HOME=/app/hf_cache
 ENV HUGGINGFACE_HUB_CACHE=/app/hf_cache
 ENV TRANSFORMERS_CACHE=/app/hf_cache
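
The quoting fix matters because libgomp validates OMP_NUM_THREADS strictly and typically prints "libgomp: Invalid value for environment variable OMP_NUM_THREADS" when the value it sees is not a plain positive integer. A hypothetical in-container sanity check (not part of this commit):

import os

# Hypothetical sanity check: libgomp rejects OMP_NUM_THREADS values that are
# not plain positive integers (stray whitespace is enough to trip the warning).
val = os.environ.get("OMP_NUM_THREADS", "")
if not (val.isdigit() and int(val) > 0):
    raise SystemExit(f"OMP_NUM_THREADS is malformed: {val!r}")
print(f"OMP_NUM_THREADS OK: {val!r}")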
app.py CHANGED
@@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 
 # Version tracking
-VERSION = "1.3.10"
+VERSION = "1.3.11"
 COMMIT_SHA = "TBD"
 
 # Configure logging
@@ -42,6 +42,13 @@ async def load_moshi_models():
     logger.info(f"Using device: {device}")
     logger.info(f"Cache directory: {os.environ.get('HF_HOME', 'default')}")
 
+    # Clear GPU memory and set memory management
+    if device == "cuda":
+        torch.cuda.empty_cache()
+        # Disable flash attention so PyTorch falls back to the memory-efficient SDP backend
+        torch.backends.cuda.enable_flash_sdp(False)
+        logger.info(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
     try:
         from huggingface_hub import hf_hub_download
         from moshi.models import loaders, LMGen
@@ -53,12 +60,29 @@
         mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
         logger.info("✅ Mimi loaded successfully")
 
+        # Clear cache after Mimi loading
+        if device == "cuda":
+            torch.cuda.empty_cache()
+            logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+
         # Load Moshi (language model)
         logger.info("Loading Moshi language model...")
         moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
-        moshi = loaders.get_moshi_lm(moshi_weight, device=device)
-        lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-        logger.info("✅ Moshi loaded successfully")
+
+        # Try loading on the requested device first, falling back to CPU on OOM
+        try:
+            moshi = loaders.get_moshi_lm(moshi_weight, device=device)
+            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+            logger.info("✅ Moshi loaded successfully")
+        except RuntimeError as cuda_error:
+            if "CUDA out of memory" in str(cuda_error):
+                logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
+                device = "cpu"
+                moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
+                lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                logger.info("✅ Moshi loaded successfully on CPU (fallback)")
+            else:
+                raise
 
         logger.info("🎉 All Moshi models loaded successfully!")
         return True
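
For reuse elsewhere, the string-matched RuntimeError check above can be tightened: PyTorch 1.13+ raises torch.cuda.OutOfMemoryError, a RuntimeError subclass, so a helper can catch it directly. A hypothetical sketch (load_with_cpu_fallback is not part of this commit):

import logging
import torch

logger = logging.getLogger(__name__)

def load_with_cpu_fallback(load_fn, device: str):
    """Hypothetical helper: build a model with load_fn(device), retrying on CPU after CUDA OOM."""
    try:
        return load_fn(device), device
    except torch.cuda.OutOfMemoryError as err:  # RuntimeError subclass, PyTorch >= 1.13
        logger.warning("CUDA out of memory, retrying on CPU: %s", err)
        torch.cuda.empty_cache()  # release the partial allocation before the retry
        return load_fn("cpu"), "cpu"

# Usage mirroring the diff (names as in the commit):
# moshi, device = load_with_cpu_fallback(
#     lambda d: loaders.get_moshi_lm(moshi_weight, device=d), device)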