Spaces:
Runtime error
Runtime error
Peter Michael Gits Claude committed on
Commit ·
393f5a7
1
Parent(s): 94bc832
Fix device mismatch error - CUDA/CPU tensor mixing
Browse filesv1.3.14 - CRITICAL FIX: Device consistency for CPU fallback
1. When Moshi falls back to CPU due to CUDA OOM, also move Mimi to CPU
2. Added model device detection in transcription function
3. All tensors now use same device as loaded models
4. Fixes: 'Expected all tensors to be on the same device, but found cuda:0 and cpu!'
5. Service now works properly on CPU fallback mode
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
|
|
| 16 |
import uvicorn
|
| 17 |
|
| 18 |
# Version tracking
|
| 19 |
-
VERSION = "1.3.13"
|
| 20 |
COMMIT_SHA = "TBD"
|
| 21 |
|
| 22 |
# Configure logging
|
|
@@ -77,10 +77,14 @@ async def load_moshi_models():
|
|
| 77 |
except RuntimeError as cuda_error:
|
| 78 |
if "CUDA out of memory" in str(cuda_error):
|
| 79 |
logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
|
|
|
|
|
|
|
|
|
|
| 80 |
device = "cpu"
|
| 81 |
moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
|
| 82 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 83 |
logger.info("✅ Moshi loaded successfully on CPU (fallback)")
|
|
|
|
| 84 |
else:
|
| 85 |
raise
|
| 86 |
|
|
@@ -121,8 +125,12 @@ def transcribe_audio_moshi(audio_data: np.ndarray, sample_rate: int = 24000) ->
|
|
| 121 |
import librosa
|
| 122 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=24000)
|
| 123 |
|
| 124 |
-
#
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Process with Mimi codec in streaming mode
|
| 128 |
with torch.no_grad(), mimi.streaming(batch_size=1):
|
|
|
|
| 16 |
import uvicorn
|
| 17 |
|
| 18 |
# Version tracking
|
| 19 |
+
VERSION = "1.3.14"
|
| 20 |
COMMIT_SHA = "TBD"
|
| 21 |
|
| 22 |
# Configure logging
|
|
|
|
| 77 |
except RuntimeError as cuda_error:
|
| 78 |
if "CUDA out of memory" in str(cuda_error):
|
| 79 |
logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
|
| 80 |
+
# Move Mimi to CPU as well for consistency
|
| 81 |
+
mimi = loaders.get_mimi(mimi_weight, device="cpu")
|
| 82 |
+
mimi.set_num_codebooks(8)
|
| 83 |
device = "cpu"
|
| 84 |
moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
|
| 85 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 86 |
logger.info("✅ Moshi loaded successfully on CPU (fallback)")
|
| 87 |
+
logger.info("✅ Mimi also moved to CPU for device consistency")
|
| 88 |
else:
|
| 89 |
raise
|
| 90 |
|
|
|
|
| 125 |
import librosa
|
| 126 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=24000)
|
| 127 |
|
| 128 |
+
# Determine actual device of the models (might have fallen back to CPU)
|
| 129 |
+
model_device = next(mimi.parameters()).device if hasattr(mimi, 'parameters') else device
|
| 130 |
+
logger.info(f"Using device for transcription: {model_device}")
|
| 131 |
+
|
| 132 |
+
# Convert to torch tensor and put on same device as models
|
| 133 |
+
wav = torch.from_numpy(audio_data).unsqueeze(0).unsqueeze(0).to(model_device)
|
| 134 |
|
| 135 |
# Process with Mimi codec in streaming mode
|
| 136 |
with torch.no_grad(), mimi.streaming(batch_size=1):
|