Spaces:

pgits
/

stt-gpu-service-python-v4

Runtime error

Peter Michael Gits Claude commited on Sep 4, 2025

Commit

1195fdc

1 Parent(s): b8737d8

Major: Revert to full Moshi model with L4 GPU upgrade

v2.0.0 - MAJOR: Back to full Moshi model with proper GPU
1. Reverted from Moshiko small model back to full Moshi (DEFAULT_REPO)
2. Upgraded HuggingFace Space hardware from t4-small to l4 (30GB VRAM)
3. Full Moshi model should fit comfortably in 30GB vs 14.74GB
4. Updated all references from Moshiko back to Moshi
5. Major version bump reflects significant architecture change
6. Should provide better STT quality with proper GPU resources

No more CPU fallback - full GPU STT power!

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

README.md +4 -3
app.py +16 -18

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: blue
 colorTo: green
 sdk: docker
 app_port: 7860
-hardware: t4-small
 sleep_time_timeout: 1800
 suggested_storage: small
 pinned: false
@@ -13,13 +13,14 @@ pinned: false
 # STT GPU Service Python v4
-Real-time WebSocket STT streaming service using kyutai/stt-1b-en_fr model.
 ## Features
 - WebSocket streaming (80ms chunks at 24kHz)
 - REST API endpoints
 - FastAPI backend with real-time transcription
-- T4 GPU acceleration
 ## Endpoints
 - `/` - Web interface for testing

 colorTo: green
 sdk: docker
 app_port: 7860
+hardware: l4
 sleep_time_timeout: 1800
 suggested_storage: small
 pinned: false
 # STT GPU Service Python v4
+Real-time WebSocket STT streaming service using full Moshi model.
 ## Features
 - WebSocket streaming (80ms chunks at 24kHz)
 - REST API endpoints
 - FastAPI backend with real-time transcription
+- L4 GPU acceleration (30GB VRAM)
+- Full Moshi model for high-quality STT
 ## Endpoints
 - `/` - Web interface for testing

app.py CHANGED Viewed

@@ -21,7 +21,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 # Version tracking
-VERSION = "1.4.9"
 COMMIT_SHA = "TBD"
 # Configure logging
@@ -58,44 +58,42 @@ async def load_moshi_models():
             from huggingface_hub import hf_hub_download
             from moshi.models import loaders, LMGen
-            # Load Mimi (audio codec) - using smaller Moshiko model
-            logger.info("Loading Mimi audio codec for Moshiko...")
-            # Use Moshiko model repo instead of default
-            MOSHIKO_REPO = "kyutai/moshiko-pytorch-bf16"
-            mimi_weight = hf_hub_download(MOSHIKO_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
             mimi = loaders.get_mimi(mimi_weight, device=device)
             mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
-            logger.info("✅ Mimi loaded successfully (Moshiko variant)")
             # Clear cache after Mimi loading
             if device == "cuda":
                 torch.cuda.empty_cache()
                 logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
-            # Load Moshiko (smaller language model)
-            logger.info("Loading Moshiko language model...")
-            moshi_weight = hf_hub_download(MOSHIKO_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
             # Try loading with memory-efficient settings
             try:
                 moshi = loaders.get_moshi_lm(moshi_weight, device=device)
                 lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-                logger.info("✅ Moshiko loaded successfully on GPU")
             except RuntimeError as cuda_error:
                 if "CUDA out of memory" in str(cuda_error):
-                    logger.warning(f"Moshiko CUDA out of memory, trying CPU fallback: {cuda_error}")
                     # Move Mimi to CPU as well for consistency
                     mimi = loaders.get_mimi(mimi_weight, device="cpu")
                     mimi.set_num_codebooks(8)
                     device = "cpu"
                     moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
                     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-                    logger.info("✅ Moshiko loaded successfully on CPU (fallback)")
                     logger.info("✅ Mimi also moved to CPU for device consistency")
                 else:
                     raise
-            logger.info("🎉 All Moshiko models loaded successfully!")
             return True
         except ImportError as import_error:
@@ -127,7 +125,7 @@ def transcribe_audio_moshi(audio_data: np.ndarray, sample_rate: int = 24000) ->
         if mimi == "mock":
             duration = len(audio_data) / sample_rate
-            return f"Mock Moshiko STT: {duration:.2f}s audio at {sample_rate}Hz"
         # Ensure 24kHz audio for Moshi
         if sample_rate != 24000:
@@ -232,8 +230,8 @@ async def lifespan(app: FastAPI):
 # FastAPI app with lifespan
 app = FastAPI(
-    title="STT GPU Service Python v4 - Moshiko Model",
-    description="Real-time WebSocket STT streaming with Moshiko PyTorch implementation (Smaller model for T4 GPU)",
     version=VERSION,
     lifespan=lifespan
 )
@@ -246,7 +244,7 @@ async def health_check():
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
-        "message": "Moshiko STT WebSocket Service - Smaller model for T4 GPU",
         "space_name": "stt-gpu-service-python-v4",
         "mimi_loaded": mimi is not None and mimi != "mock",
         "moshi_loaded": moshi is not None and moshi != "mock",

 import uvicorn
 # Version tracking
+VERSION = "2.0.0"
 COMMIT_SHA = "TBD"
 # Configure logging
             from huggingface_hub import hf_hub_download
             from moshi.models import loaders, LMGen
+            # Load Mimi (audio codec) - using full Moshi model
+            logger.info("Loading Mimi audio codec...")
+            mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
             mimi = loaders.get_mimi(mimi_weight, device=device)
             mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
+            logger.info("✅ Mimi loaded successfully")
             # Clear cache after Mimi loading
             if device == "cuda":
                 torch.cuda.empty_cache()
                 logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+            # Load Moshi (full language model)
+            logger.info("Loading Moshi language model...")
+            moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
             # Try loading with memory-efficient settings
             try:
                 moshi = loaders.get_moshi_lm(moshi_weight, device=device)
                 lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                logger.info("✅ Moshi loaded successfully on GPU")
             except RuntimeError as cuda_error:
                 if "CUDA out of memory" in str(cuda_error):
+                    logger.warning(f"Moshi CUDA out of memory, trying CPU fallback: {cuda_error}")
                     # Move Mimi to CPU as well for consistency
                     mimi = loaders.get_mimi(mimi_weight, device="cpu")
                     mimi.set_num_codebooks(8)
                     device = "cpu"
                     moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
                     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                    logger.info("✅ Moshi loaded successfully on CPU (fallback)")
                     logger.info("✅ Mimi also moved to CPU for device consistency")
                 else:
                     raise
+            logger.info("🎉 All Moshi models loaded successfully!")
             return True
         except ImportError as import_error:
         if mimi == "mock":
             duration = len(audio_data) / sample_rate
+            return f"Mock Moshi STT: {duration:.2f}s audio at {sample_rate}Hz"
         # Ensure 24kHz audio for Moshi
         if sample_rate != 24000:
 # FastAPI app with lifespan
 app = FastAPI(
+    title="STT GPU Service Python v4 - Full Moshi Model",
+    description="Real-time WebSocket STT streaming with full Moshi PyTorch implementation (L4 GPU with 30GB VRAM)",
     version=VERSION,
     lifespan=lifespan
 )
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
+        "message": "Moshi STT WebSocket Service - Full model on L4 GPU",
         "space_name": "stt-gpu-service-python-v4",
         "mimi_loaded": mimi is not None and mimi != "mock",
         "moshi_loaded": moshi is not None and moshi != "mock",