Spaces:
Runtime error
Runtime error
Peter Michael Gits Claude committed on
Commit ·
1195fdc
1
Parent(s): b8737d8
Major: Revert to full Moshi model with L4 GPU upgrade
Browse filesv2.0.0 - MAJOR: Back to full Moshi model with proper GPU
1. Reverted from Moshiko small model back to full Moshi (DEFAULT_REPO)
2. Upgraded HuggingFace Space hardware from t4-small to l4 (30GB VRAM)
3. Full Moshi model should fit comfortably in 30GB vs 14.74GB
4. Updated all references from Moshiko back to Moshi
5. Major version bump reflects significant architecture change
6. Should provide better STT quality with proper GPU resources
No more CPU fallback - full GPU STT power!
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
README.md
CHANGED
|
@@ -5,7 +5,7 @@ colorFrom: blue
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
| 8 |
-
hardware:
|
| 9 |
sleep_time_timeout: 1800
|
| 10 |
suggested_storage: small
|
| 11 |
pinned: false
|
|
@@ -13,13 +13,14 @@ pinned: false
|
|
| 13 |
|
| 14 |
# STT GPU Service Python v4
|
| 15 |
|
| 16 |
-
Real-time WebSocket STT streaming service using
|
| 17 |
|
| 18 |
## Features
|
| 19 |
- WebSocket streaming (80ms chunks at 24kHz)
|
| 20 |
- REST API endpoints
|
| 21 |
- FastAPI backend with real-time transcription
|
| 22 |
-
-
|
|
|
|
| 23 |
|
| 24 |
## Endpoints
|
| 25 |
- `/` - Web interface for testing
|
|
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
| 8 |
+
hardware: l4
|
| 9 |
sleep_time_timeout: 1800
|
| 10 |
suggested_storage: small
|
| 11 |
pinned: false
|
|
|
|
| 13 |
|
| 14 |
# STT GPU Service Python v4
|
| 15 |
|
| 16 |
+
Real-time WebSocket STT streaming service using full Moshi model.
|
| 17 |
|
| 18 |
## Features
|
| 19 |
- WebSocket streaming (80ms chunks at 24kHz)
|
| 20 |
- REST API endpoints
|
| 21 |
- FastAPI backend with real-time transcription
|
| 22 |
+
- L4 GPU acceleration (30GB VRAM)
|
| 23 |
+
- Full Moshi model for high-quality STT
|
| 24 |
|
| 25 |
## Endpoints
|
| 26 |
- `/` - Web interface for testing
|
app.py
CHANGED
|
@@ -21,7 +21,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
|
|
| 21 |
import uvicorn
|
| 22 |
|
| 23 |
# Version tracking
|
| 24 |
-
VERSION = "
|
| 25 |
COMMIT_SHA = "TBD"
|
| 26 |
|
| 27 |
# Configure logging
|
|
@@ -58,44 +58,42 @@ async def load_moshi_models():
|
|
| 58 |
from huggingface_hub import hf_hub_download
|
| 59 |
from moshi.models import loaders, LMGen
|
| 60 |
|
| 61 |
-
# Load Mimi (audio codec) - using
|
| 62 |
-
logger.info("Loading Mimi audio codec
|
| 63 |
-
|
| 64 |
-
MOSHIKO_REPO = "kyutai/moshiko-pytorch-bf16"
|
| 65 |
-
mimi_weight = hf_hub_download(MOSHIKO_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
|
| 66 |
mimi = loaders.get_mimi(mimi_weight, device=device)
|
| 67 |
mimi.set_num_codebooks(8) # Limited to 8 for Moshi
|
| 68 |
-
logger.info("✅ Mimi loaded successfully
|
| 69 |
|
| 70 |
# Clear cache after Mimi loading
|
| 71 |
if device == "cuda":
|
| 72 |
torch.cuda.empty_cache()
|
| 73 |
logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
| 74 |
|
| 75 |
-
# Load
|
| 76 |
-
logger.info("Loading
|
| 77 |
-
moshi_weight = hf_hub_download(
|
| 78 |
|
| 79 |
# Try loading with memory-efficient settings
|
| 80 |
try:
|
| 81 |
moshi = loaders.get_moshi_lm(moshi_weight, device=device)
|
| 82 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 83 |
-
logger.info("✅
|
| 84 |
except RuntimeError as cuda_error:
|
| 85 |
if "CUDA out of memory" in str(cuda_error):
|
| 86 |
-
logger.warning(f"
|
| 87 |
# Move Mimi to CPU as well for consistency
|
| 88 |
mimi = loaders.get_mimi(mimi_weight, device="cpu")
|
| 89 |
mimi.set_num_codebooks(8)
|
| 90 |
device = "cpu"
|
| 91 |
moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
|
| 92 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 93 |
-
logger.info("✅
|
| 94 |
logger.info("✅ Mimi also moved to CPU for device consistency")
|
| 95 |
else:
|
| 96 |
raise
|
| 97 |
|
| 98 |
-
logger.info("🎉 All
|
| 99 |
return True
|
| 100 |
|
| 101 |
except ImportError as import_error:
|
|
@@ -127,7 +125,7 @@ def transcribe_audio_moshi(audio_data: np.ndarray, sample_rate: int = 24000) ->
|
|
| 127 |
|
| 128 |
if mimi == "mock":
|
| 129 |
duration = len(audio_data) / sample_rate
|
| 130 |
-
return f"Mock
|
| 131 |
|
| 132 |
# Ensure 24kHz audio for Moshi
|
| 133 |
if sample_rate != 24000:
|
|
@@ -232,8 +230,8 @@ async def lifespan(app: FastAPI):
|
|
| 232 |
|
| 233 |
# FastAPI app with lifespan
|
| 234 |
app = FastAPI(
|
| 235 |
-
title="STT GPU Service Python v4 -
|
| 236 |
-
description="Real-time WebSocket STT streaming with
|
| 237 |
version=VERSION,
|
| 238 |
lifespan=lifespan
|
| 239 |
)
|
|
@@ -246,7 +244,7 @@ async def health_check():
|
|
| 246 |
"timestamp": time.time(),
|
| 247 |
"version": VERSION,
|
| 248 |
"commit_sha": COMMIT_SHA,
|
| 249 |
-
"message": "
|
| 250 |
"space_name": "stt-gpu-service-python-v4",
|
| 251 |
"mimi_loaded": mimi is not None and mimi != "mock",
|
| 252 |
"moshi_loaded": moshi is not None and moshi != "mock",
|
|
|
|
| 21 |
import uvicorn
|
| 22 |
|
| 23 |
# Version tracking
|
| 24 |
+
VERSION = "2.0.0"
|
| 25 |
COMMIT_SHA = "TBD"
|
| 26 |
|
| 27 |
# Configure logging
|
|
|
|
| 58 |
from huggingface_hub import hf_hub_download
|
| 59 |
from moshi.models import loaders, LMGen
|
| 60 |
|
| 61 |
+
# Load Mimi (audio codec) - using full Moshi model
|
| 62 |
+
logger.info("Loading Mimi audio codec...")
|
| 63 |
+
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
|
|
|
|
|
|
|
| 64 |
mimi = loaders.get_mimi(mimi_weight, device=device)
|
| 65 |
mimi.set_num_codebooks(8) # Limited to 8 for Moshi
|
| 66 |
+
logger.info("✅ Mimi loaded successfully")
|
| 67 |
|
| 68 |
# Clear cache after Mimi loading
|
| 69 |
if device == "cuda":
|
| 70 |
torch.cuda.empty_cache()
|
| 71 |
logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
|
| 72 |
|
| 73 |
+
# Load Moshi (full language model)
|
| 74 |
+
logger.info("Loading Moshi language model...")
|
| 75 |
+
moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
|
| 76 |
|
| 77 |
# Try loading with memory-efficient settings
|
| 78 |
try:
|
| 79 |
moshi = loaders.get_moshi_lm(moshi_weight, device=device)
|
| 80 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 81 |
+
logger.info("✅ Moshi loaded successfully on GPU")
|
| 82 |
except RuntimeError as cuda_error:
|
| 83 |
if "CUDA out of memory" in str(cuda_error):
|
| 84 |
+
logger.warning(f"Moshi CUDA out of memory, trying CPU fallback: {cuda_error}")
|
| 85 |
# Move Mimi to CPU as well for consistency
|
| 86 |
mimi = loaders.get_mimi(mimi_weight, device="cpu")
|
| 87 |
mimi.set_num_codebooks(8)
|
| 88 |
device = "cpu"
|
| 89 |
moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
|
| 90 |
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
|
| 91 |
+
logger.info("✅ Moshi loaded successfully on CPU (fallback)")
|
| 92 |
logger.info("✅ Mimi also moved to CPU for device consistency")
|
| 93 |
else:
|
| 94 |
raise
|
| 95 |
|
| 96 |
+
logger.info("🎉 All Moshi models loaded successfully!")
|
| 97 |
return True
|
| 98 |
|
| 99 |
except ImportError as import_error:
|
|
|
|
| 125 |
|
| 126 |
if mimi == "mock":
|
| 127 |
duration = len(audio_data) / sample_rate
|
| 128 |
+
return f"Mock Moshi STT: {duration:.2f}s audio at {sample_rate}Hz"
|
| 129 |
|
| 130 |
# Ensure 24kHz audio for Moshi
|
| 131 |
if sample_rate != 24000:
|
|
|
|
| 230 |
|
| 231 |
# FastAPI app with lifespan
|
| 232 |
app = FastAPI(
|
| 233 |
+
title="STT GPU Service Python v4 - Full Moshi Model",
|
| 234 |
+
description="Real-time WebSocket STT streaming with full Moshi PyTorch implementation (L4 GPU with 30GB VRAM)",
|
| 235 |
version=VERSION,
|
| 236 |
lifespan=lifespan
|
| 237 |
)
|
|
|
|
| 244 |
"timestamp": time.time(),
|
| 245 |
"version": VERSION,
|
| 246 |
"commit_sha": COMMIT_SHA,
|
| 247 |
+
"message": "Moshi STT WebSocket Service - Full model on L4 GPU",
|
| 248 |
"space_name": "stt-gpu-service-python-v4",
|
| 249 |
"mimi_loaded": mimi is not None and mimi != "mock",
|
| 250 |
"moshi_loaded": moshi is not None and moshi != "mock",
|