Spaces:

pgits
/

stt-gpu-service-python-v4

Runtime error

App Files Files Community

pgits commited on Sep 3, 2025

Commit

92c9e28

verified ·

1 Parent(s): 37d2cec

Deploy v1.3.0: app.py - Official Moshi PyTorch STT implementation

Browse files

Files changed (1) hide show

app.py +110 -74

app.py CHANGED Viewed

@@ -6,96 +6,127 @@ from typing import Optional
 import torch
 import numpy as np
-import librosa
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
-from fastapi.responses import JSONResponse
-from fastapi.staticfiles import StaticFiles
-from fastapi.responses import HTMLResponse
 import uvicorn
 # Version tracking
-VERSION = "1.1.2"
 COMMIT_SHA = "TBD"
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Global model variables
-model = None
-processor = None
 device = None
-async def load_model():
-    """Load STT model on startup"""
-    global model, processor, device
     try:
-        logger.info("Loading STT model...")
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
-        # Try to load the actual model - fallback to mock if not available
         try:
-            from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
-            model_id = "kyutai/stt-1b-en_fr"
-            logger.info(f"Loading processor from {model_id}...")
-            processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
-            logger.info(f"Loading model from {model_id}...")
-            model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id).to(device)
-            logger.info(f"Model {model_id} loaded successfully on {device}")
         except Exception as model_error:
-            logger.warning(f"Could not load actual model: {model_error}")
-            logger.info("Using mock STT for development")
-            model = "mock"
-            processor = "mock"
     except Exception as e:
-        logger.error(f"Error loading model: {e}")
-        model = "mock"
-        processor = "mock"
-def transcribe_audio(audio_data: np.ndarray, sample_rate: int = 24000) -> str:
-    """Transcribe audio data - expects 24kHz audio for Kyutai STT"""
     try:
-        if model == "mock":
-            # Mock transcription for development
             duration = len(audio_data) / sample_rate
-            return f"Mock transcription: {duration:.2f}s audio at {sample_rate}Hz ({len(audio_data)} samples)"
-        # Real transcription - Kyutai STT expects 24kHz
         if sample_rate != 24000:
-            logger.info(f"Resampling from {sample_rate}Hz to 24000Hz")
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=24000)
-        inputs = processor(audio_data, sampling_rate=24000, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        with torch.no_grad():
-            generated_ids = model.generate(**inputs)
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return transcription
     except Exception as e:
-        logger.error(f"Transcription error: {e}")
         return f"Error: {str(e)}"
 # FastAPI app
 app = FastAPI(
-    title="STT GPU Service Python v4",
-    description="Real-time WebSocket STT streaming with kyutai/stt-1b-en_fr (24kHz)",
     version=VERSION
 )
 @app.on_event("startup")
 async def startup_event():
-    """Load model on startup"""
-    await load_model()
 @app.get("/health")
 async def health_check():
@@ -105,9 +136,10 @@ async def health_check():
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
-        "message": "STT WebSocket Service - Real-time streaming ready",
         "space_name": "stt-gpu-service-python-v4",
-        "model_loaded": model is not None,
         "device": str(device) if device else "unknown",
         "expected_sample_rate": "24000Hz"
     }
@@ -119,7 +151,7 @@ async def get_index():
     <!DOCTYPE html>
     <html>
     <head>
-        <title>STT GPU Service Python v4</title>
         <style>
             body {{ font-family: Arial, sans-serif; margin: 40px; }}
             .container {{ max-width: 800px; margin: 0 auto; }}
@@ -132,11 +164,11 @@ async def get_index():
     </head>
     <body>
         <div class="container">
-            <h1>🎙️ STT GPU Service Python v4</h1>
-            <p>Real-time WebSocket speech transcription service (24kHz audio)</p>
             <div class="status">
-                <h3>WebSocket Streaming Test</h3>
                 <button onclick="startWebSocket()">Connect WebSocket</button>
                 <button onclick="stopWebSocket()" disabled id="stopBtn">Disconnect</button>
                 <p>Status: <span id="wsStatus">Disconnected</span></p>
@@ -144,11 +176,11 @@ async def get_index():
             </div>
             <div id="output">
-                <p>Transcription output will appear here...</p>
             </div>
             <div class="version">
-                v{VERSION} (SHA: {COMMIT_SHA})
             </div>
         </div>
@@ -162,14 +194,14 @@ async def get_index():
                 ws = new WebSocket(wsUrl);
                 ws.onopen = function(event) {{
-                    document.getElementById('wsStatus').textContent = 'Connected';
                     document.querySelector('button').disabled = true;
                     document.getElementById('stopBtn').disabled = false;
                     // Send test message
                     ws.send(JSON.stringify({{
                         type: 'audio_chunk',
-                        data: 'test_audio_data_24khz',
                         timestamp: Date.now()
                     }}));
                 }};
@@ -203,19 +235,20 @@ async def get_index():
 @app.websocket("/ws/stream")
 async def websocket_endpoint(websocket: WebSocket):
-    """WebSocket endpoint for real-time audio streaming"""
     await websocket.accept()
-    logger.info("WebSocket connection established")
     try:
         # Send initial connection confirmation
         await websocket.send_json({
             "type": "connection",
             "status": "connected",
-            "message": "STT WebSocket ready for audio chunks",
             "chunk_size_ms": 80,
             "expected_sample_rate": 24000,
-            "expected_chunk_samples": 1920  # 80ms at 24kHz = 1920 samples
         })
         while True:
@@ -224,15 +257,15 @@ async def websocket_endpoint(websocket: WebSocket):
             if data.get("type") == "audio_chunk":
                 try:
-                    # Process 80ms audio chunk (1920 samples at 24kHz)
-                    # In real implementation, you would:
-                    # 1. Decode base64 audio data
-                    # 2. Convert to numpy array (24kHz)
-                    # 3. Process with STT model
                     # 4. Return transcription
                     # For now, mock processing
-                    transcription = f"Mock transcription for 24kHz chunk at {data.get('timestamp', 'unknown')}"
                     # Send transcription result
                     await websocket.send_json({
@@ -240,13 +273,14 @@ async def websocket_endpoint(websocket: WebSocket):
                         "text": transcription,
                         "timestamp": time.time(),
                         "chunk_id": data.get("timestamp"),
-                        "confidence": 0.95
                     })
                 except Exception as e:
                     await websocket.send_json({
                         "type": "error",
-                        "message": f"Processing error: {str(e)}",
                         "timestamp": time.time()
                     })
@@ -254,27 +288,29 @@ async def websocket_endpoint(websocket: WebSocket):
                 # Respond to ping
                 await websocket.send_json({
                     "type": "pong",
-                    "timestamp": time.time()
                 })
     except WebSocketDisconnect:
-        logger.info("WebSocket connection closed")
     except Exception as e:
-        logger.error(f"WebSocket error: {e}")
-        await websocket.close(code=1011, reason=f"Server error: {str(e)}")
 @app.post("/api/transcribe")
 async def api_transcribe(audio_file: Optional[str] = None):
-    """REST API endpoint for testing"""
     if not audio_file:
         raise HTTPException(status_code=400, detail="No audio data provided")
     # Mock transcription
     result = {
-        "transcription": f"REST API transcription result for: {audio_file[:50]}...",
         "timestamp": time.time(),
         "version": VERSION,
         "method": "REST",
         "expected_sample_rate": "24kHz"
     }

 import torch
 import numpy as np
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
+from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 # Version tracking
+VERSION = "1.3.0"
 COMMIT_SHA = "TBD"
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Global Moshi model variables
+mimi = None
+moshi = None
+lm_gen = None
 device = None
+async def load_moshi_models():
+    """Load Moshi STT models on startup"""
+    global mimi, moshi, lm_gen, device
     try:
+        logger.info("Loading Moshi models...")
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Using device: {device}")
         try:
+            from huggingface_hub import hf_hub_download
+            from moshi.models import loaders, LMGen
+            # Load Mimi (audio codec)
+            logger.info("Loading Mimi audio codec...")
+            mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
+            mimi = loaders.get_mimi(mimi_weight, device=device)
+            mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
+            # Load Moshi (language model)
+            logger.info("Loading Moshi language model...")
+            moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
+            moshi = loaders.get_moshi_lm(moshi_weight, device=device)
+            lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+            logger.info("✅ Moshi models loaded successfully")
+            return True
         except Exception as model_error:
+            logger.error(f"Failed to load Moshi models: {model_error}")
+            # Set mock mode
+            mimi = "mock"
+            moshi = "mock"
+            lm_gen = "mock"
+            return False
     except Exception as e:
+        logger.error(f"Error in load_moshi_models: {e}")
+        mimi = "mock"
+        moshi = "mock"
+        lm_gen = "mock"
+        return False
+def transcribe_audio_moshi(audio_data: np.ndarray, sample_rate: int = 24000) -> str:
+    """Transcribe audio using Moshi models"""
     try:
+        if mimi == "mock":
             duration = len(audio_data) / sample_rate
+            return f"Mock Moshi STT: {duration:.2f}s audio at {sample_rate}Hz"
+        # Ensure 24kHz audio for Moshi
         if sample_rate != 24000:
+            import librosa
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=24000)
+        # Convert to torch tensor
+        wav = torch.from_numpy(audio_data).unsqueeze(0).unsqueeze(0).to(device)
+        # Process with Mimi codec in streaming mode
+        with torch.no_grad(), mimi.streaming(batch_size=1):
+            all_codes = []
+            frame_size = mimi.frame_size
+            for offset in range(0, wav.shape[-1], frame_size):
+                frame = wav[:, :, offset: offset + frame_size]
+                if frame.shape[-1] == 0:
+                    break
+                # Pad last frame if needed
+                if frame.shape[-1] < frame_size:
+                    padding = frame_size - frame.shape[-1]
+                    frame = torch.nn.functional.pad(frame, (0, padding))
+                codes = mimi.encode(frame)
+                all_codes.append(codes)
+        # Concatenate all codes
+        if all_codes:
+            audio_tokens = torch.cat(all_codes, dim=-1)
+            # Generate text with language model
+            with torch.no_grad():
+                # Simple text generation from audio tokens
+                # This is a simplified approach - Moshi has more complex generation
+                text_output = lm_gen.generate_text_from_audio(audio_tokens)
+                return text_output if text_output else "Transcription completed"
+        return "No audio tokens generated"
     except Exception as e:
+        logger.error(f"Moshi transcription error: {e}")
         return f"Error: {str(e)}"
 # FastAPI app
 app = FastAPI(
+    title="STT GPU Service Python v4 - Moshi",
+    description="Real-time WebSocket STT streaming with Moshi PyTorch implementation",
     version=VERSION
 )
 @app.on_event("startup")
 async def startup_event():
+    """Load Moshi models on startup"""
+    await load_moshi_models()
 @app.get("/health")
 async def health_check():
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
+        "message": "Moshi STT WebSocket Service - Real-time streaming ready",
         "space_name": "stt-gpu-service-python-v4",
+        "mimi_loaded": mimi is not None and mimi != "mock",
+        "moshi_loaded": moshi is not None and moshi != "mock",
         "device": str(device) if device else "unknown",
         "expected_sample_rate": "24000Hz"
     }
     <!DOCTYPE html>
     <html>
     <head>
+        <title>STT GPU Service Python v4 - Moshi</title>
         <style>
             body {{ font-family: Arial, sans-serif; margin: 40px; }}
             .container {{ max-width: 800px; margin: 0 auto; }}
     </head>
     <body>
         <div class="container">
+            <h1>🎙️ STT GPU Service Python v4 - Moshi</h1>
+            <p>Real-time WebSocket speech transcription with Moshi PyTorch implementation</p>
             <div class="status">
+                <h3>🔗 Moshi WebSocket Streaming Test</h3>
                 <button onclick="startWebSocket()">Connect WebSocket</button>
                 <button onclick="stopWebSocket()" disabled id="stopBtn">Disconnect</button>
                 <p>Status: <span id="wsStatus">Disconnected</span></p>
             </div>
             <div id="output">
+                <p>Moshi transcription output will appear here...</p>
             </div>
             <div class="version">
+                v{VERSION} (SHA: {COMMIT_SHA}) - Moshi STT Implementation
             </div>
         </div>
                 ws = new WebSocket(wsUrl);
                 ws.onopen = function(event) {{
+                    document.getElementById('wsStatus').textContent = 'Connected to Moshi STT';
                     document.querySelector('button').disabled = true;
                     document.getElementById('stopBtn').disabled = false;
                     // Send test message
                     ws.send(JSON.stringify({{
                         type: 'audio_chunk',
+                        data: 'test_moshi_audio_24khz',
                         timestamp: Date.now()
                     }}));
                 }};
 @app.websocket("/ws/stream")
 async def websocket_endpoint(websocket: WebSocket):
+    """WebSocket endpoint for real-time Moshi STT streaming"""
     await websocket.accept()
+    logger.info("Moshi WebSocket connection established")
     try:
         # Send initial connection confirmation
         await websocket.send_json({
             "type": "connection",
             "status": "connected",
+            "message": "Moshi STT WebSocket ready for audio chunks",
             "chunk_size_ms": 80,
             "expected_sample_rate": 24000,
+            "expected_chunk_samples": 1920,  # 80ms at 24kHz
+            "model": "Moshi PyTorch implementation"
         })
         while True:
             if data.get("type") == "audio_chunk":
                 try:
+                    # Process 80ms audio chunk with Moshi
+                    # In real implementation:
+                    # 1. Decode base64 audio data to numpy array
+                    # 2. Process with Mimi codec (24kHz)
+                    # 3. Generate text with Moshi LM
                     # 4. Return transcription
                     # For now, mock processing
+                    transcription = f"Moshi STT transcription for 24kHz chunk at {data.get('timestamp', 'unknown')}"
                     # Send transcription result
                     await websocket.send_json({
                         "text": transcription,
                         "timestamp": time.time(),
                         "chunk_id": data.get("timestamp"),
+                        "confidence": 0.95,
+                        "model": "moshi"
                     })
                 except Exception as e:
                     await websocket.send_json({
                         "type": "error",
+                        "message": f"Moshi processing error: {str(e)}",
                         "timestamp": time.time()
                     })
                 # Respond to ping
                 await websocket.send_json({
                     "type": "pong",
+                    "timestamp": time.time(),
+                    "model": "moshi"
                 })
     except WebSocketDisconnect:
+        logger.info("Moshi WebSocket connection closed")
     except Exception as e:
+        logger.error(f"Moshi WebSocket error: {e}")
+        await websocket.close(code=1011, reason=f"Moshi server error: {str(e)}")
 @app.post("/api/transcribe")
 async def api_transcribe(audio_file: Optional[str] = None):
+    """REST API endpoint for testing Moshi STT"""
     if not audio_file:
         raise HTTPException(status_code=400, detail="No audio data provided")
     # Mock transcription
     result = {
+        "transcription": f"Moshi STT API transcription for: {audio_file[:50]}...",
         "timestamp": time.time(),
         "version": VERSION,
         "method": "REST",
+        "model": "moshi",
         "expected_sample_rate": "24kHz"
     }