pgits committed on
Commit
4020b5c
·
verified ·
1 Parent(s): 6ba8fa9

Fix v1.1.1: app.py - transformers>=4.53.0 + 24kHz audio support

Browse files
Files changed (1) hide show
  1. app.py +29 -16
app.py CHANGED
@@ -14,7 +14,7 @@ from fastapi.responses import HTMLResponse
14
  import uvicorn
15
 
16
  # Version tracking
17
- VERSION = "1.1.0"
18
  COMMIT_SHA = "TBD"
19
 
20
  # Configure logging
@@ -40,9 +40,13 @@ async def load_model():
40
  from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
41
  model_id = "kyutai/stt-1b-en_fr"
42
 
 
43
  processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
 
 
44
  model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id).to(device)
45
- logger.info(f"Model {model_id} loaded successfully")
 
46
 
47
  except Exception as model_error:
48
  logger.warning(f"Could not load actual model: {model_error}")
@@ -55,15 +59,20 @@ async def load_model():
55
  model = "mock"
56
  processor = "mock"
57
 
58
- def transcribe_audio(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
59
- """Transcribe audio data"""
60
  try:
61
  if model == "mock":
62
  # Mock transcription for development
63
- return f"Mock transcription: {len(audio_data)} samples at {sample_rate}Hz"
 
 
 
 
 
 
64
 
65
- # Real transcription
66
- inputs = processor(audio_data, sampling_rate=sample_rate, return_tensors="pt")
67
  inputs = {k: v.to(device) for k, v in inputs.items()}
68
 
69
  with torch.no_grad():
@@ -79,7 +88,7 @@ def transcribe_audio(audio_data: np.ndarray, sample_rate: int = 16000) -> str:
79
  # FastAPI app
80
  app = FastAPI(
81
  title="STT GPU Service Python v4",
82
- description="Real-time WebSocket STT streaming with kyutai/stt-1b-en_fr",
83
  version=VERSION
84
  )
85
 
@@ -99,7 +108,8 @@ async def health_check():
99
  "message": "STT WebSocket Service - Real-time streaming ready",
100
  "space_name": "stt-gpu-service-python-v4",
101
  "model_loaded": model is not None,
102
- "device": str(device) if device else "unknown"
 
103
  }
104
 
105
  @app.get("/", response_class=HTMLResponse)
@@ -123,13 +133,14 @@ async def get_index():
123
  <body>
124
  <div class="container">
125
  <h1>🎙️ STT GPU Service Python v4</h1>
126
- <p>Real-time WebSocket speech transcription service</p>
127
 
128
  <div class="status">
129
  <h3>WebSocket Streaming Test</h3>
130
  <button onclick="startWebSocket()">Connect WebSocket</button>
131
  <button onclick="stopWebSocket()" disabled id="stopBtn">Disconnect</button>
132
  <p>Status: <span id="wsStatus">Disconnected</span></p>
 
133
  </div>
134
 
135
  <div id="output">
@@ -158,7 +169,7 @@ async def get_index():
158
  // Send test message
159
  ws.send(JSON.stringify({{
160
  type: 'audio_chunk',
161
- data: 'test_audio_data',
162
  timestamp: Date.now()
163
  }}));
164
  }};
@@ -203,7 +214,8 @@ async def websocket_endpoint(websocket: WebSocket):
203
  "status": "connected",
204
  "message": "STT WebSocket ready for audio chunks",
205
  "chunk_size_ms": 80,
206
- "expected_sample_rate": 16000
 
207
  })
208
 
209
  while True:
@@ -212,15 +224,15 @@ async def websocket_endpoint(websocket: WebSocket):
212
 
213
  if data.get("type") == "audio_chunk":
214
  try:
215
- # Process 80ms audio chunk
216
  # In real implementation, you would:
217
  # 1. Decode base64 audio data
218
- # 2. Convert to numpy array
219
  # 3. Process with STT model
220
  # 4. Return transcription
221
 
222
  # For now, mock processing
223
- transcription = f"Mock transcription for chunk at {data.get('timestamp', 'unknown')}"
224
 
225
  # Send transcription result
226
  await websocket.send_json({
@@ -262,7 +274,8 @@ async def api_transcribe(audio_file: Optional[str] = None):
262
  "transcription": f"REST API transcription result for: {audio_file[:50]}...",
263
  "timestamp": time.time(),
264
  "version": VERSION,
265
- "method": "REST"
 
266
  }
267
 
268
  return result
 
14
  import uvicorn
15
 
16
  # Version tracking
17
+ VERSION = "1.1.1"
18
  COMMIT_SHA = "TBD"
19
 
20
  # Configure logging
 
40
  from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
41
  model_id = "kyutai/stt-1b-en_fr"
42
 
43
+ logger.info(f"Loading processor from {model_id}...")
44
  processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
45
+
46
+ logger.info(f"Loading model from {model_id}...")
47
  model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id).to(device)
48
+
49
+ logger.info(f"Model {model_id} loaded successfully on {device}")
50
 
51
  except Exception as model_error:
52
  logger.warning(f"Could not load actual model: {model_error}")
 
59
  model = "mock"
60
  processor = "mock"
61
 
62
+ def transcribe_audio(audio_data: np.ndarray, sample_rate: int = 24000) -> str:
63
+ """Transcribe audio data - expects 24kHz audio for Kyutai STT"""
64
  try:
65
  if model == "mock":
66
  # Mock transcription for development
67
+ duration = len(audio_data) / sample_rate
68
+ return f"Mock transcription: {duration:.2f}s audio at {sample_rate}Hz ({len(audio_data)} samples)"
69
+
70
+ # Real transcription - Kyutai STT expects 24kHz
71
+ if sample_rate != 24000:
72
+ logger.info(f"Resampling from {sample_rate}Hz to 24000Hz")
73
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=24000)
74
 
75
+ inputs = processor(audio_data, sampling_rate=24000, return_tensors="pt")
 
76
  inputs = {k: v.to(device) for k, v in inputs.items()}
77
 
78
  with torch.no_grad():
 
88
  # FastAPI app
89
  app = FastAPI(
90
  title="STT GPU Service Python v4",
91
+ description="Real-time WebSocket STT streaming with kyutai/stt-1b-en_fr (24kHz)",
92
  version=VERSION
93
  )
94
 
 
108
  "message": "STT WebSocket Service - Real-time streaming ready",
109
  "space_name": "stt-gpu-service-python-v4",
110
  "model_loaded": model is not None,
111
+ "device": str(device) if device else "unknown",
112
+ "expected_sample_rate": "24000Hz"
113
  }
114
 
115
  @app.get("/", response_class=HTMLResponse)
 
133
  <body>
134
  <div class="container">
135
  <h1>🎙️ STT GPU Service Python v4</h1>
136
+ <p>Real-time WebSocket speech transcription service (24kHz audio)</p>
137
 
138
  <div class="status">
139
  <h3>WebSocket Streaming Test</h3>
140
  <button onclick="startWebSocket()">Connect WebSocket</button>
141
  <button onclick="stopWebSocket()" disabled id="stopBtn">Disconnect</button>
142
  <p>Status: <span id="wsStatus">Disconnected</span></p>
143
+ <p><small>Expected: 24kHz audio chunks (80ms = ~1920 samples)</small></p>
144
  </div>
145
 
146
  <div id="output">
 
169
  // Send test message
170
  ws.send(JSON.stringify({{
171
  type: 'audio_chunk',
172
+ data: 'test_audio_data_24khz',
173
  timestamp: Date.now()
174
  }}));
175
  }};
 
214
  "status": "connected",
215
  "message": "STT WebSocket ready for audio chunks",
216
  "chunk_size_ms": 80,
217
+ "expected_sample_rate": 24000,
218
+ "expected_chunk_samples": 1920 # 80ms at 24kHz = 1920 samples
219
  })
220
 
221
  while True:
 
224
 
225
  if data.get("type") == "audio_chunk":
226
  try:
227
+ # Process 80ms audio chunk (1920 samples at 24kHz)
228
  # In real implementation, you would:
229
  # 1. Decode base64 audio data
230
+ # 2. Convert to numpy array (24kHz)
231
  # 3. Process with STT model
232
  # 4. Return transcription
233
 
234
  # For now, mock processing
235
+ transcription = f"Mock transcription for 24kHz chunk at {data.get('timestamp', 'unknown')}"
236
 
237
  # Send transcription result
238
  await websocket.send_json({
 
274
  "transcription": f"REST API transcription result for: {audio_file[:50]}...",
275
  "timestamp": time.time(),
276
  "version": VERSION,
277
+ "method": "REST",
278
+ "expected_sample_rate": "24kHz"
279
  }
280
 
281
  return result