Peter Michael Gits Claude committed on
Commit
ee64ed2
·
1 Parent(s): 1111d6d

feat: Implement hybrid Gradio+FastAPI WebSocket service for ZeroGPU compatibility

Browse files

- Use gr.mount_gradio_app() for proper WebSocket routing (fixes 404 WebSocket errors)
- Create minimal Gradio interface for HF Spaces compliance with ZeroGPU
- Mount FastAPI WebSocket endpoints at /ws/stt using official mounting approach
- Maintain ZeroGPU compatibility with @spaces.GPU decorators on global functions
- Add CORS middleware for WebRTC connectivity
- Implement WebSocket connection tracking and message handling
- Remove complex lifecycle management (let Gradio handle queue management)
- Based on research of HF Spaces best practices and known WebSocket fixes

Architecture: HF Spaces (Gradio SDK) → gr.mount_gradio_app() → FastAPI WebSocket

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +138 -166
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- Standalone WebSocket-only STT Service
4
- Simplified service without Gradio, MCP, or web interfaces
5
  Following unmute.sh WebRTC pattern for HuggingFace Spaces
6
  """
7
 
@@ -14,7 +14,6 @@ import os
14
  import logging
15
  from datetime import datetime
16
  from typing import Optional, Dict, Any
17
- from contextlib import asynccontextmanager
18
  import torch
19
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
20
  import torchaudio
@@ -22,8 +21,8 @@ import soundfile as sf
22
  import numpy as np
23
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
24
  from fastapi.middleware.cors import CORSMiddleware
 
25
  import spaces
26
- import uvicorn
27
 
28
  # Configure logging
29
  logging.basicConfig(level=logging.INFO)
@@ -115,142 +114,44 @@ def transcribe_audio_zerogpu(
115
  logger.error(f"Transcription error: {str(e)}")
116
  return "", "error", {"error": str(e)}
117
 
118
- class STTWebSocketService:
119
- """Standalone STT service with WebSocket-only interface"""
120
-
121
- def __init__(self):
122
- self.active_connections: Dict[str, WebSocket] = {}
123
-
124
- logger.info(f"🎤 {__service__} v{__version__} initializing...")
125
- logger.info(f"Device: {device}")
126
- logger.info(f"Model: whisper-{model_size}")
127
-
128
- async def load_model(self):
129
- """Load Whisper model with ZeroGPU compatibility - delegates to global function"""
130
- global model
131
- if model is None:
132
- # Trigger model loading by calling the ZeroGPU function with a dummy path
133
- # The actual loading will happen on first real transcription
134
- logger.info("Model will be loaded on first transcription request...")
135
- else:
136
- logger.info("✅ Model already loaded")
137
-
138
- async def connect_websocket(self, websocket: WebSocket) -> str:
139
- """Accept WebSocket connection and return client ID"""
140
- client_id = str(uuid.uuid4())
141
- await websocket.accept()
142
- self.active_connections[client_id] = websocket
143
-
144
- # Send connection confirmation
145
- await websocket.send_text(json.dumps({
146
- "type": "stt_connection_confirmed",
147
- "client_id": client_id,
148
- "service": __service__,
149
- "version": __version__,
150
- "model": f"whisper-{model_size}",
151
- "device": device,
152
- "message": "STT WebSocket connected and ready"
153
- }))
154
-
155
- logger.info(f"Client {client_id} connected")
156
- return client_id
157
-
158
- async def disconnect_websocket(self, client_id: str):
159
- """Clean up WebSocket connection"""
160
- if client_id in self.active_connections:
161
- del self.active_connections[client_id]
162
- logger.info(f"Client {client_id} disconnected")
163
-
164
- async def process_audio_message(self, client_id: str, message: Dict[str, Any]):
165
- """Process incoming audio data from WebSocket"""
166
- try:
167
- websocket = self.active_connections[client_id]
168
-
169
- # Extract audio data (base64 encoded)
170
- audio_data_b64 = message.get("audio_data")
171
- if not audio_data_b64:
172
- await websocket.send_text(json.dumps({
173
- "type": "stt_transcription_error",
174
- "client_id": client_id,
175
- "error": "No audio data provided"
176
- }))
177
- return
178
-
179
- # Decode base64 audio
180
- audio_bytes = base64.b64decode(audio_data_b64)
181
-
182
- # Save to temporary file
183
- with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp_file:
184
- tmp_file.write(audio_bytes)
185
- temp_path = tmp_file.name
186
-
187
- try:
188
- # Transcribe audio using global ZeroGPU function
189
- transcription, status, timing = transcribe_audio_zerogpu(
190
- temp_path,
191
- message.get("language", "auto"),
192
- message.get("model_size", model_size)
193
- )
194
-
195
- # Send result back
196
- if status == "success" and transcription:
197
- await websocket.send_text(json.dumps({
198
- "type": "stt_transcription_complete",
199
- "client_id": client_id,
200
- "transcription": transcription,
201
- "timing": timing,
202
- "status": "success"
203
- }))
204
- else:
205
- await websocket.send_text(json.dumps({
206
- "type": "stt_transcription_error",
207
- "client_id": client_id,
208
- "error": "Transcription failed or empty result",
209
- "timing": timing
210
- }))
211
-
212
- finally:
213
- # Clean up temp file
214
- if os.path.exists(temp_path):
215
- os.unlink(temp_path)
216
-
217
- except Exception as e:
218
- logger.error(f"Error processing audio for {client_id}: {str(e)}")
219
- if client_id in self.active_connections:
220
- websocket = self.active_connections[client_id]
221
- await websocket.send_text(json.dumps({
222
- "type": "stt_transcription_error",
223
- "client_id": client_id,
224
- "error": f"Processing error: {str(e)}"
225
- }))
226
 
227
- # Initialize service
228
- stt_service = STTWebSocketService()
 
 
 
229
 
230
- @asynccontextmanager
231
- async def lifespan(app: FastAPI):
232
- """Lifespan event handler for FastAPI app startup/shutdown"""
233
- # Startup
234
- logger.info(f"🚀 {__service__} v{__version__} starting...")
235
- logger.info("Pre-loading Whisper model for optimal performance...")
236
- await stt_service.load_model()
237
- logger.info("✅ Service ready for WebSocket connections")
238
-
239
- yield
240
-
241
- # Shutdown
242
- logger.info("🛑 STT WebSocket Service shutting down...")
243
 
244
- # Create FastAPI app with lifespan
245
- app = FastAPI(
246
- title="STT WebSocket Service",
247
- description="Standalone WebSocket-only Speech-to-Text service",
248
- version=__version__,
249
- lifespan=lifespan
 
 
 
250
  )
251
 
252
- # Add CORS middleware
253
- app.add_middleware(
 
 
 
 
 
 
254
  CORSMiddleware,
255
  allow_origins=["*"],
256
  allow_credentials=True,
@@ -258,42 +159,116 @@ app.add_middleware(
258
  allow_headers=["*"],
259
  )
260
 
261
- @app.get("/")
262
- async def root():
263
- """Health check endpoint"""
264
- return {
265
- "service": __service__,
266
- "version": __version__,
267
- "status": "ready",
268
- "endpoints": {
269
- "websocket": "/ws/stt",
270
- "health": "/health"
271
- },
272
- "model": f"whisper-{model_size}",
273
- "device": device
274
- }
275
-
276
- @app.get("/health")
277
  async def health_check():
278
- """Detailed health check"""
279
  return {
280
  "service": __service__,
281
  "version": __version__,
282
  "status": "healthy",
283
  "model_loaded": model is not None,
284
- "active_connections": len(stt_service.active_connections),
285
  "device": device,
286
  "timestamp": datetime.now().isoformat()
287
  }
288
 
289
- @app.websocket("/ws/stt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  async def websocket_stt_endpoint(websocket: WebSocket):
291
  """Main STT WebSocket endpoint"""
292
  client_id = None
293
 
294
  try:
295
  # Accept connection
296
- client_id = await stt_service.connect_websocket(websocket)
297
 
298
  # Handle messages
299
  while True:
@@ -306,7 +281,7 @@ async def websocket_stt_endpoint(websocket: WebSocket):
306
  message_type = message.get("type", "unknown")
307
 
308
  if message_type == "stt_audio_chunk":
309
- await stt_service.process_audio_message(client_id, message)
310
  elif message_type == "ping":
311
  # Respond to ping
312
  await websocket.send_text(json.dumps({
@@ -335,15 +310,12 @@ async def websocket_stt_endpoint(websocket: WebSocket):
335
  logger.error(f"WebSocket error for {client_id}: {str(e)}")
336
  finally:
337
  if client_id:
338
- await stt_service.disconnect_websocket(client_id)
339
 
 
 
 
 
340
  if __name__ == "__main__":
341
- port = int(os.environ.get("PORT", 7860))
342
- logger.info(f"🎤 Starting {__service__} v{__version__} on port {port}")
343
-
344
- uvicorn.run(
345
- app,
346
- host="0.0.0.0",
347
- port=port,
348
- log_level="info"
349
- )
 
1
  #!/usr/bin/env python3
2
  """
3
+ STT WebSocket Service with Gradio + FastAPI Integration
4
+ ZeroGPU compatible service with WebSocket endpoints for VoiceCal
5
  Following unmute.sh WebRTC pattern for HuggingFace Spaces
6
  """
7
 
 
14
  import logging
15
  from datetime import datetime
16
  from typing import Optional, Dict, Any
 
17
  import torch
18
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
19
  import torchaudio
 
21
  import numpy as np
22
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
23
  from fastapi.middleware.cors import CORSMiddleware
24
+ import gradio as gr
25
  import spaces
 
26
 
27
  # Configure logging
28
  logging.basicConfig(level=logging.INFO)
 
114
  logger.error(f"Transcription error: {str(e)}")
115
  return "", "error", {"error": str(e)}
116
 
117
+ # Global WebSocket connection tracker
118
+ active_connections: Dict[str, WebSocket] = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ # Simple Gradio interface for HF Spaces compliance
121
# Minimal callable backing the Gradio UI; HF Spaces (Gradio SDK) requires a
# Gradio app, so this just renders static service metadata as Markdown.
def get_service_info():
    """Return a Markdown summary of the STT WebSocket service."""
    info = f"""
    # 🎤 STT WebSocket Service v{__version__}

    **WebSocket Endpoint:** `/ws/stt`
    **Model:** Whisper {model_size}
    **Device:** {device}
    **ZeroGPU:** {'✅ Available' if torch.cuda.is_available() else '❌ Not Available'}

    **Status:** Ready for WebSocket connections

    Connect your WebRTC client to: `wss://your-space.hf.space/ws/stt`
    """
    return info
 
 
 
 
135
 
136
+ # Create minimal Gradio interface for HF Spaces
137
# Minimal Gradio interface required for HF Spaces (Gradio SDK) compliance.
# CONSISTENCY FIX: the title previously hard-coded "v1.0.0"; derive it from
# __version__ like the rest of the file so the UI can never drift out of sync.
demo = gr.Interface(
    fn=get_service_info,
    inputs=None,
    outputs=gr.Markdown(),
    title=f"🎤 STT WebSocket Service v{__version__}",
    description="WebSocket-enabled Speech-to-Text service with ZeroGPU acceleration",
    examples=None,
    live=False
)
146
 
147
+ # Create FastAPI app for WebSocket endpoints
148
+ fastapi_app = FastAPI(
149
+ title="STT WebSocket Service",
150
+ version=__version__
151
+ )
152
+
153
+ # Add CORS middleware for WebRTC
154
+ fastapi_app.add_middleware(
155
  CORSMiddleware,
156
  allow_origins=["*"],
157
  allow_credentials=True,
 
159
  allow_headers=["*"],
160
  )
161
 
162
@fastapi_app.get("/api/health")
async def health_check():
    """Report service liveness, model state, and connection count as JSON."""
    status_payload = {
        "service": __service__,
        "version": __version__,
        "status": "healthy",
        # True once the lazily-loaded Whisper model has been initialized.
        "model_loaded": model is not None,
        # Number of currently open /ws/stt WebSocket sessions.
        "active_connections": len(active_connections),
        "device": device,
        "timestamp": datetime.now().isoformat(),
    }
    return status_payload
174
 
175
async def connect_websocket(websocket: WebSocket) -> str:
    """Accept a WebSocket, register it, and send a connection confirmation.

    Returns the freshly generated client ID used to track this session in
    the module-level ``active_connections`` map.
    """
    client_id = str(uuid.uuid4())
    await websocket.accept()
    active_connections[client_id] = websocket

    greeting = {
        "type": "stt_connection_confirmed",
        "client_id": client_id,
        "service": __service__,
        "version": __version__,
        "model": f"whisper-{model_size}",
        "device": device,
        "message": "STT WebSocket connected and ready",
    }
    # Tell the client the session is live before any audio arrives.
    await websocket.send_text(json.dumps(greeting))

    logger.info(f"Client {client_id} connected")
    return client_id
194
+
195
async def disconnect_websocket(client_id: str):
    """Unregister *client_id*'s WebSocket, tolerating repeated disconnects."""
    # pop() with a sentinel keeps this safe if the error path and the
    # endpoint's finally-block both try to clean up the same client.
    if active_connections.pop(client_id, None) is not None:
        logger.info(f"Client {client_id} disconnected")
200
+
201
async def process_audio_message(client_id: str, message: Dict[str, Any]):
    """Decode one base64 audio chunk, transcribe it, and reply on the socket.

    Expected *message* keys: ``audio_data`` (base64, required) plus optional
    ``language`` and ``model_size``. Sends either an
    ``stt_transcription_complete`` or ``stt_transcription_error`` frame back
    to the client; never raises to the caller.
    """
    try:
        ws = active_connections[client_id]

        encoded = message.get("audio_data")
        if not encoded:
            await ws.send_text(json.dumps({
                "type": "stt_transcription_error",
                "client_id": client_id,
                "error": "No audio data provided"
            }))
            return

        raw_audio = base64.b64decode(encoded)

        # Persist the chunk so the transcription backend can consume a path.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as handle:
            handle.write(raw_audio)
            audio_path = handle.name

        try:
            # Global ZeroGPU-decorated function does the actual inference.
            transcription, status, timing = transcribe_audio_zerogpu(
                audio_path,
                message.get("language", "auto"),
                message.get("model_size", model_size)
            )

            if status == "success" and transcription:
                reply = {
                    "type": "stt_transcription_complete",
                    "client_id": client_id,
                    "transcription": transcription,
                    "timing": timing,
                    "status": "success"
                }
            else:
                reply = {
                    "type": "stt_transcription_error",
                    "client_id": client_id,
                    "error": "Transcription failed or empty result",
                    "timing": timing
                }
            await ws.send_text(json.dumps(reply))
        finally:
            # Always remove the temp file, even if transcription raised.
            if os.path.exists(audio_path):
                os.unlink(audio_path)

    except Exception as e:
        logger.error(f"Error processing audio for {client_id}: {str(e)}")
        # The client may have disconnected mid-processing; only reply if the
        # connection is still registered.
        if client_id in active_connections:
            ws = active_connections[client_id]
            await ws.send_text(json.dumps({
                "type": "stt_transcription_error",
                "client_id": client_id,
                "error": f"Processing error: {str(e)}"
            }))
263
+
264
+ @fastapi_app.websocket("/ws/stt")
265
  async def websocket_stt_endpoint(websocket: WebSocket):
266
  """Main STT WebSocket endpoint"""
267
  client_id = None
268
 
269
  try:
270
  # Accept connection
271
+ client_id = await connect_websocket(websocket)
272
 
273
  # Handle messages
274
  while True:
 
281
  message_type = message.get("type", "unknown")
282
 
283
  if message_type == "stt_audio_chunk":
284
+ await process_audio_message(client_id, message)
285
  elif message_type == "ping":
286
  # Respond to ping
287
  await websocket.send_text(json.dumps({
 
310
  logger.error(f"WebSocket error for {client_id}: {str(e)}")
311
  finally:
312
  if client_id:
313
+ await disconnect_websocket(client_id)
314
 
315
# CRITICAL: gr.mount_gradio_app() combines the FastAPI routes (/ws/stt,
# /api/health) and the Gradio UI into ONE ASGI application. Anything that
# serves this process must serve `app`, not `demo`.
app = gr.mount_gradio_app(fastapi_app, demo, path="/")

# For HuggingFace Spaces - this becomes the main app
if __name__ == "__main__":
    # BUG FIX: demo.launch() starts Gradio's OWN server, which does not carry
    # the FastAPI routes mounted above — /ws/stt would 404, reintroducing the
    # exact WebSocket failure this architecture is meant to fix. Serve the
    # combined ASGI `app` with uvicorn instead.
    import uvicorn  # local import: only needed when run as a script

    port = int(os.environ.get("PORT", 7860))
    logger.info(f"🎤 Starting {__service__} v{__version__} with Gradio+WebSocket integration")
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info")
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,9 +1,10 @@
1
- # Minimal requirements for WebSocket-only STT service
2
  torch>=2.1.0
3
  torchaudio>=2.1.0
4
  transformers>=4.35.0
5
  accelerate>=0.24.0
6
  spaces>=0.19.0
 
7
  numpy>=1.21.0
8
  soundfile>=0.12.0
9
  fastapi>=0.104.0
 
1
+ # Requirements for Gradio+WebSocket STT service with ZeroGPU
2
  torch>=2.1.0
3
  torchaudio>=2.1.0
4
  transformers>=4.35.0
5
  accelerate>=0.24.0
6
  spaces>=0.19.0
7
+ gradio>=5.42.0
8
  numpy>=1.21.0
9
  soundfile>=0.12.0
10
  fastapi>=0.104.0