Spaces:

RinggAI
/

STT

Paused

App Files Files Community

Ubuntu committed on Jan 5

Commit

5a0bcf6

1 Parent(s): 7ce11b0

Add WebSocket on_final mode support for faster transcription and update requirements

Browse files

Files changed (2) hide show

app.py +139 -38
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -7,6 +7,9 @@ Real-time streaming transcription using Gradio's audio streaming.
 import os
 import tempfile
 from pathlib import Path
 import gradio as gr
 import requests
@@ -14,6 +17,13 @@ import numpy as np
 import soundfile as sf
 from dotenv import load_dotenv
 try:
     import librosa
     HAS_LIBROSA = True
@@ -87,6 +97,80 @@ class RinggSTTClient:
             print(f"Transcription error: {e}")
             return ""
     def transcribe_file(self, audio_file_path: str, language: str = "hi") -> str:
         """Transcribe audio file via multipart upload API"""
         try:
@@ -141,12 +225,8 @@ def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarra
 def transcribe_stream(audio, language, audio_buffer, last_transcription, samples_processed):
     """
-    Process streaming audio from microphone.
-    Simplified approach:
-    - Accumulate ALL audio chunks
-    - When we have enough new audio, transcribe the ENTIRE recording
-    - Display the complete transcription (backend handles everything)
     """
     # Initialize states
     if audio_buffer is None:
@@ -183,22 +263,35 @@ def transcribe_stream(audio, language, audio_buffer, last_transcription, samples
     total_samples = sum(len(arr) for arr in audio_buffer)
     total_duration = total_samples / sample_rate
-    # Calculate new audio since last transcription
-    new_samples = total_samples - samples_processed
-    new_duration = new_samples / sample_rate
-    # Only transcribe if we have enough NEW audio (to avoid too frequent API calls)
-    if new_duration < MIN_AUDIO_LENGTH:
-        display = last_transcription if last_transcription else f"🎤 Recording... ({total_duration:.1f}s)"
-        return display, audio_buffer, last_transcription, samples_processed
     try:
         # Concatenate ALL buffered audio
         full_audio = np.concatenate(audio_buffer)
         # Resample to 16kHz if needed
         if sample_rate != TARGET_SAMPLE_RATE:
             full_audio = resample_audio(full_audio, sample_rate, TARGET_SAMPLE_RATE)
         # Normalize audio
         max_val = np.max(np.abs(full_audio))
@@ -208,32 +301,31 @@ def transcribe_stream(audio, language, audio_buffer, last_transcription, samples
         # Get language code
         lang_code = "hi" if language == "Hindi" else "en"
-        # Transcribe the ENTIRE audio
-        transcription = stt_client.transcribe_audio_data(
-            full_audio.astype(np.float32),
-            TARGET_SAMPLE_RATE,
-            lang_code
         )
         # Update state
-        if transcription.strip():
             last_transcription = transcription
-        # Mark all current samples as processed
-        samples_processed = total_samples
-        display = last_transcription if last_transcription else f"🎤 Recording... ({total_duration:.1f}s)"
-        return display, audio_buffer, last_transcription, samples_processed
     except Exception as e:
         print(f"Processing error: {e}")
-        display = last_transcription if last_transcription else "🎤 Listening..."
-        return display, audio_buffer, last_transcription, samples_processed
 def clear_transcription():
     """Clear all transcription state"""
-    return "🎤 Click microphone to start...", None, "", 0
 def transcribe_file(audio_file, language):
@@ -270,16 +362,16 @@ def create_interface():
         # Real-time streaming section
         gr.Markdown("""
-            ## 🎤 Real-time Transcription
-            Click the microphone to start recording. Transcription updates as you speak.
-            *The entire recording is transcribed each time, so text may refine as more context is added.*
         """)
         # States for streaming
         audio_buffer = gr.State(None)
         last_transcription = gr.State("")
-        samples_processed = gr.State(0)
         with gr.Row():
             with gr.Column(scale=1):
@@ -294,7 +386,9 @@ def create_interface():
                     streaming=True,
                     label="🎤 Click to start recording",
                 )
-                clear_btn = gr.Button("🗑️ Clear & Reset", variant="secondary")
             with gr.Column(scale=2):
                 text_output = gr.Textbox(
@@ -304,18 +398,25 @@ def create_interface():
                     interactive=False,
                 )
-        # Wire up streaming
         audio_input.stream(
             fn=transcribe_stream,
-            inputs=[audio_input, stream_language, audio_buffer, last_transcription, samples_processed],
-            outputs=[text_output, audio_buffer, last_transcription, samples_processed],
         )
         # Clear button
         clear_btn.click(
             fn=clear_transcription,
             inputs=[],
-            outputs=[text_output, audio_buffer, last_transcription, samples_processed],
         )
         gr.Markdown("<br>")

 import os
 import tempfile
 from pathlib import Path
+import json
+import struct
+import asyncio
 import gradio as gr
 import requests
 import soundfile as sf
 from dotenv import load_dotenv
+try:
+    import websockets
+    HAS_WEBSOCKETS = True
+except ImportError:
+    HAS_WEBSOCKETS = False
+    print("⚠️ websockets not installed. Install with: pip install websockets")
 try:
     import librosa
     HAS_LIBROSA = True
             print(f"Transcription error: {e}")
             return ""
+    async def transcribe_websocket_on_final(self, audio_data: np.ndarray, sample_rate: int, language: str = "hi") -> str:
+        """Transcribe audio via WebSocket on_final endpoint"""
+        if not HAS_WEBSOCKETS:
+            return "❌ websockets library not installed"
+        try:
+            # Convert HTTP endpoint to WebSocket
+            ws_endpoint = self.api_endpoint.replace("http://", "ws://").replace("https://", "wss://")
+            ws_url = f"{ws_endpoint}/v1/audio/stream"
+            # Convert audio to int16 PCM
+            audio_int16 = (audio_data * 32767).astype(np.int16)
+            audio_bytes = audio_int16.tobytes()
+            # Chunk size for streaming (send in 1 second chunks)
+            chunk_size = sample_rate * 2  # 2 bytes per sample (int16)
+            async with websockets.connect(ws_url, max_size=None) as ws:
+                # Send start message with on_final mode (first message must be "start")
+                start_msg = {
+                    "type": "start",
+                    "prediction_method": "on_final",
+                    "sample_rate": sample_rate,
+                    "encoding": "int16",
+                    "language": "Hindi" if language == "hi" else "English",
+                    "api_key": "gradio-client",
+                    "punctuate": False
+                }
+                await ws.send(json.dumps(start_msg))
+                # Wait for ready response
+                ready_msg = await ws.recv()
+                ready_data = json.loads(ready_msg)
+                if ready_data.get("type") != "ready":
+                    return f"❌ Unexpected response: {ready_data}"
+                print(f"✅ WebSocket ready: {ready_data}")
+                # Send audio in chunks
+                for i in range(0, len(audio_bytes), chunk_size):
+                    chunk = audio_bytes[i:i + chunk_size]
+                    await ws.send(chunk)
+                    # Receive chunk acknowledgment
+                    ack = await ws.recv()
+                    ack_data = json.loads(ack)
+                    if ack_data.get("type") == "chunk":
+                        print(f"Buffered: {ack_data.get('total_buffered', 0)} samples")
+                # Send end signal to trigger transcription
+                end_msg = {"type": "end"}
+                await ws.send(json.dumps(end_msg))
+                # Receive transcription
+                transcription = ""
+                result_msg = await ws.recv()
+                result_data = json.loads(result_msg)
+                if result_data.get("type") == "transcript":
+                    transcription = result_data.get("transcription", "")
+                elif result_data.get("type") == "error":
+                    return f"❌ Error: {result_data.get('detail', 'Unknown error')}"
+                # Send stop to end session
+                stop_msg = {"type": "stop"}
+                await ws.send(json.dumps(stop_msg))
+                return transcription
+        except Exception as e:
+            print(f"WebSocket transcription error: {e}")
+            return f"❌ WebSocket Error: {str(e)}"
     def transcribe_file(self, audio_file_path: str, language: str = "hi") -> str:
         """Transcribe audio file via multipart upload API"""
         try:
 def transcribe_stream(audio, language, audio_buffer, last_transcription, samples_processed):
     """
+    Accumulate audio chunks during recording.
+    Just buffer the audio, don't transcribe yet.
     """
     # Initialize states
     if audio_buffer is None:
     total_samples = sum(len(arr) for arr in audio_buffer)
     total_duration = total_samples / sample_rate
+    # Just show recording status, don't transcribe yet
+    display = last_transcription if last_transcription else f"🎤 Recording... ({total_duration:.1f}s)"
+    return display, audio_buffer, last_transcription, sample_rate
+def process_recorded_audio(audio_buffer, sample_rate, language, last_transcription):
+    """
+    Process the entire recorded audio after user stops recording.
+    This is called when the stop recording button is pressed.
+    Uses WebSocket on_final endpoint for faster transcription.
+    """
+    if audio_buffer is None or len(audio_buffer) == 0:
+        return "⚠️ No audio recorded", audio_buffer, last_transcription, 0
     try:
         # Concatenate ALL buffered audio
         full_audio = np.concatenate(audio_buffer)
+        # Calculate duration
+        total_samples = len(full_audio)
+        total_duration = total_samples / sample_rate
+        # Show processing message
+        print(f"Processing {total_duration:.1f}s of audio via WebSocket...")
         # Resample to 16kHz if needed
         if sample_rate != TARGET_SAMPLE_RATE:
             full_audio = resample_audio(full_audio, sample_rate, TARGET_SAMPLE_RATE)
+            sample_rate = TARGET_SAMPLE_RATE
         # Normalize audio
         max_val = np.max(np.abs(full_audio))
         # Get language code
         lang_code = "hi" if language == "Hindi" else "en"
+        # Transcribe via WebSocket on_final endpoint
+        transcription = asyncio.run(
+            stt_client.transcribe_websocket_on_final(
+                full_audio.astype(np.float32),
+                sample_rate,
+                lang_code
+            )
         )
         # Update state
+        if transcription and transcription.strip() and not transcription.startswith("❌"):
             last_transcription = transcription
+            return transcription, audio_buffer, last_transcription, sample_rate
+        else:
+            return transcription or "⚠️ No speech detected in the recording", audio_buffer, last_transcription, sample_rate
     except Exception as e:
         print(f"Processing error: {e}")
+        error_msg = f"❌ Error processing audio: {str(e)}"
+        return error_msg, audio_buffer, last_transcription, sample_rate
 def clear_transcription():
     """Clear all transcription state"""
+    return "🎤 Click microphone to start...", None, "", 16000
 def transcribe_file(audio_file, language):
         # Real-time streaming section
         gr.Markdown("""
+            ## 🎤 Record & Transcribe (WebSocket)
+            Click the microphone to start recording. Click stop when finished to get transcription.
+            *The entire recording will be transcribed via WebSocket on_final endpoint with TensorRT acceleration.*
         """)
         # States for streaming
         audio_buffer = gr.State(None)
         last_transcription = gr.State("")
+        sample_rate_state = gr.State(16000)
         with gr.Row():
             with gr.Column(scale=1):
                     streaming=True,
                     label="🎤 Click to start recording",
                 )
+                with gr.Row():
+                    stop_btn = gr.Button("⏹️ Stop & Transcribe", variant="primary", size="lg")
+                    clear_btn = gr.Button("🗑️ Clear & Reset", variant="secondary")
             with gr.Column(scale=2):
                 text_output = gr.Textbox(
                     interactive=False,
                 )
+        # Wire up streaming (just accumulates audio, doesn't transcribe)
         audio_input.stream(
             fn=transcribe_stream,
+            inputs=[audio_input, stream_language, audio_buffer, last_transcription, sample_rate_state],
+            outputs=[text_output, audio_buffer, last_transcription, sample_rate_state],
+        )
+        # Stop button - processes all accumulated audio
+        stop_btn.click(
+            fn=process_recorded_audio,
+            inputs=[audio_buffer, sample_rate_state, stream_language, last_transcription],
+            outputs=[text_output, audio_buffer, last_transcription, sample_rate_state],
         )
         # Clear button
         clear_btn.click(
             fn=clear_transcription,
             inputs=[],
+            outputs=[text_output, audio_buffer, last_transcription, sample_rate_state],
         )
         gr.Markdown("<br>")

requirements.txt CHANGED Viewed

@@ -5,3 +5,4 @@ requests==2.32.5
 huggingface-hub==1.0.1
 python-dotenv
 soundfile

 huggingface-hub==1.0.1
 python-dotenv
 soundfile
+websockets