Spaces:

RinggAI
/

STT

Running

App Files Files Community

harsh2ai committed on Oct 31, 2025

Commit

8f86c2d

1 Parent(s): d2a497b

Update app UI layout

Browse files

Files changed (1) hide show

app.py +248 -86

app.py CHANGED Viewed

@@ -25,19 +25,6 @@ custom_css = """
     border-radius: 10px;
     margin-bottom: 20px;
 }
-footer {
-    visibility: hidden !important;
-    height: 50px !important;
-}
-footer:after {
-    content: "Made with ❤️ by RinggAI Team" !important;
-    visibility: visible !important;
-    display: block !important;
-    text-align: center !important;
-    margin-top: 15px !important;
-    color: #666 !important;
-    font-size: 14px !important;
-}
 """
 # Backend API endpoint (ngrok URL)
@@ -104,6 +91,33 @@ class RinggSTTClient:
         except Exception as e:
             return f"❌ Error: {str(e)}"
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
@@ -123,6 +137,43 @@ def create_interface():
         return stt_client.transcribe_audio(audio_file)
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
@@ -133,17 +184,193 @@ def create_interface():
         gr.Markdown("""
         <div class="main-header">
         <h1>🎙️ Ringg STT V0</h1>
-        <p>State-of-the-Art Bilingual Speech-to-Text (English & Hindi)</p>
         </div>
         """)
-        # Performance Comparison Table
         gr.Markdown("""
         ## Performance Benchmarks
-        Our model achieves **state-of-the-art performance** on English-Hindi bilingual speech recognition:
         """)
         with gr.Row():
             gr.DataFrame(
                 value=[
@@ -159,75 +386,10 @@ def create_interface():
                 col_count=(3, "fixed"),
                 label="Word Error Rate Comparison (Lower is Better)"
             )
-        gr.Markdown("""
-        **Ringg STT V0** ranks **2nd** among top models, outperforming OpenAI Whisper Large-v3 and other leading solutions.
-        Lower WER (Word Error Rate) indicates better accuracy. Our model achieves competitive performance while supporting bilingual transcription.
-        """)
-        gr.Markdown("""
-        ### ✨ Features
-        - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
-        - 🎯 **High Accuracy**: Competitive with leading ASR models
-        - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
-        - ⚡ **Fast Processing**: Optimized for quick transcription
-        - 🔒 **Private Infrastructure**: Secure and controlled deployment
-        """)
         gr.Markdown("""
-        ### 🔗 Links
-        - **Organization**: [RinggAI on Hugging Face](https://huggingface.co/RinggAI)
-        - **TTS Space**: [Ringg TTS V0](https://huggingface.co/spaces/RinggAI/Ringg-TTS-v0.0)
-        ### 🙏 Acknowledgements
-        - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
         """)
-        # API Status indicator
-        with gr.Row():
-            with gr.Column(scale=4):
-                api_status = gr.Textbox(
-                    label="🔌 API Status",
-                    value=health_status["message"],
-                    interactive=False
-                )
-            with gr.Column(scale=1):
-                check_btn = gr.Button("🔄 Check Status", size="sm")
-                check_btn.click(check_api_status, outputs=api_status)
-        with gr.Tab("📁 File Upload"):
-            gr.Markdown("### Upload Audio File")
-            gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
-            audio_input = gr.Audio(
-                label="📁 Upload Audio File",
-                type="filepath",
-                sources=["upload"]
-            )
-            transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
-            file_output = gr.Textbox(
-                label="Transcription Result",
-                lines=8,
-                interactive=False,
-                placeholder="Upload a file and click Transcribe..."
-            )
-            transcribe_btn.click(
-                transcribe_audio,
-                inputs=audio_input,
-                outputs=file_output
-            )
-            gr.Markdown("""
-            ### 💡 Tips for Best Results
-            - Use clear audio with minimal background noise
-            - Speak naturally at a moderate pace
-            - For file upload, ensure audio quality is good (16kHz or higher recommended)
-            - Model handles code-switching between English and Hindi
-            """)
     return demo

     border-radius: 10px;
     margin-bottom: 20px;
 }
 """
 # Backend API endpoint (ngrok URL)
         except Exception as e:
             return f"❌ Error: {str(e)}"
+    def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
+        """Send audio chunk for streaming transcription"""
+        try:
+            # Convert numpy array to base64
+            audio_bytes = audio_chunk.tobytes()
+            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+            response = self.session.post(
+                f"{self.api_endpoint}/transcribe_stream",
+                json={
+                    "audio_chunk": audio_base64,
+                    "dtype": str(audio_chunk.dtype),
+                    "shape": list(audio_chunk.shape)
+                },
+                timeout=10
+            )
+            if response.status_code == 200:
+                result = response.json()
+                return result.get("transcription")
+            return None
+        except Exception as e:
+            print(f"Streaming error: {e}")
+            return None
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
         return stt_client.transcribe_audio(audio_file)
+    def stream_audio(audio, state):
+        """Handle streaming audio"""
+        if audio is None:
+            return "No audio input", state
+        try:
+            if state is None:
+                state = {"transcripts": []}
+            if isinstance(audio, tuple):
+                sample_rate, audio_array = audio
+            else:
+                audio_array = audio
+                sample_rate = 16000
+            if audio_array is not None and len(audio_array) > 0:
+                if len(audio_array.shape) > 1:
+                    audio_array = np.mean(audio_array, axis=1)
+                audio_array = audio_array.astype(np.float32)
+                max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
+                if max_abs > 1e-6:
+                    audio_array = audio_array / max_abs
+                # Send to API
+                transcript = stt_client.transcribe_streaming(audio_array)
+                if transcript and transcript.strip():
+                    if not state["transcripts"] or transcript != state["transcripts"][-1]:
+                        state["transcripts"].append(transcript)
+            combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎤 Listening..."
+            return combined, state
+        except Exception as e:
+            return f"❌ Error: {str(e)}", state
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
         gr.Markdown("""
         <div class="main-header">
         <h1>🎙️ Ringg STT V0</h1>
+        <p>State-of-the-Art Speech-to-Text for Hindi and code-switching</p>
         </div>
         """)
+        # API Status indicator
+        with gr.Row():
+            with gr.Column(scale=4):
+                api_status = gr.Textbox(
+                    label="🔌 API Status",
+                    value=health_status["message"],
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                check_btn = gr.Button("🔄 Check Status", size="sm")
+                check_btn.click(check_api_status, outputs=api_status)
+        gr.Markdown("""
+        ### 📁 File Upload
+        Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
+        """)
+        audio_input = gr.Audio(
+            label="📁 Upload Audio File",
+            type="filepath",
+            sources=["upload"]
+        )
+        transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
+        file_output = gr.Textbox(
+            label="Transcription Result",
+            lines=8,
+            interactive=False,
+            placeholder="Upload a file and click Transcribe..."
+        )
+        transcribe_btn.click(
+            transcribe_audio,
+            inputs=audio_input,
+            outputs=file_output
+        )
+        gr.Markdown("""
+        ### 💡 Tips for Best Results
+        - Use clear audio with minimal background noise
+        - Speak naturally at a moderate pace
+        - For file upload, ensure audio quality is good (16kHz or higher recommended)
+        - Model handles Hindi code-switching scenarios
+        """)
+        gr.Markdown("""
+        ### ✨ Features
+        - 🌐 **Bilingual Support**: Handles Hindi and common code-switching patterns
+        - ⚡ **Real-time Processing**: Instant transcription as you speak
+        - 🎯 **High Accuracy**: Powered by Parakeet TDT CTC 110M model
+        - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
+        """)
+        gr.Markdown("""
+        ### 🎤 Real-time Streaming
+        Speak into your microphone for real-time transcription tuned for Hindi and code-switching.
+        """)
+        gr.Markdown("""
+        ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
+        Make sure your backend service is running and accessible.
+        """)
+        # Buffer Configuration Controls
+        # NOTE(review): in the visible code none of these sliders
+        # (buffer_duration, process_every_n, min_interval) is passed as an input
+        # to mic_input.stream(...) or read by stream_audio, so moving them appears
+        # to have no effect on streaming. Confirm, or wire them in.
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("### 🔧 Streaming Configuration")
+                gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
+                buffer_duration = gr.Slider(
+                    minimum=2.0, maximum=6.0, step=0.5, value=3.0,
+                    label="Buffer Duration (seconds)",
+                    info="Size of audio chunks sent to API"
+                )
+                process_every_n = gr.Slider(
+                    minimum=2, maximum=8, step=1, value=3,
+                    label="Process Every N Chunks",
+                    info="How often to send audio (higher = less frequent)"
+                )
+                min_interval = gr.Slider(
+                    minimum=1.0, maximum=4.0, step=0.5, value=2.0,
+                    label="Min Processing Interval (seconds)",
+                    info="Minimum time between API calls"
+                )
+                config_info = gr.Markdown("""
+                **💡 Tuning Tips:**
+                - **Lower latency**: Decrease buffer duration and interval
+                - **Better accuracy**: Increase buffer duration
+                - **Reduce API calls**: Increase process frequency
+                - **Slow connection**: Increase all values
+                """)
+        mic_input = gr.Audio(
+            sources=["microphone"],
+            type="numpy",
+            streaming=True,
+            label="🎤 Microphone Input"
+        )
+        live_output = gr.Textbox(
+            label="Live Transcription",
+            lines=8,
+            interactive=False,
+            placeholder="Your transcription will appear here..."
+        )
+        session_state = gr.State(lambda: None)
+        mic_input.stream(
+            fn=stream_audio,
+            inputs=[mic_input, session_state],
+            outputs=[live_output, session_state],
+            stream_every=0.5
+        )
+        gr.Markdown("""
+        ### ⚙️ Configuration
+        **Current API Endpoint**: `{API_ENDPOINT}`
+        The transcription service runs on a private backend accessed via a secure API endpoint.
+        To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
+        """)
+        gr.Markdown("""
+        ## About Ringg STT V0
+        Ringg STT V0 is powered by NVIDIA NeMo's Parakeet TDT CTC 110M model, optimized for Hindi transcription and code-switching scenarios.
+        ### 🎯 Model Details
+        - **Model**: Parakeet TDT CTC 110M
+        - **Architecture**: FastConformer encoder with CTC decoder
+        - **Parameters**: 110 Million
+        - **Languages**: Hindi + code-switching contexts
+        - **Sample Rate**: 16kHz
+        - **Framework**: PyTorch + NVIDIA NeMo
+        ### 🏗️ Architecture
+        This Space uses a **frontend-backend architecture**:
+        ```
+        User → HF Space (Frontend) → API Endpoint → Private Server (Model) → Response
+        ```
+        - **Frontend**: This Hugging Face Space (Gradio UI)
+        - **Backend**: Private inference server with the actual model
+        - **Connection**: Secure API calls via tunnel
+        ### 🚀 Key Features
+        - **Hindi-focused Recognition** with code-switching support
+        - **Real-time Streaming** with low latency
+        - **Flexible Input** supporting microphone and file upload
+        ### 📊 Use Cases
+        - Meeting transcription and call analytics
+        - Media subtitling
+        - Accessibility applications
+        - Voice search and automation workflows
+        ### 🔧 Technical Specifications
+        - **Decoder**: CTC (Connectionist Temporal Classification)
+        - **Audio Processing**: 16kHz mono, PCM16
+        - **Latency**: ~2-3 seconds for streaming
+        - **API Protocol**: REST API with base64-encoded audio
+        ### 📝 Limitations
+        - Requires an active backend API endpoint
+        - Performs best with clear audio and minimal background noise
+        - Accuracy may vary with accents and challenging acoustic conditions
+        ---
+        Made with ❤️ by RinggAI Team
+        """)
         gr.Markdown("""
         ## Performance Benchmarks
+        Our model achieves **state-of-the-art performance** on bilingual speech recognition benchmarks:
         """)
         with gr.Row():
             gr.DataFrame(
                 value=[
                 col_count=(3, "fixed"),
                 label="Word Error Rate Comparison (Lower is Better)"
             )
         gr.Markdown("""
+        **Ringg STT V0** ranks **2nd** among top models, offering competitive performance while keeping the backend infrastructure private.
         """)
     return demo