Spaces:

RinggAI
/

STT

Running

App Files Community

harsh2ai committed on Oct 31, 2025

Commit

508b24f

1 Parent(s): 428a84e

Revert to original app layout

Browse files

Files changed (1) hide show

app.py +86 -247

app.py CHANGED Viewed

@@ -25,6 +25,19 @@ custom_css = """
     border-radius: 10px;
     margin-bottom: 20px;
 }
 """
 # Backend API endpoint (ngrok URL)
@@ -91,33 +104,6 @@ class RinggSTTClient:
         except Exception as e:
             return f"❌ Error: {str(e)}"
-    def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
-        """Send audio chunk for streaming transcription"""
-        try:
-            # Convert numpy array to base64
-            audio_bytes = audio_chunk.tobytes()
-            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-            response = self.session.post(
-                f"{self.api_endpoint}/transcribe_stream",
-                json={
-                    "audio_chunk": audio_base64,
-                    "dtype": str(audio_chunk.dtype),
-                    "shape": list(audio_chunk.shape)
-                },
-                timeout=10
-            )
-            if response.status_code == 200:
-                result = response.json()
-                return result.get("transcription")
-            return None
-        except Exception as e:
-            print(f"Streaming error: {e}")
-            return None
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
@@ -137,43 +123,6 @@ def create_interface():
         return stt_client.transcribe_audio(audio_file)
-    def stream_audio(audio, state):
-        """Handle streaming audio"""
-        if audio is None:
-            return "No audio input", state
-        try:
-            if state is None:
-                state = {"transcripts": []}
-            if isinstance(audio, tuple):
-                sample_rate, audio_array = audio
-            else:
-                audio_array = audio
-                sample_rate = 16000
-            if audio_array is not None and len(audio_array) > 0:
-                if len(audio_array.shape) > 1:
-                    audio_array = np.mean(audio_array, axis=1)
-                audio_array = audio_array.astype(np.float32)
-                max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
-                if max_abs > 1e-6:
-                    audio_array = audio_array / max_abs
-                # Send to API
-                transcript = stt_client.transcribe_streaming(audio_array)
-                if transcript and transcript.strip():
-                    if not state["transcripts"] or transcript != state["transcripts"][-1]:
-                        state["transcripts"].append(transcript)
-            combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎤 Listening..."
-            return combined, state
-        except Exception as e:
-            return f"❌ Error: {str(e)}", state
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
@@ -184,192 +133,17 @@ def create_interface():
         gr.Markdown("""
         <div class="main-header">
         <h1>🎙️ Ringg STT V0</h1>
-        <p>State-of-the-Art Speech-to-Text for Hindi and code-switching</p>
         </div>
         """)
-        # API Status indicator
-        with gr.Row():
-            with gr.Column(scale=4):
-                api_status = gr.Textbox(
-                    label="🔌 API Status",
-                    value=health_status["message"],
-                    interactive=False
-                )
-            with gr.Column(scale=1):
-                check_btn = gr.Button("🔄 Check Status", size="sm")
-                check_btn.click(check_api_status, outputs=api_status)
-        gr.Markdown("""
-        ### 📁 File Upload
-        Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
-        """)
-        audio_input = gr.Audio(
-            label="📁 Upload Audio File",
-            type="filepath",
-            sources=["upload"]
-        )
-        transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
-        file_output = gr.Textbox(
-            label="Transcription Result",
-            lines=8,
-            interactive=False,
-            placeholder="Upload a file and click Transcribe..."
-        )
-        transcribe_btn.click(
-            transcribe_audio,
-            inputs=audio_input,
-            outputs=file_output
-        )
-        gr.Markdown("""
-        ### 💡 Tips for Best Results
-        - Use clear audio with minimal background noise
-        - Speak naturally at a moderate pace
-        - For file upload, ensure audio quality is good (16kHz or higher recommended)
-        - Model handles Hindi code-switching scenarios
-        """)
-        gr.Markdown("""
-        ### ✨ Features
-        - 🌐 **Bilingual Support**: Handles Hindi and common code-switching patterns
-        - ⚡ **Real-time Processing**: Instant transcription as you speak
-        - 🎯 **High Accuracy**: Powered by Parakeet TDT CTC 110M model
-        - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
-        """)
-        gr.Markdown("""
-        ### 🎤 Real-time Streaming
-        Speak into your microphone for real-time transcription tuned for Hindi and code-switching.
-        """)
-        gr.Markdown("""
-        ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
-        Make sure your backend service is running and accessible.
-        """)
-        # Buffer Configuration Controls
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### 🔧 Streaming Configuration")
-                gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
-                buffer_duration = gr.Slider(
-                    minimum=2.0, maximum=6.0, step=0.5, value=3.0,
-                    label="Buffer Duration (seconds)",
-                    info="Size of audio chunks sent to API"
-                )
-                process_every_n = gr.Slider(
-                    minimum=2, maximum=8, step=1, value=3,
-                    label="Process Every N Chunks",
-                    info="How often to send audio (higher = less frequent)"
-                )
-                min_interval = gr.Slider(
-                    minimum=1.0, maximum=4.0, step=0.5, value=2.0,
-                    label="Min Processing Interval (seconds)",
-                    info="Minimum time between API calls"
-                )
-                config_info = gr.Markdown("""
-                **💡 Tuning Tips:**
-                - **Lower latency**: Decrease buffer duration and interval
-                - **Better accuracy**: Increase buffer duration
-                - **Reduce API calls**: Increase process frequency
-                - **Slow connection**: Increase all values
-                """)
-        mic_input = gr.Audio(
-            sources=["microphone"],
-            type="numpy",
-            streaming=True,
-            label="🎤 Microphone Input"
-        )
-        live_output = gr.Textbox(
-            label="Live Transcription",
-            lines=8,
-            interactive=False,
-            placeholder="Your transcription will appear here..."
-        )
-        session_state = gr.State(lambda: None)
-        mic_input.stream(
-            fn=stream_audio,
-            inputs=[mic_input, session_state],
-            outputs=[live_output, session_state]
-        )
-        gr.Markdown("""
-        ### ⚙️ Configuration
-        **Current API Endpoint**: `{API_ENDPOINT}`
-        The transcription service runs on a private backend accessed via a secure API endpoint.
-        To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
-        """)
-        gr.Markdown("""
-        ## About Ringg STT V0
-        Ringg STT V0 is powered by NVIDIA NeMo's Parakeet TDT CTC 110M model, optimized for Hindi transcription and code-switching scenarios.
-        ### 🎯 Model Details
-        - **Model**: Parakeet TDT CTC 110M
-        - **Architecture**: FastConformer encoder with CTC decoder
-        - **Parameters**: 110 Million
-        - **Languages**: Hindi + code-switching contexts
-        - **Sample Rate**: 16kHz
-        - **Framework**: PyTorch + NVIDIA NeMo
-        ### 🏗️ Architecture
-        This Space uses a **frontend-backend architecture**:
-        ```
-        User → HF Space (Frontend) → API Endpoint → Private Server (Model) → Response
-        ```
-        - **Frontend**: This Hugging Face Space (Gradio UI)
-        - **Backend**: Private inference server with the actual model
-        - **Connection**: Secure API calls via tunnel
-        ### 🚀 Key Features
-        - **Hindi-focused Recognition** with code-switching support
-        - **Real-time Streaming** with low latency
-        - **Flexible Input** supporting microphone and file upload
-        ### 📊 Use Cases
-        - Meeting transcription and call analytics
-        - Media subtitling
-        - Accessibility applications
-        - Voice search and automation workflows
-        ### 🔧 Technical Specifications
-        - **Decoder**: CTC (Connectionist Temporal Classification)
-        - **Audio Processing**: 16kHz mono, PCM16
-        - **Latency**: ~2-3 seconds for streaming
-        - **API Protocol**: REST API with base64-encoded audio
-        ### 📝 Limitations
-        - Requires an active backend API endpoint
-        - Performs best with clear audio and minimal background noise
-        - Accuracy may vary with accents and challenging acoustic conditions
-        ---
-        Made with ❤️ by RinggAI Team
-        """)
         gr.Markdown("""
         ## Performance Benchmarks
-        Our model achieves **state-of-the-art performance** on bilingual speech recognition benchmarks:
         """)
         with gr.Row():
             gr.DataFrame(
                 value=[
@@ -385,10 +159,75 @@ def create_interface():
                 col_count=(3, "fixed"),
                 label="Word Error Rate Comparison (Lower is Better)"
             )
         gr.Markdown("""
-        **Ringg STT V0** ranks **2nd** among top models, offering competitive performance while keeping the backend infrastructure private.
         """)
     return demo

     border-radius: 10px;
     margin-bottom: 20px;
 }
+footer {
+    visibility: hidden !important;
+    height: 50px !important;
+}
+footer:after {
+    content: "Made with ❤️ by RinggAI Team" !important;
+    visibility: visible !important;
+    display: block !important;
+    text-align: center !important;
+    margin-top: 15px !important;
+    color: #666 !important;
+    font-size: 14px !important;
+}
 """
 # Backend API endpoint (ngrok URL)
         except Exception as e:
             return f"❌ Error: {str(e)}"
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
         return stt_client.transcribe_audio(audio_file)
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
         gr.Markdown("""
         <div class="main-header">
         <h1>🎙️ Ringg STT V0</h1>
+        <p>State-of-the-Art Bilingual Speech-to-Text (English & Hindi)</p>
         </div>
         """)
+        # Performance Comparison Table
         gr.Markdown("""
         ## Performance Benchmarks
+        Our model achieves **state-of-the-art performance** on English-Hindi bilingual speech recognition:
         """)
         with gr.Row():
             gr.DataFrame(
                 value=[
                 col_count=(3, "fixed"),
                 label="Word Error Rate Comparison (Lower is Better)"
             )
+        gr.Markdown("""
+        **Ringg STT V0** ranks **2nd** among top models, outperforming OpenAI Whisper Large-v3 and other leading solutions.
+        Lower WER (Word Error Rate) indicates better accuracy. Our model achieves competitive performance while supporting bilingual transcription.
+        """)
+        gr.Markdown("""
+        ### ✨ Features
+        - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
+        - 🎯 **High Accuracy**: Competitive with leading ASR models
+        - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
+        - ⚡ **Fast Processing**: Optimized for quick transcription
+        - 🔒 **Private Infrastructure**: Secure and controlled deployment
+        """)
         gr.Markdown("""
+        ### 🔗 Links
+        - **Organization**: [RinggAI on Hugging Face](https://huggingface.co/RinggAI)
+        - **TTS Space**: [Ringg TTS V0](https://huggingface.co/spaces/RinggAI/Ringg-TTS-v0.0)
+        ### 🙏 Acknowledgements
+        - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
         """)
+        # API Status indicator
+        with gr.Row():
+            with gr.Column(scale=4):
+                api_status = gr.Textbox(
+                    label="🔌 API Status",
+                    value=health_status["message"],
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                check_btn = gr.Button("🔄 Check Status", size="sm")
+                check_btn.click(check_api_status, outputs=api_status)
+        with gr.Tab("📁 File Upload"):
+            gr.Markdown("### Upload Audio File")
+            gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
+            audio_input = gr.Audio(
+                label="📁 Upload Audio File",
+                type="filepath",
+                sources=["upload"]
+            )
+            transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
+            file_output = gr.Textbox(
+                label="Transcription Result",
+                lines=8,
+                interactive=False,
+                placeholder="Upload a file and click Transcribe..."
+            )
+            transcribe_btn.click(
+                transcribe_audio,
+                inputs=audio_input,
+                outputs=file_output
+            )
+            gr.Markdown("""
+            ### 💡 Tips for Best Results
+            - Use clear audio with minimal background noise
+            - Speak naturally at a moderate pace
+            - For file upload, ensure audio quality is good (16kHz or higher recommended)
+            - Model handles code-switching between English and Hindi
+            """)
     return demo