Spaces:

RinggAI
/

STT

Running

App Files Community

harsh2ai committed on Oct 29, 2025

Commit

2fc062a

1 Parent(s): 2cbdadf

Remove real-time streaming: Keep only file upload transcription

Browse files

Files changed (2) hide show

app.py +5 -148
requirements.txt +4 -0

app.py CHANGED Viewed

@@ -91,33 +91,6 @@ class RinggSTTClient:
         except Exception as e:
             return f"❌ Error: {str(e)}"
-    def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
-        """Send audio chunk for streaming transcription"""
-        try:
-            # Convert numpy array to base64
-            audio_bytes = audio_chunk.tobytes()
-            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-            response = self.session.post(
-                f"{self.api_endpoint}/transcribe_stream",
-                json={
-                    "audio_chunk": audio_base64,
-                    "dtype": str(audio_chunk.dtype),
-                    "shape": list(audio_chunk.shape)
-                },
-                timeout=10
-            )
-            if response.status_code == 200:
-                result = response.json()
-                return result.get("transcription")
-            return None
-        except Exception as e:
-            print(f"Streaming error: {e}")
-            return None
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
@@ -137,43 +110,6 @@ def create_interface():
         return stt_client.transcribe_audio(audio_file)
-    def stream_audio(audio, state):
-        """Handle streaming audio"""
-        if audio is None:
-            return "No audio input", state
-        try:
-            if state is None:
-                state = {"transcripts": []}
-            if isinstance(audio, tuple):
-                sample_rate, audio_array = audio
-            else:
-                audio_array = audio
-                sample_rate = 16000
-            if audio_array is not None and len(audio_array) > 0:
-                if len(audio_array.shape) > 1:
-                    audio_array = np.mean(audio_array, axis=1)
-                audio_array = audio_array.astype(np.float32)
-                max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
-                if max_abs > 1e-6:
-                    audio_array = audio_array / max_abs
-                # Send to API
-                transcript = stt_client.transcribe_streaming(audio_array)
-                if transcript and transcript.strip():
-                    if not state["transcripts"] or transcript != state["transcripts"][-1]:
-                        state["transcripts"].append(transcript)
-            combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎤 Listening..."
-            return combined, state
-        except Exception as e:
-            return f"❌ Error: {str(e)}", state
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
@@ -220,9 +156,9 @@ def create_interface():
         gr.Markdown("""
         ### ✨ Features
         - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
-        - ⚡ **Real-time Processing**: Instant transcription as you speak
-        - 🎯 **High Accuracy**: Competitive with leading ASR models
         - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
         - 🔒 **Private Infrastructure**: Secure and controlled deployment
         """)
@@ -238,67 +174,6 @@ def create_interface():
                 check_btn = gr.Button("🔄 Check Status", size="sm")
                 check_btn.click(check_api_status, outputs=api_status)
-        with gr.Tab("🎤 Real-time Streaming"):
-            gr.Markdown("### Live Microphone Transcription")
-            gr.Markdown("Speak into your microphone for real-time transcription in English or Hindi.")
-            gr.Markdown("""
-            ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
-            Make sure your backend service is running and accessible.
-            """)
-            # Buffer Configuration Controls
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("### 🔧 Streaming Configuration")
-                    gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
-                    buffer_duration = gr.Slider(
-                        minimum=2.0, maximum=6.0, step=0.5, value=3.0,
-                        label="Buffer Duration (seconds)",
-                        info="Size of audio chunks sent to API"
-                    )
-                    process_every_n = gr.Slider(
-                        minimum=2, maximum=8, step=1, value=3,
-                        label="Process Every N Chunks",
-                        info="How often to send audio (higher = less frequent)"
-                    )
-                    min_interval = gr.Slider(
-                        minimum=1.0, maximum=4.0, step=0.5, value=2.0,
-                        label="Min Processing Interval (seconds)",
-                        info="Minimum time between API calls"
-                    )
-                    gr.Markdown("""
-                    **💡 Tuning Tips:**
-                    - **Lower latency**: Decrease buffer duration and interval
-                    - **Better accuracy**: Increase buffer duration
-                    - **Reduce API calls**: Increase process frequency
-                    - **Slow connection**: Increase all values
-                    """)
-            mic_input = gr.Audio(
-                sources=["microphone"],
-                type="numpy",
-                streaming=True,
-                label="🎤 Microphone Input"
-            )
-            live_output = gr.Textbox(
-                label="Live Transcription",
-                lines=8,
-                interactive=False,
-                placeholder="Your transcription will appear here..."
-            )
-            session_state = gr.State(lambda: None)
-            mic_input.stream(
-                fn=stream_audio,
-                inputs=[mic_input, session_state],
-                outputs=[live_output, session_state]
-            )
         with gr.Tab("📁 File Upload"):
             gr.Markdown("### Upload Audio File")
             gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
@@ -330,28 +205,10 @@ def create_interface():
             - Speak naturally at a moderate pace
             - For file upload, ensure audio quality is good (16kHz or higher recommended)
             - Model handles code-switching between English and Hindi
-            """)
-        with gr.Tab("⚙️ Configuration"):
-            gr.Markdown("### API Endpoint Configuration")
-            gr.Markdown(f"""
-            **Current API Endpoint**: `{API_ENDPOINT}`
-            The transcription service runs on a private infrastructure and is accessed via a secure API endpoint.
-            #### How it Works:
-            1. 🎤 You interact with this Hugging Face Space (frontend)
-            2. 📡 Audio is sent to the private API endpoint
-            3. 🤖 The model processes the audio on secure infrastructure
-            4. 📝 Transcription is returned and displayed
-            #### Benefits:
-            - 🔒 **Privacy**: Model and data stay on private infrastructure
-            - ⚡ **Performance**: Dedicated compute resources
-            - 🎯 **Control**: Full control over the model and processing
-            - 💰 **Cost-effective**: Use your own compute resources
-            To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
             """)
         with gr.Tab("ℹ️ About"):

         except Exception as e:
             return f"❌ Error: {str(e)}"
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
         return stt_client.transcribe_audio(audio_file)
     def check_api_status():
         """Check API health status"""
         health = stt_client.check_health()
         gr.Markdown("""
         ### ✨ Features
         - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
+        - 🎯 **High Accuracy**: Competitive with leading ASR models
         - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
+        - ⚡ **Fast Processing**: Optimized for quick transcription
         - 🔒 **Private Infrastructure**: Secure and controlled deployment
         """)
                 check_btn = gr.Button("🔄 Check Status", size="sm")
                 check_btn.click(check_api_status, outputs=api_status)
         with gr.Tab("📁 File Upload"):
             gr.Markdown("### Upload Audio File")
             gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
             - Speak naturally at a moderate pace
             - For file upload, ensure audio quality is good (16kHz or higher recommended)
             - Model handles code-switching between English and Hindi
+            ### 🔧 Backend Configuration
+            This Space connects to a private API endpoint for transcription.
+            The API endpoint can be configured via the `STT_API_ENDPOINT` secret in Space Settings.
             """)
         with gr.Tab("ℹ️ About"):

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio==4.44.0
+numpy
+requests