Spaces:

RinggAI
/

STT

Sleeping

App Files Files Community

harsh2ai commited on 11 days ago

Commit

f4a0156

1 Parent(s): dea7e3e

updated the logic

Browse files

Files changed (1) hide show

app.py +284 -127

app.py CHANGED Viewed

@@ -1,37 +1,34 @@
 #!/usr/bin/env python3
-# updated
 """
 Ringg Parrot STT V1 🦜 - Hugging Face Space (Frontend)
-Makes API calls to private inference endpoint via ngrok
 """
 import os
-import base64
 from pathlib import Path
-# os.environ.setdefault("GRADIO_API_INFO_ENABLED", "false")
 import gradio as gr
 import requests
 from dotenv import load_dotenv
-load_dotenv()  # reads variables from a .env file and sets them in os.environ
-LOGO_BASE64 = ""
-logo_path = Path(__file__).parent / "logo.png"
-if logo_path.exists():
-    try:
-        LOGO_BASE64 = base64.b64encode(logo_path.read_bytes()).decode("utf-8")
-    except Exception as e:
-        print(f"⚠️ Unable to load logo.png: {e}")
-DEFAULT_LOGO_URL = "https://storage.googleapis.com/desivocal-prod/desi-vocal/logo.png"
-LOGO_URL = os.environ.get("STT_LOGO_URL", DEFAULT_LOGO_URL).strip()
-# Backend API endpoint (ngrok URL)
-# You can update this via Hugging Face Space Secrets
-API_ENDPOINT = os.environ.get("STT_API_ENDPOINT", "")
 class RinggSTTClient:
@@ -43,49 +40,79 @@ class RinggSTTClient:
         self.session.headers.update({"User-Agent": "RinggSTT-HF-Space/1.0"})
     def check_health(self) -> dict:
-        """Check if the API is available"""
         try:
             response = self.session.get(f"{self.api_endpoint}/health", timeout=5)
             if response.status_code == 200:
                 return {"status": "healthy", "message": "✅ API is online"}
-            else:
-                return {
-                    "status": "error",
-                    "message": f"❌ API returned status {response.status_code}",
-                }
-        except requests.exceptions.Timeout:
-            return {"status": "error", "message": "⏱️ API request timed out"}
-        except requests.exceptions.ConnectionError:
-            return {"status": "error", "message": "❌ Cannot connect to API"}
         except Exception as e:
             return {"status": "error", "message": f"❌ Error: {str(e)}"}
-    def transcribe_audio(self, audio_file_path: str) -> str:
-        """Transcribe audio file via API"""
         try:
-            # Read audio file and encode as base64
-            with open(audio_file_path, "rb") as f:
-                audio_data = f.read()
-            audio_base64 = base64.b64encode(audio_data).decode("utf-8")
-            # Make API request
-            response = self.session.post(
-                f"{self.api_endpoint}/transcribe",
-                json={"audio_data": audio_base64, "sample_rate": 16000},
-                timeout=30,
-            )
             if response.status_code == 200:
                 result = response.json()
                 return result.get("transcription", "No transcription received")
             else:
-                return f"❌ API Error: {response.status_code} - {response.text}"
-        except requests.exceptions.Timeout:
-            return "⏱️ Request timed out. The audio file might be too long."
-        except requests.exceptions.ConnectionError:
-            return "❌ Cannot connect to the transcription service. Please try again later."
         except Exception as e:
             return f"❌ Error: {str(e)}"
@@ -93,35 +120,140 @@ class RinggSTTClient:
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
-# Check health on startup
 health_status = stt_client.check_health()
 print(f"API Health: {health_status}")
-def create_interface():
-    """Create Gradio interface"""
-    def transcribe_audio(audio_file):
-        """Transcribe uploaded audio"""
-        if audio_file is None:
-            return "⚠️ Please upload an audio file to transcribe."
-        transcription = stt_client.transcribe_audio(audio_file)
-        text = (transcription or "").strip()
-        if not text or text.startswith("❌") or text.startswith("⏱"):
-            return text or "⚠️ No speech detected—try a clearer recording."
-        # footer = "(Served via API • Remote backend)"
-        return text
-    def check_api_status():
-        """Check API health status"""
-        health = stt_client.check_health()
-        return health["message"]
-    # Create interface
     with gr.Blocks(
         theme=gr.themes.Base(
             font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
@@ -129,48 +261,104 @@ def create_interface():
         css=".gradio-container {max-width: none !important;}",
     ) as demo:
         gr.HTML("""
-                <div style="display: flex; align-items: center; gap: 10px;">
-                    <img style="width: 50px; height: 50px; background-color: white; border-radius: 10%;" src="https://storage.googleapis.com/desivocal-prod/desi-vocal/ringg.svg" alt="Logo">
-                    <h1 style="margin: 0;">Ringg Parrot STT V1.0 🦜</h1>
-                </div>
-                """)
-        gr.Markdown(
-            """
-            ## 📁 Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
-            """
-        )
-        with gr.Column():
-            with gr.Row():
                 audio_input = gr.Audio(
                     type="filepath",
                     sources=["upload"],
-                    show_label=False,
                 )
                 file_output = gr.Textbox(
-                    info="<p style=' text-align: center;'>Transcription Result</p>",
-                    container=False,
-                    lines=10,
                     interactive=False,
                 )
-            transcribe_btn = gr.Button("Transcribe", variant="primary", size="lg")
         transcribe_btn.click(
-            transcribe_audio,
-            inputs=audio_input,
             outputs=file_output,
-            concurrency_limit=1,
         )
-        gr.Markdown(
-            """
             <br>
             ## 🎯 Performance Benchmarks
-            **Ringg Parrot STT V1** Ranks **1st** Among Top Models, Outperforming OpenAI Whisper Large-v3 and Other Leading Solutions.
-            """
-        )
         with gr.Row():
             gr.DataFrame(
@@ -186,48 +374,17 @@ def create_interface():
                 interactive=False,
             )
-        # gr.Markdown(
-        #     """
-        #     <br>
-        #     ##  ✨ Features
-        #     - **Hindi Support**: Accurate transcription for Hindi audio
-        #     - **High Accuracy**: Competitive with leading ASR models
-        #     - **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
-        #     - **Fast Processing**: Optimized for quick transcription
-        #     """
-        # )
-        gr.Markdown(
-            """
-            <br>
-            ##  ⚠️ Benchmark Disclaimer
-            - Evaluated on a modified FLEURS subset to ensure consistent Hindi coverage
-            - Dataset issues include inaudible segments and repeated sentences caused by interruptions
-            - Background noise is prominent across many clips, impacting recognition quality
-            - Mixed Hindi-English speech often provides Hindi-only transcripts
-            - Currency, time, and year normalization is inconsistent with spoken forms
-            - Original transcripts lack punctuation, increasing WER for models that predict it
-            """
-        )
-        gr.Markdown(
-            """
-            <br>
-            ##  🙏 Acknowledgements
-            - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
             - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
-            """
-        )
     return demo
-# Launch the app
 if __name__ == "__main__":
     print("🌐 Launching Ringg Parrot STT V1 Gradio Interface...")
     demo = create_interface()
     demo.queue(default_concurrency_limit=2, max_size=20)
     demo.launch(

 #!/usr/bin/env python3
 """
 Ringg Parrot STT V1 🦜 - Hugging Face Space (Frontend)
+Real-time streaming transcription using Gradio's audio streaming.
 """
 import os
+import tempfile
 from pathlib import Path
 import gradio as gr
 import requests
+import numpy as np
+import soundfile as sf
 from dotenv import load_dotenv
+try:
+    import librosa
+    HAS_LIBROSA = True
+except ImportError:
+    HAS_LIBROSA = False
+    print("⚠️ librosa not installed. Install with: pip install librosa")
+load_dotenv()
+# Backend API endpoint
+API_ENDPOINT = os.environ.get("STT_API_ENDPOINT", "http://localhost:7864")
+TARGET_SAMPLE_RATE = 16000
+# How often to transcribe (in seconds of audio)
+MIN_AUDIO_LENGTH = 0.4  # Transcribe when we have at least 400ms of new audio
 class RinggSTTClient:
         self.session.headers.update({"User-Agent": "RinggSTT-HF-Space/1.0"})
     def check_health(self) -> dict:
         try:
             response = self.session.get(f"{self.api_endpoint}/health", timeout=5)
             if response.status_code == 200:
                 return {"status": "healthy", "message": "✅ API is online"}
+            return {"status": "error", "message": f"❌ API returned status {response.status_code}"}
         except Exception as e:
             return {"status": "error", "message": f"❌ Error: {str(e)}"}
+    def transcribe_audio_data(self, audio_data: np.ndarray, sample_rate: int, language: str = "hi") -> str:
+        """Transcribe audio data (numpy array) via multipart upload API"""
         try:
+            # Save to temporary WAV file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                temp_path = f.name
+                sf.write(temp_path, audio_data, sample_rate)
+            try:
+                with open(temp_path, "rb") as f:
+                    files = {"file": ("audio.wav", f, "audio/wav")}
+                    data = {"language": language, "punctuate": "false"}
+                    response = self.session.post(
+                        f"{self.api_endpoint}/v1/audio/transcriptions",
+                        files=files,
+                        data=data,
+                        timeout=30,
+                    )
+                # Debug: log the response for troubleshooting
+                print(
+                    f"[transcribe_audio_data] status={response.status_code} "
+                    f"body={response.text[:500]}"
+                )
+                if response.status_code == 200:
+                    result = response.json()
+                    if "transcription_channel_0" in result:
+                        return result.get("transcription_channel_0", "")
+                    return result.get("transcription", "")
+                else:
+                    return ""
+            finally:
+                os.unlink(temp_path)
+        except Exception as e:
+            print(f"Transcription error: {e}")
+            return ""
+    def transcribe_file(self, audio_file_path: str, language: str = "hi") -> str:
+        """Transcribe audio file via multipart upload API"""
+        try:
+            with open(audio_file_path, "rb") as f:
+                files = {"file": (Path(audio_file_path).name, f)}
+                data = {"language": language, "punctuate": "false"}
+                response = self.session.post(
+                    f"{self.api_endpoint}/v1/audio/transcriptions",
+                    files=files,
+                    data=data,
+                    timeout=120,
+                )
             if response.status_code == 200:
                 result = response.json()
+                if "transcription_channel_0" in result:
+                    transcripts = []
+                    if result.get("transcription_channel_0"):
+                        transcripts.append(result["transcription_channel_0"])
+                    if result.get("transcription_channel_1"):
+                        transcripts.append(f"\n[Channel 2]: {result['transcription_channel_1']}")
+                    return "".join(transcripts) if transcripts else "No speech detected"
                 return result.get("transcription", "No transcription received")
             else:
+                return f"❌ API Error: {response.status_code}"
         except Exception as e:
             return f"❌ Error: {str(e)}"
 # Initialize API client
 print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
 stt_client = RinggSTTClient(API_ENDPOINT)
 health_status = stt_client.check_health()
 print(f"API Health: {health_status}")
+def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+    """Resample audio to target sample rate"""
+    if orig_sr == target_sr:
+        return audio
+    if HAS_LIBROSA:
+        return librosa.resample(audio.astype(np.float64), orig_sr=orig_sr, target_sr=target_sr)
+    else:
+        # Simple linear interpolation fallback
+        duration = len(audio) / orig_sr
+        new_length = int(duration * target_sr)
+        indices = np.linspace(0, len(audio) - 1, new_length)
+        return np.interp(indices, np.arange(len(audio)), audio.astype(np.float64))
+def transcribe_stream(audio, language, audio_buffer, last_transcription, samples_processed):
+    """
+    Process streaming audio from microphone.
+    Simplified approach:
+    - Accumulate ALL audio chunks
+    - When we have enough new audio, transcribe the ENTIRE recording
+    - Display the complete transcription (backend handles everything)
+    """
+    # Initialize states
+    if audio_buffer is None:
+        audio_buffer = []
+    if last_transcription is None:
+        last_transcription = ""
+    if samples_processed is None:
+        samples_processed = 0
+    # Handle invalid audio input
+    if audio is None or isinstance(audio, int):
+        display = last_transcription if last_transcription else "🎤 Click microphone to start..."
+        return display, audio_buffer, last_transcription, samples_processed
+    # Gradio streaming returns (sample_rate, audio_data)
+    if not isinstance(audio, tuple) or len(audio) != 2:
+        display = last_transcription if last_transcription else "🎤 Listening..."
+        return display, audio_buffer, last_transcription, samples_processed
+    sample_rate, audio_data = audio
+    if not isinstance(audio_data, np.ndarray) or len(audio_data) == 0:
+        display = last_transcription if last_transcription else "🎤 Listening..."
+        return display, audio_buffer, last_transcription, samples_processed
+    # Convert stereo to mono if needed
+    if len(audio_data.shape) > 1:
+        audio_data = np.mean(audio_data, axis=1)
+    # Append this chunk to buffer
+    audio_buffer.append(audio_data.copy())
+    # Calculate total samples we have now
+    total_samples = sum(len(arr) for arr in audio_buffer)
+    total_duration = total_samples / sample_rate
+    # Calculate new audio since last transcription
+    new_samples = total_samples - samples_processed
+    new_duration = new_samples / sample_rate
+    # Only transcribe if we have enough NEW audio (to avoid too frequent API calls)
+    if new_duration < MIN_AUDIO_LENGTH:
+        display = last_transcription if last_transcription else f"🎤 Recording... ({total_duration:.1f}s)"
+        return display, audio_buffer, last_transcription, samples_processed
+    try:
+        # Concatenate ALL buffered audio
+        full_audio = np.concatenate(audio_buffer)
+        # Resample to 16kHz if needed
+        if sample_rate != TARGET_SAMPLE_RATE:
+            full_audio = resample_audio(full_audio, sample_rate, TARGET_SAMPLE_RATE)
+        # Normalize audio
+        max_val = np.max(np.abs(full_audio))
+        if max_val > 0:
+            full_audio = full_audio / max_val * 0.95
+        # Get language code
+        lang_code = "hi" if language == "Hindi" else "en"
+        # Transcribe the ENTIRE audio
+        transcription = stt_client.transcribe_audio_data(
+            full_audio.astype(np.float32),
+            TARGET_SAMPLE_RATE,
+            lang_code
+        )
+        # Update state
+        if transcription.strip():
+            last_transcription = transcription
+        # Mark all current samples as processed
+        samples_processed = total_samples
+        display = last_transcription if last_transcription else f"🎤 Recording... ({total_duration:.1f}s)"
+        return display, audio_buffer, last_transcription, samples_processed
+    except Exception as e:
+        print(f"Processing error: {e}")
+        display = last_transcription if last_transcription else "🎤 Listening..."
+        return display, audio_buffer, last_transcription, samples_processed
+def clear_transcription():
+    """Clear all transcription state"""
+    return "🎤 Click microphone to start...", None, "", 0
+def transcribe_file(audio_file, language):
+    """Transcribe uploaded audio file"""
+    if audio_file is None:
+        return "⚠️ Please upload an audio file to transcribe."
+    lang_code = "hi" if language == "Hindi" else "en"
+    transcription = stt_client.transcribe_file(audio_file, lang_code)
+    text = (transcription or "").strip()
+    if not text or text.startswith("❌") or text.startswith("⏱"):
+        return text or "⚠️ No speech detected—try a clearer recording."
+    return text
+def create_interface():
+    """Create Gradio interface"""
     with gr.Blocks(
         theme=gr.themes.Base(
             font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
         css=".gradio-container {max-width: none !important;}",
     ) as demo:
         gr.HTML("""
+            <div style="display: flex; align-items: center; gap: 10px;">
+                <img style="width: 50px; height: 50px; background-color: white; border-radius: 10%;"
+                     src="https://storage.googleapis.com/desivocal-prod/desi-vocal/ringg.svg" alt="Logo">
+                <h1 style="margin: 0;">Ringg Parrot STT V1.0 🦜</h1>
+            </div>
+        """)
+        # Real-time streaming section
+        gr.Markdown("""
+            ## 🎤 Real-time Transcription
+            Click the microphone to start recording. Transcription updates as you speak.
+            *The entire recording is transcribed each time, so text may refine as more context is added.*
+        """)
+        # States for streaming
+        audio_buffer = gr.State(None)
+        last_transcription = gr.State("")
+        samples_processed = gr.State(0)
+        with gr.Row():
+            with gr.Column(scale=1):
+                stream_language = gr.Dropdown(
+                    choices=["Hindi", "English"],
+                    value="Hindi",
+                    label="Language",
+                )
                 audio_input = gr.Audio(
+                    sources=["microphone"],
+                    type="numpy",
+                    streaming=True,
+                    label="🎤 Click to start recording",
+                )
+                clear_btn = gr.Button("🗑️ Clear & Reset", variant="secondary")
+            with gr.Column(scale=2):
+                text_output = gr.Textbox(
+                    label="Transcription",
+                    value="🎤 Click microphone to start...",
+                    lines=10,
+                    interactive=False,
+                )
+        # Wire up streaming
+        audio_input.stream(
+            fn=transcribe_stream,
+            inputs=[audio_input, stream_language, audio_buffer, last_transcription, samples_processed],
+            outputs=[text_output, audio_buffer, last_transcription, samples_processed],
+        )
+        # Clear button
+        clear_btn.click(
+            fn=clear_transcription,
+            inputs=[],
+            outputs=[text_output, audio_buffer, last_transcription, samples_processed],
+        )
+        gr.Markdown("<br>")
+        # File upload section
+        gr.Markdown("""
+            ## 📁 Upload an audio file for transcription
+            Supports WAV, MP3, FLAC, M4A, and more.
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                file_language = gr.Dropdown(
+                    choices=["Hindi", "English"],
+                    value="Hindi",
+                    label="Language",
+                )
+                file_input = gr.Audio(
                     type="filepath",
                     sources=["upload"],
+                    label="Upload Audio",
                 )
+                transcribe_btn = gr.Button("Transcribe File", variant="primary", size="lg")
+            with gr.Column(scale=2):
                 file_output = gr.Textbox(
+                    label="Transcription",
+                    lines=8,
                     interactive=False,
                 )
         transcribe_btn.click(
+            fn=transcribe_file,
+            inputs=[file_input, file_language],
             outputs=file_output,
         )
+        gr.Markdown("""
             <br>
             ## 🎯 Performance Benchmarks
+            **Ringg Parrot STT V1** Ranks **1st** Among Top Models.
+        """)
         with gr.Row():
             gr.DataFrame(
                 interactive=False,
             )
+        gr.Markdown("""
+            ## 🙏 Acknowledgements
             - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
+        """)
     return demo
 if __name__ == "__main__":
     print("🌐 Launching Ringg Parrot STT V1 Gradio Interface...")
+    print(f"Backend API: {API_ENDPOINT}")
     demo = create_interface()
     demo.queue(default_concurrency_limit=2, max_size=20)
     demo.launch(