RealTime-Mic-Transcription-Multilingual

Runtime error

App Files Files Community

WJ88 committed on Nov 8, 2025

Commit

4e726f3

verified ·

1 Parent(s): 76ac8d9

Create app.py

Browse files

Files changed (1) hide show

app.py +159 -0

app.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import gradio as gr
+import nemo.collections.asr as nemo_asr
+import numpy as np
+from pydub import AudioSegment
+from pydub.silence import detect_silence
+import warnings
+import torch
+warnings.filterwarnings("ignore")
+# Global model loader
+model = None
+def load_model():
+    global model
+    if model is None:
+        model = nemo_asr.models.ASRModel.from_pretrained(
+            model_name="nvidia/parakeet-tdt-0.6b-v3",
+            map_location="cpu"
+        )
+        model.eval()
+    return model
+class TranscriptionState:
+    def __init__(self):
+        self.buffer = None
+        self.text = ""
+def transcribe_segment(segment_array: np.ndarray):
+    """Transcribe a normalized audio segment."""
+    load_model()
+    with torch.no_grad(), warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        output = model.transcribe([segment_array])
+    return output[0]
+def process_live_audio(audio: np.ndarray, state: TranscriptionState):
+    """Process live mic audio with VAD and buffer management."""
+    if audio is None or len(audio) == 0:
+        return "", state
+    # Convert to int16 for pydub VAD
+    audio_int16 = (audio * 32767).astype(np.int16)
+    new_segment = AudioSegment(
+        data=audio_int16.tobytes(),
+        frame_rate=16000,
+        sample_width=2,
+        channels=1
+    )
+    # Append to buffer
+    if state.buffer is None:
+        state.buffer = new_segment
+    else:
+        state.buffer += new_segment
+    # Trim buffer to prevent accumulation (keep last 60s)
+    max_duration_ms = 60000
+    if state.buffer.duration_seconds > 60:
+        # Re-transcribe full current buffer before trimming
+        full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32767.0
+        state.text = transcribe_segment(full_array)
+        # Trim to last 30s for ongoing buffer (balances memory and context)
+        state.buffer = state.buffer[-30000:]
+    # VAD: Detect pauses in current buffer
+    silent_windows = detect_silence(
+        state.buffer,
+        min_silence_len=500,  # 0.5s pause
+        silence_thresh=-40    # dB threshold
+    )
+    if len(silent_windows) > 0:
+        last_silence_end = silent_windows[-1][1]
+        if last_silence_end < len(state.buffer):
+            # Transcribe up to end of last silence
+            segment = state.buffer[:last_silence_end]
+            segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32767.0
+            partial_text = transcribe_segment(segment_array)
+            state.text = partial_text
+            # Keep remaining as buffer
+            state.buffer = state.buffer[last_silence_end:]
+    return state.text, state
+def transcribe_file(audio: np.ndarray):
+    """Batch transcribe uploaded file."""
+    if audio is None:
+        return ""
+    load_model()
+    # Assume mono 16kHz; resample if needed (Gradio handles basic)
+    if len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)
+    with torch.no_grad(), warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        output = model.transcribe([audio])
+    return output[0]
+def clear_session(state: TranscriptionState):
+    """Reset session."""
+    state.buffer = None
+    state.text = ""
+    return ""
+# Gradio UI
+with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
+    gr.Markdown(
+        """
+        # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
+        Speak into your microphone for live multilingual transcription. Updates on pauses. Clear to start over.
+        Supports 25 European languages automatically. Optimized for CPU.
+        """
+    )
+    with gr.Tab("Live Microphone"):
+        state = gr.State(TranscriptionState())
+        audio_input = gr.Audio(
+            source="microphone",
+            type="numpy",
+            live=True,
+            label="Speak now..."
+        )
+        output_text = gr.Textbox(
+            label="Live Transcription",
+            lines=10,
+            interactive=False
+        )
+        clear_btn = gr.Button("Clear Session")
+        # Live updates
+        audio_input.change(
+            process_live_audio,
+            inputs=[audio_input, state],
+            outputs=[output_text, state]
+        )
+        clear_btn.click(
+            clear_session,
+            inputs=state,
+            outputs=[output_text, state]
+        )
+    with gr.Tab("File Upload"):
+        file_input = gr.Audio(source="upload", type="numpy")
+        file_output = gr.Textbox(label="File Transcription", lines=10)
+        transcribe_btn = gr.Button("Transcribe File")
+        transcribe_btn.click(
+            transcribe_file,
+            inputs=file_input,
+            outputs=file_output
+        )
+    gr.Markdown(
+        """
+        **Notes:** For best results, speak clearly with short pauses. Long sessions (>1 min) may require clearing to maintain speed.
+        """
+    )
+if __name__ == "__main__":
+    demo.launch()