RealTime-Mic-Transcription-Multilingual

Runtime error

App Files Files Community

WJ88 committed on Nov 8, 2025

Commit

16e01ac

verified ·

1 Parent(s): 6079a5c

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -39

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ from pydub.silence import detect_silence
 import warnings
 import torch
 import logging
 warnings.filterwarnings("ignore")
 logging.getLogger("nemo").setLevel(logging.ERROR)  # Suppress NeMo logs
@@ -25,7 +27,7 @@ def load_model():
 class TranscriptionState:
     def __init__(self):
-        self.buffer = None
         self.text = ""
 def transcribe_segment(segment_array: np.ndarray):
@@ -36,24 +38,22 @@ def transcribe_segment(segment_array: np.ndarray):
         output = model.transcribe([segment_array])
     return output[0]
-def process_live_audio(audio, state: TranscriptionState):
-    """Process live mic audio chunks with VAD and buffer management."""
-    if audio is None or len(audio) == 0:
         return state.text, state
-    sr, audio_data = audio  # Unpack Gradio tuple (sr, data)
-    if sr != 16000:
-        # Basic resampling placeholder; use librosa if needed
-        warnings.warn(f"Unexpected SR {sr}; assuming 16000")
-    # Convert to int16 for pydub VAD
-    audio_int16 = (audio_data * 32767).astype(np.int16)
-    new_segment = AudioSegment(
-        data=audio_int16.tobytes(),
-        frame_rate=16000,
-        sample_width=2,
-        channels=1
-    )
     # Append to buffer
     if state.buffer is None:
@@ -62,10 +62,9 @@ def process_live_audio(audio, state: TranscriptionState):
         state.buffer += new_segment
     # Trim buffer to prevent accumulation (keep last 60s)
-    max_duration_ms = 60000
     if state.buffer.duration_seconds > 60:
         # Re-transcribe full current buffer before trimming
-        full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32767.0
         state.text = transcribe_segment(full_array)
         # Trim to last 30s for ongoing buffer
         state.buffer = state.buffer[-30000:]
@@ -82,7 +81,7 @@ def process_live_audio(audio, state: TranscriptionState):
         if last_silence_end < len(state.buffer):
             # Transcribe up to end of last silence
             segment = state.buffer[:last_silence_end]
-            segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32767.0
             partial_text = transcribe_segment(segment_array)
             state.text = partial_text
             # Keep remaining as buffer
@@ -90,15 +89,16 @@ def process_live_audio(audio, state: TranscriptionState):
     return state.text, state
-def transcribe_file(audio):
-    """Batch transcribe uploaded file."""
-    if audio is None:
         return ""
-    sr, audio_data = audio  # Unpack tuple
-    if sr != 16000:
-        warnings.warn(f"Unexpected SR {sr}; assuming 16000")
-    if len(audio_data.shape) > 1:
-        audio_data = np.mean(audio_data, axis=1)
     load_model()
     with torch.no_grad(), warnings.catch_warnings():
         warnings.simplefilter("ignore")
@@ -111,13 +111,12 @@ def clear_session(state: TranscriptionState):
     state.text = ""
     return "", state
-# Gradio UI
 with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
     gr.Markdown(
         """
         # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
-        Speak into your microphone for live multilingual transcription. Updates on pauses. Clear to start over.
-        Supports 25 European languages automatically. Optimized for CPU.
         """
     )
@@ -125,22 +124,23 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
         state = gr.State(TranscriptionState())
         audio_input = gr.Audio(
             sources=["microphone"],
-            type="numpy",
             streaming=True,
-            label="Speak now..."
         )
         output_text = gr.Textbox(
             label="Live Transcription",
             lines=10,
             interactive=False
         )
-        clear_btn = gr.Button("Clear Session")
-        # Live updates on chunks
         audio_input.change(
             process_live_audio,
             inputs=[audio_input, state],
-            outputs=[output_text, state]
         )
         clear_btn.click(
             clear_session,
@@ -149,7 +149,7 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
         )
     with gr.Tab("File Upload"):
-        file_input = gr.Audio(sources=["upload"], type="numpy")
         file_output = gr.Textbox(label="File Transcription", lines=10)
         transcribe_btn = gr.Button("Transcribe File")
         transcribe_btn.click(
@@ -160,9 +160,9 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
     gr.Markdown(
         """
-        **Notes:** For best results, speak clearly with short pauses. Long sessions (>1 min) may require clearing to maintain speed.
         """
     )
 if __name__ == "__main__":
-    demo.launch()

 import warnings
 import torch
 import logging
+import io
+import librosa
 warnings.filterwarnings("ignore")
 logging.getLogger("nemo").setLevel(logging.ERROR)  # Suppress NeMo logs
 class TranscriptionState:
     def __init__(self):
+        self.buffer = None  # AudioSegment
         self.text = ""
 def transcribe_segment(segment_array: np.ndarray):
         output = model.transcribe([segment_array])
     return output[0]
+def process_live_audio(chunk_bytes, state: TranscriptionState):
+    """Process live mic PCM bytes chunk with VAD and buffer management."""
+    if chunk_bytes is None or len(chunk_bytes) == 0:
         return state.text, state
+    # Create AudioSegment from raw PCM bytes (16kHz mono int16)
+    try:
+        new_segment = AudioSegment(
+            data=chunk_bytes,
+            frame_rate=16000,
+            sample_width=2,
+            channels=1
+        )
+    except Exception as e:
+        print(f"Chunk creation error: {e}")
+        return state.text, state
     # Append to buffer
     if state.buffer is None:
         state.buffer += new_segment
     # Trim buffer to prevent accumulation (keep last 60s)
     if state.buffer.duration_seconds > 60:
         # Re-transcribe full current buffer before trimming
+        full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32768.0
         state.text = transcribe_segment(full_array)
         # Trim to last 30s for ongoing buffer
         state.buffer = state.buffer[-30000:]
         if last_silence_end < len(state.buffer):
             # Transcribe up to end of last silence
             segment = state.buffer[:last_silence_end]
+            segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
             partial_text = transcribe_segment(segment_array)
             state.text = partial_text
             # Keep remaining as buffer
     return state.text, state
+def transcribe_file(audio_path):
+    """Batch transcribe uploaded file path."""
+    if audio_path is None or not os.path.exists(audio_path):
         return ""
+    try:
+        audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)
+        if len(audio_data) == 0:
+            return ""
+    except Exception:
+        return "Error loading file."
     load_model()
     with torch.no_grad(), warnings.catch_warnings():
         warnings.simplefilter("ignore")
     state.text = ""
     return "", state
+# Gradio UI with Blocks for tabs
 with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
     gr.Markdown(
         """
         # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
+        Speak continuously into the microphone—transcription updates live on natural pauses (0.5s+). Supports 25 European languages automatically. Optimized for CPU.
         """
     )
         state = gr.State(TranscriptionState())
         audio_input = gr.Audio(
             sources=["microphone"],
+            type="bytes",
             streaming=True,
+            label="Speak now—updates on pauses"
         )
         output_text = gr.Textbox(
             label="Live Transcription",
             lines=10,
             interactive=False
         )
+        clear_btn = gr.Button("Clear Session", variant="secondary")
+        # Stream updates on each chunk
         audio_input.change(
             process_live_audio,
             inputs=[audio_input, state],
+            outputs=[output_text, state],
+            show_progress=False  # Avoid UI flicker during fast chunks
         )
         clear_btn.click(
             clear_session,
         )
     with gr.Tab("File Upload"):
+        file_input = gr.Audio(sources=["upload"], type="filepath")
         file_output = gr.Textbox(label="File Transcription", lines=10)
         transcribe_btn = gr.Button("Transcribe File")
         transcribe_btn.click(
     gr.Markdown(
         """
+        **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Clear resets buffer for fresh starts.
         """
     )
 if __name__ == "__main__":
+    demo.launch(share=False, debug=True)