Spaces:

GavinHuang
/

asr-demo

Running

App Files Files Community

GavinHuang committed on May 3

Commit

779d79b

1 Parent(s): fe027e3

fix: improve audio processing in transcribe function and add soundfile dependency

Browse files

Files changed (2) hide show

app.py +31 -36
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -15,45 +15,40 @@ model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/parakeet-tdt-
 print(f"Model loaded on device: {model.device}")
-@spaces.GPU(duration=120)  # Increase duration if inference takes >60s
 def transcribe(audio, state=""):
-    """
-    Transcribe audio in real-time
-    """
-    # Skip processing if no audio is provided
-    if audio is None:
         return state, state
-    if isinstance(audio, tuple):
-        # If audio is a tuple, assume the first element is the file path
-        print("Received tuple input, extracting first element as file path")
-        audio = audio[0] if len(audio) > 0 else None
-    elif not isinstance(audio, str):
-        raise ValueError(f"Expected audio as a file path (str), got {type(audio)}")
-    if not audio:
-        raise ValueError("No valid audio input provided")
-    global model
-    # Move model to GPU if available
-    if torch.cuda.is_available():
-        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-        model = model.cuda()
-    # Get the sample rate from the audio
-    sample_rate = 16000  # Default to 16kHz if not specified
-    # Process the audio with the ASR model
-    with torch.no_grad():
-        transcription = model.transcribe([audio])[0]
-    # Append new transcription to the state
-    if state == "":
-        new_state = transcription
-    else:
-        new_state = state + " " + transcription
-    model.cpu()
-    return new_state, new_state
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
@@ -91,7 +86,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[audio_input, state],
         outputs=[state, streaming_text],
     )
     # Clear the transcription
     def clear_transcription():
         return "", "", ""

 print(f"Model loaded on device: {model.device}")
+import numpy as np
+import soundfile as sf
+audio_buffer = []
+@spaces.GPU(duration=120)
 def transcribe(audio, state=""):
+    global model, audio_buffer
+    if audio is None or isinstance(audio, int):
+        print(f"Skipping invalid audio input: {type(audio)}")
         return state, state
+    # Append NumPy array to buffer
+    if isinstance(audio, np.ndarray):
+        audio_buffer.append(audio)
+        # Process if buffer has enough data (e.g., 5 seconds at 16kHz)
+        if len(np.concatenate(audio_buffer)) >= 5 * 16000:
+            # Concatenate and preprocess
+            audio_data = np.concatenate(audio_buffer)
+            audio_data = audio_data.mean(axis=1) if audio_data.ndim > 1 else audio_data  # To mono
+            temp_file = "temp_audio.wav"
+            sf.write(temp_file, audio_data, samplerate=16000)
+            # Transcribe
+            if torch.cuda.is_available():
+                model = model.cuda()
+            transcription = model.transcribe([temp_file])[0]
+            model = model.cpu()
+            os.remove(temp_file)
+            # Clear buffer
+            audio_buffer = []
+            new_state = state + " " + transcription if state else transcription
+            return new_state, new_state
+    return state, state
 # Define the Gradio interface
 with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[audio_input, state],
         outputs=[state, streaming_text],
     )
     # Clear the transcription
     def clear_transcription():
         return "", "", ""

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ nemo_toolkit[asr]>=1.18.0
 omegaconf>=2.2.0
 numpy>=1.22.0
 cuda-python>=12.3

 omegaconf>=2.2.0
 numpy>=1.22.0
 cuda-python>=12.3
+soundfile