Spaces:

Haitam03
/

asr_live

Running

App Files Files Community

Haitam03 committed on Dec 27, 2025

Commit

4bb1fb5

verified ·

1 Parent(s): 7daba76

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -26

app.py CHANGED Viewed

@@ -1,42 +1,67 @@
 import gradio as gr
 import numpy as np
 import nemo.collections.asr as nemo_asr
 # Load the NVIDIA FastConformer model
 asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
     "nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0"
 )
-def transcribe(stream, new_chunk):
-    sr, y = new_chunk
-    # Convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    # Normalize audio
-    y = y.astype(np.float32)
-    if np.max(np.abs(y)) > 0:
-        y /= np.max(np.abs(y))
-    # Accumulate audio stream
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    # Transcribe using NeMo model
-    # The model expects a list of audio arrays
-    transcription = asr_model.transcribe([stream], batch_size=1)[0]
-    return stream, transcription
 demo = gr.Interface(
-    transcribe,
-    ["state", gr.Audio(sources=["microphone"], streaming=True)],
-    ["state", "text"],
-    live=True,
-    api_name="predict"
 )
 if __name__ == "__main__":

 import gradio as gr
 import numpy as np
 import nemo.collections.asr as nemo_asr
+import tempfile
+import soundfile as sf
 # Load the NVIDIA FastConformer model
 asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
     "nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0"
 )
+def transcribe_audio(audio_file):
+    """
+    Transcribe a complete audio file without streaming.
+    This is more reliable and produces better results on low-resource systems.
+    """
+    # Guard for a missing input: answer with a prompt instead of failing
+    if audio_file is None:
+        return "Please upload an audio file."
+    try:
+        # Read the audio file (samples plus the file's own sample rate)
+        audio_data, sample_rate = sf.read(audio_file)
+        # Convert to mono if stereo by averaging across axis 1
+        if audio_data.ndim > 1:
+            audio_data = audio_data.mean(axis=1)
+        # Peak-normalize as float32; the > 0 check avoids dividing by zero on silent input
+        audio_data = audio_data.astype(np.float32)
+        if np.max(np.abs(audio_data)) > 0:
+            audio_data /= np.max(np.abs(audio_data))
+        # Save the processed audio to a temporary WAV file.
+        # NOTE(review): the file is written at the input's own sample_rate; nothing
+        # here resamples to 16 kHz, so confirm the model accepts other rates.
+        # NOTE(review): delete=False and no later removal, so the file is left on disk.
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+            sf.write(tmp_file.name, audio_data, sample_rate)
+            tmp_path = tmp_file.name
+        # Transcribe using NeMo model
+        # Pass the file path directly - more memory efficient
+        transcription = asr_model.transcribe([tmp_path])[0]
+        return transcription
+    except Exception as e:
+        # Any failure is returned as a message string rather than raised
+        return f"Error during transcription: {str(e)}"
+# Create Gradio interface: one audio input wired to transcribe_audio,
+# one text box for the returned transcription (or error message)
 demo = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(
+        sources=["microphone", "upload"],
+        # "filepath" matches transcribe_audio, which opens its argument with sf.read
+        type="filepath",
+        label="Upload Audio or Record"
+    ),
+    outputs=gr.Textbox(
+        label="Transcription",
+        lines=5,
+        placeholder="Your transcription will appear here..."
+    ),
+    title="Arabic Speech Recognition with NVIDIA FastConformer",
+    description="Upload an audio file or record your voice to get the transcription. This model supports Arabic language.",
+    examples=[],
+    cache_examples=False,
+    api_name="transcribe"
 )
 if __name__ == "__main__":