Haitam03 committed on
Commit
4bb1fb5
·
verified ·
1 Parent(s): 7daba76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -26
app.py CHANGED
@@ -1,42 +1,67 @@
1
  import gradio as gr
2
  import numpy as np
3
  import nemo.collections.asr as nemo_asr
 
 
4
 
# Fetch the pretrained FastConformer hybrid (RNNT + CTC) checkpoint once, at
# import time, so every request reuses the same model instance.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0"
)
9
 
10
def _resample_linear(samples, src_rate, dst_rate):
    """Resample a 1-D float signal from ``src_rate`` to ``dst_rate`` Hz.

    Uses plain linear interpolation (``np.interp``) because the file only
    depends on NumPy. No anti-aliasing filter is applied, so this is a
    pragmatic fix rather than a high-quality resampler.
    """
    if src_rate == dst_rate or samples.size == 0:
        return samples
    n_out = int(round(samples.size * dst_rate / src_rate))
    if n_out <= 0:
        return samples[:0]
    src_times = np.arange(samples.size) / src_rate
    dst_times = np.arange(n_out) / dst_rate
    return np.interp(dst_times, src_times, samples).astype(np.float32)


def transcribe(stream, new_chunk):
    """Append one microphone chunk to the running stream and re-transcribe it.

    Args:
        stream: Audio accumulated by previous calls (1-D float32 array), or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` pair for the newest chunk.

    Returns:
        ``(stream, transcription)``: the updated accumulated audio, to be fed
        back in as state on the next call, and the model output for it.
    """
    # The model is assumed to expect 16 kHz input - TODO confirm against the
    # model card. Previously the chunk's sample rate was unpacked but never
    # used, so audio at the capture rate was passed to the model unchanged.
    model_sample_rate = 16000

    sr, y = new_chunk
    y = np.asarray(y)

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize this chunk; skip silent or empty chunks to avoid dividing
    # by zero (and to avoid np.max raising on a zero-length array).
    y = y.astype(np.float32)
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    y = _resample_linear(y, sr, model_sample_rate)

    # Accumulate audio stream
    stream = y if stream is None else np.concatenate([stream, y])

    # The whole accumulated stream is transcribed again on every chunk, so the
    # cost of each call grows with the length of the recording.
    transcription = asr_model.transcribe([stream], batch_size=1)[0]

    return stream, transcription
33
 
 
# Live streaming UI: microphone chunks go to `transcribe` together with the
# accumulated-audio state; the updated state and the text come back out.
_microphone = gr.Audio(sources=["microphone"], streaming=True)

demo = gr.Interface(
    fn=transcribe,
    inputs=["state", _microphone],
    outputs=["state", "text"],
    live=True,
    api_name="predict",
)
41
 
42
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import numpy as np
3
  import nemo.collections.asr as nemo_asr
4
+ import tempfile
5
+ import soundfile as sf
6
 
# Fetch the pretrained FastConformer hybrid (RNNT + CTC) checkpoint once, at
# import time, so every request reuses the same model instance.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ar_fastconformer_hybrid_large_pcd_v1.0"
)
11
 
12
def _hypothesis_to_text(result):
    """Reduce the value returned by ``asr_model.transcribe`` to a plain string.

    NOTE(review): NeMo releases appear to differ in what ``transcribe`` returns
    for hybrid RNNT/CTC models (a list of strings, a ``(best, all)`` tuple of
    lists, or a list of hypothesis objects exposing ``.text``) - confirm
    against the installed version. All of these shapes are unwrapped here so
    the text box always receives a string.
    """
    while isinstance(result, (list, tuple)):
        if not result:
            return ""
        result = result[0]
    text = getattr(result, "text", result)
    return text if isinstance(text, str) else str(text)


def transcribe_audio(audio_file):
    """Transcribe a complete audio file (no streaming).

    The file is read, down-mixed to mono, peak-normalized, written to a
    temporary WAV file and handed to the model by path.

    Args:
        audio_file: Path to the recorded/uploaded audio, or ``None`` when the
            user submitted nothing.

    Returns:
        The transcription text, or a human-readable message when no file was
        given or when any step failed.
    """
    import contextlib
    import os

    if audio_file is None:
        return "Please upload an audio file."

    try:
        # Read the audio file
        audio_data, sample_rate = sf.read(audio_file)

        if audio_data.size == 0:
            return "Error during transcription: the audio file contains no samples."

        # Convert to mono if stereo
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Peak-normalize; a fully silent file is left untouched to avoid
        # dividing by zero.
        audio_data = audio_data.astype(np.float32)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data /= peak

        # Reserve a temp path and close the handle before writing, so the
        # writer is not competing with an open handle on the same file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path = tmp_file.name

        try:
            # The file is written at its ORIGINAL sample rate; nothing here
            # resamples it. NOTE(review): an earlier comment said "at 16kHz
            # (required by model)" - confirm that NeMo resamples file inputs
            # on load, otherwise add an explicit resampling step.
            sf.write(tmp_path, audio_data, sample_rate)

            # Pass the file path directly to the NeMo model
            result = asr_model.transcribe([tmp_path])
        finally:
            # delete=False means nobody else removes the file; without this
            # every request would leave a WAV behind in the temp directory.
            with contextlib.suppress(OSError):
                os.remove(tmp_path)

        return _hypothesis_to_text(result)

    except Exception as e:
        # UI boundary: surface the failure in the text box instead of raising.
        return f"Error during transcription: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
# Gradio UI: a single audio input (microphone recording or uploaded file,
# handed to the callback as a file path) feeding a multi-line text box.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Upload Audio or Record",
)

_transcription_box = gr.Textbox(
    label="Transcription",
    lines=5,
    placeholder="Your transcription will appear here...",
)

demo = gr.Interface(
    fn=transcribe_audio,
    inputs=_audio_input,
    outputs=_transcription_box,
    title="Arabic Speech Recognition with NVIDIA FastConformer",
    description="Upload an audio file or record your voice to get the transcription. This model supports Arabic language.",
    examples=[],
    cache_examples=False,
    api_name="transcribe",
)
66
 
67
  if __name__ == "__main__":