Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from pathlib import Path | |
| import nemo.collections.asr as nemo_asr | |
| import librosa | |
| import soundfile as sf | |
# Directory that contains this script, as a plain string; predict() builds the
# path of the converted WAV file from it.
base_path = str(Path(__file__).parent)
def _is_scalar(value):
    """Return True for a plain number or a zero-dimensional array scalar."""
    return isinstance(value, (int, float)) or getattr(value, "ndim", None) == 0


def _split_audio_tuple(audio):
    """Return ``(samples, sample_rate)`` from a two-item audio tuple.

    Gradio's ``type="numpy"`` audio arrives as ``(sample_rate, samples)``,
    whereas this module historically expected ``(samples, sample_rate)``.
    Both orders are accepted: whichever item is a scalar is taken as the rate.
    """
    first, second = audio
    if _is_scalar(first):
        return second, first
    return first, second


# Convert audio to 16kHz WAV
def convert_wav_to_16k(input_wav, output_file_path, sr=16000):
    """Resample an audio input to ``sr`` Hz and write it to ``output_file_path``.

    Args:
        input_wav: Either a path to an audio file (``str``) or a two-item tuple
            holding the samples and their sample rate, in either order.
        output_file_path: Destination path handed to ``soundfile.write``.
        sr: Target sample rate in Hz (default 16000).

    Returns:
        ``output_file_path``, unchanged, so the call can be chained.

    Raises:
        ValueError: If ``input_wav`` is ``None`` or of an unsupported type.
    """
    if input_wav is None:
        raise ValueError("No audio file provided")

    if isinstance(input_wav, str):  # filepath
        # librosa resamples while loading when a target rate is given.
        y, s = librosa.load(input_wav, sr=sr)
    elif isinstance(input_wav, tuple):  # samples + sample rate
        y, orig_sr = _split_audio_tuple(input_wav)
        dtype = getattr(y, "dtype", None)
        if dtype is not None and dtype.kind == "i":
            # Integer PCM (e.g. int16 from Gradio) must become floating point
            # before resampling; scale signed samples to the [-1, 1) range.
            y = y.astype("float32") / float(2 ** (8 * dtype.itemsize - 1))
        # NOTE(review): multi-channel input is resampled along the last axis
        # (librosa's default) - confirm the channel layout callers provide.
        y = librosa.resample(y, orig_sr=orig_sr, target_sr=sr)
        s = sr
    else:
        raise ValueError(f"Unsupported audio input type: {type(input_wav)}")

    sf.write(output_file_path, y, s)
    print(f'"{output_file_path}" has been converted to {s}Hz')
    return output_file_path
# Cached ASR model; filled in on first use by _get_arabic_asr_model() so the
# checkpoint is not restored from disk on every request.
_ARABIC_ASR_MODEL = None


def _get_arabic_asr_model():
    """Return the ASR model, restoring the checkpoint only on the first call."""
    global _ARABIC_ASR_MODEL
    if _ARABIC_ASR_MODEL is None:
        _ARABIC_ASR_MODEL = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
            restore_path="FastConformer-Custom-Tokenizer.nemo"
        )
    return _ARABIC_ASR_MODEL


# Load NeMo model and run transcription
def loading_nemo_and_prediction(processed_wav):
    """Transcribe one audio file with the FastConformer checkpoint.

    Args:
        processed_wav: Path of the audio file to transcribe.

    Returns:
        Whatever the model's ``transcribe`` call returns for a one-file list.
    """
    arabic_asr = _get_arabic_asr_model()
    # The file list is passed positionally because the keyword for this
    # argument differs between NeMo releases (``paths2audio_files`` / ``audio``).
    prediction = arabic_asr.transcribe([processed_wav])
    return prediction
def _first_transcript(result):
    """Reduce a ``transcribe`` result to the first transcript as a string.

    The result may be a string, a (possibly nested) list/tuple of results, or
    an object exposing a ``text`` attribute; an empty container yields ``""``.
    """
    while isinstance(result, (list, tuple)):
        if not result:
            return ""
        result = result[0]
    text = getattr(result, "text", result)
    return "" if text is None else str(text)


# Prediction wrapper
def predict(uploaded_wav):
    """Convert the uploaded audio to 16 kHz and return its transcription.

    Args:
        uploaded_wav: Audio value supplied by the Gradio input, or ``None``
            when nothing was uploaded.

    Returns:
        The transcript of the audio as a string, or a short notice when no
        audio was provided.
    """
    if uploaded_wav is None:
        return "No audio file uploaded."

    out_path = base_path + "/converted.wav"
    audio_conversion = convert_wav_to_16k(uploaded_wav, out_path)
    prediction_text = loading_nemo_and_prediction(audio_conversion)
    # The model result is not always a flat list of strings, so normalise it
    # before handing it to the text output.
    return _first_transcript(prediction_text)
# Gradio interface: a single audio input, handed to predict() as a file path,
# and a single text box that shows the transcription.
_audio_input = gr.Audio(
    label="Upload or record audio",
    interactive=True,
    type="filepath",
)
_transcription_output = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=predict,
    inputs=_audio_input,
    outputs=_transcription_output,
)

# Start the app with debug mode on and a public share link requested.
demo.launch(debug=True, share=True)