| from transformers import pipeline | |
| import gradio as gr | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
| import librosa | |
| import numpy as np | |
# Inference runs on the CPU; the model is moved to this device once at import
# time and the input features are moved to it on every request.
device = "cpu"

# Hugging Face Hub id shared by the processor and the model weights.
_MODEL_ID = "Neurai/Persian_ASR"

# The processor is used for feature extraction before generation and for
# decoding the generated token ids afterwards.
processor = WhisperProcessor.from_pretrained(_MODEL_ID)

model = WhisperForConditionalGeneration.from_pretrained(_MODEL_ID)
model = model.to(device)

# Decoder prompt ids requesting Persian ("fa") transcription.
# NOTE(review): nothing in the visible code passes this to model.generate.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="fa",
    task="transcribe",
)
def transcribe(audio):
    """Transcribe a recorded or uploaded audio file to text.

    Args:
        audio: Path to the audio file to transcribe, as produced by the
            ``gr.Audio`` input configured with ``type="filepath"``.
            ``None`` is treated as "no audio was provided".

    Returns:
        The decoded transcription of the first 14 seconds of the audio, or
        an empty string when ``audio`` is ``None``.
    """
    sample_rate = 16000  # rate the audio is loaded at and declared to the processor
    max_seconds = 14  # only this much leading audio is transcribed

    # Without this guard a missing input fails inside librosa.load with an
    # unhelpful error instead of producing an empty result.
    if audio is None:
        return ""

    # librosa.load already downmixes to mono and resamples to `sample_rate`,
    # so no separate to_mono/resample pass is needed afterwards.
    waveform, _ = librosa.load(audio, sr=sample_rate, mono=True)

    # Audio past the window is dropped, not transcribed. Slicing the NumPy
    # array directly avoids boxing every sample into a Python list.
    waveform = waveform[: max_seconds * sample_rate]

    input_features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_features

    # NOTE(review): the module-level `forced_decoder_ids` is not passed here,
    # so language/task presumably come from the checkpoint's own generation
    # config -- confirm this is intended.
    predicted_ids = model.generate(input_features.to(device))

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
# Audio input that accepts either a microphone recording or an uploaded file
# and hands the callback a file path rather than raw samples.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
)

# Single-input, single-output UI: audio in, transcribed text out.
iface = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs="text",
    title="Neura Persian ASR",
    description="Realtime Persian ASR",
)

# Start the web server as soon as the script is executed.
iface.launch()