"""Realtime Persian ASR Gradio demo using the Neurai/Persian_ASR Whisper model."""

from transformers import pipeline
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np

device = 'cpu'

# Sampling rate expected by the Whisper feature extractor, and the maximum
# clip length (seconds) fed to the model per request.
SAMPLE_RATE = 16000
MAX_SECONDS = 14

processor = WhisperProcessor.from_pretrained("Neurai/Persian_ASR")
model = WhisperForConditionalGeneration.from_pretrained("Neurai/Persian_ASR").to(device)

# Decoder prompt that forces Persian ("fa") transcription output.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="fa", task="transcribe")


def transcribe(audio):
    """Transcribe an audio file to Persian text.

    Parameters
    ----------
    audio : str
        Path to an audio file (Gradio supplies a filepath).

    Returns
    -------
    str
        Transcription of (at most) the first 14 seconds of the audio.
    """
    # librosa.load with sr=16000, mono=True already downmixes to mono and
    # resamples to 16 kHz, so the original separate to_mono/resample passes
    # were redundant no-ops and have been removed.
    array, _ = librosa.load(audio, sr=SAMPLE_RATE, mono=True)

    # Cap the input at MAX_SECONDS of audio; slice the array directly
    # instead of converting it to a Python list first.
    clip = array[: MAX_SECONDS * SAMPLE_RATE]

    input_features = processor(
        clip, sampling_rate=SAMPLE_RATE, return_tensors="pt"
    ).input_features

    # BUG FIX: forced_decoder_ids was computed at module level but never
    # passed to generate(), so the "fa"/"transcribe" prompt was silently
    # ignored and Whisper auto-detected language/task instead.
    predicted_ids = model.generate(
        input_features.to(device), forced_decoder_ids=forced_decoder_ids
    )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Neura Persian ASR",
    description="Realtime Persian ASR",
)

iface.launch()