# Transcribe the audio file "farsi.wav" with a Hugging Face speech-recognition
# pipeline and print the recognized text.
from transformers import pipeline

# Build the automatic-speech-recognition pipeline once, up front.
# NOTE(review): "multi-lingual-stt" is passed straight through as the model
# identifier; whether it names a local directory or a hub repository cannot be
# determined from this script -- confirm.
pipe = pipeline(
    "automatic-speech-recognition",
    model="multi-lingual-stt",
    # Timestamps are requested from the pipeline even though only the "text"
    # entry of the output is read below.
    # NOTE(review): presumably kept for long-form audio support -- confirm
    # before removing.
    return_timestamps=True,
)

# Run inference on the audio file and keep only the "text" entry of the output.
result = pipe("farsi.wav")["text"]
print(result)