| This is a fine-tuned Whisper small model trained on only 23.8 hours of data. It achieves a word error rate (WER) of 14.73, which is impressive given how little data it was trained on. There is still much room for improvement, which will be addressed in the near future. If anybody wants to collaborate, please be my guest. You can train the model on your own data using [this link](https://github.com/hassanaliemon/BanglaASR). Feel free to share your feedback or connect with me on [LinkedIn](https://www.linkedin.com/in/hassan-ali-emon/). | |
| ```python | |
| import librosa | |
| import torch | |
| import torchaudio | |
| import numpy as np | |
| from transformers import WhisperTokenizer | |
| from transformers import WhisperProcessor | |
| from transformers import WhisperFeatureExtractor | |
| from transformers import WhisperForConditionalGeneration | |
| device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
| audio_path = "https://huggingface.co/hassanaliemon/BanglaASR/resolve/main/test_audio/common_voice_bn_31255511.mp3" | |
| model_path = "hassanaliemon/BanglaASR" | |
| feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path) | |
| tokenizer = WhisperTokenizer.from_pretrained(model_path) | |
| processor = WhisperProcessor.from_pretrained(model_path) | |
| model = WhisperForConditionalGeneration.from_pretrained(model_path).to(device) | |
| speech_array, sampling_rate = torchaudio.load(audio_path, format="mp3") | |
| speech_array = speech_array[0].numpy() | |
| speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16000) | |
| input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features | |
| predicted_ids = model.generate(inputs=input_features.to(device))[0] | |
| transcription = processor.decode(predicted_ids, skip_special_tokens=True) | |
| print(transcription) | |
| ``` |