## Inference

```python
from transformers import (
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,  # BUG FIX: was used below but never imported
    WhisperProcessor,
)
import numpy as np
import librosa
import torch

# Load the fine-tuned Whisper model and its processor; move the model to GPU.
model = WhisperForConditionalGeneration.from_pretrained("userdata/ud-whisper-medium-1").cuda()
processor = WhisperProcessor.from_pretrained("userdata/ud-whisper-medium-1")
_ = model.eval()
# Let generate() choose language/task tokens instead of forcing a fixed prefix.
model.config.forced_decoder_ids = None

sec = 30            # Whisper processes audio in 30-second windows
target_sr = 16_000  # Whisper feature extractor expects 16 kHz input

# Load at the file's native rate, then resample to the target rate.
audio, sr = librosa.load('/home/userdata/ariff-wav2vec2/finetune/2887.mp3', sr=None)
audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

# Split into non-overlapping 30-second chunks; the last chunk may be shorter.
chunk = [audio_array[i: i + (target_sr * sec)] for i in range(0, len(audio_array), target_sr * sec)]

with torch.no_grad():
    # NOTE(review): chunk index 4 is hard-coded — raises IndexError for audio
    # shorter than ~2.5 minutes, and transcribes only that single window.
    input_features = (processor(chunk[4], sampling_rate=target_sr, return_tensors="pt").input_features).cuda()
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# batch_decode already returns a list of strings; no list() wrapper needed.
res = ''.join(transcription)
print(res)
```

## Play Audio

```python
import IPython.display as ipd

# BUG FIX: np.asarray(chunk) over the whole list is ragged (the final chunk is
# usually shorter than 30 s), so NumPy cannot build a rectangular array.
# Play the single chunk that was transcribed instead.
ipd.Audio(data=np.asarray(chunk[4]), autoplay=True, rate=16000)
```