File size: 1,079 Bytes
5e7da56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
## Inference
```python
# Transcribe one 30-second chunk of an MP3 with a fine-tuned Whisper model.
# Bug fix: WhisperForConditionalGeneration was used below but never imported.
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor
import numpy as np
import librosa
import torch

# Load the fine-tuned checkpoint and its processor; run the model on the GPU.
model = WhisperForConditionalGeneration.from_pretrained("userdata/ud-whisper-medium-1").cuda()
processor = WhisperProcessor.from_pretrained("userdata/ud-whisper-medium-1")

_ = model.eval()
# Clear any forced decoder ids so generate() chooses language/task tokens itself.
model.config.forced_decoder_ids = None

sec = 30                # chunk length in seconds (Whisper's native 30 s window)
target_sr = 16_000      # Whisper models expect 16 kHz input
# Load at the file's native rate, then resample to 16 kHz.
audio, sr = librosa.load('/home/userdata/ariff-wav2vec2/finetune/2887.mp3', sr=None)
audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
# Split into non-overlapping 30 s chunks; the last chunk may be shorter.
chunk_len = target_sr * sec
chunk = [audio_array[i: i + chunk_len] for i in range(0, len(audio_array), chunk_len)]

with torch.no_grad():
    # NOTE(review): chunk[4] hard-codes the fifth chunk — raises IndexError
    # for audio shorter than ~2.5 minutes. Confirm this is intentional.
    input_features = processor(chunk[4], sampling_rate=target_sr, return_tensors="pt").input_features.cuda()
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# batch_decode already returns a list of strings; join them directly.
res = ''.join(transcription)
print(res)
```

## Play Audio
```python
import IPython.display as ipd
# Concatenate the chunks back into a single 1-D waveform before playback.
# Passing the ragged list to np.asarray fails on modern NumPy (chunks have
# unequal lengths), and a 2-D array would be misread as multi-channel audio.
ipd.Audio(data=np.concatenate(chunk), autoplay=True, rate=16000)
```