File size: 2,774 Bytes
872019a 0caa3bc 872019a 0caa3bc 872019a 0caa3bc a2cefd9 0caa3bc a2cefd9 0caa3bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
from silero_vad_axera import load_silero_vad
import numpy as np
from datetime import datetime, timedelta
class StreamVAD:
    """Cut a stream of audio chunks into speech segments using a VAD model.

    State (speech/silence frame counters and the buffered audio frames) is
    kept on the instance between calls to ``run``, so a speech segment may
    start in one call and be emitted by a later one.
    """

    def __init__(self, backend='ax650',
                 sensitivity=0.5,
                 silence_ms=200,
                 datetime_format='%Y-%m-%d %H:%M:%S.%f'):
        """Load the VAD model and initialise the streaming state.

        backend: value forwarded to ``load_silero_vad``
        sensitivity: voice-activation sensitivity; higher means more
                     sensitive, because a frame counts as speech when its
                     probability is greater than ``1 - sensitivity``
        silence_ms: emit the buffered audio once silence has lasted longer
                    than this many milliseconds
        datetime_format: ``strftime`` format of the timestamps in the
                         returned data
        """
        self.model = load_silero_vad(backend)
        self.sensitivity = sensitivity
        self.silence_ms = silence_ms
        self.datetime_format = datetime_format
        self.reset()

    def reset(self):
        """Drop any buffered segment and reset the model's internal state."""
        self.silence_count = 0
        self.speech_count = 0
        # A fresh dict is created on every reset, so a dict that was already
        # yielded to a caller is never modified afterwards.
        self.return_data = {
            "start_ts": '',
            "end_ts": '',
            "audio": None
        }
        self.vad_data_list = []
        self.model.reset_states()

    def run(self, audio: np.ndarray, sr: int = 16000):
        """Feed one chunk of audio and yield every speech segment it completes.

        This is a generator: nothing is processed until it is iterated, and
        the remaining frames are only consumed if iteration continues after
        a segment has been yielded.

        audio: samples along the last axis (1-D, or e.g. ``(1, N)``)
        sr: sample rate of ``audio``

        Yields a dict with keys ``start_ts`` and ``end_ts`` (strings in
        ``datetime_format``) and ``audio`` (the buffered frames concatenated
        along the last axis).
        """
        # Timestamps are wall-clock: the first frame is stamped with the time
        # of this call and each following frame advances by one frame length.
        cur_ts = datetime.now()
        model = self.model
        # Number of input samples covered by one model frame.
        # NOTE(review): int() truncates, so sr below model.sr gives a zero
        # frame length and empty slices -- confirm which rates are supported.
        frame_len = model.num_samples * int(sr / model.sr)
        frame_step = timedelta(seconds=model.num_samples / model.sr)
        # Slice along the last axis so the slicing axis matches the
        # concatenation axis below; for 1-D input this equals audio[a:b].
        samples = np.asarray(audio)
        # inference
        speech_probs = model.audio_forward(audio, sr)[0]
        for i, prob in enumerate(speech_probs):
            chunk = samples[..., i * frame_len:(i + 1) * frame_len]
            if prob > 1 - self.sensitivity:
                # speech frame: any running silence is cancelled
                self.silence_count = 0
                if self.speech_count == 0:
                    # first speech frame of a new segment
                    self.return_data['start_ts'] = cur_ts.strftime(self.datetime_format)
                self.speech_count += 1
                self.vad_data_list.append(chunk)
            elif self.speech_count > 0:
                # silence inside an open segment (silence before any speech
                # is ignored entirely)
                self.silence_count += 1
                silent_ms = 1000 * self.silence_count * model.num_samples / model.sr
                if silent_ms > self.silence_ms:
                    # silence limit exceeded: close and emit the segment; the
                    # frame that crossed the limit is not part of the audio
                    self.return_data['end_ts'] = cur_ts.strftime(self.datetime_format)
                    self.return_data['audio'] = np.concatenate(self.vad_data_list, axis=-1)
                    yield self.return_data
                    self.reset()
                else:
                    # short pause: keep it as part of the segment
                    self.vad_data_list.append(chunk)
            cur_ts += frame_step
|