# Hosting-page header captured with this file: "duyv's picture" / "Upload 86 files" / "a257816 verified"
from typing import Union
import numpy as np
import torch
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
# Load the Silero VAD model once at import time; get_speech() below reuses
# this single module-level instance on every call.
model = load_silero_vad()
def get_speech(
    audio_input: Union[str, np.ndarray, torch.Tensor],
    return_numpy: bool = False,
    min_duration: float = 3,
    max_duration: float = 5,
    sampling_rate: int = 16000,
) -> Union[torch.Tensor, np.ndarray]:
    """Return one speech segment from the audio, chosen by voice-activity detection.

    The module-level Silero VAD ``model`` is run over the audio. The first
    detected segment whose length lies within ``[min_duration, max_duration]``
    seconds is returned. If no segment qualifies, the leading
    ``max_duration`` seconds of the audio are returned instead (regardless of
    whether they contain speech).

    Args:
        audio_input: Path to an audio file (loaded with ``read_audio``), or
            the waveform itself as a NumPy array or torch tensor.  NumPy
            input is converted to a float32 tensor before processing.
        return_numpy: When true, return a NumPy array instead of a tensor.
        min_duration: Minimum accepted segment length, in seconds.
        max_duration: Maximum accepted segment length, in seconds; also the
            length of the fallback clip.
        sampling_rate: Sample rate in Hz used to load the file, run the VAD
            and convert durations to sample counts.  Defaults to 16000, the
            value that was previously hard-coded.

    Returns:
        The selected segment with a leading dimension added via
        ``unsqueeze(0)``, as a tensor or (if ``return_numpy``) a NumPy array.
    """
    if isinstance(audio_input, str):
        audio_input = read_audio(audio_input, sampling_rate=sampling_rate)
    elif isinstance(audio_input, np.ndarray):
        # Slices of the input are later passed through Tensor.unsqueeze, which
        # ndarrays do not have; convert up front so ndarray input works.
        audio_input = torch.as_tensor(audio_input, dtype=torch.float32)

    timestamps = get_speech_timestamps(
        audio_input, model, sampling_rate=sampling_rate
    )

    # Timestamps are sample indices, so express the duration bounds in samples.
    min_samples = sampling_rate * min_duration
    max_samples = sampling_rate * max_duration

    # Only the first qualifying segment is used, so stop at the first match
    # instead of materialising every candidate slice.
    segment = next(
        (
            audio_input[t['start']:t['end']]
            for t in timestamps
            if min_samples <= t['end'] - t['start'] <= max_samples
        ),
        None,
    )
    if segment is None:
        # No segment of acceptable length: fall back to the start of the audio.
        segment = audio_input[:int(max_samples)]

    speech = segment.unsqueeze(0)
    if return_numpy:
        speech = speech.cpu().numpy()
    return speech
if __name__ == '__main__':
    # Manual smoke run: extract a segment from the bundled sample and show it.
    sample_path = 'samples/diep-chi.wav'
    extracted = get_speech(sample_path)
    print(extracted)