from typing import Union

import numpy as np
import torch
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

# Loaded once at import time so repeated get_speech() calls reuse one VAD model.
model = load_silero_vad()


def get_speech(
    audio_input: Union[str, np.ndarray, torch.Tensor],
    return_numpy: bool = False,
    min_duration: float = 3,
    max_duration: float = 5,
    sampling_rate: int = 16000,
) -> Union[torch.Tensor, np.ndarray]:
    """Return the first detected speech segment whose duration lies in
    [min_duration, max_duration] seconds.

    Args:
        audio_input: Path to an audio file, or a 1-D waveform given as a
            numpy array or torch tensor (assumed sampled at ``sampling_rate``).
        return_numpy: If True, return a numpy array instead of a torch tensor.
        min_duration: Minimum accepted segment length, in seconds.
        max_duration: Maximum accepted segment length, in seconds.
        sampling_rate: Sample rate of the waveform; silero-vad's default 16 kHz.

    Returns:
        A (1, num_samples) tensor (or numpy array when ``return_numpy``):
        the first qualifying speech segment, or — when no detected segment
        fits the duration window — the leading ``max_duration`` seconds of
        the input as a fallback.
    """
    if isinstance(audio_input, str):
        audio_input = read_audio(audio_input, sampling_rate=sampling_rate)
    elif isinstance(audio_input, np.ndarray):
        # Bug fix: np.ndarray has no .unsqueeze(); convert to a tensor up front
        # so the slicing/unsqueeze below works for every accepted input type.
        audio_input = torch.from_numpy(audio_input)

    speech_timestamps = get_speech_timestamps(
        audio_input, model, sampling_rate=sampling_rate
    )

    min_samples = int(min_duration * sampling_rate)
    max_samples = int(max_duration * sampling_rate)

    # First segment (in temporal order) whose length fits the duration window.
    segment = next(
        (
            audio_input[t['start']:t['end']]
            for t in speech_timestamps
            if min_samples <= (t['end'] - t['start']) <= max_samples
        ),
        None,
    )

    if segment is None:
        # Fallback: nothing qualified — take the leading max_duration slice.
        speech = audio_input[:max_samples].unsqueeze(0)
    else:
        speech = segment.unsqueeze(0)

    if return_numpy:
        speech = speech.cpu().numpy()
    return speech


if __name__ == '__main__':
    print(get_speech('samples/diep-chi.wav'))