import torch
import warnings

warnings.filterwarnings("ignore")


class VAD:
    def __init__(self):
        self.model = init_jit_model("silero_vad.jit")

    def no_speech(self, audio):
        # silero VAD expects one-dimensional 16 kHz float audio.
        speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
        return len(speech) == 0
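
# Note: init_jit_model and get_speech_timestamps below are adapted from
# https://github.com/snakers4/silero-vad (utils_vad.py).
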

def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    # As in the original silero helper, this disables autograd globally.
    torch.set_grad_enabled(False)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    return model


def get_speech_timestamps(audio: torch.Tensor,
                          model,
                          threshold: float = 0.5,
                          sampling_rate: int = 16000,
                          min_speech_duration_ms: int = 250,
                          min_silence_duration_ms: int = 100,
                          window_size_samples: int = 1536,
                          speech_pad_ms: int = 30,
                          return_seconds: bool = False):
| """ | |
| From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py | |
| This method is used for splitting long audios into speech chunks using silero VAD | |
| Parameters | |
| ---------- | |
| audio: torch.Tensor | |
| One dimensional float torch.Tensor, other types are cast to torch if possible | |
| model: preloaded .jit silero VAD model | |
| threshold: float (default - 0.5) | |
| Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value | |
| are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is | |
| pretty good for most datasets. | |
| sampling_rate: int (default - 16000) | |
| Currently silero VAD models support 8000 and 16000 sample rates | |
| min_speech_duration_ms: int (default - 250 milliseconds) | |
| Final speech chunks shorter min_speech_duration_ms are thrown out | |
| min_silence_duration_ms: int (default - 100 milliseconds) | |
| In the end of each speech chunk wait for min_silence_duration_ms before separating it | |
| window_size_samples: int (default - 1536 samples) | |
| Audio chunks of window_size_samples size are fed to the silero VAD model. | |
| WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 | |
| samples for 8000 sample rate.Values other than these may affect model performance!! | |
| speech_pad_ms: int (default - 30 milliseconds) | |
| Final speech chunks are padded by speech_pad_ms each side | |
| return_seconds: bool (default - False) | |
| whether return timestamps in seconds (default - samples) | |
| Returns | |
| ---------- | |
| speeches: list of dicts | |
| list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds) | |
| """ | |
    model.reset_states()

    # Convert millisecond parameters to sample counts.
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000
    audio_length_samples = len(audio)

    # Run the model over fixed-size windows, collecting one speech
    # probability per window; the final short window is zero-padded.
    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
        speech_prob = model(chunk, sampling_rate).item()
        speech_probs.append(speech_prob)

    # Hysteresis thresholds: a window opens a speech chunk above `threshold`,
    # and the chunk only closes after the probability stays below
    # `neg_threshold` for at least min_silence_samples.
    triggered = False
    speeches = []
    current_speech = {}
    neg_threshold = threshold - 0.15
    temp_end = 0
    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            # Speech resumed before the silence ran out; cancel the tentative end.
            temp_end = 0

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech['start'] = window_size_samples * i
            continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                # The silence lasted long enough: close the chunk and keep it
                # only if it meets the minimum speech duration.
                current_speech['end'] = temp_end
                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                    speeches.append(current_speech)
                temp_end = 0
                current_speech = {}
                triggered = False
                continue
    # If a speech chunk is still open when the audio ends, close it there.
    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
        current_speech['end'] = audio_length_samples
        speeches.append(current_speech)

    # Pad each chunk by speech_pad_samples on both sides; when two chunks sit
    # closer together than twice the padding, split the gap between them.
    for i, speech in enumerate(speeches):
        if i == 0:
            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]['start'] - speech['end']
            if silence_duration < 2 * speech_pad_samples:
                speech['end'] += int(silence_duration // 2)
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
            else:
                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
    if return_seconds:
        for speech_dict in speeches:
            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)

    return speeches
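

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original file). It assumes
    # "silero_vad.jit" has been downloaded from the snakers4/silero-vad repo
    # into the working directory. One second of silence should produce no
    # speech timestamps.
    vad = VAD()
    silence = torch.zeros(16000)
    print(get_speech_timestamps(silence, vad.model, return_seconds=True))  # expected: []
    print(vad.no_speech(silence))  # expected: True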