import torch
import warnings

warnings.filterwarnings("ignore")


class VAD:
    def __init__(self):
        self.model = init_jit_model("silero_vad.jit")

    def no_speech(self, audio):
        # silero VAD expects one-dimensional 16 kHz float audio.
        speech = get_speech_timestamps(torch.Tensor(audio), self.model, return_seconds=True)
        return len(speech) == 0
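
# Note: init_jit_model and get_speech_timestamps below are adapted from
# https://github.com/snakers4/silero-vad (utils_vad.py).
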

def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    # As in the original silero helper, this disables autograd globally.
    torch.set_grad_enabled(False)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    return model


def get_speech_timestamps(audio: torch.Tensor,
                          model,
                          threshold: float = 0.5,
                          sampling_rate: int = 16000,
                          min_speech_duration_ms: int = 250,
                          min_silence_duration_ms: int = 100,
                          window_size_samples: int = 1536,
                          speech_pad_ms: int = 30,
                          return_seconds: bool = False):
| """ | |
| From https://github.com/snakers4/silero-vad/blob/master/utils_vad.py | |
| This method is used for splitting long audios into speech chunks using silero VAD | |
| Parameters | |
| ---------- | |
| audio: torch.Tensor | |
| One dimensional float torch.Tensor, other types are cast to torch if possible | |
| model: preloaded .jit silero VAD model | |
| threshold: float (default - 0.5) | |
| Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value | |
| are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is | |
| pretty good for most datasets. | |
| sampling_rate: int (default - 16000) | |
| Currently silero VAD models support 8000 and 16000 sample rates | |
| min_speech_duration_ms: int (default - 250 milliseconds) | |
| Final speech chunks shorter min_speech_duration_ms are thrown out | |
| min_silence_duration_ms: int (default - 100 milliseconds) | |
| In the end of each speech chunk wait for min_silence_duration_ms before separating it | |
| window_size_samples: int (default - 1536 samples) | |
| Audio chunks of window_size_samples size are fed to the silero VAD model. | |
| WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 | |
| samples for 8000 sample rate.Values other than these may affect model performance!! | |
| speech_pad_ms: int (default - 30 milliseconds) | |
| Final speech chunks are padded by speech_pad_ms each side | |
| return_seconds: bool (default - False) | |
| whether return timestamps in seconds (default - samples) | |
| Returns | |
| ---------- | |
| speeches: list of dicts | |
| list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds) | |
| """ | |
    model.reset_states()

    # Convert millisecond parameters to sample counts.
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000
    audio_length_samples = len(audio)

    # Run the model over fixed-size windows, collecting one speech
    # probability per window; the final short window is zero-padded.
    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
        speech_prob = model(chunk, sampling_rate).item()
        speech_probs.append(speech_prob)

    # Hysteresis thresholds: a window opens a speech chunk above `threshold`,
    # and the chunk only closes after the probability stays below
    # `neg_threshold` for at least min_silence_samples.
    triggered = False
    speeches = []
    current_speech = {}
    neg_threshold = threshold - 0.15
    temp_end = 0
    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            # Speech resumed before the silence ran out; cancel the tentative end.
            temp_end = 0

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech['start'] = window_size_samples * i
            continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                # The silence lasted long enough: close the chunk and keep it
                # only if it meets the minimum speech duration.
                current_speech['end'] = temp_end
                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                    speeches.append(current_speech)
                temp_end = 0
                current_speech = {}
                triggered = False
                continue
    # If a speech chunk is still open when the audio ends, close it there.
    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
        current_speech['end'] = audio_length_samples
        speeches.append(current_speech)

    # Pad each chunk by speech_pad_samples on both sides; when two chunks sit
    # closer together than twice the padding, split the gap between them.
    for i, speech in enumerate(speeches):
        if i == 0:
            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]['start'] - speech['end']
            if silence_duration < 2 * speech_pad_samples:
                speech['end'] += int(silence_duration // 2)
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
            else:
                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - speech_pad_samples))
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
    if return_seconds:
        for speech_dict in speeches:
            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)

    return speeches
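

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original file). It assumes
    # "silero_vad.jit" has been downloaded from the snakers4/silero-vad repo
    # into the working directory. One second of silence should produce no
    # speech timestamps.
    vad = VAD()
    silence = torch.zeros(16000)
    print(get_speech_timestamps(silence, vad.model, return_seconds=True))  # expected: []
    print(vad.no_speech(silence))  # expected: True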