vad-audio-labels-experimental-test

Running

App Files Files Community

vad-audio-labels-experimental-test / app.py

deepsync

Update app.py

cfc2b12 verified over 1 year ago

raw

history blame

2.91 kB

	import gradio as gr
	import os
	import torchaudio
	from uuid import uuid4
	from pydub.silence import detect_nonsilent
	from pydub import AudioSegment

	import torch
	# Restrict PyTorch to a single CPU thread for intra-op work.
	torch.set_num_threads(1)

	# Fetch the Silero VAD model and its helper tuple through torch.hub.
	# force_reload=True asks torch.hub for a fresh download of the repo on every
	# start-up instead of reusing the local hub cache.
	# NOTE(review): none of the names bound below are referenced by the code
	# visible in this file (segmentation is done with pydub) - confirm whether
	# this load is still needed.
	model, utils = torch.hub.load(
	    repo_or_dir='snakers4/silero-vad',
	    model='silero_vad',
	    force_reload=True,
	    onnx=False,
	)

	# The helper tuple is unpacked positionally into module-level names.
	get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils


	def get_labels(
	    audio_fp,
	    threshold,
	    min_speech_duration_ms,
	    min_silence_duration_ms,
	    auto_merge,
	    uppper_merge_threshold,
	    max_segment_length,
	):
	    """Detect non-silent spans in an audio file and write them as label files.

	    Each output line has the form ``start<TAB>end<TAB>text`` with start/end
	    expressed in seconds.

	    Args:
	        audio_fp: Path of the audio file, opened with ``AudioSegment.from_file``.
	        threshold: Accepted for interface compatibility; not used by this
	            implementation (silence is detected at a fixed -40 dBFS).
	        min_speech_duration_ms: Spans that are not strictly longer than this
	            many milliseconds are dropped.
	        min_silence_duration_ms: Minimum silence length (ms) passed to
	            ``detect_nonsilent`` as ``min_silence_len``.
	        auto_merge: When falsy, only the raw label file is produced.
	        uppper_merge_threshold: Largest gap, in seconds, that may be bridged
	            when merging; anything ``float()`` accepts (the UI sends text).
	        max_segment_length: A merge is only performed while the merged span
	            stays strictly shorter than this value.
	            NOTE(review): treated as seconds, like the gap threshold - confirm.

	    Returns:
	        ``(raw_path, merged_path)``; ``merged_path`` is ``None`` when
	        ``auto_merge`` is falsy.

	    Raises:
	        ValueError: If ``auto_merge`` is set and ``uppper_merge_threshold``
	            cannot be converted to ``float``.
	    """
	    audio = AudioSegment.from_file(audio_fp)

	    # pydub feeds min_silence_len into range(), which rejects floats; the UI
	    # number field may deliver a float, so coerce it here.
	    spans_ms = detect_nonsilent(
	        audio,
	        min_silence_len=int(min_silence_duration_ms),
	        silence_thresh=-40,
	    )

	    # detect_nonsilent yields [start_ms, end_ms] pairs (not dicts), so they
	    # are unpacked positionally. Times stay in integer milliseconds until
	    # they are written out, which keeps the comparisons below exact.
	    labels = [
	        [start_ms, end_ms, f"Sound {index}"]
	        for index, (start_ms, end_ms) in enumerate(
	            (
	                span
	                for span in spans_ms
	                if span[1] - span[0] > min_speech_duration_ms
	            ),
	            start=1,
	        )
	    ]

	    fn = str(uuid4()) + ".txt"
	    _write_label_file(fn, labels)

	    if not auto_merge:
	        return fn, None

	    # Only parsed when merging is requested, so an unparsable threshold does
	    # not break the plain (non-merged) export.
	    max_gap_s = float(uppper_merge_threshold)
	    merged = _merge_labels(labels, max_gap_s, max_segment_length)

	    filename_path = f"{fn}_translate_label.txt"
	    _write_label_file(filename_path, merged)

	    return fn, filename_path


	def _merge_labels(labels, max_gap_s, max_length_s):
	    """Merge neighbouring ``[start_ms, end_ms, text]`` labels across short gaps.

	    A label is folded into the previous merged entry when the gap between
	    them is at most ``max_gap_s`` seconds and the resulting span is strictly
	    shorter than ``max_length_s`` seconds. Returns a new list (empty when
	    ``labels`` is empty); the input entries are not modified.
	    """
	    merged = []
	    for start_ms, end_ms, text in labels:
	        if merged:
	            previous = merged[-1]
	            gap_ms = start_ms - previous[1]
	            # previous duration + gap + current duration == end - previous start
	            merged_span_ms = end_ms - previous[0]
	            if gap_ms / 1000 <= max_gap_s and merged_span_ms / 1000 < max_length_s:
	                previous[1] = end_ms
	                previous[2] = f"{previous[2]} |{round(gap_ms / 1000, 2)}s| {text}"
	                continue
	        merged.append([start_ms, end_ms, text])
	    return merged


	def _write_label_file(path, labels):
	    """Write ``[start_ms, end_ms, text]`` labels to ``path`` as tab-separated seconds."""
	    rows = [
	        f"{start_ms / 1000}\t{end_ms / 1000}\t{text}"
	        for start_ms, end_ms, text in labels
	    ]
	    with open(path, "w", encoding="utf-8") as handle:
	        handle.write("\n".join(rows))


	# Gradio front-end: the seven input components are handed to get_labels
	# positionally, in the order listed; the two returned paths populate the
	# two File outputs.
	# NOTE(review): the "default (...)" info texts for the threshold and
	# min_silence_duration_ms fields differ from the `value` each field starts
	# with - confirm which is intended.
	interface = gr.Interface(
	    fn=get_labels,
	    inputs=[
	        gr.Audio(type="filepath", label="Audio file"),
	        gr.Slider(0, 1, value=0.7, label="Threshold", step=0.01, info="default (0.5)"),
	        gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
	        gr.Number(label="min_silence_duration_ms", value=40, info="default (100)"),
	        gr.Checkbox(label="Auto merge", value=True),
	        gr.Textbox(label="Gap max threshold value (seconds)", value=0.3),
	        gr.Number(label="Approx Max Segment Length", value=5),
	    ],
	    outputs=[
	        gr.File(label="VAD Labels"),
	        gr.File(label="Merged Labels File"),
	    ],
	)

	if __name__ == "__main__":
	    # Only when executed as a script: turn on request queuing, then start
	    # serving the interface.
	    queued_interface = interface.queue()
	    queued_interface.launch()