| import gradio as gr |
| import os |
| import torchaudio |
| from uuid import uuid4 |
| from pydub.silence import detect_nonsilent |
| from pydub import AudioSegment |
|
|
| import torch |
# Limit torch to a single thread for its CPU operations.
torch.set_num_threads(1)


# Load the Silero VAD model and its helper bundle through torch.hub.
# NOTE(review): force_reload=True asks torch.hub to re-download the repository
# instead of reusing its local cache, so every start needs network access.
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=True,
    onnx=False,
)

# The helper bundle is unpacked positionally; it must hold exactly these five
# entries, in this order.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils
|
|
|
|
def _merge_close_labels(labels, max_gap, max_length):
    """Greedily fuse consecutive (start, end, text) labels separated by short gaps."""
    merged = []
    for start, end, text in labels:
        if merged:
            previous = merged[-1]
            gap = start - previous[1]
            combined = (previous[1] - previous[0]) + gap + (end - start)
            if gap <= max_gap and combined < max_length:
                # Extend the running segment and record the bridged gap in its text.
                previous[1] = end
                previous[2] = f"{previous[2]} |{round(gap, 2)}s| {text}"
                continue
        merged.append([start, end, text])
    return merged


def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length):
    """Detect non-silent regions of an audio file and write them out as label files.

    Each output line is ``start<TAB>end<TAB>text`` with times in seconds.

    Args:
        audio_fp: Path of the audio file to analyse.
        threshold: Accepted for interface compatibility; the energy-based
            detection used here does not read it.
        min_speech_duration_ms: Detected regions that are not strictly longer
            than this many milliseconds are discarded.
        min_silence_duration_ms: Minimum silence length (ms) handed to
            ``detect_nonsilent``.
        auto_merge: When true, also write a second file in which neighbouring
            regions are merged.
        uppper_merge_threshold: Largest gap, in seconds, that may be bridged
            when merging; anything ``float()`` accepts.
        max_segment_length: A merge happens only while the merged segment
            stays strictly shorter than this many seconds.

    Returns:
        ``(labels_path, merged_path)``; ``merged_path`` is ``None`` when
        ``auto_merge`` is false.

    Raises:
        ValueError: If ``uppper_merge_threshold`` cannot be converted to float.
    """
    # Validate the free-text threshold before any file is written.
    uppper_merge_threshold = float(uppper_merge_threshold)

    audio = AudioSegment.from_file(audio_fp)
    # pydub works in whole milliseconds and feeds min_silence_len to range(),
    # so a float coming from a numeric UI widget has to be made an int first.
    nonsilent_ms = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_duration_ms),
        silence_thresh=-40,
    )

    # detect_nonsilent yields [start_ms, end_ms] pairs (not dicts). Convert to
    # seconds so the values share a unit with the merge threshold and the
    # maximum segment length.
    kept = [
        (start_ms, end_ms)
        for start_ms, end_ms in nonsilent_ms
        if end_ms - start_ms > min_speech_duration_ms
    ]
    labels = [
        (start_ms / 1000, end_ms / 1000, f"Sound {index}")
        for index, (start_ms, end_ms) in enumerate(kept, start=1)
    ]

    fn = str(uuid4()) + ".txt"
    with open(fn, "w", encoding="utf-8") as f:
        f.write("\n".join(f"{start}\t{end}\t{text}" for start, end, text in labels))

    if not auto_merge:
        return fn, None

    # An empty detection result simply produces an empty merged file.
    new_labels = _merge_close_labels(labels, uppper_merge_threshold, max_segment_length)

    filename_path = f"{fn}_translate_label.txt"
    with open(filename_path, "w", encoding="utf-8") as f:
        f.write("\n".join(f"{start}\t{end}\t{text}" for start, end, text in new_labels))

    return fn, filename_path
|
|
|
|
# Web form around get_labels: the input widgets are handed to the function
# positionally in the order listed, and its two return values fill the two
# File outputs.
interface = gr.Interface(
    fn=get_labels,
    inputs=[
        gr.Audio(type="filepath", label="Audio file"),
        gr.Slider(
            minimum=0,
            maximum=1,
            value=0.7,
            label="Threshold",
            step=0.01,
            info="default (0.5)",
        ),
        gr.Number(
            label="min_speech_duration_ms",
            value=250,
            info="default (250)",
        ),
        gr.Number(
            label="min_silence_duration_ms",
            value=40,
            info="default (100)",
        ),
        gr.Checkbox(label="Auto merge", value=True),
        gr.Textbox(label="Gap max threshold value (seconds)", value=0.3),
        gr.Number(label="Approx Max Segment Length", value=5),
    ],
    outputs=[
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File"),
    ],
)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.queue().launch()