# NOTE(review): removed non-code scrape residue that preceded the script
# (a "File size" banner, a git-blame column of commit hashes, and a
# line-number gutter) — it was not part of the program and made the file
# invalid Python.
import gradio as gr
import os
from uuid import uuid4
from pydub.silence import detect_nonsilent
from pydub import AudioSegment
def get_labels(audio_fp, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length, end_extension, extend_small_segments, show_duration_label, sil_thresh):
audio = AudioSegment.from_file(audio_fp.name).set_channels(1)
speech_timestamps = detect_nonsilent(audio, min_silence_len=min_silence_duration_ms, silence_thresh=int(sil_thresh))
speech_timestamps = list(filter(lambda x: x[1]-x[0] > min_speech_duration_ms, speech_timestamps))
speech_timestamps = [{"start": s[0]/1000, "end": s[1]/1000} for s in speech_timestamps]
labels_str = []
labels = []
uppper_merge_threshold = float(uppper_merge_threshold)
for i, st in enumerate(speech_timestamps):
labels_str.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
labels.append((float(st['start']), float(st['end']), f"Sound {i+1}"))
fn = str(uuid4()) + ".txt"
with open(fn, "w") as f:
f.write("\n".join(labels_str))
if not auto_merge:
return fn, None
gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]
duration = lambda x: float(x[1]) - float(x[0])
new_labels = [list(labels[0])]
for i in range(1, len(labels)):
if (
gaps[i - 1] <= uppper_merge_threshold
and duration(new_labels[-1]) + gaps[i - 1] + duration(labels[i])
< max_segment_length
):
new_labels[-1][1] = labels[i][1]
new_labels[-1][
2
] = f'{new_labels[-1][2]} |{round(gaps[i-1], 2)}s| {labels[i][2]}'
else:
new_labels.append(list(labels[i]))
extended = [False] * (len(new_labels) - 1)
if extend_small_segments:
for i, nl in enumerate(new_labels[:-1]):
if nl[1] - nl[0] <= 1.02 and nl[0] + 1.02 < new_labels[i+1][0]:
nl[1] = nl[0] + 1.02
extended[i] = True
if end_extension:
for i, nl in enumerate(new_labels[:-1]):
if not extended[i]:
if nl[1] + end_extension < new_labels[i+1][0]:
nl[1] = nl[1] + end_extension
if show_duration_label:
for nl in new_labels:
nl[2] = round(nl[1] - nl[0], 3)
translate_labels = list(map(lambda x: f"{x[0]}\t{x[1]}\t{x[2]}", new_labels))
filename_path = f"{fn}_translate_label.txt"
with open(filename_path, "w") as f:
f.write("\n".join(translate_labels))
return fn, filename_path
# Input components, in the exact positional order get_labels expects them.
_inputs = [
    gr.File(type="filepath", label="Audio file", file_types=["audio"], file_count="single"),
    gr.Number(label="min_speech_duration_ms", value=40, info="default (40)"),
    gr.Number(label="min_silence_duration_ms", value=40, info="default (40)"),
    gr.Checkbox(label="Auto merge", value=True),
    gr.Textbox(label="Gap max threshold value (seconds)", value=0.350),
    gr.Number(label="Approx Max Segment Length", value=7),
    gr.Number(label="Extend end by (seconds)", value=0),
    gr.Checkbox(label="Extend small segments (minimum 1.02 seconds)", value=False),
    gr.Checkbox(label="Show only duration in labels", value=False),
    gr.Textbox(label="Silence Threshold", value="-45"),
]

# Two downloadable files: the raw VAD labels, and the merged labels
# (the second output is None when auto-merge is off).
_outputs = [
    gr.File(label="VAD Labels"),
    gr.File(label="Merged Labels File"),
]

interface = gr.Interface(get_labels, _inputs, _outputs)

if __name__ == "__main__":
    interface.queue().launch()