File size: 3,572 Bytes
d710575
 
 
cfc2b12
 
d710575
 
75da7a9
c2c59b5
75da7a9
cfc2b12
df05b61
0041d9c
d710575
ed53687
 
0041d9c
d710575
0041d9c
 
 
d710575
 
0041d9c
 
 
 
 
 
 
 
 
 
 
 
3aad5c9
0041d9c
 
 
 
 
 
 
 
 
 
f3c4188
a639500
 
db06467
 
f3c4188
a639500
 
 
f3c4188
 
 
a639500
c535b52
 
 
 
0041d9c
 
 
 
 
 
 
d710575
 
 
 
0041d9c
c2c59b5
a639500
 
0041d9c
9e90036
a639500
 
c535b52
dd6c43b
75da7a9
0041d9c
 
 
 
 
d710575
 
 
5456318
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import os
from uuid import uuid4
from pydub.silence import detect_nonsilent
from pydub import AudioSegment
 

def get_labels(audio_fp, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length, end_extension, extend_small_segments, show_duration_label, sil_thresh):
    """Detect speech segments in an audio file and write Audacity-style label files.

    Parameters mirror the gradio inputs (see the Interface definition below):
    the audio file path, minimum speech/silence durations in ms, the merge
    toggle and gap threshold (seconds, as text), an approximate maximum merged
    segment length, an end-padding amount, two formatting toggles, and the
    silence threshold in dBFS (as text).

    Returns:
        (vad_label_path, merged_label_path) — the second element is None when
        auto_merge is off or when no speech segments were detected.
    """
    # gr.File(type="filepath") hands us a plain path string; older gradio
    # versions passed a tempfile-like object exposing .name — accept both.
    path = getattr(audio_fp, "name", audio_fp)
    audio = AudioSegment.from_file(path).set_channels(1)

    # pydub's silence scanner does integer slice arithmetic with
    # min_silence_len; gradio Number inputs arrive as floats, so cast.
    speech_timestamps = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_duration_ms),
        silence_thresh=int(sil_thresh),
    )
    # Drop blips shorter than the minimum speech duration and convert ms -> s.
    segments = [
        (s[0] / 1000, s[1] / 1000)
        for s in speech_timestamps
        if s[1] - s[0] > min_speech_duration_ms
    ]
    labels = [(start, end, f"Sound {i+1}") for i, (start, end) in enumerate(segments)]

    # Always write the raw VAD labels (tab-separated Audacity label format).
    fn = str(uuid4()) + ".txt"
    with open(fn, "w") as f:
        f.write("\n".join(f"{s}\t{e}\t{name}" for s, e, name in labels))

    # Guard the empty case: the previous merge step indexed labels[0] and
    # raised IndexError when no speech was detected.
    if not auto_merge or not labels:
        return fn, None

    uppper_merge_threshold = float(uppper_merge_threshold)  # Textbox -> float

    def duration(seg):
        return float(seg[1]) - float(seg[0])

    # Merge adjacent segments whose silence gap is within the threshold, as
    # long as the combined span stays under the approximate max length.
    gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]
    new_labels = [list(labels[0])]
    for i in range(1, len(labels)):
        gap = gaps[i - 1]
        if (
            gap <= uppper_merge_threshold
            and duration(new_labels[-1]) + gap + duration(labels[i]) < max_segment_length
        ):
            new_labels[-1][1] = labels[i][1]
            new_labels[-1][2] = f'{new_labels[-1][2]} |{round(gap, 2)}s| {labels[i][2]}'
        else:
            new_labels.append(list(labels[i]))

    # Optionally stretch very short segments to at least 1.02 s, when that
    # does not overrun the start of the next segment. The last segment is
    # deliberately never touched (no following segment to bound it).
    extended = [False] * (len(new_labels) - 1)
    if extend_small_segments:
        for i, nl in enumerate(new_labels[:-1]):
            if nl[1] - nl[0] <= 1.02 and nl[0] + 1.02 < new_labels[i + 1][0]:
                nl[1] = nl[0] + 1.02
                extended[i] = True

    # Optionally pad segment ends, skipping segments already extended above.
    if end_extension:
        for i, nl in enumerate(new_labels[:-1]):
            if not extended[i] and nl[1] + end_extension < new_labels[i + 1][0]:
                nl[1] = nl[1] + end_extension

    # Replace the text label with the segment duration if requested.
    if show_duration_label:
        for nl in new_labels:
            nl[2] = round(nl[1] - nl[0], 3)

    filename_path = f"{fn}_translate_label.txt"
    with open(filename_path, "w") as f:
        f.write("\n".join(f"{s}\t{e}\t{name}" for s, e, name in new_labels))

    return fn, filename_path


# Gradio UI: the inputs list maps positionally onto get_labels' parameters;
# the outputs are the raw VAD label file and the merged label file (the
# latter is None when "Auto merge" is unchecked).
interface = gr.Interface(
    get_labels,
    [
        gr.File(type="filepath", label="Audio file", file_types=["audio"], file_count="single"),
        gr.Number(label="min_speech_duration_ms", value=40, info="default (40)"), 
        gr.Number(label="min_silence_duration_ms", value=40, info="default (40)"),
        gr.Checkbox(label="Auto merge", value=True),
        # Textbox (not Number): get_labels converts this to float itself.
        gr.Textbox(label="Gap max threshold value (seconds)", value=0.350),
        gr.Number(label="Approx Max Segment Length", value=7),
        gr.Number(label="Extend end by (seconds)", value=0),
        gr.Checkbox(label="Extend small segments (minimum 1.02 seconds)", value=False),
        gr.Checkbox(label="Show only duration in labels", value=False),
        # Silence threshold in dBFS, passed as text and int()-cast downstream.
        gr.Textbox(label="Silence Threshold", value="-45")
    ],
    [
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File")
    ]
)

# Launch the app with request queuing enabled when run as a script.
if __name__ == "__main__":
    interface.queue().launch()