deepsync committed on
Commit
0041d9c
·
verified ·
1 Parent(s): 968ecd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -6
app.py CHANGED
@@ -18,7 +18,7 @@ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
18
  collect_chunks) = utils
19
 
20
 
21
- def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms):
22
  wav, sr = torchaudio.load(audio_fp)
23
  wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
24
  speech_timestamps = get_speech_timestamps(wav,
@@ -28,19 +28,62 @@ def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration
28
  min_speech_duration_ms=min_speech_duration_ms,
29
  min_silence_duration_ms=min_silence_duration_ms,
30
  return_seconds=True)
 
31
  labels = []
 
32
  for i, st in enumerate(speech_timestamps):
33
- labels.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
 
 
34
  fn = str(uuid4()) + ".txt"
35
  with open(fn, "w") as f:
36
- f.write("\n".join(labels))
37
- return fn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  interface = gr.Interface(
41
  get_labels,
42
- [gr.Audio(type="filepath", label="Audio file"), gr.Slider(0, 1, value=0.5, label="Threshold", step=0.01, info="default (0.5)"), gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"), gr.Number(label="min_silence_duration_ms", value=100, info="default (100)")],
43
- gr.File(label="Labels")
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
 
46
  if __name__ == "__main__":
 
18
  collect_chunks) = utils
19
 
20
 
21
+ def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_seg_length):
22
  wav, sr = torchaudio.load(audio_fp)
23
  wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
24
  speech_timestamps = get_speech_timestamps(wav,
 
28
  min_speech_duration_ms=min_speech_duration_ms,
29
  min_silence_duration_ms=min_silence_duration_ms,
30
  return_seconds=True)
31
+ labels_str = []
32
  labels = []
33
+
34
  for i, st in enumerate(speech_timestamps):
35
+ labels_str.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
36
+ labels.append((float(st['start']), float(st['end']), f"Sound {i+1}"))
37
+
38
  fn = str(uuid4()) + ".txt"
39
  with open(fn, "w") as f:
40
+ f.write("\n".join(labels_str))
41
+
42
+ if not auto_merge:
43
+ return fn, None
44
+
45
+ gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]
46
+
47
+ duration = lambda x: float(x[1]) - float(x[0])
48
+
49
+ new_labels = [list(labels[0])]
50
+ for i in range(1, len(labels)):
51
+ if (
52
+ gaps[i - 1] <= threshold
53
+ and duration(new_labels[-1]) + gaps[i - 1] + duration(labels[i])
54
+ < max_segment_length
55
+ ):
56
+ new_labels[-1][1] = labels[i][1]
57
+ new_labels[-1][
58
+ 2
59
+ ] = f'{new_labels[-1][2]} |{round(gaps[i-1], 2)}s| {labels[i][2]}'
60
+ else:
61
+ new_labels.append(list(labels[i]))
62
+
63
+ translate_labels = list(map(lambda x: f"{x[0]}\t{x[1]}\t{x[2]}", new_labels))
64
+
65
+ filename_path = f"{fn}_translate_label.txt"
66
+ with open(filename_path, "w") as f:
67
+ f.write("\n".join(translate_labels))
68
+
69
+ return fn, filename_path
70
 
71
 
72
# Gradio UI: the controls below map positionally onto get_labels'
# parameters (audio_fp, threshold, min_speech_duration_ms,
# min_silence_duration_ms, auto_merge, uppper_merge_threshold,
# max_seg_length); the two outputs are the raw and merged label files.
vad_inputs = [
    gr.Audio(type="filepath", label="Audio file"),
    gr.Slider(0, 1, value=0.5, label="Threshold", step=0.01, info="default (0.5)"),
    gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
    gr.Number(label="min_silence_duration_ms", value=100, info="default (100)"),
    gr.Checkbox(label="Auto merge", value=True),
    gr.Number(label="Gap max threshold value (seconds)", value=1),
    gr.Number(label="Approx Max Segment Length", value=12),
]
vad_outputs = [
    gr.File(label="VAD Labels"),
    gr.File(label="Merged Labels File"),
]
interface = gr.Interface(
    get_labels,
    vad_inputs,
    vad_outputs,
)
88
 
89
  if __name__ == "__main__":