deepsync committed on
Commit
0041d9c
·
verified ·
1 Parent(s): 968ecd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -6
app.py CHANGED
@@ -18,7 +18,7 @@ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
18
  collect_chunks) = utils
19
 
20
 
21
- def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms):
22
  wav, sr = torchaudio.load(audio_fp)
23
  wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
24
  speech_timestamps = get_speech_timestamps(wav,
@@ -28,19 +28,62 @@ def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration
28
  min_speech_duration_ms=min_speech_duration_ms,
29
  min_silence_duration_ms=min_silence_duration_ms,
30
  return_seconds=True)
 
31
  labels = []
 
32
  for i, st in enumerate(speech_timestamps):
33
- labels.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
 
 
34
  fn = str(uuid4()) + ".txt"
35
  with open(fn, "w") as f:
36
- f.write("\n".join(labels))
37
- return fn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  interface = gr.Interface(
41
  get_labels,
42
- [gr.Audio(type="filepath", label="Audio file"), gr.Slider(0, 1, value=0.5, label="Threshold", step=0.01, info="default (0.5)"), gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"), gr.Number(label="min_silence_duration_ms", value=100, info="default (100)")],
43
- gr.File(label="Labels")
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
 
46
  if __name__ == "__main__":
 
18
  collect_chunks) = utils
19
 
20
 
21
+ def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_seg_length):
22
  wav, sr = torchaudio.load(audio_fp)
23
  wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
24
  speech_timestamps = get_speech_timestamps(wav,
 
28
  min_speech_duration_ms=min_speech_duration_ms,
29
  min_silence_duration_ms=min_silence_duration_ms,
30
  return_seconds=True)
31
+ labels_str = []
32
  labels = []
33
+
34
  for i, st in enumerate(speech_timestamps):
35
+ labels_str.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
36
+ labels.append((float(st['start']), float(st['end']), f"Sound {i+1}"))
37
+
38
  fn = str(uuid4()) + ".txt"
39
  with open(fn, "w") as f:
40
+ f.write("\n".join(labels_str))
41
+
42
+ if not auto_merge:
43
+ return fn, None
44
+
45
+ gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]
46
+
47
+ duration = lambda x: float(x[1]) - float(x[0])
48
+
49
+ new_labels = [list(labels[0])]
50
+ for i in range(1, len(labels)):
51
+ if (
52
+ gaps[i - 1] <= threshold
53
+ and duration(new_labels[-1]) + gaps[i - 1] + duration(labels[i])
54
+ < max_segment_length
55
+ ):
56
+ new_labels[-1][1] = labels[i][1]
57
+ new_labels[-1][
58
+ 2
59
+ ] = f'{new_labels[-1][2]} |{round(gaps[i-1], 2)}s| {labels[i][2]}'
60
+ else:
61
+ new_labels.append(list(labels[i]))
62
+
63
+ translate_labels = list(map(lambda x: f"{x[0]}\t{x[1]}\t{x[2]}", new_labels))
64
+
65
+ filename_path = f"{fn}_translate_label.txt"
66
+ with open(filename_path, "w") as f:
67
+ f.write("\n".join(translate_labels))
68
+
69
+ return fn, filename_path
70
 
71
 
72
# Gradio UI: the controls below map positionally onto get_labels'
# parameters (audio_fp, threshold, min_speech_duration_ms,
# min_silence_duration_ms, auto_merge, uppper_merge_threshold,
# max_seg_length); the two outputs are the raw and merged label files.
vad_inputs = [
    gr.Audio(type="filepath", label="Audio file"),
    gr.Slider(0, 1, value=0.5, label="Threshold", step=0.01, info="default (0.5)"),
    gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
    gr.Number(label="min_silence_duration_ms", value=100, info="default (100)"),
    gr.Checkbox(label="Auto merge", value=True),
    gr.Number(label="Gap max threshold value (seconds)", value=1),
    gr.Number(label="Approx Max Segment Length", value=12),
]
vad_outputs = [
    gr.File(label="VAD Labels"),
    gr.File(label="Merged Labels File"),
]
interface = gr.Interface(
    get_labels,
    vad_inputs,
    vad_outputs,
)
88
 
89
  if __name__ == "__main__":