mariesig committed on
Commit
a7c506c
·
1 Parent(s): 6606020

VAD in spectrogram

Browse files
Files changed (2) hide show
  1. offline_pipeline.py +3 -3
  2. utils.py +122 -54
offline_pipeline.py CHANGED
@@ -122,7 +122,7 @@ def _process_audio_chunks(
122
  loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
123
  _safe_progress(
124
  progress,
125
- 0.20 + 0.60 * loop_progress,
126
  "Enhancing audio...",
127
  )
128
 
@@ -189,9 +189,9 @@ def run_offline_pipeline(
189
  progress=progress,
190
  )
191
 
192
- _safe_progress(progress, 0.82, "Finalizing transcripts...")
193
  noisy_transcript = _finalize_stream_transcript(streamer_noisy)
194
- _safe_progress(progress, 0.88, "Finalizing transcripts...")
195
  enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
196
 
197
  _safe_progress(progress, 0.94, "Loading reference transcript...")
 
122
  loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
123
  _safe_progress(
124
  progress,
125
+ 0.20 + 0.50 * loop_progress,
126
  "Enhancing audio...",
127
  )
128
 
 
189
  progress=progress,
190
  )
191
 
192
+ _safe_progress(progress, 0.72, "Finalizing transcripts...")
193
  noisy_transcript = _finalize_stream_transcript(streamer_noisy)
194
+ _safe_progress(progress, 0.80, "Finalizing transcripts...")
195
  enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
196
 
197
  _safe_progress(progress, 0.94, "Loading reference transcript...")
utils.py CHANGED
@@ -1,59 +1,86 @@
1
  from typing import Optional
2
- import numpy as np
3
- import librosa
4
- from PIL import Image
5
  import io
 
 
 
 
6
  import matplotlib.pyplot as plt
7
- from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
8
  import pyloudnorm as pyln
9
- import warnings
 
 
 
 
10
 
11
  def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
12
  subtitles = []
13
  cur = 0.0
 
14
  for start, end in vad_timestamps:
15
  if start > cur:
16
- subtitles.append({
17
- "text": f"Voice Detection: {VAD_OFF}",
18
- "timestamp": [cur, start]
19
- })
20
-
21
- subtitles.append({
22
- "text": f"Voice Detection: {VAD_ON}",
23
- "timestamp": [start, end]
24
- })
25
 
 
 
 
 
 
 
26
  cur = end
 
27
  if cur < length:
28
- subtitles.append({
29
- "text": f"Voice Detection: {VAD_OFF}",
30
- "timestamp": [cur, length]
31
- })
 
 
 
32
  return subtitles
33
 
34
 
35
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
36
- """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
37
- passing float32 triggers an internal conversion and a warning."""
38
  x = np.asarray(x)
39
-
40
- # Remove extra dims like (1, n, 1) etc.
41
  x = np.squeeze(x)
42
 
43
- # If it's (channels, samples), transpose to (samples, channels)
44
  if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
45
  x = x.T
46
 
47
- # Ensure mono is (n_samples,)
48
  if x.ndim == 2 and x.shape[1] == 1:
49
  x = x[:, 0]
50
 
51
  x = x.astype(np.float32)
52
  x = np.clip(x, -1.0, 1.0)
53
- # Gradio Audio expects int16; convert here so Gradio doesn't convert and warn
54
  x = (x * 32767).astype(np.int16)
55
 
56
- return (sr, x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  def spec_image(
@@ -65,10 +92,8 @@ def spec_image(
65
  fmax: Optional[float] = None,
66
  vad_timestamps: Optional[list[list[float]]] = None,
67
  ) -> Image.Image:
68
- """
69
- Generate a mel-spectrogram image from an audio array.
70
- """
71
- y = audio_array.flatten() # Ensure it's 1D
72
  S = librosa.feature.melspectrogram(
73
  y=y,
74
  sr=sr,
@@ -77,24 +102,65 @@ def spec_image(
77
  n_mels=n_mels,
78
  fmax=fmax or sr // 2,
79
  )
80
- S_db = librosa.power_to_db(S, ref=np.max(S))
 
81
  fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
 
82
  img = librosa.display.specshow(
83
- S_db, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", ax=ax
 
 
 
 
 
 
84
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
86
  cbar.set_label("dB")
 
87
  ax.set_title("Mel-spectrogram")
88
  ax.set_xlabel("Time in s")
89
  ax.set_ylabel("Frequency in Hz")
 
90
  fig.tight_layout(pad=0.2)
 
91
  buf = io.BytesIO()
92
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
93
- if vad_timestamps:
94
- for start, end in vad_timestamps:
95
- ax.axvspan(start, end, color="red", alpha=0.3)
96
-
97
  plt.close(fig)
 
98
  buf.seek(0)
99
  return Image.open(buf).convert("RGB")
100
 
@@ -105,24 +171,24 @@ def compute_wer(reference: str, hypothesis: str) -> float:
105
  """
106
  ref_words = reference.split()
107
  hyp_words = hypothesis.split()
108
- d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint8)
 
 
109
  for i in range(len(ref_words) + 1):
110
  d[i][0] = i
111
  for j in range(len(hyp_words) + 1):
112
  d[0][j] = j
 
113
  for i in range(1, len(ref_words) + 1):
114
  for j in range(1, len(hyp_words) + 1):
115
- if ref_words[i - 1] == hyp_words[j - 1]:
116
- cost = 0
117
- else:
118
- cost = 1
119
  d[i][j] = min(
120
- d[i - 1][j] + 1, # Deletion
121
- d[i][j - 1] + 1, # Insertion
122
- d[i - 1][j - 1] + cost, # Substitution
123
  )
124
- wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
125
- return wer
126
 
127
 
128
  def measure_loudness(x: np.ndarray, sr: int) -> float:
@@ -130,7 +196,11 @@ def measure_loudness(x: np.ndarray, sr: int) -> float:
130
  return float(meter.integrated_loudness(x))
131
 
132
 
133
- def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:
 
 
 
 
134
  upsampled_sr = 192000
135
  x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
136
  true_peak = np.max(np.abs(x_upsampled))
@@ -144,7 +214,7 @@ def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP)
144
 
145
  x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
146
  x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
147
- return x_limited.astype("float32")
148
 
149
 
150
  def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
@@ -153,9 +223,9 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
153
  """
154
  try:
155
  current_lufs = measure_loudness(x, sr)
156
-
157
  if not np.isfinite(current_lufs):
158
- return x.astype("float32")
159
 
160
  gain_db = TARGET_LOUDNESS - current_lufs
161
  gain = 10 ** (gain_db / 20)
@@ -163,9 +233,7 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
163
  y = x * gain
164
  y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
165
 
166
- return y.astype("float32")
167
  except Exception as e:
168
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
169
- return x.astype("float32")
170
-
171
-
 
1
  from typing import Optional
 
 
 
2
  import io
3
+ import warnings
4
+
5
+ import librosa
6
+ import librosa.display
7
  import matplotlib.pyplot as plt
8
+ import numpy as np
9
  import pyloudnorm as pyln
10
+ from matplotlib.patches import Patch
11
+ from PIL import Image
12
+
13
+ from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
14
+
15
 
16
def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
    """Build subtitle-style labels covering [0, length] from VAD speech segments.

    Each entry is ``{"text": "Voice Detection: <state>", "timestamp": [t0, t1]}``.
    The gaps before, between, and after the given speech segments are labelled
    with ``VAD_OFF``; the segments themselves with ``VAD_ON``. Segments are
    assumed to arrive in chronological order (not re-sorted here).
    """
    labels: list[dict] = []
    cursor = 0.0

    def _add(state: str, t0: float, t1: float) -> None:
        # Single place that knows the label dict shape.
        labels.append(
            {
                "text": f"Voice Detection: {state}",
                "timestamp": [t0, t1],
            }
        )

    for seg_start, seg_end in vad_timestamps:
        # Silence gap between the previous segment end and this segment start.
        if seg_start > cursor:
            _add(VAD_OFF, cursor, seg_start)
        _add(VAD_ON, seg_start, seg_end)
        cursor = seg_end

    # Trailing silence up to the end of the audio, if any.
    if cursor < length:
        _add(VAD_OFF, cursor, length)

    return labels
46
 
47
 
48
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
    """Return (sample_rate, int16 array) for Gradio Audio."""
    audio = np.squeeze(np.asarray(x))

    # Channel-first stereo/mono (channels, samples) -> (samples, channels).
    if audio.ndim == 2 and audio.shape[0] in (1, 2) and audio.shape[1] > audio.shape[0]:
        audio = audio.T

    # Collapse a trailing singleton channel axis down to plain mono.
    if audio.ndim == 2 and audio.shape[1] == 1:
        audio = audio[:, 0]

    # Clip to [-1, 1] in float32, then scale to the int16 range Gradio expects.
    clipped = np.clip(audio.astype(np.float32), -1.0, 1.0)
    samples = (clipped * 32767).astype(np.int16)

    return sr, samples
64
+
65
+
66
+ def _merge_vad_segments(
67
+ vad_timestamps: list[list[float]],
68
+ gap_tolerance: float = 0.05,
69
+ ) -> list[tuple[float, float]]:
70
+ if not vad_timestamps:
71
+ return []
72
+
73
+ segments = sorted((float(start), float(end)) for start, end in vad_timestamps)
74
+ merged: list[tuple[float, float]] = [segments[0]]
75
+
76
+ for start, end in segments[1:]:
77
+ last_start, last_end = merged[-1]
78
+ if start <= last_end + gap_tolerance:
79
+ merged[-1] = (last_start, max(last_end, end))
80
+ else:
81
+ merged.append((start, end))
82
+
83
+ return merged
84
 
85
 
86
  def spec_image(
 
92
  fmax: Optional[float] = None,
93
  vad_timestamps: Optional[list[list[float]]] = None,
94
  ) -> Image.Image:
95
+ y = np.asarray(audio_array, dtype=np.float32).flatten()
96
+
 
 
97
  S = librosa.feature.melspectrogram(
98
  y=y,
99
  sr=sr,
 
102
  n_mels=n_mels,
103
  fmax=fmax or sr // 2,
104
  )
105
+ S_db = librosa.power_to_db(S, ref=np.max)
106
+
107
  fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
108
+
109
  img = librosa.display.specshow(
110
+ S_db,
111
+ sr=sr,
112
+ hop_length=hop_length,
113
+ x_axis="time",
114
+ y_axis="mel",
115
+ cmap="magma",
116
+ ax=ax,
117
  )
118
+
119
+ if vad_timestamps:
120
+ vad_color = "#22C55E" # softer, cleaner green
121
+ merged_segments = _merge_vad_segments(vad_timestamps, gap_tolerance=0.05)
122
+
123
+ ymin, ymax = ax.get_ylim()
124
+ bar_height = (ymax - ymin) * 0.02
125
+ bar_bottom = ymin
126
+
127
+ for start, end in merged_segments:
128
+ ax.fill_between(
129
+ [start, end],
130
+ [bar_bottom, bar_bottom],
131
+ [bar_bottom + bar_height, bar_bottom + bar_height],
132
+ color=vad_color,
133
+ alpha=0.95,
134
+ linewidth=0,
135
+ zorder=5,
136
+ )
137
+
138
+ vad_patch = Patch(
139
+ facecolor=vad_color,
140
+ edgecolor=vad_color,
141
+ label="Voice Activity",
142
+ )
143
+ ax.legend(
144
+ handles=[vad_patch],
145
+ loc="upper right",
146
+ fontsize=8,
147
+ frameon=True,
148
+ framealpha=0.9,
149
+ )
150
+
151
  cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
152
  cbar.set_label("dB")
153
+
154
  ax.set_title("Mel-spectrogram")
155
  ax.set_xlabel("Time in s")
156
  ax.set_ylabel("Frequency in Hz")
157
+
158
  fig.tight_layout(pad=0.2)
159
+
160
  buf = io.BytesIO()
161
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
 
 
 
 
162
  plt.close(fig)
163
+
164
  buf.seek(0)
165
  return Image.open(buf).convert("RGB")
166
 
 
171
def compute_wer(reference: str, hypothesis: str) -> float:
    """Compute the word error rate between a reference and a hypothesis.

    Uses a full Levenshtein DP over whitespace-split words and returns
    edit_distance / number_of_reference_words. An empty reference is guarded
    with max(len, 1), so the raw distance is returned in that case.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # int32 instead of uint16: uint16 silently wraps past 65535, corrupting
    # the distance (and therefore the WER) for long transcripts.
    d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.int32)

    # Base cases: transforming to/from the empty sequence.
    for i in range(len(ref_words) + 1):
        d[i][0] = i
    for j in range(len(hyp_words) + 1):
        d[0][j] = j

    for i in range(1, len(ref_words) + 1):
        for j in range(1, len(hyp_words) + 1):
            cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
            d[i][j] = min(
                d[i - 1][j] + 1,  # deletion
                d[i][j - 1] + 1,  # insertion
                d[i - 1][j - 1] + cost,  # substitution
            )

    return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
192
 
193
 
194
  def measure_loudness(x: np.ndarray, sr: int) -> float:
 
196
  return float(meter.integrated_loudness(x))
197
 
198
 
199
+ def true_peak_limiter(
200
+ x: np.ndarray,
201
+ sr: int,
202
+ max_true_peak: float = TARGET_TP,
203
+ ) -> np.ndarray:
204
  upsampled_sr = 192000
205
  x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
206
  true_peak = np.max(np.abs(x_upsampled))
 
214
 
215
  x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
216
  x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
217
+ return x_limited.astype(np.float32)
218
 
219
 
220
  def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
 
223
  """
224
  try:
225
  current_lufs = measure_loudness(x, sr)
226
+
227
  if not np.isfinite(current_lufs):
228
+ return x.astype(np.float32)
229
 
230
  gain_db = TARGET_LOUDNESS - current_lufs
231
  gain = 10 ** (gain_db / 20)
 
233
  y = x * gain
234
  y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
235
 
236
+ return y.astype(np.float32)
237
  except Exception as e:
238
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
239
+ return x.astype(np.float32)