mariesig committed on
Commit
4be2da0
·
1 Parent(s): aa7cfd7

Add audio normalization

Browse files
Files changed (5) hide show
  1. app.py +19 -7
  2. constants.py +3 -0
  3. offline_pipeline.py +9 -6
  4. requirements.txt +2 -1
  5. utils.py +51 -4
app.py CHANGED
@@ -69,7 +69,7 @@ def process_with_live_transcript(
69
  result_holder["error"] = e
70
 
71
  # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
72
- cleanup_out = cleanup_previous_run(last_sample_stem)
73
  noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
74
  if input_array is not None:
75
  try:
@@ -238,10 +238,17 @@ with gr.Blocks() as demo:
238
  with gr.Tab("Upload local file") as upload_tab:
239
  with gr.Row():
240
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
241
- audio_file_upload = gr.Audio(
242
- type="filepath", sources=["upload"], buttons=["download"], autoplay=True
 
 
 
 
 
 
 
 
243
  )
244
-
245
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
246
 
247
  with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
@@ -385,12 +392,17 @@ with gr.Blocks() as demo:
385
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
386
  audio_file_upload.change(
387
  lambda: gr.update(visible=False),
388
- inputs=None,
389
  outputs=results_card,
390
  ).then(
391
  load_local_file,
392
- inputs=[audio_file_upload],
393
- outputs=[input_array, sample_stem]
 
 
 
 
 
 
394
  )
395
 
396
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
 
69
  result_holder["error"] = e
70
 
71
  # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
72
+ _ = cleanup_previous_run(last_sample_stem)
73
  noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
74
  if input_array is not None:
75
  try:
 
238
  with gr.Tab("Upload local file") as upload_tab:
239
  with gr.Row():
240
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
241
+ audio_file_upload = gr.File(
242
+ file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
243
+ file_count="single",
244
+ scale=3,
245
+ )
246
+ normalize = gr.Checkbox(label="Normalize audio", value=True)
247
+ audio_preview = gr.Audio(
248
+ label="Preview",
249
+ autoplay=False,
250
+ interactive=False,
251
  )
 
252
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
253
 
254
  with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
 
392
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
393
  audio_file_upload.change(
394
  lambda: gr.update(visible=False),
 
395
  outputs=results_card,
396
  ).then(
397
  load_local_file,
398
+ inputs=[audio_file_upload, normalize],
399
+ outputs=[input_array, sample_stem, audio_preview],
400
+ )
401
+
402
+ normalize.change(
403
+ load_local_file,
404
+ inputs=[audio_file_upload, normalize],
405
+ outputs=[input_array, sample_stem, audio_preview],
406
  )
407
 
408
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
constants.py CHANGED
@@ -23,6 +23,9 @@ DEFAULT_SR: Final = 16000
23
  STREAM_EVERY: Final = 0.2
24
  WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
25
 
 
 
 
26
  STREAMER_CLASSES: Final = {
27
  "Deepgram Nova-3 RT": DeepgramStreamer,
28
  "Soniox STT-RT v3": SonioxStreamer,
 
23
  STREAM_EVERY: Final = 0.2
24
  WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
25
 
26
+ TARGET_LOUDNESS: Final = -17.0
27
+ TARGET_TP: Final = -1.5
28
+
29
  STREAMER_CLASSES: Final = {
30
  "Deepgram Nova-3 RT": DeepgramStreamer,
31
  "Soniox STT-RT v3": SonioxStreamer,
offline_pipeline.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
  import gradio as gr
6
  import librosa
7
  from sdk import SDKWrapper
8
- from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio
9
  from hf_dataset_utils import get_audio, get_transcript
10
  from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
11
  import numpy as np
@@ -320,17 +320,20 @@ def run_offline_pipeline_streaming(
320
 
321
 
322
  def load_local_file(
323
- sample_path: str
324
- ) -> tuple[np.ndarray, str]:
 
325
  if not sample_path or not os.path.exists(sample_path):
326
- gr.Warning("Please upload a valid audio file.")
327
- raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
328
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
329
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
330
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
331
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
332
  y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
333
- return y_16k, new_sample_stem
 
 
 
334
 
335
  def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
336
  if not sample_id:
 
5
  import gradio as gr
6
  import librosa
7
  from sdk import SDKWrapper
8
+ from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio, normalize_lufs
9
  from hf_dataset_utils import get_audio, get_transcript
10
  from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
11
  import numpy as np
 
320
 
321
 
322
def load_local_file(
    sample_path: str,
    normalize: bool = True,
) -> tuple[np.ndarray | None, str, tuple | None]:
    """
    Load an uploaded audio file as DEFAULT_SR (16 kHz) mono float32.

    Args:
        sample_path: path to the uploaded file; may be empty/None when the
            upload component was cleared.
        normalize: when True, loudness-normalize the samples via
            ``normalize_lufs``.

    Returns:
        ``(samples, file_stem, gradio_audio)`` where ``gradio_audio`` is the
        ``(sr, int16)`` tuple produced by ``to_gradio_audio``; returns
        ``(None, "", None)`` when no valid path is supplied.

    Raises:
        ValueError: when the file is larger than 5 MB.
    """
    # Cleared/missing upload: reset downstream state instead of erroring.
    if not sample_path or not os.path.exists(sample_path):
        return None, "", None

    size_limit_bytes = 5 * 1024 * 1024
    if os.path.getsize(sample_path) > size_limit_bytes:
        gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
        raise ValueError("Uploaded file exceeds the 5 MB size limit.")

    stem, _ext = os.path.splitext(os.path.basename(sample_path))
    samples, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
    if normalize:
        samples = normalize_lufs(samples, DEFAULT_SR)
    return samples, stem, to_gradio_audio(samples, DEFAULT_SR)
337
 
338
  def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
339
  if not sample_id:
requirements.txt CHANGED
@@ -13,4 +13,5 @@ soxr
13
  datasets
14
  torchcodec
15
  torch
16
- torchaudio
 
 
13
  datasets
14
  torchcodec
15
  torch
16
+ torchaudio
17
+ pyloudnorm
utils.py CHANGED
@@ -1,12 +1,14 @@
1
- from typing import Callable, Optional
2
-
3
  import numpy as np
4
  import librosa
5
  from PIL import Image
6
  import io
7
  import matplotlib.pyplot as plt
8
- import resampy
9
- from constants import DEFAULT_SR, STREAMER_CLASSES
 
 
10
 
11
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
12
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
@@ -96,6 +98,51 @@ def compute_wer(reference: str, hypothesis: str) -> float:
96
  return wer
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  def transcribe_audio(
101
  audio_array: np.ndarray,
 
1
+ from typing import Optional, Callable
2
+ import resampy
3
  import numpy as np
4
  import librosa
5
  from PIL import Image
6
  import io
7
  import matplotlib.pyplot as plt
8
+ from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP, STREAMER_CLASSES
9
+ import warnings
10
+ import pyloudnorm as pyln
11
+
12
 
13
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
14
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
 
98
  return wer
99
 
100
 
101
def measure_loudness(x: np.ndarray, sr: int) -> float:
    """Return the integrated loudness of mono audio ``x`` (sample rate ``sr``)
    in LUFS, as reported by pyloudnorm's meter."""
    loudness_meter = pyln.Meter(sr)
    return float(loudness_meter.integrated_loudness(x))
104
+
105
+
106
def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:
    """Attenuate ``x`` so its estimated true (inter-sample) peak does not
    exceed ``max_true_peak`` dBTP.

    The true peak is estimated on a 192 kHz oversampled copy of the signal.
    The (scalar) attenuation is then applied to the ORIGINAL samples rather
    than down-sampling the oversampled copy back — the previous round trip
    (resample up, gain, resample down, fix_length) altered the audio with
    resampling artifacts even when no limiting was required.

    Args:
        x: mono float audio samples.
        sr: sample rate of ``x`` in Hz.
        max_true_peak: ceiling in dBTP (defaults to ``TARGET_TP``).

    Returns:
        float32 array with the same length and content as ``x``, scaled down
        only when the true-peak ceiling is exceeded.
    """
    oversampled_sr = 192000
    oversampled = librosa.resample(x, orig_sr=sr, target_sr=oversampled_sr)
    # Guard empty input: np.max on an empty array would raise.
    true_peak = float(np.max(np.abs(oversampled))) if oversampled.size else 0.0

    if true_peak > 0:
        true_peak_db = 20 * np.log10(true_peak)
        if true_peak_db > max_true_peak:
            # Scalar gain on the original signal is equivalent to attenuating
            # the oversampled copy, without an up/down resample of the output.
            gain = 10 ** ((max_true_peak - true_peak_db) / 20)
            return (x * gain).astype("float32")

    return np.asarray(x, dtype="float32")
121
+
122
+
123
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
    """
    Normalize audio to a fixed integrated loudness target and limit true peak.

    Applies the linear gain needed to reach ``TARGET_LOUDNESS`` LUFS, then
    passes the result through ``true_peak_limiter`` with a ``TARGET_TP``
    ceiling. Best effort: on any failure (e.g. the loudness meter rejecting
    the input) a warning is emitted and the input is returned unchanged.
    Always returns float32.
    """
    try:
        current_lufs = measure_loudness(x, sr)

        # Digital silence measures as -inf LUFS; nothing to normalize.
        if not np.isfinite(current_lufs):
            return x.astype("float32")

        linear_gain = 10 ** ((TARGET_LOUDNESS - current_lufs) / 20)
        limited = true_peak_limiter(x * linear_gain, sr, max_true_peak=TARGET_TP)
        return limited.astype("float32")
    except Exception as e:
        warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
        return x.astype("float32")
143
+
144
+
145
+
146
 
147
  def transcribe_audio(
148
  audio_array: np.ndarray,