Spaces:

ai-coustics
/

VoiceFocus

Running on CPU Upgrade

App Files Files Community

mariesig committed on Mar 13

Commit

4be2da0

1 Parent(s): aa7cfd7

Add audio normalization

Browse files

Files changed (5) hide show

app.py +19 -7
constants.py +3 -0
offline_pipeline.py +9 -6
requirements.txt +2 -1
utils.py +51 -4

app.py CHANGED Viewed

@@ -69,7 +69,7 @@ def process_with_live_transcript(
             result_holder["error"] = e
     # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
-    cleanup_out = cleanup_previous_run(last_sample_stem)
     noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
     if input_array is not None:
         try:
@@ -238,10 +238,17 @@ with gr.Blocks() as demo:
         with gr.Tab("Upload local file") as upload_tab:
             with gr.Row():
                 gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
-            audio_file_upload = gr.Audio(
-                type="filepath", sources=["upload"], buttons=["download"], autoplay=True
             )
         enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
     with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
@@ -385,12 +392,17 @@ with gr.Blocks() as demo:
     # Uploading a local file triggers loading the audio file and hiding results until enhancement
     audio_file_upload.change(
         lambda: gr.update(visible=False),
-        inputs=None,
         outputs=results_card,
     ).then(
         load_local_file,
-        inputs=[audio_file_upload],
-        outputs=[input_array, sample_stem]
     )
     # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).

             result_holder["error"] = e
     # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
+    _ = cleanup_previous_run(last_sample_stem)
     noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
     if input_array is not None:
         try:
         with gr.Tab("Upload local file") as upload_tab:
             with gr.Row():
                 gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
+            audio_file_upload = gr.File(
+                file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
+                file_count="single",
+                scale=3,
+            )
+            normalize = gr.Checkbox(label="Normalize audio", value=True)
+            audio_preview = gr.Audio(
+                label="Preview",
+                autoplay=False,
+                interactive=False,
             )
         enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
     with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
     # Uploading a local file triggers loading the audio file and hiding results until enhancement
     audio_file_upload.change(
         lambda: gr.update(visible=False),
         outputs=results_card,
     ).then(
         load_local_file,
+        inputs=[audio_file_upload, normalize],
+        outputs=[input_array, sample_stem, audio_preview],
+    )
+    normalize.change(
+        load_local_file,
+        inputs=[audio_file_upload, normalize],
+        outputs=[input_array, sample_stem, audio_preview],
     )
     # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).

constants.py CHANGED Viewed

@@ -23,6 +23,9 @@ DEFAULT_SR: Final = 16000
 STREAM_EVERY: Final = 0.2
 WARMUP_SECONDS: Final = 2  # seconds before "recording ready" light turns on
 STREAMER_CLASSES: Final = {
     "Deepgram Nova-3 RT": DeepgramStreamer,
     "Soniox STT-RT v3": SonioxStreamer,

 STREAM_EVERY: Final = 0.2
 WARMUP_SECONDS: Final = 2  # seconds before "recording ready" light turns on
+TARGET_LOUDNESS: Final = -17.0  # integrated-loudness target; normalize_lufs derives its gain (in dB) from the distance to this value
+TARGET_TP: Final = -1.5  # peak ceiling in dB relative to amplitude 1.0; default max_true_peak of true_peak_limiter
 STREAMER_CLASSES: Final = {
     "Deepgram Nova-3 RT": DeepgramStreamer,
     "Soniox STT-RT v3": SonioxStreamer,

offline_pipeline.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional
 import gradio as gr
 import librosa
 from sdk import SDKWrapper
-from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio
 from hf_dataset_utils import get_audio, get_transcript
 from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
 import numpy as np
@@ -320,17 +320,20 @@ def run_offline_pipeline_streaming(
 def load_local_file(
-    sample_path: str
-    ) -> tuple[np.ndarray, str]:
     if not sample_path or not os.path.exists(sample_path):
-        gr.Warning("Please upload a valid audio file.")
-        raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
     if os.path.getsize(sample_path) > 5 * 1024 * 1024:
         gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
         raise ValueError("Uploaded file exceeds the 5 MB size limit.")
     new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
     y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
-    return y_16k, new_sample_stem
 def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
     if not sample_id:

 import gradio as gr
 import librosa
 from sdk import SDKWrapper
+from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio, normalize_lufs
 from hf_dataset_utils import get_audio, get_transcript
 from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
 import numpy as np
 def load_local_file(  # load an uploaded audio file and return (samples, file stem, to_gradio_audio result)
+    sample_path: str,  # path of the uploaded file; a falsy or non-existent path yields the empty result below
+    normalize: bool = True,  # when true, the samples are passed through normalize_lufs
+    ) -> tuple[np.ndarray | None, str, tuple | None]:
     if not sample_path or not os.path.exists(sample_path):
+        return None, "", None  # nothing to load: return empty values instead of raising
     if os.path.getsize(sample_path) > 5 * 1024 * 1024:  # reject files larger than 5 * 1024 * 1024 bytes
         gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
         raise ValueError("Uploaded file exceeds the 5 MB size limit.")
     new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]  # file name without directory or extension
     y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)  # decode as mono float32, resampled to DEFAULT_SR
+    if normalize:
+        y_16k = normalize_lufs(y_16k, DEFAULT_SR)
+    gradio_audio = to_gradio_audio(y_16k, DEFAULT_SR)  # built from the (possibly normalized) samples
+    return y_16k, new_sample_stem, gradio_audio
 def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
     if not sample_id:

requirements.txt CHANGED Viewed

@@ -13,4 +13,5 @@ soxr
 datasets
 torchcodec
 torch
-torchaudio

 datasets
 torchcodec
 torch
+torchaudio
+pyloudnorm

utils.py CHANGED Viewed

@@ -1,12 +1,14 @@
-from typing import Callable, Optional
 import numpy as np
 import librosa
 from PIL import Image
 import io
 import matplotlib.pyplot as plt
-import resampy
-from constants import DEFAULT_SR, STREAMER_CLASSES
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
     """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
@@ -96,6 +98,51 @@ def compute_wer(reference: str, hypothesis: str) -> float:
     return wer
 def transcribe_audio(
     audio_array: np.ndarray,

+from typing import Optional, Callable
+import resampy
 import numpy as np
 import librosa
 from PIL import Image
 import io
 import matplotlib.pyplot as plt
+from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP, STREAMER_CLASSES
+import warnings
+import pyloudnorm as pyln
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
     """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
     return wer
+def measure_loudness(x: np.ndarray, sr: int) -> float:  # integrated loudness of x as measured by pyloudnorm
+    meter = pyln.Meter(sr)  # meter configured for the sample rate of x
+    return float(meter.integrated_loudness(x))  # converted to a plain Python float
+def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:  # scale x down uniformly so its oversampled peak does not exceed max_true_peak (dB)
+    upsampled_sr = 192000  # peak is measured on a 192 kHz resampled copy of the signal
+    x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
+    true_peak = np.max(np.abs(x_upsampled))  # largest absolute sample value after upsampling
+    if true_peak > 0:  # an all-zero signal is left untouched; also avoids log10(0)
+        true_peak_db = 20 * np.log10(true_peak)  # amplitude -> dB (0 dB = amplitude 1.0)
+        if true_peak_db > max_true_peak:  # only attenuate; quieter signals are never boosted here
+            gain_db = max_true_peak - true_peak_db  # negative: the amount by which the peak overshoots
+            gain = 10 ** (gain_db / 20)  # dB -> linear factor
+            x_upsampled = x_upsampled * gain  # one scalar gain for the whole signal
+    x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)  # NOTE(review): gain is a scalar, so it could presumably be applied to x directly without this round trip - confirm
+    x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])  # force the output back to the input's length along the last axis
+    return x_limited.astype("float32")
+def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:  # always returns float32; never raises for Exception subclasses
+    """
+    Normalize audio to a fixed integrated loudness target and limit true peak.
+    """
+    try:
+        current_lufs = measure_loudness(x, sr)
+        if not np.isfinite(current_lufs):  # no usable measurement (inf/nan): skip scaling entirely
+            return x.astype("float32")
+        gain_db = TARGET_LOUDNESS - current_lufs  # dB needed to move the measured loudness onto the target
+        gain = 10 ** (gain_db / 20)  # dB -> linear factor
+        y = x * gain
+        y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)  # pull the result back down if the gain pushed the peak over TARGET_TP
+        return y.astype("float32")
+    except Exception as e:  # best effort: any failure falls back to the unmodified input with a warning
+        warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
+        return x.astype("float32")
 def transcribe_audio(
     audio_array: np.ndarray,