Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
mariesig committed on
Commit ·
4be2da0
1
Parent(s): aa7cfd7
Add audio normalization
Browse files- app.py +19 -7
- constants.py +3 -0
- offline_pipeline.py +9 -6
- requirements.txt +2 -1
- utils.py +51 -4
app.py
CHANGED
|
@@ -69,7 +69,7 @@ def process_with_live_transcript(
|
|
| 69 |
result_holder["error"] = e
|
| 70 |
|
| 71 |
# 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
|
| 72 |
-
|
| 73 |
noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
|
| 74 |
if input_array is not None:
|
| 75 |
try:
|
|
@@ -238,10 +238,17 @@ with gr.Blocks() as demo:
|
|
| 238 |
with gr.Tab("Upload local file") as upload_tab:
|
| 239 |
with gr.Row():
|
| 240 |
gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
|
| 241 |
-
audio_file_upload = gr.
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
)
|
| 244 |
-
|
| 245 |
enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
|
| 246 |
|
| 247 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
|
@@ -385,12 +392,17 @@ with gr.Blocks() as demo:
|
|
| 385 |
# Uploading a local file triggers loading the audio file and hiding results until enhancement
|
| 386 |
audio_file_upload.change(
|
| 387 |
lambda: gr.update(visible=False),
|
| 388 |
-
inputs=None,
|
| 389 |
outputs=results_card,
|
| 390 |
).then(
|
| 391 |
load_local_file,
|
| 392 |
-
inputs=[audio_file_upload],
|
| 393 |
-
outputs=[input_array, sample_stem]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
)
|
| 395 |
|
| 396 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
|
|
|
| 69 |
result_holder["error"] = e
|
| 70 |
|
| 71 |
# 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
|
| 72 |
+
_ = cleanup_previous_run(last_sample_stem)
|
| 73 |
noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
|
| 74 |
if input_array is not None:
|
| 75 |
try:
|
|
|
|
| 238 |
with gr.Tab("Upload local file") as upload_tab:
|
| 239 |
with gr.Row():
|
| 240 |
gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
|
| 241 |
+
audio_file_upload = gr.File(
|
| 242 |
+
file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
|
| 243 |
+
file_count="single",
|
| 244 |
+
scale=3,
|
| 245 |
+
)
|
| 246 |
+
normalize = gr.Checkbox(label="Normalize audio", value=True)
|
| 247 |
+
audio_preview = gr.Audio(
|
| 248 |
+
label="Preview",
|
| 249 |
+
autoplay=False,
|
| 250 |
+
interactive=False,
|
| 251 |
)
|
|
|
|
| 252 |
enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
|
| 253 |
|
| 254 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
|
|
|
| 392 |
# Uploading a local file triggers loading the audio file and hiding results until enhancement
|
| 393 |
audio_file_upload.change(
|
| 394 |
lambda: gr.update(visible=False),
|
|
|
|
| 395 |
outputs=results_card,
|
| 396 |
).then(
|
| 397 |
load_local_file,
|
| 398 |
+
inputs=[audio_file_upload, normalize],
|
| 399 |
+
outputs=[input_array, sample_stem, audio_preview],
|
| 400 |
+
)
|
| 401 |
+
|
| 402 |
+
normalize.change(
|
| 403 |
+
load_local_file,
|
| 404 |
+
inputs=[audio_file_upload, normalize],
|
| 405 |
+
outputs=[input_array, sample_stem, audio_preview],
|
| 406 |
)
|
| 407 |
|
| 408 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
constants.py
CHANGED
|
@@ -23,6 +23,9 @@ DEFAULT_SR: Final = 16000
|
|
| 23 |
STREAM_EVERY: Final = 0.2
|
| 24 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
STREAMER_CLASSES: Final = {
|
| 27 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 28 |
"Soniox STT-RT v3": SonioxStreamer,
|
|
|
|
| 23 |
STREAM_EVERY: Final = 0.2
|
| 24 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 25 |
|
| 26 |
+
TARGET_LOUDNESS: Final = -17.0
|
| 27 |
+
TARGET_TP: Final = -1.5
|
| 28 |
+
|
| 29 |
STREAMER_CLASSES: Final = {
|
| 30 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 31 |
"Soniox STT-RT v3": SonioxStreamer,
|
offline_pipeline.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Optional
|
|
| 5 |
import gradio as gr
|
| 6 |
import librosa
|
| 7 |
from sdk import SDKWrapper
|
| 8 |
-
from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio
|
| 9 |
from hf_dataset_utils import get_audio, get_transcript
|
| 10 |
from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
|
| 11 |
import numpy as np
|
|
@@ -320,17 +320,20 @@ def run_offline_pipeline_streaming(
|
|
| 320 |
|
| 321 |
|
| 322 |
def load_local_file(
|
| 323 |
-
sample_path: str
|
| 324 |
-
|
|
|
|
| 325 |
if not sample_path or not os.path.exists(sample_path):
|
| 326 |
-
|
| 327 |
-
raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
|
| 328 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 329 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 330 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 331 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 332 |
y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
|
| 336 |
if not sample_id:
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import librosa
|
| 7 |
from sdk import SDKWrapper
|
| 8 |
+
from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio, normalize_lufs
|
| 9 |
from hf_dataset_utils import get_audio, get_transcript
|
| 10 |
from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
|
| 11 |
import numpy as np
|
|
|
|
| 320 |
|
| 321 |
|
| 322 |
def load_local_file(
|
| 323 |
+
sample_path: str,
|
| 324 |
+
normalize: bool = True,
|
| 325 |
+
) -> tuple[np.ndarray | None, str, tuple | None]:
|
| 326 |
if not sample_path or not os.path.exists(sample_path):
|
| 327 |
+
return None, "", None
|
|
|
|
| 328 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 329 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 330 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 331 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 332 |
y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
|
| 333 |
+
if normalize:
|
| 334 |
+
y_16k = normalize_lufs(y_16k, DEFAULT_SR)
|
| 335 |
+
gradio_audio = to_gradio_audio(y_16k, DEFAULT_SR)
|
| 336 |
+
return y_16k, new_sample_stem, gradio_audio
|
| 337 |
|
| 338 |
def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
|
| 339 |
if not sample_id:
|
requirements.txt
CHANGED
|
@@ -13,4 +13,5 @@ soxr
|
|
| 13 |
datasets
|
| 14 |
torchcodec
|
| 15 |
torch
|
| 16 |
-
torchaudio
|
|
|
|
|
|
| 13 |
datasets
|
| 14 |
torchcodec
|
| 15 |
torch
|
| 16 |
+
torchaudio
|
| 17 |
+
pyloudnorm
|
utils.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
| 1 |
-
from typing import
|
| 2 |
-
|
| 3 |
import numpy as np
|
| 4 |
import librosa
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
-
import
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 12 |
"""Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
|
|
@@ -96,6 +98,51 @@ def compute_wer(reference: str, hypothesis: str) -> float:
|
|
| 96 |
return wer
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
def transcribe_audio(
|
| 101 |
audio_array: np.ndarray,
|
|
|
|
| 1 |
+
from typing import Optional, Callable
|
| 2 |
+
import resampy
|
| 3 |
import numpy as np
|
| 4 |
import librosa
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
+
from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP, STREAMER_CLASSES
|
| 9 |
+
import warnings
|
| 10 |
+
import pyloudnorm as pyln
|
| 11 |
+
|
| 12 |
|
| 13 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 14 |
"""Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
|
|
|
|
| 98 |
return wer
|
| 99 |
|
| 100 |
|
| 101 |
+
def measure_loudness(x: np.ndarray, sr: int) -> float:
    """Return the integrated loudness of ``x`` measured by a pyloudnorm meter.

    Args:
        x: Audio samples.
        sr: Sample rate of ``x`` in Hz, used to configure the meter.

    Returns:
        The meter's integrated loudness reading as a plain Python float
        (presumably in LUFS, per pyloudnorm -- confirm against its docs).
    """
    loudness = pyln.Meter(sr).integrated_loudness(x)
    return float(loudness)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:
    """Attenuate ``x`` so its estimated true peak does not exceed ``max_true_peak``.

    The inter-sample ("true") peak is estimated by oversampling the signal to
    192 kHz and taking the largest absolute sample. If that peak, expressed as
    ``20 * log10(amplitude)``, is above ``max_true_peak``, one constant gain
    is applied to the whole signal. No time-varying limiting is performed.

    The gain is applied to the original samples rather than to the oversampled
    copy: resampling is linear, so the resulting true peak is the same, and the
    audio is not degraded by a resample round trip when little or no
    attenuation is needed.

    Args:
        x: Audio samples (floating point).
        sr: Sample rate of ``x`` in Hz.
        max_true_peak: Peak ceiling in dB relative to an amplitude of 1.0.

    Returns:
        A float32 array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max() raises on empty input; there is nothing to limit anyway.
        return x.astype("float32")

    upsampled_sr = 192000
    if sr < upsampled_sr:
        x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
    else:
        # Already at or above the oversampling rate: measure the peak directly.
        x_upsampled = x
    true_peak = np.max(np.abs(x_upsampled))

    gain = 1.0
    if true_peak > 0:
        true_peak_db = 20 * np.log10(true_peak)
        if true_peak_db > max_true_peak:
            gain_db = max_true_peak - true_peak_db
            gain = 10 ** (gain_db / 20)

    return (x * gain).astype("float32")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
    """Bring ``x`` to the ``TARGET_LOUDNESS`` level, then cap its true peak.

    The loudness from ``measure_loudness`` is compared with
    ``TARGET_LOUDNESS``; the difference (in dB) is applied as a constant gain
    and the result is passed through ``true_peak_limiter`` with ``TARGET_TP``
    as the ceiling.

    This is best-effort: if the measured loudness is not finite, or any step
    raises, the input is returned unscaled (a warning is emitted in the
    exception case).

    Args:
        x: Audio samples.
        sr: Sample rate of ``x`` in Hz.

    Returns:
        The normalized (or, on fallback, original) samples as float32.
    """
    try:
        loudness = measure_loudness(x, sr)

        if np.isfinite(loudness):
            # dB difference to the target, converted to a linear amplitude factor.
            scale = 10 ** ((TARGET_LOUDNESS - loudness) / 20)
            limited = true_peak_limiter(x * scale, sr, max_true_peak=TARGET_TP)
            return limited.astype("float32")

        # Non-finite loudness reading: no meaningful gain can be derived.
        return x.astype("float32")
    except Exception as e:
        warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
        return x.astype("float32")
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
|
| 147 |
def transcribe_audio(
|
| 148 |
audio_array: np.ndarray,
|