Spaces:
Running on CPU Upgrade
vad (#3)
Browse files- remove websocket assignment during shutdown (ac7c4481f1d26fd8161dfc668c79440b88d09a5c)
- add vad and sr to streaming (7274b791d482afe7ef9ab2020f74ceace38e16c2)
- move VAD constants to seperate file (06fe4294bf3df2784ab123904982e173aec34394)
- dataset tab stops recording (62744142a409edfa5686bd086b723901e71e92a1)
- remove redundant code from tab switch (0a77e8e90695c7eb99d272461aec14c057c9b08a)
- improve layout (ba97774b30ae8d6dc089f66e0b1d8471dbe10098)
- refactor streaming pipeline (b22fd96890c75efae0e6c56d5bc5d9c329be7192)
- validate sample rate in stt streamers (ab0481a43d19026f3fd291f05b2b146c857ad746)
- default SDK Params (43283c8a389a539e100033f0ecda6038e0b1f1ea)
- remove unused DEFAULT_SAMPLE_ID constant (df08ce34defdaf2ab622249f3012fc53b5585986)
- reorder/name to match offline pipeline (c4c5df8d3f6acc14eb6d426047b74576b68a6e4f)
- fix ui on tab switch (be39c5be95c88f69b66c46e15501b808e3c25507)
- refactor offline pipeline (4e945b9cf36c72bb129d1bb2ce23a9a849fc4af4)
- replace soundfile with librosa (03a504941537d06664bacde8b154e48e9348a429)
- bug fix (4319862f89f59aed7f8739bf83be99d34001698f)
- ruff (6606020cb7ce644e81865e4c79066f7f82fd2d7c)
- VAD in spectrogram (a7c506c419746c71d1d63d8719017ebc50b2ef07)
- stt model change on streaming (12039524efd8521399fecb520e0274213ff044ee)
- fix vad bar height (b949ffac592b85f0fd3b52d975485659f7c4d98a)
- app.py +132 -278
- assets/active_light.css +0 -68
- assets/styling.css +82 -0
- clean_up.py +2 -5
- constants.py +3 -1
- offline_pipeline.py +210 -86
- sdk.py +44 -18
- stream_pipeline.py +133 -158
- stt_streamers/deepgram_streamer.py +2 -2
- stt_streamers/soniox_streamer.py +2 -3
- ui.py +20 -0
- utils.py +135 -36
|
@@ -1,162 +1,33 @@
|
|
| 1 |
import os
|
| 2 |
-
import threading
|
| 3 |
-
import time
|
| 4 |
import gradio as gr
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
-
from constants import STREAM_EVERY,
|
| 8 |
-
from
|
|
|
|
| 9 |
|
| 10 |
from stream_pipeline import (
|
| 11 |
-
|
| 12 |
-
get_live_transcripts,
|
| 13 |
on_stop_recording,
|
| 14 |
-
set_stt_streamer,
|
| 15 |
-
stop_online_backend,
|
| 16 |
-
transcribe_stream,
|
| 17 |
shutdown_streamers,
|
|
|
|
| 18 |
)
|
| 19 |
from offline_pipeline import (
|
| 20 |
load_file_from_dataset,
|
| 21 |
load_local_file,
|
| 22 |
-
|
| 23 |
)
|
| 24 |
-
from utils import spec_image
|
| 25 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 26 |
|
| 27 |
-
# Active light HTML: whole container is the light (gray = warming up, red = ready)
|
| 28 |
-
ACTIVE_LIGHT_GRAY = (
|
| 29 |
-
'<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 30 |
-
'<span class="active-light__label">Warming up...</span></div>'
|
| 31 |
-
)
|
| 32 |
-
ACTIVE_LIGHT_RED = (
|
| 33 |
-
'<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 34 |
-
'<span class="active-light__label">Ready!</span></div>'
|
| 35 |
-
)
|
| 36 |
-
WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def warmup_tick(on_stream_tab, warmup_elapsed, _current_html):
|
| 40 |
-
if not on_stream_tab:
|
| 41 |
-
return 0, ACTIVE_LIGHT_GRAY, gr.update(active=True)
|
| 42 |
-
if warmup_elapsed >= WARMUP_TICKS:
|
| 43 |
-
return warmup_elapsed, ACTIVE_LIGHT_RED, gr.update(active=False)
|
| 44 |
-
return warmup_elapsed + 1, ACTIVE_LIGHT_GRAY, gr.update(active=True)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def process_with_live_transcript(
|
| 48 |
-
input_array,
|
| 49 |
-
enhancement_level,
|
| 50 |
-
sample_stem,
|
| 51 |
-
stt_model,
|
| 52 |
-
last_sample_stem,
|
| 53 |
-
current_sample_rate,
|
| 54 |
-
):
|
| 55 |
-
"""Generator that runs the offline pipeline in real time (chunked): enhanced audio and
|
| 56 |
-
both transcripts stream from the first chunk so playback and transcription start immediately."""
|
| 57 |
-
progress_state = {}
|
| 58 |
-
result_holder = {}
|
| 59 |
-
|
| 60 |
-
def worker():
|
| 61 |
-
try:
|
| 62 |
-
result_holder["result"] = run_offline_pipeline_streaming(
|
| 63 |
-
input_array,
|
| 64 |
-
current_sample_rate,
|
| 65 |
-
enhancement_level,
|
| 66 |
-
sample_stem,
|
| 67 |
-
stt_model,
|
| 68 |
-
progress_state
|
| 69 |
-
)
|
| 70 |
-
except Exception as e:
|
| 71 |
-
result_holder["error"] = e
|
| 72 |
-
|
| 73 |
-
# 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
|
| 74 |
-
_ = cleanup_previous_run(last_sample_stem)
|
| 75 |
-
noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
|
| 76 |
-
if input_array is not None:
|
| 77 |
-
try:
|
| 78 |
-
spec_image(input_array).save(noisy_spec_path)
|
| 79 |
-
except Exception:
|
| 80 |
-
noisy_spec_path = None
|
| 81 |
-
else:
|
| 82 |
-
noisy_spec_path = None
|
| 83 |
-
try:
|
| 84 |
-
original_transcript = get_transcript(sample_stem)
|
| 85 |
-
except Exception:
|
| 86 |
-
original_transcript = "Unavailable"
|
| 87 |
-
|
| 88 |
-
yield (
|
| 89 |
-
gr.update(visible=True),
|
| 90 |
-
None, # enhanced_audio: set only in final yield (smooth playback)
|
| 91 |
-
gr.update(value=None), # enhanced_image: clear until step 3 (last)
|
| 92 |
-
gr.update(value=noisy_spec_path), # noisy_image: input spectrogram (step 1)
|
| 93 |
-
original_transcript,
|
| 94 |
-
"",
|
| 95 |
-
"",
|
| 96 |
-
sample_stem,
|
| 97 |
-
None,
|
| 98 |
-
"",
|
| 99 |
-
)
|
| 100 |
-
# Let the UI render step 1 before we flood with polling updates
|
| 101 |
-
time.sleep(0.2)
|
| 102 |
-
|
| 103 |
-
thread = threading.Thread(target=worker, daemon=True)
|
| 104 |
-
thread.start()
|
| 105 |
-
|
| 106 |
-
poll_interval = 0.05
|
| 107 |
-
while "result" not in result_holder and "error" not in result_holder:
|
| 108 |
-
time.sleep(poll_interval)
|
| 109 |
-
# 2) Realtime: stream transcripts only; audio set in final yield for smooth playback
|
| 110 |
-
yield (
|
| 111 |
-
gr.update(visible=True),
|
| 112 |
-
gr.update(), # enhanced_audio: set only in final yield, then autoplay
|
| 113 |
-
gr.update(), # enhanced_image: reveal only in step 3 (final yield)
|
| 114 |
-
gr.update(), # noisy_image already set in step 1
|
| 115 |
-
gr.update(), # original_transcript unchanged
|
| 116 |
-
gr.update(value=progress_state.get("noisy", "")),
|
| 117 |
-
gr.update(value=progress_state.get("enhanced", "")),
|
| 118 |
-
gr.update(),
|
| 119 |
-
gr.update(),
|
| 120 |
-
gr.update(),
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
if "error" in result_holder:
|
| 124 |
-
raise result_holder["error"]
|
| 125 |
-
|
| 126 |
-
(
|
| 127 |
-
enhanced_spec_path,
|
| 128 |
-
enhanced_transcript,
|
| 129 |
-
noisy_transcript_with_wer,
|
| 130 |
-
enhanced_audio,
|
| 131 |
-
last_stem,
|
| 132 |
-
enhanced_array,
|
| 133 |
-
precomputed_noisy,
|
| 134 |
-
) = result_holder["result"]
|
| 135 |
-
|
| 136 |
-
# 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
|
| 137 |
-
yield (
|
| 138 |
-
gr.update(visible=True),
|
| 139 |
-
enhanced_audio,
|
| 140 |
-
enhanced_spec_path, # enhanced_image: show only now
|
| 141 |
-
noisy_spec_path,
|
| 142 |
-
original_transcript,
|
| 143 |
-
noisy_transcript_with_wer,
|
| 144 |
-
enhanced_transcript,
|
| 145 |
-
last_stem,
|
| 146 |
-
enhanced_array,
|
| 147 |
-
precomputed_noisy,
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
|
| 151 |
_CSS_DIR = Path(__file__).resolve().parent / "assets"
|
| 152 |
with gr.Blocks() as demo:
|
| 153 |
sample_stem = gr.State("")
|
| 154 |
last_sample_stem = gr.State("")
|
| 155 |
input_array = gr.State()
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
gr.HTML(
|
| 161 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
| 162 |
'<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
|
|
@@ -182,80 +53,90 @@ with gr.Blocks() as demo:
|
|
| 182 |
scale=2,
|
| 183 |
)
|
| 184 |
|
| 185 |
-
with gr.
|
| 186 |
with gr.Tab("Stream audio in real time") as stream_tab:
|
| 187 |
-
gr.
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
with gr.Row(elem_classes="stream-row"):
|
| 200 |
-
with gr.Column(scale=4, min_width=200):
|
| 201 |
-
audio_stream = gr.Audio(
|
| 202 |
-
sources=["microphone"], streaming=True, elem_id="audio_stream"
|
| 203 |
-
)
|
| 204 |
-
with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
|
| 205 |
-
active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
|
| 206 |
-
with gr.Group(elem_classes="panel"):
|
| 207 |
-
with gr.Column(scale=5, min_width=320):
|
| 208 |
-
enhanced_text = gr.Textbox(
|
| 209 |
-
label="Enhanced Transcribed Text", lines=6, autoscroll=False
|
| 210 |
)
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
)
|
| 215 |
-
# Poll transcript globals so interim results show live (streamers update in background)
|
| 216 |
-
transcript_timer = gr.Timer(0.1, active=True)
|
| 217 |
-
transcript_timer.tick(
|
| 218 |
-
get_live_transcripts,
|
| 219 |
-
inputs=None,
|
| 220 |
-
outputs=[enhanced_text, raw_text],
|
| 221 |
-
show_progress="hidden",
|
| 222 |
-
)
|
| 223 |
-
warmup_timer = gr.Timer(0.5, active=True)
|
| 224 |
-
warmup_timer.tick(
|
| 225 |
-
warmup_tick,
|
| 226 |
-
inputs=[on_stream_tab, warmup_elapsed, active_light],
|
| 227 |
-
outputs=[warmup_elapsed, active_light, warmup_timer],
|
| 228 |
-
show_progress="hidden",
|
| 229 |
-
)
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
|
| 232 |
-
with gr.
|
| 233 |
-
gr.Markdown(
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
|
| 241 |
with gr.Tab("Upload local file") as upload_tab:
|
| 242 |
-
with gr.
|
| 243 |
-
gr.Markdown(
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
label="
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
| 258 |
-
result_title = gr.Markdown("")
|
| 259 |
enhanced_audio = gr.Audio(
|
| 260 |
type="numpy",
|
| 261 |
interactive=False,
|
|
@@ -284,107 +165,75 @@ with gr.Blocks() as demo:
|
|
| 284 |
)
|
| 285 |
|
| 286 |
# ------------------------------------------------------
|
| 287 |
-
#
|
| 288 |
# ------------------------------------------------------
|
| 289 |
-
DEFAULT_DATASET_SAMPLE = "en_00412_i_h_36"
|
| 290 |
-
|
| 291 |
-
def load_dataset_sample_on_tab_visit(dropdown_value):
|
| 292 |
-
"""Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
|
| 293 |
-
sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
|
| 294 |
-
audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
|
| 295 |
-
return sample_id, audio_path, arr, stem, sample_rate
|
| 296 |
-
|
| 297 |
stream_tab.select(
|
| 298 |
lambda: (
|
| 299 |
gr.update(visible=False),
|
| 300 |
gr.update(visible=False),
|
| 301 |
-
gr.update(
|
| 302 |
-
True,
|
| 303 |
-
0,
|
| 304 |
-
ACTIVE_LIGHT_GRAY,
|
| 305 |
-
gr.update(active=True),
|
| 306 |
),
|
| 307 |
-
|
| 308 |
-
outputs=[results_card, enhance_btn, audio_stream, on_stream_tab, warmup_elapsed, active_light, warmup_timer],
|
| 309 |
)
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
upload_tab.select(
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
False,
|
| 315 |
-
ACTIVE_LIGHT_GRAY,
|
| 316 |
-
),
|
| 317 |
-
inputs=None,
|
| 318 |
-
outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
|
| 319 |
).then(
|
| 320 |
-
|
| 321 |
-
|
|
|
|
| 322 |
)
|
| 323 |
-
|
| 324 |
dataset_tab.select(
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
False,
|
| 328 |
-
ACTIVE_LIGHT_GRAY,
|
| 329 |
-
),
|
| 330 |
-
inputs=None,
|
| 331 |
-
outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
|
| 332 |
-
).then(
|
| 333 |
-
lambda: gr.update(visible=True),
|
| 334 |
-
outputs=enhance_btn,
|
| 335 |
).then(
|
| 336 |
-
|
| 337 |
-
inputs=
|
| 338 |
-
outputs=[
|
| 339 |
-
)
|
| 340 |
-
|
| 341 |
-
stt_model.change(
|
| 342 |
-
fn=shutdown_streamers,
|
| 343 |
-
).then(
|
| 344 |
-
clear_ui,
|
| 345 |
-
inputs=None,
|
| 346 |
-
outputs=[stream_state, enhanced_text, raw_text],
|
| 347 |
-
).then(
|
| 348 |
-
set_stt_streamer,
|
| 349 |
-
inputs=stt_model,
|
| 350 |
-
outputs=None,
|
| 351 |
)
|
| 352 |
-
|
| 353 |
# ------------------------------------------------------
|
| 354 |
-
#
|
| 355 |
# ------------------------------------------------------
|
| 356 |
-
|
| 357 |
audio_stream.stream(
|
| 358 |
-
|
| 359 |
-
inputs=[
|
| 360 |
-
outputs=[
|
| 361 |
stream_every=STREAM_EVERY,
|
| 362 |
time_limit=60 * 2,
|
| 363 |
-
concurrency_limit=1,
|
| 364 |
)
|
| 365 |
|
| 366 |
audio_stream.stop_recording(
|
| 367 |
on_stop_recording,
|
|
|
|
|
|
|
|
|
|
| 368 |
)
|
| 369 |
|
| 370 |
audio_stream.start_recording(
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
).then(
|
| 375 |
-
fn=set_stt_streamer,
|
| 376 |
-
inputs=stt_model,
|
| 377 |
-
outputs=None,
|
| 378 |
-
)
|
| 379 |
|
| 380 |
# ------------------------------------------------------
|
| 381 |
-
# OFFLINE
|
| 382 |
# ------------------------------------------------------
|
| 383 |
|
| 384 |
# Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
|
| 385 |
dataset_dropdown.change(
|
| 386 |
lambda: gr.update(visible=False),
|
| 387 |
-
inputs=None,
|
| 388 |
outputs=results_card,
|
| 389 |
).then(
|
| 390 |
load_file_from_dataset,
|
|
@@ -410,10 +259,15 @@ with gr.Blocks() as demo:
|
|
| 410 |
|
| 411 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 412 |
enhance_btn.click(
|
| 413 |
-
|
| 414 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
outputs=[
|
| 416 |
-
results_card,
|
| 417 |
enhanced_audio,
|
| 418 |
enhanced_image,
|
| 419 |
noisy_image,
|
|
@@ -421,15 +275,15 @@ with gr.Blocks() as demo:
|
|
| 421 |
noisy_transcript,
|
| 422 |
enhanced_transcript,
|
| 423 |
last_sample_stem,
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
)
|
| 428 |
|
| 429 |
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 430 |
purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
|
| 431 |
demo.queue()
|
| 432 |
demo.launch(
|
| 433 |
-
css=(_CSS_DIR / "
|
| 434 |
allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
|
| 435 |
)
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
+
from constants import STREAM_EVERY, APP_TMP_DIR
|
| 6 |
+
from ui import LED_DOT_OFF
|
| 7 |
+
from hf_dataset_utils import ALL_FILES
|
| 8 |
|
| 9 |
from stream_pipeline import (
|
| 10 |
+
on_start_recording,
|
|
|
|
| 11 |
on_stop_recording,
|
|
|
|
|
|
|
|
|
|
| 12 |
shutdown_streamers,
|
| 13 |
+
stream_step,
|
| 14 |
)
|
| 15 |
from offline_pipeline import (
|
| 16 |
load_file_from_dataset,
|
| 17 |
load_local_file,
|
| 18 |
+
run_offline_pipeline,
|
| 19 |
)
|
|
|
|
| 20 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
_CSS_DIR = Path(__file__).resolve().parent / "assets"
|
| 24 |
with gr.Blocks() as demo:
|
| 25 |
sample_stem = gr.State("")
|
| 26 |
last_sample_stem = gr.State("")
|
| 27 |
input_array = gr.State()
|
| 28 |
+
streaming_sr = gr.State(None)
|
| 29 |
+
current_sample_rate = gr.State(None)
|
| 30 |
+
|
|
|
|
| 31 |
gr.HTML(
|
| 32 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
| 33 |
'<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
|
|
|
|
| 53 |
scale=2,
|
| 54 |
)
|
| 55 |
|
| 56 |
+
with gr.Tabs():
|
| 57 |
with gr.Tab("Stream audio in real time") as stream_tab:
|
| 58 |
+
with gr.Row(equal_height=False, elem_classes="stream-layout"):
|
| 59 |
+
with gr.Column(scale=4, min_width=320):
|
| 60 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 61 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 62 |
+
gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 63 |
+
|
| 64 |
+
input_gain_db = gr.Slider(
|
| 65 |
+
minimum=0,
|
| 66 |
+
maximum=20,
|
| 67 |
+
step=0.5,
|
| 68 |
+
value=0,
|
| 69 |
+
label="Input gain (dB)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
)
|
| 71 |
+
|
| 72 |
+
audio_stream = gr.Audio(
|
| 73 |
+
sources=["microphone"],
|
| 74 |
+
streaming=True,
|
| 75 |
+
elem_id="audio_stream",
|
| 76 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
with gr.Column(scale=6, min_width=420):
|
| 79 |
+
with gr.Group(elem_classes="status-panel output-panel"):
|
| 80 |
+
gr.Markdown("### Live Output", elem_classes="title")
|
| 81 |
+
with gr.Row(equal_height=True, elem_classes="status-indicators"):
|
| 82 |
+
with gr.Group(elem_classes="status-card"):
|
| 83 |
+
gr.Markdown("**System Status**", elem_classes="status-card__label")
|
| 84 |
+
system_status_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
|
| 85 |
+
system_status_text = gr.Markdown(
|
| 86 |
+
value="Off",
|
| 87 |
+
elem_classes="status-card__subtext",
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
with gr.Group(elem_classes="status-card"):
|
| 91 |
+
gr.Markdown("**Voice Activity**", elem_classes="status-card__label")
|
| 92 |
+
vad_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
|
| 93 |
+
|
| 94 |
+
with gr.Row(equal_height=True, elem_classes="transcript-row transcript-row--large"):
|
| 95 |
+
enhanced_text = gr.Textbox(
|
| 96 |
+
label="Enhanced Transcript",
|
| 97 |
+
lines=10,
|
| 98 |
+
autoscroll=False,
|
| 99 |
+
)
|
| 100 |
+
raw_text = gr.Textbox(
|
| 101 |
+
label="Raw Transcript",
|
| 102 |
+
lines=10,
|
| 103 |
+
autoscroll=False,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
|
| 108 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 109 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 110 |
+
gr.Markdown(open("docs/dawn_chorus.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 111 |
+
|
| 112 |
+
dataset_dropdown = gr.Dropdown(
|
| 113 |
+
choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
|
| 114 |
+
)
|
| 115 |
+
audio_file_from_dataset = gr.Audio(
|
| 116 |
+
type="filepath", interactive=False, buttons=["download"], autoplay=False
|
| 117 |
+
)
|
| 118 |
|
| 119 |
with gr.Tab("Upload local file") as upload_tab:
|
| 120 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 121 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 122 |
+
gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 123 |
+
|
| 124 |
+
audio_file_upload = gr.File(
|
| 125 |
+
file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
|
| 126 |
+
file_count="single",
|
| 127 |
+
scale=3,
|
| 128 |
+
)
|
| 129 |
+
normalize = gr.Checkbox(label="Normalize audio", value=True)
|
| 130 |
+
audio_preview = gr.Audio(
|
| 131 |
+
label="Preview",
|
| 132 |
+
autoplay=False,
|
| 133 |
+
interactive=False,
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
|
| 137 |
|
| 138 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
| 139 |
+
result_title = gr.Markdown("", elem_classes="title")
|
| 140 |
enhanced_audio = gr.Audio(
|
| 141 |
type="numpy",
|
| 142 |
interactive=False,
|
|
|
|
| 165 |
)
|
| 166 |
|
| 167 |
# ------------------------------------------------------
|
| 168 |
+
# TAB CHANGES
|
| 169 |
# ------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
stream_tab.select(
|
| 171 |
lambda: (
|
| 172 |
gr.update(visible=False),
|
| 173 |
gr.update(visible=False),
|
| 174 |
+
gr.update(sources=["microphone"], streaming=True, interactive= True),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
),
|
| 176 |
+
outputs=[results_card, enhance_btn, audio_stream],
|
|
|
|
| 177 |
)
|
| 178 |
|
| 179 |
+
def _on_not_streaming_tab():
|
| 180 |
+
shutdown_streamers()
|
| 181 |
+
return (
|
| 182 |
+
gr.update(streaming=False, interactive=False),
|
| 183 |
+
gr.update(visible=True),
|
| 184 |
+
LED_DOT_OFF,
|
| 185 |
+
LED_DOT_OFF,
|
| 186 |
+
"Off",
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
upload_tab.select(
|
| 190 |
+
_on_not_streaming_tab,
|
| 191 |
+
outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
).then(
|
| 193 |
+
load_local_file,
|
| 194 |
+
inputs=[audio_file_upload, normalize],
|
| 195 |
+
outputs=[input_array, sample_stem, audio_preview, current_sample_rate],
|
| 196 |
)
|
| 197 |
+
|
| 198 |
dataset_tab.select(
|
| 199 |
+
_on_not_streaming_tab,
|
| 200 |
+
outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
).then(
|
| 202 |
+
load_file_from_dataset,
|
| 203 |
+
inputs=dataset_dropdown,
|
| 204 |
+
outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
)
|
|
|
|
| 206 |
# ------------------------------------------------------
|
| 207 |
+
# STREAMING EVENTS
|
| 208 |
# ------------------------------------------------------
|
|
|
|
| 209 |
audio_stream.stream(
|
| 210 |
+
stream_step,
|
| 211 |
+
inputs=[audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db],
|
| 212 |
+
outputs=[streaming_sr, system_status_led, system_status_text,enhanced_text, raw_text, vad_led],
|
| 213 |
stream_every=STREAM_EVERY,
|
| 214 |
time_limit=60 * 2,
|
| 215 |
+
concurrency_limit=1,
|
| 216 |
)
|
| 217 |
|
| 218 |
audio_stream.stop_recording(
|
| 219 |
on_stop_recording,
|
| 220 |
+
outputs=[vad_led, system_status_led, system_status_text, streaming_sr],
|
| 221 |
+
).then(
|
| 222 |
+
shutdown_streamers,
|
| 223 |
)
|
| 224 |
|
| 225 |
audio_stream.start_recording(
|
| 226 |
+
on_start_recording,
|
| 227 |
+
outputs=[enhanced_text, raw_text, system_status_led, system_status_text],
|
| 228 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
# ------------------------------------------------------
|
| 231 |
+
# OFFLINE EVENTS (DATASET + LOCAL FILE)
|
| 232 |
# ------------------------------------------------------
|
| 233 |
|
| 234 |
# Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
|
| 235 |
dataset_dropdown.change(
|
| 236 |
lambda: gr.update(visible=False),
|
|
|
|
| 237 |
outputs=results_card,
|
| 238 |
).then(
|
| 239 |
load_file_from_dataset,
|
|
|
|
| 259 |
|
| 260 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 261 |
enhance_btn.click(
|
| 262 |
+
cleanup_previous_run,
|
| 263 |
+
inputs=[last_sample_stem]
|
| 264 |
+
).then(
|
| 265 |
+
lambda: gr.update(visible=True),
|
| 266 |
+
outputs=results_card,
|
| 267 |
+
).then(
|
| 268 |
+
run_offline_pipeline,
|
| 269 |
+
inputs=[input_array, current_sample_rate, enhancement_level, stt_model, sample_stem],
|
| 270 |
outputs=[
|
|
|
|
| 271 |
enhanced_audio,
|
| 272 |
enhanced_image,
|
| 273 |
noisy_image,
|
|
|
|
| 275 |
noisy_transcript,
|
| 276 |
enhanced_transcript,
|
| 277 |
last_sample_stem,
|
| 278 |
+
]
|
| 279 |
+
).failure(
|
| 280 |
+
lambda: gr.Warning("Enhancement failed. Please refresh page and make sure you have a stable connection.")
|
| 281 |
)
|
| 282 |
|
| 283 |
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 284 |
purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
|
| 285 |
demo.queue()
|
| 286 |
demo.launch(
|
| 287 |
+
css=(_CSS_DIR / "styling.css").read_text(encoding="utf-8"),
|
| 288 |
allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
|
| 289 |
)
|
|
@@ -1,68 +0,0 @@
|
|
| 1 |
-
/* Stream row: stretch columns to match audio height */
|
| 2 |
-
.stream-row {
|
| 3 |
-
align-items: stretch !important;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
/* Active light column: flex container, fill height */
|
| 7 |
-
.stream-row > div:last-child {
|
| 8 |
-
display: flex !important;
|
| 9 |
-
align-items: stretch !important;
|
| 10 |
-
min-height: 100%;
|
| 11 |
-
}
|
| 12 |
-
|
| 13 |
-
/* Gradio block wrapper: fill and flex so child can stretch */
|
| 14 |
-
.stream-row > div:last-child > div {
|
| 15 |
-
display: flex !important;
|
| 16 |
-
width: 100% !important;
|
| 17 |
-
min-height: 100% !important;
|
| 18 |
-
height: 100% !important;
|
| 19 |
-
box-sizing: border-box;
|
| 20 |
-
}
|
| 21 |
-
|
| 22 |
-
/* All divs in light column: fill so panel stretches */
|
| 23 |
-
.stream-row > div:last-child div {
|
| 24 |
-
width: 100% !important;
|
| 25 |
-
min-height: 100% !important;
|
| 26 |
-
height: 100% !important;
|
| 27 |
-
box-sizing: border-box;
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
-
/* Active light panel: fill container, base styles */
|
| 31 |
-
.active-light {
|
| 32 |
-
width: 100% !important;
|
| 33 |
-
min-height: 100% !important;
|
| 34 |
-
height: 100% !important;
|
| 35 |
-
box-sizing: border-box;
|
| 36 |
-
border-radius: 8px;
|
| 37 |
-
border: 1px solid var(--border-color-primary);
|
| 38 |
-
transition: background 0.2s, box-shadow 0.2s;
|
| 39 |
-
padding: 0.5rem;
|
| 40 |
-
}
|
| 41 |
-
|
| 42 |
-
/* Warming up state */
|
| 43 |
-
.active-light--off {
|
| 44 |
-
background: #555;
|
| 45 |
-
box-shadow: inset 0 2px 8px rgba(0, 0, 0, 0.4);
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
/* Ready state */
|
| 49 |
-
.active-light--on {
|
| 50 |
-
background: #c00;
|
| 51 |
-
box-shadow:
|
| 52 |
-
0 0 16px rgba(220, 0, 0, 0.5),
|
| 53 |
-
inset 0 0 12px rgba(255, 80, 80, 0.3);
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
/* Label text */
|
| 57 |
-
.active-light__label {
|
| 58 |
-
font-size: 12px;
|
| 59 |
-
font-weight: 500;
|
| 60 |
-
}
|
| 61 |
-
|
| 62 |
-
.active-light--off .active-light__label {
|
| 63 |
-
color: rgba(255, 255, 255, 0.85);
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
.active-light--on .active-light__label {
|
| 67 |
-
color: #fff;
|
| 68 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.status-panel {
|
| 2 |
+
padding: 16px;
|
| 3 |
+
border-radius: 16px;
|
| 4 |
+
border: 1px solid #27272a;
|
| 5 |
+
background: #111827;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
.title, .tab-description {
|
| 9 |
+
margin-left: 2px;
|
| 10 |
+
margin-top: 2px;
|
| 11 |
+
margin-bottom: 0px;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
.transcript-row {
|
| 15 |
+
gap: 12px;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
.transcript-row--large textarea {
|
| 19 |
+
min-height: 260px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.status-card {
|
| 23 |
+
height: 50px;
|
| 24 |
+
border: 1px solid #3f3f46;
|
| 25 |
+
border-radius: 14px;
|
| 26 |
+
background: #18181b;
|
| 27 |
+
padding: 5px 5px;
|
| 28 |
+
display: flex;
|
| 29 |
+
flex-direction: column;
|
| 30 |
+
justify-content: center;
|
| 31 |
+
align-items: center;
|
| 32 |
+
box-sizing: border-box;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
.status-card__label {
|
| 37 |
+
text-align: center;
|
| 38 |
+
margin-bottom: 0px;
|
| 39 |
+
color: #e4e4e7;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.status-card__subtext {
|
| 43 |
+
margin-top: 0px;
|
| 44 |
+
font-size: 0.85rem;
|
| 45 |
+
color: #a1a1aa;
|
| 46 |
+
text-align: center;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.led-dot {
|
| 50 |
+
width: 30px;
|
| 51 |
+
height: 30px;
|
| 52 |
+
border-radius: 9999px;
|
| 53 |
+
border: 1px solid #71717a;
|
| 54 |
+
margin: 0 auto;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.led-dot--black {
|
| 58 |
+
background: #1c1c1c;
|
| 59 |
+
box-shadow: none;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.led-dot--red {
|
| 63 |
+
background: #ef4444;
|
| 64 |
+
box-shadow: 0 0 14px rgba(239, 68, 68, 0.85);
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
.led-dot--green {
|
| 68 |
+
background: #22c55e;
|
| 69 |
+
box-shadow: 0 0 14px rgba(34, 197, 94, 0.85);
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.led-dot--yellow {
|
| 73 |
+
background: #facc15;
|
| 74 |
+
box-shadow: 0 0 14px rgba(250, 204, 21, 0.85);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
.led-dot--off {
|
| 80 |
+
background: #3f3f46;
|
| 81 |
+
box-shadow: none;
|
| 82 |
+
}
|
|
@@ -2,7 +2,6 @@ import os
|
|
| 2 |
import time
|
| 3 |
from loguru import logger
|
| 4 |
from constants import MINUTES_KEEP, APP_TMP_DIR
|
| 5 |
-
import gradio as gr
|
| 6 |
|
| 7 |
|
| 8 |
def purge_tmp_directory(
|
|
@@ -93,11 +92,9 @@ def cleanup_previous_run(
|
|
| 93 |
sample_stem: str,
|
| 94 |
tmp_dir: str = APP_TMP_DIR,
|
| 95 |
max_age_minutes: int = MINUTES_KEEP,
|
| 96 |
-
)
|
| 97 |
-
gr.Info("Processing started. This may take a moment. Please do not refresh or close the window.")
|
| 98 |
try:
|
| 99 |
remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
|
| 100 |
except Exception as e:
|
| 101 |
print(f"Failed to delete last run with id {sample_stem}: {e}")
|
| 102 |
-
purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
|
| 103 |
-
return None, None, "", "", ""
|
|
|
|
| 2 |
import time
|
| 3 |
from loguru import logger
|
| 4 |
from constants import MINUTES_KEEP, APP_TMP_DIR
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def purge_tmp_directory(
|
|
|
|
| 92 |
sample_stem: str,
|
| 93 |
tmp_dir: str = APP_TMP_DIR,
|
| 94 |
max_age_minutes: int = MINUTES_KEEP,
|
| 95 |
+
):
|
|
|
|
| 96 |
try:
|
| 97 |
remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
|
| 98 |
except Exception as e:
|
| 99 |
print(f"Failed to delete last run with id {sample_stem}: {e}")
|
| 100 |
+
purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
|
|
|
|
@@ -19,7 +19,6 @@ MIX_DIR: Final = "mix"
|
|
| 19 |
SPEECH_DIR: Final = "speech"
|
| 20 |
TRANS_DIR: Final = "transcripts"
|
| 21 |
|
| 22 |
-
DEFAULT_SR: Final = 16000
|
| 23 |
STREAM_EVERY: Final = 0.2
|
| 24 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 25 |
|
|
@@ -30,3 +29,6 @@ STREAMER_CLASSES: Final = {
|
|
| 30 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 31 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 32 |
}
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
SPEECH_DIR: Final = "speech"
|
| 20 |
TRANS_DIR: Final = "transcripts"
|
| 21 |
|
|
|
|
| 22 |
STREAM_EVERY: Final = 0.2
|
| 23 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 24 |
|
|
|
|
| 29 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 30 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 31 |
}
|
| 32 |
+
|
| 33 |
+
VAD_ON: Final = "🟢"
|
| 34 |
+
VAD_OFF: Final = "⚫"
|
|
@@ -1,153 +1,277 @@
|
|
| 1 |
import os
|
| 2 |
-
from
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
-
import soundfile as sf
|
| 6 |
-
from sdk import SDKWrapper
|
| 7 |
-
from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
|
| 8 |
-
from hf_dataset_utils import get_audio, get_transcript
|
| 9 |
-
from constants import APP_TMP_DIR, STREAMER_CLASSES
|
| 10 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
if hasattr(streamer, "close_stream"):
|
| 16 |
streamer.close_stream()
|
| 17 |
else:
|
| 18 |
streamer.close()
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
|
| 28 |
-
"""Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
|
| 29 |
-
via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
|
| 30 |
-
only at the end; the app plays it automatically when processing is complete.
|
| 31 |
-
Returns same tuple as run_offline_pipeline_ordered."""
|
| 32 |
-
if sample is None:
|
| 33 |
-
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 34 |
-
sample = np.asarray(sample, dtype=np.float32).flatten()
|
| 35 |
-
|
| 36 |
-
sdk = SDKWrapper()
|
| 37 |
-
sdk.init_processor(
|
| 38 |
sample_rate=sample_rate,
|
| 39 |
-
enhancement_level=
|
| 40 |
)
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
-
# Sync transcript callbacks so both boxes update together
|
| 44 |
-
progress_state["noisy_pending"] = ""
|
| 45 |
-
progress_state["enhanced_pending"] = ""
|
| 46 |
-
progress_state["noisy_has_sent"] = False
|
| 47 |
-
progress_state["enhanced_has_sent"] = False
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
progress_state["noisy_pending"] = t
|
| 56 |
-
progress_state["noisy_has_sent"] = True
|
| 57 |
-
_flush_both()
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
if stt_model not in STREAMER_CLASSES:
|
| 65 |
-
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 66 |
-
StreamerClass = STREAMER_CLASSES[stt_model]
|
| 67 |
-
streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
|
| 68 |
-
streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
accumulated_enhanced: list[np.ndarray] = []
|
|
|
|
| 71 |
n = len(sample)
|
| 72 |
|
| 73 |
for i in range(0, n, chunk_size):
|
| 74 |
raw_chunk = sample[i : i + chunk_size]
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
raw_chunk = np.pad(
|
| 77 |
raw_chunk,
|
| 78 |
-
(0, chunk_size -
|
| 79 |
mode="constant",
|
| 80 |
constant_values=0.0,
|
| 81 |
)
|
| 82 |
-
|
| 83 |
-
enhanced_chunk =
|
| 84 |
-
enhanced_1d = np.asarray(enhanced_chunk).flatten()
|
|
|
|
| 85 |
streamer_noisy.process_chunk(raw_chunk)
|
| 86 |
streamer_enhanced.process_chunk(enhanced_1d)
|
| 87 |
accumulated_enhanced.append(enhanced_1d)
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
|
| 99 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 100 |
-
|
| 101 |
|
| 102 |
-
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 103 |
-
spec_image(enhanced_array).save(enhanced_spec_path)
|
| 104 |
-
progress_state["enhanced_spec_path"] = enhanced_spec_path
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
try:
|
| 108 |
original_transcript = get_transcript(sample_id)
|
| 109 |
-
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 110 |
-
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 111 |
-
enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
|
| 112 |
-
noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
|
| 113 |
except Exception:
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
return (
|
|
|
|
| 117 |
enhanced_spec_path,
|
| 118 |
-
|
|
|
|
| 119 |
noisy_transcript,
|
| 120 |
-
|
| 121 |
sample_id,
|
| 122 |
-
enhanced_array,
|
| 123 |
-
precomputed_noisy,
|
| 124 |
)
|
| 125 |
|
|
|
|
| 126 |
def load_local_file(
|
| 127 |
sample_path: str,
|
| 128 |
normalize: bool = True,
|
| 129 |
-
|
| 130 |
if not sample_path or not os.path.exists(sample_path):
|
| 131 |
-
return None, "", None
|
|
|
|
| 132 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 133 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 134 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
|
|
|
| 135 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 136 |
-
y, sample_rate =
|
|
|
|
|
|
|
| 137 |
if normalize:
|
| 138 |
y = normalize_lufs(y, sample_rate)
|
| 139 |
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 140 |
return y, new_sample_stem, gradio_audio, sample_rate
|
| 141 |
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
if not sample_id:
|
| 144 |
gr.Warning("Please select a sample from the dropdown.")
|
| 145 |
return None, None, "", None
|
|
|
|
| 146 |
new_sample_stem = sample_id
|
|
|
|
| 147 |
try:
|
| 148 |
y, sample_rate = get_audio(sample_id, prefix="mix")
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import Any
|
| 3 |
|
| 4 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
|
| 8 |
+
from constants import APP_TMP_DIR, STREAMER_CLASSES
|
| 9 |
+
from hf_dataset_utils import get_audio, get_transcript
|
| 10 |
+
from sdk import SDKParams, SDKWrapper
|
| 11 |
+
from utils import (
|
| 12 |
+
compute_wer,
|
| 13 |
+
get_vad_labels,
|
| 14 |
+
normalize_lufs,
|
| 15 |
+
spec_image,
|
| 16 |
+
to_gradio_audio,
|
| 17 |
+
)
|
| 18 |
|
| 19 |
+
SDK_OFFLINE = SDKWrapper()
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def _safe_progress(progress: gr.Progress, value: float, desc: str) -> None:
|
| 23 |
+
progress(max(0.0, min(1.0, value)), desc=desc)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _empty_pipeline_result(sample_id: str) -> tuple[Any, str, str, str, str, str, str]:
|
| 27 |
+
return (
|
| 28 |
+
None,
|
| 29 |
+
"",
|
| 30 |
+
"",
|
| 31 |
+
"Unavailable",
|
| 32 |
+
"Unavailable",
|
| 33 |
+
"Unavailable",
|
| 34 |
+
sample_id,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _finalize_stream_transcript(streamer) -> str:
|
| 39 |
if hasattr(streamer, "close_stream"):
|
| 40 |
streamer.close_stream()
|
| 41 |
else:
|
| 42 |
streamer.close()
|
| 43 |
|
| 44 |
+
streamer.finished_event.wait()
|
| 45 |
+
with streamer.lock:
|
| 46 |
+
return streamer.render_tokens(streamer.final_tokens, [])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _init_sdk(sample_rate: int, enhancement_level: int) -> int:
|
| 50 |
+
sdk_params = SDKParams(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
sample_rate=sample_rate,
|
| 52 |
+
enhancement_level=enhancement_level / 100.0,
|
| 53 |
)
|
| 54 |
+
SDK_OFFLINE.init_processor(sdk_params)
|
| 55 |
+
return SDK_OFFLINE.num_frames
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
def _init_streamers(
|
| 59 |
+
sample_rate: int,
|
| 60 |
+
stt_model: str,
|
| 61 |
+
sample_id: str,
|
| 62 |
+
progress: gr.Progress,
|
| 63 |
+
):
|
| 64 |
+
if stt_model not in STREAMER_CLASSES:
|
| 65 |
+
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 66 |
|
| 67 |
+
streamer_class = STREAMER_CLASSES[stt_model]
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
_safe_progress(progress, 0.12, f"Initializing {stt_model} stream 1/2...")
|
| 70 |
+
streamer_noisy = streamer_class(sample_rate, f"{sample_id}_noisy")
|
| 71 |
+
|
| 72 |
+
_safe_progress(progress, 0.18, f"Initializing {stt_model} stream 2/2...")
|
| 73 |
+
streamer_enhanced = streamer_class(sample_rate, f"{sample_id}_enhanced")
|
| 74 |
+
|
| 75 |
+
return streamer_noisy, streamer_enhanced
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
def _attach_wer(
|
| 79 |
+
original_transcript: str,
|
| 80 |
+
noisy_transcript: str,
|
| 81 |
+
enhanced_transcript: str,
|
| 82 |
+
) -> tuple[str, str]:
|
| 83 |
+
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 84 |
+
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 85 |
+
|
| 86 |
+
noisy_transcript = f"{noisy_transcript} (WER: {wer_noisy * 100:.2f}%)"
|
| 87 |
+
enhanced_transcript = f"{enhanced_transcript} (WER: {wer_enhanced * 100:.2f}%)"
|
| 88 |
+
return noisy_transcript, enhanced_transcript
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _process_audio_chunks(
|
| 92 |
+
sample: np.ndarray,
|
| 93 |
+
sample_rate: int,
|
| 94 |
+
chunk_size: int,
|
| 95 |
+
streamer_noisy,
|
| 96 |
+
streamer_enhanced,
|
| 97 |
+
progress: gr.Progress,
|
| 98 |
+
) -> tuple[np.ndarray, list[list[float]]]:
|
| 99 |
accumulated_enhanced: list[np.ndarray] = []
|
| 100 |
+
vad_timestamps: list[list[float]] = []
|
| 101 |
n = len(sample)
|
| 102 |
|
| 103 |
for i in range(0, n, chunk_size):
|
| 104 |
raw_chunk = sample[i : i + chunk_size]
|
| 105 |
+
original_chunk_len = raw_chunk.size
|
| 106 |
+
|
| 107 |
+
if original_chunk_len < chunk_size:
|
| 108 |
raw_chunk = np.pad(
|
| 109 |
raw_chunk,
|
| 110 |
+
(0, chunk_size - original_chunk_len),
|
| 111 |
mode="constant",
|
| 112 |
constant_values=0.0,
|
| 113 |
)
|
| 114 |
+
|
| 115 |
+
enhanced_chunk = SDK_OFFLINE.process_chunk(raw_chunk.reshape(1, -1))
|
| 116 |
+
enhanced_1d = np.asarray(enhanced_chunk, dtype=np.float32).flatten()
|
| 117 |
+
|
| 118 |
streamer_noisy.process_chunk(raw_chunk)
|
| 119 |
streamer_enhanced.process_chunk(enhanced_1d)
|
| 120 |
accumulated_enhanced.append(enhanced_1d)
|
| 121 |
|
| 122 |
+
loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
|
| 123 |
+
_safe_progress(
|
| 124 |
+
progress,
|
| 125 |
+
0.20 + 0.50 * loop_progress,
|
| 126 |
+
"Enhancing audio...",
|
| 127 |
+
)
|
| 128 |
|
| 129 |
+
if SDK_OFFLINE.vad_context.is_speech_detected():
|
| 130 |
+
start_in_sec = i / sample_rate
|
| 131 |
+
end_in_sec = min(i + original_chunk_len, n) / sample_rate
|
| 132 |
+
vad_timestamps.append([start_in_sec, end_in_sec])
|
| 133 |
|
| 134 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 135 |
+
return enhanced_array, vad_timestamps
|
| 136 |
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
def _save_spectrograms(
|
| 139 |
+
sample: np.ndarray,
|
| 140 |
+
enhanced_array: np.ndarray,
|
| 141 |
+
sample_rate: int,
|
| 142 |
+
sample_id: str,
|
| 143 |
+
vad_timestamps: list[list[float]],
|
| 144 |
+
) -> tuple[str, str]:
|
| 145 |
+
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 146 |
+
|
| 147 |
+
enhanced_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_enhanced_spectrogram.png")
|
| 148 |
+
noisy_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_noisy_spectrogram.png")
|
| 149 |
+
|
| 150 |
+
spec_image(enhanced_array, sr=sample_rate, vad_timestamps=vad_timestamps).save(enhanced_spec_path)
|
| 151 |
+
spec_image(sample, sr=sample_rate, vad_timestamps=vad_timestamps).save(noisy_spec_path)
|
| 152 |
+
|
| 153 |
+
return enhanced_spec_path, noisy_spec_path
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def run_offline_pipeline(
|
| 157 |
+
sample: np.ndarray,
|
| 158 |
+
sample_rate: int,
|
| 159 |
+
enhancement_level: int,
|
| 160 |
+
stt_model: str,
|
| 161 |
+
sample_id: str,
|
| 162 |
+
progress=gr.Progress(),
|
| 163 |
+
) -> tuple[Any, str, str, str, str, str, str]:
|
| 164 |
+
_safe_progress(progress, 0.00, "Starting...")
|
| 165 |
+
|
| 166 |
+
if sample is None or len(sample) == 0:
|
| 167 |
+
gr.Warning("No audio to enhance. Please upload a file first.")
|
| 168 |
+
return _empty_pipeline_result(sample_id)
|
| 169 |
+
|
| 170 |
+
_safe_progress(progress, 0.05, "Initializing enhancement...")
|
| 171 |
+
chunk_size = _init_sdk(sample_rate, enhancement_level)
|
| 172 |
+
|
| 173 |
+
try:
|
| 174 |
+
streamer_noisy, streamer_enhanced = _init_streamers(
|
| 175 |
+
sample_rate=sample_rate,
|
| 176 |
+
stt_model=stt_model,
|
| 177 |
+
sample_id=sample_id,
|
| 178 |
+
progress=progress,
|
| 179 |
+
)
|
| 180 |
+
except Exception as e:
|
| 181 |
+
raise RuntimeError(f"Failed to initialize STT streaming: {e}") from e
|
| 182 |
+
|
| 183 |
+
enhanced_array, vad_timestamps = _process_audio_chunks(
|
| 184 |
+
sample=sample,
|
| 185 |
+
sample_rate=sample_rate,
|
| 186 |
+
chunk_size=chunk_size,
|
| 187 |
+
streamer_noisy=streamer_noisy,
|
| 188 |
+
streamer_enhanced=streamer_enhanced,
|
| 189 |
+
progress=progress,
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
_safe_progress(progress, 0.72, "Finalizing transcripts...")
|
| 193 |
+
noisy_transcript = _finalize_stream_transcript(streamer_noisy)
|
| 194 |
+
_safe_progress(progress, 0.80, "Finalizing transcripts...")
|
| 195 |
+
enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
|
| 196 |
+
|
| 197 |
+
_safe_progress(progress, 0.94, "Loading reference transcript...")
|
| 198 |
try:
|
| 199 |
original_transcript = get_transcript(sample_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
except Exception:
|
| 201 |
+
original_transcript = "Unavailable"
|
| 202 |
+
if original_transcript != "Unavailable":
|
| 203 |
+
_safe_progress(progress, 0.96, "Computing WER...")
|
| 204 |
+
noisy_transcript, enhanced_transcript = _attach_wer(
|
| 205 |
+
original_transcript=original_transcript,
|
| 206 |
+
noisy_transcript=noisy_transcript,
|
| 207 |
+
enhanced_transcript=enhanced_transcript,
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
_safe_progress(progress, 0.99, "Generating outputs...")
|
| 211 |
+
gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
|
| 212 |
+
enhanced_spec_path, noisy_spec_path = _save_spectrograms(
|
| 213 |
+
sample=sample,
|
| 214 |
+
enhanced_array=enhanced_array,
|
| 215 |
+
sample_rate=sample_rate,
|
| 216 |
+
sample_id=sample_id,
|
| 217 |
+
vad_timestamps=vad_timestamps
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
vad_labels = get_vad_labels(
|
| 221 |
+
vad_timestamps,
|
| 222 |
+
length=len(sample) / sample_rate,
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
_safe_progress(progress, 1.00, "Done.")
|
| 226 |
|
| 227 |
return (
|
| 228 |
+
gr.update(value=gradio_enhanced_audio, subtitles=vad_labels),
|
| 229 |
enhanced_spec_path,
|
| 230 |
+
noisy_spec_path,
|
| 231 |
+
original_transcript,
|
| 232 |
noisy_transcript,
|
| 233 |
+
enhanced_transcript,
|
| 234 |
sample_id,
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
+
|
| 238 |
def load_local_file(
|
| 239 |
sample_path: str,
|
| 240 |
normalize: bool = True,
|
| 241 |
+
) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
|
| 242 |
if not sample_path or not os.path.exists(sample_path):
|
| 243 |
+
return None, "", None, None
|
| 244 |
+
|
| 245 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 246 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 247 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 248 |
+
|
| 249 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 250 |
+
y, sample_rate = librosa.load(sample_path, sr=None, mono=True)
|
| 251 |
+
sample_rate = int(sample_rate)
|
| 252 |
+
y = np.asarray(y, dtype=np.float32)
|
| 253 |
if normalize:
|
| 254 |
y = normalize_lufs(y, sample_rate)
|
| 255 |
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 256 |
return y, new_sample_stem, gradio_audio, sample_rate
|
| 257 |
|
| 258 |
+
|
| 259 |
+
def load_file_from_dataset(
|
| 260 |
+
sample_id: str,
|
| 261 |
+
) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
|
| 262 |
if not sample_id:
|
| 263 |
gr.Warning("Please select a sample from the dropdown.")
|
| 264 |
return None, None, "", None
|
| 265 |
+
|
| 266 |
new_sample_stem = sample_id
|
| 267 |
+
|
| 268 |
try:
|
| 269 |
y, sample_rate = get_audio(sample_id, prefix="mix")
|
| 270 |
+
except Exception as e:
|
| 271 |
+
gr.Warning(str(e))
|
| 272 |
+
raise
|
| 273 |
+
y = np.asarray(y, dtype=np.float32)
|
| 274 |
+
if y.ndim > 1:
|
| 275 |
+
y = np.mean(y, axis=0)
|
| 276 |
+
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 277 |
+
return gradio_audio, y, new_sample_stem, sample_rate
|
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import aic_sdk as aic
|
|
@@ -8,6 +9,24 @@ from constants import MODEL_ID
|
|
| 8 |
load_dotenv()
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class SDKWrapper:
|
| 12 |
def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
|
| 13 |
if os.getenv("AIC_SDK_KEY") is None:
|
|
@@ -16,25 +35,25 @@ class SDKWrapper:
|
|
| 16 |
model_path = aic.Model.download(model_id, models_dir)
|
| 17 |
self.model = aic.Model.from_file(model_path)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
self.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
num_channels=num_channels,
|
| 27 |
num_frames=self.num_frames,
|
| 28 |
-
allow_variable_frames=allow_variable_frames,
|
| 29 |
)
|
| 30 |
-
if sync:
|
| 31 |
-
processor = aic.Processor(self.model, self.sdk_key,
|
| 32 |
else:
|
| 33 |
-
processor = aic.ProcessorAsync(self.model, self.sdk_key,
|
| 34 |
-
processor.get_processor_context().set_parameter(
|
| 35 |
-
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 36 |
)
|
| 37 |
-
self.
|
|
|
|
| 38 |
|
| 39 |
def change_enhancement_level(self, enhancement_level: float):
|
| 40 |
if not hasattr(self, "processor"):
|
|
@@ -42,6 +61,7 @@ class SDKWrapper:
|
|
| 42 |
self.processor.get_processor_context().set_parameter(
|
| 43 |
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 44 |
)
|
|
|
|
| 45 |
|
| 46 |
def _check_shape(self, audio: np.ndarray) -> np.ndarray:
|
| 47 |
if len(audio.shape) == 1:
|
|
@@ -50,15 +70,17 @@ class SDKWrapper:
|
|
| 50 |
raise ValueError("Expected audio with shape (n, frames)")
|
| 51 |
return audio
|
| 52 |
|
| 53 |
-
def
|
| 54 |
self,
|
| 55 |
audio: np.ndarray,
|
| 56 |
-
) -> np.ndarray:
|
| 57 |
"""
|
| 58 |
audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
|
| 59 |
"""
|
| 60 |
audio = self._check_shape(audio)
|
| 61 |
out = np.zeros_like(audio)
|
|
|
|
|
|
|
| 62 |
chunk_size = self.num_frames
|
| 63 |
n = audio.shape[1]
|
| 64 |
for i in range(0, n, chunk_size):
|
|
@@ -72,7 +94,11 @@ class SDKWrapper:
|
|
| 72 |
break
|
| 73 |
enhanced = self.processor.process(chunk)
|
| 74 |
out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def process_chunk(self, audio: np.ndarray) -> np.ndarray:
|
| 78 |
audio = self._check_shape(audio)
|
|
|
|
| 1 |
+
|
| 2 |
import numpy as np
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
import aic_sdk as aic
|
|
|
|
| 9 |
load_dotenv()
|
| 10 |
|
| 11 |
|
| 12 |
+
class SDKParams:
|
| 13 |
+
def __init__(
|
| 14 |
+
self,
|
| 15 |
+
sample_rate: int = 16000,
|
| 16 |
+
enhancement_level: float = 1.0,
|
| 17 |
+
allow_variable_frames: bool = False,
|
| 18 |
+
num_channels: int = 1,
|
| 19 |
+
sync: bool = True,
|
| 20 |
+
num_frames: int | None = None,
|
| 21 |
+
|
| 22 |
+
):
|
| 23 |
+
self.sample_rate = sample_rate
|
| 24 |
+
self.enhancement_level = enhancement_level
|
| 25 |
+
self.allow_variable_frames = allow_variable_frames
|
| 26 |
+
self.num_channels = num_channels
|
| 27 |
+
self.sync = sync
|
| 28 |
+
self.num_frames = num_frames # to be set after processor init
|
| 29 |
+
|
| 30 |
class SDKWrapper:
|
| 31 |
def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
|
| 32 |
if os.getenv("AIC_SDK_KEY") is None:
|
|
|
|
| 35 |
model_path = aic.Model.download(model_id, models_dir)
|
| 36 |
self.model = aic.Model.from_file(model_path)
|
| 37 |
|
| 38 |
+
def init_processor(self, sdk_params: SDKParams):
|
| 39 |
+
optimal_frames = self.model.get_optimal_num_frames(sdk_params.sample_rate)
|
| 40 |
+
self.num_frames = sdk_params.num_frames if sdk_params.num_frames else optimal_frames
|
| 41 |
+
self.sample_rate = sdk_params.sample_rate
|
| 42 |
+
aic_config = aic.ProcessorConfig(
|
| 43 |
+
sample_rate=sdk_params.sample_rate,
|
| 44 |
+
num_channels=sdk_params.num_channels,
|
|
|
|
| 45 |
num_frames=self.num_frames,
|
| 46 |
+
allow_variable_frames=sdk_params.allow_variable_frames,
|
| 47 |
)
|
| 48 |
+
if sdk_params.sync:
|
| 49 |
+
self.processor = aic.Processor(self.model, self.sdk_key, aic_config)
|
| 50 |
else:
|
| 51 |
+
self.processor = aic.ProcessorAsync(self.model, self.sdk_key, aic_config)
|
| 52 |
+
self.processor.get_processor_context().set_parameter(
|
| 53 |
+
aic.ProcessorParameter.EnhancementLevel, float(sdk_params.enhancement_level)
|
| 54 |
)
|
| 55 |
+
self.enhancement_level = sdk_params.enhancement_level
|
| 56 |
+
self.vad_context = self.processor.get_vad_context()
|
| 57 |
|
| 58 |
def change_enhancement_level(self, enhancement_level: float):
|
| 59 |
if not hasattr(self, "processor"):
|
|
|
|
| 61 |
self.processor.get_processor_context().set_parameter(
|
| 62 |
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 63 |
)
|
| 64 |
+
self.enhancement_level = enhancement_level
|
| 65 |
|
| 66 |
def _check_shape(self, audio: np.ndarray) -> np.ndarray:
|
| 67 |
if len(audio.shape) == 1:
|
|
|
|
| 70 |
raise ValueError("Expected audio with shape (n, frames)")
|
| 71 |
return audio
|
| 72 |
|
| 73 |
+
def process_with_vad(
|
| 74 |
self,
|
| 75 |
audio: np.ndarray,
|
| 76 |
+
) -> tuple[np.ndarray, bool]:
|
| 77 |
"""
|
| 78 |
audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
|
| 79 |
"""
|
| 80 |
audio = self._check_shape(audio)
|
| 81 |
out = np.zeros_like(audio)
|
| 82 |
+
vad_per_sample = np.zeros_like(audio, dtype=bool)
|
| 83 |
+
vad_overall = False
|
| 84 |
chunk_size = self.num_frames
|
| 85 |
n = audio.shape[1]
|
| 86 |
for i in range(0, n, chunk_size):
|
|
|
|
| 94 |
break
|
| 95 |
enhanced = self.processor.process(chunk)
|
| 96 |
out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
|
| 97 |
+
if self.vad_context.is_speech_detected():
|
| 98 |
+
vad_per_sample[:, i : i + chunk_size] = True
|
| 99 |
+
if vad_per_sample.mean() > 0.5:
|
| 100 |
+
vad_overall = True
|
| 101 |
+
return out, vad_overall
|
| 102 |
|
| 103 |
def process_chunk(self, audio: np.ndarray) -> np.ndarray:
|
| 104 |
audio = self._check_shape(audio)
|
|
@@ -1,191 +1,166 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
from stt_streamers import DeepgramStreamer
|
| 6 |
-
from sdk import SDKWrapper
|
| 7 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# ----------------------------
|
| 10 |
-
# Global transcript store (UI pulls from this)
|
| 11 |
-
# ----------------------------
|
| 12 |
-
_ENHANCED_TRANSCRIPT: str = ""
|
| 13 |
-
_RAW_TRANSCRIPT: str = ""
|
| 14 |
|
| 15 |
def _set_transcript_enhanced(text: str) -> None:
|
| 16 |
-
"""Deepgram callback: update latest transcript text (no printing)."""
|
| 17 |
global _ENHANCED_TRANSCRIPT
|
| 18 |
_ENHANCED_TRANSCRIPT = text
|
| 19 |
|
|
|
|
| 20 |
def _set_transcript_raw(text: str) -> None:
|
| 21 |
-
"""Deepgram callback: update latest transcript text (no printing)."""
|
| 22 |
global _RAW_TRANSCRIPT
|
| 23 |
_RAW_TRANSCRIPT = text
|
| 24 |
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _get_or_init_session(session: StreamSession | None, sr_in: int) -> StreamSession:
|
| 60 |
-
if session is None or session.sr_in != sr_in:
|
| 61 |
-
# ResampleStream ist für real-time processing gedacht citeturn8view0
|
| 62 |
-
resampler = None if sr_in == DEFAULT_SR else soxr.ResampleStream(sr_in, DEFAULT_SR, num_channels=1, dtype="float32")
|
| 63 |
-
return StreamSession(resampler=resampler, sr_in=sr_in, tail_16k=np.zeros((0,), dtype=np.float32), tail_max=10 * DEFAULT_SR)
|
| 64 |
-
return session
|
| 65 |
|
| 66 |
def _to_float32_mono(y: np.ndarray) -> np.ndarray:
|
| 67 |
-
# Gradio liefert int16 (oder (samples, channels)). citeturn1view4
|
| 68 |
y = np.asarray(y)
|
| 69 |
if y.ndim > 1:
|
| 70 |
y = y.mean(axis=1)
|
| 71 |
if y.dtype == np.int16:
|
| 72 |
-
y =
|
| 73 |
else:
|
| 74 |
y = y.astype(np.float32)
|
| 75 |
-
return y
|
| 76 |
|
| 77 |
|
| 78 |
-
def
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
| 81 |
or Streamer_raw is None
|
| 82 |
-
or Streamer_enhanced
|
| 83 |
-
or Streamer_raw
|
| 84 |
-
)
|
| 85 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 86 |
-
if new_chunk is None or new_chunk[1] is None:
|
| 87 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 88 |
-
|
| 89 |
-
sr, y = new_chunk
|
| 90 |
-
y = _to_float32_mono(y)
|
| 91 |
-
# Apply input gain: linear = 10^(dB/20), clip to avoid overflow
|
| 92 |
-
if input_gain_db is not None and input_gain_db > 0:
|
| 93 |
-
gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
|
| 94 |
-
y = (y * gain_linear).astype(np.float32)
|
| 95 |
-
y = np.clip(y, -1.0, 1.0)
|
| 96 |
-
|
| 97 |
-
session = _get_or_init_session(session, sr)
|
| 98 |
-
SDK.change_enhancement_level(float(enhancement_level) / 100.0)
|
| 99 |
-
if session.resampler is not None:
|
| 100 |
-
y_16k = session.resampler.resample_chunk(y)
|
| 101 |
-
else:
|
| 102 |
-
y_16k = y
|
| 103 |
-
|
| 104 |
-
# Ensure 1D float32 for SDK and streamers (resample_chunk can return 0 samples or 2D)
|
| 105 |
-
y_16k = np.asarray(y_16k, dtype=np.float32).flatten()
|
| 106 |
-
|
| 107 |
-
# Ringbuffer (nicht unendlich konkatenieren)
|
| 108 |
-
if y_16k.size > 0:
|
| 109 |
-
tail = np.concatenate([session.tail_16k, y_16k])
|
| 110 |
-
if tail.size > session.tail_max:
|
| 111 |
-
tail = tail[-session.tail_max:]
|
| 112 |
-
session.tail_16k = tail
|
| 113 |
-
|
| 114 |
-
# Only send when we have samples (resample_chunk can return empty; SDK needs valid input)
|
| 115 |
-
if y_16k.size == 0:
|
| 116 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 117 |
-
|
| 118 |
-
# Parallel path: send raw to STT immediately, then enhance and send enhanced.
|
| 119 |
-
# SDK requires fixed num_frames (AudioConfigMismatchError if we use process_chunk with variable size).
|
| 120 |
-
Streamer_raw.process_chunk(y_16k)
|
| 121 |
-
enhanced_chunk_16k = SDK.process_sync(y_16k)
|
| 122 |
-
out_1d = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
|
| 123 |
-
# Always send something to enhanced so Soniox doesn't close with "No audio received"
|
| 124 |
-
if out_1d.size > 0:
|
| 125 |
-
Streamer_enhanced.process_chunk(out_1d)
|
| 126 |
-
else:
|
| 127 |
-
Streamer_enhanced.process_chunk(np.zeros(160, dtype=np.float32))
|
| 128 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
def shutdown_streamers(from_stop_recording: bool = False):
|
| 132 |
-
"""Shut down STT streamers. If from_stop_recording, skip when streamers were
|
| 133 |
-
created after the last stop (avoids delayed stop killing new streamers)."""
|
| 134 |
-
global Streamer_enhanced, Streamer_raw, _streamer_generation, _last_stop_generation
|
| 135 |
-
if from_stop_recording and _streamer_generation > _last_stop_generation:
|
| 136 |
-
return
|
| 137 |
-
gen = _streamer_generation
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
def on_stop_recording():
|
| 152 |
-
|
| 153 |
-
|
| 154 |
|
| 155 |
|
| 156 |
-
def clear_ui():
|
| 157 |
-
global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 158 |
-
_ENHANCED_TRANSCRIPT = ""
|
| 159 |
-
_RAW_TRANSCRIPT = ""
|
| 160 |
-
return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 161 |
-
|
| 162 |
-
def stop_online_backend():
|
| 163 |
-
"""Stop streamers and clear transcripts. Do not update the Audio component:
|
| 164 |
-
toggling streaming=False then back to True can make the frontend lose the
|
| 165 |
-
microphone (getUserMedia not re-called), so we leave it unchanged."""
|
| 166 |
-
shutdown_streamers()
|
| 167 |
-
session, enhanced_transcript, raw_transcript = clear_ui()
|
| 168 |
-
return session, enhanced_transcript, raw_transcript, gr.update()
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def set_stt_streamer(model_name):
|
| 172 |
-
StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
|
| 173 |
-
global Streamer_enhanced, Streamer_raw, _streamer_generation
|
| 174 |
-
# Shut down current streamers first so we don't leak
|
| 175 |
-
if Streamer_enhanced is not None or Streamer_raw is not None:
|
| 176 |
-
shutdown_streamers()
|
| 177 |
-
# Create both before assigning so transcribe_stream never sees one new and one old
|
| 178 |
-
new_enhanced = StreamerCls(
|
| 179 |
-
fs_hz=DEFAULT_SR,
|
| 180 |
-
stream_name="enhanced",
|
| 181 |
-
on_update=_set_transcript_enhanced,
|
| 182 |
-
)
|
| 183 |
-
new_raw = StreamerCls(
|
| 184 |
-
fs_hz=DEFAULT_SR,
|
| 185 |
-
stream_name="raw",
|
| 186 |
-
on_update=_set_transcript_raw,
|
| 187 |
-
)
|
| 188 |
-
_streamer_generation += 1
|
| 189 |
-
Streamer_enhanced = new_enhanced
|
| 190 |
-
Streamer_raw = new_raw
|
| 191 |
-
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
+
from constants import STREAMER_CLASSES
|
| 3 |
+
import gradio as gr
|
| 4 |
from stt_streamers import DeepgramStreamer
|
| 5 |
+
from sdk import SDKWrapper, SDKParams
|
| 6 |
+
from ui import LED_DOT_BLACK, LED_DOT_GREEN, LED_DOT_OFF, LED_DOT_RED, LED_DOT_YELLOW
|
| 7 |
+
|
| 8 |
+
_ENHANCED_TRANSCRIPT = ""
|
| 9 |
+
_RAW_TRANSCRIPT = ""
|
| 10 |
+
|
| 11 |
+
SDK_STREAMING = SDKWrapper()
|
| 12 |
+
Streamer_enhanced = None
|
| 13 |
+
Streamer_raw = None
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def _set_transcript_enhanced(text: str) -> None:
|
|
|
|
| 17 |
global _ENHANCED_TRANSCRIPT
|
| 18 |
_ENHANCED_TRANSCRIPT = text
|
| 19 |
|
| 20 |
+
|
| 21 |
def _set_transcript_raw(text: str) -> None:
|
|
|
|
| 22 |
global _RAW_TRANSCRIPT
|
| 23 |
_RAW_TRANSCRIPT = text
|
| 24 |
|
| 25 |
|
| 26 |
+
def clear_live_transcripts():
    """Reset both cached live transcripts to empty strings."""
    global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
    _ENHANCED_TRANSCRIPT = ""
    _RAW_TRANSCRIPT = ""
|
| 30 |
|
| 31 |
|
| 32 |
+
def render_system_status(status: str):
    """Translate a status keyword into its (LED html, label) pair.

    Args:
        status: One of "off", "init", "ready", "error".

    Raises:
        ValueError: if *status* is not a recognized keyword.
    """
    led_and_label = {
        "off": (LED_DOT_OFF, "Off"),
        "init": (LED_DOT_YELLOW, "Initializing..."),
        "ready": (LED_DOT_GREEN, "Ready"),
        "error": (LED_DOT_RED, "Error. Please refresh the page."),
    }
    if status not in led_and_label:
        raise ValueError(f"Invalid status: {status}")
    return led_and_label[status]
|
| 42 |
+
|
| 43 |
+
def shutdown_streamers():
    """Shut down both live STT streamers (if any) and clear the globals.

    Each streamer is shut down independently so that a failure in one does
    not prevent the other from being released; errors are logged to stdout,
    never raised. Globals are always reset afterwards.
    """
    global Streamer_enhanced, Streamer_raw
    for streamer in (Streamer_enhanced, Streamer_raw):
        if streamer is None:
            continue
        try:
            streamer.shutdown()
        except Exception as e:
            # Best-effort cleanup: log and keep going so the other streamer
            # still gets shut down and the globals still get cleared.
            print(f"Error shutting down streamers: {e}")
    Streamer_enhanced = None
    Streamer_raw = None
|
| 55 |
|
| 56 |
+
|
| 57 |
+
def set_stt_streamer(sample_rate: int, stt_model: str):
    """(Re)create the enhanced/raw STT streamer pair for *stt_model*.

    Args:
        sample_rate: Audio sample rate (Hz) passed to the streamer as fs_hz.
        stt_model: Key into STREAMER_CLASSES; unknown names fall back to
            DeepgramStreamer.

    Raises:
        RuntimeError: if either streamer fails to initialize; in that case
            both globals are reset to None and any half-created streamer is
            shut down so its connection does not leak.
    """
    global Streamer_enhanced, Streamer_raw
    StreamerCls = STREAMER_CLASSES.get(stt_model, DeepgramStreamer)
    new_enhanced = None
    try:
        new_enhanced = StreamerCls(
            fs_hz=sample_rate,
            stream_name="Enhanced",
            on_update=_set_transcript_enhanced,
        )
        new_raw = StreamerCls(
            fs_hz=sample_rate,
            stream_name="Raw",
            on_update=_set_transcript_raw,
        )
    except Exception as e:
        # If the second constructor failed, release the first streamer so we
        # don't leak its connection/threads.
        if new_enhanced is not None:
            try:
                new_enhanced.shutdown()
            except Exception:
                pass
        Streamer_enhanced = None
        Streamer_raw = None
        raise RuntimeError(f"Error initializing STT streamer '{stt_model}': {e}") from e
    # Assign only after both succeeded so callers never see a mixed pair.
    Streamer_enhanced = new_enhanced
    Streamer_raw = new_raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
def _to_float32_mono(y: np.ndarray) -> np.ndarray:
|
|
|
|
| 77 |
y = np.asarray(y)
|
| 78 |
if y.ndim > 1:
|
| 79 |
y = y.mean(axis=1)
|
| 80 |
if y.dtype == np.int16:
|
| 81 |
+
y = y.astype(np.float32) / 32768.0
|
| 82 |
else:
|
| 83 |
y = y.astype(np.float32)
|
| 84 |
+
return np.asarray(y, dtype=np.float32).flatten()
|
| 85 |
|
| 86 |
|
| 87 |
+
def _ensure_initialized(sr: int, streaming_sr, stt_model: str, enhancement_level: float):
    """Make sure the SDK processor and STT streamers match the current settings.

    A full re-init happens when the sample rate changed, no streamers exist
    yet, or the selected model's streamer class differs from the live pair.
    A changed enhancement level alone is applied in place.

    Returns:
        (streaming_sr, led_html, status_text); streaming_sr is None when
        initialization failed (a gr.Warning is shown in that case).
    """
    # Match set_stt_streamer's fallback: an unknown model name falls back to
    # DeepgramStreamer instead of raising KeyError inside the stream callback.
    streamer_cls = STREAMER_CLASSES.get(stt_model, DeepgramStreamer)
    needs_init = (
        streaming_sr is None
        or streaming_sr != sr
        or Streamer_enhanced is None
        or Streamer_raw is None
        or not isinstance(Streamer_enhanced, streamer_cls)
        or not isinstance(Streamer_raw, streamer_cls)
    )

    if not needs_init:
        # Cheap path: only the enhancement level may have drifted.
        if SDK_STREAMING.enhancement_level != enhancement_level:
            SDK_STREAMING.change_enhancement_level(enhancement_level)
        return streaming_sr, *render_system_status("ready")

    try:
        shutdown_streamers()
        sdk_params = SDKParams(
            sample_rate=sr,
            enhancement_level=enhancement_level,
        )
        SDK_STREAMING.init_processor(sdk_params)
        set_stt_streamer(sr, stt_model)
        return sr, *render_system_status("ready")
    except Exception as e:
        gr.Warning(f"Streaming process failed: {e}")
        return None, *render_system_status("error")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def stream_step(audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db):
    """Process one microphone chunk: enhance, run VAD, feed both STT streamers.

    Args:
        audio_stream: Gradio stream payload — a (sample_rate, chunk) tuple,
            or None when no audio is flowing.
        streaming_sr: Sample rate the pipeline was initialized with (or None).
        stt_model: Selected STT model name (key into STREAMER_CLASSES).
        enhancement_level: Enhancement slider value in percent (0-100).
        input_gain_db: Optional input gain in dB; values <= 0 are ignored.

    Returns:
        (streaming_sr, system_led, system_text, enhanced_transcript,
        raw_transcript, vad_led).
    """
    if audio_stream is None:
        return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    sr, chunk = audio_stream
    if chunk is None:
        return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    enhancement_level_float = enhancement_level / 100.0

    streaming_sr, system_led, system_text = _ensure_initialized(
        sr=sr,
        streaming_sr=streaming_sr,
        stt_model=stt_model,
        enhancement_level=enhancement_level_float,
    )

    # If initialization failed, _ensure_initialized already warned and
    # returned the error status; bail out here instead of hitting the
    # missing streamers below and emitting a second, redundant warning.
    if streaming_sr is None or Streamer_enhanced is None or Streamer_raw is None:
        return streaming_sr, system_led, system_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    try:
        y = _to_float32_mono(chunk)

        if input_gain_db and input_gain_db > 0:
            gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
            y = np.clip(y * gain_linear, -1.0, 1.0).astype(np.float32)

        enhanced_chunk_16k, vad_detected = SDK_STREAMING.process_with_vad(y)
        enhanced_chunk_16k = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()

        # Raw audio goes to the raw streamer; enhanced output to the other.
        Streamer_raw.process_chunk(y)
        Streamer_enhanced.process_chunk(enhanced_chunk_16k)

        vad_led = LED_DOT_GREEN if vad_detected else LED_DOT_BLACK
        return streaming_sr, system_led, system_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led

    except Exception as e:
        gr.Warning(f"Streaming process failed: {e}")
        err_led, err_text = render_system_status("error")
        return streaming_sr, err_led, err_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def on_start_recording():
    """Reset transcripts and show the 'Initializing...' status on record start.

    Returns empty strings for the two transcript boxes plus the LED markup
    and label from render_system_status("init").
    """
    clear_live_transcripts()
    led, text = render_system_status("init")
    return "", "", led, text
|
| 160 |
+
|
| 161 |
+
|
| 162 |
def on_stop_recording():
    """Switch status indicators to 'Off' when recording stops.

    Returns the LED markup twice — presumably the system LED and the VAD
    LED — plus the status text and None (likely clearing the streaming
    sample-rate state; TODO confirm against the event wiring in app.py).
    """
    led, text = render_system_status("off")
    return led, led, text, None
|
| 165 |
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -15,7 +15,8 @@ class DeepgramStreamer:
|
|
| 15 |
api_key = os.environ.get("DEEPGRAM_API_KEY")
|
| 16 |
if not api_key:
|
| 17 |
raise RuntimeError("Missing DEEPGRAM_API_KEY.")
|
| 18 |
-
|
|
|
|
| 19 |
self.stream_name = stream_name
|
| 20 |
self.api_name = "Deepgram V1 Nova-3"
|
| 21 |
self.on_update = on_update
|
|
@@ -210,7 +211,6 @@ class DeepgramStreamer:
|
|
| 210 |
self.thread.join(timeout=1.0)
|
| 211 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 212 |
self.keepalive_thread.join(timeout=1.0)
|
| 213 |
-
self.ws = None
|
| 214 |
print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
|
| 215 |
|
| 216 |
|
|
|
|
| 15 |
api_key = os.environ.get("DEEPGRAM_API_KEY")
|
| 16 |
if not api_key:
|
| 17 |
raise RuntimeError("Missing DEEPGRAM_API_KEY.")
|
| 18 |
+
if not fs_hz:
|
| 19 |
+
raise ValueError("Sample rate (fs_hz) must be specified.")
|
| 20 |
self.stream_name = stream_name
|
| 21 |
self.api_name = "Deepgram V1 Nova-3"
|
| 22 |
self.on_update = on_update
|
|
|
|
| 211 |
self.thread.join(timeout=1.0)
|
| 212 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 213 |
self.keepalive_thread.join(timeout=1.0)
|
|
|
|
| 214 |
print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
|
| 215 |
|
| 216 |
|
|
@@ -13,7 +13,8 @@ class SonioxStreamer:
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
| 16 |
-
|
|
|
|
| 17 |
self.stream_name = stream_name
|
| 18 |
self.api_name = "Soniox RT"
|
| 19 |
self.on_update = on_update
|
|
@@ -207,6 +208,4 @@ class SonioxStreamer:
|
|
| 207 |
self.thread.join(timeout=1.0)
|
| 208 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 209 |
self.keepalive_thread.join(timeout=1.0)
|
| 210 |
-
|
| 211 |
-
self.ws = None
|
| 212 |
print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
|
|
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
| 16 |
+
if not fs_hz:
|
| 17 |
+
raise ValueError("Sample rate (fs_hz) must be specified.")
|
| 18 |
self.stream_name = stream_name
|
| 19 |
self.api_name = "Soniox RT"
|
| 20 |
self.on_update = on_update
|
|
|
|
| 208 |
self.thread.join(timeout=1.0)
|
| 209 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 210 |
self.keepalive_thread.join(timeout=1.0)
|
|
|
|
|
|
|
| 211 |
print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HTML snippets rendered into the UI as colored status-indicator dots.
# The led-dot / led-dot--<color> classes are presumably defined in the
# project's stylesheet (assets/styling.css) — confirm there before renaming.
LED_DOT_RED = """
<div class="led-dot led-dot--red"></div>
"""

LED_DOT_BLACK = """
<div class="led-dot led-dot--black"></div>
"""

LED_DOT_GREEN = """
<div class="led-dot led-dot--green"></div>
"""

LED_DOT_OFF = """
<div class="led-dot led-dot--off"></div>
"""


LED_DOT_YELLOW = """
<div class="led-dot led-dot--yellow"></div>
"""
|
|
@@ -1,50 +1,99 @@
|
|
| 1 |
from typing import Optional
|
| 2 |
-
import numpy as np
|
| 3 |
-
import librosa
|
| 4 |
-
from PIL import Image
|
| 5 |
import io
|
| 6 |
-
import matplotlib.pyplot as plt
|
| 7 |
-
from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
|
| 8 |
import warnings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import pyloudnorm as pyln
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 13 |
-
"""Return (sample_rate, int16
|
| 14 |
-
passing float32 triggers an internal conversion and a warning."""
|
| 15 |
x = np.asarray(x)
|
| 16 |
-
|
| 17 |
-
# Remove extra dims like (1, n, 1) etc.
|
| 18 |
x = np.squeeze(x)
|
| 19 |
|
| 20 |
-
# If it's (channels, samples), transpose to (samples, channels)
|
| 21 |
if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
|
| 22 |
x = x.T
|
| 23 |
|
| 24 |
-
# Ensure mono is (n_samples,)
|
| 25 |
if x.ndim == 2 and x.shape[1] == 1:
|
| 26 |
x = x[:, 0]
|
| 27 |
|
| 28 |
x = x.astype(np.float32)
|
| 29 |
x = np.clip(x, -1.0, 1.0)
|
| 30 |
-
# Gradio Audio expects int16; convert here so Gradio doesn't convert and warn
|
| 31 |
x = (x * 32767).astype(np.int16)
|
| 32 |
|
| 33 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def spec_image(
|
| 37 |
audio_array: np.ndarray,
|
| 38 |
-
sr: int
|
| 39 |
n_fft: int = 2048,
|
| 40 |
hop_length: int = 512,
|
| 41 |
n_mels: int = 128,
|
| 42 |
fmax: Optional[float] = None,
|
|
|
|
| 43 |
) -> Image.Image:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
"""
|
| 47 |
-
y = audio_array.flatten() # Ensure it's 1D
|
| 48 |
S = librosa.feature.melspectrogram(
|
| 49 |
y=y,
|
| 50 |
sr=sr,
|
|
@@ -53,20 +102,66 @@ def spec_image(
|
|
| 53 |
n_mels=n_mels,
|
| 54 |
fmax=fmax or sr // 2,
|
| 55 |
)
|
| 56 |
-
S_db = librosa.power_to_db(S, ref=np.max
|
|
|
|
| 57 |
fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
|
|
|
|
| 58 |
img = librosa.display.specshow(
|
| 59 |
-
S_db,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
| 62 |
cbar.set_label("dB")
|
|
|
|
| 63 |
ax.set_title("Mel-spectrogram")
|
| 64 |
ax.set_xlabel("Time in s")
|
| 65 |
ax.set_ylabel("Frequency in Hz")
|
|
|
|
| 66 |
fig.tight_layout(pad=0.2)
|
|
|
|
| 67 |
buf = io.BytesIO()
|
| 68 |
fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
|
| 69 |
plt.close(fig)
|
|
|
|
| 70 |
buf.seek(0)
|
| 71 |
return Image.open(buf).convert("RGB")
|
| 72 |
|
|
@@ -77,24 +172,24 @@ def compute_wer(reference: str, hypothesis: str) -> float:
|
|
| 77 |
"""
|
| 78 |
ref_words = reference.split()
|
| 79 |
hyp_words = hypothesis.split()
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
for i in range(len(ref_words) + 1):
|
| 82 |
d[i][0] = i
|
| 83 |
for j in range(len(hyp_words) + 1):
|
| 84 |
d[0][j] = j
|
|
|
|
| 85 |
for i in range(1, len(ref_words) + 1):
|
| 86 |
for j in range(1, len(hyp_words) + 1):
|
| 87 |
-
if ref_words[i - 1] == hyp_words[j - 1]
|
| 88 |
-
cost = 0
|
| 89 |
-
else:
|
| 90 |
-
cost = 1
|
| 91 |
d[i][j] = min(
|
| 92 |
-
d[i - 1][j] + 1,
|
| 93 |
-
d[i][j - 1] + 1,
|
| 94 |
-
d[i - 1][j - 1] + cost,
|
| 95 |
)
|
| 96 |
-
|
| 97 |
-
return
|
| 98 |
|
| 99 |
|
| 100 |
def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
@@ -102,7 +197,11 @@ def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
| 102 |
return float(meter.integrated_loudness(x))
|
| 103 |
|
| 104 |
|
| 105 |
-
def true_peak_limiter(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
upsampled_sr = 192000
|
| 107 |
x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
|
| 108 |
true_peak = np.max(np.abs(x_upsampled))
|
|
@@ -116,7 +215,7 @@ def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP)
|
|
| 116 |
|
| 117 |
x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
|
| 118 |
x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
|
| 119 |
-
return x_limited.astype(
|
| 120 |
|
| 121 |
|
| 122 |
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
@@ -125,9 +224,9 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
| 125 |
"""
|
| 126 |
try:
|
| 127 |
current_lufs = measure_loudness(x, sr)
|
| 128 |
-
|
| 129 |
if not np.isfinite(current_lufs):
|
| 130 |
-
return x.astype(
|
| 131 |
|
| 132 |
gain_db = TARGET_LOUDNESS - current_lufs
|
| 133 |
gain = 10 ** (gain_db / 20)
|
|
@@ -135,7 +234,7 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
| 135 |
y = x * gain
|
| 136 |
y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
|
| 137 |
|
| 138 |
-
return y.astype(
|
| 139 |
except Exception as e:
|
| 140 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 141 |
-
return x.astype(
|
|
|
|
| 1 |
from typing import Optional
|
|
|
|
|
|
|
|
|
|
| 2 |
import io
|
|
|
|
|
|
|
| 3 |
import warnings
|
| 4 |
+
|
| 5 |
+
import librosa
|
| 6 |
+
import librosa.display
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import numpy as np
|
| 9 |
import pyloudnorm as pyln
|
| 10 |
+
from matplotlib.patches import Patch
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
    """Build subtitle entries marking voiced and silent spans of the audio.

    Args:
        vad_timestamps: [start, end] pairs of voice activity, assumed sorted
            and non-overlapping (TODO confirm with the VAD producer).
        length: Total audio duration in seconds.

    Returns:
        Subtitle dicts of the form {"text": ..., "timestamp": [t0, t1]}
        covering [0, length] with alternating on/off labels.
    """
    def _entry(state, t0, t1):
        return {"text": f"Voice Detection: {state}", "timestamp": [t0, t1]}

    labels: list[dict] = []
    cursor = 0.0

    for seg_start, seg_end in vad_timestamps:
        # Fill any silent gap before this voiced segment.
        if seg_start > cursor:
            labels.append(_entry(VAD_OFF, cursor, seg_start))
        labels.append(_entry(VAD_ON, seg_start, seg_end))
        cursor = seg_end

    # Trailing silence after the last voiced segment.
    if cursor < length:
        labels.append(_entry(VAD_OFF, cursor, length))

    return labels
|
| 46 |
|
| 47 |
|
| 48 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
    """Package *x* as an int16 (sample_rate, samples) pair for gr.Audio.

    Converting to int16 here keeps Gradio from doing its own conversion
    (and emitting a warning). Handles squeezing extra dims, transposing
    channel-first stereo, and collapsing a singleton channel axis.
    """
    audio = np.squeeze(np.asarray(x))

    # (channels, samples) -> (samples, channels)
    channel_first = (
        audio.ndim == 2
        and audio.shape[0] in (1, 2)
        and audio.shape[1] > audio.shape[0]
    )
    if channel_first:
        audio = audio.T

    # Collapse a trailing singleton channel axis so mono is (n_samples,).
    if audio.ndim == 2 and audio.shape[1] == 1:
        audio = audio[:, 0]

    clipped = np.clip(audio.astype(np.float32), -1.0, 1.0)
    return sr, (clipped * 32767).astype(np.int16)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _merge_vad_segments(
|
| 67 |
+
vad_timestamps: list[list[float]],
|
| 68 |
+
gap_tolerance: float = 0.05,
|
| 69 |
+
) -> list[tuple[float, float]]:
|
| 70 |
+
if not vad_timestamps:
|
| 71 |
+
return []
|
| 72 |
+
|
| 73 |
+
segments = sorted((float(start), float(end)) for start, end in vad_timestamps)
|
| 74 |
+
merged: list[tuple[float, float]] = [segments[0]]
|
| 75 |
+
|
| 76 |
+
for start, end in segments[1:]:
|
| 77 |
+
last_start, last_end = merged[-1]
|
| 78 |
+
if start <= last_end + gap_tolerance:
|
| 79 |
+
merged[-1] = (last_start, max(last_end, end))
|
| 80 |
+
else:
|
| 81 |
+
merged.append((start, end))
|
| 82 |
+
|
| 83 |
+
return merged
|
| 84 |
|
| 85 |
|
| 86 |
def spec_image(
|
| 87 |
audio_array: np.ndarray,
|
| 88 |
+
sr: int,
|
| 89 |
n_fft: int = 2048,
|
| 90 |
hop_length: int = 512,
|
| 91 |
n_mels: int = 128,
|
| 92 |
fmax: Optional[float] = None,
|
| 93 |
+
vad_timestamps: Optional[list[list[float]]] = None,
|
| 94 |
) -> Image.Image:
|
| 95 |
+
y = np.asarray(audio_array, dtype=np.float32).flatten()
|
| 96 |
+
|
|
|
|
|
|
|
| 97 |
S = librosa.feature.melspectrogram(
|
| 98 |
y=y,
|
| 99 |
sr=sr,
|
|
|
|
| 102 |
n_mels=n_mels,
|
| 103 |
fmax=fmax or sr // 2,
|
| 104 |
)
|
| 105 |
+
S_db = librosa.power_to_db(S, ref=np.max)
|
| 106 |
+
|
| 107 |
fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
|
| 108 |
+
|
| 109 |
img = librosa.display.specshow(
|
| 110 |
+
S_db,
|
| 111 |
+
sr=sr,
|
| 112 |
+
hop_length=hop_length,
|
| 113 |
+
x_axis="time",
|
| 114 |
+
y_axis="mel",
|
| 115 |
+
cmap="magma",
|
| 116 |
+
ax=ax,
|
| 117 |
)
|
| 118 |
+
|
| 119 |
+
if vad_timestamps:
|
| 120 |
+
vad_color = "#22C55E" # softer, cleaner green
|
| 121 |
+
merged_segments = _merge_vad_segments(vad_timestamps, gap_tolerance=0.05)
|
| 122 |
+
|
| 123 |
+
# Draw VAD bar as a fixed portion of the figure height (e.g., 4% of axes height)
|
| 124 |
+
bar_height_axes = 0.05 # 2% of axes height
|
| 125 |
+
bar_bottom_axes = 0.0 # 0% above the bottom
|
| 126 |
+
|
| 127 |
+
for start, end in merged_segments:
|
| 128 |
+
ax.fill_between(
|
| 129 |
+
[start, end],
|
| 130 |
+
[bar_bottom_axes, bar_bottom_axes],
|
| 131 |
+
[bar_bottom_axes + bar_height_axes, bar_bottom_axes + bar_height_axes],
|
| 132 |
+
color=vad_color,
|
| 133 |
+
alpha=0.95,
|
| 134 |
+
linewidth=0,
|
| 135 |
+
zorder=5,
|
| 136 |
+
transform=ax.get_xaxis_transform(),
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
vad_patch = Patch(
|
| 140 |
+
facecolor=vad_color,
|
| 141 |
+
edgecolor=vad_color,
|
| 142 |
+
label="Voice Activity",
|
| 143 |
+
)
|
| 144 |
+
ax.legend(
|
| 145 |
+
handles=[vad_patch],
|
| 146 |
+
loc="upper right",
|
| 147 |
+
fontsize=8,
|
| 148 |
+
frameon=True,
|
| 149 |
+
framealpha=0.9,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
| 153 |
cbar.set_label("dB")
|
| 154 |
+
|
| 155 |
ax.set_title("Mel-spectrogram")
|
| 156 |
ax.set_xlabel("Time in s")
|
| 157 |
ax.set_ylabel("Frequency in Hz")
|
| 158 |
+
|
| 159 |
fig.tight_layout(pad=0.2)
|
| 160 |
+
|
| 161 |
buf = io.BytesIO()
|
| 162 |
fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
|
| 163 |
plt.close(fig)
|
| 164 |
+
|
| 165 |
buf.seek(0)
|
| 166 |
return Image.open(buf).convert("RGB")
|
| 167 |
|
|
|
|
| 172 |
"""
|
| 173 |
ref_words = reference.split()
|
| 174 |
hyp_words = hypothesis.split()
|
| 175 |
+
|
| 176 |
+
d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint16)
|
| 177 |
+
|
| 178 |
for i in range(len(ref_words) + 1):
|
| 179 |
d[i][0] = i
|
| 180 |
for j in range(len(hyp_words) + 1):
|
| 181 |
d[0][j] = j
|
| 182 |
+
|
| 183 |
for i in range(1, len(ref_words) + 1):
|
| 184 |
for j in range(1, len(hyp_words) + 1):
|
| 185 |
+
cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
|
|
|
|
|
|
|
|
|
|
| 186 |
d[i][j] = min(
|
| 187 |
+
d[i - 1][j] + 1,
|
| 188 |
+
d[i][j - 1] + 1,
|
| 189 |
+
d[i - 1][j - 1] + cost,
|
| 190 |
)
|
| 191 |
+
|
| 192 |
+
return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
|
| 193 |
|
| 194 |
|
| 195 |
def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
|
|
| 197 |
return float(meter.integrated_loudness(x))
|
| 198 |
|
| 199 |
|
| 200 |
+
def true_peak_limiter(
|
| 201 |
+
x: np.ndarray,
|
| 202 |
+
sr: int,
|
| 203 |
+
max_true_peak: float = TARGET_TP,
|
| 204 |
+
) -> np.ndarray:
|
| 205 |
upsampled_sr = 192000
|
| 206 |
x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
|
| 207 |
true_peak = np.max(np.abs(x_upsampled))
|
|
|
|
| 215 |
|
| 216 |
x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
|
| 217 |
x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
|
| 218 |
+
return x_limited.astype(np.float32)
|
| 219 |
|
| 220 |
|
| 221 |
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
|
|
| 224 |
"""
|
| 225 |
try:
|
| 226 |
current_lufs = measure_loudness(x, sr)
|
| 227 |
+
|
| 228 |
if not np.isfinite(current_lufs):
|
| 229 |
+
return x.astype(np.float32)
|
| 230 |
|
| 231 |
gain_db = TARGET_LOUDNESS - current_lufs
|
| 232 |
gain = 10 ** (gain_db / 20)
|
|
|
|
| 234 |
y = x * gain
|
| 235 |
y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
|
| 236 |
|
| 237 |
+
return y.astype(np.float32)
|
| 238 |
except Exception as e:
|
| 239 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 240 |
+
return x.astype(np.float32)
|