Spaces:
Running on CPU Upgrade
vad (#3)
Browse files- remove websocket assignment during shutdown (ac7c4481f1d26fd8161dfc668c79440b88d09a5c)
- add vad and sr to streaming (7274b791d482afe7ef9ab2020f74ceace38e16c2)
- move VAD constants to seperate file (06fe4294bf3df2784ab123904982e173aec34394)
- dataset tab stops recording (62744142a409edfa5686bd086b723901e71e92a1)
- remove redundant code from tab switch (0a77e8e90695c7eb99d272461aec14c057c9b08a)
- improve layout (ba97774b30ae8d6dc089f66e0b1d8471dbe10098)
- refactor streaming pipeline (b22fd96890c75efae0e6c56d5bc5d9c329be7192)
- validate sample rate in stt streamers (ab0481a43d19026f3fd291f05b2b146c857ad746)
- default SDK Params (43283c8a389a539e100033f0ecda6038e0b1f1ea)
- remove unused DEFAULT_SAMPLE_ID constant (df08ce34defdaf2ab622249f3012fc53b5585986)
- reorder/name to match offline pipeline (c4c5df8d3f6acc14eb6d426047b74576b68a6e4f)
- fix ui on tab switch (be39c5be95c88f69b66c46e15501b808e3c25507)
- refactor offline pipeline (4e945b9cf36c72bb129d1bb2ce23a9a849fc4af4)
- replace soundfile with librosa (03a504941537d06664bacde8b154e48e9348a429)
- bug fix (4319862f89f59aed7f8739bf83be99d34001698f)
- ruff (6606020cb7ce644e81865e4c79066f7f82fd2d7c)
- VAD in spectrogram (a7c506c419746c71d1d63d8719017ebc50b2ef07)
- stt model change on streaming (12039524efd8521399fecb520e0274213ff044ee)
- fix vad bar height (b949ffac592b85f0fd3b52d975485659f7c4d98a)
- app.py +132 -278
- assets/active_light.css +0 -68
- assets/styling.css +82 -0
- clean_up.py +2 -5
- constants.py +3 -1
- offline_pipeline.py +210 -86
- sdk.py +44 -18
- stream_pipeline.py +133 -158
- stt_streamers/deepgram_streamer.py +2 -2
- stt_streamers/soniox_streamer.py +2 -3
- ui.py +20 -0
- utils.py +135 -36
|
@@ -1,162 +1,33 @@
|
|
| 1 |
import os
|
| 2 |
-
import threading
|
| 3 |
-
import time
|
| 4 |
import gradio as gr
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
-
from constants import STREAM_EVERY,
|
| 8 |
-
from
|
|
|
|
| 9 |
|
| 10 |
from stream_pipeline import (
|
| 11 |
-
|
| 12 |
-
get_live_transcripts,
|
| 13 |
on_stop_recording,
|
| 14 |
-
set_stt_streamer,
|
| 15 |
-
stop_online_backend,
|
| 16 |
-
transcribe_stream,
|
| 17 |
shutdown_streamers,
|
|
|
|
| 18 |
)
|
| 19 |
from offline_pipeline import (
|
| 20 |
load_file_from_dataset,
|
| 21 |
load_local_file,
|
| 22 |
-
|
| 23 |
)
|
| 24 |
-
from utils import spec_image
|
| 25 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 26 |
|
| 27 |
-
# Active light HTML: whole container is the light (gray = warming up, red = ready)
|
| 28 |
-
ACTIVE_LIGHT_GRAY = (
|
| 29 |
-
'<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 30 |
-
'<span class="active-light__label">Warming up...</span></div>'
|
| 31 |
-
)
|
| 32 |
-
ACTIVE_LIGHT_RED = (
|
| 33 |
-
'<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 34 |
-
'<span class="active-light__label">Ready!</span></div>'
|
| 35 |
-
)
|
| 36 |
-
WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def warmup_tick(on_stream_tab, warmup_elapsed, _current_html):
|
| 40 |
-
if not on_stream_tab:
|
| 41 |
-
return 0, ACTIVE_LIGHT_GRAY, gr.update(active=True)
|
| 42 |
-
if warmup_elapsed >= WARMUP_TICKS:
|
| 43 |
-
return warmup_elapsed, ACTIVE_LIGHT_RED, gr.update(active=False)
|
| 44 |
-
return warmup_elapsed + 1, ACTIVE_LIGHT_GRAY, gr.update(active=True)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def process_with_live_transcript(
|
| 48 |
-
input_array,
|
| 49 |
-
enhancement_level,
|
| 50 |
-
sample_stem,
|
| 51 |
-
stt_model,
|
| 52 |
-
last_sample_stem,
|
| 53 |
-
current_sample_rate,
|
| 54 |
-
):
|
| 55 |
-
"""Generator that runs the offline pipeline in real time (chunked): enhanced audio and
|
| 56 |
-
both transcripts stream from the first chunk so playback and transcription start immediately."""
|
| 57 |
-
progress_state = {}
|
| 58 |
-
result_holder = {}
|
| 59 |
-
|
| 60 |
-
def worker():
|
| 61 |
-
try:
|
| 62 |
-
result_holder["result"] = run_offline_pipeline_streaming(
|
| 63 |
-
input_array,
|
| 64 |
-
current_sample_rate,
|
| 65 |
-
enhancement_level,
|
| 66 |
-
sample_stem,
|
| 67 |
-
stt_model,
|
| 68 |
-
progress_state
|
| 69 |
-
)
|
| 70 |
-
except Exception as e:
|
| 71 |
-
result_holder["error"] = e
|
| 72 |
-
|
| 73 |
-
# 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
|
| 74 |
-
_ = cleanup_previous_run(last_sample_stem)
|
| 75 |
-
noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
|
| 76 |
-
if input_array is not None:
|
| 77 |
-
try:
|
| 78 |
-
spec_image(input_array).save(noisy_spec_path)
|
| 79 |
-
except Exception:
|
| 80 |
-
noisy_spec_path = None
|
| 81 |
-
else:
|
| 82 |
-
noisy_spec_path = None
|
| 83 |
-
try:
|
| 84 |
-
original_transcript = get_transcript(sample_stem)
|
| 85 |
-
except Exception:
|
| 86 |
-
original_transcript = "Unavailable"
|
| 87 |
-
|
| 88 |
-
yield (
|
| 89 |
-
gr.update(visible=True),
|
| 90 |
-
None, # enhanced_audio: set only in final yield (smooth playback)
|
| 91 |
-
gr.update(value=None), # enhanced_image: clear until step 3 (last)
|
| 92 |
-
gr.update(value=noisy_spec_path), # noisy_image: input spectrogram (step 1)
|
| 93 |
-
original_transcript,
|
| 94 |
-
"",
|
| 95 |
-
"",
|
| 96 |
-
sample_stem,
|
| 97 |
-
None,
|
| 98 |
-
"",
|
| 99 |
-
)
|
| 100 |
-
# Let the UI render step 1 before we flood with polling updates
|
| 101 |
-
time.sleep(0.2)
|
| 102 |
-
|
| 103 |
-
thread = threading.Thread(target=worker, daemon=True)
|
| 104 |
-
thread.start()
|
| 105 |
-
|
| 106 |
-
poll_interval = 0.05
|
| 107 |
-
while "result" not in result_holder and "error" not in result_holder:
|
| 108 |
-
time.sleep(poll_interval)
|
| 109 |
-
# 2) Realtime: stream transcripts only; audio set in final yield for smooth playback
|
| 110 |
-
yield (
|
| 111 |
-
gr.update(visible=True),
|
| 112 |
-
gr.update(), # enhanced_audio: set only in final yield, then autoplay
|
| 113 |
-
gr.update(), # enhanced_image: reveal only in step 3 (final yield)
|
| 114 |
-
gr.update(), # noisy_image already set in step 1
|
| 115 |
-
gr.update(), # original_transcript unchanged
|
| 116 |
-
gr.update(value=progress_state.get("noisy", "")),
|
| 117 |
-
gr.update(value=progress_state.get("enhanced", "")),
|
| 118 |
-
gr.update(),
|
| 119 |
-
gr.update(),
|
| 120 |
-
gr.update(),
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
if "error" in result_holder:
|
| 124 |
-
raise result_holder["error"]
|
| 125 |
-
|
| 126 |
-
(
|
| 127 |
-
enhanced_spec_path,
|
| 128 |
-
enhanced_transcript,
|
| 129 |
-
noisy_transcript_with_wer,
|
| 130 |
-
enhanced_audio,
|
| 131 |
-
last_stem,
|
| 132 |
-
enhanced_array,
|
| 133 |
-
precomputed_noisy,
|
| 134 |
-
) = result_holder["result"]
|
| 135 |
-
|
| 136 |
-
# 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
|
| 137 |
-
yield (
|
| 138 |
-
gr.update(visible=True),
|
| 139 |
-
enhanced_audio,
|
| 140 |
-
enhanced_spec_path, # enhanced_image: show only now
|
| 141 |
-
noisy_spec_path,
|
| 142 |
-
original_transcript,
|
| 143 |
-
noisy_transcript_with_wer,
|
| 144 |
-
enhanced_transcript,
|
| 145 |
-
last_stem,
|
| 146 |
-
enhanced_array,
|
| 147 |
-
precomputed_noisy,
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
|
| 151 |
_CSS_DIR = Path(__file__).resolve().parent / "assets"
|
| 152 |
with gr.Blocks() as demo:
|
| 153 |
sample_stem = gr.State("")
|
| 154 |
last_sample_stem = gr.State("")
|
| 155 |
input_array = gr.State()
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
gr.HTML(
|
| 161 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
| 162 |
'<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
|
|
@@ -182,80 +53,90 @@ with gr.Blocks() as demo:
|
|
| 182 |
scale=2,
|
| 183 |
)
|
| 184 |
|
| 185 |
-
with gr.
|
| 186 |
with gr.Tab("Stream audio in real time") as stream_tab:
|
| 187 |
-
gr.
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
with gr.Row(elem_classes="stream-row"):
|
| 200 |
-
with gr.Column(scale=4, min_width=200):
|
| 201 |
-
audio_stream = gr.Audio(
|
| 202 |
-
sources=["microphone"], streaming=True, elem_id="audio_stream"
|
| 203 |
-
)
|
| 204 |
-
with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
|
| 205 |
-
active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
|
| 206 |
-
with gr.Group(elem_classes="panel"):
|
| 207 |
-
with gr.Column(scale=5, min_width=320):
|
| 208 |
-
enhanced_text = gr.Textbox(
|
| 209 |
-
label="Enhanced Transcribed Text", lines=6, autoscroll=False
|
| 210 |
)
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
| 214 |
)
|
| 215 |
-
# Poll transcript globals so interim results show live (streamers update in background)
|
| 216 |
-
transcript_timer = gr.Timer(0.1, active=True)
|
| 217 |
-
transcript_timer.tick(
|
| 218 |
-
get_live_transcripts,
|
| 219 |
-
inputs=None,
|
| 220 |
-
outputs=[enhanced_text, raw_text],
|
| 221 |
-
show_progress="hidden",
|
| 222 |
-
)
|
| 223 |
-
warmup_timer = gr.Timer(0.5, active=True)
|
| 224 |
-
warmup_timer.tick(
|
| 225 |
-
warmup_tick,
|
| 226 |
-
inputs=[on_stream_tab, warmup_elapsed, active_light],
|
| 227 |
-
outputs=[warmup_elapsed, active_light, warmup_timer],
|
| 228 |
-
show_progress="hidden",
|
| 229 |
-
)
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
|
| 232 |
-
with gr.
|
| 233 |
-
gr.Markdown(
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
|
| 241 |
with gr.Tab("Upload local file") as upload_tab:
|
| 242 |
-
with gr.
|
| 243 |
-
gr.Markdown(
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
label="
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
| 258 |
-
result_title = gr.Markdown("")
|
| 259 |
enhanced_audio = gr.Audio(
|
| 260 |
type="numpy",
|
| 261 |
interactive=False,
|
|
@@ -284,107 +165,75 @@ with gr.Blocks() as demo:
|
|
| 284 |
)
|
| 285 |
|
| 286 |
# ------------------------------------------------------
|
| 287 |
-
#
|
| 288 |
# ------------------------------------------------------
|
| 289 |
-
DEFAULT_DATASET_SAMPLE = "en_00412_i_h_36"
|
| 290 |
-
|
| 291 |
-
def load_dataset_sample_on_tab_visit(dropdown_value):
|
| 292 |
-
"""Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
|
| 293 |
-
sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
|
| 294 |
-
audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
|
| 295 |
-
return sample_id, audio_path, arr, stem, sample_rate
|
| 296 |
-
|
| 297 |
stream_tab.select(
|
| 298 |
lambda: (
|
| 299 |
gr.update(visible=False),
|
| 300 |
gr.update(visible=False),
|
| 301 |
-
gr.update(
|
| 302 |
-
True,
|
| 303 |
-
0,
|
| 304 |
-
ACTIVE_LIGHT_GRAY,
|
| 305 |
-
gr.update(active=True),
|
| 306 |
),
|
| 307 |
-
|
| 308 |
-
outputs=[results_card, enhance_btn, audio_stream, on_stream_tab, warmup_elapsed, active_light, warmup_timer],
|
| 309 |
)
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
upload_tab.select(
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
False,
|
| 315 |
-
ACTIVE_LIGHT_GRAY,
|
| 316 |
-
),
|
| 317 |
-
inputs=None,
|
| 318 |
-
outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
|
| 319 |
).then(
|
| 320 |
-
|
| 321 |
-
|
|
|
|
| 322 |
)
|
| 323 |
-
|
| 324 |
dataset_tab.select(
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
False,
|
| 328 |
-
ACTIVE_LIGHT_GRAY,
|
| 329 |
-
),
|
| 330 |
-
inputs=None,
|
| 331 |
-
outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
|
| 332 |
-
).then(
|
| 333 |
-
lambda: gr.update(visible=True),
|
| 334 |
-
outputs=enhance_btn,
|
| 335 |
).then(
|
| 336 |
-
|
| 337 |
-
inputs=
|
| 338 |
-
outputs=[
|
| 339 |
-
)
|
| 340 |
-
|
| 341 |
-
stt_model.change(
|
| 342 |
-
fn=shutdown_streamers,
|
| 343 |
-
).then(
|
| 344 |
-
clear_ui,
|
| 345 |
-
inputs=None,
|
| 346 |
-
outputs=[stream_state, enhanced_text, raw_text],
|
| 347 |
-
).then(
|
| 348 |
-
set_stt_streamer,
|
| 349 |
-
inputs=stt_model,
|
| 350 |
-
outputs=None,
|
| 351 |
)
|
| 352 |
-
|
| 353 |
# ------------------------------------------------------
|
| 354 |
-
#
|
| 355 |
# ------------------------------------------------------
|
| 356 |
-
|
| 357 |
audio_stream.stream(
|
| 358 |
-
|
| 359 |
-
inputs=[
|
| 360 |
-
outputs=[
|
| 361 |
stream_every=STREAM_EVERY,
|
| 362 |
time_limit=60 * 2,
|
| 363 |
-
concurrency_limit=1,
|
| 364 |
)
|
| 365 |
|
| 366 |
audio_stream.stop_recording(
|
| 367 |
on_stop_recording,
|
|
|
|
|
|
|
|
|
|
| 368 |
)
|
| 369 |
|
| 370 |
audio_stream.start_recording(
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
).then(
|
| 375 |
-
fn=set_stt_streamer,
|
| 376 |
-
inputs=stt_model,
|
| 377 |
-
outputs=None,
|
| 378 |
-
)
|
| 379 |
|
| 380 |
# ------------------------------------------------------
|
| 381 |
-
# OFFLINE
|
| 382 |
# ------------------------------------------------------
|
| 383 |
|
| 384 |
# Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
|
| 385 |
dataset_dropdown.change(
|
| 386 |
lambda: gr.update(visible=False),
|
| 387 |
-
inputs=None,
|
| 388 |
outputs=results_card,
|
| 389 |
).then(
|
| 390 |
load_file_from_dataset,
|
|
@@ -410,10 +259,15 @@ with gr.Blocks() as demo:
|
|
| 410 |
|
| 411 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 412 |
enhance_btn.click(
|
| 413 |
-
|
| 414 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
outputs=[
|
| 416 |
-
results_card,
|
| 417 |
enhanced_audio,
|
| 418 |
enhanced_image,
|
| 419 |
noisy_image,
|
|
@@ -421,15 +275,15 @@ with gr.Blocks() as demo:
|
|
| 421 |
noisy_transcript,
|
| 422 |
enhanced_transcript,
|
| 423 |
last_sample_stem,
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
)
|
| 428 |
|
| 429 |
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 430 |
purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
|
| 431 |
demo.queue()
|
| 432 |
demo.launch(
|
| 433 |
-
css=(_CSS_DIR / "
|
| 434 |
allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
|
| 435 |
)
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from pathlib import Path
|
| 4 |
|
| 5 |
+
from constants import STREAM_EVERY, APP_TMP_DIR
|
| 6 |
+
from ui import LED_DOT_OFF
|
| 7 |
+
from hf_dataset_utils import ALL_FILES
|
| 8 |
|
| 9 |
from stream_pipeline import (
|
| 10 |
+
on_start_recording,
|
|
|
|
| 11 |
on_stop_recording,
|
|
|
|
|
|
|
|
|
|
| 12 |
shutdown_streamers,
|
| 13 |
+
stream_step,
|
| 14 |
)
|
| 15 |
from offline_pipeline import (
|
| 16 |
load_file_from_dataset,
|
| 17 |
load_local_file,
|
| 18 |
+
run_offline_pipeline,
|
| 19 |
)
|
|
|
|
| 20 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
_CSS_DIR = Path(__file__).resolve().parent / "assets"
|
| 24 |
with gr.Blocks() as demo:
|
| 25 |
sample_stem = gr.State("")
|
| 26 |
last_sample_stem = gr.State("")
|
| 27 |
input_array = gr.State()
|
| 28 |
+
streaming_sr = gr.State(None)
|
| 29 |
+
current_sample_rate = gr.State(None)
|
| 30 |
+
|
|
|
|
| 31 |
gr.HTML(
|
| 32 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
| 33 |
'<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
|
|
|
|
| 53 |
scale=2,
|
| 54 |
)
|
| 55 |
|
| 56 |
+
with gr.Tabs():
|
| 57 |
with gr.Tab("Stream audio in real time") as stream_tab:
|
| 58 |
+
with gr.Row(equal_height=False, elem_classes="stream-layout"):
|
| 59 |
+
with gr.Column(scale=4, min_width=320):
|
| 60 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 61 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 62 |
+
gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 63 |
+
|
| 64 |
+
input_gain_db = gr.Slider(
|
| 65 |
+
minimum=0,
|
| 66 |
+
maximum=20,
|
| 67 |
+
step=0.5,
|
| 68 |
+
value=0,
|
| 69 |
+
label="Input gain (dB)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
)
|
| 71 |
+
|
| 72 |
+
audio_stream = gr.Audio(
|
| 73 |
+
sources=["microphone"],
|
| 74 |
+
streaming=True,
|
| 75 |
+
elem_id="audio_stream",
|
| 76 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
with gr.Column(scale=6, min_width=420):
|
| 79 |
+
with gr.Group(elem_classes="status-panel output-panel"):
|
| 80 |
+
gr.Markdown("### Live Output", elem_classes="title")
|
| 81 |
+
with gr.Row(equal_height=True, elem_classes="status-indicators"):
|
| 82 |
+
with gr.Group(elem_classes="status-card"):
|
| 83 |
+
gr.Markdown("**System Status**", elem_classes="status-card__label")
|
| 84 |
+
system_status_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
|
| 85 |
+
system_status_text = gr.Markdown(
|
| 86 |
+
value="Off",
|
| 87 |
+
elem_classes="status-card__subtext",
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
with gr.Group(elem_classes="status-card"):
|
| 91 |
+
gr.Markdown("**Voice Activity**", elem_classes="status-card__label")
|
| 92 |
+
vad_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
|
| 93 |
+
|
| 94 |
+
with gr.Row(equal_height=True, elem_classes="transcript-row transcript-row--large"):
|
| 95 |
+
enhanced_text = gr.Textbox(
|
| 96 |
+
label="Enhanced Transcript",
|
| 97 |
+
lines=10,
|
| 98 |
+
autoscroll=False,
|
| 99 |
+
)
|
| 100 |
+
raw_text = gr.Textbox(
|
| 101 |
+
label="Raw Transcript",
|
| 102 |
+
lines=10,
|
| 103 |
+
autoscroll=False,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
|
| 108 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 109 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 110 |
+
gr.Markdown(open("docs/dawn_chorus.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 111 |
+
|
| 112 |
+
dataset_dropdown = gr.Dropdown(
|
| 113 |
+
choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
|
| 114 |
+
)
|
| 115 |
+
audio_file_from_dataset = gr.Audio(
|
| 116 |
+
type="filepath", interactive=False, buttons=["download"], autoplay=False
|
| 117 |
+
)
|
| 118 |
|
| 119 |
with gr.Tab("Upload local file") as upload_tab:
|
| 120 |
+
with gr.Group(elem_classes="panel section-panel"):
|
| 121 |
+
gr.Markdown("### Input", elem_classes="title")
|
| 122 |
+
gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
|
| 123 |
+
|
| 124 |
+
audio_file_upload = gr.File(
|
| 125 |
+
file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
|
| 126 |
+
file_count="single",
|
| 127 |
+
scale=3,
|
| 128 |
+
)
|
| 129 |
+
normalize = gr.Checkbox(label="Normalize audio", value=True)
|
| 130 |
+
audio_preview = gr.Audio(
|
| 131 |
+
label="Preview",
|
| 132 |
+
autoplay=False,
|
| 133 |
+
interactive=False,
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
|
| 137 |
|
| 138 |
with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
|
| 139 |
+
result_title = gr.Markdown("", elem_classes="title")
|
| 140 |
enhanced_audio = gr.Audio(
|
| 141 |
type="numpy",
|
| 142 |
interactive=False,
|
|
|
|
| 165 |
)
|
| 166 |
|
| 167 |
# ------------------------------------------------------
|
| 168 |
+
# TAB CHANGES
|
| 169 |
# ------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
stream_tab.select(
|
| 171 |
lambda: (
|
| 172 |
gr.update(visible=False),
|
| 173 |
gr.update(visible=False),
|
| 174 |
+
gr.update(sources=["microphone"], streaming=True, interactive= True),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
),
|
| 176 |
+
outputs=[results_card, enhance_btn, audio_stream],
|
|
|
|
| 177 |
)
|
| 178 |
|
| 179 |
+
def _on_not_streaming_tab():
|
| 180 |
+
shutdown_streamers()
|
| 181 |
+
return (
|
| 182 |
+
gr.update(streaming=False, interactive=False),
|
| 183 |
+
gr.update(visible=True),
|
| 184 |
+
LED_DOT_OFF,
|
| 185 |
+
LED_DOT_OFF,
|
| 186 |
+
"Off",
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
upload_tab.select(
|
| 190 |
+
_on_not_streaming_tab,
|
| 191 |
+
outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
).then(
|
| 193 |
+
load_local_file,
|
| 194 |
+
inputs=[audio_file_upload, normalize],
|
| 195 |
+
outputs=[input_array, sample_stem, audio_preview, current_sample_rate],
|
| 196 |
)
|
| 197 |
+
|
| 198 |
dataset_tab.select(
|
| 199 |
+
_on_not_streaming_tab,
|
| 200 |
+
outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
).then(
|
| 202 |
+
load_file_from_dataset,
|
| 203 |
+
inputs=dataset_dropdown,
|
| 204 |
+
outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
)
|
|
|
|
| 206 |
# ------------------------------------------------------
|
| 207 |
+
# STREAMING EVENTS
|
| 208 |
# ------------------------------------------------------
|
|
|
|
| 209 |
audio_stream.stream(
|
| 210 |
+
stream_step,
|
| 211 |
+
inputs=[audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db],
|
| 212 |
+
outputs=[streaming_sr, system_status_led, system_status_text,enhanced_text, raw_text, vad_led],
|
| 213 |
stream_every=STREAM_EVERY,
|
| 214 |
time_limit=60 * 2,
|
| 215 |
+
concurrency_limit=1,
|
| 216 |
)
|
| 217 |
|
| 218 |
audio_stream.stop_recording(
|
| 219 |
on_stop_recording,
|
| 220 |
+
outputs=[vad_led, system_status_led, system_status_text, streaming_sr],
|
| 221 |
+
).then(
|
| 222 |
+
shutdown_streamers,
|
| 223 |
)
|
| 224 |
|
| 225 |
audio_stream.start_recording(
|
| 226 |
+
on_start_recording,
|
| 227 |
+
outputs=[enhanced_text, raw_text, system_status_led, system_status_text],
|
| 228 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
# ------------------------------------------------------
|
| 231 |
+
# OFFLINE EVENTS (DATASET + LOCAL FILE)
|
| 232 |
# ------------------------------------------------------
|
| 233 |
|
| 234 |
# Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
|
| 235 |
dataset_dropdown.change(
|
| 236 |
lambda: gr.update(visible=False),
|
|
|
|
| 237 |
outputs=results_card,
|
| 238 |
).then(
|
| 239 |
load_file_from_dataset,
|
|
|
|
| 259 |
|
| 260 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 261 |
enhance_btn.click(
|
| 262 |
+
cleanup_previous_run,
|
| 263 |
+
inputs=[last_sample_stem]
|
| 264 |
+
).then(
|
| 265 |
+
lambda: gr.update(visible=True),
|
| 266 |
+
outputs=results_card,
|
| 267 |
+
).then(
|
| 268 |
+
run_offline_pipeline,
|
| 269 |
+
inputs=[input_array, current_sample_rate, enhancement_level, stt_model, sample_stem],
|
| 270 |
outputs=[
|
|
|
|
| 271 |
enhanced_audio,
|
| 272 |
enhanced_image,
|
| 273 |
noisy_image,
|
|
|
|
| 275 |
noisy_transcript,
|
| 276 |
enhanced_transcript,
|
| 277 |
last_sample_stem,
|
| 278 |
+
]
|
| 279 |
+
).failure(
|
| 280 |
+
lambda: gr.Warning("Enhancement failed. Please refresh page and make sure you have a stable connection.")
|
| 281 |
)
|
| 282 |
|
| 283 |
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 284 |
purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
|
| 285 |
demo.queue()
|
| 286 |
demo.launch(
|
| 287 |
+
css=(_CSS_DIR / "styling.css").read_text(encoding="utf-8"),
|
| 288 |
allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
|
| 289 |
)
|
|
@@ -1,68 +0,0 @@
|
|
| 1 |
-
/* Stream row: stretch columns to match audio height */
|
| 2 |
-
.stream-row {
|
| 3 |
-
align-items: stretch !important;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
/* Active light column: flex container, fill height */
|
| 7 |
-
.stream-row > div:last-child {
|
| 8 |
-
display: flex !important;
|
| 9 |
-
align-items: stretch !important;
|
| 10 |
-
min-height: 100%;
|
| 11 |
-
}
|
| 12 |
-
|
| 13 |
-
/* Gradio block wrapper: fill and flex so child can stretch */
|
| 14 |
-
.stream-row > div:last-child > div {
|
| 15 |
-
display: flex !important;
|
| 16 |
-
width: 100% !important;
|
| 17 |
-
min-height: 100% !important;
|
| 18 |
-
height: 100% !important;
|
| 19 |
-
box-sizing: border-box;
|
| 20 |
-
}
|
| 21 |
-
|
| 22 |
-
/* All divs in light column: fill so panel stretches */
|
| 23 |
-
.stream-row > div:last-child div {
|
| 24 |
-
width: 100% !important;
|
| 25 |
-
min-height: 100% !important;
|
| 26 |
-
height: 100% !important;
|
| 27 |
-
box-sizing: border-box;
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
-
/* Active light panel: fill container, base styles */
|
| 31 |
-
.active-light {
|
| 32 |
-
width: 100% !important;
|
| 33 |
-
min-height: 100% !important;
|
| 34 |
-
height: 100% !important;
|
| 35 |
-
box-sizing: border-box;
|
| 36 |
-
border-radius: 8px;
|
| 37 |
-
border: 1px solid var(--border-color-primary);
|
| 38 |
-
transition: background 0.2s, box-shadow 0.2s;
|
| 39 |
-
padding: 0.5rem;
|
| 40 |
-
}
|
| 41 |
-
|
| 42 |
-
/* Warming up state */
|
| 43 |
-
.active-light--off {
|
| 44 |
-
background: #555;
|
| 45 |
-
box-shadow: inset 0 2px 8px rgba(0, 0, 0, 0.4);
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
/* Ready state */
|
| 49 |
-
.active-light--on {
|
| 50 |
-
background: #c00;
|
| 51 |
-
box-shadow:
|
| 52 |
-
0 0 16px rgba(220, 0, 0, 0.5),
|
| 53 |
-
inset 0 0 12px rgba(255, 80, 80, 0.3);
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
/* Label text */
|
| 57 |
-
.active-light__label {
|
| 58 |
-
font-size: 12px;
|
| 59 |
-
font-weight: 500;
|
| 60 |
-
}
|
| 61 |
-
|
| 62 |
-
.active-light--off .active-light__label {
|
| 63 |
-
color: rgba(255, 255, 255, 0.85);
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
.active-light--on .active-light__label {
|
| 67 |
-
color: #fff;
|
| 68 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.status-panel {
|
| 2 |
+
padding: 16px;
|
| 3 |
+
border-radius: 16px;
|
| 4 |
+
border: 1px solid #27272a;
|
| 5 |
+
background: #111827;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
.title, .tab-description {
|
| 9 |
+
margin-left: 2px;
|
| 10 |
+
margin-top: 2px;
|
| 11 |
+
margin-bottom: 0px;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
.transcript-row {
|
| 15 |
+
gap: 12px;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
.transcript-row--large textarea {
|
| 19 |
+
min-height: 260px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.status-card {
|
| 23 |
+
height: 50px;
|
| 24 |
+
border: 1px solid #3f3f46;
|
| 25 |
+
border-radius: 14px;
|
| 26 |
+
background: #18181b;
|
| 27 |
+
padding: 5px 5px;
|
| 28 |
+
display: flex;
|
| 29 |
+
flex-direction: column;
|
| 30 |
+
justify-content: center;
|
| 31 |
+
align-items: center;
|
| 32 |
+
box-sizing: border-box;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
.status-card__label {
|
| 37 |
+
text-align: center;
|
| 38 |
+
margin-bottom: 0px;
|
| 39 |
+
color: #e4e4e7;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.status-card__subtext {
|
| 43 |
+
margin-top: 0px;
|
| 44 |
+
font-size: 0.85rem;
|
| 45 |
+
color: #a1a1aa;
|
| 46 |
+
text-align: center;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.led-dot {
|
| 50 |
+
width: 30px;
|
| 51 |
+
height: 30px;
|
| 52 |
+
border-radius: 9999px;
|
| 53 |
+
border: 1px solid #71717a;
|
| 54 |
+
margin: 0 auto;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.led-dot--black {
|
| 58 |
+
background: #1c1c1c;
|
| 59 |
+
box-shadow: none;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
.led-dot--red {
|
| 63 |
+
background: #ef4444;
|
| 64 |
+
box-shadow: 0 0 14px rgba(239, 68, 68, 0.85);
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
.led-dot--green {
|
| 68 |
+
background: #22c55e;
|
| 69 |
+
box-shadow: 0 0 14px rgba(34, 197, 94, 0.85);
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.led-dot--yellow {
|
| 73 |
+
background: #facc15;
|
| 74 |
+
box-shadow: 0 0 14px rgba(250, 204, 21, 0.85);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
.led-dot--off {
|
| 80 |
+
background: #3f3f46;
|
| 81 |
+
box-shadow: none;
|
| 82 |
+
}
|
|
@@ -2,7 +2,6 @@ import os
|
|
| 2 |
import time
|
| 3 |
from loguru import logger
|
| 4 |
from constants import MINUTES_KEEP, APP_TMP_DIR
|
| 5 |
-
import gradio as gr
|
| 6 |
|
| 7 |
|
| 8 |
def purge_tmp_directory(
|
|
@@ -93,11 +92,9 @@ def cleanup_previous_run(
|
|
| 93 |
sample_stem: str,
|
| 94 |
tmp_dir: str = APP_TMP_DIR,
|
| 95 |
max_age_minutes: int = MINUTES_KEEP,
|
| 96 |
-
)
|
| 97 |
-
gr.Info("Processing started. This may take a moment. Please do not refresh or close the window.")
|
| 98 |
try:
|
| 99 |
remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
|
| 100 |
except Exception as e:
|
| 101 |
print(f"Failed to delete last run with id {sample_stem}: {e}")
|
| 102 |
-
purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
|
| 103 |
-
return None, None, "", "", ""
|
|
|
|
| 2 |
import time
|
| 3 |
from loguru import logger
|
| 4 |
from constants import MINUTES_KEEP, APP_TMP_DIR
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def purge_tmp_directory(
|
|
|
|
| 92 |
sample_stem: str,
|
| 93 |
tmp_dir: str = APP_TMP_DIR,
|
| 94 |
max_age_minutes: int = MINUTES_KEEP,
|
| 95 |
+
):
|
|
|
|
| 96 |
try:
|
| 97 |
remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
|
| 98 |
except Exception as e:
|
| 99 |
print(f"Failed to delete last run with id {sample_stem}: {e}")
|
| 100 |
+
purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
|
|
|
|
@@ -19,7 +19,6 @@ MIX_DIR: Final = "mix"
|
|
| 19 |
SPEECH_DIR: Final = "speech"
|
| 20 |
TRANS_DIR: Final = "transcripts"
|
| 21 |
|
| 22 |
-
DEFAULT_SR: Final = 16000
|
| 23 |
STREAM_EVERY: Final = 0.2
|
| 24 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 25 |
|
|
@@ -30,3 +29,6 @@ STREAMER_CLASSES: Final = {
|
|
| 30 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 31 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 32 |
}
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
SPEECH_DIR: Final = "speech"
|
| 20 |
TRANS_DIR: Final = "transcripts"
|
| 21 |
|
|
|
|
| 22 |
STREAM_EVERY: Final = 0.2
|
| 23 |
WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
|
| 24 |
|
|
|
|
| 29 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 30 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 31 |
}
|
| 32 |
+
|
| 33 |
+
VAD_ON: Final = "🟢"
|
| 34 |
+
VAD_OFF: Final = "⚫"
|
|
@@ -1,153 +1,277 @@
|
|
| 1 |
import os
|
| 2 |
-
from
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
-
import soundfile as sf
|
| 6 |
-
from sdk import SDKWrapper
|
| 7 |
-
from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
|
| 8 |
-
from hf_dataset_utils import get_audio, get_transcript
|
| 9 |
-
from constants import APP_TMP_DIR, STREAMER_CLASSES
|
| 10 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
if hasattr(streamer, "close_stream"):
|
| 16 |
streamer.close_stream()
|
| 17 |
else:
|
| 18 |
streamer.close()
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
|
| 28 |
-
"""Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
|
| 29 |
-
via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
|
| 30 |
-
only at the end; the app plays it automatically when processing is complete.
|
| 31 |
-
Returns same tuple as run_offline_pipeline_ordered."""
|
| 32 |
-
if sample is None:
|
| 33 |
-
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 34 |
-
sample = np.asarray(sample, dtype=np.float32).flatten()
|
| 35 |
-
|
| 36 |
-
sdk = SDKWrapper()
|
| 37 |
-
sdk.init_processor(
|
| 38 |
sample_rate=sample_rate,
|
| 39 |
-
enhancement_level=
|
| 40 |
)
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
-
# Sync transcript callbacks so both boxes update together
|
| 44 |
-
progress_state["noisy_pending"] = ""
|
| 45 |
-
progress_state["enhanced_pending"] = ""
|
| 46 |
-
progress_state["noisy_has_sent"] = False
|
| 47 |
-
progress_state["enhanced_has_sent"] = False
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
progress_state["noisy_pending"] = t
|
| 56 |
-
progress_state["noisy_has_sent"] = True
|
| 57 |
-
_flush_both()
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
if stt_model not in STREAMER_CLASSES:
|
| 65 |
-
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 66 |
-
StreamerClass = STREAMER_CLASSES[stt_model]
|
| 67 |
-
streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
|
| 68 |
-
streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
accumulated_enhanced: list[np.ndarray] = []
|
|
|
|
| 71 |
n = len(sample)
|
| 72 |
|
| 73 |
for i in range(0, n, chunk_size):
|
| 74 |
raw_chunk = sample[i : i + chunk_size]
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
raw_chunk = np.pad(
|
| 77 |
raw_chunk,
|
| 78 |
-
(0, chunk_size -
|
| 79 |
mode="constant",
|
| 80 |
constant_values=0.0,
|
| 81 |
)
|
| 82 |
-
|
| 83 |
-
enhanced_chunk =
|
| 84 |
-
enhanced_1d = np.asarray(enhanced_chunk).flatten()
|
|
|
|
| 85 |
streamer_noisy.process_chunk(raw_chunk)
|
| 86 |
streamer_enhanced.process_chunk(enhanced_1d)
|
| 87 |
accumulated_enhanced.append(enhanced_1d)
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
|
| 99 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 100 |
-
|
| 101 |
|
| 102 |
-
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 103 |
-
spec_image(enhanced_array).save(enhanced_spec_path)
|
| 104 |
-
progress_state["enhanced_spec_path"] = enhanced_spec_path
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
try:
|
| 108 |
original_transcript = get_transcript(sample_id)
|
| 109 |
-
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 110 |
-
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 111 |
-
enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
|
| 112 |
-
noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
|
| 113 |
except Exception:
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
return (
|
|
|
|
| 117 |
enhanced_spec_path,
|
| 118 |
-
|
|
|
|
| 119 |
noisy_transcript,
|
| 120 |
-
|
| 121 |
sample_id,
|
| 122 |
-
enhanced_array,
|
| 123 |
-
precomputed_noisy,
|
| 124 |
)
|
| 125 |
|
|
|
|
| 126 |
def load_local_file(
|
| 127 |
sample_path: str,
|
| 128 |
normalize: bool = True,
|
| 129 |
-
|
| 130 |
if not sample_path or not os.path.exists(sample_path):
|
| 131 |
-
return None, "", None
|
|
|
|
| 132 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 133 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 134 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
|
|
|
| 135 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 136 |
-
y, sample_rate =
|
|
|
|
|
|
|
| 137 |
if normalize:
|
| 138 |
y = normalize_lufs(y, sample_rate)
|
| 139 |
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 140 |
return y, new_sample_stem, gradio_audio, sample_rate
|
| 141 |
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
if not sample_id:
|
| 144 |
gr.Warning("Please select a sample from the dropdown.")
|
| 145 |
return None, None, "", None
|
|
|
|
| 146 |
new_sample_stem = sample_id
|
|
|
|
| 147 |
try:
|
| 148 |
y, sample_rate = get_audio(sample_id, prefix="mix")
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from typing import Any
|
| 3 |
|
| 4 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
|
| 8 |
+
from constants import APP_TMP_DIR, STREAMER_CLASSES
|
| 9 |
+
from hf_dataset_utils import get_audio, get_transcript
|
| 10 |
+
from sdk import SDKParams, SDKWrapper
|
| 11 |
+
from utils import (
|
| 12 |
+
compute_wer,
|
| 13 |
+
get_vad_labels,
|
| 14 |
+
normalize_lufs,
|
| 15 |
+
spec_image,
|
| 16 |
+
to_gradio_audio,
|
| 17 |
+
)
|
| 18 |
|
| 19 |
+
SDK_OFFLINE = SDKWrapper()
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def _safe_progress(progress: gr.Progress, value: float, desc: str) -> None:
|
| 23 |
+
progress(max(0.0, min(1.0, value)), desc=desc)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _empty_pipeline_result(sample_id: str) -> tuple[Any, str, str, str, str, str, str]:
|
| 27 |
+
return (
|
| 28 |
+
None,
|
| 29 |
+
"",
|
| 30 |
+
"",
|
| 31 |
+
"Unavailable",
|
| 32 |
+
"Unavailable",
|
| 33 |
+
"Unavailable",
|
| 34 |
+
sample_id,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _finalize_stream_transcript(streamer) -> str:
|
| 39 |
if hasattr(streamer, "close_stream"):
|
| 40 |
streamer.close_stream()
|
| 41 |
else:
|
| 42 |
streamer.close()
|
| 43 |
|
| 44 |
+
streamer.finished_event.wait()
|
| 45 |
+
with streamer.lock:
|
| 46 |
+
return streamer.render_tokens(streamer.final_tokens, [])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _init_sdk(sample_rate: int, enhancement_level: int) -> int:
|
| 50 |
+
sdk_params = SDKParams(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
sample_rate=sample_rate,
|
| 52 |
+
enhancement_level=enhancement_level / 100.0,
|
| 53 |
)
|
| 54 |
+
SDK_OFFLINE.init_processor(sdk_params)
|
| 55 |
+
return SDK_OFFLINE.num_frames
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
def _init_streamers(
|
| 59 |
+
sample_rate: int,
|
| 60 |
+
stt_model: str,
|
| 61 |
+
sample_id: str,
|
| 62 |
+
progress: gr.Progress,
|
| 63 |
+
):
|
| 64 |
+
if stt_model not in STREAMER_CLASSES:
|
| 65 |
+
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 66 |
|
| 67 |
+
streamer_class = STREAMER_CLASSES[stt_model]
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
_safe_progress(progress, 0.12, f"Initializing {stt_model} stream 1/2...")
|
| 70 |
+
streamer_noisy = streamer_class(sample_rate, f"{sample_id}_noisy")
|
| 71 |
+
|
| 72 |
+
_safe_progress(progress, 0.18, f"Initializing {stt_model} stream 2/2...")
|
| 73 |
+
streamer_enhanced = streamer_class(sample_rate, f"{sample_id}_enhanced")
|
| 74 |
+
|
| 75 |
+
return streamer_noisy, streamer_enhanced
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
def _attach_wer(
|
| 79 |
+
original_transcript: str,
|
| 80 |
+
noisy_transcript: str,
|
| 81 |
+
enhanced_transcript: str,
|
| 82 |
+
) -> tuple[str, str]:
|
| 83 |
+
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 84 |
+
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 85 |
+
|
| 86 |
+
noisy_transcript = f"{noisy_transcript} (WER: {wer_noisy * 100:.2f}%)"
|
| 87 |
+
enhanced_transcript = f"{enhanced_transcript} (WER: {wer_enhanced * 100:.2f}%)"
|
| 88 |
+
return noisy_transcript, enhanced_transcript
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _process_audio_chunks(
|
| 92 |
+
sample: np.ndarray,
|
| 93 |
+
sample_rate: int,
|
| 94 |
+
chunk_size: int,
|
| 95 |
+
streamer_noisy,
|
| 96 |
+
streamer_enhanced,
|
| 97 |
+
progress: gr.Progress,
|
| 98 |
+
) -> tuple[np.ndarray, list[list[float]]]:
|
| 99 |
accumulated_enhanced: list[np.ndarray] = []
|
| 100 |
+
vad_timestamps: list[list[float]] = []
|
| 101 |
n = len(sample)
|
| 102 |
|
| 103 |
for i in range(0, n, chunk_size):
|
| 104 |
raw_chunk = sample[i : i + chunk_size]
|
| 105 |
+
original_chunk_len = raw_chunk.size
|
| 106 |
+
|
| 107 |
+
if original_chunk_len < chunk_size:
|
| 108 |
raw_chunk = np.pad(
|
| 109 |
raw_chunk,
|
| 110 |
+
(0, chunk_size - original_chunk_len),
|
| 111 |
mode="constant",
|
| 112 |
constant_values=0.0,
|
| 113 |
)
|
| 114 |
+
|
| 115 |
+
enhanced_chunk = SDK_OFFLINE.process_chunk(raw_chunk.reshape(1, -1))
|
| 116 |
+
enhanced_1d = np.asarray(enhanced_chunk, dtype=np.float32).flatten()
|
| 117 |
+
|
| 118 |
streamer_noisy.process_chunk(raw_chunk)
|
| 119 |
streamer_enhanced.process_chunk(enhanced_1d)
|
| 120 |
accumulated_enhanced.append(enhanced_1d)
|
| 121 |
|
| 122 |
+
loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
|
| 123 |
+
_safe_progress(
|
| 124 |
+
progress,
|
| 125 |
+
0.20 + 0.50 * loop_progress,
|
| 126 |
+
"Enhancing audio...",
|
| 127 |
+
)
|
| 128 |
|
| 129 |
+
if SDK_OFFLINE.vad_context.is_speech_detected():
|
| 130 |
+
start_in_sec = i / sample_rate
|
| 131 |
+
end_in_sec = min(i + original_chunk_len, n) / sample_rate
|
| 132 |
+
vad_timestamps.append([start_in_sec, end_in_sec])
|
| 133 |
|
| 134 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 135 |
+
return enhanced_array, vad_timestamps
|
| 136 |
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
def _save_spectrograms(
|
| 139 |
+
sample: np.ndarray,
|
| 140 |
+
enhanced_array: np.ndarray,
|
| 141 |
+
sample_rate: int,
|
| 142 |
+
sample_id: str,
|
| 143 |
+
vad_timestamps: list[list[float]],
|
| 144 |
+
) -> tuple[str, str]:
|
| 145 |
+
os.makedirs(APP_TMP_DIR, exist_ok=True)
|
| 146 |
+
|
| 147 |
+
enhanced_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_enhanced_spectrogram.png")
|
| 148 |
+
noisy_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_noisy_spectrogram.png")
|
| 149 |
+
|
| 150 |
+
spec_image(enhanced_array, sr=sample_rate, vad_timestamps=vad_timestamps).save(enhanced_spec_path)
|
| 151 |
+
spec_image(sample, sr=sample_rate, vad_timestamps=vad_timestamps).save(noisy_spec_path)
|
| 152 |
+
|
| 153 |
+
return enhanced_spec_path, noisy_spec_path
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def run_offline_pipeline(
|
| 157 |
+
sample: np.ndarray,
|
| 158 |
+
sample_rate: int,
|
| 159 |
+
enhancement_level: int,
|
| 160 |
+
stt_model: str,
|
| 161 |
+
sample_id: str,
|
| 162 |
+
progress=gr.Progress(),
|
| 163 |
+
) -> tuple[Any, str, str, str, str, str, str]:
|
| 164 |
+
_safe_progress(progress, 0.00, "Starting...")
|
| 165 |
+
|
| 166 |
+
if sample is None or len(sample) == 0:
|
| 167 |
+
gr.Warning("No audio to enhance. Please upload a file first.")
|
| 168 |
+
return _empty_pipeline_result(sample_id)
|
| 169 |
+
|
| 170 |
+
_safe_progress(progress, 0.05, "Initializing enhancement...")
|
| 171 |
+
chunk_size = _init_sdk(sample_rate, enhancement_level)
|
| 172 |
+
|
| 173 |
+
try:
|
| 174 |
+
streamer_noisy, streamer_enhanced = _init_streamers(
|
| 175 |
+
sample_rate=sample_rate,
|
| 176 |
+
stt_model=stt_model,
|
| 177 |
+
sample_id=sample_id,
|
| 178 |
+
progress=progress,
|
| 179 |
+
)
|
| 180 |
+
except Exception as e:
|
| 181 |
+
raise RuntimeError(f"Failed to initialize STT streaming: {e}") from e
|
| 182 |
+
|
| 183 |
+
enhanced_array, vad_timestamps = _process_audio_chunks(
|
| 184 |
+
sample=sample,
|
| 185 |
+
sample_rate=sample_rate,
|
| 186 |
+
chunk_size=chunk_size,
|
| 187 |
+
streamer_noisy=streamer_noisy,
|
| 188 |
+
streamer_enhanced=streamer_enhanced,
|
| 189 |
+
progress=progress,
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
_safe_progress(progress, 0.72, "Finalizing transcripts...")
|
| 193 |
+
noisy_transcript = _finalize_stream_transcript(streamer_noisy)
|
| 194 |
+
_safe_progress(progress, 0.80, "Finalizing transcripts...")
|
| 195 |
+
enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
|
| 196 |
+
|
| 197 |
+
_safe_progress(progress, 0.94, "Loading reference transcript...")
|
| 198 |
try:
|
| 199 |
original_transcript = get_transcript(sample_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
except Exception:
|
| 201 |
+
original_transcript = "Unavailable"
|
| 202 |
+
if original_transcript != "Unavailable":
|
| 203 |
+
_safe_progress(progress, 0.96, "Computing WER...")
|
| 204 |
+
noisy_transcript, enhanced_transcript = _attach_wer(
|
| 205 |
+
original_transcript=original_transcript,
|
| 206 |
+
noisy_transcript=noisy_transcript,
|
| 207 |
+
enhanced_transcript=enhanced_transcript,
|
| 208 |
+
)
|
| 209 |
+
|
| 210 |
+
_safe_progress(progress, 0.99, "Generating outputs...")
|
| 211 |
+
gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
|
| 212 |
+
enhanced_spec_path, noisy_spec_path = _save_spectrograms(
|
| 213 |
+
sample=sample,
|
| 214 |
+
enhanced_array=enhanced_array,
|
| 215 |
+
sample_rate=sample_rate,
|
| 216 |
+
sample_id=sample_id,
|
| 217 |
+
vad_timestamps=vad_timestamps
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
vad_labels = get_vad_labels(
|
| 221 |
+
vad_timestamps,
|
| 222 |
+
length=len(sample) / sample_rate,
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
_safe_progress(progress, 1.00, "Done.")
|
| 226 |
|
| 227 |
return (
|
| 228 |
+
gr.update(value=gradio_enhanced_audio, subtitles=vad_labels),
|
| 229 |
enhanced_spec_path,
|
| 230 |
+
noisy_spec_path,
|
| 231 |
+
original_transcript,
|
| 232 |
noisy_transcript,
|
| 233 |
+
enhanced_transcript,
|
| 234 |
sample_id,
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
+
|
| 238 |
def load_local_file(
|
| 239 |
sample_path: str,
|
| 240 |
normalize: bool = True,
|
| 241 |
+
) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
|
| 242 |
if not sample_path or not os.path.exists(sample_path):
|
| 243 |
+
return None, "", None, None
|
| 244 |
+
|
| 245 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 246 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 247 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 248 |
+
|
| 249 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 250 |
+
y, sample_rate = librosa.load(sample_path, sr=None, mono=True)
|
| 251 |
+
sample_rate = int(sample_rate)
|
| 252 |
+
y = np.asarray(y, dtype=np.float32)
|
| 253 |
if normalize:
|
| 254 |
y = normalize_lufs(y, sample_rate)
|
| 255 |
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 256 |
return y, new_sample_stem, gradio_audio, sample_rate
|
| 257 |
|
| 258 |
+
|
| 259 |
+
def load_file_from_dataset(
|
| 260 |
+
sample_id: str,
|
| 261 |
+
) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
|
| 262 |
if not sample_id:
|
| 263 |
gr.Warning("Please select a sample from the dropdown.")
|
| 264 |
return None, None, "", None
|
| 265 |
+
|
| 266 |
new_sample_stem = sample_id
|
| 267 |
+
|
| 268 |
try:
|
| 269 |
y, sample_rate = get_audio(sample_id, prefix="mix")
|
| 270 |
+
except Exception as e:
|
| 271 |
+
gr.Warning(str(e))
|
| 272 |
+
raise
|
| 273 |
+
y = np.asarray(y, dtype=np.float32)
|
| 274 |
+
if y.ndim > 1:
|
| 275 |
+
y = np.mean(y, axis=0)
|
| 276 |
+
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 277 |
+
return gradio_audio, y, new_sample_stem, sample_rate
|
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import aic_sdk as aic
|
|
@@ -8,6 +9,24 @@ from constants import MODEL_ID
|
|
| 8 |
load_dotenv()
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class SDKWrapper:
|
| 12 |
def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
|
| 13 |
if os.getenv("AIC_SDK_KEY") is None:
|
|
@@ -16,25 +35,25 @@ class SDKWrapper:
|
|
| 16 |
model_path = aic.Model.download(model_id, models_dir)
|
| 17 |
self.model = aic.Model.from_file(model_path)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
self.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
num_channels=num_channels,
|
| 27 |
num_frames=self.num_frames,
|
| 28 |
-
allow_variable_frames=allow_variable_frames,
|
| 29 |
)
|
| 30 |
-
if sync:
|
| 31 |
-
processor = aic.Processor(self.model, self.sdk_key,
|
| 32 |
else:
|
| 33 |
-
processor = aic.ProcessorAsync(self.model, self.sdk_key,
|
| 34 |
-
processor.get_processor_context().set_parameter(
|
| 35 |
-
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 36 |
)
|
| 37 |
-
self.
|
|
|
|
| 38 |
|
| 39 |
def change_enhancement_level(self, enhancement_level: float):
|
| 40 |
if not hasattr(self, "processor"):
|
|
@@ -42,6 +61,7 @@ class SDKWrapper:
|
|
| 42 |
self.processor.get_processor_context().set_parameter(
|
| 43 |
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 44 |
)
|
|
|
|
| 45 |
|
| 46 |
def _check_shape(self, audio: np.ndarray) -> np.ndarray:
|
| 47 |
if len(audio.shape) == 1:
|
|
@@ -50,15 +70,17 @@ class SDKWrapper:
|
|
| 50 |
raise ValueError("Expected audio with shape (n, frames)")
|
| 51 |
return audio
|
| 52 |
|
| 53 |
-
def
|
| 54 |
self,
|
| 55 |
audio: np.ndarray,
|
| 56 |
-
) -> np.ndarray:
|
| 57 |
"""
|
| 58 |
audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
|
| 59 |
"""
|
| 60 |
audio = self._check_shape(audio)
|
| 61 |
out = np.zeros_like(audio)
|
|
|
|
|
|
|
| 62 |
chunk_size = self.num_frames
|
| 63 |
n = audio.shape[1]
|
| 64 |
for i in range(0, n, chunk_size):
|
|
@@ -72,7 +94,11 @@ class SDKWrapper:
|
|
| 72 |
break
|
| 73 |
enhanced = self.processor.process(chunk)
|
| 74 |
out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
def process_chunk(self, audio: np.ndarray) -> np.ndarray:
|
| 78 |
audio = self._check_shape(audio)
|
|
|
|
| 1 |
+
|
| 2 |
import numpy as np
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
import aic_sdk as aic
|
|
|
|
| 9 |
load_dotenv()
|
| 10 |
|
| 11 |
|
| 12 |
+
class SDKParams:
|
| 13 |
+
def __init__(
|
| 14 |
+
self,
|
| 15 |
+
sample_rate: int = 16000,
|
| 16 |
+
enhancement_level: float = 1.0,
|
| 17 |
+
allow_variable_frames: bool = False,
|
| 18 |
+
num_channels: int = 1,
|
| 19 |
+
sync: bool = True,
|
| 20 |
+
num_frames: int | None = None,
|
| 21 |
+
|
| 22 |
+
):
|
| 23 |
+
self.sample_rate = sample_rate
|
| 24 |
+
self.enhancement_level = enhancement_level
|
| 25 |
+
self.allow_variable_frames = allow_variable_frames
|
| 26 |
+
self.num_channels = num_channels
|
| 27 |
+
self.sync = sync
|
| 28 |
+
self.num_frames = num_frames # to be set after processor init
|
| 29 |
+
|
| 30 |
class SDKWrapper:
|
| 31 |
def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
|
| 32 |
if os.getenv("AIC_SDK_KEY") is None:
|
|
|
|
| 35 |
model_path = aic.Model.download(model_id, models_dir)
|
| 36 |
self.model = aic.Model.from_file(model_path)
|
| 37 |
|
| 38 |
+
def init_processor(self, sdk_params: SDKParams):
|
| 39 |
+
optimal_frames = self.model.get_optimal_num_frames(sdk_params.sample_rate)
|
| 40 |
+
self.num_frames = sdk_params.num_frames if sdk_params.num_frames else optimal_frames
|
| 41 |
+
self.sample_rate = sdk_params.sample_rate
|
| 42 |
+
aic_config = aic.ProcessorConfig(
|
| 43 |
+
sample_rate=sdk_params.sample_rate,
|
| 44 |
+
num_channels=sdk_params.num_channels,
|
|
|
|
| 45 |
num_frames=self.num_frames,
|
| 46 |
+
allow_variable_frames=sdk_params.allow_variable_frames,
|
| 47 |
)
|
| 48 |
+
if sdk_params.sync:
|
| 49 |
+
self.processor = aic.Processor(self.model, self.sdk_key, aic_config)
|
| 50 |
else:
|
| 51 |
+
self.processor = aic.ProcessorAsync(self.model, self.sdk_key, aic_config)
|
| 52 |
+
self.processor.get_processor_context().set_parameter(
|
| 53 |
+
aic.ProcessorParameter.EnhancementLevel, float(sdk_params.enhancement_level)
|
| 54 |
)
|
| 55 |
+
self.enhancement_level = sdk_params.enhancement_level
|
| 56 |
+
self.vad_context = self.processor.get_vad_context()
|
| 57 |
|
| 58 |
def change_enhancement_level(self, enhancement_level: float):
|
| 59 |
if not hasattr(self, "processor"):
|
|
|
|
| 61 |
self.processor.get_processor_context().set_parameter(
|
| 62 |
aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
|
| 63 |
)
|
| 64 |
+
self.enhancement_level = enhancement_level
|
| 65 |
|
| 66 |
def _check_shape(self, audio: np.ndarray) -> np.ndarray:
|
| 67 |
if len(audio.shape) == 1:
|
|
|
|
| 70 |
raise ValueError("Expected audio with shape (n, frames)")
|
| 71 |
return audio
|
| 72 |
|
| 73 |
+
def process_with_vad(
|
| 74 |
self,
|
| 75 |
audio: np.ndarray,
|
| 76 |
+
) -> tuple[np.ndarray, bool]:
|
| 77 |
"""
|
| 78 |
audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
|
| 79 |
"""
|
| 80 |
audio = self._check_shape(audio)
|
| 81 |
out = np.zeros_like(audio)
|
| 82 |
+
vad_per_sample = np.zeros_like(audio, dtype=bool)
|
| 83 |
+
vad_overall = False
|
| 84 |
chunk_size = self.num_frames
|
| 85 |
n = audio.shape[1]
|
| 86 |
for i in range(0, n, chunk_size):
|
|
|
|
| 94 |
break
|
| 95 |
enhanced = self.processor.process(chunk)
|
| 96 |
out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
|
| 97 |
+
if self.vad_context.is_speech_detected():
|
| 98 |
+
vad_per_sample[:, i : i + chunk_size] = True
|
| 99 |
+
if vad_per_sample.mean() > 0.5:
|
| 100 |
+
vad_overall = True
|
| 101 |
+
return out, vad_overall
|
| 102 |
|
| 103 |
def process_chunk(self, audio: np.ndarray) -> np.ndarray:
|
| 104 |
audio = self._check_shape(audio)
|
|
@@ -1,191 +1,166 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
import numpy as np
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
from stt_streamers import DeepgramStreamer
|
| 6 |
-
from sdk import SDKWrapper
|
| 7 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# ----------------------------
|
| 10 |
-
# Global transcript store (UI pulls from this)
|
| 11 |
-
# ----------------------------
|
| 12 |
-
_ENHANCED_TRANSCRIPT: str = ""
|
| 13 |
-
_RAW_TRANSCRIPT: str = ""
|
| 14 |
|
| 15 |
def _set_transcript_enhanced(text: str) -> None:
|
| 16 |
-
"""Deepgram callback: update latest transcript text (no printing)."""
|
| 17 |
global _ENHANCED_TRANSCRIPT
|
| 18 |
_ENHANCED_TRANSCRIPT = text
|
| 19 |
|
|
|
|
| 20 |
def _set_transcript_raw(text: str) -> None:
|
| 21 |
-
"""Deepgram callback: update latest transcript text (no printing)."""
|
| 22 |
global _RAW_TRANSCRIPT
|
| 23 |
_RAW_TRANSCRIPT = text
|
| 24 |
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _get_or_init_session(session: StreamSession | None, sr_in: int) -> StreamSession:
|
| 60 |
-
if session is None or session.sr_in != sr_in:
|
| 61 |
-
# ResampleStream ist für real-time processing gedacht citeturn8view0
|
| 62 |
-
resampler = None if sr_in == DEFAULT_SR else soxr.ResampleStream(sr_in, DEFAULT_SR, num_channels=1, dtype="float32")
|
| 63 |
-
return StreamSession(resampler=resampler, sr_in=sr_in, tail_16k=np.zeros((0,), dtype=np.float32), tail_max=10 * DEFAULT_SR)
|
| 64 |
-
return session
|
| 65 |
|
| 66 |
def _to_float32_mono(y: np.ndarray) -> np.ndarray:
|
| 67 |
-
# Gradio liefert int16 (oder (samples, channels)). citeturn1view4
|
| 68 |
y = np.asarray(y)
|
| 69 |
if y.ndim > 1:
|
| 70 |
y = y.mean(axis=1)
|
| 71 |
if y.dtype == np.int16:
|
| 72 |
-
y =
|
| 73 |
else:
|
| 74 |
y = y.astype(np.float32)
|
| 75 |
-
return y
|
| 76 |
|
| 77 |
|
| 78 |
-
def
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
| 81 |
or Streamer_raw is None
|
| 82 |
-
or Streamer_enhanced
|
| 83 |
-
or Streamer_raw
|
| 84 |
-
)
|
| 85 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 86 |
-
if new_chunk is None or new_chunk[1] is None:
|
| 87 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 88 |
-
|
| 89 |
-
sr, y = new_chunk
|
| 90 |
-
y = _to_float32_mono(y)
|
| 91 |
-
# Apply input gain: linear = 10^(dB/20), clip to avoid overflow
|
| 92 |
-
if input_gain_db is not None and input_gain_db > 0:
|
| 93 |
-
gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
|
| 94 |
-
y = (y * gain_linear).astype(np.float32)
|
| 95 |
-
y = np.clip(y, -1.0, 1.0)
|
| 96 |
-
|
| 97 |
-
session = _get_or_init_session(session, sr)
|
| 98 |
-
SDK.change_enhancement_level(float(enhancement_level) / 100.0)
|
| 99 |
-
if session.resampler is not None:
|
| 100 |
-
y_16k = session.resampler.resample_chunk(y)
|
| 101 |
-
else:
|
| 102 |
-
y_16k = y
|
| 103 |
-
|
| 104 |
-
# Ensure 1D float32 for SDK and streamers (resample_chunk can return 0 samples or 2D)
|
| 105 |
-
y_16k = np.asarray(y_16k, dtype=np.float32).flatten()
|
| 106 |
-
|
| 107 |
-
# Ringbuffer (nicht unendlich konkatenieren)
|
| 108 |
-
if y_16k.size > 0:
|
| 109 |
-
tail = np.concatenate([session.tail_16k, y_16k])
|
| 110 |
-
if tail.size > session.tail_max:
|
| 111 |
-
tail = tail[-session.tail_max:]
|
| 112 |
-
session.tail_16k = tail
|
| 113 |
-
|
| 114 |
-
# Only send when we have samples (resample_chunk can return empty; SDK needs valid input)
|
| 115 |
-
if y_16k.size == 0:
|
| 116 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 117 |
-
|
| 118 |
-
# Parallel path: send raw to STT immediately, then enhance and send enhanced.
|
| 119 |
-
# SDK requires fixed num_frames (AudioConfigMismatchError if we use process_chunk with variable size).
|
| 120 |
-
Streamer_raw.process_chunk(y_16k)
|
| 121 |
-
enhanced_chunk_16k = SDK.process_sync(y_16k)
|
| 122 |
-
out_1d = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
|
| 123 |
-
# Always send something to enhanced so Soniox doesn't close with "No audio received"
|
| 124 |
-
if out_1d.size > 0:
|
| 125 |
-
Streamer_enhanced.process_chunk(out_1d)
|
| 126 |
-
else:
|
| 127 |
-
Streamer_enhanced.process_chunk(np.zeros(160, dtype=np.float32))
|
| 128 |
-
return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
def shutdown_streamers(from_stop_recording: bool = False):
|
| 132 |
-
"""Shut down STT streamers. If from_stop_recording, skip when streamers were
|
| 133 |
-
created after the last stop (avoids delayed stop killing new streamers)."""
|
| 134 |
-
global Streamer_enhanced, Streamer_raw, _streamer_generation, _last_stop_generation
|
| 135 |
-
if from_stop_recording and _streamer_generation > _last_stop_generation:
|
| 136 |
-
return
|
| 137 |
-
gen = _streamer_generation
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
def on_stop_recording():
|
| 152 |
-
|
| 153 |
-
|
| 154 |
|
| 155 |
|
| 156 |
-
def clear_ui():
|
| 157 |
-
global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 158 |
-
_ENHANCED_TRANSCRIPT = ""
|
| 159 |
-
_RAW_TRANSCRIPT = ""
|
| 160 |
-
return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
|
| 161 |
-
|
| 162 |
-
def stop_online_backend():
|
| 163 |
-
"""Stop streamers and clear transcripts. Do not update the Audio component:
|
| 164 |
-
toggling streaming=False then back to True can make the frontend lose the
|
| 165 |
-
microphone (getUserMedia not re-called), so we leave it unchanged."""
|
| 166 |
-
shutdown_streamers()
|
| 167 |
-
session, enhanced_transcript, raw_transcript = clear_ui()
|
| 168 |
-
return session, enhanced_transcript, raw_transcript, gr.update()
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def set_stt_streamer(model_name):
|
| 172 |
-
StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
|
| 173 |
-
global Streamer_enhanced, Streamer_raw, _streamer_generation
|
| 174 |
-
# Shut down current streamers first so we don't leak
|
| 175 |
-
if Streamer_enhanced is not None or Streamer_raw is not None:
|
| 176 |
-
shutdown_streamers()
|
| 177 |
-
# Create both before assigning so transcribe_stream never sees one new and one old
|
| 178 |
-
new_enhanced = StreamerCls(
|
| 179 |
-
fs_hz=DEFAULT_SR,
|
| 180 |
-
stream_name="enhanced",
|
| 181 |
-
on_update=_set_transcript_enhanced,
|
| 182 |
-
)
|
| 183 |
-
new_raw = StreamerCls(
|
| 184 |
-
fs_hz=DEFAULT_SR,
|
| 185 |
-
stream_name="raw",
|
| 186 |
-
on_update=_set_transcript_raw,
|
| 187 |
-
)
|
| 188 |
-
_streamer_generation += 1
|
| 189 |
-
Streamer_enhanced = new_enhanced
|
| 190 |
-
Streamer_raw = new_raw
|
| 191 |
-
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
+
from constants import STREAMER_CLASSES
|
| 3 |
+
import gradio as gr
|
| 4 |
from stt_streamers import DeepgramStreamer
|
| 5 |
+
from sdk import SDKWrapper, SDKParams
|
| 6 |
+
from ui import LED_DOT_BLACK, LED_DOT_GREEN, LED_DOT_OFF, LED_DOT_RED, LED_DOT_YELLOW
|
| 7 |
+
|
| 8 |
+
_ENHANCED_TRANSCRIPT = ""
|
| 9 |
+
_RAW_TRANSCRIPT = ""
|
| 10 |
+
|
| 11 |
+
SDK_STREAMING = SDKWrapper()
|
| 12 |
+
Streamer_enhanced = None
|
| 13 |
+
Streamer_raw = None
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def _set_transcript_enhanced(text: str) -> None:
|
|
|
|
| 17 |
global _ENHANCED_TRANSCRIPT
|
| 18 |
_ENHANCED_TRANSCRIPT = text
|
| 19 |
|
| 20 |
+
|
| 21 |
def _set_transcript_raw(text: str) -> None:
|
|
|
|
| 22 |
global _RAW_TRANSCRIPT
|
| 23 |
_RAW_TRANSCRIPT = text
|
| 24 |
|
| 25 |
|
| 26 |
+
def clear_live_transcripts():
    """Reset both cached live transcripts to empty strings."""
    global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
    _ENHANCED_TRANSCRIPT = ""
    _RAW_TRANSCRIPT = ""
|
| 30 |
|
| 31 |
|
| 32 |
+
def render_system_status(status: str):
    """Translate a status keyword into its (LED html, label) pair.

    Args:
        status: One of "off", "init", "ready", "error".

    Raises:
        ValueError: if *status* is not a recognized keyword.
    """
    led_and_label = {
        "off": (LED_DOT_OFF, "Off"),
        "init": (LED_DOT_YELLOW, "Initializing..."),
        "ready": (LED_DOT_GREEN, "Ready"),
        "error": (LED_DOT_RED, "Error. Please refresh the page."),
    }
    if status not in led_and_label:
        raise ValueError(f"Invalid status: {status}")
    return led_and_label[status]
|
| 42 |
+
|
| 43 |
+
def shutdown_streamers():
    """Shut down both live STT streamers (if any) and clear the globals.

    Each streamer is shut down independently so that a failure in one does
    not prevent the other from being released; errors are logged to stdout,
    never raised. Globals are always reset afterwards.
    """
    global Streamer_enhanced, Streamer_raw
    for streamer in (Streamer_enhanced, Streamer_raw):
        if streamer is None:
            continue
        try:
            streamer.shutdown()
        except Exception as e:
            # Best-effort cleanup: log and keep going so the other streamer
            # still gets shut down and the globals still get cleared.
            print(f"Error shutting down streamers: {e}")
    Streamer_enhanced = None
    Streamer_raw = None
|
| 55 |
|
| 56 |
+
|
| 57 |
+
def set_stt_streamer(sample_rate: int, stt_model: str):
    """(Re)create the enhanced/raw STT streamer pair for *stt_model*.

    Args:
        sample_rate: Audio sample rate (Hz) passed to the streamer as fs_hz.
        stt_model: Key into STREAMER_CLASSES; unknown names fall back to
            DeepgramStreamer.

    Raises:
        RuntimeError: if either streamer fails to initialize; in that case
            both globals are reset to None and any half-created streamer is
            shut down so its connection does not leak.
    """
    global Streamer_enhanced, Streamer_raw
    StreamerCls = STREAMER_CLASSES.get(stt_model, DeepgramStreamer)
    new_enhanced = None
    try:
        new_enhanced = StreamerCls(
            fs_hz=sample_rate,
            stream_name="Enhanced",
            on_update=_set_transcript_enhanced,
        )
        new_raw = StreamerCls(
            fs_hz=sample_rate,
            stream_name="Raw",
            on_update=_set_transcript_raw,
        )
    except Exception as e:
        # If the second constructor failed, release the first streamer so we
        # don't leak its connection/threads.
        if new_enhanced is not None:
            try:
                new_enhanced.shutdown()
            except Exception:
                pass
        Streamer_enhanced = None
        Streamer_raw = None
        raise RuntimeError(f"Error initializing STT streamer '{stt_model}': {e}") from e
    # Assign only after both succeeded so callers never see a mixed pair.
    Streamer_enhanced = new_enhanced
    Streamer_raw = new_raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
def _to_float32_mono(y: np.ndarray) -> np.ndarray:
|
|
|
|
| 77 |
y = np.asarray(y)
|
| 78 |
if y.ndim > 1:
|
| 79 |
y = y.mean(axis=1)
|
| 80 |
if y.dtype == np.int16:
|
| 81 |
+
y = y.astype(np.float32) / 32768.0
|
| 82 |
else:
|
| 83 |
y = y.astype(np.float32)
|
| 84 |
+
return np.asarray(y, dtype=np.float32).flatten()
|
| 85 |
|
| 86 |
|
| 87 |
+
def _ensure_initialized(sr: int, streaming_sr, stt_model: str, enhancement_level: float):
    """Make sure the SDK processor and STT streamers match the current settings.

    A full re-init happens when the sample rate changed, no streamers exist
    yet, or the selected model's streamer class differs from the live pair.
    A changed enhancement level alone is applied in place.

    Returns:
        (streaming_sr, led_html, status_text); streaming_sr is None when
        initialization failed (a gr.Warning is shown in that case).
    """
    # Match set_stt_streamer's fallback: an unknown model name falls back to
    # DeepgramStreamer instead of raising KeyError inside the stream callback.
    streamer_cls = STREAMER_CLASSES.get(stt_model, DeepgramStreamer)
    needs_init = (
        streaming_sr is None
        or streaming_sr != sr
        or Streamer_enhanced is None
        or Streamer_raw is None
        or not isinstance(Streamer_enhanced, streamer_cls)
        or not isinstance(Streamer_raw, streamer_cls)
    )

    if not needs_init:
        # Cheap path: only the enhancement level may have drifted.
        if SDK_STREAMING.enhancement_level != enhancement_level:
            SDK_STREAMING.change_enhancement_level(enhancement_level)
        return streaming_sr, *render_system_status("ready")

    try:
        shutdown_streamers()
        sdk_params = SDKParams(
            sample_rate=sr,
            enhancement_level=enhancement_level,
        )
        SDK_STREAMING.init_processor(sdk_params)
        set_stt_streamer(sr, stt_model)
        return sr, *render_system_status("ready")
    except Exception as e:
        gr.Warning(f"Streaming process failed: {e}")
        return None, *render_system_status("error")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def stream_step(audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db):
    """Process one microphone chunk: enhance, run VAD, feed both STT streamers.

    Args:
        audio_stream: Gradio stream payload — a (sample_rate, chunk) tuple,
            or None when no audio is flowing.
        streaming_sr: Sample rate the pipeline was initialized with (or None).
        stt_model: Selected STT model name (key into STREAMER_CLASSES).
        enhancement_level: Enhancement slider value in percent (0-100).
        input_gain_db: Optional input gain in dB; values <= 0 are ignored.

    Returns:
        (streaming_sr, system_led, system_text, enhanced_transcript,
        raw_transcript, vad_led).
    """
    if audio_stream is None:
        return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    sr, chunk = audio_stream
    if chunk is None:
        return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    enhancement_level_float = enhancement_level / 100.0

    streaming_sr, system_led, system_text = _ensure_initialized(
        sr=sr,
        streaming_sr=streaming_sr,
        stt_model=stt_model,
        enhancement_level=enhancement_level_float,
    )

    # If initialization failed, _ensure_initialized already warned and
    # returned the error status; bail out here instead of hitting the
    # missing streamers below and emitting a second, redundant warning.
    if streaming_sr is None or Streamer_enhanced is None or Streamer_raw is None:
        return streaming_sr, system_led, system_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF

    try:
        y = _to_float32_mono(chunk)

        if input_gain_db and input_gain_db > 0:
            gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
            y = np.clip(y * gain_linear, -1.0, 1.0).astype(np.float32)

        enhanced_chunk_16k, vad_detected = SDK_STREAMING.process_with_vad(y)
        enhanced_chunk_16k = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()

        # Raw audio goes to the raw streamer; enhanced output to the other.
        Streamer_raw.process_chunk(y)
        Streamer_enhanced.process_chunk(enhanced_chunk_16k)

        vad_led = LED_DOT_GREEN if vad_detected else LED_DOT_BLACK
        return streaming_sr, system_led, system_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led

    except Exception as e:
        gr.Warning(f"Streaming process failed: {e}")
        err_led, err_text = render_system_status("error")
        return streaming_sr, err_led, err_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def on_start_recording():
    """Reset transcripts and show the 'Initializing...' status on record start.

    Returns empty strings for the two transcript boxes plus the LED markup
    and label from render_system_status("init").
    """
    clear_live_transcripts()
    led, text = render_system_status("init")
    return "", "", led, text
|
| 160 |
+
|
| 161 |
+
|
| 162 |
def on_stop_recording():
    """Switch status indicators to 'Off' when recording stops.

    Returns the LED markup twice — presumably the system LED and the VAD
    LED — plus the status text and None (likely clearing the streaming
    sample-rate state; TODO confirm against the event wiring in app.py).
    """
    led, text = render_system_status("off")
    return led, led, text, None
|
| 165 |
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -15,7 +15,8 @@ class DeepgramStreamer:
|
|
| 15 |
api_key = os.environ.get("DEEPGRAM_API_KEY")
|
| 16 |
if not api_key:
|
| 17 |
raise RuntimeError("Missing DEEPGRAM_API_KEY.")
|
| 18 |
-
|
|
|
|
| 19 |
self.stream_name = stream_name
|
| 20 |
self.api_name = "Deepgram V1 Nova-3"
|
| 21 |
self.on_update = on_update
|
|
@@ -210,7 +211,6 @@ class DeepgramStreamer:
|
|
| 210 |
self.thread.join(timeout=1.0)
|
| 211 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 212 |
self.keepalive_thread.join(timeout=1.0)
|
| 213 |
-
self.ws = None
|
| 214 |
print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
|
| 215 |
|
| 216 |
|
|
|
|
| 15 |
api_key = os.environ.get("DEEPGRAM_API_KEY")
|
| 16 |
if not api_key:
|
| 17 |
raise RuntimeError("Missing DEEPGRAM_API_KEY.")
|
| 18 |
+
if not fs_hz:
|
| 19 |
+
raise ValueError("Sample rate (fs_hz) must be specified.")
|
| 20 |
self.stream_name = stream_name
|
| 21 |
self.api_name = "Deepgram V1 Nova-3"
|
| 22 |
self.on_update = on_update
|
|
|
|
| 211 |
self.thread.join(timeout=1.0)
|
| 212 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 213 |
self.keepalive_thread.join(timeout=1.0)
|
|
|
|
| 214 |
print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
|
| 215 |
|
| 216 |
|
|
@@ -13,7 +13,8 @@ class SonioxStreamer:
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
| 16 |
-
|
|
|
|
| 17 |
self.stream_name = stream_name
|
| 18 |
self.api_name = "Soniox RT"
|
| 19 |
self.on_update = on_update
|
|
@@ -207,6 +208,4 @@ class SonioxStreamer:
|
|
| 207 |
self.thread.join(timeout=1.0)
|
| 208 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 209 |
self.keepalive_thread.join(timeout=1.0)
|
| 210 |
-
|
| 211 |
-
self.ws = None
|
| 212 |
print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
|
|
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
| 16 |
+
if not fs_hz:
|
| 17 |
+
raise ValueError("Sample rate (fs_hz) must be specified.")
|
| 18 |
self.stream_name = stream_name
|
| 19 |
self.api_name = "Soniox RT"
|
| 20 |
self.on_update = on_update
|
|
|
|
| 208 |
self.thread.join(timeout=1.0)
|
| 209 |
if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
|
| 210 |
self.keepalive_thread.join(timeout=1.0)
|
|
|
|
|
|
|
| 211 |
print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HTML snippets rendered into the UI as colored status-indicator dots.
# The led-dot / led-dot--<color> classes are presumably defined in the
# project's stylesheet (assets/styling.css) — confirm there before renaming.
LED_DOT_RED = """
<div class="led-dot led-dot--red"></div>
"""

LED_DOT_BLACK = """
<div class="led-dot led-dot--black"></div>
"""

LED_DOT_GREEN = """
<div class="led-dot led-dot--green"></div>
"""

LED_DOT_OFF = """
<div class="led-dot led-dot--off"></div>
"""


LED_DOT_YELLOW = """
<div class="led-dot led-dot--yellow"></div>
"""
|
|
@@ -1,50 +1,99 @@
|
|
| 1 |
from typing import Optional
|
| 2 |
-
import numpy as np
|
| 3 |
-
import librosa
|
| 4 |
-
from PIL import Image
|
| 5 |
import io
|
| 6 |
-
import matplotlib.pyplot as plt
|
| 7 |
-
from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
|
| 8 |
import warnings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import pyloudnorm as pyln
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 13 |
-
"""Return (sample_rate, int16
|
| 14 |
-
passing float32 triggers an internal conversion and a warning."""
|
| 15 |
x = np.asarray(x)
|
| 16 |
-
|
| 17 |
-
# Remove extra dims like (1, n, 1) etc.
|
| 18 |
x = np.squeeze(x)
|
| 19 |
|
| 20 |
-
# If it's (channels, samples), transpose to (samples, channels)
|
| 21 |
if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
|
| 22 |
x = x.T
|
| 23 |
|
| 24 |
-
# Ensure mono is (n_samples,)
|
| 25 |
if x.ndim == 2 and x.shape[1] == 1:
|
| 26 |
x = x[:, 0]
|
| 27 |
|
| 28 |
x = x.astype(np.float32)
|
| 29 |
x = np.clip(x, -1.0, 1.0)
|
| 30 |
-
# Gradio Audio expects int16; convert here so Gradio doesn't convert and warn
|
| 31 |
x = (x * 32767).astype(np.int16)
|
| 32 |
|
| 33 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def spec_image(
|
| 37 |
audio_array: np.ndarray,
|
| 38 |
-
sr: int
|
| 39 |
n_fft: int = 2048,
|
| 40 |
hop_length: int = 512,
|
| 41 |
n_mels: int = 128,
|
| 42 |
fmax: Optional[float] = None,
|
|
|
|
| 43 |
) -> Image.Image:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
"""
|
| 47 |
-
y = audio_array.flatten() # Ensure it's 1D
|
| 48 |
S = librosa.feature.melspectrogram(
|
| 49 |
y=y,
|
| 50 |
sr=sr,
|
|
@@ -53,20 +102,66 @@ def spec_image(
|
|
| 53 |
n_mels=n_mels,
|
| 54 |
fmax=fmax or sr // 2,
|
| 55 |
)
|
| 56 |
-
S_db = librosa.power_to_db(S, ref=np.max
|
|
|
|
| 57 |
fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
|
|
|
|
| 58 |
img = librosa.display.specshow(
|
| 59 |
-
S_db,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
| 62 |
cbar.set_label("dB")
|
|
|
|
| 63 |
ax.set_title("Mel-spectrogram")
|
| 64 |
ax.set_xlabel("Time in s")
|
| 65 |
ax.set_ylabel("Frequency in Hz")
|
|
|
|
| 66 |
fig.tight_layout(pad=0.2)
|
|
|
|
| 67 |
buf = io.BytesIO()
|
| 68 |
fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
|
| 69 |
plt.close(fig)
|
|
|
|
| 70 |
buf.seek(0)
|
| 71 |
return Image.open(buf).convert("RGB")
|
| 72 |
|
|
@@ -77,24 +172,24 @@ def compute_wer(reference: str, hypothesis: str) -> float:
|
|
| 77 |
"""
|
| 78 |
ref_words = reference.split()
|
| 79 |
hyp_words = hypothesis.split()
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
for i in range(len(ref_words) + 1):
|
| 82 |
d[i][0] = i
|
| 83 |
for j in range(len(hyp_words) + 1):
|
| 84 |
d[0][j] = j
|
|
|
|
| 85 |
for i in range(1, len(ref_words) + 1):
|
| 86 |
for j in range(1, len(hyp_words) + 1):
|
| 87 |
-
if ref_words[i - 1] == hyp_words[j - 1]
|
| 88 |
-
cost = 0
|
| 89 |
-
else:
|
| 90 |
-
cost = 1
|
| 91 |
d[i][j] = min(
|
| 92 |
-
d[i - 1][j] + 1,
|
| 93 |
-
d[i][j - 1] + 1,
|
| 94 |
-
d[i - 1][j - 1] + cost,
|
| 95 |
)
|
| 96 |
-
|
| 97 |
-
return
|
| 98 |
|
| 99 |
|
| 100 |
def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
@@ -102,7 +197,11 @@ def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
| 102 |
return float(meter.integrated_loudness(x))
|
| 103 |
|
| 104 |
|
| 105 |
-
def true_peak_limiter(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
upsampled_sr = 192000
|
| 107 |
x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
|
| 108 |
true_peak = np.max(np.abs(x_upsampled))
|
|
@@ -116,7 +215,7 @@ def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP)
|
|
| 116 |
|
| 117 |
x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
|
| 118 |
x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
|
| 119 |
-
return x_limited.astype(
|
| 120 |
|
| 121 |
|
| 122 |
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
@@ -125,9 +224,9 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
| 125 |
"""
|
| 126 |
try:
|
| 127 |
current_lufs = measure_loudness(x, sr)
|
| 128 |
-
|
| 129 |
if not np.isfinite(current_lufs):
|
| 130 |
-
return x.astype(
|
| 131 |
|
| 132 |
gain_db = TARGET_LOUDNESS - current_lufs
|
| 133 |
gain = 10 ** (gain_db / 20)
|
|
@@ -135,7 +234,7 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
| 135 |
y = x * gain
|
| 136 |
y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
|
| 137 |
|
| 138 |
-
return y.astype(
|
| 139 |
except Exception as e:
|
| 140 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 141 |
-
return x.astype(
|
|
|
|
| 1 |
from typing import Optional
|
|
|
|
|
|
|
|
|
|
| 2 |
import io
|
|
|
|
|
|
|
| 3 |
import warnings
|
| 4 |
+
|
| 5 |
+
import librosa
|
| 6 |
+
import librosa.display
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import numpy as np
|
| 9 |
import pyloudnorm as pyln
|
| 10 |
+
from matplotlib.patches import Patch
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
    """Build subtitle entries marking voiced and silent spans of the audio.

    Args:
        vad_timestamps: [start, end] pairs of voice activity, assumed sorted
            and non-overlapping (TODO confirm with the VAD producer).
        length: Total audio duration in seconds.

    Returns:
        Subtitle dicts of the form {"text": ..., "timestamp": [t0, t1]}
        covering [0, length] with alternating on/off labels.
    """
    def _entry(state, t0, t1):
        return {"text": f"Voice Detection: {state}", "timestamp": [t0, t1]}

    labels: list[dict] = []
    cursor = 0.0

    for seg_start, seg_end in vad_timestamps:
        # Fill any silent gap before this voiced segment.
        if seg_start > cursor:
            labels.append(_entry(VAD_OFF, cursor, seg_start))
        labels.append(_entry(VAD_ON, seg_start, seg_end))
        cursor = seg_end

    # Trailing silence after the last voiced segment.
    if cursor < length:
        labels.append(_entry(VAD_OFF, cursor, length))

    return labels
|
| 46 |
|
| 47 |
|
| 48 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
    """Package *x* as an int16 (sample_rate, samples) pair for gr.Audio.

    Converting to int16 here keeps Gradio from doing its own conversion
    (and emitting a warning). Handles squeezing extra dims, transposing
    channel-first stereo, and collapsing a singleton channel axis.
    """
    audio = np.squeeze(np.asarray(x))

    # (channels, samples) -> (samples, channels)
    channel_first = (
        audio.ndim == 2
        and audio.shape[0] in (1, 2)
        and audio.shape[1] > audio.shape[0]
    )
    if channel_first:
        audio = audio.T

    # Collapse a trailing singleton channel axis so mono is (n_samples,).
    if audio.ndim == 2 and audio.shape[1] == 1:
        audio = audio[:, 0]

    clipped = np.clip(audio.astype(np.float32), -1.0, 1.0)
    return sr, (clipped * 32767).astype(np.int16)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _merge_vad_segments(
|
| 67 |
+
vad_timestamps: list[list[float]],
|
| 68 |
+
gap_tolerance: float = 0.05,
|
| 69 |
+
) -> list[tuple[float, float]]:
|
| 70 |
+
if not vad_timestamps:
|
| 71 |
+
return []
|
| 72 |
+
|
| 73 |
+
segments = sorted((float(start), float(end)) for start, end in vad_timestamps)
|
| 74 |
+
merged: list[tuple[float, float]] = [segments[0]]
|
| 75 |
+
|
| 76 |
+
for start, end in segments[1:]:
|
| 77 |
+
last_start, last_end = merged[-1]
|
| 78 |
+
if start <= last_end + gap_tolerance:
|
| 79 |
+
merged[-1] = (last_start, max(last_end, end))
|
| 80 |
+
else:
|
| 81 |
+
merged.append((start, end))
|
| 82 |
+
|
| 83 |
+
return merged
|
| 84 |
|
| 85 |
|
| 86 |
def spec_image(
|
| 87 |
audio_array: np.ndarray,
|
| 88 |
+
sr: int,
|
| 89 |
n_fft: int = 2048,
|
| 90 |
hop_length: int = 512,
|
| 91 |
n_mels: int = 128,
|
| 92 |
fmax: Optional[float] = None,
|
| 93 |
+
vad_timestamps: Optional[list[list[float]]] = None,
|
| 94 |
) -> Image.Image:
|
| 95 |
+
y = np.asarray(audio_array, dtype=np.float32).flatten()
|
| 96 |
+
|
|
|
|
|
|
|
| 97 |
S = librosa.feature.melspectrogram(
|
| 98 |
y=y,
|
| 99 |
sr=sr,
|
|
|
|
| 102 |
n_mels=n_mels,
|
| 103 |
fmax=fmax or sr // 2,
|
| 104 |
)
|
| 105 |
+
S_db = librosa.power_to_db(S, ref=np.max)
|
| 106 |
+
|
| 107 |
fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
|
| 108 |
+
|
| 109 |
img = librosa.display.specshow(
|
| 110 |
+
S_db,
|
| 111 |
+
sr=sr,
|
| 112 |
+
hop_length=hop_length,
|
| 113 |
+
x_axis="time",
|
| 114 |
+
y_axis="mel",
|
| 115 |
+
cmap="magma",
|
| 116 |
+
ax=ax,
|
| 117 |
)
|
| 118 |
+
|
| 119 |
+
if vad_timestamps:
|
| 120 |
+
vad_color = "#22C55E" # softer, cleaner green
|
| 121 |
+
merged_segments = _merge_vad_segments(vad_timestamps, gap_tolerance=0.05)
|
| 122 |
+
|
| 123 |
+
# Draw VAD bar as a fixed portion of the figure height (e.g., 4% of axes height)
|
| 124 |
+
bar_height_axes = 0.05 # 2% of axes height
|
| 125 |
+
bar_bottom_axes = 0.0 # 0% above the bottom
|
| 126 |
+
|
| 127 |
+
for start, end in merged_segments:
|
| 128 |
+
ax.fill_between(
|
| 129 |
+
[start, end],
|
| 130 |
+
[bar_bottom_axes, bar_bottom_axes],
|
| 131 |
+
[bar_bottom_axes + bar_height_axes, bar_bottom_axes + bar_height_axes],
|
| 132 |
+
color=vad_color,
|
| 133 |
+
alpha=0.95,
|
| 134 |
+
linewidth=0,
|
| 135 |
+
zorder=5,
|
| 136 |
+
transform=ax.get_xaxis_transform(),
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
vad_patch = Patch(
|
| 140 |
+
facecolor=vad_color,
|
| 141 |
+
edgecolor=vad_color,
|
| 142 |
+
label="Voice Activity",
|
| 143 |
+
)
|
| 144 |
+
ax.legend(
|
| 145 |
+
handles=[vad_patch],
|
| 146 |
+
loc="upper right",
|
| 147 |
+
fontsize=8,
|
| 148 |
+
frameon=True,
|
| 149 |
+
framealpha=0.9,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
| 153 |
cbar.set_label("dB")
|
| 154 |
+
|
| 155 |
ax.set_title("Mel-spectrogram")
|
| 156 |
ax.set_xlabel("Time in s")
|
| 157 |
ax.set_ylabel("Frequency in Hz")
|
| 158 |
+
|
| 159 |
fig.tight_layout(pad=0.2)
|
| 160 |
+
|
| 161 |
buf = io.BytesIO()
|
| 162 |
fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
|
| 163 |
plt.close(fig)
|
| 164 |
+
|
| 165 |
buf.seek(0)
|
| 166 |
return Image.open(buf).convert("RGB")
|
| 167 |
|
|
|
|
| 172 |
"""
|
| 173 |
ref_words = reference.split()
|
| 174 |
hyp_words = hypothesis.split()
|
| 175 |
+
|
| 176 |
+
d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint16)
|
| 177 |
+
|
| 178 |
for i in range(len(ref_words) + 1):
|
| 179 |
d[i][0] = i
|
| 180 |
for j in range(len(hyp_words) + 1):
|
| 181 |
d[0][j] = j
|
| 182 |
+
|
| 183 |
for i in range(1, len(ref_words) + 1):
|
| 184 |
for j in range(1, len(hyp_words) + 1):
|
| 185 |
+
cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
|
|
|
|
|
|
|
|
|
|
| 186 |
d[i][j] = min(
|
| 187 |
+
d[i - 1][j] + 1,
|
| 188 |
+
d[i][j - 1] + 1,
|
| 189 |
+
d[i - 1][j - 1] + cost,
|
| 190 |
)
|
| 191 |
+
|
| 192 |
+
return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
|
| 193 |
|
| 194 |
|
| 195 |
def measure_loudness(x: np.ndarray, sr: int) -> float:
|
|
|
|
| 197 |
return float(meter.integrated_loudness(x))
|
| 198 |
|
| 199 |
|
| 200 |
+
def true_peak_limiter(
|
| 201 |
+
x: np.ndarray,
|
| 202 |
+
sr: int,
|
| 203 |
+
max_true_peak: float = TARGET_TP,
|
| 204 |
+
) -> np.ndarray:
|
| 205 |
upsampled_sr = 192000
|
| 206 |
x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
|
| 207 |
true_peak = np.max(np.abs(x_upsampled))
|
|
|
|
| 215 |
|
| 216 |
x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
|
| 217 |
x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
|
| 218 |
+
return x_limited.astype(np.float32)
|
| 219 |
|
| 220 |
|
| 221 |
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
|
|
| 224 |
"""
|
| 225 |
try:
|
| 226 |
current_lufs = measure_loudness(x, sr)
|
| 227 |
+
|
| 228 |
if not np.isfinite(current_lufs):
|
| 229 |
+
return x.astype(np.float32)
|
| 230 |
|
| 231 |
gain_db = TARGET_LOUDNESS - current_lufs
|
| 232 |
gain = 10 ** (gain_db / 20)
|
|
|
|
| 234 |
y = x * gain
|
| 235 |
y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
|
| 236 |
|
| 237 |
+
return y.astype(np.float32)
|
| 238 |
except Exception as e:
|
| 239 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 240 |
+
return x.astype(np.float32)
|