Spaces:

ai-coustics
/

VoiceFocus

Running on CPU Upgrade

App Files Files Community

mariesig committed 22 days ago

Commit

4e945b9

1 Parent(s): be39c5b

refactor offline pipeline

Browse files

Files changed (4) hide show

app.py +16 -119
clean_up.py +2 -4
offline_pipeline.py +201 -89
utils.py +5 -0

app.py CHANGED Viewed

@@ -1,12 +1,10 @@
 import os
-import threading
-import time
 import gradio as gr
 from pathlib import Path
 from constants import STREAM_EVERY, APP_TMP_DIR
 from ui import LED_DOT_OFF
-from hf_dataset_utils import ALL_FILES, get_transcript
 from stream_pipeline import (
     on_start_recording,
@@ -17,114 +15,10 @@ from stream_pipeline import (
 from offline_pipeline import (
     load_file_from_dataset,
     load_local_file,
-    run_offline_pipeline_streaming,
 )
-from utils import spec_image
 from clean_up import purge_tmp_directory, cleanup_previous_run
-def process_with_live_transcript(
-    input_array,
-    enhancement_level,
-    sample_stem,
-    stt_model,
-    last_sample_stem,
-    current_sample_rate,
-):
-    """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
-    both transcripts stream from the first chunk so playback and transcription start immediately."""
-    progress_state = {}
-    result_holder = {}
-    def worker():
-        try:
-            result_holder["result"] = run_offline_pipeline_streaming(
-                input_array,
-                current_sample_rate,
-                enhancement_level,
-                sample_stem,
-                stt_model,
-                progress_state
-            )
-        except Exception as e:
-            result_holder["error"] = e
-    # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
-    _ = cleanup_previous_run(last_sample_stem)
-    noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
-    if input_array is not None:
-        try:
-            spec_image(input_array, sr = current_sample_rate).save(noisy_spec_path)
-        except Exception:
-            noisy_spec_path = None
-    else:
-        noisy_spec_path = None
-    try:
-        original_transcript = get_transcript(sample_stem)
-    except Exception:
-        original_transcript = "Unavailable"
-    yield (
-        gr.update(visible=True),
-        None,                # enhanced_audio: set only in final yield (smooth playback)
-        gr.update(value=None),  # enhanced_image: clear until step 3 (last)
-        gr.update(value=noisy_spec_path),  # noisy_image: input spectrogram (step 1)
-        original_transcript,
-        "",
-        "",
-        sample_stem,
-        None,
-        "",
-    )
-    # Let the UI render step 1 before we flood with polling updates
-    time.sleep(0.2)
-    thread = threading.Thread(target=worker, daemon=True)
-    thread.start()
-    poll_interval = 0.05
-    while "result" not in result_holder and "error" not in result_holder:
-        time.sleep(poll_interval)
-        # 2) Realtime: stream transcripts only; audio set in final yield for smooth playback
-        yield (
-            gr.update(visible=True),
-            gr.update(),  # enhanced_audio: set only in final yield, then autoplay
-            gr.update(),  # enhanced_image: reveal only in step 3 (final yield)
-            gr.update(),  # noisy_image already set in step 1
-            gr.update(),  # original_transcript unchanged
-            gr.update(value=progress_state.get("noisy", "")),
-            gr.update(value=progress_state.get("enhanced", "")),
-            gr.update(),
-            gr.update(),
-            gr.update(),
-        )
-    if "error" in result_holder:
-        raise result_holder["error"]
-    (
-        enhanced_spec_path,
-        enhanced_transcript,
-        noisy_transcript_with_wer,
-        enhanced_audio,
-        vad_labels,
-        last_stem,
-        enhanced_array,
-        precomputed_noisy,
-    ) = result_holder["result"]
-    # 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
-    yield (
-        gr.update(visible=True),
-        gr.update(value=enhanced_audio, subtitles=vad_labels),
-        enhanced_spec_path,   # enhanced_image: show only now
-        noisy_spec_path,
-        original_transcript,
-        noisy_transcript_with_wer,
-        enhanced_transcript,
-        last_stem,
-        enhanced_array,
-        precomputed_noisy,
-    )
 _CSS_DIR = Path(__file__).resolve().parent / "assets"
 with gr.Blocks() as demo:
@@ -132,8 +26,6 @@ with gr.Blocks() as demo:
     last_sample_stem = gr.State("")
     input_array = gr.State()
     streaming_sr = gr.State(None)
-    enhanced_array = gr.State()
-    precomputed_noisy_transcript = gr.State("")
     current_sample_rate = gr.State(None)
     gr.HTML(
@@ -290,16 +182,17 @@ with gr.Blocks() as demo:
             gr.update(streaming=False, interactive=False),
             gr.update(visible=True),
             LED_DOT_OFF,
             "Off",
         )
     upload_tab.select(
         _on_not_streaming_tab,
-        outputs=[audio_stream, enhance_btn, system_status_led, system_status_text],
     ).then(
         load_local_file,
         inputs=[audio_file_upload, normalize],
-        outputs=[input_array, sample_stem, audio_preview, current_sample_rate],
     )
     dataset_tab.select(
@@ -341,7 +234,6 @@ with gr.Blocks() as demo:
     # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
     dataset_dropdown.change(
         lambda: gr.update(visible=False),
-        inputs=None,
         outputs=results_card,
     ).then(
         load_file_from_dataset,
@@ -367,10 +259,15 @@ with gr.Blocks() as demo:
     # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
     enhance_btn.click(
-        process_with_live_transcript,
-        inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
         outputs=[
-            results_card,
             enhanced_audio,
             enhanced_image,
             noisy_image,
@@ -378,9 +275,9 @@ with gr.Blocks() as demo:
             noisy_transcript,
             enhanced_transcript,
             last_sample_stem,
-            enhanced_array,
-            precomputed_noisy_transcript,
-        ],
     )
 os.makedirs(APP_TMP_DIR, exist_ok=True)

 import os
 import gradio as gr
 from pathlib import Path
 from constants import STREAM_EVERY, APP_TMP_DIR
 from ui import LED_DOT_OFF
+from hf_dataset_utils import ALL_FILES
 from stream_pipeline import (
     on_start_recording,
 from offline_pipeline import (
     load_file_from_dataset,
     load_local_file,
+    run_offline_pipeline,
 )
 from clean_up import purge_tmp_directory, cleanup_previous_run
 _CSS_DIR = Path(__file__).resolve().parent / "assets"
 with gr.Blocks() as demo:
     last_sample_stem = gr.State("")
     input_array = gr.State()
     streaming_sr = gr.State(None)
     current_sample_rate = gr.State(None)
     gr.HTML(
             gr.update(streaming=False, interactive=False),
             gr.update(visible=True),
             LED_DOT_OFF,
+            LED_DOT_OFF,
             "Off",
         )
     upload_tab.select(
         _on_not_streaming_tab,
+        outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
     ).then(
         load_local_file,
         inputs=[audio_file_upload, normalize],
+        outputs=[input_array, sample_stem, vad_led,audio_preview, current_sample_rate],
     )
     dataset_tab.select(
     # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
     dataset_dropdown.change(
         lambda: gr.update(visible=False),
         outputs=results_card,
     ).then(
         load_file_from_dataset,
     # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
     enhance_btn.click(
+        cleanup_previous_run,
+        inputs=[last_sample_stem]
+    ).then(
+        lambda: gr.update(visible=True),
+        outputs=results_card,
+    ).then(
+        run_offline_pipeline,
+        inputs=[input_array, current_sample_rate, enhancement_level, stt_model, sample_stem],
         outputs=[
             enhanced_audio,
             enhanced_image,
             noisy_image,
             noisy_transcript,
             enhanced_transcript,
             last_sample_stem,
+            ]
+    ).failure(
+        lambda: gr.Warning("Enhancement failed. Please refresh page and make sure you have a stable connection.")
     )
 os.makedirs(APP_TMP_DIR, exist_ok=True)

clean_up.py CHANGED Viewed

@@ -93,11 +93,9 @@ def cleanup_previous_run(
     sample_stem: str,
     tmp_dir: str = APP_TMP_DIR,
     max_age_minutes: int = MINUTES_KEEP,
-) -> tuple[None, None, str, str, str]:
-    gr.Info("Processing started. This may take a moment. Please do not refresh or close the window.")
     try:
         remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
     except Exception as e:
         print(f"Failed to delete last run with id {sample_stem}: {e}")
-    purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
-    return None, None, "", "", ""

     sample_stem: str,
     tmp_dir: str = APP_TMP_DIR,
     max_age_minutes: int = MINUTES_KEEP,
+):
     try:
         remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
     except Exception as e:
         print(f"Failed to delete last run with id {sample_stem}: {e}")
+    purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)

offline_pipeline.py CHANGED Viewed

@@ -1,164 +1,276 @@
 import os
-from random import sample
 import gradio as gr
-import soundfile as sf
-from sdk import SDKWrapper, SDKParams
-from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs, get_vad_labels
-from hf_dataset_utils import get_audio, get_transcript
-from constants import APP_TMP_DIR, STREAMER_CLASSES
 import numpy as np
 SDK_OFFLINE = SDKWrapper()
-def _close_stt_stream(streamer) -> None:
-    """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
     if hasattr(streamer, "close_stream"):
         streamer.close_stream()
     else:
         streamer.close()
-def run_offline_pipeline_streaming(
-    sample: np.ndarray,
-    sample_rate: int,
-    enhancement_level: float,
-    sample_id: str,
-    stt_model: str,
-    progress_state: dict,
-) -> tuple[str, str, str, tuple[int, np.ndarray], list, str, np.ndarray, str]:
-    """Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
-    via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
-    only at the end; the app plays it automatically when processing is complete.
-    Returns same tuple as run_offline_pipeline_ordered."""
-    if sample is None:
-        raise ValueError("No audio to enhance. Please upload a file first.")
-    sample = np.asarray(sample, dtype=np.float32).flatten()
     sdk_params = SDKParams(
         sample_rate=sample_rate,
-        enhancement_level=enhancement_level/100.0,
-        allow_variable_frames=False,  # streaming pipeline uses fixed frames for simplicity
-        num_channels=1,
     )
     SDK_OFFLINE.init_processor(sdk_params)
-    chunk_size = SDK_OFFLINE.num_frames
-    # Sync transcript callbacks so both boxes update together
-    progress_state["noisy_pending"] = ""
-    progress_state["enhanced_pending"] = ""
-    progress_state["noisy_has_sent"] = False
-    progress_state["enhanced_has_sent"] = False
-    def _flush_both():
-        if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
-            progress_state["noisy"] = progress_state.get("noisy_pending", "")
-            progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
-    def on_noisy(t: str):
-        progress_state["noisy_pending"] = t
-        progress_state["noisy_has_sent"] = True
-        _flush_both()
-    def on_enhanced(t: str):
-        progress_state["enhanced_pending"] = t
-        progress_state["enhanced_has_sent"] = True
-        _flush_both()
-    if stt_model not in STREAMER_CLASSES:
-        raise ValueError(f"Unknown STT model: {stt_model}")
-    StreamerClass = STREAMER_CLASSES[stt_model]
-    streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
-    streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
     accumulated_enhanced: list[np.ndarray] = []
-    vad_timestamps = []
     n = len(sample)
     for i in range(0, n, chunk_size):
         raw_chunk = sample[i : i + chunk_size]
-        if raw_chunk.size < chunk_size:
             raw_chunk = np.pad(
                 raw_chunk,
-                (0, chunk_size - raw_chunk.size),
                 mode="constant",
                 constant_values=0.0,
             )
-        raw_2d = raw_chunk.reshape(1, -1)
-        enhanced_chunk = SDK_OFFLINE.process_chunk(raw_2d)
-        enhanced_1d = np.asarray(enhanced_chunk).flatten()
         streamer_noisy.process_chunk(raw_chunk)
         streamer_enhanced.process_chunk(enhanced_1d)
         accumulated_enhanced.append(enhanced_1d)
         if SDK_OFFLINE.vad_context.is_speech_detected():
-            start_in_sec = i/ sample_rate
-            end_in_sec = (i + chunk_size) / sample_rate
             vad_timestamps.append([start_in_sec, end_in_sec])
-    _close_stt_stream(streamer_noisy)
-    _close_stt_stream(streamer_enhanced)
-    streamer_noisy.finished_event.wait()
-    streamer_enhanced.finished_event.wait()
-    with streamer_noisy.lock:
-        noisy_transcript = streamer_noisy.render_tokens(streamer_noisy.final_tokens, [])
-    with streamer_enhanced.lock:
-        enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
     enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
-    gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
-    enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
-    spec_image(enhanced_array, sr = sample_rate).save(enhanced_spec_path)
-    progress_state["enhanced_spec_path"] = enhanced_spec_path
-    precomputed_noisy = noisy_transcript
     try:
         original_transcript = get_transcript(sample_id)
-        wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
-        wer_noisy = compute_wer(original_transcript, noisy_transcript)
-        enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
-        noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
     except Exception:
-        pass
-    vad_labels = get_vad_labels(vad_timestamps, length=len(sample)/sample_rate)
     return (
         enhanced_spec_path,
-        enhanced_transcript,
         noisy_transcript,
-        gradio_enhanced_audio,
-        vad_labels,
         sample_id,
-        enhanced_array,
-        precomputed_noisy,
     )
 def load_local_file(
     sample_path: str,
     normalize: bool = True,
-    ) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
     if not sample_path or not os.path.exists(sample_path):
         return None, "", None, None
     if os.path.getsize(sample_path) > 5 * 1024 * 1024:
         gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
         raise ValueError("Uploaded file exceeds the 5 MB size limit.")
     new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
     y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
     if normalize:
         y = normalize_lufs(y, sample_rate)
     gradio_audio = to_gradio_audio(y, sample_rate)
     return y, new_sample_stem, gradio_audio, sample_rate
-def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
     if not sample_id:
         gr.Warning("Please select a sample from the dropdown.")
         return None, None, "", None
     new_sample_stem = sample_id
     try:
         y, sample_rate = get_audio(sample_id, prefix="mix")
         y_for_gradio = to_gradio_audio(y, sample_rate)
-    except Exception as e:  # Convert to 16-bit PCM for gradio audio component
-        gr.Warning(f"{e}")
-        raise e
     return y_for_gradio, y, new_sample_stem, sample_rate

 import os
+from typing import Any
 import gradio as gr
 import numpy as np
+import soundfile as sf
+from constants import APP_TMP_DIR, STREAMER_CLASSES
+from hf_dataset_utils import get_audio, get_transcript
+from sdk import SDKParams, SDKWrapper
+from utils import (
+    compute_wer,
+    get_vad_labels,
+    normalize_lufs,
+    spec_image,
+    to_gradio_audio,
+)
 SDK_OFFLINE = SDKWrapper()
+def _safe_progress(progress: gr.Progress, value: float, desc: str) -> None:
+    """Report progress through ``progress`` with ``value`` clamped to [0.0, 1.0].
+
+    Args:
+        progress: Callable progress tracker; invoked as ``progress(fraction, desc=desc)``.
+        value: Requested progress fraction; values outside [0.0, 1.0] are clamped.
+        desc: Status text forwarded unchanged as the ``desc`` keyword.
+    """
+    progress(max(0.0, min(1.0, value)), desc=desc)
+def _empty_pipeline_result(sample_id: str) -> tuple[Any, str, str, str, str, str, str]:
+    """Build the placeholder 7-tuple returned when there is no audio to process.
+
+    The tuple has the same length and slot order as the tuple returned by
+    ``run_offline_pipeline`` on success; ``sample_id`` is passed through
+    unchanged in the last slot.
+    """
+    return (
+        # Slot 1: enhanced audio (nothing to play).
+        None,
+        # Slot 2: enhanced spectrogram path.
+        "",
+        # Slot 3: noisy spectrogram path.
+        "",
+        # Slot 4: reference (original) transcript.
+        "Unavailable",
+        # Slot 5: noisy transcript.
+        "Unavailable",
+        # Slot 6: enhanced transcript.
+        "Unavailable",
+        # Slot 7: sample id, echoed back to the caller.
+        sample_id,
+    )
+def _finalize_stream_transcript(streamer) -> str:
+    """Close ``streamer``, wait for it to finish, and return its final transcript.
+
+    Calls ``streamer.close_stream()`` when that method exists and
+    ``streamer.close()`` otherwise, then blocks on ``streamer.finished_event``
+    and renders ``streamer.final_tokens`` while holding ``streamer.lock``.
+
+    Returns:
+        The value of ``streamer.render_tokens(streamer.final_tokens, [])``.
+    """
+    # Streamer classes differ in the name of their end-of-stream method.
     if hasattr(streamer, "close_stream"):
         streamer.close_stream()
     else:
         streamer.close()
+    # NOTE(review): wait() has no timeout, so this blocks until the streamer
+    # sets finished_event — confirm every streamer sets it on error paths too.
+    streamer.finished_event.wait()
+    with streamer.lock:
+        return streamer.render_tokens(streamer.final_tokens, [])
+def _init_sdk(sample_rate: int, enhancement_level: int) -> int:
+    """Initialise the shared ``SDK_OFFLINE`` processor and return its frame count.
+
+    Args:
+        sample_rate: Forwarded to ``SDKParams`` as ``sample_rate``.
+        enhancement_level: Divided by 100 before being passed to ``SDKParams``
+            (presumably a 0-100 UI value — TODO confirm range).
+
+    Returns:
+        ``SDK_OFFLINE.num_frames`` after initialisation; ``run_offline_pipeline``
+        uses it as the chunk size.
+    """
+    # NOTE(review): SDK_OFFLINE is a module-level instance that is re-initialised
+    # on every call — confirm concurrent requests cannot interleave.
     sdk_params = SDKParams(
         sample_rate=sample_rate,
+        enhancement_level=enhancement_level / 100.0,
     )
     SDK_OFFLINE.init_processor(sdk_params)
+    return SDK_OFFLINE.num_frames
+def _init_streamers(
+    sample_rate: int,
+    stt_model: str,
+    sample_id: str,
+    progress: gr.Progress,
+):
+    """Create one STT streamer for the noisy audio and one for the enhanced audio.
+
+    Args:
+        sample_rate: Passed as the first argument to each streamer constructor.
+        stt_model: Key into ``STREAMER_CLASSES`` selecting the streamer class.
+        sample_id: Used to build the second constructor argument,
+            ``"<sample_id>_noisy"`` and ``"<sample_id>_enhanced"``.
+        progress: Progress tracker; set to 0.12 and 0.18 before the two
+            constructor calls.
+
+    Returns:
+        ``(streamer_noisy, streamer_enhanced)``.
+
+    Raises:
+        ValueError: If ``stt_model`` is not a key of ``STREAMER_CLASSES``.
+    """
+    if stt_model not in STREAMER_CLASSES:
+        raise ValueError(f"Unknown STT model: {stt_model}")
+    streamer_class = STREAMER_CLASSES[stt_model]
+    _safe_progress(progress, 0.12, f"Initializing {stt_model} stream 1/2...")
+    streamer_noisy = streamer_class(sample_rate, f"{sample_id}_noisy")
+    _safe_progress(progress, 0.18, f"Initializing {stt_model} stream 2/2...")
+    streamer_enhanced = streamer_class(sample_rate, f"{sample_id}_enhanced")
+    return streamer_noisy, streamer_enhanced
+def _attach_wer(
+    original_transcript: str,
+    noisy_transcript: str,
+    enhanced_transcript: str,
+) -> tuple[str, str]:
+    """Append a ``(WER: xx.xx%)`` suffix to the noisy and enhanced transcripts.
+
+    Each word error rate comes from ``compute_wer(original_transcript, <transcript>)``
+    and is multiplied by 100 and formatted with two decimals.
+
+    Returns:
+        ``(noisy_transcript, enhanced_transcript)`` with the suffix appended —
+        noisy first, enhanced second.
+    """
+    # WER is computed on the raw transcripts, before any suffix is added.
+    wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
+    wer_noisy = compute_wer(original_transcript, noisy_transcript)
+    noisy_transcript = f"{noisy_transcript} (WER: {wer_noisy * 100:.2f}%)"
+    enhanced_transcript = f"{enhanced_transcript} (WER: {wer_enhanced * 100:.2f}%)"
+    return noisy_transcript, enhanced_transcript
+def _process_audio_chunks(
+    sample: np.ndarray,
+    sample_rate: int,
+    chunk_size: int,
+    streamer_noisy,
+    streamer_enhanced,
+    progress: gr.Progress,
+) -> tuple[np.ndarray, list[list[float]]]:
+    """Enhance ``sample`` chunk by chunk and feed both STT streamers.
+
+    Each ``chunk_size`` slice is run through ``SDK_OFFLINE.process_chunk``. The
+    raw chunk goes to ``streamer_noisy`` and the enhanced chunk to
+    ``streamer_enhanced``. Progress is reported between 0.20 and 0.80.
+
+    Args:
+        sample: Audio samples, sliced along its first axis (the visible caller
+            flattens it to 1-D first).
+        sample_rate: Used to convert sample offsets into seconds.
+        chunk_size: Number of samples per processed chunk.
+        streamer_noisy: Receives each raw (possibly zero-padded) chunk.
+        streamer_enhanced: Receives each enhanced chunk.
+        progress: Progress tracker, updated once per chunk.
+
+    Returns:
+        ``(enhanced_array, vad_timestamps)``: the concatenated enhanced chunks
+        as float32, and ``[start_sec, end_sec]`` pairs for chunks where
+        ``SDK_OFFLINE.vad_context.is_speech_detected()`` was true.
+    """
     accumulated_enhanced: list[np.ndarray] = []
+    vad_timestamps: list[list[float]] = []
     n = len(sample)
     for i in range(0, n, chunk_size):
         raw_chunk = sample[i : i + chunk_size]
+        # Remember the unpadded length so progress and VAD end times are not
+        # inflated by the zero-padding applied to a short final chunk.
+        original_chunk_len = raw_chunk.size
+        if original_chunk_len < chunk_size:
             raw_chunk = np.pad(
                 raw_chunk,
+                (0, chunk_size - original_chunk_len),
                 mode="constant",
                 constant_values=0.0,
             )
+        # The SDK is given a 2-D array with a single row of samples.
+        enhanced_chunk = SDK_OFFLINE.process_chunk(raw_chunk.reshape(1, -1))
+        enhanced_1d = np.asarray(enhanced_chunk, dtype=np.float32).flatten()
+        # Both streamers receive the padded-length chunk, so the padding of the
+        # last chunk is also part of the accumulated enhanced audio.
         streamer_noisy.process_chunk(raw_chunk)
         streamer_enhanced.process_chunk(enhanced_1d)
         accumulated_enhanced.append(enhanced_1d)
+        loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
+        _safe_progress(
+            progress,
+            0.20 + 0.60 * loop_progress,
+            "Enhancing audio...",
+        )
         if SDK_OFFLINE.vad_context.is_speech_detected():
+            start_in_sec = i / sample_rate
+            end_in_sec = min(i + original_chunk_len, n) / sample_rate
             vad_timestamps.append([start_in_sec, end_in_sec])
+    # NOTE(review): with an empty ``sample`` the list stays empty and
+    # np.concatenate would raise; the visible caller returns early in that case.
     enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
+    return enhanced_array, vad_timestamps
+def _save_spectrograms(
+    sample: np.ndarray,
+    enhanced_array: np.ndarray,
+    sample_rate: int,
+    sample_id: str,
+    vad_timestamps: list[list[float]],
+) -> tuple[str, str]:
+    """Render and save spectrogram PNGs for the enhanced and the noisy audio.
+
+    Files are written to ``APP_TMP_DIR`` (created if missing) as
+    ``<sample_id>_enhanced_spectrogram.png`` and
+    ``<sample_id>_noisy_spectrogram.png``. The same ``vad_timestamps`` are
+    passed to ``spec_image`` for both images.
+
+    Returns:
+        ``(enhanced_spec_path, noisy_spec_path)`` — enhanced first.
+    """
+    os.makedirs(APP_TMP_DIR, exist_ok=True)
+    enhanced_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_enhanced_spectrogram.png")
+    noisy_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_noisy_spectrogram.png")
+    spec_image(enhanced_array, sr=sample_rate, vad_timestamps=vad_timestamps).save(enhanced_spec_path)
+    spec_image(sample, sr=sample_rate, vad_timestamps=vad_timestamps).save(noisy_spec_path)
+    return enhanced_spec_path, noisy_spec_path
+def run_offline_pipeline(
+    sample: np.ndarray,
+    sample_rate: int,
+    enhancement_level: int,
+    stt_model: str,
+    sample_id: str,
+    progress=gr.Progress(),
+) -> tuple[Any, str, str, str, str, str, str]:
+    """Enhance ``sample``, transcribe noisy and enhanced audio, and build UI outputs.
+
+    Steps: initialise the SDK, open two STT streamers, process the audio in
+    chunks, finalise both transcripts, load the reference transcript (adding WER
+    suffixes when it is available), then write both spectrograms.
+
+    Args:
+        sample: Input audio; flattened to a 1-D float32 array before processing.
+        sample_rate: Sample rate of ``sample``.
+        enhancement_level: Forwarded to ``_init_sdk``, which divides it by 100.
+        stt_model: Key into ``STREAMER_CLASSES``.
+        sample_id: Identifier used for streamer names, the transcript lookup and
+            the spectrogram file names; returned unchanged in the last slot.
+        progress: Gradio progress tracker. The ``gr.Progress()`` default is
+            intentional: Gradio detects it and supplies the tracker.
+
+    Returns:
+        A 7-tuple: ``(enhanced audio update, enhanced spectrogram path, noisy
+        spectrogram path, original transcript, noisy transcript, enhanced
+        transcript, sample_id)``. When ``sample`` is ``None`` or empty, a warning
+        is shown and ``_empty_pipeline_result(sample_id)`` is returned instead.
+
+    Raises:
+        RuntimeError: If creating the STT streamers fails (original error chained).
+    """
+    _safe_progress(progress, 0.00, "Starting...")
+    if sample is None or len(sample) == 0:
+        gr.Warning("No audio to enhance. Please upload a file first.")
+        return _empty_pipeline_result(sample_id)
+    # NOTE(review): flatten() on a multi-channel array would interleave the
+    # channels — confirm the loaders always deliver mono audio.
+    sample = np.asarray(sample, dtype=np.float32).flatten()
+    _safe_progress(progress, 0.05, "Initializing enhancement...")
+    chunk_size = _init_sdk(sample_rate, enhancement_level)
+    try:
+        streamer_noisy, streamer_enhanced = _init_streamers(
+            sample_rate=sample_rate,
+            stt_model=stt_model,
+            sample_id=sample_id,
+            progress=progress,
+        )
+    except Exception as e:
+        raise RuntimeError(f"Failed to initialize STT streaming: {e}") from e
+    # NOTE(review): if chunk processing raises, neither streamer is closed
+    # here — confirm whether the streamers need explicit cleanup on failure.
+    enhanced_array, vad_timestamps = _process_audio_chunks(
+        sample=sample,
+        sample_rate=sample_rate,
+        chunk_size=chunk_size,
+        streamer_noisy=streamer_noisy,
+        streamer_enhanced=streamer_enhanced,
+        progress=progress,
+    )
+    _safe_progress(progress, 0.82, "Finalizing transcripts...")
+    noisy_transcript = _finalize_stream_transcript(streamer_noisy)
+    _safe_progress(progress, 0.88, "Finalizing transcripts...")
+    enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
+    _safe_progress(progress, 0.94, "Loading reference transcript...")
+    # Any failure to load a reference transcript falls back to the "Unavailable"
+    # sentinel, which also disables the WER computation below.
     try:
         original_transcript = get_transcript(sample_id)
     except Exception:
+        original_transcript = "Unavailable"
+    if original_transcript != "Unavailable":
+        _safe_progress(progress, 0.96, "Computing WER...")
+        noisy_transcript, enhanced_transcript = _attach_wer(
+            original_transcript=original_transcript,
+            noisy_transcript=noisy_transcript,
+            enhanced_transcript=enhanced_transcript,
+        )
+    _safe_progress(progress, 0.99, "Generating outputs...")
+    gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
+    enhanced_spec_path, noisy_spec_path = _save_spectrograms(
+        sample=sample,
+        enhanced_array=enhanced_array,
+        sample_rate=sample_rate,
+        sample_id=sample_id,
+        vad_timestamps=vad_timestamps
+    )
+    # VAD labels are attached to the audio component as subtitles.
+    vad_labels = get_vad_labels(
+        vad_timestamps,
+        length=len(sample) / sample_rate,
+    )
+    _safe_progress(progress, 1.00, "Done.")
     return (
+        gr.update(value=gradio_enhanced_audio, subtitles=vad_labels),
         enhanced_spec_path,
+        noisy_spec_path,
+        original_transcript,
         noisy_transcript,
+        enhanced_transcript,
         sample_id,
     )
 def load_local_file(
     sample_path: str,
     normalize: bool = True,
+) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
+    """Load an uploaded audio file and prepare it for preview and enhancement.
+
+    Args:
+        sample_path: Path of the uploaded file.
+        normalize: When true, the audio is passed through ``normalize_lufs``.
+
+    Returns:
+        ``(audio array, file stem, Gradio audio value, sample rate)``, or
+        ``(None, "", None, None)`` when ``sample_path`` is empty or does not exist.
+
+    Raises:
+        ValueError: If the file is larger than 5 MB (a ``gr.Warning`` is shown first).
+    """
+    # NOTE(review): the upload_tab wiring in app.py from this commit lists five
+    # outputs (including vad_led) for this function, which returns four values —
+    # confirm the two agree.
     if not sample_path or not os.path.exists(sample_path):
         return None, "", None, None
     if os.path.getsize(sample_path) > 5 * 1024 * 1024:
         gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
         raise ValueError("Uploaded file exceeds the 5 MB size limit.")
+    # The stem is the file name without directory and extension.
     new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
+    # NOTE(review): with always_2d=False a multi-channel file yields a 2-D
+    # array — confirm downstream code handles that.
     y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
     if normalize:
         y = normalize_lufs(y, sample_rate)
     gradio_audio = to_gradio_audio(y, sample_rate)
     return y, new_sample_stem, gradio_audio, sample_rate
+def load_file_from_dataset(
+    sample_id: str,
+) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
+    """Load the ``"mix"`` audio of a dataset sample for preview and enhancement.
+
+    Args:
+        sample_id: Dataset sample identifier; also returned as the sample stem.
+
+    Returns:
+        ``(Gradio audio value, audio array, sample stem, sample rate)`` — the
+        Gradio audio value comes first, unlike ``load_local_file``. When
+        ``sample_id`` is empty, a warning is shown and
+        ``(None, None, "", None)`` is returned.
+
+    Raises:
+        Exception: Any error from ``get_audio`` or ``to_gradio_audio`` is shown
+            as a ``gr.Warning`` and then re-raised unchanged.
+    """
     if not sample_id:
         gr.Warning("Please select a sample from the dropdown.")
         return None, None, "", None
     new_sample_stem = sample_id
     try:
         y, sample_rate = get_audio(sample_id, prefix="mix")
         y_for_gradio = to_gradio_audio(y, sample_rate)
+    except Exception as e:
+        # Surface the error in the UI, then re-raise so the event chain fails.
+        gr.Warning(str(e))
+        raise
     return y_for_gradio, y, new_sample_stem, sample_rate

utils.py CHANGED Viewed

@@ -63,6 +63,7 @@ def spec_image(
     hop_length: int = 512,
     n_mels: int = 128,
     fmax: Optional[float] = None,
 ) -> Image.Image:
     """
     Generate a mel-spectrogram image from an audio array.
@@ -89,6 +90,10 @@ def spec_image(
     fig.tight_layout(pad=0.2)
     buf = io.BytesIO()
     fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
     plt.close(fig)
     buf.seek(0)
     return Image.open(buf).convert("RGB")

     hop_length: int = 512,
     n_mels: int = 128,
     fmax: Optional[float] = None,
+    vad_timestamps: Optional[list[list[float]]] = None,
 ) -> Image.Image:
     """
     Generate a mel-spectrogram image from an audio array.
     fig.tight_layout(pad=0.2)
     buf = io.BytesIO()
     fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
+    if vad_timestamps:
+        for start, end in vad_timestamps:
+            ax.axvspan(start, end, color="red", alpha=0.3)
     plt.close(fig)
     buf.seek(0)
     return Image.open(buf).convert("RGB")