Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Merge branch 'main' into pr/2
Browse files- app.py +13 -10
- offline_pipeline.py +24 -220
- stt_streamers/deepgram_streamer.py +1 -2
- stt_streamers/soniox_streamer.py +1 -2
- utils.py +2 -43
app.py
CHANGED
|
@@ -50,7 +50,8 @@ def process_with_live_transcript(
|
|
| 50 |
sample_stem,
|
| 51 |
stt_model,
|
| 52 |
last_sample_stem,
|
| 53 |
-
|
|
|
|
| 54 |
"""Generator that runs the offline pipeline in real time (chunked): enhanced audio and
|
| 55 |
both transcripts stream from the first chunk so playback and transcription start immediately."""
|
| 56 |
progress_state = {}
|
|
@@ -60,10 +61,11 @@ def process_with_live_transcript(
|
|
| 60 |
try:
|
| 61 |
result_holder["result"] = run_offline_pipeline_streaming(
|
| 62 |
input_array,
|
|
|
|
| 63 |
enhancement_level,
|
| 64 |
sample_stem,
|
| 65 |
stt_model,
|
| 66 |
-
progress_state
|
| 67 |
)
|
| 68 |
except Exception as e:
|
| 69 |
result_holder["error"] = e
|
|
@@ -153,6 +155,7 @@ with gr.Blocks() as demo:
|
|
| 153 |
input_array = gr.State()
|
| 154 |
enhanced_array = gr.State()
|
| 155 |
precomputed_noisy_transcript = gr.State("")
|
|
|
|
| 156 |
|
| 157 |
gr.HTML(
|
| 158 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
|
@@ -232,7 +235,7 @@ with gr.Blocks() as demo:
|
|
| 232 |
choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
|
| 233 |
)
|
| 234 |
audio_file_from_dataset = gr.Audio(
|
| 235 |
-
type="filepath", interactive=False, buttons=["download"], autoplay=
|
| 236 |
)
|
| 237 |
|
| 238 |
with gr.Tab("Upload local file") as upload_tab:
|
|
@@ -257,7 +260,7 @@ with gr.Blocks() as demo:
|
|
| 257 |
type="numpy",
|
| 258 |
interactive=False,
|
| 259 |
buttons=["download"],
|
| 260 |
-
autoplay=
|
| 261 |
)
|
| 262 |
|
| 263 |
with gr.Row(equal_height=True, elem_classes="results-row"):
|
|
@@ -288,8 +291,8 @@ with gr.Blocks() as demo:
|
|
| 288 |
def load_dataset_sample_on_tab_visit(dropdown_value):
|
| 289 |
"""Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
|
| 290 |
sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
|
| 291 |
-
audio_path, arr, stem = load_file_from_dataset(sample_id)
|
| 292 |
-
return sample_id, audio_path, arr, stem
|
| 293 |
|
| 294 |
stream_tab.select(
|
| 295 |
lambda: (
|
|
@@ -332,7 +335,7 @@ with gr.Blocks() as demo:
|
|
| 332 |
).then(
|
| 333 |
load_dataset_sample_on_tab_visit,
|
| 334 |
inputs=[dataset_dropdown],
|
| 335 |
-
outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem],
|
| 336 |
)
|
| 337 |
|
| 338 |
stt_model.change(
|
|
@@ -386,7 +389,7 @@ with gr.Blocks() as demo:
|
|
| 386 |
).then(
|
| 387 |
load_file_from_dataset,
|
| 388 |
inputs=dataset_dropdown,
|
| 389 |
-
outputs=[audio_file_from_dataset, input_array, sample_stem],
|
| 390 |
)
|
| 391 |
|
| 392 |
# Uploading a local file triggers loading the audio file and hiding results until enhancement
|
|
@@ -395,7 +398,7 @@ with gr.Blocks() as demo:
|
|
| 395 |
outputs=results_card,
|
| 396 |
).then(
|
| 397 |
load_local_file,
|
| 398 |
-
inputs=[audio_file_upload, normalize],
|
| 399 |
outputs=[input_array, sample_stem, audio_preview],
|
| 400 |
)
|
| 401 |
|
|
@@ -408,7 +411,7 @@ with gr.Blocks() as demo:
|
|
| 408 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 409 |
enhance_btn.click(
|
| 410 |
process_with_live_transcript,
|
| 411 |
-
inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem],
|
| 412 |
outputs=[
|
| 413 |
results_card,
|
| 414 |
enhanced_audio,
|
|
|
|
| 50 |
sample_stem,
|
| 51 |
stt_model,
|
| 52 |
last_sample_stem,
|
| 53 |
+
current_sample_rate,
|
| 54 |
+
):
|
| 55 |
"""Generator that runs the offline pipeline in real time (chunked): enhanced audio and
|
| 56 |
both transcripts stream from the first chunk so playback and transcription start immediately."""
|
| 57 |
progress_state = {}
|
|
|
|
| 61 |
try:
|
| 62 |
result_holder["result"] = run_offline_pipeline_streaming(
|
| 63 |
input_array,
|
| 64 |
+
current_sample_rate,
|
| 65 |
enhancement_level,
|
| 66 |
sample_stem,
|
| 67 |
stt_model,
|
| 68 |
+
progress_state
|
| 69 |
)
|
| 70 |
except Exception as e:
|
| 71 |
result_holder["error"] = e
|
|
|
|
| 155 |
input_array = gr.State()
|
| 156 |
enhanced_array = gr.State()
|
| 157 |
precomputed_noisy_transcript = gr.State("")
|
| 158 |
+
current_sample_rate = gr.State(16000) # default sample rate for dataset samples; updated on local file load if different
|
| 159 |
|
| 160 |
gr.HTML(
|
| 161 |
'<a href="https://ai-coustics.com/" target="_blank">'
|
|
|
|
| 235 |
choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
|
| 236 |
)
|
| 237 |
audio_file_from_dataset = gr.Audio(
|
| 238 |
+
type="filepath", interactive=False, buttons=["download"], autoplay=False
|
| 239 |
)
|
| 240 |
|
| 241 |
with gr.Tab("Upload local file") as upload_tab:
|
|
|
|
| 260 |
type="numpy",
|
| 261 |
interactive=False,
|
| 262 |
buttons=["download"],
|
| 263 |
+
autoplay=False,
|
| 264 |
)
|
| 265 |
|
| 266 |
with gr.Row(equal_height=True, elem_classes="results-row"):
|
|
|
|
| 291 |
def load_dataset_sample_on_tab_visit(dropdown_value):
|
| 292 |
"""Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
|
| 293 |
sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
|
| 294 |
+
audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
|
| 295 |
+
return sample_id, audio_path, arr, stem, sample_rate
|
| 296 |
|
| 297 |
stream_tab.select(
|
| 298 |
lambda: (
|
|
|
|
| 335 |
).then(
|
| 336 |
load_dataset_sample_on_tab_visit,
|
| 337 |
inputs=[dataset_dropdown],
|
| 338 |
+
outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
|
| 339 |
)
|
| 340 |
|
| 341 |
stt_model.change(
|
|
|
|
| 389 |
).then(
|
| 390 |
load_file_from_dataset,
|
| 391 |
inputs=dataset_dropdown,
|
| 392 |
+
outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
|
| 393 |
)
|
| 394 |
|
| 395 |
# Uploading a local file triggers loading the audio file and hiding results until enhancement
|
|
|
|
| 398 |
outputs=results_card,
|
| 399 |
).then(
|
| 400 |
load_local_file,
|
| 401 |
+
inputs=[audio_file_upload, normalize, current_sample_rate],
|
| 402 |
outputs=[input_array, sample_stem, audio_preview],
|
| 403 |
)
|
| 404 |
|
|
|
|
| 411 |
# Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
|
| 412 |
enhance_btn.click(
|
| 413 |
process_with_live_transcript,
|
| 414 |
+
inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
|
| 415 |
outputs=[
|
| 416 |
results_card,
|
| 417 |
enhanced_audio,
|
offline_pipeline.py
CHANGED
|
@@ -1,210 +1,15 @@
|
|
| 1 |
import os
|
| 2 |
-
from
|
| 3 |
-
from typing import Optional
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
-
import
|
| 7 |
from sdk import SDKWrapper
|
| 8 |
-
from utils import spec_image, compute_wer,
|
| 9 |
from hf_dataset_utils import get_audio, get_transcript
|
| 10 |
-
from constants import
|
| 11 |
import numpy as np
|
| 12 |
|
| 13 |
|
| 14 |
-
def retrieve_audio_information(
|
| 15 |
-
original_array: np.ndarray,
|
| 16 |
-
enhanced_array: np.ndarray,
|
| 17 |
-
sample_id: str,
|
| 18 |
-
stt_model: str,
|
| 19 |
-
noisy_transcript: str,
|
| 20 |
-
progress_state: Optional[dict] = None,
|
| 21 |
-
) -> tuple[str, str, str, str, str, str]:
|
| 22 |
-
"""Build spectrograms, transcribe enhanced audio, and compute WER. Caller must supply
|
| 23 |
-
noisy_transcript (transcription of original_array) so STT on the original is never run here.
|
| 24 |
-
If progress_state is provided, progress_state['enhanced'] is updated with partial transcript as enhanced STT streams."""
|
| 25 |
-
if original_array is None or enhanced_array is None:
|
| 26 |
-
raise ValueError("Audio arrays are not available.")
|
| 27 |
-
noisy_spec_path = f"{APP_TMP_DIR}/{sample_id}_noisy_spectrogram.png"
|
| 28 |
-
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 29 |
-
spec_image(original_array).save(noisy_spec_path)
|
| 30 |
-
spec_image(enhanced_array).save(enhanced_spec_path)
|
| 31 |
-
on_enhanced = (lambda t: progress_state.__setitem__("enhanced", t)) if progress_state is not None else None
|
| 32 |
-
enhanced_transcript = transcribe_audio(
|
| 33 |
-
enhanced_array,
|
| 34 |
-
DEFAULT_SR,
|
| 35 |
-
stt_model,
|
| 36 |
-
stream_name=f"{sample_id}_enhanced",
|
| 37 |
-
on_update=on_enhanced,
|
| 38 |
-
)
|
| 39 |
-
try:
|
| 40 |
-
original_transcript = get_transcript(sample_id)
|
| 41 |
-
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 42 |
-
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 43 |
-
enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
|
| 44 |
-
noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
|
| 45 |
-
except Exception:
|
| 46 |
-
original_transcript = "Unavailable"
|
| 47 |
-
return enhanced_spec_path, noisy_spec_path, enhanced_transcript, original_transcript, noisy_transcript, sample_id
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def denoise_and_transcribe_noisy_parallel(
|
| 51 |
-
sample_16k: np.ndarray,
|
| 52 |
-
enhancement_level: float,
|
| 53 |
-
sample_id: str,
|
| 54 |
-
stt_model: str,
|
| 55 |
-
progress_state: Optional[dict] = None,
|
| 56 |
-
) -> tuple[np.ndarray | None, tuple[int, np.ndarray] | None, str]:
|
| 57 |
-
"""Run denoising and noisy transcription in parallel. Returns (enhanced_array, gradio_audio, noisy_transcript).
|
| 58 |
-
If progress_state is provided, progress_state['noisy'] is updated with partial transcript as noisy STT streams."""
|
| 59 |
-
if sample_16k is None:
|
| 60 |
-
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 61 |
-
on_noisy = (lambda t: progress_state.__setitem__("noisy", t)) if progress_state is not None else None
|
| 62 |
-
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 63 |
-
future_denoise = executor.submit(
|
| 64 |
-
_denoise_audio_impl, sample_16k, float(enhancement_level) / 100.0
|
| 65 |
-
)
|
| 66 |
-
future_noisy = executor.submit(
|
| 67 |
-
transcribe_audio,
|
| 68 |
-
sample_16k,
|
| 69 |
-
DEFAULT_SR,
|
| 70 |
-
stt_model,
|
| 71 |
-
stream_name=f"{sample_id}_noisy",
|
| 72 |
-
on_update=on_noisy,
|
| 73 |
-
)
|
| 74 |
-
enhanced_array, gradio_enhanced_audio = future_denoise.result()
|
| 75 |
-
noisy_transcript = future_noisy.result()
|
| 76 |
-
return enhanced_array, gradio_enhanced_audio, noisy_transcript
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def _denoise_audio_impl(
|
| 80 |
-
sample_16k: np.ndarray, enhancement_level_frac: float
|
| 81 |
-
) -> tuple[np.ndarray, tuple[int, np.ndarray]]:
|
| 82 |
-
"""Run the enhancer on a single array and return both the array and Gradio-ready audio.
|
| 83 |
-
|
| 84 |
-
Used by denoise_and_transcribe_noisy_parallel so the denoise step can run in a
|
| 85 |
-
thread without Gradio UI calls. Expects enhancement_level_frac in [0, 1] (not percent).
|
| 86 |
-
Returns (enhanced_array, (sample_rate, numpy_array)) for Gradio Audio.
|
| 87 |
-
"""
|
| 88 |
-
sdk = SDKWrapper()
|
| 89 |
-
sdk.init_processor(
|
| 90 |
-
sample_rate=DEFAULT_SR,
|
| 91 |
-
enhancement_level=enhancement_level_frac,
|
| 92 |
-
)
|
| 93 |
-
enhanced_array = sdk.process_sync(sample_16k)
|
| 94 |
-
gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
|
| 95 |
-
return enhanced_array, gradio_enhanced_audio
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
def denoise_audio(
|
| 99 |
-
sample_16k: np.ndarray,
|
| 100 |
-
enhancement_level: float = 50.0,
|
| 101 |
-
) -> tuple[np.ndarray | None , tuple[int, np.ndarray]| None]:
|
| 102 |
-
"""Enhance-only entry point: run the SDK on the given audio and return enhanced array + Gradio audio.
|
| 103 |
-
|
| 104 |
-
The main app uses denoise_and_transcribe_noisy_parallel (denoise + noisy STT in parallel)
|
| 105 |
-
instead. This function remains for backward compatibility, scripts, or any caller that
|
| 106 |
-
needs only enhancement without transcription (e.g. backup flows, tests).
|
| 107 |
-
"""
|
| 108 |
-
if sample_16k is None:
|
| 109 |
-
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 110 |
-
try:
|
| 111 |
-
sdk = SDKWrapper()
|
| 112 |
-
sdk.init_processor(sample_rate=DEFAULT_SR, enhancement_level=float(enhancement_level) / 100.0)
|
| 113 |
-
enhanced_array = sdk.process_sync(sample_16k)
|
| 114 |
-
except Exception as e:
|
| 115 |
-
gr.Warning(f"{e}")
|
| 116 |
-
raise e
|
| 117 |
-
gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
|
| 118 |
-
return enhanced_array, gradio_enhanced_audio
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
def run_offline_pipeline_ordered(
|
| 122 |
-
sample_16k: np.ndarray,
|
| 123 |
-
enhancement_level: float,
|
| 124 |
-
sample_id: str,
|
| 125 |
-
stt_model: str,
|
| 126 |
-
progress_state: dict,
|
| 127 |
-
) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
|
| 128 |
-
"""Run pipeline in UI order: 1) Denoise only. 2) When done, set progress_state['enhanced_spec_path']
|
| 129 |
-
and progress_state['enhanced_audio']. 3) Run noisy STT and enhanced STT in parallel (both stream via
|
| 130 |
-
progress_state['noisy'] and progress_state['enhanced']). 4) Return final transcripts with WER.
|
| 131 |
-
|
| 132 |
-
Returns: (enhanced_spec_path, enhanced_transcript, noisy_transcript_with_wer, enhanced_audio,
|
| 133 |
-
last_stem, enhanced_array, precomputed_noisy).
|
| 134 |
-
"""
|
| 135 |
-
if sample_16k is None:
|
| 136 |
-
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 137 |
-
# 1) Denoise only
|
| 138 |
-
enhanced_array, gradio_enhanced_audio = _denoise_audio_impl(
|
| 139 |
-
sample_16k, float(enhancement_level) / 100.0
|
| 140 |
-
)
|
| 141 |
-
# 2) As soon as enhanced audio is ready: build enhanced spectrogram and expose to UI
|
| 142 |
-
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 143 |
-
spec_image(enhanced_array).save(enhanced_spec_path)
|
| 144 |
-
progress_state["enhanced_spec_path"] = enhanced_spec_path
|
| 145 |
-
progress_state["enhanced_audio"] = gradio_enhanced_audio
|
| 146 |
-
# 3) Noisy and enhanced transcription starting both at the same time (parallel, both with on_update).
|
| 147 |
-
# Sync so the UI shows both boxes updating together: only write to noisy/enhanced once both have sent at least one update.
|
| 148 |
-
progress_state["noisy_pending"] = ""
|
| 149 |
-
progress_state["enhanced_pending"] = ""
|
| 150 |
-
progress_state["noisy_has_sent"] = False
|
| 151 |
-
progress_state["enhanced_has_sent"] = False
|
| 152 |
-
|
| 153 |
-
def _flush_both():
|
| 154 |
-
if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
|
| 155 |
-
progress_state["noisy"] = progress_state.get("noisy_pending", "")
|
| 156 |
-
progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
|
| 157 |
-
|
| 158 |
-
def on_noisy(t: str):
|
| 159 |
-
progress_state["noisy_pending"] = t
|
| 160 |
-
progress_state["noisy_has_sent"] = True
|
| 161 |
-
_flush_both()
|
| 162 |
-
|
| 163 |
-
def on_enhanced(t: str):
|
| 164 |
-
progress_state["enhanced_pending"] = t
|
| 165 |
-
progress_state["enhanced_has_sent"] = True
|
| 166 |
-
_flush_both()
|
| 167 |
-
|
| 168 |
-
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 169 |
-
future_noisy = executor.submit(
|
| 170 |
-
transcribe_audio,
|
| 171 |
-
sample_16k,
|
| 172 |
-
DEFAULT_SR,
|
| 173 |
-
stt_model,
|
| 174 |
-
stream_name=f"{sample_id}_noisy",
|
| 175 |
-
on_update=on_noisy,
|
| 176 |
-
)
|
| 177 |
-
future_enhanced = executor.submit(
|
| 178 |
-
transcribe_audio,
|
| 179 |
-
enhanced_array,
|
| 180 |
-
DEFAULT_SR,
|
| 181 |
-
stt_model,
|
| 182 |
-
stream_name=f"{sample_id}_enhanced",
|
| 183 |
-
on_update=on_enhanced,
|
| 184 |
-
)
|
| 185 |
-
noisy_transcript = future_noisy.result()
|
| 186 |
-
enhanced_transcript = future_enhanced.result()
|
| 187 |
-
# 4) WER and final strings
|
| 188 |
-
precomputed_noisy = noisy_transcript
|
| 189 |
-
try:
|
| 190 |
-
original_transcript = get_transcript(sample_id)
|
| 191 |
-
wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
|
| 192 |
-
wer_noisy = compute_wer(original_transcript, noisy_transcript)
|
| 193 |
-
enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
|
| 194 |
-
noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
|
| 195 |
-
except Exception:
|
| 196 |
-
original_transcript = "Unavailable"
|
| 197 |
-
return (
|
| 198 |
-
enhanced_spec_path,
|
| 199 |
-
enhanced_transcript,
|
| 200 |
-
noisy_transcript,
|
| 201 |
-
gradio_enhanced_audio,
|
| 202 |
-
sample_id,
|
| 203 |
-
enhanced_array,
|
| 204 |
-
precomputed_noisy,
|
| 205 |
-
)
|
| 206 |
-
|
| 207 |
-
|
| 208 |
def _close_stt_stream(streamer) -> None:
|
| 209 |
"""Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
|
| 210 |
if hasattr(streamer, "close_stream"):
|
|
@@ -212,9 +17,9 @@ def _close_stt_stream(streamer) -> None:
|
|
| 212 |
else:
|
| 213 |
streamer.close()
|
| 214 |
|
| 215 |
-
|
| 216 |
def run_offline_pipeline_streaming(
|
| 217 |
-
|
|
|
|
| 218 |
enhancement_level: float,
|
| 219 |
sample_id: str,
|
| 220 |
stt_model: str,
|
|
@@ -224,13 +29,13 @@ def run_offline_pipeline_streaming(
|
|
| 224 |
via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
|
| 225 |
only at the end; the app plays it automatically when processing is complete.
|
| 226 |
Returns same tuple as run_offline_pipeline_ordered."""
|
| 227 |
-
if
|
| 228 |
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 229 |
-
|
| 230 |
|
| 231 |
sdk = SDKWrapper()
|
| 232 |
sdk.init_processor(
|
| 233 |
-
sample_rate=
|
| 234 |
enhancement_level=float(enhancement_level) / 100.0,
|
| 235 |
)
|
| 236 |
chunk_size = sdk.num_frames
|
|
@@ -259,14 +64,14 @@ def run_offline_pipeline_streaming(
|
|
| 259 |
if stt_model not in STREAMER_CLASSES:
|
| 260 |
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 261 |
StreamerClass = STREAMER_CLASSES[stt_model]
|
| 262 |
-
streamer_noisy = StreamerClass(
|
| 263 |
-
streamer_enhanced = StreamerClass(
|
| 264 |
|
| 265 |
accumulated_enhanced: list[np.ndarray] = []
|
| 266 |
-
n = len(
|
| 267 |
|
| 268 |
for i in range(0, n, chunk_size):
|
| 269 |
-
raw_chunk =
|
| 270 |
if raw_chunk.size < chunk_size:
|
| 271 |
raw_chunk = np.pad(
|
| 272 |
raw_chunk,
|
|
@@ -292,7 +97,7 @@ def run_offline_pipeline_streaming(
|
|
| 292 |
enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
|
| 293 |
|
| 294 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 295 |
-
gradio_enhanced_audio = to_gradio_audio(enhanced_array,
|
| 296 |
|
| 297 |
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 298 |
spec_image(enhanced_array).save(enhanced_spec_path)
|
|
@@ -318,32 +123,31 @@ def run_offline_pipeline_streaming(
|
|
| 318 |
precomputed_noisy,
|
| 319 |
)
|
| 320 |
|
| 321 |
-
|
| 322 |
def load_local_file(
|
| 323 |
sample_path: str,
|
| 324 |
normalize: bool = True,
|
| 325 |
-
) -> tuple[np.ndarray | None, str, tuple | None]:
|
| 326 |
if not sample_path or not os.path.exists(sample_path):
|
| 327 |
return None, "", None
|
| 328 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 329 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 330 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 331 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 332 |
-
|
| 333 |
if normalize:
|
| 334 |
-
|
| 335 |
-
gradio_audio = to_gradio_audio(
|
| 336 |
-
return
|
| 337 |
|
| 338 |
-
def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
|
| 339 |
if not sample_id:
|
| 340 |
gr.Warning("Please select a sample from the dropdown.")
|
| 341 |
-
return None, None, ""
|
| 342 |
new_sample_stem = sample_id
|
| 343 |
try:
|
| 344 |
-
|
| 345 |
-
|
| 346 |
except Exception as e: # Convert to 16-bit PCM for gradio audio component
|
| 347 |
gr.Warning(f"{e}")
|
| 348 |
raise e
|
| 349 |
-
return
|
|
|
|
| 1 |
import os
|
| 2 |
+
from random import sample
|
|
|
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
+
import soundfile as sf
|
| 6 |
from sdk import SDKWrapper
|
| 7 |
+
from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
|
| 8 |
from hf_dataset_utils import get_audio, get_transcript
|
| 9 |
+
from constants import APP_TMP_DIR, STREAMER_CLASSES
|
| 10 |
import numpy as np
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
def _close_stt_stream(streamer) -> None:
|
| 14 |
"""Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
|
| 15 |
if hasattr(streamer, "close_stream"):
|
|
|
|
| 17 |
else:
|
| 18 |
streamer.close()
|
| 19 |
|
|
|
|
| 20 |
def run_offline_pipeline_streaming(
|
| 21 |
+
sample: np.ndarray,
|
| 22 |
+
sample_rate: int,
|
| 23 |
enhancement_level: float,
|
| 24 |
sample_id: str,
|
| 25 |
stt_model: str,
|
|
|
|
| 29 |
via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
|
| 30 |
only at the end; the app plays it automatically when processing is complete.
|
| 31 |
Returns same tuple as run_offline_pipeline_ordered."""
|
| 32 |
+
if sample is None:
|
| 33 |
raise ValueError("No audio to enhance. Please upload a file first.")
|
| 34 |
+
sample = np.asarray(sample, dtype=np.float32).flatten()
|
| 35 |
|
| 36 |
sdk = SDKWrapper()
|
| 37 |
sdk.init_processor(
|
| 38 |
+
sample_rate=sample_rate,
|
| 39 |
enhancement_level=float(enhancement_level) / 100.0,
|
| 40 |
)
|
| 41 |
chunk_size = sdk.num_frames
|
|
|
|
| 64 |
if stt_model not in STREAMER_CLASSES:
|
| 65 |
raise ValueError(f"Unknown STT model: {stt_model}")
|
| 66 |
StreamerClass = STREAMER_CLASSES[stt_model]
|
| 67 |
+
streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
|
| 68 |
+
streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
|
| 69 |
|
| 70 |
accumulated_enhanced: list[np.ndarray] = []
|
| 71 |
+
n = len(sample)
|
| 72 |
|
| 73 |
for i in range(0, n, chunk_size):
|
| 74 |
+
raw_chunk = sample[i : i + chunk_size]
|
| 75 |
if raw_chunk.size < chunk_size:
|
| 76 |
raw_chunk = np.pad(
|
| 77 |
raw_chunk,
|
|
|
|
| 97 |
enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
|
| 98 |
|
| 99 |
enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
|
| 100 |
+
gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
|
| 101 |
|
| 102 |
enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
|
| 103 |
spec_image(enhanced_array).save(enhanced_spec_path)
|
|
|
|
| 123 |
precomputed_noisy,
|
| 124 |
)
|
| 125 |
|
|
|
|
| 126 |
def load_local_file(
|
| 127 |
sample_path: str,
|
| 128 |
normalize: bool = True,
|
| 129 |
+
) -> tuple[np.ndarray | None, str, tuple | None, int]:
|
| 130 |
if not sample_path or not os.path.exists(sample_path):
|
| 131 |
return None, "", None
|
| 132 |
if os.path.getsize(sample_path) > 5 * 1024 * 1024:
|
| 133 |
gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
|
| 134 |
raise ValueError("Uploaded file exceeds the 5 MB size limit.")
|
| 135 |
new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
|
| 136 |
+
y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
|
| 137 |
if normalize:
|
| 138 |
+
y = normalize_lufs(y, sample_rate)
|
| 139 |
+
gradio_audio = to_gradio_audio(y, sample_rate)
|
| 140 |
+
return y, new_sample_stem, gradio_audio, sample_rate
|
| 141 |
|
| 142 |
+
def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
|
| 143 |
if not sample_id:
|
| 144 |
gr.Warning("Please select a sample from the dropdown.")
|
| 145 |
+
return None, None, "", None
|
| 146 |
new_sample_stem = sample_id
|
| 147 |
try:
|
| 148 |
+
y, sample_rate = get_audio(sample_id, prefix="mix")
|
| 149 |
+
y_for_gradio = to_gradio_audio(y, sample_rate)
|
| 150 |
except Exception as e: # Convert to 16-bit PCM for gradio audio component
|
| 151 |
gr.Warning(f"{e}")
|
| 152 |
raise e
|
| 153 |
+
return y_for_gradio, y, new_sample_stem, sample_rate
|
stt_streamers/deepgram_streamer.py
CHANGED
|
@@ -78,12 +78,11 @@ class DeepgramStreamer:
|
|
| 78 |
"""
|
| 79 |
Returns parameters for the Deepgram V1 URL query string.
|
| 80 |
"""
|
| 81 |
-
assert fs_hz == 16000, "Only 16 kHz audio is supported."
|
| 82 |
|
| 83 |
return {
|
| 84 |
"model": "nova-3", # Recommended general model
|
| 85 |
"encoding": "linear16", # Corresponds to pcm_s16le
|
| 86 |
-
"sample_rate":
|
| 87 |
"channels": 1,
|
| 88 |
"smart_format": "true", # handling punctuation/formatting
|
| 89 |
"interim_results": "true", # required for non-final updates
|
|
|
|
| 78 |
"""
|
| 79 |
Returns parameters for the Deepgram V1 URL query string.
|
| 80 |
"""
|
|
|
|
| 81 |
|
| 82 |
return {
|
| 83 |
"model": "nova-3", # Recommended general model
|
| 84 |
"encoding": "linear16", # Corresponds to pcm_s16le
|
| 85 |
+
"sample_rate": fs_hz,
|
| 86 |
"channels": 1,
|
| 87 |
"smart_format": "true", # handling punctuation/formatting
|
| 88 |
"interim_results": "true", # required for non-final updates
|
stt_streamers/soniox_streamer.py
CHANGED
|
@@ -13,7 +13,6 @@ class SonioxStreamer:
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
| 16 |
-
assert fs_hz == 16000, "Only 16 kHz audio is supported."
|
| 17 |
|
| 18 |
self.stream_name = stream_name
|
| 19 |
self.api_name = "Soniox RT"
|
|
@@ -69,7 +68,7 @@ class SonioxStreamer:
|
|
| 69 |
"api_key": api_key,
|
| 70 |
"model": "stt-rt-v3",
|
| 71 |
"audio_format": "pcm_s16le",
|
| 72 |
-
"sample_rate":
|
| 73 |
"num_channels": 1,
|
| 74 |
"language_hints": ["en", "de"],
|
| 75 |
"language_hints_strict": True,
|
|
|
|
| 13 |
api_key = os.environ.get("SONIOX_API_KEY")
|
| 14 |
if not api_key:
|
| 15 |
raise RuntimeError("Missing SONIOX_API_KEY.")
|
|
|
|
| 16 |
|
| 17 |
self.stream_name = stream_name
|
| 18 |
self.api_name = "Soniox RT"
|
|
|
|
| 68 |
"api_key": api_key,
|
| 69 |
"model": "stt-rt-v3",
|
| 70 |
"audio_format": "pcm_s16le",
|
| 71 |
+
"sample_rate": fs_hz,
|
| 72 |
"num_channels": 1,
|
| 73 |
"language_hints": ["en", "de"],
|
| 74 |
"language_hints_strict": True,
|
utils.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
-
from typing import Optional
|
| 2 |
-
import resampy
|
| 3 |
import numpy as np
|
| 4 |
import librosa
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
-
from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
|
| 9 |
import warnings
|
| 10 |
import pyloudnorm as pyln
|
| 11 |
|
|
@@ -140,43 +139,3 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
|
|
| 140 |
except Exception as e:
|
| 141 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 142 |
return x.astype("float32")
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
def transcribe_audio(
|
| 148 |
-
audio_array: np.ndarray,
|
| 149 |
-
sr: int,
|
| 150 |
-
streamer_type: str = "Deepgram",
|
| 151 |
-
stream_name: str = "RAW",
|
| 152 |
-
on_update: Optional[Callable[[str], None]] = None,
|
| 153 |
-
):
|
| 154 |
-
"""
|
| 155 |
-
Transcribe an audio array using the specified STT streamer.
|
| 156 |
-
|
| 157 |
-
Args:
|
| 158 |
-
audio_array (np.ndarray): Audio data array
|
| 159 |
-
sr (int): Sample rate of the audio array
|
| 160 |
-
streamer_type (str): "Soniox" or "Deepgram"
|
| 161 |
-
stream_name (str): Optional label for streamer instance ("RAW", "ENHANCED", etc.)
|
| 162 |
-
on_update: Optional callback(text: str) called with partial transcript as results stream in.
|
| 163 |
-
|
| 164 |
-
Returns:
|
| 165 |
-
str: Final transcript text
|
| 166 |
-
"""
|
| 167 |
-
if sr != DEFAULT_SR:
|
| 168 |
-
audio_array = resampy.resample(audio_array, sr, DEFAULT_SR)
|
| 169 |
-
sr = DEFAULT_SR
|
| 170 |
-
|
| 171 |
-
if streamer_type not in STREAMER_CLASSES:
|
| 172 |
-
raise ValueError(
|
| 173 |
-
f"Invalid streamer_type '{streamer_type}'. "
|
| 174 |
-
f"Choose from: {', '.join(STREAMER_CLASSES.keys())}"
|
| 175 |
-
)
|
| 176 |
-
|
| 177 |
-
StreamerClass = STREAMER_CLASSES[streamer_type]
|
| 178 |
-
streamer = StreamerClass(sr, stream_name, on_update=on_update)
|
| 179 |
-
|
| 180 |
-
transcript = streamer.stream_array(audio_array)
|
| 181 |
-
|
| 182 |
-
return transcript
|
|
|
|
| 1 |
+
from typing import Optional
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import librosa
|
| 4 |
from PIL import Image
|
| 5 |
import io
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
+
from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
|
| 8 |
import warnings
|
| 9 |
import pyloudnorm as pyln
|
| 10 |
|
|
|
|
| 139 |
except Exception as e:
|
| 140 |
warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
|
| 141 |
return x.astype("float32")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|