mariesig committed on
Commit
f25152f
·
2 Parent(s): 4be2da0c62a089

Merge branch 'main' into pr/2

Browse files
app.py CHANGED
@@ -50,7 +50,8 @@ def process_with_live_transcript(
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
- ):
 
54
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
55
  both transcripts stream from the first chunk so playback and transcription start immediately."""
56
  progress_state = {}
@@ -60,10 +61,11 @@ def process_with_live_transcript(
60
  try:
61
  result_holder["result"] = run_offline_pipeline_streaming(
62
  input_array,
 
63
  enhancement_level,
64
  sample_stem,
65
  stt_model,
66
- progress_state,
67
  )
68
  except Exception as e:
69
  result_holder["error"] = e
@@ -153,6 +155,7 @@ with gr.Blocks() as demo:
153
  input_array = gr.State()
154
  enhanced_array = gr.State()
155
  precomputed_noisy_transcript = gr.State("")
 
156
 
157
  gr.HTML(
158
  '<a href="https://ai-coustics.com/" target="_blank">'
@@ -232,7 +235,7 @@ with gr.Blocks() as demo:
232
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
233
  )
234
  audio_file_from_dataset = gr.Audio(
235
- type="filepath", interactive=False, buttons=["download"], autoplay=True
236
  )
237
 
238
  with gr.Tab("Upload local file") as upload_tab:
@@ -257,7 +260,7 @@ with gr.Blocks() as demo:
257
  type="numpy",
258
  interactive=False,
259
  buttons=["download"],
260
- autoplay=True,
261
  )
262
 
263
  with gr.Row(equal_height=True, elem_classes="results-row"):
@@ -288,8 +291,8 @@ with gr.Blocks() as demo:
288
  def load_dataset_sample_on_tab_visit(dropdown_value):
289
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
290
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
291
- audio_path, arr, stem = load_file_from_dataset(sample_id)
292
- return sample_id, audio_path, arr, stem
293
 
294
  stream_tab.select(
295
  lambda: (
@@ -332,7 +335,7 @@ with gr.Blocks() as demo:
332
  ).then(
333
  load_dataset_sample_on_tab_visit,
334
  inputs=[dataset_dropdown],
335
- outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem],
336
  )
337
 
338
  stt_model.change(
@@ -386,7 +389,7 @@ with gr.Blocks() as demo:
386
  ).then(
387
  load_file_from_dataset,
388
  inputs=dataset_dropdown,
389
- outputs=[audio_file_from_dataset, input_array, sample_stem],
390
  )
391
 
392
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
@@ -395,7 +398,7 @@ with gr.Blocks() as demo:
395
  outputs=results_card,
396
  ).then(
397
  load_local_file,
398
- inputs=[audio_file_upload, normalize],
399
  outputs=[input_array, sample_stem, audio_preview],
400
  )
401
 
@@ -408,7 +411,7 @@ with gr.Blocks() as demo:
408
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
409
  enhance_btn.click(
410
  process_with_live_transcript,
411
- inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem],
412
  outputs=[
413
  results_card,
414
  enhanced_audio,
 
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
+ current_sample_rate,
54
+ ):
55
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
56
  both transcripts stream from the first chunk so playback and transcription start immediately."""
57
  progress_state = {}
 
61
  try:
62
  result_holder["result"] = run_offline_pipeline_streaming(
63
  input_array,
64
+ current_sample_rate,
65
  enhancement_level,
66
  sample_stem,
67
  stt_model,
68
+ progress_state
69
  )
70
  except Exception as e:
71
  result_holder["error"] = e
 
155
  input_array = gr.State()
156
  enhanced_array = gr.State()
157
  precomputed_noisy_transcript = gr.State("")
158
+ current_sample_rate = gr.State(16000) # default sample rate for dataset samples; updated on local file load if different
159
 
160
  gr.HTML(
161
  '<a href="https://ai-coustics.com/" target="_blank">'
 
235
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
236
  )
237
  audio_file_from_dataset = gr.Audio(
238
+ type="filepath", interactive=False, buttons=["download"], autoplay=False
239
  )
240
 
241
  with gr.Tab("Upload local file") as upload_tab:
 
260
  type="numpy",
261
  interactive=False,
262
  buttons=["download"],
263
+ autoplay=False,
264
  )
265
 
266
  with gr.Row(equal_height=True, elem_classes="results-row"):
 
291
  def load_dataset_sample_on_tab_visit(dropdown_value):
292
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
293
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
294
+ audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
295
+ return sample_id, audio_path, arr, stem, sample_rate
296
 
297
  stream_tab.select(
298
  lambda: (
 
335
  ).then(
336
  load_dataset_sample_on_tab_visit,
337
  inputs=[dataset_dropdown],
338
+ outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
339
  )
340
 
341
  stt_model.change(
 
389
  ).then(
390
  load_file_from_dataset,
391
  inputs=dataset_dropdown,
392
+ outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
393
  )
394
 
395
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
 
398
  outputs=results_card,
399
  ).then(
400
  load_local_file,
401
+ inputs=[audio_file_upload, normalize, current_sample_rate],
402
  outputs=[input_array, sample_stem, audio_preview],
403
  )
404
 
 
411
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
412
  enhance_btn.click(
413
  process_with_live_transcript,
414
+ inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
415
  outputs=[
416
  results_card,
417
  enhanced_audio,
offline_pipeline.py CHANGED
@@ -1,210 +1,15 @@
1
  import os
2
- from concurrent.futures import ThreadPoolExecutor
3
- from typing import Optional
4
 
5
  import gradio as gr
6
- import librosa
7
  from sdk import SDKWrapper
8
- from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio, normalize_lufs
9
  from hf_dataset_utils import get_audio, get_transcript
10
- from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
11
  import numpy as np
12
 
13
 
14
- def retrieve_audio_information(
15
- original_array: np.ndarray,
16
- enhanced_array: np.ndarray,
17
- sample_id: str,
18
- stt_model: str,
19
- noisy_transcript: str,
20
- progress_state: Optional[dict] = None,
21
- ) -> tuple[str, str, str, str, str, str]:
22
- """Build spectrograms, transcribe enhanced audio, and compute WER. Caller must supply
23
- noisy_transcript (transcription of original_array) so STT on the original is never run here.
24
- If progress_state is provided, progress_state['enhanced'] is updated with partial transcript as enhanced STT streams."""
25
- if original_array is None or enhanced_array is None:
26
- raise ValueError("Audio arrays are not available.")
27
- noisy_spec_path = f"{APP_TMP_DIR}/{sample_id}_noisy_spectrogram.png"
28
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
29
- spec_image(original_array).save(noisy_spec_path)
30
- spec_image(enhanced_array).save(enhanced_spec_path)
31
- on_enhanced = (lambda t: progress_state.__setitem__("enhanced", t)) if progress_state is not None else None
32
- enhanced_transcript = transcribe_audio(
33
- enhanced_array,
34
- DEFAULT_SR,
35
- stt_model,
36
- stream_name=f"{sample_id}_enhanced",
37
- on_update=on_enhanced,
38
- )
39
- try:
40
- original_transcript = get_transcript(sample_id)
41
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
42
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
43
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
44
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
45
- except Exception:
46
- original_transcript = "Unavailable"
47
- return enhanced_spec_path, noisy_spec_path, enhanced_transcript, original_transcript, noisy_transcript, sample_id
48
-
49
-
50
- def denoise_and_transcribe_noisy_parallel(
51
- sample_16k: np.ndarray,
52
- enhancement_level: float,
53
- sample_id: str,
54
- stt_model: str,
55
- progress_state: Optional[dict] = None,
56
- ) -> tuple[np.ndarray | None, tuple[int, np.ndarray] | None, str]:
57
- """Run denoising and noisy transcription in parallel. Returns (enhanced_array, gradio_audio, noisy_transcript).
58
- If progress_state is provided, progress_state['noisy'] is updated with partial transcript as noisy STT streams."""
59
- if sample_16k is None:
60
- raise ValueError("No audio to enhance. Please upload a file first.")
61
- on_noisy = (lambda t: progress_state.__setitem__("noisy", t)) if progress_state is not None else None
62
- with ThreadPoolExecutor(max_workers=2) as executor:
63
- future_denoise = executor.submit(
64
- _denoise_audio_impl, sample_16k, float(enhancement_level) / 100.0
65
- )
66
- future_noisy = executor.submit(
67
- transcribe_audio,
68
- sample_16k,
69
- DEFAULT_SR,
70
- stt_model,
71
- stream_name=f"{sample_id}_noisy",
72
- on_update=on_noisy,
73
- )
74
- enhanced_array, gradio_enhanced_audio = future_denoise.result()
75
- noisy_transcript = future_noisy.result()
76
- return enhanced_array, gradio_enhanced_audio, noisy_transcript
77
-
78
-
79
- def _denoise_audio_impl(
80
- sample_16k: np.ndarray, enhancement_level_frac: float
81
- ) -> tuple[np.ndarray, tuple[int, np.ndarray]]:
82
- """Run the enhancer on a single array and return both the array and Gradio-ready audio.
83
-
84
- Used by denoise_and_transcribe_noisy_parallel so the denoise step can run in a
85
- thread without Gradio UI calls. Expects enhancement_level_frac in [0, 1] (not percent).
86
- Returns (enhanced_array, (sample_rate, numpy_array)) for Gradio Audio.
87
- """
88
- sdk = SDKWrapper()
89
- sdk.init_processor(
90
- sample_rate=DEFAULT_SR,
91
- enhancement_level=enhancement_level_frac,
92
- )
93
- enhanced_array = sdk.process_sync(sample_16k)
94
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
95
- return enhanced_array, gradio_enhanced_audio
96
-
97
-
98
- def denoise_audio(
99
- sample_16k: np.ndarray,
100
- enhancement_level: float = 50.0,
101
- ) -> tuple[np.ndarray | None , tuple[int, np.ndarray]| None]:
102
- """Enhance-only entry point: run the SDK on the given audio and return enhanced array + Gradio audio.
103
-
104
- The main app uses denoise_and_transcribe_noisy_parallel (denoise + noisy STT in parallel)
105
- instead. This function remains for backward compatibility, scripts, or any caller that
106
- needs only enhancement without transcription (e.g. backup flows, tests).
107
- """
108
- if sample_16k is None:
109
- raise ValueError("No audio to enhance. Please upload a file first.")
110
- try:
111
- sdk = SDKWrapper()
112
- sdk.init_processor(sample_rate=DEFAULT_SR, enhancement_level=float(enhancement_level) / 100.0)
113
- enhanced_array = sdk.process_sync(sample_16k)
114
- except Exception as e:
115
- gr.Warning(f"{e}")
116
- raise e
117
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
118
- return enhanced_array, gradio_enhanced_audio
119
-
120
-
121
- def run_offline_pipeline_ordered(
122
- sample_16k: np.ndarray,
123
- enhancement_level: float,
124
- sample_id: str,
125
- stt_model: str,
126
- progress_state: dict,
127
- ) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
128
- """Run pipeline in UI order: 1) Denoise only. 2) When done, set progress_state['enhanced_spec_path']
129
- and progress_state['enhanced_audio']. 3) Run noisy STT and enhanced STT in parallel (both stream via
130
- progress_state['noisy'] and progress_state['enhanced']). 4) Return final transcripts with WER.
131
-
132
- Returns: (enhanced_spec_path, enhanced_transcript, noisy_transcript_with_wer, enhanced_audio,
133
- last_stem, enhanced_array, precomputed_noisy).
134
- """
135
- if sample_16k is None:
136
- raise ValueError("No audio to enhance. Please upload a file first.")
137
- # 1) Denoise only
138
- enhanced_array, gradio_enhanced_audio = _denoise_audio_impl(
139
- sample_16k, float(enhancement_level) / 100.0
140
- )
141
- # 2) As soon as enhanced audio is ready: build enhanced spectrogram and expose to UI
142
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
143
- spec_image(enhanced_array).save(enhanced_spec_path)
144
- progress_state["enhanced_spec_path"] = enhanced_spec_path
145
- progress_state["enhanced_audio"] = gradio_enhanced_audio
146
- # 3) Noisy and enhanced transcription starting both at the same time (parallel, both with on_update).
147
- # Sync so the UI shows both boxes updating together: only write to noisy/enhanced once both have sent at least one update.
148
- progress_state["noisy_pending"] = ""
149
- progress_state["enhanced_pending"] = ""
150
- progress_state["noisy_has_sent"] = False
151
- progress_state["enhanced_has_sent"] = False
152
-
153
- def _flush_both():
154
- if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
155
- progress_state["noisy"] = progress_state.get("noisy_pending", "")
156
- progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
157
-
158
- def on_noisy(t: str):
159
- progress_state["noisy_pending"] = t
160
- progress_state["noisy_has_sent"] = True
161
- _flush_both()
162
-
163
- def on_enhanced(t: str):
164
- progress_state["enhanced_pending"] = t
165
- progress_state["enhanced_has_sent"] = True
166
- _flush_both()
167
-
168
- with ThreadPoolExecutor(max_workers=2) as executor:
169
- future_noisy = executor.submit(
170
- transcribe_audio,
171
- sample_16k,
172
- DEFAULT_SR,
173
- stt_model,
174
- stream_name=f"{sample_id}_noisy",
175
- on_update=on_noisy,
176
- )
177
- future_enhanced = executor.submit(
178
- transcribe_audio,
179
- enhanced_array,
180
- DEFAULT_SR,
181
- stt_model,
182
- stream_name=f"{sample_id}_enhanced",
183
- on_update=on_enhanced,
184
- )
185
- noisy_transcript = future_noisy.result()
186
- enhanced_transcript = future_enhanced.result()
187
- # 4) WER and final strings
188
- precomputed_noisy = noisy_transcript
189
- try:
190
- original_transcript = get_transcript(sample_id)
191
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
192
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
193
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
194
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
195
- except Exception:
196
- original_transcript = "Unavailable"
197
- return (
198
- enhanced_spec_path,
199
- enhanced_transcript,
200
- noisy_transcript,
201
- gradio_enhanced_audio,
202
- sample_id,
203
- enhanced_array,
204
- precomputed_noisy,
205
- )
206
-
207
-
208
  def _close_stt_stream(streamer) -> None:
209
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
210
  if hasattr(streamer, "close_stream"):
@@ -212,9 +17,9 @@ def _close_stt_stream(streamer) -> None:
212
  else:
213
  streamer.close()
214
 
215
-
216
  def run_offline_pipeline_streaming(
217
- sample_16k: np.ndarray,
 
218
  enhancement_level: float,
219
  sample_id: str,
220
  stt_model: str,
@@ -224,13 +29,13 @@ def run_offline_pipeline_streaming(
224
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
225
  only at the end; the app plays it automatically when processing is complete.
226
  Returns same tuple as run_offline_pipeline_ordered."""
227
- if sample_16k is None:
228
  raise ValueError("No audio to enhance. Please upload a file first.")
229
- sample_16k = np.asarray(sample_16k, dtype=np.float32).flatten()
230
 
231
  sdk = SDKWrapper()
232
  sdk.init_processor(
233
- sample_rate=DEFAULT_SR,
234
  enhancement_level=float(enhancement_level) / 100.0,
235
  )
236
  chunk_size = sdk.num_frames
@@ -259,14 +64,14 @@ def run_offline_pipeline_streaming(
259
  if stt_model not in STREAMER_CLASSES:
260
  raise ValueError(f"Unknown STT model: {stt_model}")
261
  StreamerClass = STREAMER_CLASSES[stt_model]
262
- streamer_noisy = StreamerClass(DEFAULT_SR, f"{sample_id}_noisy", on_update=on_noisy)
263
- streamer_enhanced = StreamerClass(DEFAULT_SR, f"{sample_id}_enhanced", on_update=on_enhanced)
264
 
265
  accumulated_enhanced: list[np.ndarray] = []
266
- n = len(sample_16k)
267
 
268
  for i in range(0, n, chunk_size):
269
- raw_chunk = sample_16k[i : i + chunk_size]
270
  if raw_chunk.size < chunk_size:
271
  raw_chunk = np.pad(
272
  raw_chunk,
@@ -292,7 +97,7 @@ def run_offline_pipeline_streaming(
292
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
293
 
294
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
295
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
296
 
297
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
298
  spec_image(enhanced_array).save(enhanced_spec_path)
@@ -318,32 +123,31 @@ def run_offline_pipeline_streaming(
318
  precomputed_noisy,
319
  )
320
 
321
-
322
  def load_local_file(
323
  sample_path: str,
324
  normalize: bool = True,
325
- ) -> tuple[np.ndarray | None, str, tuple | None]:
326
  if not sample_path or not os.path.exists(sample_path):
327
  return None, "", None
328
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
329
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
330
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
331
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
332
- y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
333
  if normalize:
334
- y_16k = normalize_lufs(y_16k, DEFAULT_SR)
335
- gradio_audio = to_gradio_audio(y_16k, DEFAULT_SR)
336
- return y_16k, new_sample_stem, gradio_audio
337
 
338
- def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
339
  if not sample_id:
340
  gr.Warning("Please select a sample from the dropdown.")
341
- return None, None, ""
342
  new_sample_stem = sample_id
343
  try:
344
- y_16k, sr = get_audio(sample_id, prefix="mix")
345
- y_16k_for_gradio = to_gradio_audio(y_16k, sr)
346
  except Exception as e: # Convert to 16-bit PCM for gradio audio component
347
  gr.Warning(f"{e}")
348
  raise e
349
- return y_16k_for_gradio, y_16k, new_sample_stem
 
1
  import os
2
+ from random import sample
 
3
 
4
  import gradio as gr
5
+ import soundfile as sf
6
  from sdk import SDKWrapper
7
+ from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
8
  from hf_dataset_utils import get_audio, get_transcript
9
+ from constants import APP_TMP_DIR, STREAMER_CLASSES
10
  import numpy as np
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def _close_stt_stream(streamer) -> None:
14
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
15
  if hasattr(streamer, "close_stream"):
 
17
  else:
18
  streamer.close()
19
 
 
20
  def run_offline_pipeline_streaming(
21
+ sample: np.ndarray,
22
+ sample_rate: int,
23
  enhancement_level: float,
24
  sample_id: str,
25
  stt_model: str,
 
29
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
30
  only at the end; the app plays it automatically when processing is complete.
31
  Returns same tuple as run_offline_pipeline_ordered."""
32
+ if sample is None:
33
  raise ValueError("No audio to enhance. Please upload a file first.")
34
+ sample = np.asarray(sample, dtype=np.float32).flatten()
35
 
36
  sdk = SDKWrapper()
37
  sdk.init_processor(
38
+ sample_rate=sample_rate,
39
  enhancement_level=float(enhancement_level) / 100.0,
40
  )
41
  chunk_size = sdk.num_frames
 
64
  if stt_model not in STREAMER_CLASSES:
65
  raise ValueError(f"Unknown STT model: {stt_model}")
66
  StreamerClass = STREAMER_CLASSES[stt_model]
67
+ streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
68
+ streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
69
 
70
  accumulated_enhanced: list[np.ndarray] = []
71
+ n = len(sample)
72
 
73
  for i in range(0, n, chunk_size):
74
+ raw_chunk = sample[i : i + chunk_size]
75
  if raw_chunk.size < chunk_size:
76
  raw_chunk = np.pad(
77
  raw_chunk,
 
97
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
98
 
99
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
100
+ gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
101
 
102
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
103
  spec_image(enhanced_array).save(enhanced_spec_path)
 
123
  precomputed_noisy,
124
  )
125
 
 
126
  def load_local_file(
127
  sample_path: str,
128
  normalize: bool = True,
129
+ ) -> tuple[np.ndarray | None, str, tuple | None, int]:
130
  if not sample_path or not os.path.exists(sample_path):
131
  return None, "", None
132
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
133
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
134
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
135
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
136
+ y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
137
  if normalize:
138
+ y = normalize_lufs(y, sample_rate)
139
+ gradio_audio = to_gradio_audio(y, sample_rate)
140
+ return y, new_sample_stem, gradio_audio, sample_rate
141
 
def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
    """Fetch the "mix" audio of a dataset sample together with its sample rate.

    Args:
        sample_id: Identifier of the dataset sample; also returned as the stem.

    Returns:
        ``(gradio_audio, samples, sample_stem, sample_rate)`` where
        ``gradio_audio`` is whatever ``to_gradio_audio`` produces for the
        samples. With an empty ``sample_id`` a warning is shown and
        ``(None, None, "", None)`` is returned.

    Raises:
        Exception: Any error from ``get_audio`` or ``to_gradio_audio`` is shown
            as a Gradio warning and then re-raised.
    """
    if sample_id:
        try:
            waveform, rate = get_audio(sample_id, prefix="mix")
            playable = to_gradio_audio(waveform, rate)
        except Exception as e:
            # Surface the failure in the UI before propagating it.
            gr.Warning(f"{e}")
            raise e
        return playable, waveform, sample_id, rate

    gr.Warning("Please select a sample from the dropdown.")
    return None, None, "", None
stt_streamers/deepgram_streamer.py CHANGED
@@ -78,12 +78,11 @@ class DeepgramStreamer:
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
81
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
82
 
83
  return {
84
  "model": "nova-3", # Recommended general model
85
  "encoding": "linear16", # Corresponds to pcm_s16le
86
- "sample_rate": 16000,
87
  "channels": 1,
88
  "smart_format": "true", # handling punctuation/formatting
89
  "interim_results": "true", # required for non-final updates
 
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
 
81
 
82
  return {
83
  "model": "nova-3", # Recommended general model
84
  "encoding": "linear16", # Corresponds to pcm_s16le
85
+ "sample_rate": fs_hz,
86
  "channels": 1,
87
  "smart_format": "true", # handling punctuation/formatting
88
  "interim_results": "true", # required for non-final updates
stt_streamers/soniox_streamer.py CHANGED
@@ -13,7 +13,6 @@ class SonioxStreamer:
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
16
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
17
 
18
  self.stream_name = stream_name
19
  self.api_name = "Soniox RT"
@@ -69,7 +68,7 @@ class SonioxStreamer:
69
  "api_key": api_key,
70
  "model": "stt-rt-v3",
71
  "audio_format": "pcm_s16le",
72
- "sample_rate": 16000,
73
  "num_channels": 1,
74
  "language_hints": ["en", "de"],
75
  "language_hints_strict": True,
 
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
 
16
 
17
  self.stream_name = stream_name
18
  self.api_name = "Soniox RT"
 
68
  "api_key": api_key,
69
  "model": "stt-rt-v3",
70
  "audio_format": "pcm_s16le",
71
+ "sample_rate": fs_hz,
72
  "num_channels": 1,
73
  "language_hints": ["en", "de"],
74
  "language_hints_strict": True,
utils.py CHANGED
@@ -1,11 +1,10 @@
1
- from typing import Optional, Callable
2
- import resampy
3
  import numpy as np
4
  import librosa
5
  from PIL import Image
6
  import io
7
  import matplotlib.pyplot as plt
8
- from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP, STREAMER_CLASSES
9
  import warnings
10
  import pyloudnorm as pyln
11
 
@@ -140,43 +139,3 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
140
  except Exception as e:
141
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
142
  return x.astype("float32")
143
-
144
-
145
-
146
-
147
- def transcribe_audio(
148
- audio_array: np.ndarray,
149
- sr: int,
150
- streamer_type: str = "Deepgram",
151
- stream_name: str = "RAW",
152
- on_update: Optional[Callable[[str], None]] = None,
153
- ):
154
- """
155
- Transcribe an audio array using the specified STT streamer.
156
-
157
- Args:
158
- audio_array (np.ndarray): Audio data array
159
- sr (int): Sample rate of the audio array
160
- streamer_type (str): "Soniox" or "Deepgram"
161
- stream_name (str): Optional label for streamer instance ("RAW", "ENHANCED", etc.)
162
- on_update: Optional callback(text: str) called with partial transcript as results stream in.
163
-
164
- Returns:
165
- str: Final transcript text
166
- """
167
- if sr != DEFAULT_SR:
168
- audio_array = resampy.resample(audio_array, sr, DEFAULT_SR)
169
- sr = DEFAULT_SR
170
-
171
- if streamer_type not in STREAMER_CLASSES:
172
- raise ValueError(
173
- f"Invalid streamer_type '{streamer_type}'. "
174
- f"Choose from: {', '.join(STREAMER_CLASSES.keys())}"
175
- )
176
-
177
- StreamerClass = STREAMER_CLASSES[streamer_type]
178
- streamer = StreamerClass(sr, stream_name, on_update=on_update)
179
-
180
- transcript = streamer.stream_array(audio_array)
181
-
182
- return transcript
 
1
+ from typing import Optional
 
2
  import numpy as np
3
  import librosa
4
  from PIL import Image
5
  import io
6
  import matplotlib.pyplot as plt
7
+ from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
8
  import warnings
9
  import pyloudnorm as pyln
10
 
 
139
  except Exception as e:
140
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
141
  return x.astype("float32")