user4-33 committed on
Commit
c62a089
·
1 Parent(s): aa7cfd7

dynamic sample rate (#1)

Browse files

- disable autoplay (a7806663ddacb0a6a58b7bdab296d05309239020)
- Support dynamic sample rates (abfb82b92ca57b41ad8001a7e16e54d78e60d5a0)
- Remove unused function (b80bb2398d4d4dca208626a14581afd954465a65)
- Upload file always to wav (fix m4a bug) (b38ade9597e00e1b3845c8780d5f25ceb792228d)

app.py CHANGED
@@ -50,7 +50,8 @@ def process_with_live_transcript(
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
- ):
 
54
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
55
  both transcripts stream from the first chunk so playback and transcription start immediately."""
56
  progress_state = {}
@@ -60,10 +61,11 @@ def process_with_live_transcript(
60
  try:
61
  result_holder["result"] = run_offline_pipeline_streaming(
62
  input_array,
 
63
  enhancement_level,
64
  sample_stem,
65
  stt_model,
66
- progress_state,
67
  )
68
  except Exception as e:
69
  result_holder["error"] = e
@@ -153,6 +155,7 @@ with gr.Blocks() as demo:
153
  input_array = gr.State()
154
  enhanced_array = gr.State()
155
  precomputed_noisy_transcript = gr.State("")
 
156
 
157
  gr.HTML(
158
  '<a href="https://ai-coustics.com/" target="_blank">'
@@ -232,14 +235,14 @@ with gr.Blocks() as demo:
232
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
233
  )
234
  audio_file_from_dataset = gr.Audio(
235
- type="filepath", interactive=False, buttons=["download"], autoplay=True
236
  )
237
 
238
  with gr.Tab("Upload local file") as upload_tab:
239
  with gr.Row():
240
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
241
  audio_file_upload = gr.Audio(
242
- type="filepath", sources=["upload"], buttons=["download"], autoplay=True
243
  )
244
 
245
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
@@ -250,7 +253,7 @@ with gr.Blocks() as demo:
250
  type="numpy",
251
  interactive=False,
252
  buttons=["download"],
253
- autoplay=True,
254
  )
255
 
256
  with gr.Row(equal_height=True, elem_classes="results-row"):
@@ -281,8 +284,8 @@ with gr.Blocks() as demo:
281
  def load_dataset_sample_on_tab_visit(dropdown_value):
282
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
283
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
284
- audio_path, arr, stem = load_file_from_dataset(sample_id)
285
- return sample_id, audio_path, arr, stem
286
 
287
  stream_tab.select(
288
  lambda: (
@@ -325,7 +328,7 @@ with gr.Blocks() as demo:
325
  ).then(
326
  load_dataset_sample_on_tab_visit,
327
  inputs=[dataset_dropdown],
328
- outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem],
329
  )
330
 
331
  stt_model.change(
@@ -379,7 +382,7 @@ with gr.Blocks() as demo:
379
  ).then(
380
  load_file_from_dataset,
381
  inputs=dataset_dropdown,
382
- outputs=[audio_file_from_dataset, input_array, sample_stem],
383
  )
384
 
385
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
@@ -390,13 +393,13 @@ with gr.Blocks() as demo:
390
  ).then(
391
  load_local_file,
392
  inputs=[audio_file_upload],
393
- outputs=[input_array, sample_stem]
394
  )
395
 
396
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
397
  enhance_btn.click(
398
  process_with_live_transcript,
399
- inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem],
400
  outputs=[
401
  results_card,
402
  enhanced_audio,
 
50
  sample_stem,
51
  stt_model,
52
  last_sample_stem,
53
+ current_sample_rate,
54
+ ):
55
  """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
56
  both transcripts stream from the first chunk so playback and transcription start immediately."""
57
  progress_state = {}
 
61
  try:
62
  result_holder["result"] = run_offline_pipeline_streaming(
63
  input_array,
64
+ current_sample_rate,
65
  enhancement_level,
66
  sample_stem,
67
  stt_model,
68
+ progress_state
69
  )
70
  except Exception as e:
71
  result_holder["error"] = e
 
155
  input_array = gr.State()
156
  enhanced_array = gr.State()
157
  precomputed_noisy_transcript = gr.State("")
158
+ current_sample_rate = gr.State(16000) # default sample rate for dataset samples; updated on local file load if different
159
 
160
  gr.HTML(
161
  '<a href="https://ai-coustics.com/" target="_blank">'
 
235
  choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
236
  )
237
  audio_file_from_dataset = gr.Audio(
238
+ type="filepath", interactive=False, buttons=["download"], autoplay=False
239
  )
240
 
241
  with gr.Tab("Upload local file") as upload_tab:
242
  with gr.Row():
243
  gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
244
  audio_file_upload = gr.Audio(
245
+ type="filepath", sources=["upload"], buttons=["download"], autoplay=False, format= "wav"
246
  )
247
 
248
  enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
 
253
  type="numpy",
254
  interactive=False,
255
  buttons=["download"],
256
+ autoplay=False,
257
  )
258
 
259
  with gr.Row(equal_height=True, elem_classes="results-row"):
 
284
  def load_dataset_sample_on_tab_visit(dropdown_value):
285
  """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
286
  sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
287
+ audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
288
+ return sample_id, audio_path, arr, stem, sample_rate
289
 
290
  stream_tab.select(
291
  lambda: (
 
328
  ).then(
329
  load_dataset_sample_on_tab_visit,
330
  inputs=[dataset_dropdown],
331
+ outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
332
  )
333
 
334
  stt_model.change(
 
382
  ).then(
383
  load_file_from_dataset,
384
  inputs=dataset_dropdown,
385
+ outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
386
  )
387
 
388
  # Uploading a local file triggers loading the audio file and hiding results until enhancement
 
393
  ).then(
394
  load_local_file,
395
  inputs=[audio_file_upload],
396
+ outputs=[input_array, sample_stem, current_sample_rate]
397
  )
398
 
399
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
400
  enhance_btn.click(
401
  process_with_live_transcript,
402
+ inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
403
  outputs=[
404
  results_card,
405
  enhanced_audio,
offline_pipeline.py CHANGED
@@ -1,210 +1,14 @@
1
  import os
2
- from concurrent.futures import ThreadPoolExecutor
3
- from typing import Optional
4
 
5
  import gradio as gr
6
- import librosa
7
  from sdk import SDKWrapper
8
- from utils import spec_image, compute_wer, transcribe_audio, to_gradio_audio
9
  from hf_dataset_utils import get_audio, get_transcript
10
- from constants import DEFAULT_SR, APP_TMP_DIR, STREAMER_CLASSES
11
  import numpy as np
12
 
13
 
14
- def retrieve_audio_information(
15
- original_array: np.ndarray,
16
- enhanced_array: np.ndarray,
17
- sample_id: str,
18
- stt_model: str,
19
- noisy_transcript: str,
20
- progress_state: Optional[dict] = None,
21
- ) -> tuple[str, str, str, str, str, str]:
22
- """Build spectrograms, transcribe enhanced audio, and compute WER. Caller must supply
23
- noisy_transcript (transcription of original_array) so STT on the original is never run here.
24
- If progress_state is provided, progress_state['enhanced'] is updated with partial transcript as enhanced STT streams."""
25
- if original_array is None or enhanced_array is None:
26
- raise ValueError("Audio arrays are not available.")
27
- noisy_spec_path = f"{APP_TMP_DIR}/{sample_id}_noisy_spectrogram.png"
28
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
29
- spec_image(original_array).save(noisy_spec_path)
30
- spec_image(enhanced_array).save(enhanced_spec_path)
31
- on_enhanced = (lambda t: progress_state.__setitem__("enhanced", t)) if progress_state is not None else None
32
- enhanced_transcript = transcribe_audio(
33
- enhanced_array,
34
- DEFAULT_SR,
35
- stt_model,
36
- stream_name=f"{sample_id}_enhanced",
37
- on_update=on_enhanced,
38
- )
39
- try:
40
- original_transcript = get_transcript(sample_id)
41
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
42
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
43
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
44
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
45
- except Exception:
46
- original_transcript = "Unavailable"
47
- return enhanced_spec_path, noisy_spec_path, enhanced_transcript, original_transcript, noisy_transcript, sample_id
48
-
49
-
50
- def denoise_and_transcribe_noisy_parallel(
51
- sample_16k: np.ndarray,
52
- enhancement_level: float,
53
- sample_id: str,
54
- stt_model: str,
55
- progress_state: Optional[dict] = None,
56
- ) -> tuple[np.ndarray | None, tuple[int, np.ndarray] | None, str]:
57
- """Run denoising and noisy transcription in parallel. Returns (enhanced_array, gradio_audio, noisy_transcript).
58
- If progress_state is provided, progress_state['noisy'] is updated with partial transcript as noisy STT streams."""
59
- if sample_16k is None:
60
- raise ValueError("No audio to enhance. Please upload a file first.")
61
- on_noisy = (lambda t: progress_state.__setitem__("noisy", t)) if progress_state is not None else None
62
- with ThreadPoolExecutor(max_workers=2) as executor:
63
- future_denoise = executor.submit(
64
- _denoise_audio_impl, sample_16k, float(enhancement_level) / 100.0
65
- )
66
- future_noisy = executor.submit(
67
- transcribe_audio,
68
- sample_16k,
69
- DEFAULT_SR,
70
- stt_model,
71
- stream_name=f"{sample_id}_noisy",
72
- on_update=on_noisy,
73
- )
74
- enhanced_array, gradio_enhanced_audio = future_denoise.result()
75
- noisy_transcript = future_noisy.result()
76
- return enhanced_array, gradio_enhanced_audio, noisy_transcript
77
-
78
-
79
- def _denoise_audio_impl(
80
- sample_16k: np.ndarray, enhancement_level_frac: float
81
- ) -> tuple[np.ndarray, tuple[int, np.ndarray]]:
82
- """Run the enhancer on a single array and return both the array and Gradio-ready audio.
83
-
84
- Used by denoise_and_transcribe_noisy_parallel so the denoise step can run in a
85
- thread without Gradio UI calls. Expects enhancement_level_frac in [0, 1] (not percent).
86
- Returns (enhanced_array, (sample_rate, numpy_array)) for Gradio Audio.
87
- """
88
- sdk = SDKWrapper()
89
- sdk.init_processor(
90
- sample_rate=DEFAULT_SR,
91
- enhancement_level=enhancement_level_frac,
92
- )
93
- enhanced_array = sdk.process_sync(sample_16k)
94
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
95
- return enhanced_array, gradio_enhanced_audio
96
-
97
-
98
- def denoise_audio(
99
- sample_16k: np.ndarray,
100
- enhancement_level: float = 50.0,
101
- ) -> tuple[np.ndarray | None , tuple[int, np.ndarray]| None]:
102
- """Enhance-only entry point: run the SDK on the given audio and return enhanced array + Gradio audio.
103
-
104
- The main app uses denoise_and_transcribe_noisy_parallel (denoise + noisy STT in parallel)
105
- instead. This function remains for backward compatibility, scripts, or any caller that
106
- needs only enhancement without transcription (e.g. backup flows, tests).
107
- """
108
- if sample_16k is None:
109
- raise ValueError("No audio to enhance. Please upload a file first.")
110
- try:
111
- sdk = SDKWrapper()
112
- sdk.init_processor(sample_rate=DEFAULT_SR, enhancement_level=float(enhancement_level) / 100.0)
113
- enhanced_array = sdk.process_sync(sample_16k)
114
- except Exception as e:
115
- gr.Warning(f"{e}")
116
- raise e
117
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
118
- return enhanced_array, gradio_enhanced_audio
119
-
120
-
121
- def run_offline_pipeline_ordered(
122
- sample_16k: np.ndarray,
123
- enhancement_level: float,
124
- sample_id: str,
125
- stt_model: str,
126
- progress_state: dict,
127
- ) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
128
- """Run pipeline in UI order: 1) Denoise only. 2) When done, set progress_state['enhanced_spec_path']
129
- and progress_state['enhanced_audio']. 3) Run noisy STT and enhanced STT in parallel (both stream via
130
- progress_state['noisy'] and progress_state['enhanced']). 4) Return final transcripts with WER.
131
-
132
- Returns: (enhanced_spec_path, enhanced_transcript, noisy_transcript_with_wer, enhanced_audio,
133
- last_stem, enhanced_array, precomputed_noisy).
134
- """
135
- if sample_16k is None:
136
- raise ValueError("No audio to enhance. Please upload a file first.")
137
- # 1) Denoise only
138
- enhanced_array, gradio_enhanced_audio = _denoise_audio_impl(
139
- sample_16k, float(enhancement_level) / 100.0
140
- )
141
- # 2) As soon as enhanced audio is ready: build enhanced spectrogram and expose to UI
142
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
143
- spec_image(enhanced_array).save(enhanced_spec_path)
144
- progress_state["enhanced_spec_path"] = enhanced_spec_path
145
- progress_state["enhanced_audio"] = gradio_enhanced_audio
146
- # 3) Noisy and enhanced transcription starting both at the same time (parallel, both with on_update).
147
- # Sync so the UI shows both boxes updating together: only write to noisy/enhanced once both have sent at least one update.
148
- progress_state["noisy_pending"] = ""
149
- progress_state["enhanced_pending"] = ""
150
- progress_state["noisy_has_sent"] = False
151
- progress_state["enhanced_has_sent"] = False
152
-
153
- def _flush_both():
154
- if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
155
- progress_state["noisy"] = progress_state.get("noisy_pending", "")
156
- progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
157
-
158
- def on_noisy(t: str):
159
- progress_state["noisy_pending"] = t
160
- progress_state["noisy_has_sent"] = True
161
- _flush_both()
162
-
163
- def on_enhanced(t: str):
164
- progress_state["enhanced_pending"] = t
165
- progress_state["enhanced_has_sent"] = True
166
- _flush_both()
167
-
168
- with ThreadPoolExecutor(max_workers=2) as executor:
169
- future_noisy = executor.submit(
170
- transcribe_audio,
171
- sample_16k,
172
- DEFAULT_SR,
173
- stt_model,
174
- stream_name=f"{sample_id}_noisy",
175
- on_update=on_noisy,
176
- )
177
- future_enhanced = executor.submit(
178
- transcribe_audio,
179
- enhanced_array,
180
- DEFAULT_SR,
181
- stt_model,
182
- stream_name=f"{sample_id}_enhanced",
183
- on_update=on_enhanced,
184
- )
185
- noisy_transcript = future_noisy.result()
186
- enhanced_transcript = future_enhanced.result()
187
- # 4) WER and final strings
188
- precomputed_noisy = noisy_transcript
189
- try:
190
- original_transcript = get_transcript(sample_id)
191
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
192
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
193
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
194
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
195
- except Exception:
196
- original_transcript = "Unavailable"
197
- return (
198
- enhanced_spec_path,
199
- enhanced_transcript,
200
- noisy_transcript,
201
- gradio_enhanced_audio,
202
- sample_id,
203
- enhanced_array,
204
- precomputed_noisy,
205
- )
206
-
207
-
208
  def _close_stt_stream(streamer) -> None:
209
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
210
  if hasattr(streamer, "close_stream"):
@@ -212,9 +16,9 @@ def _close_stt_stream(streamer) -> None:
212
  else:
213
  streamer.close()
214
 
215
-
216
  def run_offline_pipeline_streaming(
217
- sample_16k: np.ndarray,
 
218
  enhancement_level: float,
219
  sample_id: str,
220
  stt_model: str,
@@ -224,13 +28,13 @@ def run_offline_pipeline_streaming(
224
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
225
  only at the end; the app plays it automatically when processing is complete.
226
  Returns same tuple as run_offline_pipeline_ordered."""
227
- if sample_16k is None:
228
  raise ValueError("No audio to enhance. Please upload a file first.")
229
- sample_16k = np.asarray(sample_16k, dtype=np.float32).flatten()
230
 
231
  sdk = SDKWrapper()
232
  sdk.init_processor(
233
- sample_rate=DEFAULT_SR,
234
  enhancement_level=float(enhancement_level) / 100.0,
235
  )
236
  chunk_size = sdk.num_frames
@@ -259,14 +63,14 @@ def run_offline_pipeline_streaming(
259
  if stt_model not in STREAMER_CLASSES:
260
  raise ValueError(f"Unknown STT model: {stt_model}")
261
  StreamerClass = STREAMER_CLASSES[stt_model]
262
- streamer_noisy = StreamerClass(DEFAULT_SR, f"{sample_id}_noisy", on_update=on_noisy)
263
- streamer_enhanced = StreamerClass(DEFAULT_SR, f"{sample_id}_enhanced", on_update=on_enhanced)
264
 
265
  accumulated_enhanced: list[np.ndarray] = []
266
- n = len(sample_16k)
267
 
268
  for i in range(0, n, chunk_size):
269
- raw_chunk = sample_16k[i : i + chunk_size]
270
  if raw_chunk.size < chunk_size:
271
  raw_chunk = np.pad(
272
  raw_chunk,
@@ -292,7 +96,7 @@ def run_offline_pipeline_streaming(
292
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
293
 
294
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
295
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, DEFAULT_SR)
296
 
297
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
298
  spec_image(enhanced_array).save(enhanced_spec_path)
@@ -318,10 +122,9 @@ def run_offline_pipeline_streaming(
318
  precomputed_noisy,
319
  )
320
 
321
-
322
  def load_local_file(
323
  sample_path: str
324
- ) -> tuple[np.ndarray, str]:
325
  if not sample_path or not os.path.exists(sample_path):
326
  gr.Warning("Please upload a valid audio file.")
327
  raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
@@ -329,18 +132,18 @@ def load_local_file(
329
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
330
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
331
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
332
- y_16k, _ = librosa.load(sample_path, sr=DEFAULT_SR, dtype="float32", mono=True)
333
- return y_16k, new_sample_stem
334
 
335
- def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str]:
336
  if not sample_id:
337
  gr.Warning("Please select a sample from the dropdown.")
338
- return None, None, ""
339
  new_sample_stem = sample_id
340
  try:
341
- y_16k, sr = get_audio(sample_id, prefix="mix")
342
- y_16k_for_gradio = to_gradio_audio(y_16k, sr)
343
  except Exception as e: # Convert to 16-bit PCM for gradio audio component
344
  gr.Warning(f"{e}")
345
  raise e
346
- return y_16k_for_gradio, y_16k, new_sample_stem
 
1
  import os
 
 
2
 
3
  import gradio as gr
4
+ import soundfile as sf
5
  from sdk import SDKWrapper
6
+ from utils import spec_image, compute_wer, to_gradio_audio
7
  from hf_dataset_utils import get_audio, get_transcript
8
+ from constants import APP_TMP_DIR, STREAMER_CLASSES
9
  import numpy as np
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def _close_stt_stream(streamer) -> None:
13
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
14
  if hasattr(streamer, "close_stream"):
 
16
  else:
17
  streamer.close()
18
 
 
19
  def run_offline_pipeline_streaming(
20
+ sample: np.ndarray,
21
+ sample_rate: int,
22
  enhancement_level: float,
23
  sample_id: str,
24
  stt_model: str,
 
28
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
29
  only at the end; the app plays it automatically when processing is complete.
30
  Returns same tuple as run_offline_pipeline_ordered."""
31
+ if sample is None:
32
  raise ValueError("No audio to enhance. Please upload a file first.")
33
+ sample = np.asarray(sample, dtype=np.float32).flatten()
34
 
35
  sdk = SDKWrapper()
36
  sdk.init_processor(
37
+ sample_rate=sample_rate,
38
  enhancement_level=float(enhancement_level) / 100.0,
39
  )
40
  chunk_size = sdk.num_frames
 
63
  if stt_model not in STREAMER_CLASSES:
64
  raise ValueError(f"Unknown STT model: {stt_model}")
65
  StreamerClass = STREAMER_CLASSES[stt_model]
66
+ streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
67
+ streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
68
 
69
  accumulated_enhanced: list[np.ndarray] = []
70
+ n = len(sample)
71
 
72
  for i in range(0, n, chunk_size):
73
+ raw_chunk = sample[i : i + chunk_size]
74
  if raw_chunk.size < chunk_size:
75
  raw_chunk = np.pad(
76
  raw_chunk,
 
96
  enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
97
 
98
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
99
+ gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
100
 
101
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
102
  spec_image(enhanced_array).save(enhanced_spec_path)
 
122
  precomputed_noisy,
123
  )
124
 
 
125
  def load_local_file(
126
  sample_path: str
127
+ ) -> tuple[np.ndarray, str, int]:
128
  if not sample_path or not os.path.exists(sample_path):
129
  gr.Warning("Please upload a valid audio file.")
130
  raise ValueError("Missing audio sample. Please upload an audio sample or use the microphone input.")
 
132
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
133
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
134
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
135
+ y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
136
+ return y, new_sample_stem, sample_rate
137
 
138
+ def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
139
  if not sample_id:
140
  gr.Warning("Please select a sample from the dropdown.")
141
+ return None, None, "", None
142
  new_sample_stem = sample_id
143
  try:
144
+ y, sample_rate = get_audio(sample_id, prefix="mix")
145
+ y_for_gradio = to_gradio_audio(y, sample_rate)
146
  except Exception as e: # Convert to 16-bit PCM for gradio audio component
147
  gr.Warning(f"{e}")
148
  raise e
149
+ return y_for_gradio, y, new_sample_stem, sample_rate
stt_streamers/deepgram_streamer.py CHANGED
@@ -78,12 +78,11 @@ class DeepgramStreamer:
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
81
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
82
 
83
  return {
84
  "model": "nova-3", # Recommended general model
85
  "encoding": "linear16", # Corresponds to pcm_s16le
86
- "sample_rate": 16000,
87
  "channels": 1,
88
  "smart_format": "true", # handling punctuation/formatting
89
  "interim_results": "true", # required for non-final updates
 
78
  """
79
  Returns parameters for the Deepgram V1 URL query string.
80
  """
 
81
 
82
  return {
83
  "model": "nova-3", # Recommended general model
84
  "encoding": "linear16", # Corresponds to pcm_s16le
85
+ "sample_rate": fs_hz,
86
  "channels": 1,
87
  "smart_format": "true", # handling punctuation/formatting
88
  "interim_results": "true", # required for non-final updates
stt_streamers/soniox_streamer.py CHANGED
@@ -13,7 +13,6 @@ class SonioxStreamer:
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
16
- assert fs_hz == 16000, "Only 16 kHz audio is supported."
17
 
18
  self.stream_name = stream_name
19
  self.api_name = "Soniox RT"
@@ -69,7 +68,7 @@ class SonioxStreamer:
69
  "api_key": api_key,
70
  "model": "stt-rt-v3",
71
  "audio_format": "pcm_s16le",
72
- "sample_rate": 16000,
73
  "num_channels": 1,
74
  "language_hints": ["en", "de"],
75
  "language_hints_strict": True,
 
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
 
16
 
17
  self.stream_name = stream_name
18
  self.api_name = "Soniox RT"
 
68
  "api_key": api_key,
69
  "model": "stt-rt-v3",
70
  "audio_format": "pcm_s16le",
71
+ "sample_rate": fs_hz,
72
  "num_channels": 1,
73
  "language_hints": ["en", "de"],
74
  "language_hints_strict": True,
utils.py CHANGED
@@ -93,43 +93,4 @@ def compute_wer(reference: str, hypothesis: str) -> float:
93
  d[i - 1][j - 1] + cost, # Substitution
94
  )
95
  wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
96
- return wer
97
-
98
-
99
-
100
- def transcribe_audio(
101
- audio_array: np.ndarray,
102
- sr: int,
103
- streamer_type: str = "Deepgram",
104
- stream_name: str = "RAW",
105
- on_update: Optional[Callable[[str], None]] = None,
106
- ):
107
- """
108
- Transcribe an audio array using the specified STT streamer.
109
-
110
- Args:
111
- audio_array (np.ndarray): Audio data array
112
- sr (int): Sample rate of the audio array
113
- streamer_type (str): "Soniox" or "Deepgram"
114
- stream_name (str): Optional label for streamer instance ("RAW", "ENHANCED", etc.)
115
- on_update: Optional callback(text: str) called with partial transcript as results stream in.
116
-
117
- Returns:
118
- str: Final transcript text
119
- """
120
- if sr != DEFAULT_SR:
121
- audio_array = resampy.resample(audio_array, sr, DEFAULT_SR)
122
- sr = DEFAULT_SR
123
-
124
- if streamer_type not in STREAMER_CLASSES:
125
- raise ValueError(
126
- f"Invalid streamer_type '{streamer_type}'. "
127
- f"Choose from: {', '.join(STREAMER_CLASSES.keys())}"
128
- )
129
-
130
- StreamerClass = STREAMER_CLASSES[streamer_type]
131
- streamer = StreamerClass(sr, stream_name, on_update=on_update)
132
-
133
- transcript = streamer.stream_array(audio_array)
134
-
135
- return transcript
 
93
  d[i - 1][j - 1] + cost, # Substitution
94
  )
95
  wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
96
+ return wer