mariesig committed on
Commit
32dcdfe
·
1 Parent(s): 909a8c1

- remove websocket assignment during shutdown (ac7c4481f1d26fd8161dfc668c79440b88d09a5c)
- add vad and sr to streaming (7274b791d482afe7ef9ab2020f74ceace38e16c2)
- move VAD constants to separate file (06fe4294bf3df2784ab123904982e173aec34394)
- dataset tab stops recording (62744142a409edfa5686bd086b723901e71e92a1)
- remove redundant code from tab switch (0a77e8e90695c7eb99d272461aec14c057c9b08a)
- improve layout (ba97774b30ae8d6dc089f66e0b1d8471dbe10098)
- refactor streaming pipeline (b22fd96890c75efae0e6c56d5bc5d9c329be7192)
- validate sample rate in stt streamers (ab0481a43d19026f3fd291f05b2b146c857ad746)
- default SDK Params (43283c8a389a539e100033f0ecda6038e0b1f1ea)
- remove unused DEFAULT_SAMPLE_ID constant (df08ce34defdaf2ab622249f3012fc53b5585986)
- reorder/name to match offline pipeline (c4c5df8d3f6acc14eb6d426047b74576b68a6e4f)
- fix ui on tab switch (be39c5be95c88f69b66c46e15501b808e3c25507)
- refactor offline pipeline (4e945b9cf36c72bb129d1bb2ce23a9a849fc4af4)
- replace soundfile with librosa (03a504941537d06664bacde8b154e48e9348a429)
- bug fix (4319862f89f59aed7f8739bf83be99d34001698f)
- ruff (6606020cb7ce644e81865e4c79066f7f82fd2d7c)
- VAD in spectrogram (a7c506c419746c71d1d63d8719017ebc50b2ef07)
- stt model change on streaming (12039524efd8521399fecb520e0274213ff044ee)
- fix vad bar height (b949ffac592b85f0fd3b52d975485659f7c4d98a)

app.py CHANGED
@@ -1,162 +1,33 @@
1
  import os
2
- import threading
3
- import time
4
  import gradio as gr
5
  from pathlib import Path
6
 
7
- from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
8
- from hf_dataset_utils import ALL_FILES, get_transcript
 
9
 
10
  from stream_pipeline import (
11
- clear_ui,
12
- get_live_transcripts,
13
  on_stop_recording,
14
- set_stt_streamer,
15
- stop_online_backend,
16
- transcribe_stream,
17
  shutdown_streamers,
 
18
  )
19
  from offline_pipeline import (
20
  load_file_from_dataset,
21
  load_local_file,
22
- run_offline_pipeline_streaming,
23
  )
24
- from utils import spec_image
25
  from clean_up import purge_tmp_directory, cleanup_previous_run
26
 
27
- # Active light HTML: whole container is the light (gray = warming up, red = ready)
28
- ACTIVE_LIGHT_GRAY = (
29
- '<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
30
- '<span class="active-light__label">Warming up...</span></div>'
31
- )
32
- ACTIVE_LIGHT_RED = (
33
- '<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
34
- '<span class="active-light__label">Ready!</span></div>'
35
- )
36
- WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
37
-
38
-
39
- def warmup_tick(on_stream_tab, warmup_elapsed, _current_html):
40
- if not on_stream_tab:
41
- return 0, ACTIVE_LIGHT_GRAY, gr.update(active=True)
42
- if warmup_elapsed >= WARMUP_TICKS:
43
- return warmup_elapsed, ACTIVE_LIGHT_RED, gr.update(active=False)
44
- return warmup_elapsed + 1, ACTIVE_LIGHT_GRAY, gr.update(active=True)
45
-
46
-
47
- def process_with_live_transcript(
48
- input_array,
49
- enhancement_level,
50
- sample_stem,
51
- stt_model,
52
- last_sample_stem,
53
- current_sample_rate,
54
- ):
55
- """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
56
- both transcripts stream from the first chunk so playback and transcription start immediately."""
57
- progress_state = {}
58
- result_holder = {}
59
-
60
- def worker():
61
- try:
62
- result_holder["result"] = run_offline_pipeline_streaming(
63
- input_array,
64
- current_sample_rate,
65
- enhancement_level,
66
- sample_stem,
67
- stt_model,
68
- progress_state
69
- )
70
- except Exception as e:
71
- result_holder["error"] = e
72
-
73
- # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
74
- _ = cleanup_previous_run(last_sample_stem)
75
- noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
76
- if input_array is not None:
77
- try:
78
- spec_image(input_array).save(noisy_spec_path)
79
- except Exception:
80
- noisy_spec_path = None
81
- else:
82
- noisy_spec_path = None
83
- try:
84
- original_transcript = get_transcript(sample_stem)
85
- except Exception:
86
- original_transcript = "Unavailable"
87
-
88
- yield (
89
- gr.update(visible=True),
90
- None, # enhanced_audio: set only in final yield (smooth playback)
91
- gr.update(value=None), # enhanced_image: clear until step 3 (last)
92
- gr.update(value=noisy_spec_path), # noisy_image: input spectrogram (step 1)
93
- original_transcript,
94
- "",
95
- "",
96
- sample_stem,
97
- None,
98
- "",
99
- )
100
- # Let the UI render step 1 before we flood with polling updates
101
- time.sleep(0.2)
102
-
103
- thread = threading.Thread(target=worker, daemon=True)
104
- thread.start()
105
-
106
- poll_interval = 0.05
107
- while "result" not in result_holder and "error" not in result_holder:
108
- time.sleep(poll_interval)
109
- # 2) Realtime: stream transcripts only; audio set in final yield for smooth playback
110
- yield (
111
- gr.update(visible=True),
112
- gr.update(), # enhanced_audio: set only in final yield, then autoplay
113
- gr.update(), # enhanced_image: reveal only in step 3 (final yield)
114
- gr.update(), # noisy_image already set in step 1
115
- gr.update(), # original_transcript unchanged
116
- gr.update(value=progress_state.get("noisy", "")),
117
- gr.update(value=progress_state.get("enhanced", "")),
118
- gr.update(),
119
- gr.update(),
120
- gr.update(),
121
- )
122
-
123
- if "error" in result_holder:
124
- raise result_holder["error"]
125
-
126
- (
127
- enhanced_spec_path,
128
- enhanced_transcript,
129
- noisy_transcript_with_wer,
130
- enhanced_audio,
131
- last_stem,
132
- enhanced_array,
133
- precomputed_noisy,
134
- ) = result_holder["result"]
135
-
136
- # 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
137
- yield (
138
- gr.update(visible=True),
139
- enhanced_audio,
140
- enhanced_spec_path, # enhanced_image: show only now
141
- noisy_spec_path,
142
- original_transcript,
143
- noisy_transcript_with_wer,
144
- enhanced_transcript,
145
- last_stem,
146
- enhanced_array,
147
- precomputed_noisy,
148
- )
149
-
150
 
151
  _CSS_DIR = Path(__file__).resolve().parent / "assets"
152
  with gr.Blocks() as demo:
153
  sample_stem = gr.State("")
154
  last_sample_stem = gr.State("")
155
  input_array = gr.State()
156
- enhanced_array = gr.State()
157
- precomputed_noisy_transcript = gr.State("")
158
- current_sample_rate = gr.State(16000) # default sample rate for dataset samples; updated on local file load if different
159
-
160
  gr.HTML(
161
  '<a href="https://ai-coustics.com/" target="_blank">'
162
  '<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
@@ -182,80 +53,90 @@ with gr.Blocks() as demo:
182
  scale=2,
183
  )
184
 
185
- with gr.Group(elem_classes="panel"):
186
  with gr.Tab("Stream audio in real time") as stream_tab:
187
- gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read())
188
- with gr.Group(elem_classes="panel"):
189
- stream_state = gr.State(None)
190
- on_stream_tab = gr.State(True) # True on load: stream tab is the default first tab
191
- warmup_elapsed = gr.State(0)
192
- input_gain_db = gr.Slider(
193
- minimum=0,
194
- maximum=20,
195
- step=0.5,
196
- value=0,
197
- label="Input gain (dB)",
198
- )
199
- with gr.Row(elem_classes="stream-row"):
200
- with gr.Column(scale=4, min_width=200):
201
- audio_stream = gr.Audio(
202
- sources=["microphone"], streaming=True, elem_id="audio_stream"
203
- )
204
- with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
205
- active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
206
- with gr.Group(elem_classes="panel"):
207
- with gr.Column(scale=5, min_width=320):
208
- enhanced_text = gr.Textbox(
209
- label="Enhanced Transcribed Text", lines=6, autoscroll=False
210
  )
211
- with gr.Column(scale=5, min_width=320):
212
- raw_text = gr.Textbox(
213
- label="Raw Transcribed Text", lines=6, autoscroll=False
 
 
214
  )
215
- # Poll transcript globals so interim results show live (streamers update in background)
216
- transcript_timer = gr.Timer(0.1, active=True)
217
- transcript_timer.tick(
218
- get_live_transcripts,
219
- inputs=None,
220
- outputs=[enhanced_text, raw_text],
221
- show_progress="hidden",
222
- )
223
- warmup_timer = gr.Timer(0.5, active=True)
224
- warmup_timer.tick(
225
- warmup_tick,
226
- inputs=[on_stream_tab, warmup_elapsed, active_light],
227
- outputs=[warmup_elapsed, active_light, warmup_timer],
228
- show_progress="hidden",
229
- )
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
232
- with gr.Row():
233
- gr.Markdown(open("docs/dawn_chorus.md", "r", encoding="utf-8").read())
234
- dataset_dropdown = gr.Dropdown(
235
- choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
236
- )
237
- audio_file_from_dataset = gr.Audio(
238
- type="filepath", interactive=False, buttons=["download"], autoplay=False
239
- )
 
 
240
 
241
  with gr.Tab("Upload local file") as upload_tab:
242
- with gr.Row():
243
- gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
244
- audio_file_upload = gr.File(
245
- file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
246
- file_count="single",
247
- scale=3,
248
- )
249
- normalize = gr.Checkbox(label="Normalize audio", value=True)
250
- audio_preview = gr.Audio(
251
- label="Preview",
252
- autoplay=False,
253
- interactive=False,
254
- )
255
- enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
 
 
 
256
 
257
  with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
258
- result_title = gr.Markdown("")
259
  enhanced_audio = gr.Audio(
260
  type="numpy",
261
  interactive=False,
@@ -284,107 +165,75 @@ with gr.Blocks() as demo:
284
  )
285
 
286
  # ------------------------------------------------------
287
- # GLOBAL EVENTS (TAB CHANGES, MODEL CHANGES)
288
  # ------------------------------------------------------
289
- DEFAULT_DATASET_SAMPLE = "en_00412_i_h_36"
290
-
291
- def load_dataset_sample_on_tab_visit(dropdown_value):
292
- """Load the selected sample when visiting the Dataset tab; use default if dropdown is empty."""
293
- sample_id = dropdown_value or DEFAULT_DATASET_SAMPLE
294
- audio_path, arr, stem, sample_rate = load_file_from_dataset(sample_id)
295
- return sample_id, audio_path, arr, stem, sample_rate
296
-
297
  stream_tab.select(
298
  lambda: (
299
  gr.update(visible=False),
300
  gr.update(visible=False),
301
- gr.update(visible=True, interactive=True, streaming=True, sources=["microphone"]),
302
- True,
303
- 0,
304
- ACTIVE_LIGHT_GRAY,
305
- gr.update(active=True),
306
  ),
307
- inputs=None,
308
- outputs=[results_card, enhance_btn, audio_stream, on_stream_tab, warmup_elapsed, active_light, warmup_timer],
309
  )
310
 
 
 
 
 
 
 
 
 
 
 
311
  upload_tab.select(
312
- lambda: (
313
- *stop_online_backend(),
314
- False,
315
- ACTIVE_LIGHT_GRAY,
316
- ),
317
- inputs=None,
318
- outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
319
  ).then(
320
- lambda: gr.update(visible=True),
321
- outputs=enhance_btn,
 
322
  )
323
-
324
  dataset_tab.select(
325
- lambda: (
326
- *stop_online_backend(),
327
- False,
328
- ACTIVE_LIGHT_GRAY,
329
- ),
330
- inputs=None,
331
- outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
332
- ).then(
333
- lambda: gr.update(visible=True),
334
- outputs=enhance_btn,
335
  ).then(
336
- load_dataset_sample_on_tab_visit,
337
- inputs=[dataset_dropdown],
338
- outputs=[dataset_dropdown, audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
339
- )
340
-
341
- stt_model.change(
342
- fn=shutdown_streamers,
343
- ).then(
344
- clear_ui,
345
- inputs=None,
346
- outputs=[stream_state, enhanced_text, raw_text],
347
- ).then(
348
- set_stt_streamer,
349
- inputs=stt_model,
350
- outputs=None,
351
  )
352
-
353
  # ------------------------------------------------------
354
- # ONLINE PIPELINE EVENTS (DATASET + UPLOAD TABS)
355
  # ------------------------------------------------------
356
-
357
  audio_stream.stream(
358
- fn=transcribe_stream,
359
- inputs=[stream_state, audio_stream, enhancement_level, input_gain_db],
360
- outputs=[stream_state, enhanced_text, raw_text],
361
  stream_every=STREAM_EVERY,
362
  time_limit=60 * 2,
363
- concurrency_limit=1,
364
  )
365
 
366
  audio_stream.stop_recording(
367
  on_stop_recording,
 
 
 
368
  )
369
 
370
  audio_stream.start_recording(
371
- clear_ui,
372
- inputs=None,
373
- outputs=[stream_state, enhanced_text, raw_text],
374
- ).then(
375
- fn=set_stt_streamer,
376
- inputs=stt_model,
377
- outputs=None,
378
- )
379
 
380
  # ------------------------------------------------------
381
- # OFFLINE PIPELINE EVENTS (DATASET + UPLOAD TABS)
382
  # ------------------------------------------------------
383
 
384
  # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
385
  dataset_dropdown.change(
386
  lambda: gr.update(visible=False),
387
- inputs=None,
388
  outputs=results_card,
389
  ).then(
390
  load_file_from_dataset,
@@ -410,10 +259,15 @@ with gr.Blocks() as demo:
410
 
411
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
412
  enhance_btn.click(
413
- process_with_live_transcript,
414
- inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
 
 
 
 
 
 
415
  outputs=[
416
- results_card,
417
  enhanced_audio,
418
  enhanced_image,
419
  noisy_image,
@@ -421,15 +275,15 @@ with gr.Blocks() as demo:
421
  noisy_transcript,
422
  enhanced_transcript,
423
  last_sample_stem,
424
- enhanced_array,
425
- precomputed_noisy_transcript,
426
- ],
427
  )
428
 
429
  os.makedirs(APP_TMP_DIR, exist_ok=True)
430
  purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
431
  demo.queue()
432
  demo.launch(
433
- css=(_CSS_DIR / "active_light.css").read_text(encoding="utf-8"),
434
  allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
435
  )
 
1
  import os
 
 
2
  import gradio as gr
3
  from pathlib import Path
4
 
5
+ from constants import STREAM_EVERY, APP_TMP_DIR
6
+ from ui import LED_DOT_OFF
7
+ from hf_dataset_utils import ALL_FILES
8
 
9
  from stream_pipeline import (
10
+ on_start_recording,
 
11
  on_stop_recording,
 
 
 
12
  shutdown_streamers,
13
+ stream_step,
14
  )
15
  from offline_pipeline import (
16
  load_file_from_dataset,
17
  load_local_file,
18
+ run_offline_pipeline,
19
  )
 
20
  from clean_up import purge_tmp_directory, cleanup_previous_run
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  _CSS_DIR = Path(__file__).resolve().parent / "assets"
24
  with gr.Blocks() as demo:
25
  sample_stem = gr.State("")
26
  last_sample_stem = gr.State("")
27
  input_array = gr.State()
28
+ streaming_sr = gr.State(None)
29
+ current_sample_rate = gr.State(None)
30
+
 
31
  gr.HTML(
32
  '<a href="https://ai-coustics.com/" target="_blank">'
33
  '<img src="https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84" '
 
53
  scale=2,
54
  )
55
 
56
+ with gr.Tabs():
57
  with gr.Tab("Stream audio in real time") as stream_tab:
58
+ with gr.Row(equal_height=False, elem_classes="stream-layout"):
59
+ with gr.Column(scale=4, min_width=320):
60
+ with gr.Group(elem_classes="panel section-panel"):
61
+ gr.Markdown("### Input", elem_classes="title")
62
+ gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
63
+
64
+ input_gain_db = gr.Slider(
65
+ minimum=0,
66
+ maximum=20,
67
+ step=0.5,
68
+ value=0,
69
+ label="Input gain (dB)",
 
 
 
 
 
 
 
 
 
 
 
70
  )
71
+
72
+ audio_stream = gr.Audio(
73
+ sources=["microphone"],
74
+ streaming=True,
75
+ elem_id="audio_stream",
76
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ with gr.Column(scale=6, min_width=420):
79
+ with gr.Group(elem_classes="status-panel output-panel"):
80
+ gr.Markdown("### Live Output", elem_classes="title")
81
+ with gr.Row(equal_height=True, elem_classes="status-indicators"):
82
+ with gr.Group(elem_classes="status-card"):
83
+ gr.Markdown("**System Status**", elem_classes="status-card__label")
84
+ system_status_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
85
+ system_status_text = gr.Markdown(
86
+ value="Off",
87
+ elem_classes="status-card__subtext",
88
+ )
89
+
90
+ with gr.Group(elem_classes="status-card"):
91
+ gr.Markdown("**Voice Activity**", elem_classes="status-card__label")
92
+ vad_led = gr.HTML(value=LED_DOT_OFF, show_label=False)
93
+
94
+ with gr.Row(equal_height=True, elem_classes="transcript-row transcript-row--large"):
95
+ enhanced_text = gr.Textbox(
96
+ label="Enhanced Transcript",
97
+ lines=10,
98
+ autoscroll=False,
99
+ )
100
+ raw_text = gr.Textbox(
101
+ label="Raw Transcript",
102
+ lines=10,
103
+ autoscroll=False,
104
+ )
105
+
106
+
107
  with gr.Tab("Dataset: Dawn Chorus") as dataset_tab:
108
+ with gr.Group(elem_classes="panel section-panel"):
109
+ gr.Markdown("### Input", elem_classes="title")
110
+ gr.Markdown(open("docs/dawn_chorus.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
111
+
112
+ dataset_dropdown = gr.Dropdown(
113
+ choices=ALL_FILES, value="en_00412_i_h_36", label="Sample"
114
+ )
115
+ audio_file_from_dataset = gr.Audio(
116
+ type="filepath", interactive=False, buttons=["download"], autoplay=False
117
+ )
118
 
119
  with gr.Tab("Upload local file") as upload_tab:
120
+ with gr.Group(elem_classes="panel section-panel"):
121
+ gr.Markdown("### Input", elem_classes="title")
122
+ gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read(), elem_classes="tab-description")
123
+
124
+ audio_file_upload = gr.File(
125
+ file_types=[".wav", ".mp3", ".flac", ".m4a", ".ogg"],
126
+ file_count="single",
127
+ scale=3,
128
+ )
129
+ normalize = gr.Checkbox(label="Normalize audio", value=True)
130
+ audio_preview = gr.Audio(
131
+ label="Preview",
132
+ autoplay=False,
133
+ interactive=False,
134
+ )
135
+
136
+ enhance_btn = gr.Button("Enhance with Quail Voice Focus 2.0", scale=2, visible=False)
137
 
138
  with gr.Group(elem_classes="panel results-card", visible=False) as results_card:
139
+ result_title = gr.Markdown("", elem_classes="title")
140
  enhanced_audio = gr.Audio(
141
  type="numpy",
142
  interactive=False,
 
165
  )
166
 
167
  # ------------------------------------------------------
168
+ # TAB CHANGES
169
  # ------------------------------------------------------
 
 
 
 
 
 
 
 
170
  stream_tab.select(
171
  lambda: (
172
  gr.update(visible=False),
173
  gr.update(visible=False),
174
+ gr.update(sources=["microphone"], streaming=True, interactive= True),
 
 
 
 
175
  ),
176
+ outputs=[results_card, enhance_btn, audio_stream],
 
177
  )
178
 
179
+ def _on_not_streaming_tab():
180
+ shutdown_streamers()
181
+ return (
182
+ gr.update(streaming=False, interactive=False),
183
+ gr.update(visible=True),
184
+ LED_DOT_OFF,
185
+ LED_DOT_OFF,
186
+ "Off",
187
+ )
188
+
189
  upload_tab.select(
190
+ _on_not_streaming_tab,
191
+ outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
 
 
 
 
 
192
  ).then(
193
+ load_local_file,
194
+ inputs=[audio_file_upload, normalize],
195
+ outputs=[input_array, sample_stem, audio_preview, current_sample_rate],
196
  )
197
+
198
  dataset_tab.select(
199
+ _on_not_streaming_tab,
200
+ outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
 
 
 
 
 
 
 
 
201
  ).then(
202
+ load_file_from_dataset,
203
+ inputs=dataset_dropdown,
204
+ outputs=[audio_file_from_dataset, input_array, sample_stem, current_sample_rate],
 
 
 
 
 
 
 
 
 
 
 
 
205
  )
 
206
  # ------------------------------------------------------
207
+ # STREAMING EVENTS
208
  # ------------------------------------------------------
 
209
  audio_stream.stream(
210
+ stream_step,
211
+ inputs=[audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db],
212
+ outputs=[streaming_sr, system_status_led, system_status_text,enhanced_text, raw_text, vad_led],
213
  stream_every=STREAM_EVERY,
214
  time_limit=60 * 2,
215
+ concurrency_limit=1,
216
  )
217
 
218
  audio_stream.stop_recording(
219
  on_stop_recording,
220
+ outputs=[vad_led, system_status_led, system_status_text, streaming_sr],
221
+ ).then(
222
+ shutdown_streamers,
223
  )
224
 
225
  audio_stream.start_recording(
226
+ on_start_recording,
227
+ outputs=[enhanced_text, raw_text, system_status_led, system_status_text],
228
+ )
 
 
 
 
 
229
 
230
  # ------------------------------------------------------
231
+ # OFFLINE EVENTS (DATASET + LOCAL FILE)
232
  # ------------------------------------------------------
233
 
234
  # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
235
  dataset_dropdown.change(
236
  lambda: gr.update(visible=False),
 
237
  outputs=results_card,
238
  ).then(
239
  load_file_from_dataset,
 
259
 
260
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
261
  enhance_btn.click(
262
+ cleanup_previous_run,
263
+ inputs=[last_sample_stem]
264
+ ).then(
265
+ lambda: gr.update(visible=True),
266
+ outputs=results_card,
267
+ ).then(
268
+ run_offline_pipeline,
269
+ inputs=[input_array, current_sample_rate, enhancement_level, stt_model, sample_stem],
270
  outputs=[
 
271
  enhanced_audio,
272
  enhanced_image,
273
  noisy_image,
 
275
  noisy_transcript,
276
  enhanced_transcript,
277
  last_sample_stem,
278
+ ]
279
+ ).failure(
280
+ lambda: gr.Warning("Enhancement failed. Please refresh page and make sure you have a stable connection.")
281
  )
282
 
283
  os.makedirs(APP_TMP_DIR, exist_ok=True)
284
  purge_tmp_directory(max_age_minutes=0, tmp_dir=APP_TMP_DIR)
285
  demo.queue()
286
  demo.launch(
287
+ css=(_CSS_DIR / "styling.css").read_text(encoding="utf-8"),
288
  allowed_paths=[APP_TMP_DIR, "/tmp", "/"],
289
  )
assets/active_light.css DELETED
@@ -1,68 +0,0 @@
1
- /* Stream row: stretch columns to match audio height */
2
- .stream-row {
3
- align-items: stretch !important;
4
- }
5
-
6
- /* Active light column: flex container, fill height */
7
- .stream-row > div:last-child {
8
- display: flex !important;
9
- align-items: stretch !important;
10
- min-height: 100%;
11
- }
12
-
13
- /* Gradio block wrapper: fill and flex so child can stretch */
14
- .stream-row > div:last-child > div {
15
- display: flex !important;
16
- width: 100% !important;
17
- min-height: 100% !important;
18
- height: 100% !important;
19
- box-sizing: border-box;
20
- }
21
-
22
- /* All divs in light column: fill so panel stretches */
23
- .stream-row > div:last-child div {
24
- width: 100% !important;
25
- min-height: 100% !important;
26
- height: 100% !important;
27
- box-sizing: border-box;
28
- }
29
-
30
- /* Active light panel: fill container, base styles */
31
- .active-light {
32
- width: 100% !important;
33
- min-height: 100% !important;
34
- height: 100% !important;
35
- box-sizing: border-box;
36
- border-radius: 8px;
37
- border: 1px solid var(--border-color-primary);
38
- transition: background 0.2s, box-shadow 0.2s;
39
- padding: 0.5rem;
40
- }
41
-
42
- /* Warming up state */
43
- .active-light--off {
44
- background: #555;
45
- box-shadow: inset 0 2px 8px rgba(0, 0, 0, 0.4);
46
- }
47
-
48
- /* Ready state */
49
- .active-light--on {
50
- background: #c00;
51
- box-shadow:
52
- 0 0 16px rgba(220, 0, 0, 0.5),
53
- inset 0 0 12px rgba(255, 80, 80, 0.3);
54
- }
55
-
56
- /* Label text */
57
- .active-light__label {
58
- font-size: 12px;
59
- font-weight: 500;
60
- }
61
-
62
- .active-light--off .active-light__label {
63
- color: rgba(255, 255, 255, 0.85);
64
- }
65
-
66
- .active-light--on .active-light__label {
67
- color: #fff;
68
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/styling.css ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .status-panel {
2
+ padding: 16px;
3
+ border-radius: 16px;
4
+ border: 1px solid #27272a;
5
+ background: #111827;
6
+ }
7
+
8
+ .title, .tab-description {
9
+ margin-left: 2px;
10
+ margin-top: 2px;
11
+ margin-bottom: 0px;
12
+ }
13
+
14
+ .transcript-row {
15
+ gap: 12px;
16
+ }
17
+
18
+ .transcript-row--large textarea {
19
+ min-height: 260px;
20
+ }
21
+
22
+ .status-card {
23
+ height: 50px;
24
+ border: 1px solid #3f3f46;
25
+ border-radius: 14px;
26
+ background: #18181b;
27
+ padding: 5px 5px;
28
+ display: flex;
29
+ flex-direction: column;
30
+ justify-content: center;
31
+ align-items: center;
32
+ box-sizing: border-box;
33
+ }
34
+
35
+
36
+ .status-card__label {
37
+ text-align: center;
38
+ margin-bottom: 0px;
39
+ color: #e4e4e7;
40
+ }
41
+
42
+ .status-card__subtext {
43
+ margin-top: 0px;
44
+ font-size: 0.85rem;
45
+ color: #a1a1aa;
46
+ text-align: center;
47
+ }
48
+
49
+ .led-dot {
50
+ width: 30px;
51
+ height: 30px;
52
+ border-radius: 9999px;
53
+ border: 1px solid #71717a;
54
+ margin: 0 auto;
55
+ }
56
+
57
+ .led-dot--black {
58
+ background: #1c1c1c;
59
+ box-shadow: none;
60
+ }
61
+
62
+ .led-dot--red {
63
+ background: #ef4444;
64
+ box-shadow: 0 0 14px rgba(239, 68, 68, 0.85);
65
+ }
66
+
67
+ .led-dot--green {
68
+ background: #22c55e;
69
+ box-shadow: 0 0 14px rgba(34, 197, 94, 0.85);
70
+ }
71
+
72
+ .led-dot--yellow {
73
+ background: #facc15;
74
+ box-shadow: 0 0 14px rgba(250, 204, 21, 0.85);
75
+ }
76
+
77
+
78
+
79
+ .led-dot--off {
80
+ background: #3f3f46;
81
+ box-shadow: none;
82
+ }
clean_up.py CHANGED
@@ -2,7 +2,6 @@ import os
2
  import time
3
  from loguru import logger
4
  from constants import MINUTES_KEEP, APP_TMP_DIR
5
- import gradio as gr
6
 
7
 
8
  def purge_tmp_directory(
@@ -93,11 +92,9 @@ def cleanup_previous_run(
93
  sample_stem: str,
94
  tmp_dir: str = APP_TMP_DIR,
95
  max_age_minutes: int = MINUTES_KEEP,
96
- ) -> tuple[None, None, str, str, str]:
97
- gr.Info("Processing started. This may take a moment. Please do not refresh or close the window.")
98
  try:
99
  remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
100
  except Exception as e:
101
  print(f"Failed to delete last run with id {sample_stem}: {e}")
102
- purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
103
- return None, None, "", "", ""
 
2
  import time
3
  from loguru import logger
4
  from constants import MINUTES_KEEP, APP_TMP_DIR
 
5
 
6
 
7
  def purge_tmp_directory(
 
92
  sample_stem: str,
93
  tmp_dir: str = APP_TMP_DIR,
94
  max_age_minutes: int = MINUTES_KEEP,
95
+ ):
 
96
  try:
97
  remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
98
  except Exception as e:
99
  print(f"Failed to delete last run with id {sample_stem}: {e}")
100
+ purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
 
constants.py CHANGED
@@ -19,7 +19,6 @@ MIX_DIR: Final = "mix"
19
  SPEECH_DIR: Final = "speech"
20
  TRANS_DIR: Final = "transcripts"
21
 
22
- DEFAULT_SR: Final = 16000
23
  STREAM_EVERY: Final = 0.2
24
  WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
25
 
@@ -30,3 +29,6 @@ STREAMER_CLASSES: Final = {
30
  "Deepgram Nova-3 RT": DeepgramStreamer,
31
  "Soniox STT-RT v3": SonioxStreamer,
32
  }
 
 
 
 
19
  SPEECH_DIR: Final = "speech"
20
  TRANS_DIR: Final = "transcripts"
21
 
 
22
  STREAM_EVERY: Final = 0.2
23
  WARMUP_SECONDS: Final = 2 # seconds before "recording ready" light turns on
24
 
 
29
  "Deepgram Nova-3 RT": DeepgramStreamer,
30
  "Soniox STT-RT v3": SonioxStreamer,
31
  }
32
+
33
+ VAD_ON: Final = "🟢"
34
+ VAD_OFF: Final = "⚫"
offline_pipeline.py CHANGED
@@ -1,153 +1,277 @@
1
  import os
2
- from random import sample
3
 
4
  import gradio as gr
5
- import soundfile as sf
6
- from sdk import SDKWrapper
7
- from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
8
- from hf_dataset_utils import get_audio, get_transcript
9
- from constants import APP_TMP_DIR, STREAMER_CLASSES
10
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
12
 
13
- def _close_stt_stream(streamer) -> None:
14
- """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  if hasattr(streamer, "close_stream"):
16
  streamer.close_stream()
17
  else:
18
  streamer.close()
19
 
20
- def run_offline_pipeline_streaming(
21
- sample: np.ndarray,
22
- sample_rate: int,
23
- enhancement_level: float,
24
- sample_id: str,
25
- stt_model: str,
26
- progress_state: dict,
27
- ) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
28
- """Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
29
- via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
30
- only at the end; the app plays it automatically when processing is complete.
31
- Returns same tuple as run_offline_pipeline_ordered."""
32
- if sample is None:
33
- raise ValueError("No audio to enhance. Please upload a file first.")
34
- sample = np.asarray(sample, dtype=np.float32).flatten()
35
-
36
- sdk = SDKWrapper()
37
- sdk.init_processor(
38
  sample_rate=sample_rate,
39
- enhancement_level=float(enhancement_level) / 100.0,
40
  )
41
- chunk_size = sdk.num_frames
 
42
 
43
- # Sync transcript callbacks so both boxes update together
44
- progress_state["noisy_pending"] = ""
45
- progress_state["enhanced_pending"] = ""
46
- progress_state["noisy_has_sent"] = False
47
- progress_state["enhanced_has_sent"] = False
48
 
49
- def _flush_both():
50
- if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
51
- progress_state["noisy"] = progress_state.get("noisy_pending", "")
52
- progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
 
 
 
 
53
 
54
- def on_noisy(t: str):
55
- progress_state["noisy_pending"] = t
56
- progress_state["noisy_has_sent"] = True
57
- _flush_both()
58
 
59
- def on_enhanced(t: str):
60
- progress_state["enhanced_pending"] = t
61
- progress_state["enhanced_has_sent"] = True
62
- _flush_both()
 
 
 
63
 
64
- if stt_model not in STREAMER_CLASSES:
65
- raise ValueError(f"Unknown STT model: {stt_model}")
66
- StreamerClass = STREAMER_CLASSES[stt_model]
67
- streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
68
- streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  accumulated_enhanced: list[np.ndarray] = []
 
71
  n = len(sample)
72
 
73
  for i in range(0, n, chunk_size):
74
  raw_chunk = sample[i : i + chunk_size]
75
- if raw_chunk.size < chunk_size:
 
 
76
  raw_chunk = np.pad(
77
  raw_chunk,
78
- (0, chunk_size - raw_chunk.size),
79
  mode="constant",
80
  constant_values=0.0,
81
  )
82
- raw_2d = raw_chunk.reshape(1, -1)
83
- enhanced_chunk = sdk.process_chunk(raw_2d)
84
- enhanced_1d = np.asarray(enhanced_chunk).flatten()
 
85
  streamer_noisy.process_chunk(raw_chunk)
86
  streamer_enhanced.process_chunk(enhanced_1d)
87
  accumulated_enhanced.append(enhanced_1d)
88
 
89
- _close_stt_stream(streamer_noisy)
90
- _close_stt_stream(streamer_enhanced)
91
- streamer_noisy.finished_event.wait()
92
- streamer_enhanced.finished_event.wait()
 
 
93
 
94
- with streamer_noisy.lock:
95
- noisy_transcript = streamer_noisy.render_tokens(streamer_noisy.final_tokens, [])
96
- with streamer_enhanced.lock:
97
- enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
98
 
99
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
100
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
101
 
102
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
103
- spec_image(enhanced_array).save(enhanced_spec_path)
104
- progress_state["enhanced_spec_path"] = enhanced_spec_path
105
 
106
- precomputed_noisy = noisy_transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  try:
108
  original_transcript = get_transcript(sample_id)
109
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
110
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
111
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
112
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
113
  except Exception:
114
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  return (
 
117
  enhanced_spec_path,
118
- enhanced_transcript,
 
119
  noisy_transcript,
120
- gradio_enhanced_audio,
121
  sample_id,
122
- enhanced_array,
123
- precomputed_noisy,
124
  )
125
 
 
126
  def load_local_file(
127
  sample_path: str,
128
  normalize: bool = True,
129
- ) -> tuple[np.ndarray | None, str, tuple | None, int]:
130
  if not sample_path or not os.path.exists(sample_path):
131
- return None, "", None
 
132
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
133
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
134
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
 
135
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
136
- y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
 
 
137
  if normalize:
138
  y = normalize_lufs(y, sample_rate)
139
  gradio_audio = to_gradio_audio(y, sample_rate)
140
  return y, new_sample_stem, gradio_audio, sample_rate
141
 
142
- def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
 
 
 
143
  if not sample_id:
144
  gr.Warning("Please select a sample from the dropdown.")
145
  return None, None, "", None
 
146
  new_sample_stem = sample_id
 
147
  try:
148
  y, sample_rate = get_audio(sample_id, prefix="mix")
149
- y_for_gradio = to_gradio_audio(y, sample_rate)
150
- except Exception as e: # Convert to 16-bit PCM for gradio audio component
151
- gr.Warning(f"{e}")
152
- raise e
153
- return y_for_gradio, y, new_sample_stem, sample_rate
 
 
 
 
1
  import os
2
+ from typing import Any
3
 
4
  import gradio as gr
 
 
 
 
 
5
  import numpy as np
6
+ import librosa
7
+
8
+ from constants import APP_TMP_DIR, STREAMER_CLASSES
9
+ from hf_dataset_utils import get_audio, get_transcript
10
+ from sdk import SDKParams, SDKWrapper
11
+ from utils import (
12
+ compute_wer,
13
+ get_vad_labels,
14
+ normalize_lufs,
15
+ spec_image,
16
+ to_gradio_audio,
17
+ )
18
 
19
+ SDK_OFFLINE = SDKWrapper()
20
 
21
+
22
+ def _safe_progress(progress: gr.Progress, value: float, desc: str) -> None:
23
+ progress(max(0.0, min(1.0, value)), desc=desc)
24
+
25
+
26
+ def _empty_pipeline_result(sample_id: str) -> tuple[Any, str, str, str, str, str, str]:
27
+ return (
28
+ None,
29
+ "",
30
+ "",
31
+ "Unavailable",
32
+ "Unavailable",
33
+ "Unavailable",
34
+ sample_id,
35
+ )
36
+
37
+
38
+ def _finalize_stream_transcript(streamer) -> str:
39
  if hasattr(streamer, "close_stream"):
40
  streamer.close_stream()
41
  else:
42
  streamer.close()
43
 
44
+ streamer.finished_event.wait()
45
+ with streamer.lock:
46
+ return streamer.render_tokens(streamer.final_tokens, [])
47
+
48
+
49
+ def _init_sdk(sample_rate: int, enhancement_level: int) -> int:
50
+ sdk_params = SDKParams(
 
 
 
 
 
 
 
 
 
 
 
51
  sample_rate=sample_rate,
52
+ enhancement_level=enhancement_level / 100.0,
53
  )
54
+ SDK_OFFLINE.init_processor(sdk_params)
55
+ return SDK_OFFLINE.num_frames
56
 
 
 
 
 
 
57
 
58
+ def _init_streamers(
59
+ sample_rate: int,
60
+ stt_model: str,
61
+ sample_id: str,
62
+ progress: gr.Progress,
63
+ ):
64
+ if stt_model not in STREAMER_CLASSES:
65
+ raise ValueError(f"Unknown STT model: {stt_model}")
66
 
67
+ streamer_class = STREAMER_CLASSES[stt_model]
 
 
 
68
 
69
+ _safe_progress(progress, 0.12, f"Initializing {stt_model} stream 1/2...")
70
+ streamer_noisy = streamer_class(sample_rate, f"{sample_id}_noisy")
71
+
72
+ _safe_progress(progress, 0.18, f"Initializing {stt_model} stream 2/2...")
73
+ streamer_enhanced = streamer_class(sample_rate, f"{sample_id}_enhanced")
74
+
75
+ return streamer_noisy, streamer_enhanced
76
 
 
 
 
 
 
77
 
78
+ def _attach_wer(
79
+ original_transcript: str,
80
+ noisy_transcript: str,
81
+ enhanced_transcript: str,
82
+ ) -> tuple[str, str]:
83
+ wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
84
+ wer_noisy = compute_wer(original_transcript, noisy_transcript)
85
+
86
+ noisy_transcript = f"{noisy_transcript} (WER: {wer_noisy * 100:.2f}%)"
87
+ enhanced_transcript = f"{enhanced_transcript} (WER: {wer_enhanced * 100:.2f}%)"
88
+ return noisy_transcript, enhanced_transcript
89
+
90
+
91
+ def _process_audio_chunks(
92
+ sample: np.ndarray,
93
+ sample_rate: int,
94
+ chunk_size: int,
95
+ streamer_noisy,
96
+ streamer_enhanced,
97
+ progress: gr.Progress,
98
+ ) -> tuple[np.ndarray, list[list[float]]]:
99
  accumulated_enhanced: list[np.ndarray] = []
100
+ vad_timestamps: list[list[float]] = []
101
  n = len(sample)
102
 
103
  for i in range(0, n, chunk_size):
104
  raw_chunk = sample[i : i + chunk_size]
105
+ original_chunk_len = raw_chunk.size
106
+
107
+ if original_chunk_len < chunk_size:
108
  raw_chunk = np.pad(
109
  raw_chunk,
110
+ (0, chunk_size - original_chunk_len),
111
  mode="constant",
112
  constant_values=0.0,
113
  )
114
+
115
+ enhanced_chunk = SDK_OFFLINE.process_chunk(raw_chunk.reshape(1, -1))
116
+ enhanced_1d = np.asarray(enhanced_chunk, dtype=np.float32).flatten()
117
+
118
  streamer_noisy.process_chunk(raw_chunk)
119
  streamer_enhanced.process_chunk(enhanced_1d)
120
  accumulated_enhanced.append(enhanced_1d)
121
 
122
+ loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
123
+ _safe_progress(
124
+ progress,
125
+ 0.20 + 0.50 * loop_progress,
126
+ "Enhancing audio...",
127
+ )
128
 
129
+ if SDK_OFFLINE.vad_context.is_speech_detected():
130
+ start_in_sec = i / sample_rate
131
+ end_in_sec = min(i + original_chunk_len, n) / sample_rate
132
+ vad_timestamps.append([start_in_sec, end_in_sec])
133
 
134
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
135
+ return enhanced_array, vad_timestamps
136
 
 
 
 
137
 
138
+ def _save_spectrograms(
139
+ sample: np.ndarray,
140
+ enhanced_array: np.ndarray,
141
+ sample_rate: int,
142
+ sample_id: str,
143
+ vad_timestamps: list[list[float]],
144
+ ) -> tuple[str, str]:
145
+ os.makedirs(APP_TMP_DIR, exist_ok=True)
146
+
147
+ enhanced_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_enhanced_spectrogram.png")
148
+ noisy_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_noisy_spectrogram.png")
149
+
150
+ spec_image(enhanced_array, sr=sample_rate, vad_timestamps=vad_timestamps).save(enhanced_spec_path)
151
+ spec_image(sample, sr=sample_rate, vad_timestamps=vad_timestamps).save(noisy_spec_path)
152
+
153
+ return enhanced_spec_path, noisy_spec_path
154
+
155
+
156
+ def run_offline_pipeline(
157
+ sample: np.ndarray,
158
+ sample_rate: int,
159
+ enhancement_level: int,
160
+ stt_model: str,
161
+ sample_id: str,
162
+ progress=gr.Progress(),
163
+ ) -> tuple[Any, str, str, str, str, str, str]:
164
+ _safe_progress(progress, 0.00, "Starting...")
165
+
166
+ if sample is None or len(sample) == 0:
167
+ gr.Warning("No audio to enhance. Please upload a file first.")
168
+ return _empty_pipeline_result(sample_id)
169
+
170
+ _safe_progress(progress, 0.05, "Initializing enhancement...")
171
+ chunk_size = _init_sdk(sample_rate, enhancement_level)
172
+
173
+ try:
174
+ streamer_noisy, streamer_enhanced = _init_streamers(
175
+ sample_rate=sample_rate,
176
+ stt_model=stt_model,
177
+ sample_id=sample_id,
178
+ progress=progress,
179
+ )
180
+ except Exception as e:
181
+ raise RuntimeError(f"Failed to initialize STT streaming: {e}") from e
182
+
183
+ enhanced_array, vad_timestamps = _process_audio_chunks(
184
+ sample=sample,
185
+ sample_rate=sample_rate,
186
+ chunk_size=chunk_size,
187
+ streamer_noisy=streamer_noisy,
188
+ streamer_enhanced=streamer_enhanced,
189
+ progress=progress,
190
+ )
191
+
192
+ _safe_progress(progress, 0.72, "Finalizing transcripts...")
193
+ noisy_transcript = _finalize_stream_transcript(streamer_noisy)
194
+ _safe_progress(progress, 0.80, "Finalizing transcripts...")
195
+ enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
196
+
197
+ _safe_progress(progress, 0.94, "Loading reference transcript...")
198
  try:
199
  original_transcript = get_transcript(sample_id)
 
 
 
 
200
  except Exception:
201
+ original_transcript = "Unavailable"
202
+ if original_transcript != "Unavailable":
203
+ _safe_progress(progress, 0.96, "Computing WER...")
204
+ noisy_transcript, enhanced_transcript = _attach_wer(
205
+ original_transcript=original_transcript,
206
+ noisy_transcript=noisy_transcript,
207
+ enhanced_transcript=enhanced_transcript,
208
+ )
209
+
210
+ _safe_progress(progress, 0.99, "Generating outputs...")
211
+ gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
212
+ enhanced_spec_path, noisy_spec_path = _save_spectrograms(
213
+ sample=sample,
214
+ enhanced_array=enhanced_array,
215
+ sample_rate=sample_rate,
216
+ sample_id=sample_id,
217
+ vad_timestamps=vad_timestamps
218
+ )
219
+
220
+ vad_labels = get_vad_labels(
221
+ vad_timestamps,
222
+ length=len(sample) / sample_rate,
223
+ )
224
+
225
+ _safe_progress(progress, 1.00, "Done.")
226
 
227
  return (
228
+ gr.update(value=gradio_enhanced_audio, subtitles=vad_labels),
229
  enhanced_spec_path,
230
+ noisy_spec_path,
231
+ original_transcript,
232
  noisy_transcript,
233
+ enhanced_transcript,
234
  sample_id,
 
 
235
  )
236
 
237
+
238
  def load_local_file(
239
  sample_path: str,
240
  normalize: bool = True,
241
+ ) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
242
  if not sample_path or not os.path.exists(sample_path):
243
+ return None, "", None, None
244
+
245
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
246
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
247
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
248
+
249
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
250
+ y, sample_rate = librosa.load(sample_path, sr=None, mono=True)
251
+ sample_rate = int(sample_rate)
252
+ y = np.asarray(y, dtype=np.float32)
253
  if normalize:
254
  y = normalize_lufs(y, sample_rate)
255
  gradio_audio = to_gradio_audio(y, sample_rate)
256
  return y, new_sample_stem, gradio_audio, sample_rate
257
 
258
+
259
+ def load_file_from_dataset(
260
+ sample_id: str,
261
+ ) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
262
  if not sample_id:
263
  gr.Warning("Please select a sample from the dropdown.")
264
  return None, None, "", None
265
+
266
  new_sample_stem = sample_id
267
+
268
  try:
269
  y, sample_rate = get_audio(sample_id, prefix="mix")
270
+ except Exception as e:
271
+ gr.Warning(str(e))
272
+ raise
273
+ y = np.asarray(y, dtype=np.float32)
274
+ if y.ndim > 1:
275
+ y = np.mean(y, axis=0)
276
+ gradio_audio = to_gradio_audio(y, sample_rate)
277
+ return gradio_audio, y, new_sample_stem, sample_rate
sdk.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  from dotenv import load_dotenv
3
  import aic_sdk as aic
@@ -8,6 +9,24 @@ from constants import MODEL_ID
8
  load_dotenv()
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  class SDKWrapper:
12
  def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
13
  if os.getenv("AIC_SDK_KEY") is None:
@@ -16,25 +35,25 @@ class SDKWrapper:
16
  model_path = aic.Model.download(model_id, models_dir)
17
  self.model = aic.Model.from_file(model_path)
18
 
19
-
20
- def init_processor(self, sample_rate: int, enhancement_level: float, allow_variable_frames: bool = False, num_frames: int | None = None,num_channels: int = 1, sync: bool = True):
21
- self.processor_sample_rate = sample_rate
22
- processor_optimal_frames = self.model.get_optimal_num_frames(sample_rate)
23
- self.num_frames = num_frames if num_frames else processor_optimal_frames
24
- config = aic.ProcessorConfig(
25
- sample_rate=sample_rate,
26
- num_channels=num_channels,
27
  num_frames=self.num_frames,
28
- allow_variable_frames=allow_variable_frames,
29
  )
30
- if sync:
31
- processor = aic.Processor(self.model, self.sdk_key, config)
32
  else:
33
- processor = aic.ProcessorAsync(self.model, self.sdk_key, config)
34
- processor.get_processor_context().set_parameter(
35
- aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
36
  )
37
- self.processor = processor
 
38
 
39
  def change_enhancement_level(self, enhancement_level: float):
40
  if not hasattr(self, "processor"):
@@ -42,6 +61,7 @@ class SDKWrapper:
42
  self.processor.get_processor_context().set_parameter(
43
  aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
44
  )
 
45
 
46
  def _check_shape(self, audio: np.ndarray) -> np.ndarray:
47
  if len(audio.shape) == 1:
@@ -50,15 +70,17 @@ class SDKWrapper:
50
  raise ValueError("Expected audio with shape (n, frames)")
51
  return audio
52
 
53
- def process_sync(
54
  self,
55
  audio: np.ndarray,
56
- ) -> np.ndarray:
57
  """
58
  audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
59
  """
60
  audio = self._check_shape(audio)
61
  out = np.zeros_like(audio)
 
 
62
  chunk_size = self.num_frames
63
  n = audio.shape[1]
64
  for i in range(0, n, chunk_size):
@@ -72,7 +94,11 @@ class SDKWrapper:
72
  break
73
  enhanced = self.processor.process(chunk)
74
  out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
75
- return out
 
 
 
 
76
 
77
  def process_chunk(self, audio: np.ndarray) -> np.ndarray:
78
  audio = self._check_shape(audio)
 
1
+
2
  import numpy as np
3
  from dotenv import load_dotenv
4
  import aic_sdk as aic
 
9
  load_dotenv()
10
 
11
 
12
+ class SDKParams:
13
+ def __init__(
14
+ self,
15
+ sample_rate: int = 16000,
16
+ enhancement_level: float = 1.0,
17
+ allow_variable_frames: bool = False,
18
+ num_channels: int = 1,
19
+ sync: bool = True,
20
+ num_frames: int | None = None,
21
+
22
+ ):
23
+ self.sample_rate = sample_rate
24
+ self.enhancement_level = enhancement_level
25
+ self.allow_variable_frames = allow_variable_frames
26
+ self.num_channels = num_channels
27
+ self.sync = sync
28
+ self.num_frames = num_frames # to be set after processor init
29
+
30
  class SDKWrapper:
31
  def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
32
  if os.getenv("AIC_SDK_KEY") is None:
 
35
  model_path = aic.Model.download(model_id, models_dir)
36
  self.model = aic.Model.from_file(model_path)
37
 
38
+ def init_processor(self, sdk_params: SDKParams):
39
+ optimal_frames = self.model.get_optimal_num_frames(sdk_params.sample_rate)
40
+ self.num_frames = sdk_params.num_frames if sdk_params.num_frames else optimal_frames
41
+ self.sample_rate = sdk_params.sample_rate
42
+ aic_config = aic.ProcessorConfig(
43
+ sample_rate=sdk_params.sample_rate,
44
+ num_channels=sdk_params.num_channels,
 
45
  num_frames=self.num_frames,
46
+ allow_variable_frames=sdk_params.allow_variable_frames,
47
  )
48
+ if sdk_params.sync:
49
+ self.processor = aic.Processor(self.model, self.sdk_key, aic_config)
50
  else:
51
+ self.processor = aic.ProcessorAsync(self.model, self.sdk_key, aic_config)
52
+ self.processor.get_processor_context().set_parameter(
53
+ aic.ProcessorParameter.EnhancementLevel, float(sdk_params.enhancement_level)
54
  )
55
+ self.enhancement_level = sdk_params.enhancement_level
56
+ self.vad_context = self.processor.get_vad_context()
57
 
58
  def change_enhancement_level(self, enhancement_level: float):
59
  if not hasattr(self, "processor"):
 
61
  self.processor.get_processor_context().set_parameter(
62
  aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
63
  )
64
+ self.enhancement_level = enhancement_level
65
 
66
  def _check_shape(self, audio: np.ndarray) -> np.ndarray:
67
  if len(audio.shape) == 1:
 
70
  raise ValueError("Expected audio with shape (n, frames)")
71
  return audio
72
 
73
+ def process_with_vad(
74
  self,
75
  audio: np.ndarray,
76
+ ) -> tuple[np.ndarray, bool]:
77
  """
78
  audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
79
  """
80
  audio = self._check_shape(audio)
81
  out = np.zeros_like(audio)
82
+ vad_per_sample = np.zeros_like(audio, dtype=bool)
83
+ vad_overall = False
84
  chunk_size = self.num_frames
85
  n = audio.shape[1]
86
  for i in range(0, n, chunk_size):
 
94
  break
95
  enhanced = self.processor.process(chunk)
96
  out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
97
+ if self.vad_context.is_speech_detected():
98
+ vad_per_sample[:, i : i + chunk_size] = True
99
+ if vad_per_sample.mean() > 0.5:
100
+ vad_overall = True
101
+ return out, vad_overall
102
 
103
  def process_chunk(self, audio: np.ndarray) -> np.ndarray:
104
  audio = self._check_shape(audio)
stream_pipeline.py CHANGED
@@ -1,191 +1,166 @@
1
- import gradio as gr
2
  import numpy as np
3
- import soxr
4
- from constants import DEFAULT_SR, STREAMER_CLASSES
5
  from stt_streamers import DeepgramStreamer
6
- from sdk import SDKWrapper
7
- from dataclasses import dataclass
 
 
 
 
 
 
 
8
 
9
- # ----------------------------
10
- # Global transcript store (UI pulls from this)
11
- # ----------------------------
12
- _ENHANCED_TRANSCRIPT: str = ""
13
- _RAW_TRANSCRIPT: str = ""
14
 
15
  def _set_transcript_enhanced(text: str) -> None:
16
- """Deepgram callback: update latest transcript text (no printing)."""
17
  global _ENHANCED_TRANSCRIPT
18
  _ENHANCED_TRANSCRIPT = text
19
 
 
20
  def _set_transcript_raw(text: str) -> None:
21
- """Deepgram callback: update latest transcript text (no printing)."""
22
  global _RAW_TRANSCRIPT
23
  _RAW_TRANSCRIPT = text
24
 
25
 
26
- def get_live_transcripts() -> tuple[str, str]:
27
- """Return current enhanced and raw transcript for live UI updates."""
28
- return _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
 
29
 
30
 
31
- SDK = SDKWrapper()
32
- SDK.init_processor(
33
- sample_rate=DEFAULT_SR,
34
- enhancement_level=1.0,
35
- allow_variable_frames=True, # streaming chunks are variable-sized
36
- num_channels=1,
37
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # Created on first start_recording (lazy) to avoid Soniox "No audio received" timeout at app load
40
- Streamer_enhanced = None
41
- Streamer_raw = None
42
- _streamer_generation = 0
43
- _last_stop_generation = 1 # so first stop doesn't skip (1 > 1 is False)
44
-
45
-
46
- @dataclass
47
- class EnhanceSession:
48
- pending: np.ndarray # 1D float32 @ processor sample rate
49
- sr: int
50
- num_frames: int
51
- @dataclass
52
- class StreamSession:
53
- # nur was du wirklich brauchst
54
- resampler: soxr.ResampleStream | None
55
- sr_in: int | None
56
- tail_16k: np.ndarray # ring buffer (z.B. letzte 10s)
57
- tail_max: int # max samples
58
-
59
- def _get_or_init_session(session: StreamSession | None, sr_in: int) -> StreamSession:
60
- if session is None or session.sr_in != sr_in:
61
- # ResampleStream ist für real-time processing gedacht citeturn8view0
62
- resampler = None if sr_in == DEFAULT_SR else soxr.ResampleStream(sr_in, DEFAULT_SR, num_channels=1, dtype="float32")
63
- return StreamSession(resampler=resampler, sr_in=sr_in, tail_16k=np.zeros((0,), dtype=np.float32), tail_max=10 * DEFAULT_SR)
64
- return session
65
 
66
  def _to_float32_mono(y: np.ndarray) -> np.ndarray:
67
- # Gradio liefert int16 (oder (samples, channels)). citeturn1view4
68
  y = np.asarray(y)
69
  if y.ndim > 1:
70
  y = y.mean(axis=1)
71
  if y.dtype == np.int16:
72
- y = (y.astype(np.float32) / 32768.0)
73
  else:
74
  y = y.astype(np.float32)
75
- return y
76
 
77
 
78
- def transcribe_stream(session: StreamSession | None, new_chunk, enhancement_level, input_gain_db: float = 0.0):
79
- if (
80
- Streamer_enhanced is None
 
 
 
81
  or Streamer_raw is None
82
- or Streamer_enhanced.ws is None
83
- or Streamer_raw.ws is None
84
- ):
85
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
86
- if new_chunk is None or new_chunk[1] is None:
87
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
88
-
89
- sr, y = new_chunk
90
- y = _to_float32_mono(y)
91
- # Apply input gain: linear = 10^(dB/20), clip to avoid overflow
92
- if input_gain_db is not None and input_gain_db > 0:
93
- gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
94
- y = (y * gain_linear).astype(np.float32)
95
- y = np.clip(y, -1.0, 1.0)
96
-
97
- session = _get_or_init_session(session, sr)
98
- SDK.change_enhancement_level(float(enhancement_level) / 100.0)
99
- if session.resampler is not None:
100
- y_16k = session.resampler.resample_chunk(y)
101
- else:
102
- y_16k = y
103
-
104
- # Ensure 1D float32 for SDK and streamers (resample_chunk can return 0 samples or 2D)
105
- y_16k = np.asarray(y_16k, dtype=np.float32).flatten()
106
-
107
- # Ringbuffer (nicht unendlich konkatenieren)
108
- if y_16k.size > 0:
109
- tail = np.concatenate([session.tail_16k, y_16k])
110
- if tail.size > session.tail_max:
111
- tail = tail[-session.tail_max:]
112
- session.tail_16k = tail
113
-
114
- # Only send when we have samples (resample_chunk can return empty; SDK needs valid input)
115
- if y_16k.size == 0:
116
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
117
-
118
- # Parallel path: send raw to STT immediately, then enhance and send enhanced.
119
- # SDK requires fixed num_frames (AudioConfigMismatchError if we use process_chunk with variable size).
120
- Streamer_raw.process_chunk(y_16k)
121
- enhanced_chunk_16k = SDK.process_sync(y_16k)
122
- out_1d = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
123
- # Always send something to enhanced so Soniox doesn't close with "No audio received"
124
- if out_1d.size > 0:
125
- Streamer_enhanced.process_chunk(out_1d)
126
- else:
127
- Streamer_enhanced.process_chunk(np.zeros(160, dtype=np.float32))
128
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
129
 
 
 
 
 
130
 
131
- def shutdown_streamers(from_stop_recording: bool = False):
132
- """Shut down STT streamers. If from_stop_recording, skip when streamers were
133
- created after the last stop (avoids delayed stop killing new streamers)."""
134
- global Streamer_enhanced, Streamer_raw, _streamer_generation, _last_stop_generation
135
- if from_stop_recording and _streamer_generation > _last_stop_generation:
136
- return
137
- gen = _streamer_generation
138
  try:
139
- if Streamer_enhanced is not None and Streamer_enhanced.ws is not None:
140
- Streamer_enhanced.shutdown()
141
- if Streamer_raw is not None and Streamer_raw.ws is not None:
142
- Streamer_raw.shutdown()
143
- except Exception:
144
- print("Failed to shutdown streamers.")
145
- finally:
146
- Streamer_enhanced = None
147
- Streamer_raw = None
148
- if from_stop_recording:
149
- _last_stop_generation = gen
150
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def on_stop_recording():
152
- """Call from Gradio stop_recording so streamers shut down when user clicks Stop."""
153
- shutdown_streamers(from_stop_recording=True)
154
 
155
 
156
- def clear_ui():
157
- global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
158
- _ENHANCED_TRANSCRIPT = ""
159
- _RAW_TRANSCRIPT = ""
160
- return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
161
-
162
- def stop_online_backend():
163
- """Stop streamers and clear transcripts. Do not update the Audio component:
164
- toggling streaming=False then back to True can make the frontend lose the
165
- microphone (getUserMedia not re-called), so we leave it unchanged."""
166
- shutdown_streamers()
167
- session, enhanced_transcript, raw_transcript = clear_ui()
168
- return session, enhanced_transcript, raw_transcript, gr.update()
169
-
170
-
171
- def set_stt_streamer(model_name):
172
- StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
173
- global Streamer_enhanced, Streamer_raw, _streamer_generation
174
- # Shut down current streamers first so we don't leak
175
- if Streamer_enhanced is not None or Streamer_raw is not None:
176
- shutdown_streamers()
177
- # Create both before assigning so transcribe_stream never sees one new and one old
178
- new_enhanced = StreamerCls(
179
- fs_hz=DEFAULT_SR,
180
- stream_name="enhanced",
181
- on_update=_set_transcript_enhanced,
182
- )
183
- new_raw = StreamerCls(
184
- fs_hz=DEFAULT_SR,
185
- stream_name="raw",
186
- on_update=_set_transcript_raw,
187
- )
188
- _streamer_generation += 1
189
- Streamer_enhanced = new_enhanced
190
- Streamer_raw = new_raw
191
-
 
 
1
  import numpy as np
2
+ from constants import STREAMER_CLASSES
3
+ import gradio as gr
4
  from stt_streamers import DeepgramStreamer
5
+ from sdk import SDKWrapper, SDKParams
6
+ from ui import LED_DOT_BLACK, LED_DOT_GREEN, LED_DOT_OFF, LED_DOT_RED, LED_DOT_YELLOW
7
+
8
+ _ENHANCED_TRANSCRIPT = ""
9
+ _RAW_TRANSCRIPT = ""
10
+
11
+ SDK_STREAMING = SDKWrapper()
12
+ Streamer_enhanced = None
13
+ Streamer_raw = None
14
 
 
 
 
 
 
15
 
16
  def _set_transcript_enhanced(text: str) -> None:
 
17
  global _ENHANCED_TRANSCRIPT
18
  _ENHANCED_TRANSCRIPT = text
19
 
20
+
21
  def _set_transcript_raw(text: str) -> None:
 
22
  global _RAW_TRANSCRIPT
23
  _RAW_TRANSCRIPT = text
24
 
25
 
26
+ def clear_live_transcripts():
27
+ global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
28
+ _ENHANCED_TRANSCRIPT = ""
29
+ _RAW_TRANSCRIPT = ""
30
 
31
 
32
+ def render_system_status(status: str):
33
+ if status == "off":
34
+ return LED_DOT_OFF, "Off"
35
+ if status == "init":
36
+ return LED_DOT_YELLOW, "Initializing..."
37
+ if status == "ready":
38
+ return LED_DOT_GREEN, "Ready"
39
+ if status == "error":
40
+ return LED_DOT_RED, "Error. Please refresh the page."
41
+ raise ValueError(f"Invalid status: {status}")
42
+
43
+ def shutdown_streamers():
44
+ global Streamer_enhanced, Streamer_raw
45
+ try:
46
+ if Streamer_enhanced is not None:
47
+ Streamer_enhanced.shutdown()
48
+ if Streamer_raw is not None:
49
+ Streamer_raw.shutdown()
50
+ except Exception as e:
51
+ print(f"Error shutting down streamers: {e}")
52
+ finally:
53
+ Streamer_enhanced = None
54
+ Streamer_raw = None
55
 
56
+
57
+ def set_stt_streamer(sample_rate: int, stt_model: str):
58
+ global Streamer_enhanced, Streamer_raw
59
+ StreamerCls = STREAMER_CLASSES.get(stt_model, DeepgramStreamer)
60
+ try:
61
+ Streamer_enhanced = StreamerCls(
62
+ fs_hz=sample_rate,
63
+ stream_name="Enhanced",
64
+ on_update=_set_transcript_enhanced,
65
+ )
66
+ Streamer_raw = StreamerCls(
67
+ fs_hz=sample_rate,
68
+ stream_name="Raw",
69
+ on_update=_set_transcript_raw,
70
+ )
71
+ except Exception as e:
72
+ Streamer_enhanced = None
73
+ Streamer_raw = None
74
+ raise RuntimeError(f"Error initializing STT streamer '{stt_model}': {e}")
 
 
 
 
 
 
 
75
 
76
  def _to_float32_mono(y: np.ndarray) -> np.ndarray:
 
77
  y = np.asarray(y)
78
  if y.ndim > 1:
79
  y = y.mean(axis=1)
80
  if y.dtype == np.int16:
81
+ y = y.astype(np.float32) / 32768.0
82
  else:
83
  y = y.astype(np.float32)
84
+ return np.asarray(y, dtype=np.float32).flatten()
85
 
86
 
87
+ def _ensure_initialized(sr: int, streaming_sr, stt_model: str, enhancement_level: float):
88
+ streamer_cls = STREAMER_CLASSES[stt_model]
89
+ needs_init = (
90
+ streaming_sr is None
91
+ or streaming_sr != sr
92
+ or Streamer_enhanced is None
93
  or Streamer_raw is None
94
+ or not isinstance(Streamer_enhanced, streamer_cls)
95
+ or not isinstance(Streamer_raw, streamer_cls)
96
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
+ if not needs_init:
99
+ if SDK_STREAMING.enhancement_level != enhancement_level:
100
+ SDK_STREAMING.change_enhancement_level(enhancement_level)
101
+ return streaming_sr, *render_system_status("ready")
102
 
 
 
 
 
 
 
 
103
  try:
104
+ shutdown_streamers()
105
+ sdk_params = SDKParams(
106
+ sample_rate=sr,
107
+ enhancement_level=enhancement_level
108
+ )
109
+ SDK_STREAMING.init_processor(sdk_params)
110
+ set_stt_streamer(sr, stt_model)
111
+ return sr, *render_system_status("ready")
112
+ except Exception as e:
113
+ gr.Warning(f"Streaming process failed: {e}")
114
+ return None, *render_system_status("error")
115
+
116
+
117
+ def stream_step(audio_stream, streaming_sr, stt_model, enhancement_level, input_gain_db):
118
+ if audio_stream is None:
119
+ return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF
120
+
121
+ sr, chunk = audio_stream
122
+ if chunk is None:
123
+ return streaming_sr, *render_system_status("off"), _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF
124
+
125
+ enhancement_level_float = enhancement_level / 100.0
126
+
127
+ streaming_sr, system_led, system_text = _ensure_initialized(
128
+ sr=sr,
129
+ streaming_sr=streaming_sr,
130
+ stt_model=stt_model,
131
+ enhancement_level=enhancement_level_float,
132
+ )
133
+
134
+ try:
135
+ y = _to_float32_mono(chunk)
136
+
137
+ if input_gain_db and input_gain_db > 0:
138
+ gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
139
+ y = np.clip(y * gain_linear, -1.0, 1.0).astype(np.float32)
140
+
141
+ enhanced_chunk_16k, vad_detected = SDK_STREAMING.process_with_vad(y)
142
+ enhanced_chunk_16k = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
143
+
144
+ Streamer_raw.process_chunk(y)
145
+ Streamer_enhanced.process_chunk(enhanced_chunk_16k)
146
+
147
+ vad_led = LED_DOT_GREEN if vad_detected else LED_DOT_BLACK
148
+ return streaming_sr, system_led, system_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led
149
+
150
+ except Exception as e:
151
+ gr.Warning(f"Streaming process failed: {e}")
152
+ err_led, err_text = render_system_status("error")
153
+ return streaming_sr, err_led, err_text, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, LED_DOT_OFF
154
+
155
+
156
+ def on_start_recording():
157
+ clear_live_transcripts()
158
+ led, text = render_system_status("init")
159
+ return "", "", led, text
160
+
161
+
162
  def on_stop_recording():
163
+ led, text = render_system_status("off")
164
+ return led, led, text, None
165
 
166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
stt_streamers/deepgram_streamer.py CHANGED
@@ -15,7 +15,8 @@ class DeepgramStreamer:
15
  api_key = os.environ.get("DEEPGRAM_API_KEY")
16
  if not api_key:
17
  raise RuntimeError("Missing DEEPGRAM_API_KEY.")
18
-
 
19
  self.stream_name = stream_name
20
  self.api_name = "Deepgram V1 Nova-3"
21
  self.on_update = on_update
@@ -210,7 +211,6 @@ class DeepgramStreamer:
210
  self.thread.join(timeout=1.0)
211
  if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
212
  self.keepalive_thread.join(timeout=1.0)
213
- self.ws = None
214
  print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
215
 
216
 
 
15
  api_key = os.environ.get("DEEPGRAM_API_KEY")
16
  if not api_key:
17
  raise RuntimeError("Missing DEEPGRAM_API_KEY.")
18
+ if not fs_hz:
19
+ raise ValueError("Sample rate (fs_hz) must be specified.")
20
  self.stream_name = stream_name
21
  self.api_name = "Deepgram V1 Nova-3"
22
  self.on_update = on_update
 
211
  self.thread.join(timeout=1.0)
212
  if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
213
  self.keepalive_thread.join(timeout=1.0)
 
214
  print(f"DeepgramStreamer '{self.stream_name}' shutdown complete.")
215
 
216
 
stt_streamers/soniox_streamer.py CHANGED
@@ -13,7 +13,8 @@ class SonioxStreamer:
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
16
-
 
17
  self.stream_name = stream_name
18
  self.api_name = "Soniox RT"
19
  self.on_update = on_update
@@ -207,6 +208,4 @@ class SonioxStreamer:
207
  self.thread.join(timeout=1.0)
208
  if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
209
  self.keepalive_thread.join(timeout=1.0)
210
-
211
- self.ws = None
212
  print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
 
13
  api_key = os.environ.get("SONIOX_API_KEY")
14
  if not api_key:
15
  raise RuntimeError("Missing SONIOX_API_KEY.")
16
+ if not fs_hz:
17
+ raise ValueError("Sample rate (fs_hz) must be specified.")
18
  self.stream_name = stream_name
19
  self.api_name = "Soniox RT"
20
  self.on_update = on_update
 
208
  self.thread.join(timeout=1.0)
209
  if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
210
  self.keepalive_thread.join(timeout=1.0)
 
 
211
  print(f"SonioxStreamer '{self.stream_name}' shutdown complete.")
ui.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# HTML snippets for the coloured "LED" status indicators rendered in the UI.
# Each is a single <div> styled by its ``led-dot--*`` CSS class; the CSS is
# presumably defined elsewhere in the app's stylesheet — TODO confirm.

LED_DOT_RED = """
<div class="led-dot led-dot--red"></div>
"""

LED_DOT_BLACK = """
<div class="led-dot led-dot--black"></div>
"""

LED_DOT_GREEN = """
<div class="led-dot led-dot--green"></div>
"""

LED_DOT_OFF = """
<div class="led-dot led-dot--off"></div>
"""


LED_DOT_YELLOW = """
<div class="led-dot led-dot--yellow"></div>
"""
utils.py CHANGED
@@ -1,50 +1,99 @@
1
  from typing import Optional
2
- import numpy as np
3
- import librosa
4
- from PIL import Image
5
  import io
6
- import matplotlib.pyplot as plt
7
- from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
8
  import warnings
 
 
 
 
 
9
  import pyloudnorm as pyln
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
13
- """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
14
- passing float32 triggers an internal conversion and a warning."""
15
  x = np.asarray(x)
16
-
17
- # Remove extra dims like (1, n, 1) etc.
18
  x = np.squeeze(x)
19
 
20
- # If it's (channels, samples), transpose to (samples, channels)
21
  if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
22
  x = x.T
23
 
24
- # Ensure mono is (n_samples,)
25
  if x.ndim == 2 and x.shape[1] == 1:
26
  x = x[:, 0]
27
 
28
  x = x.astype(np.float32)
29
  x = np.clip(x, -1.0, 1.0)
30
- # Gradio Audio expects int16; convert here so Gradio doesn't convert and warn
31
  x = (x * 32767).astype(np.int16)
32
 
33
- return (sr, x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  def spec_image(
37
  audio_array: np.ndarray,
38
- sr: int = DEFAULT_SR,
39
  n_fft: int = 2048,
40
  hop_length: int = 512,
41
  n_mels: int = 128,
42
  fmax: Optional[float] = None,
 
43
  ) -> Image.Image:
44
- """
45
- Generate a mel-spectrogram image from an audio array.
46
- """
47
- y = audio_array.flatten() # Ensure it's 1D
48
  S = librosa.feature.melspectrogram(
49
  y=y,
50
  sr=sr,
@@ -53,20 +102,66 @@ def spec_image(
53
  n_mels=n_mels,
54
  fmax=fmax or sr // 2,
55
  )
56
- S_db = librosa.power_to_db(S, ref=np.max(S))
 
57
  fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
 
58
  img = librosa.display.specshow(
59
- S_db, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", ax=ax
 
 
 
 
 
 
60
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
62
  cbar.set_label("dB")
 
63
  ax.set_title("Mel-spectrogram")
64
  ax.set_xlabel("Time in s")
65
  ax.set_ylabel("Frequency in Hz")
 
66
  fig.tight_layout(pad=0.2)
 
67
  buf = io.BytesIO()
68
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
69
  plt.close(fig)
 
70
  buf.seek(0)
71
  return Image.open(buf).convert("RGB")
72
 
@@ -77,24 +172,24 @@ def compute_wer(reference: str, hypothesis: str) -> float:
77
  """
78
  ref_words = reference.split()
79
  hyp_words = hypothesis.split()
80
- d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint8)
 
 
81
  for i in range(len(ref_words) + 1):
82
  d[i][0] = i
83
  for j in range(len(hyp_words) + 1):
84
  d[0][j] = j
 
85
  for i in range(1, len(ref_words) + 1):
86
  for j in range(1, len(hyp_words) + 1):
87
- if ref_words[i - 1] == hyp_words[j - 1]:
88
- cost = 0
89
- else:
90
- cost = 1
91
  d[i][j] = min(
92
- d[i - 1][j] + 1, # Deletion
93
- d[i][j - 1] + 1, # Insertion
94
- d[i - 1][j - 1] + cost, # Substitution
95
  )
96
- wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
97
- return wer
98
 
99
 
100
  def measure_loudness(x: np.ndarray, sr: int) -> float:
@@ -102,7 +197,11 @@ def measure_loudness(x: np.ndarray, sr: int) -> float:
102
  return float(meter.integrated_loudness(x))
103
 
104
 
105
- def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:
 
 
 
 
106
  upsampled_sr = 192000
107
  x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
108
  true_peak = np.max(np.abs(x_upsampled))
@@ -116,7 +215,7 @@ def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP)
116
 
117
  x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
118
  x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
119
- return x_limited.astype("float32")
120
 
121
 
122
  def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
@@ -125,9 +224,9 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
125
  """
126
  try:
127
  current_lufs = measure_loudness(x, sr)
128
-
129
  if not np.isfinite(current_lufs):
130
- return x.astype("float32")
131
 
132
  gain_db = TARGET_LOUDNESS - current_lufs
133
  gain = 10 ** (gain_db / 20)
@@ -135,7 +234,7 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
135
  y = x * gain
136
  y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
137
 
138
- return y.astype("float32")
139
  except Exception as e:
140
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
141
- return x.astype("float32")
 
1
  from typing import Optional
 
 
 
2
  import io
 
 
3
  import warnings
4
+
5
+ import librosa
6
+ import librosa.display
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
  import pyloudnorm as pyln
10
+ from matplotlib.patches import Patch
11
+ from PIL import Image
12
+
13
+ from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
14
+
15
+
16
+ def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
17
+ subtitles = []
18
+ cur = 0.0
19
+
20
+ for start, end in vad_timestamps:
21
+ if start > cur:
22
+ subtitles.append(
23
+ {
24
+ "text": f"Voice Detection: {VAD_OFF}",
25
+ "timestamp": [cur, start],
26
+ }
27
+ )
28
+
29
+ subtitles.append(
30
+ {
31
+ "text": f"Voice Detection: {VAD_ON}",
32
+ "timestamp": [start, end],
33
+ }
34
+ )
35
+ cur = end
36
+
37
+ if cur < length:
38
+ subtitles.append(
39
+ {
40
+ "text": f"Voice Detection: {VAD_OFF}",
41
+ "timestamp": [cur, length],
42
+ }
43
+ )
44
+
45
+ return subtitles
46
 
47
 
48
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
    """Convert an audio array into the ``(sample_rate, int16)`` pair Gradio expects."""
    audio = np.squeeze(np.asarray(x))

    # Channel-first (channels, samples) -> channel-last (samples, channels).
    if audio.ndim == 2 and audio.shape[0] in (1, 2) and audio.shape[1] > audio.shape[0]:
        audio = audio.T

    # Collapse a single-channel 2-D array down to 1-D mono.
    if audio.ndim == 2 and audio.shape[1] == 1:
        audio = audio[:, 0]

    clipped = np.clip(audio.astype(np.float32), -1.0, 1.0)
    return sr, (clipped * 32767).astype(np.int16)
64
+
65
+
66
+ def _merge_vad_segments(
67
+ vad_timestamps: list[list[float]],
68
+ gap_tolerance: float = 0.05,
69
+ ) -> list[tuple[float, float]]:
70
+ if not vad_timestamps:
71
+ return []
72
+
73
+ segments = sorted((float(start), float(end)) for start, end in vad_timestamps)
74
+ merged: list[tuple[float, float]] = [segments[0]]
75
+
76
+ for start, end in segments[1:]:
77
+ last_start, last_end = merged[-1]
78
+ if start <= last_end + gap_tolerance:
79
+ merged[-1] = (last_start, max(last_end, end))
80
+ else:
81
+ merged.append((start, end))
82
+
83
+ return merged
84
 
85
 
86
  def spec_image(
87
  audio_array: np.ndarray,
88
+ sr: int,
89
  n_fft: int = 2048,
90
  hop_length: int = 512,
91
  n_mels: int = 128,
92
  fmax: Optional[float] = None,
93
+ vad_timestamps: Optional[list[list[float]]] = None,
94
  ) -> Image.Image:
95
+ y = np.asarray(audio_array, dtype=np.float32).flatten()
96
+
 
 
97
  S = librosa.feature.melspectrogram(
98
  y=y,
99
  sr=sr,
 
102
  n_mels=n_mels,
103
  fmax=fmax or sr // 2,
104
  )
105
+ S_db = librosa.power_to_db(S, ref=np.max)
106
+
107
  fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
108
+
109
  img = librosa.display.specshow(
110
+ S_db,
111
+ sr=sr,
112
+ hop_length=hop_length,
113
+ x_axis="time",
114
+ y_axis="mel",
115
+ cmap="magma",
116
+ ax=ax,
117
  )
118
+
119
+ if vad_timestamps:
120
+ vad_color = "#22C55E" # softer, cleaner green
121
+ merged_segments = _merge_vad_segments(vad_timestamps, gap_tolerance=0.05)
122
+
123
+ # Draw VAD bar as a fixed portion of the figure height (e.g., 4% of axes height)
124
+ bar_height_axes = 0.05 # 2% of axes height
125
+ bar_bottom_axes = 0.0 # 0% above the bottom
126
+
127
+ for start, end in merged_segments:
128
+ ax.fill_between(
129
+ [start, end],
130
+ [bar_bottom_axes, bar_bottom_axes],
131
+ [bar_bottom_axes + bar_height_axes, bar_bottom_axes + bar_height_axes],
132
+ color=vad_color,
133
+ alpha=0.95,
134
+ linewidth=0,
135
+ zorder=5,
136
+ transform=ax.get_xaxis_transform(),
137
+ )
138
+
139
+ vad_patch = Patch(
140
+ facecolor=vad_color,
141
+ edgecolor=vad_color,
142
+ label="Voice Activity",
143
+ )
144
+ ax.legend(
145
+ handles=[vad_patch],
146
+ loc="upper right",
147
+ fontsize=8,
148
+ frameon=True,
149
+ framealpha=0.9,
150
+ )
151
+
152
  cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
153
  cbar.set_label("dB")
154
+
155
  ax.set_title("Mel-spectrogram")
156
  ax.set_xlabel("Time in s")
157
  ax.set_ylabel("Frequency in Hz")
158
+
159
  fig.tight_layout(pad=0.2)
160
+
161
  buf = io.BytesIO()
162
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
163
  plt.close(fig)
164
+
165
  buf.seek(0)
166
  return Image.open(buf).convert("RGB")
167
 
 
172
  """
173
  ref_words = reference.split()
174
  hyp_words = hypothesis.split()
175
+
176
+ d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint16)
177
+
178
  for i in range(len(ref_words) + 1):
179
  d[i][0] = i
180
  for j in range(len(hyp_words) + 1):
181
  d[0][j] = j
182
+
183
  for i in range(1, len(ref_words) + 1):
184
  for j in range(1, len(hyp_words) + 1):
185
+ cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
 
 
 
186
  d[i][j] = min(
187
+ d[i - 1][j] + 1,
188
+ d[i][j - 1] + 1,
189
+ d[i - 1][j - 1] + cost,
190
  )
191
+
192
+ return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
193
 
194
 
195
  def measure_loudness(x: np.ndarray, sr: int) -> float:
 
197
  return float(meter.integrated_loudness(x))
198
 
199
 
200
+ def true_peak_limiter(
201
+ x: np.ndarray,
202
+ sr: int,
203
+ max_true_peak: float = TARGET_TP,
204
+ ) -> np.ndarray:
205
  upsampled_sr = 192000
206
  x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
207
  true_peak = np.max(np.abs(x_upsampled))
 
215
 
216
  x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
217
  x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
218
+ return x_limited.astype(np.float32)
219
 
220
 
221
def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
    """Normalize audio toward ``TARGET_LOUDNESS`` (LUFS), then true-peak limit.

    Falls back to the unmodified input (cast to float32) when loudness cannot
    be measured or normalization fails for any reason.
    """
    try:
        measured = measure_loudness(x, sr)

        # Silence or very short input can yield non-finite loudness; leave the
        # signal untouched in that case.
        if not np.isfinite(measured):
            return x.astype(np.float32)

        gain_db = TARGET_LOUDNESS - measured
        linear_gain = 10 ** (gain_db / 20)

        limited = true_peak_limiter(x * linear_gain, sr, max_true_peak=TARGET_TP)
        return limited.astype(np.float32)
    except Exception as e:
        warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
        return x.astype(np.float32)