mariesig committed on
Commit
4e945b9
·
1 Parent(s): be39c5b

refactor offline pipeline

Browse files
Files changed (4) hide show
  1. app.py +16 -119
  2. clean_up.py +2 -4
  3. offline_pipeline.py +201 -89
  4. utils.py +5 -0
app.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- import threading
3
- import time
4
  import gradio as gr
5
  from pathlib import Path
6
 
7
  from constants import STREAM_EVERY, APP_TMP_DIR
8
  from ui import LED_DOT_OFF
9
- from hf_dataset_utils import ALL_FILES, get_transcript
10
 
11
  from stream_pipeline import (
12
  on_start_recording,
@@ -17,114 +15,10 @@ from stream_pipeline import (
17
  from offline_pipeline import (
18
  load_file_from_dataset,
19
  load_local_file,
20
- run_offline_pipeline_streaming,
21
  )
22
- from utils import spec_image
23
  from clean_up import purge_tmp_directory, cleanup_previous_run
24
 
25
- def process_with_live_transcript(
26
- input_array,
27
- enhancement_level,
28
- sample_stem,
29
- stt_model,
30
- last_sample_stem,
31
- current_sample_rate,
32
- ):
33
- """Generator that runs the offline pipeline in real time (chunked): enhanced audio and
34
- both transcripts stream from the first chunk so playback and transcription start immediately."""
35
- progress_state = {}
36
- result_holder = {}
37
-
38
- def worker():
39
- try:
40
- result_holder["result"] = run_offline_pipeline_streaming(
41
- input_array,
42
- current_sample_rate,
43
- enhancement_level,
44
- sample_stem,
45
- stt_model,
46
- progress_state
47
- )
48
- except Exception as e:
49
- result_holder["error"] = e
50
-
51
- # 1) First yield: ground truth + input spectrogram only (no audio, no enhanced spec, no transcripts yet)
52
- _ = cleanup_previous_run(last_sample_stem)
53
- noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
54
- if input_array is not None:
55
- try:
56
- spec_image(input_array, sr = current_sample_rate).save(noisy_spec_path)
57
- except Exception:
58
- noisy_spec_path = None
59
- else:
60
- noisy_spec_path = None
61
- try:
62
- original_transcript = get_transcript(sample_stem)
63
- except Exception:
64
- original_transcript = "Unavailable"
65
-
66
- yield (
67
- gr.update(visible=True),
68
- None, # enhanced_audio: set only in final yield (smooth playback)
69
- gr.update(value=None), # enhanced_image: clear until step 3 (last)
70
- gr.update(value=noisy_spec_path), # noisy_image: input spectrogram (step 1)
71
- original_transcript,
72
- "",
73
- "",
74
- sample_stem,
75
- None,
76
- "",
77
- )
78
- # Let the UI render step 1 before we flood with polling updates
79
- time.sleep(0.2)
80
-
81
- thread = threading.Thread(target=worker, daemon=True)
82
- thread.start()
83
-
84
- poll_interval = 0.05
85
- while "result" not in result_holder and "error" not in result_holder:
86
- time.sleep(poll_interval)
87
- # 2) Realtime: stream transcripts only; audio set in final yield for smooth playback
88
- yield (
89
- gr.update(visible=True),
90
- gr.update(), # enhanced_audio: set only in final yield, then autoplay
91
- gr.update(), # enhanced_image: reveal only in step 3 (final yield)
92
- gr.update(), # noisy_image already set in step 1
93
- gr.update(), # original_transcript unchanged
94
- gr.update(value=progress_state.get("noisy", "")),
95
- gr.update(value=progress_state.get("enhanced", "")),
96
- gr.update(),
97
- gr.update(),
98
- gr.update(),
99
- )
100
-
101
- if "error" in result_holder:
102
- raise result_holder["error"]
103
-
104
- (
105
- enhanced_spec_path,
106
- enhanced_transcript,
107
- noisy_transcript_with_wer,
108
- enhanced_audio,
109
- vad_labels,
110
- last_stem,
111
- enhanced_array,
112
- precomputed_noisy,
113
- ) = result_holder["result"]
114
- # 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
115
- yield (
116
- gr.update(visible=True),
117
- gr.update(value=enhanced_audio, subtitles=vad_labels),
118
- enhanced_spec_path, # enhanced_image: show only now
119
- noisy_spec_path,
120
- original_transcript,
121
- noisy_transcript_with_wer,
122
- enhanced_transcript,
123
- last_stem,
124
- enhanced_array,
125
- precomputed_noisy,
126
- )
127
-
128
 
129
  _CSS_DIR = Path(__file__).resolve().parent / "assets"
130
  with gr.Blocks() as demo:
@@ -132,8 +26,6 @@ with gr.Blocks() as demo:
132
  last_sample_stem = gr.State("")
133
  input_array = gr.State()
134
  streaming_sr = gr.State(None)
135
- enhanced_array = gr.State()
136
- precomputed_noisy_transcript = gr.State("")
137
  current_sample_rate = gr.State(None)
138
 
139
  gr.HTML(
@@ -290,16 +182,17 @@ with gr.Blocks() as demo:
290
  gr.update(streaming=False, interactive=False),
291
  gr.update(visible=True),
292
  LED_DOT_OFF,
 
293
  "Off",
294
  )
295
 
296
  upload_tab.select(
297
  _on_not_streaming_tab,
298
- outputs=[audio_stream, enhance_btn, system_status_led, system_status_text],
299
  ).then(
300
  load_local_file,
301
  inputs=[audio_file_upload, normalize],
302
- outputs=[input_array, sample_stem, audio_preview, current_sample_rate],
303
  )
304
 
305
  dataset_tab.select(
@@ -341,7 +234,6 @@ with gr.Blocks() as demo:
341
  # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
342
  dataset_dropdown.change(
343
  lambda: gr.update(visible=False),
344
- inputs=None,
345
  outputs=results_card,
346
  ).then(
347
  load_file_from_dataset,
@@ -367,10 +259,15 @@ with gr.Blocks() as demo:
367
 
368
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
369
  enhance_btn.click(
370
- process_with_live_transcript,
371
- inputs=[input_array, enhancement_level, sample_stem, stt_model, last_sample_stem, current_sample_rate],
 
 
 
 
 
 
372
  outputs=[
373
- results_card,
374
  enhanced_audio,
375
  enhanced_image,
376
  noisy_image,
@@ -378,9 +275,9 @@ with gr.Blocks() as demo:
378
  noisy_transcript,
379
  enhanced_transcript,
380
  last_sample_stem,
381
- enhanced_array,
382
- precomputed_noisy_transcript,
383
- ],
384
  )
385
 
386
  os.makedirs(APP_TMP_DIR, exist_ok=True)
 
1
  import os
 
 
2
  import gradio as gr
3
  from pathlib import Path
4
 
5
  from constants import STREAM_EVERY, APP_TMP_DIR
6
  from ui import LED_DOT_OFF
7
+ from hf_dataset_utils import ALL_FILES
8
 
9
  from stream_pipeline import (
10
  on_start_recording,
 
15
  from offline_pipeline import (
16
  load_file_from_dataset,
17
  load_local_file,
18
+ run_offline_pipeline,
19
  )
 
20
  from clean_up import purge_tmp_directory, cleanup_previous_run
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  _CSS_DIR = Path(__file__).resolve().parent / "assets"
24
  with gr.Blocks() as demo:
 
26
  last_sample_stem = gr.State("")
27
  input_array = gr.State()
28
  streaming_sr = gr.State(None)
 
 
29
  current_sample_rate = gr.State(None)
30
 
31
  gr.HTML(
 
182
  gr.update(streaming=False, interactive=False),
183
  gr.update(visible=True),
184
  LED_DOT_OFF,
185
+ LED_DOT_OFF,
186
  "Off",
187
  )
188
 
189
  upload_tab.select(
190
  _on_not_streaming_tab,
191
+ outputs=[audio_stream, enhance_btn, vad_led, system_status_led, system_status_text],
192
  ).then(
193
  load_local_file,
194
  inputs=[audio_file_upload, normalize],
195
+ outputs=[input_array, sample_stem, vad_led,audio_preview, current_sample_rate],
196
  )
197
 
198
  dataset_tab.select(
 
234
  # Dataset dropdown selection triggers loading the audio file and hiding results until enhancement
235
  dataset_dropdown.change(
236
  lambda: gr.update(visible=False),
 
237
  outputs=results_card,
238
  ).then(
239
  load_file_from_dataset,
 
259
 
260
  # Enhancement button: run pipeline with live transcript progress (dataset + local file modes).
261
  enhance_btn.click(
262
+ cleanup_previous_run,
263
+ inputs=[last_sample_stem]
264
+ ).then(
265
+ lambda: gr.update(visible=True),
266
+ outputs=results_card,
267
+ ).then(
268
+ run_offline_pipeline,
269
+ inputs=[input_array, current_sample_rate, enhancement_level, stt_model, sample_stem],
270
  outputs=[
 
271
  enhanced_audio,
272
  enhanced_image,
273
  noisy_image,
 
275
  noisy_transcript,
276
  enhanced_transcript,
277
  last_sample_stem,
278
+ ]
279
+ ).failure(
280
+ lambda: gr.Warning("Enhancement failed. Please refresh page and make sure you have a stable connection.")
281
  )
282
 
283
  os.makedirs(APP_TMP_DIR, exist_ok=True)
clean_up.py CHANGED
@@ -93,11 +93,9 @@ def cleanup_previous_run(
93
  sample_stem: str,
94
  tmp_dir: str = APP_TMP_DIR,
95
  max_age_minutes: int = MINUTES_KEEP,
96
- ) -> tuple[None, None, str, str, str]:
97
- gr.Info("Processing started. This may take a moment. Please do not refresh or close the window.")
98
  try:
99
  remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
100
  except Exception as e:
101
  print(f"Failed to delete last run with id {sample_stem}: {e}")
102
- purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
103
- return None, None, "", "", ""
 
93
  sample_stem: str,
94
  tmp_dir: str = APP_TMP_DIR,
95
  max_age_minutes: int = MINUTES_KEEP,
96
+ ):
 
97
  try:
98
  remove_files_related_to(sample_stem, tmp_dir=tmp_dir)
99
  except Exception as e:
100
  print(f"Failed to delete last run with id {sample_stem}: {e}")
101
+ purge_tmp_directory(max_age_minutes=max_age_minutes, tmp_dir=tmp_dir)
 
offline_pipeline.py CHANGED
@@ -1,164 +1,276 @@
1
  import os
2
- from random import sample
3
 
4
  import gradio as gr
5
- import soundfile as sf
6
- from sdk import SDKWrapper, SDKParams
7
- from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs, get_vad_labels
8
- from hf_dataset_utils import get_audio, get_transcript
9
- from constants import APP_TMP_DIR, STREAMER_CLASSES
10
  import numpy as np
 
11
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  SDK_OFFLINE = SDKWrapper()
14
 
15
- def _close_stt_stream(streamer) -> None:
16
- """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  if hasattr(streamer, "close_stream"):
18
  streamer.close_stream()
19
  else:
20
  streamer.close()
21
 
22
- def run_offline_pipeline_streaming(
23
- sample: np.ndarray,
24
- sample_rate: int,
25
- enhancement_level: float,
26
- sample_id: str,
27
- stt_model: str,
28
- progress_state: dict,
29
- ) -> tuple[str, str, str, tuple[int, np.ndarray], list, str, np.ndarray, str]:
30
- """Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
31
- via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
32
- only at the end; the app plays it automatically when processing is complete.
33
- Returns same tuple as run_offline_pipeline_ordered."""
34
- if sample is None:
35
- raise ValueError("No audio to enhance. Please upload a file first.")
36
- sample = np.asarray(sample, dtype=np.float32).flatten()
37
  sdk_params = SDKParams(
38
  sample_rate=sample_rate,
39
- enhancement_level=enhancement_level/100.0,
40
- allow_variable_frames=False, # streaming pipeline uses fixed frames for simplicity
41
- num_channels=1,
42
  )
43
  SDK_OFFLINE.init_processor(sdk_params)
44
- chunk_size = SDK_OFFLINE.num_frames
45
 
46
- # Sync transcript callbacks so both boxes update together
47
- progress_state["noisy_pending"] = ""
48
- progress_state["enhanced_pending"] = ""
49
- progress_state["noisy_has_sent"] = False
50
- progress_state["enhanced_has_sent"] = False
51
 
52
- def _flush_both():
53
- if progress_state.get("noisy_has_sent") and progress_state.get("enhanced_has_sent"):
54
- progress_state["noisy"] = progress_state.get("noisy_pending", "")
55
- progress_state["enhanced"] = progress_state.get("enhanced_pending", "")
 
 
 
 
56
 
57
- def on_noisy(t: str):
58
- progress_state["noisy_pending"] = t
59
- progress_state["noisy_has_sent"] = True
60
- _flush_both()
61
 
62
- def on_enhanced(t: str):
63
- progress_state["enhanced_pending"] = t
64
- progress_state["enhanced_has_sent"] = True
65
- _flush_both()
66
 
67
- if stt_model not in STREAMER_CLASSES:
68
- raise ValueError(f"Unknown STT model: {stt_model}")
69
- StreamerClass = STREAMER_CLASSES[stt_model]
70
- streamer_noisy = StreamerClass(sample_rate, f"{sample_id}_noisy", on_update=on_noisy)
71
- streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
 
 
 
 
 
 
 
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  accumulated_enhanced: list[np.ndarray] = []
74
- vad_timestamps = []
75
  n = len(sample)
76
 
77
  for i in range(0, n, chunk_size):
78
  raw_chunk = sample[i : i + chunk_size]
79
- if raw_chunk.size < chunk_size:
 
 
80
  raw_chunk = np.pad(
81
  raw_chunk,
82
- (0, chunk_size - raw_chunk.size),
83
  mode="constant",
84
  constant_values=0.0,
85
  )
86
- raw_2d = raw_chunk.reshape(1, -1)
87
- enhanced_chunk = SDK_OFFLINE.process_chunk(raw_2d)
88
- enhanced_1d = np.asarray(enhanced_chunk).flatten()
 
89
  streamer_noisy.process_chunk(raw_chunk)
90
  streamer_enhanced.process_chunk(enhanced_1d)
91
  accumulated_enhanced.append(enhanced_1d)
92
-
 
 
 
 
 
 
 
93
  if SDK_OFFLINE.vad_context.is_speech_detected():
94
- start_in_sec = i/ sample_rate
95
- end_in_sec = (i + chunk_size) / sample_rate
96
  vad_timestamps.append([start_in_sec, end_in_sec])
97
-
98
- _close_stt_stream(streamer_noisy)
99
- _close_stt_stream(streamer_enhanced)
100
- streamer_noisy.finished_event.wait()
101
- streamer_enhanced.finished_event.wait()
102
-
103
- with streamer_noisy.lock:
104
- noisy_transcript = streamer_noisy.render_tokens(streamer_noisy.final_tokens, [])
105
- with streamer_enhanced.lock:
106
- enhanced_transcript = streamer_enhanced.render_tokens(streamer_enhanced.final_tokens, [])
107
 
108
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
109
- gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
 
 
 
 
 
 
 
 
 
 
110
 
111
- enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
112
- spec_image(enhanced_array, sr = sample_rate).save(enhanced_spec_path)
113
- progress_state["enhanced_spec_path"] = enhanced_spec_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- precomputed_noisy = noisy_transcript
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  try:
117
  original_transcript = get_transcript(sample_id)
118
- wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
119
- wer_noisy = compute_wer(original_transcript, noisy_transcript)
120
- enhanced_transcript += f" (WER: {wer_enhanced * 100:.2f}%)"
121
- noisy_transcript += f" (WER: {wer_noisy * 100:.2f}%)"
122
  except Exception:
123
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- vad_labels = get_vad_labels(vad_timestamps, length=len(sample)/sample_rate)
126
  return (
 
127
  enhanced_spec_path,
128
- enhanced_transcript,
 
129
  noisy_transcript,
130
- gradio_enhanced_audio,
131
- vad_labels,
132
  sample_id,
133
- enhanced_array,
134
- precomputed_noisy,
135
  )
136
 
 
137
  def load_local_file(
138
  sample_path: str,
139
  normalize: bool = True,
140
- ) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
141
  if not sample_path or not os.path.exists(sample_path):
142
  return None, "", None, None
 
143
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
144
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
145
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
 
146
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
147
  y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
148
  if normalize:
149
  y = normalize_lufs(y, sample_rate)
 
150
  gradio_audio = to_gradio_audio(y, sample_rate)
151
  return y, new_sample_stem, gradio_audio, sample_rate
152
 
153
- def load_file_from_dataset(sample_id: str) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
 
 
 
154
  if not sample_id:
155
  gr.Warning("Please select a sample from the dropdown.")
156
  return None, None, "", None
 
157
  new_sample_stem = sample_id
 
158
  try:
159
  y, sample_rate = get_audio(sample_id, prefix="mix")
160
  y_for_gradio = to_gradio_audio(y, sample_rate)
161
- except Exception as e: # Convert to 16-bit PCM for gradio audio component
162
- gr.Warning(f"{e}")
163
- raise e
 
164
  return y_for_gradio, y, new_sample_stem, sample_rate
 
1
  import os
2
+ from typing import Any
3
 
4
  import gradio as gr
 
 
 
 
 
5
  import numpy as np
6
+ import soundfile as sf
7
 
8
+ from constants import APP_TMP_DIR, STREAMER_CLASSES
9
+ from hf_dataset_utils import get_audio, get_transcript
10
+ from sdk import SDKParams, SDKWrapper
11
+ from utils import (
12
+ compute_wer,
13
+ get_vad_labels,
14
+ normalize_lufs,
15
+ spec_image,
16
+ to_gradio_audio,
17
+ )
18
 
19
  SDK_OFFLINE = SDKWrapper()
20
 
21
+
22
+ def _safe_progress(progress: gr.Progress, value: float, desc: str) -> None:
23
+ progress(max(0.0, min(1.0, value)), desc=desc)
24
+
25
+
26
+ def _empty_pipeline_result(sample_id: str) -> tuple[Any, str, str, str, str, str, str]:
27
+ return (
28
+ None,
29
+ "",
30
+ "",
31
+ "Unavailable",
32
+ "Unavailable",
33
+ "Unavailable",
34
+ sample_id,
35
+ )
36
+
37
+
38
+ def _finalize_stream_transcript(streamer) -> str:
39
  if hasattr(streamer, "close_stream"):
40
  streamer.close_stream()
41
  else:
42
  streamer.close()
43
 
44
+ streamer.finished_event.wait()
45
+ with streamer.lock:
46
+ return streamer.render_tokens(streamer.final_tokens, [])
47
+
48
+
49
+ def _init_sdk(sample_rate: int, enhancement_level: int) -> int:
 
 
 
 
 
 
 
 
 
50
  sdk_params = SDKParams(
51
  sample_rate=sample_rate,
52
+ enhancement_level=enhancement_level / 100.0,
 
 
53
  )
54
  SDK_OFFLINE.init_processor(sdk_params)
55
+ return SDK_OFFLINE.num_frames
56
 
 
 
 
 
 
57
 
58
+ def _init_streamers(
59
+ sample_rate: int,
60
+ stt_model: str,
61
+ sample_id: str,
62
+ progress: gr.Progress,
63
+ ):
64
+ if stt_model not in STREAMER_CLASSES:
65
+ raise ValueError(f"Unknown STT model: {stt_model}")
66
 
67
+ streamer_class = STREAMER_CLASSES[stt_model]
 
 
 
68
 
69
+ _safe_progress(progress, 0.12, f"Initializing {stt_model} stream 1/2...")
70
+ streamer_noisy = streamer_class(sample_rate, f"{sample_id}_noisy")
 
 
71
 
72
+ _safe_progress(progress, 0.18, f"Initializing {stt_model} stream 2/2...")
73
+ streamer_enhanced = streamer_class(sample_rate, f"{sample_id}_enhanced")
74
+
75
+ return streamer_noisy, streamer_enhanced
76
+
77
+
78
+ def _attach_wer(
79
+ original_transcript: str,
80
+ noisy_transcript: str,
81
+ enhanced_transcript: str,
82
+ ) -> tuple[str, str]:
83
+ wer_enhanced = compute_wer(original_transcript, enhanced_transcript)
84
+ wer_noisy = compute_wer(original_transcript, noisy_transcript)
85
 
86
+ noisy_transcript = f"{noisy_transcript} (WER: {wer_noisy * 100:.2f}%)"
87
+ enhanced_transcript = f"{enhanced_transcript} (WER: {wer_enhanced * 100:.2f}%)"
88
+ return noisy_transcript, enhanced_transcript
89
+
90
+
91
+ def _process_audio_chunks(
92
+ sample: np.ndarray,
93
+ sample_rate: int,
94
+ chunk_size: int,
95
+ streamer_noisy,
96
+ streamer_enhanced,
97
+ progress: gr.Progress,
98
+ ) -> tuple[np.ndarray, list[list[float]]]:
99
  accumulated_enhanced: list[np.ndarray] = []
100
+ vad_timestamps: list[list[float]] = []
101
  n = len(sample)
102
 
103
  for i in range(0, n, chunk_size):
104
  raw_chunk = sample[i : i + chunk_size]
105
+ original_chunk_len = raw_chunk.size
106
+
107
+ if original_chunk_len < chunk_size:
108
  raw_chunk = np.pad(
109
  raw_chunk,
110
+ (0, chunk_size - original_chunk_len),
111
  mode="constant",
112
  constant_values=0.0,
113
  )
114
+
115
+ enhanced_chunk = SDK_OFFLINE.process_chunk(raw_chunk.reshape(1, -1))
116
+ enhanced_1d = np.asarray(enhanced_chunk, dtype=np.float32).flatten()
117
+
118
  streamer_noisy.process_chunk(raw_chunk)
119
  streamer_enhanced.process_chunk(enhanced_1d)
120
  accumulated_enhanced.append(enhanced_1d)
121
+
122
+ loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
123
+ _safe_progress(
124
+ progress,
125
+ 0.20 + 0.60 * loop_progress,
126
+ "Enhancing audio...",
127
+ )
128
+
129
  if SDK_OFFLINE.vad_context.is_speech_detected():
130
+ start_in_sec = i / sample_rate
131
+ end_in_sec = min(i + original_chunk_len, n) / sample_rate
132
  vad_timestamps.append([start_in_sec, end_in_sec])
 
 
 
 
 
 
 
 
 
 
133
 
134
  enhanced_array = np.concatenate(accumulated_enhanced).astype(np.float32)
135
+ return enhanced_array, vad_timestamps
136
+
137
+
138
+ def _save_spectrograms(
139
+ sample: np.ndarray,
140
+ enhanced_array: np.ndarray,
141
+ sample_rate: int,
142
+ sample_id: str,
143
+ vad_timestamps: list[list[float]],
144
+ ) -> tuple[str, str]:
145
+ os.makedirs(APP_TMP_DIR, exist_ok=True)
146
 
147
+ enhanced_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_enhanced_spectrogram.png")
148
+ noisy_spec_path = os.path.join(APP_TMP_DIR, f"{sample_id}_noisy_spectrogram.png")
149
+
150
+ spec_image(enhanced_array, sr=sample_rate, vad_timestamps=vad_timestamps).save(enhanced_spec_path)
151
+ spec_image(sample, sr=sample_rate, vad_timestamps=vad_timestamps).save(noisy_spec_path)
152
+
153
+ return enhanced_spec_path, noisy_spec_path
154
+
155
+
156
+ def run_offline_pipeline(
157
+ sample: np.ndarray,
158
+ sample_rate: int,
159
+ enhancement_level: int,
160
+ stt_model: str,
161
+ sample_id: str,
162
+ progress=gr.Progress(),
163
+ ) -> tuple[Any, str, str, str, str, str, str]:
164
+ _safe_progress(progress, 0.00, "Starting...")
165
+
166
+ if sample is None or len(sample) == 0:
167
+ gr.Warning("No audio to enhance. Please upload a file first.")
168
+ return _empty_pipeline_result(sample_id)
169
+
170
+ sample = np.asarray(sample, dtype=np.float32).flatten()
171
+
172
+ _safe_progress(progress, 0.05, "Initializing enhancement...")
173
+ chunk_size = _init_sdk(sample_rate, enhancement_level)
174
 
175
+ try:
176
+ streamer_noisy, streamer_enhanced = _init_streamers(
177
+ sample_rate=sample_rate,
178
+ stt_model=stt_model,
179
+ sample_id=sample_id,
180
+ progress=progress,
181
+ )
182
+ except Exception as e:
183
+ raise RuntimeError(f"Failed to initialize STT streaming: {e}") from e
184
+
185
+ enhanced_array, vad_timestamps = _process_audio_chunks(
186
+ sample=sample,
187
+ sample_rate=sample_rate,
188
+ chunk_size=chunk_size,
189
+ streamer_noisy=streamer_noisy,
190
+ streamer_enhanced=streamer_enhanced,
191
+ progress=progress,
192
+ )
193
+
194
+ _safe_progress(progress, 0.82, "Finalizing transcripts...")
195
+ noisy_transcript = _finalize_stream_transcript(streamer_noisy)
196
+ _safe_progress(progress, 0.88, "Finalizing transcripts...")
197
+ enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
198
+
199
+ _safe_progress(progress, 0.94, "Loading reference transcript...")
200
  try:
201
  original_transcript = get_transcript(sample_id)
 
 
 
 
202
  except Exception:
203
+ original_transcript = "Unavailable"
204
+ if original_transcript != "Unavailable":
205
+ _safe_progress(progress, 0.96, "Computing WER...")
206
+ noisy_transcript, enhanced_transcript = _attach_wer(
207
+ original_transcript=original_transcript,
208
+ noisy_transcript=noisy_transcript,
209
+ enhanced_transcript=enhanced_transcript,
210
+ )
211
+
212
+ _safe_progress(progress, 0.99, "Generating outputs...")
213
+ gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
214
+ enhanced_spec_path, noisy_spec_path = _save_spectrograms(
215
+ sample=sample,
216
+ enhanced_array=enhanced_array,
217
+ sample_rate=sample_rate,
218
+ sample_id=sample_id,
219
+ vad_timestamps=vad_timestamps
220
+ )
221
+
222
+ vad_labels = get_vad_labels(
223
+ vad_timestamps,
224
+ length=len(sample) / sample_rate,
225
+ )
226
+
227
+ _safe_progress(progress, 1.00, "Done.")
228
 
 
229
  return (
230
+ gr.update(value=gradio_enhanced_audio, subtitles=vad_labels),
231
  enhanced_spec_path,
232
+ noisy_spec_path,
233
+ original_transcript,
234
  noisy_transcript,
235
+ enhanced_transcript,
 
236
  sample_id,
 
 
237
  )
238
 
239
+
240
  def load_local_file(
241
  sample_path: str,
242
  normalize: bool = True,
243
+ ) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
244
  if not sample_path or not os.path.exists(sample_path):
245
  return None, "", None, None
246
+
247
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
248
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
249
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
250
+
251
  new_sample_stem = os.path.splitext(os.path.basename(sample_path))[0]
252
  y, sample_rate = sf.read(sample_path, dtype="float32", always_2d=False)
253
  if normalize:
254
  y = normalize_lufs(y, sample_rate)
255
+
256
  gradio_audio = to_gradio_audio(y, sample_rate)
257
  return y, new_sample_stem, gradio_audio, sample_rate
258
 
259
+
260
+ def load_file_from_dataset(
261
+ sample_id: str,
262
+ ) -> tuple[tuple | None, np.ndarray | None, str, int | None]:
263
  if not sample_id:
264
  gr.Warning("Please select a sample from the dropdown.")
265
  return None, None, "", None
266
+
267
  new_sample_stem = sample_id
268
+
269
  try:
270
  y, sample_rate = get_audio(sample_id, prefix="mix")
271
  y_for_gradio = to_gradio_audio(y, sample_rate)
272
+ except Exception as e:
273
+ gr.Warning(str(e))
274
+ raise
275
+
276
  return y_for_gradio, y, new_sample_stem, sample_rate
utils.py CHANGED
@@ -63,6 +63,7 @@ def spec_image(
63
  hop_length: int = 512,
64
  n_mels: int = 128,
65
  fmax: Optional[float] = None,
 
66
  ) -> Image.Image:
67
  """
68
  Generate a mel-spectrogram image from an audio array.
@@ -89,6 +90,10 @@ def spec_image(
89
  fig.tight_layout(pad=0.2)
90
  buf = io.BytesIO()
91
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
 
 
 
 
92
  plt.close(fig)
93
  buf.seek(0)
94
  return Image.open(buf).convert("RGB")
 
63
  hop_length: int = 512,
64
  n_mels: int = 128,
65
  fmax: Optional[float] = None,
66
+ vad_timestamps: Optional[list[list[float]]] = None,
67
  ) -> Image.Image:
68
  """
69
  Generate a mel-spectrogram image from an audio array.
 
90
  fig.tight_layout(pad=0.2)
91
  buf = io.BytesIO()
92
  fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
93
+ if vad_timestamps:
94
+ for start, end in vad_timestamps:
95
+ ax.axvspan(start, end, color="red", alpha=0.3)
96
+
97
  plt.close(fig)
98
  buf.seek(0)
99
  return Image.open(buf).convert("RGB")