mariesig commited on
Commit
7274b79
·
1 Parent(s): ac7c448

add vad and sr to streaming

Browse files
Files changed (5) hide show
  1. app.py +22 -24
  2. offline_pipeline.py +24 -13
  3. sdk.py +45 -18
  4. stream_pipeline.py +44 -80
  5. utils.py +58 -3
app.py CHANGED
@@ -21,7 +21,7 @@ from offline_pipeline import (
21
  load_local_file,
22
  run_offline_pipeline_streaming,
23
  )
24
- from utils import spec_image
25
  from clean_up import purge_tmp_directory, cleanup_previous_run
26
 
27
  # Active light HTML: whole container is the light (gray = warming up, red = ready)
@@ -75,7 +75,7 @@ def process_with_live_transcript(
75
  noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
76
  if input_array is not None:
77
  try:
78
- spec_image(input_array).save(noisy_spec_path)
79
  except Exception:
80
  noisy_spec_path = None
81
  else:
@@ -128,15 +128,15 @@ def process_with_live_transcript(
128
  enhanced_transcript,
129
  noisy_transcript_with_wer,
130
  enhanced_audio,
 
131
  last_stem,
132
  enhanced_array,
133
  precomputed_noisy,
134
  ) = result_holder["result"]
135
-
136
  # 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
137
  yield (
138
  gr.update(visible=True),
139
- enhanced_audio,
140
  enhanced_spec_path, # enhanced_image: show only now
141
  noisy_spec_path,
142
  original_transcript,
@@ -186,7 +186,7 @@ with gr.Blocks() as demo:
186
  with gr.Tab("Stream audio in real time") as stream_tab:
187
  gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read())
188
  with gr.Group(elem_classes="panel"):
189
- stream_state = gr.State(None)
190
  on_stream_tab = gr.State(True) # True on load: stream tab is the default first tab
191
  warmup_elapsed = gr.State(0)
192
  input_gain_db = gr.Slider(
@@ -204,6 +204,7 @@ with gr.Blocks() as demo:
204
  with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
205
  active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
206
  with gr.Group(elem_classes="panel"):
 
207
  with gr.Column(scale=5, min_width=320):
208
  enhanced_text = gr.Textbox(
209
  label="Enhanced Transcribed Text", lines=6, autoscroll=False
@@ -298,7 +299,7 @@ with gr.Blocks() as demo:
298
  lambda: (
299
  gr.update(visible=False),
300
  gr.update(visible=False),
301
- gr.update(visible=True, interactive=True, streaming=True, sources=["microphone"]),
302
  True,
303
  0,
304
  ACTIVE_LIGHT_GRAY,
@@ -309,16 +310,15 @@ with gr.Blocks() as demo:
309
  )
310
 
311
  upload_tab.select(
312
- lambda: (
313
- *stop_online_backend(),
314
- False,
315
- ACTIVE_LIGHT_GRAY,
316
- ),
317
  inputs=None,
318
- outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
319
  ).then(
320
- lambda: gr.update(visible=True),
321
- outputs=enhance_btn,
 
 
 
322
  )
323
 
324
  dataset_tab.select(
@@ -328,7 +328,7 @@ with gr.Blocks() as demo:
328
  ACTIVE_LIGHT_GRAY,
329
  ),
330
  inputs=None,
331
- outputs=[stream_state, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
332
  ).then(
333
  lambda: gr.update(visible=True),
334
  outputs=enhance_btn,
@@ -343,10 +343,10 @@ with gr.Blocks() as demo:
343
  ).then(
344
  clear_ui,
345
  inputs=None,
346
- outputs=[stream_state, enhanced_text, raw_text],
347
  ).then(
348
  set_stt_streamer,
349
- inputs=stt_model,
350
  outputs=None,
351
  )
352
 
@@ -356,8 +356,8 @@ with gr.Blocks() as demo:
356
 
357
  audio_stream.stream(
358
  fn=transcribe_stream,
359
- inputs=[stream_state, audio_stream, enhancement_level, input_gain_db],
360
- outputs=[stream_state, enhanced_text, raw_text],
361
  stream_every=STREAM_EVERY,
362
  time_limit=60 * 2,
363
  concurrency_limit=1,
@@ -365,17 +365,15 @@ with gr.Blocks() as demo:
365
 
366
  audio_stream.stop_recording(
367
  on_stop_recording,
 
368
  )
369
 
370
  audio_stream.start_recording(
371
  clear_ui,
372
  inputs=None,
373
- outputs=[stream_state, enhanced_text, raw_text],
374
- ).then(
375
- fn=set_stt_streamer,
376
- inputs=stt_model,
377
- outputs=None,
378
  )
 
379
 
380
  # ------------------------------------------------------
381
  # OFFLINE PIPELINE EVENTS (DATASET + UPLOAD TABS)
 
21
  load_local_file,
22
  run_offline_pipeline_streaming,
23
  )
24
+ from utils import spec_image, render_vad_led
25
  from clean_up import purge_tmp_directory, cleanup_previous_run
26
 
27
  # Active light HTML: whole container is the light (gray = warming up, red = ready)
 
75
  noisy_spec_path = f"{APP_TMP_DIR}/{sample_stem}_noisy_spectrogram.png"
76
  if input_array is not None:
77
  try:
78
+ spec_image(input_array, sr = current_sample_rate).save(noisy_spec_path)
79
  except Exception:
80
  noisy_spec_path = None
81
  else:
 
128
  enhanced_transcript,
129
  noisy_transcript_with_wer,
130
  enhanced_audio,
131
+ vad_labels,
132
  last_stem,
133
  enhanced_array,
134
  precomputed_noisy,
135
  ) = result_holder["result"]
 
136
  # 3) Last: reveal enhanced spectrogram (and final audio/transcripts)
137
  yield (
138
  gr.update(visible=True),
139
+ gr.update(value=enhanced_audio, subtitles=vad_labels),
140
  enhanced_spec_path, # enhanced_image: show only now
141
  noisy_spec_path,
142
  original_transcript,
 
186
  with gr.Tab("Stream audio in real time") as stream_tab:
187
  gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read())
188
  with gr.Group(elem_classes="panel"):
189
+ streaming_sr = gr.State(16000)
190
  on_stream_tab = gr.State(True) # True on load: stream tab is the default first tab
191
  warmup_elapsed = gr.State(0)
192
  input_gain_db = gr.Slider(
 
204
  with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
205
  active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
206
  with gr.Group(elem_classes="panel"):
207
+ vad_led = gr.HTML(value=render_vad_led(False), label="Voice Activity")
208
  with gr.Column(scale=5, min_width=320):
209
  enhanced_text = gr.Textbox(
210
  label="Enhanced Transcribed Text", lines=6, autoscroll=False
 
299
  lambda: (
300
  gr.update(visible=False),
301
  gr.update(visible=False),
302
+ gr.update(sources=["microphone"], streaming=True, interactive= True),
303
  True,
304
  0,
305
  ACTIVE_LIGHT_GRAY,
 
310
  )
311
 
312
  upload_tab.select(
313
+ lambda: gr.update(streaming=False, interactive=False),
 
 
 
 
314
  inputs=None,
315
+ outputs=[audio_stream],
316
  ).then(
317
+ lambda: (
318
+ gr.update(visible=True),
319
+ *on_stop_recording(),
320
+ ),
321
+ outputs=[enhance_btn, vad_led, streaming_sr],
322
  )
323
 
324
  dataset_tab.select(
 
328
  ACTIVE_LIGHT_GRAY,
329
  ),
330
  inputs=None,
331
+ outputs=[streaming_sr, enhanced_text, raw_text, audio_stream, on_stream_tab, active_light],
332
  ).then(
333
  lambda: gr.update(visible=True),
334
  outputs=enhance_btn,
 
343
  ).then(
344
  clear_ui,
345
  inputs=None,
346
+ outputs=[enhanced_text, raw_text],
347
  ).then(
348
  set_stt_streamer,
349
+ inputs=[stt_model, streaming_sr],
350
  outputs=None,
351
  )
352
 
 
356
 
357
  audio_stream.stream(
358
  fn=transcribe_stream,
359
+ inputs=[streaming_sr, audio_stream, enhancement_level, input_gain_db, stt_model],
360
+ outputs=[streaming_sr, enhanced_text, raw_text, vad_led],
361
  stream_every=STREAM_EVERY,
362
  time_limit=60 * 2,
363
  concurrency_limit=1,
 
365
 
366
  audio_stream.stop_recording(
367
  on_stop_recording,
368
+ outputs=[vad_led, streaming_sr]
369
  )
370
 
371
  audio_stream.start_recording(
372
  clear_ui,
373
  inputs=None,
374
+ outputs=[enhanced_text, raw_text],
 
 
 
 
375
  )
376
+
377
 
378
  # ------------------------------------------------------
379
  # OFFLINE PIPELINE EVENTS (DATASET + UPLOAD TABS)
offline_pipeline.py CHANGED
@@ -3,13 +3,15 @@ from random import sample
3
 
4
  import gradio as gr
5
  import soundfile as sf
6
- from sdk import SDKWrapper
7
- from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs
8
  from hf_dataset_utils import get_audio, get_transcript
9
  from constants import APP_TMP_DIR, STREAMER_CLASSES
10
  import numpy as np
11
 
12
 
 
 
13
  def _close_stt_stream(streamer) -> None:
14
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
15
  if hasattr(streamer, "close_stream"):
@@ -24,7 +26,7 @@ def run_offline_pipeline_streaming(
24
  sample_id: str,
25
  stt_model: str,
26
  progress_state: dict,
27
- ) -> tuple[str, str, str, tuple[int, np.ndarray], str, np.ndarray, str]:
28
  """Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
29
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
30
  only at the end; the app plays it automatically when processing is complete.
@@ -32,13 +34,14 @@ def run_offline_pipeline_streaming(
32
  if sample is None:
33
  raise ValueError("No audio to enhance. Please upload a file first.")
34
  sample = np.asarray(sample, dtype=np.float32).flatten()
35
-
36
- sdk = SDKWrapper()
37
- sdk.init_processor(
38
  sample_rate=sample_rate,
39
- enhancement_level=float(enhancement_level) / 100.0,
 
 
40
  )
41
- chunk_size = sdk.num_frames
 
42
 
43
  # Sync transcript callbacks so both boxes update together
44
  progress_state["noisy_pending"] = ""
@@ -68,6 +71,7 @@ def run_offline_pipeline_streaming(
68
  streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
69
 
70
  accumulated_enhanced: list[np.ndarray] = []
 
71
  n = len(sample)
72
 
73
  for i in range(0, n, chunk_size):
@@ -80,12 +84,17 @@ def run_offline_pipeline_streaming(
80
  constant_values=0.0,
81
  )
82
  raw_2d = raw_chunk.reshape(1, -1)
83
- enhanced_chunk = sdk.process_chunk(raw_2d)
84
  enhanced_1d = np.asarray(enhanced_chunk).flatten()
85
  streamer_noisy.process_chunk(raw_chunk)
86
  streamer_enhanced.process_chunk(enhanced_1d)
87
  accumulated_enhanced.append(enhanced_1d)
88
-
 
 
 
 
 
89
  _close_stt_stream(streamer_noisy)
90
  _close_stt_stream(streamer_enhanced)
91
  streamer_noisy.finished_event.wait()
@@ -100,7 +109,7 @@ def run_offline_pipeline_streaming(
100
  gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
101
 
102
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
103
- spec_image(enhanced_array).save(enhanced_spec_path)
104
  progress_state["enhanced_spec_path"] = enhanced_spec_path
105
 
106
  precomputed_noisy = noisy_transcript
@@ -113,11 +122,13 @@ def run_offline_pipeline_streaming(
113
  except Exception:
114
  pass
115
 
 
116
  return (
117
  enhanced_spec_path,
118
  enhanced_transcript,
119
  noisy_transcript,
120
  gradio_enhanced_audio,
 
121
  sample_id,
122
  enhanced_array,
123
  precomputed_noisy,
@@ -126,9 +137,9 @@ def run_offline_pipeline_streaming(
126
  def load_local_file(
127
  sample_path: str,
128
  normalize: bool = True,
129
- ) -> tuple[np.ndarray | None, str, tuple | None, int]:
130
  if not sample_path or not os.path.exists(sample_path):
131
- return None, "", None
132
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
133
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
134
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
 
3
 
4
  import gradio as gr
5
  import soundfile as sf
6
+ from sdk import SDKWrapper, SDKParams
7
+ from utils import spec_image, compute_wer, to_gradio_audio, normalize_lufs, get_vad_labels
8
  from hf_dataset_utils import get_audio, get_transcript
9
  from constants import APP_TMP_DIR, STREAMER_CLASSES
10
  import numpy as np
11
 
12
 
13
+ SDK_OFFLINE = SDKWrapper()
14
+
15
  def _close_stt_stream(streamer) -> None:
16
  """Signal end-of-stream; streamer type may be Soniox (close_stream) or Deepgram (close)."""
17
  if hasattr(streamer, "close_stream"):
 
26
  sample_id: str,
27
  stt_model: str,
28
  progress_state: dict,
29
+ ) -> tuple[str, str, str, tuple[int, np.ndarray], list, str, np.ndarray, str]:
30
  """Run enhancement and both STTs in real time by processing in chunks. Transcripts stream
31
  via progress_state['noisy'] and progress_state['enhanced']. Enhanced audio is returned
32
  only at the end; the app plays it automatically when processing is complete.
 
34
  if sample is None:
35
  raise ValueError("No audio to enhance. Please upload a file first.")
36
  sample = np.asarray(sample, dtype=np.float32).flatten()
37
+ sdk_params = SDKParams(
 
 
38
  sample_rate=sample_rate,
39
+ enhancement_level=enhancement_level/100.0,
40
+ allow_variable_frames=False, # streaming pipeline uses fixed frames for simplicity
41
+ num_channels=1,
42
  )
43
+ SDK_OFFLINE.init_processor(sdk_params)
44
+ chunk_size = SDK_OFFLINE.num_frames
45
 
46
  # Sync transcript callbacks so both boxes update together
47
  progress_state["noisy_pending"] = ""
 
71
  streamer_enhanced = StreamerClass(sample_rate, f"{sample_id}_enhanced", on_update=on_enhanced)
72
 
73
  accumulated_enhanced: list[np.ndarray] = []
74
+ vad_timestamps = []
75
  n = len(sample)
76
 
77
  for i in range(0, n, chunk_size):
 
84
  constant_values=0.0,
85
  )
86
  raw_2d = raw_chunk.reshape(1, -1)
87
+ enhanced_chunk = SDK_OFFLINE.process_chunk(raw_2d)
88
  enhanced_1d = np.asarray(enhanced_chunk).flatten()
89
  streamer_noisy.process_chunk(raw_chunk)
90
  streamer_enhanced.process_chunk(enhanced_1d)
91
  accumulated_enhanced.append(enhanced_1d)
92
+
93
+ if SDK_OFFLINE.vad_context.is_speech_detected():
94
+ start_in_sec = i/ sample_rate
95
+ end_in_sec = (i + chunk_size) / sample_rate
96
+ vad_timestamps.append([start_in_sec, end_in_sec])
97
+
98
  _close_stt_stream(streamer_noisy)
99
  _close_stt_stream(streamer_enhanced)
100
  streamer_noisy.finished_event.wait()
 
109
  gradio_enhanced_audio = to_gradio_audio(enhanced_array, sample_rate)
110
 
111
  enhanced_spec_path = f"{APP_TMP_DIR}/{sample_id}_enhanced_spectrogram.png"
112
+ spec_image(enhanced_array, sr = sample_rate).save(enhanced_spec_path)
113
  progress_state["enhanced_spec_path"] = enhanced_spec_path
114
 
115
  precomputed_noisy = noisy_transcript
 
122
  except Exception:
123
  pass
124
 
125
+ vad_labels = get_vad_labels(vad_timestamps, length=len(sample)/sample_rate)
126
  return (
127
  enhanced_spec_path,
128
  enhanced_transcript,
129
  noisy_transcript,
130
  gradio_enhanced_audio,
131
+ vad_labels,
132
  sample_id,
133
  enhanced_array,
134
  precomputed_noisy,
 
137
  def load_local_file(
138
  sample_path: str,
139
  normalize: bool = True,
140
+ ) -> tuple[np.ndarray | None, str, tuple | None, int | None]:
141
  if not sample_path or not os.path.exists(sample_path):
142
+ return None, "", None, None
143
  if os.path.getsize(sample_path) > 5 * 1024 * 1024:
144
  gr.Warning("File size exceeds 5 MB limit. Please upload a smaller file.")
145
  raise ValueError("Uploaded file exceeds the 5 MB size limit.")
sdk.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import numpy as np
2
  from dotenv import load_dotenv
3
  import aic_sdk as aic
@@ -8,6 +10,24 @@ from constants import MODEL_ID
8
  load_dotenv()
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  class SDKWrapper:
12
  def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
13
  if os.getenv("AIC_SDK_KEY") is None:
@@ -16,25 +36,25 @@ class SDKWrapper:
16
  model_path = aic.Model.download(model_id, models_dir)
17
  self.model = aic.Model.from_file(model_path)
18
 
19
-
20
- def init_processor(self, sample_rate: int, enhancement_level: float, allow_variable_frames: bool = False, num_frames: int | None = None,num_channels: int = 1, sync: bool = True):
21
- self.processor_sample_rate = sample_rate
22
- processor_optimal_frames = self.model.get_optimal_num_frames(sample_rate)
23
- self.num_frames = num_frames if num_frames else processor_optimal_frames
24
- config = aic.ProcessorConfig(
25
- sample_rate=sample_rate,
26
- num_channels=num_channels,
27
  num_frames=self.num_frames,
28
- allow_variable_frames=allow_variable_frames,
29
  )
30
- if sync:
31
- processor = aic.Processor(self.model, self.sdk_key, config)
32
  else:
33
- processor = aic.ProcessorAsync(self.model, self.sdk_key, config)
34
- processor.get_processor_context().set_parameter(
35
- aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
36
  )
37
- self.processor = processor
 
38
 
39
  def change_enhancement_level(self, enhancement_level: float):
40
  if not hasattr(self, "processor"):
@@ -42,6 +62,7 @@ class SDKWrapper:
42
  self.processor.get_processor_context().set_parameter(
43
  aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
44
  )
 
45
 
46
  def _check_shape(self, audio: np.ndarray) -> np.ndarray:
47
  if len(audio.shape) == 1:
@@ -50,15 +71,17 @@ class SDKWrapper:
50
  raise ValueError("Expected audio with shape (n, frames)")
51
  return audio
52
 
53
- def process_sync(
54
  self,
55
  audio: np.ndarray,
56
- ) -> np.ndarray:
57
  """
58
  audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
59
  """
60
  audio = self._check_shape(audio)
61
  out = np.zeros_like(audio)
 
 
62
  chunk_size = self.num_frames
63
  n = audio.shape[1]
64
  for i in range(0, n, chunk_size):
@@ -72,7 +95,11 @@ class SDKWrapper:
72
  break
73
  enhanced = self.processor.process(chunk)
74
  out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
75
- return out
 
 
 
 
76
 
77
  def process_chunk(self, audio: np.ndarray) -> np.ndarray:
78
  audio = self._check_shape(audio)
 
1
+ from logging import config
2
+
3
  import numpy as np
4
  from dotenv import load_dotenv
5
  import aic_sdk as aic
 
10
  load_dotenv()
11
 
12
 
13
+ class SDKParams:
14
+ def __init__(
15
+ self,
16
+ sample_rate: int = 16000,
17
+ enhancement_level: float = 1.0,
18
+ allow_variable_frames: bool = False,
19
+ num_channels: int = 1,
20
+ sync: bool = True,
21
+ num_frames: int | None = None,
22
+
23
+ ):
24
+ self.sample_rate = sample_rate
25
+ self.enhancement_level = enhancement_level
26
+ self.allow_variable_frames = allow_variable_frames
27
+ self.num_channels = num_channels
28
+ self.sync = sync
29
+ self.num_frames = num_frames # to be set after processor init
30
+
31
  class SDKWrapper:
32
  def __init__(self, model_id: str = MODEL_ID, models_dir: str = "./models"):
33
  if os.getenv("AIC_SDK_KEY") is None:
 
36
  model_path = aic.Model.download(model_id, models_dir)
37
  self.model = aic.Model.from_file(model_path)
38
 
39
+ def init_processor(self, sdk_params: SDKParams):
40
+ optimal_frames = self.model.get_optimal_num_frames(sdk_params.sample_rate)
41
+ self.num_frames = sdk_params.num_frames if sdk_params.num_frames else optimal_frames
42
+ self.sample_rate = sdk_params.sample_rate
43
+ aic_config = aic.ProcessorConfig(
44
+ sample_rate=sdk_params.sample_rate,
45
+ num_channels=sdk_params.num_channels,
 
46
  num_frames=self.num_frames,
47
+ allow_variable_frames=sdk_params.allow_variable_frames,
48
  )
49
+ if sdk_params.sync:
50
+ self.processor = aic.Processor(self.model, self.sdk_key, aic_config)
51
  else:
52
+ self.processor = aic.ProcessorAsync(self.model, self.sdk_key, aic_config)
53
+ self.processor.get_processor_context().set_parameter(
54
+ aic.ProcessorParameter.EnhancementLevel, float(sdk_params.enhancement_level)
55
  )
56
+ self.enhancement_level = sdk_params.enhancement_level
57
+ self.vad_context = self.processor.get_vad_context()
58
 
59
  def change_enhancement_level(self, enhancement_level: float):
60
  if not hasattr(self, "processor"):
 
62
  self.processor.get_processor_context().set_parameter(
63
  aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
64
  )
65
+ self.enhancement_level = enhancement_level
66
 
67
  def _check_shape(self, audio: np.ndarray) -> np.ndarray:
68
  if len(audio.shape) == 1:
 
71
  raise ValueError("Expected audio with shape (n, frames)")
72
  return audio
73
 
74
+ def process_with_vad(
75
  self,
76
  audio: np.ndarray,
77
+ ) -> tuple[np.ndarray, bool]:
78
  """
79
  audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
80
  """
81
  audio = self._check_shape(audio)
82
  out = np.zeros_like(audio)
83
+ vad_per_sample = np.zeros_like(audio, dtype=bool)
84
+ vad_overall = False
85
  chunk_size = self.num_frames
86
  n = audio.shape[1]
87
  for i in range(0, n, chunk_size):
 
95
  break
96
  enhanced = self.processor.process(chunk)
97
  out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
98
+ if self.vad_context.is_speech_detected():
99
+ vad_per_sample[:, i : i + chunk_size] = True
100
+ if vad_per_sample.mean() > 0.5:
101
+ vad_overall = True
102
+ return out, vad_overall
103
 
104
  def process_chunk(self, audio: np.ndarray) -> np.ndarray:
105
  audio = self._check_shape(audio)
stream_pipeline.py CHANGED
@@ -1,10 +1,12 @@
 
1
  import gradio as gr
 
2
  import numpy as np
3
- import soxr
4
- from constants import DEFAULT_SR, STREAMER_CLASSES
5
  from stt_streamers import DeepgramStreamer
6
- from sdk import SDKWrapper
7
- from dataclasses import dataclass
 
8
 
9
  # ----------------------------
10
  # Global transcript store (UI pulls from this)
@@ -28,13 +30,8 @@ def get_live_transcripts() -> tuple[str, str]:
28
  return _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
29
 
30
 
31
- SDK = SDKWrapper()
32
- SDK.init_processor(
33
- sample_rate=DEFAULT_SR,
34
- enhancement_level=1.0,
35
- allow_variable_frames=True, # streaming chunks are variable-sized
36
- num_channels=1,
37
- )
38
 
39
  # Created on first start_recording (lazy) to avoid Soniox "No audio received" timeout at app load
40
  Streamer_enhanced = None
@@ -43,25 +40,16 @@ _streamer_generation = 0
43
  _last_stop_generation = 1 # so first stop doesn't skip (1 > 1 is False)
44
 
45
 
46
- @dataclass
47
- class EnhanceSession:
48
- pending: np.ndarray # 1D float32 @ processor sample rate
49
- sr: int
50
- num_frames: int
51
- @dataclass
52
- class StreamSession:
53
- # nur was du wirklich brauchst
54
- resampler: soxr.ResampleStream | None
55
- sr_in: int | None
56
- tail_16k: np.ndarray # ring buffer (z.B. letzte 10s)
57
- tail_max: int # max samples
58
-
59
- def _get_or_init_session(session: StreamSession | None, sr_in: int) -> StreamSession:
60
- if session is None or session.sr_in != sr_in:
61
- # ResampleStream ist für real-time processing gedacht
62
- resampler = None if sr_in == DEFAULT_SR else soxr.ResampleStream(sr_in, DEFAULT_SR, num_channels=1, dtype="float32")
63
- return StreamSession(resampler=resampler, sr_in=sr_in, tail_16k=np.zeros((0,), dtype=np.float32), tail_max=10 * DEFAULT_SR)
64
- return session
65
 
66
  def _to_float32_mono(y: np.ndarray) -> np.ndarray:
67
  # Gradio liefert int16 (oder (samples, channels)).
@@ -75,57 +63,32 @@ def _to_float32_mono(y: np.ndarray) -> np.ndarray:
75
  return y
76
 
77
 
78
- def transcribe_stream(session: StreamSession | None, new_chunk, enhancement_level, input_gain_db: float = 0.0):
79
- if (
80
- Streamer_enhanced is None
81
- or Streamer_raw is None
82
- or Streamer_enhanced.ws is None
83
- or Streamer_raw.ws is None
84
- ):
85
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
86
  if new_chunk is None or new_chunk[1] is None:
87
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
88
-
89
  sr, y = new_chunk
 
 
 
 
 
 
90
  y = _to_float32_mono(y)
91
- # Apply input gain: linear = 10^(dB/20), clip to avoid overflow
92
  if input_gain_db is not None and input_gain_db > 0:
93
  gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
94
  y = (y * gain_linear).astype(np.float32)
95
  y = np.clip(y, -1.0, 1.0)
96
-
97
- session = _get_or_init_session(session, sr)
98
- SDK.change_enhancement_level(float(enhancement_level) / 100.0)
99
- if session.resampler is not None:
100
- y_16k = session.resampler.resample_chunk(y)
101
- else:
102
- y_16k = y
103
-
104
- # Ensure 1D float32 for SDK and streamers (resample_chunk can return 0 samples or 2D)
105
- y_16k = np.asarray(y_16k, dtype=np.float32).flatten()
106
-
107
- # Ringbuffer (nicht unendlich konkatenieren)
108
- if y_16k.size > 0:
109
- tail = np.concatenate([session.tail_16k, y_16k])
110
- if tail.size > session.tail_max:
111
- tail = tail[-session.tail_max:]
112
- session.tail_16k = tail
113
-
114
- # Only send when we have samples (resample_chunk can return empty; SDK needs valid input)
115
- if y_16k.size == 0:
116
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
117
-
118
- # Parallel path: send raw to STT immediately, then enhance and send enhanced.
119
- # SDK requires fixed num_frames (AudioConfigMismatchError if we use process_chunk with variable size).
120
- Streamer_raw.process_chunk(y_16k)
121
- enhanced_chunk_16k = SDK.process_sync(y_16k)
122
  out_1d = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
123
- # Always send something to enhanced so Soniox doesn't close with "No audio received"
124
- if out_1d.size > 0:
 
 
 
125
  Streamer_enhanced.process_chunk(out_1d)
126
- else:
127
- Streamer_enhanced.process_chunk(np.zeros(160, dtype=np.float32))
128
- return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
129
 
130
 
131
  def shutdown_streamers(from_stop_recording: bool = False):
@@ -149,26 +112,26 @@ def shutdown_streamers(from_stop_recording: bool = False):
149
  _last_stop_generation = gen
150
 
151
  def on_stop_recording():
152
- """Call from Gradio stop_recording so streamers shut down when user clicks Stop."""
153
- shutdown_streamers(from_stop_recording=True)
154
 
155
 
156
  def clear_ui():
157
  global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
158
  _ENHANCED_TRANSCRIPT = ""
159
  _RAW_TRANSCRIPT = ""
160
- return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
161
 
162
  def stop_online_backend():
163
  """Stop streamers and clear transcripts. Do not update the Audio component:
164
  toggling streaming=False then back to True can make the frontend lose the
165
  microphone (getUserMedia not re-called), so we leave it unchanged."""
166
  shutdown_streamers()
167
- session, enhanced_transcript, raw_transcript = clear_ui()
168
- return session, enhanced_transcript, raw_transcript, gr.update()
169
 
170
 
171
- def set_stt_streamer(model_name):
172
  StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
173
  global Streamer_enhanced, Streamer_raw, _streamer_generation
174
  # Shut down current streamers first so we don't leak
@@ -176,16 +139,17 @@ def set_stt_streamer(model_name):
176
  shutdown_streamers()
177
  # Create both before assigning so transcribe_stream never sees one new and one old
178
  new_enhanced = StreamerCls(
179
- fs_hz=DEFAULT_SR,
180
  stream_name="enhanced",
181
  on_update=_set_transcript_enhanced,
182
  )
183
  new_raw = StreamerCls(
184
- fs_hz=DEFAULT_SR,
185
  stream_name="raw",
186
  on_update=_set_transcript_raw,
187
  )
188
  _streamer_generation += 1
189
  Streamer_enhanced = new_enhanced
190
  Streamer_raw = new_raw
 
191
 
 
1
+ from datasets import streaming
2
  import gradio as gr
3
+ from httpx import stream
4
  import numpy as np
5
+ from constants import STREAMER_CLASSES
 
6
  from stt_streamers import DeepgramStreamer
7
+ from sdk import SDKWrapper, SDKParams
8
+ from typing import Any
9
+ from utils import render_vad_led
10
 
11
  # ----------------------------
12
  # Global transcript store (UI pulls from this)
 
30
  return _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
31
 
32
 
33
+ SDK_STREAMING = SDKWrapper()
34
+
 
 
 
 
 
35
 
36
  # Created on first start_recording (lazy) to avoid Soniox "No audio received" timeout at app load
37
  Streamer_enhanced = None
 
40
  _last_stop_generation = 1 # so first stop doesn't skip (1 > 1 is False)
41
 
42
 
43
+ def init_streaming_sdk(sample_rate: int, enhancement_level: float):
44
+ """Initialize SDK processor and STT streamers. Call on first start_recording to avoid Soniox timeout at app load."""
45
+ sdk_params = SDKParams(
46
+ sample_rate=sample_rate,
47
+ enhancement_level=enhancement_level,
48
+ allow_variable_frames=False,
49
+ num_channels=1,
50
+ sync=True,
51
+ )
52
+ SDK_STREAMING.init_processor(sdk_params)
 
 
 
 
 
 
 
 
 
53
 
54
  def _to_float32_mono(y: np.ndarray) -> np.ndarray:
55
  # Gradio liefert int16 (oder (samples, channels)).
 
63
  return y
64
 
65
 
66
+ def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
67
+ print("Transcribing")
 
 
 
 
 
 
68
  if new_chunk is None or new_chunk[1] is None:
69
+ return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(False) # No audio, so no VAD; return LED off
 
70
  sr, y = new_chunk
71
+ if current_sr != sr:
72
+ init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
73
+ set_stt_streamer(stt_streamer, sr)
74
+ current_sr = sr
75
+ if SDK_STREAMING.enhancement_level != enhancement_level:
76
+ SDK_STREAMING.change_enhancement_level(enhancement_level/100.0)
77
  y = _to_float32_mono(y)
 
78
  if input_gain_db is not None and input_gain_db > 0:
79
  gain_linear = np.float32(10.0 ** (float(input_gain_db) / 20.0))
80
  y = (y * gain_linear).astype(np.float32)
81
  y = np.clip(y, -1.0, 1.0)
82
+ y = np.asarray(y, dtype=np.float32).flatten()
83
+ enhanced_chunk_16k, vad_detected = SDK_STREAMING.process_with_vad(y)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  out_1d = np.asarray(enhanced_chunk_16k, dtype=np.float32).flatten()
85
+ if (
86
+ Streamer_enhanced is not None
87
+ and Streamer_raw is not None
88
+ ):
89
+ Streamer_raw.process_chunk(y)
90
  Streamer_enhanced.process_chunk(out_1d)
91
+ return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(vad_detected)
 
 
92
 
93
 
94
  def shutdown_streamers(from_stop_recording: bool = False):
 
112
  _last_stop_generation = gen
113
 
114
  def on_stop_recording():
115
+ shutdown_streamers()
116
+ return render_vad_led(False), None
117
 
118
 
119
  def clear_ui():
120
  global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
121
  _ENHANCED_TRANSCRIPT = ""
122
  _RAW_TRANSCRIPT = ""
123
+ return _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
124
 
125
  def stop_online_backend():
126
  """Stop streamers and clear transcripts. Do not update the Audio component:
127
  toggling streaming=False then back to True can make the frontend lose the
128
  microphone (getUserMedia not re-called), so we leave it unchanged."""
129
  shutdown_streamers()
130
+ enhanced_transcript, raw_transcript = clear_ui()
131
+ return None, enhanced_transcript, raw_transcript, gr.update(streaming=False, interactive=False)
132
 
133
 
134
+ def set_stt_streamer(model_name, fs_hz):
135
  StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
136
  global Streamer_enhanced, Streamer_raw, _streamer_generation
137
  # Shut down current streamers first so we don't leak
 
139
  shutdown_streamers()
140
  # Create both before assigning so transcribe_stream never sees one new and one old
141
  new_enhanced = StreamerCls(
142
+ fs_hz=fs_hz,
143
  stream_name="enhanced",
144
  on_update=_set_transcript_enhanced,
145
  )
146
  new_raw = StreamerCls(
147
+ fs_hz=fs_hz,
148
  stream_name="raw",
149
  on_update=_set_transcript_raw,
150
  )
151
  _streamer_generation += 1
152
  Streamer_enhanced = new_enhanced
153
  Streamer_raw = new_raw
154
+
155
 
utils.py CHANGED
@@ -4,10 +4,63 @@ import librosa
4
  from PIL import Image
5
  import io
6
  import matplotlib.pyplot as plt
7
- from constants import DEFAULT_SR, TARGET_LOUDNESS, TARGET_TP
8
- import warnings
9
  import pyloudnorm as pyln
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
13
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
@@ -35,7 +88,7 @@ def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
35
 
36
  def spec_image(
37
  audio_array: np.ndarray,
38
- sr: int = DEFAULT_SR,
39
  n_fft: int = 2048,
40
  hop_length: int = 512,
41
  n_mels: int = 128,
@@ -139,3 +192,5 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
139
  except Exception as e:
140
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
141
  return x.astype("float32")
 
 
 
4
  from PIL import Image
5
  import io
6
  import matplotlib.pyplot as plt
7
+ from constants import TARGET_LOUDNESS, TARGET_TP
 
8
  import pyloudnorm as pyln
9
 
10
+ VAD_ON_HTML = """
11
+ <div style="display:flex; align-items:center; gap:10px;">
12
+ <div style="
13
+ width:25px;
14
+ height:25px;
15
+ border-radius:9999px;
16
+ background:#22c55e;
17
+ box-shadow:0 0 16px rgba(34,197,94,0.9);
18
+ border:1px solid #666;
19
+ "></div>
20
+ </div>
21
+ """
22
+
23
+ VAD_OFF_HTML = """
24
+ <div style="display:flex; align-items:center; gap:10px;">
25
+ <div style="
26
+ width:25px;
27
+ height:25px;
28
+ border-radius:9999px;
29
+ background:#3f3f46;
30
+ box-shadow:none;
31
+ border:1px solid #666;
32
+ "></div>
33
+ </div>
34
+ """
35
+
36
+ SUB_ON = "🟢"
37
+ SUB_OFF = "⚫"
38
+
39
+ def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
40
+ subtitles = []
41
+ cur = 0.0
42
+ for start, end in vad_timestamps:
43
+ if start > cur:
44
+ subtitles.append({
45
+ "text": f"Voice Detection: {SUB_OFF}",
46
+ "timestamp": [cur, start]
47
+ })
48
+
49
+ subtitles.append({
50
+ "text": f"Voice Detection: {SUB_ON}",
51
+ "timestamp": [start, end]
52
+ })
53
+
54
+ cur = end
55
+ if cur < length:
56
+ subtitles.append({
57
+ "text": f"Voice Detection: {SUB_OFF}",
58
+ "timestamp": [cur, length]
59
+ })
60
+ return subtitles
61
+
62
+ def render_vad_led(is_speech: bool) -> str:
63
+ return VAD_ON_HTML if is_speech else VAD_OFF_HTML
64
 
65
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
66
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
 
88
 
89
  def spec_image(
90
  audio_array: np.ndarray,
91
+ sr: int,
92
  n_fft: int = 2048,
93
  hop_length: int = 512,
94
  n_mels: int = 128,
 
192
  except Exception as e:
193
  warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
194
  return x.astype("float32")
195
+
196
+