mariesig committed on
Commit
06fe429
·
1 Parent(s): 7274b79

move VAD constants to separate file

Browse files
Files changed (5) hide show
  1. app.py +6 -18
  2. constants.py +5 -0
  3. stream_pipeline.py +5 -5
  4. ui.py +35 -0
  5. utils.py +5 -35
app.py CHANGED
@@ -5,6 +5,7 @@ import gradio as gr
5
  from pathlib import Path
6
 
7
  from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
 
8
  from hf_dataset_utils import ALL_FILES, get_transcript
9
 
10
  from stream_pipeline import (
@@ -21,18 +22,9 @@ from offline_pipeline import (
21
  load_local_file,
22
  run_offline_pipeline_streaming,
23
  )
24
- from utils import spec_image, render_vad_led
25
  from clean_up import purge_tmp_directory, cleanup_previous_run
26
 
27
- # Active light HTML: whole container is the light (gray = warming up, red = ready)
28
- ACTIVE_LIGHT_GRAY = (
29
- '<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
30
- '<span class="active-light__label">Warming up...</span></div>'
31
- )
32
- ACTIVE_LIGHT_RED = (
33
- '<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
34
- '<span class="active-light__label">Ready!</span></div>'
35
- )
36
  WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
37
 
38
 
@@ -196,15 +188,11 @@ with gr.Blocks() as demo:
196
  value=0,
197
  label="Input gain (dB)",
198
  )
199
- with gr.Row(elem_classes="stream-row"):
200
- with gr.Column(scale=4, min_width=200):
201
- audio_stream = gr.Audio(
202
- sources=["microphone"], streaming=True, elem_id="audio_stream"
203
- )
204
- with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
205
- active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
206
  with gr.Group(elem_classes="panel"):
207
- vad_led = gr.HTML(value=render_vad_led(False), label="Voice Activity")
 
 
208
  with gr.Column(scale=5, min_width=320):
209
  enhanced_text = gr.Textbox(
210
  label="Enhanced Transcribed Text", lines=6, autoscroll=False
 
5
  from pathlib import Path
6
 
7
  from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
8
+ from ui import ACTIVE_LIGHT_GRAY, ACTIVE_LIGHT_RED, VAD_OFF_HTML
9
  from hf_dataset_utils import ALL_FILES, get_transcript
10
 
11
  from stream_pipeline import (
 
22
  load_local_file,
23
  run_offline_pipeline_streaming,
24
  )
25
+ from utils import spec_image
26
  from clean_up import purge_tmp_directory, cleanup_previous_run
27
 
 
 
 
 
 
 
 
 
 
28
  WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
29
 
30
 
 
188
  value=0,
189
  label="Input gain (dB)",
190
  )
191
+ audio_stream = gr.Audio(sources=["microphone"], streaming=True, elem_id="audio_stream")
 
 
 
 
 
 
192
  with gr.Group(elem_classes="panel"):
193
+ with gr.Row(scale=1, elem_classes="status-row"):
194
+ active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY, label="System Status", show_label=True)
195
+ vad_led = gr.HTML(value=VAD_OFF_HTML, label="Voice Activity Detection", show_label=True)
196
  with gr.Column(scale=5, min_width=320):
197
  enhanced_text = gr.Textbox(
198
  label="Enhanced Transcribed Text", lines=6, autoscroll=False
constants.py CHANGED
@@ -30,3 +30,8 @@ STREAMER_CLASSES: Final = {
30
  "Deepgram Nova-3 RT": DeepgramStreamer,
31
  "Soniox STT-RT v3": SonioxStreamer,
32
  }
 
 
 
 
 
 
30
  "Deepgram Nova-3 RT": DeepgramStreamer,
31
  "Soniox STT-RT v3": SonioxStreamer,
32
  }
33
+
34
+ VAD_ON: Final = "🟢"
35
+ VAD_OFF: Final = "⚫"
36
+
37
+
stream_pipeline.py CHANGED
@@ -6,8 +6,7 @@ from constants import STREAMER_CLASSES
6
  from stt_streamers import DeepgramStreamer
7
  from sdk import SDKWrapper, SDKParams
8
  from typing import Any
9
- from utils import render_vad_led
10
-
11
  # ----------------------------
12
  # Global transcript store (UI pulls from this)
13
  # ----------------------------
@@ -66,7 +65,7 @@ def _to_float32_mono(y: np.ndarray) -> np.ndarray:
66
  def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
67
  print("Transcribing")
68
  if new_chunk is None or new_chunk[1] is None:
69
- return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(False) # No audio, so no VAD; return LED off
70
  sr, y = new_chunk
71
  if current_sr != sr:
72
  init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
@@ -88,7 +87,8 @@ def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, inpu
88
  ):
89
  Streamer_raw.process_chunk(y)
90
  Streamer_enhanced.process_chunk(out_1d)
91
- return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(vad_detected)
 
92
 
93
 
94
  def shutdown_streamers(from_stop_recording: bool = False):
@@ -113,7 +113,7 @@ def shutdown_streamers(from_stop_recording: bool = False):
113
 
114
  def on_stop_recording():
115
  shutdown_streamers()
116
- return render_vad_led(False), None
117
 
118
 
119
  def clear_ui():
 
6
  from stt_streamers import DeepgramStreamer
7
  from sdk import SDKWrapper, SDKParams
8
  from typing import Any
9
+ from ui import VAD_OFF_HTML, VAD_ON_HTML
 
10
  # ----------------------------
11
  # Global transcript store (UI pulls from this)
12
  # ----------------------------
 
65
  def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
66
  print("Transcribing")
67
  if new_chunk is None or new_chunk[1] is None:
68
+ return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, VAD_OFF_HTML
69
  sr, y = new_chunk
70
  if current_sr != sr:
71
  init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
 
87
  ):
88
  Streamer_raw.process_chunk(y)
89
  Streamer_enhanced.process_chunk(out_1d)
90
+ vad_led = VAD_ON_HTML if vad_detected else VAD_OFF_HTML
91
+ return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led
92
 
93
 
94
  def shutdown_streamers(from_stop_recording: bool = False):
 
113
 
114
  def on_stop_recording():
115
  shutdown_streamers()
116
+ return VAD_OFF_HTML, None
117
 
118
 
119
  def clear_ui():
ui.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ACTIVE_LIGHT_GRAY = (
2
+ '<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
3
+ '<span class="active-light__label">Warming up...</span></div>'
4
+ )
5
+ ACTIVE_LIGHT_RED = (
6
+ '<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
7
+ '<span class="active-light__label">Ready!</span></div>'
8
+ )
9
+
10
+
11
+ VAD_ON_HTML = """
12
+ <div style="display:flex; align-items:center; gap:10px;">
13
+ <div style="
14
+ width:25px;
15
+ height:25px;
16
+ border-radius:9999px;
17
+ background:#22c55e;
18
+ box-shadow:0 0 16px rgba(34,197,94,0.9);
19
+ border:1px solid #666;
20
+ "></div>
21
+ </div>
22
+ """
23
+
24
+ VAD_OFF_HTML = """
25
+ <div style="display:flex; align-items:center; gap:10px;">
26
+ <div style="
27
+ width:25px;
28
+ height:25px;
29
+ border-radius:9999px;
30
+ background:#3f3f46;
31
+ box-shadow:none;
32
+ border:1px solid #666;
33
+ "></div>
34
+ </div>
35
+ """
utils.py CHANGED
@@ -4,37 +4,9 @@ import librosa
4
  from PIL import Image
5
  import io
6
  import matplotlib.pyplot as plt
7
- from constants import TARGET_LOUDNESS, TARGET_TP
8
  import pyloudnorm as pyln
9
-
10
- VAD_ON_HTML = """
11
- <div style="display:flex; align-items:center; gap:10px;">
12
- <div style="
13
- width:25px;
14
- height:25px;
15
- border-radius:9999px;
16
- background:#22c55e;
17
- box-shadow:0 0 16px rgba(34,197,94,0.9);
18
- border:1px solid #666;
19
- "></div>
20
- </div>
21
- """
22
-
23
- VAD_OFF_HTML = """
24
- <div style="display:flex; align-items:center; gap:10px;">
25
- <div style="
26
- width:25px;
27
- height:25px;
28
- border-radius:9999px;
29
- background:#3f3f46;
30
- box-shadow:none;
31
- border:1px solid #666;
32
- "></div>
33
- </div>
34
- """
35
-
36
- SUB_ON = "🟢"
37
- SUB_OFF = "⚫"
38
 
39
  def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
40
  subtitles = []
@@ -42,25 +14,23 @@ def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dic
42
  for start, end in vad_timestamps:
43
  if start > cur:
44
  subtitles.append({
45
- "text": f"Voice Detection: {SUB_OFF}",
46
  "timestamp": [cur, start]
47
  })
48
 
49
  subtitles.append({
50
- "text": f"Voice Detection: {SUB_ON}",
51
  "timestamp": [start, end]
52
  })
53
 
54
  cur = end
55
  if cur < length:
56
  subtitles.append({
57
- "text": f"Voice Detection: {SUB_OFF}",
58
  "timestamp": [cur, length]
59
  })
60
  return subtitles
61
 
62
- def render_vad_led(is_speech: bool) -> str:
63
- return VAD_ON_HTML if is_speech else VAD_OFF_HTML
64
 
65
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
66
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
 
4
  from PIL import Image
5
  import io
6
  import matplotlib.pyplot as plt
7
+ from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
8
  import pyloudnorm as pyln
9
+ import warnings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
12
  subtitles = []
 
14
  for start, end in vad_timestamps:
15
  if start > cur:
16
  subtitles.append({
17
+ "text": f"Voice Detection: {VAD_OFF}",
18
  "timestamp": [cur, start]
19
  })
20
 
21
  subtitles.append({
22
+ "text": f"Voice Detection: {VAD_ON}",
23
  "timestamp": [start, end]
24
  })
25
 
26
  cur = end
27
  if cur < length:
28
  subtitles.append({
29
+ "text": f"Voice Detection: {VAD_OFF}",
30
  "timestamp": [cur, length]
31
  })
32
  return subtitles
33
 
 
 
34
 
35
  def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
36
  """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;