Spaces:

ai-coustics
/

VoiceFocus

Running on CPU Upgrade

App Files Files Community

mariesig committed on Mar 17

Commit

06fe429

1 Parent(s): 7274b79

move VAD constants to separate file

Browse files

Files changed (5) hide show

app.py +6 -18
constants.py +5 -0
stream_pipeline.py +5 -5
ui.py +35 -0
utils.py +5 -35

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import gradio as gr
 from pathlib import Path
 from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
 from hf_dataset_utils import ALL_FILES, get_transcript
 from stream_pipeline import (
@@ -21,18 +22,9 @@ from offline_pipeline import (
     load_local_file,
     run_offline_pipeline_streaming,
 )
-from utils import spec_image, render_vad_led
 from clean_up import purge_tmp_directory, cleanup_previous_run
-# Active light HTML: whole container is the light (gray = warming up, red = ready)
-ACTIVE_LIGHT_GRAY = (
-    '<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
-    '<span class="active-light__label">Warming up...</span></div>'
-)
-ACTIVE_LIGHT_RED = (
-    '<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
-    '<span class="active-light__label">Ready!</span></div>'
-)
 WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2))  # timer ticks every 0.5s
@@ -196,15 +188,11 @@ with gr.Blocks() as demo:
                     value=0,
                     label="Input gain (dB)",
                 )
-                with gr.Row(elem_classes="stream-row"):
-                    with gr.Column(scale=4, min_width=200):
-                        audio_stream = gr.Audio(
-                            sources=["microphone"], streaming=True, elem_id="audio_stream"
-                        )
-                    with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
-                        active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
                 with gr.Group(elem_classes="panel"):
-                    vad_led = gr.HTML(value=render_vad_led(False), label="Voice Activity")
                     with gr.Column(scale=5, min_width=320):
                         enhanced_text = gr.Textbox(
                             label="Enhanced Transcribed Text", lines=6, autoscroll=False

 from pathlib import Path
 from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
+from ui import ACTIVE_LIGHT_GRAY, ACTIVE_LIGHT_RED, VAD_OFF_HTML
 from hf_dataset_utils import ALL_FILES, get_transcript
 from stream_pipeline import (
     load_local_file,
     run_offline_pipeline_streaming,
 )
+from utils import spec_image
 from clean_up import purge_tmp_directory, cleanup_previous_run
 WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2))  # timer ticks every 0.5s
                     value=0,
                     label="Input gain (dB)",
                 )
+                audio_stream = gr.Audio(sources=["microphone"], streaming=True, elem_id="audio_stream")
                 with gr.Group(elem_classes="panel"):
+                    with gr.Row(scale=1, elem_classes="status-row"):
+                        active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY, label="System Status", show_label=True)
+                        vad_led = gr.HTML(value=VAD_OFF_HTML, label="Voice Activity Detection", show_label=True)
                     with gr.Column(scale=5, min_width=320):
                         enhanced_text = gr.Textbox(
                             label="Enhanced Transcribed Text", lines=6, autoscroll=False

constants.py CHANGED Viewed

@@ -30,3 +30,8 @@ STREAMER_CLASSES: Final = {
     "Deepgram Nova-3 RT": DeepgramStreamer,
     "Soniox STT-RT v3": SonioxStreamer,
 }

     "Deepgram Nova-3 RT": DeepgramStreamer,
     "Soniox STT-RT v3": SonioxStreamer,
 }
+VAD_ON: Final = "🟢"
+VAD_OFF: Final = "⚫"

stream_pipeline.py CHANGED Viewed

@@ -6,8 +6,7 @@ from constants import STREAMER_CLASSES
 from stt_streamers import DeepgramStreamer
 from sdk import SDKWrapper, SDKParams
 from typing import Any
-from utils import render_vad_led
 # ----------------------------
 # Global transcript store (UI pulls from this)
 # ----------------------------
@@ -66,7 +65,7 @@ def _to_float32_mono(y: np.ndarray) -> np.ndarray:
 def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
     print("Transcribing")
     if new_chunk is None or new_chunk[1] is None:
-        return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(False)  # No audio, so no VAD; return LED off
     sr, y = new_chunk
     if current_sr != sr:
         init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
@@ -88,7 +87,8 @@ def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, inpu
     ):
         Streamer_raw.process_chunk(y)
         Streamer_enhanced.process_chunk(out_1d)
-    return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, render_vad_led(vad_detected)
 def shutdown_streamers(from_stop_recording: bool = False):
@@ -113,7 +113,7 @@ def shutdown_streamers(from_stop_recording: bool = False):
 def on_stop_recording():
     shutdown_streamers()
-    return render_vad_led(False), None
 def clear_ui():

 from stt_streamers import DeepgramStreamer
 from sdk import SDKWrapper, SDKParams
 from typing import Any
+from ui import VAD_OFF_HTML, VAD_ON_HTML
 # ----------------------------
 # Global transcript store (UI pulls from this)
 # ----------------------------
 def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
     print("Transcribing")
     if new_chunk is None or new_chunk[1] is None:
+        return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, VAD_OFF_HTML
     sr, y = new_chunk
     if current_sr != sr:
         init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
     ):
         Streamer_raw.process_chunk(y)
         Streamer_enhanced.process_chunk(out_1d)
+    vad_led = VAD_ON_HTML if vad_detected else VAD_OFF_HTML
+    return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led
 def shutdown_streamers(from_stop_recording: bool = False):
 def on_stop_recording():
     shutdown_streamers()
+    return VAD_OFF_HTML, None
 def clear_ui():

ui.py ADDED Viewed

	@@ -0,0 +1,35 @@

+ACTIVE_LIGHT_GRAY = (
+    '<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
+    '<span class="active-light__label">Warming up...</span></div>'
+)
+ACTIVE_LIGHT_RED = (
+    '<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
+    '<span class="active-light__label">Ready!</span></div>'
+)
+VAD_ON_HTML = """
+<div style="display:flex; align-items:center; gap:10px;">
+    <div style="
+        width:25px;
+        height:25px;
+        border-radius:9999px;
+        background:#22c55e;
+        box-shadow:0 0 16px rgba(34,197,94,0.9);
+        border:1px solid #666;
+    "></div>
+</div>
+"""
+VAD_OFF_HTML = """
+<div style="display:flex; align-items:center; gap:10px;">
+    <div style="
+        width:25px;
+        height:25px;
+        border-radius:9999px;
+        background:#3f3f46;
+        box-shadow:none;
+        border:1px solid #666;
+    "></div>
+</div>
+"""

utils.py CHANGED Viewed

@@ -4,37 +4,9 @@ import librosa
 from PIL import Image
 import io
 import matplotlib.pyplot as plt
-from constants import TARGET_LOUDNESS, TARGET_TP
 import pyloudnorm as pyln
-VAD_ON_HTML = """
-<div style="display:flex; align-items:center; gap:10px;">
-    <div style="
-        width:25px;
-        height:25px;
-        border-radius:9999px;
-        background:#22c55e;
-        box-shadow:0 0 16px rgba(34,197,94,0.9);
-        border:1px solid #666;
-    "></div>
-</div>
-"""
-VAD_OFF_HTML = """
-<div style="display:flex; align-items:center; gap:10px;">
-    <div style="
-        width:25px;
-        height:25px;
-        border-radius:9999px;
-        background:#3f3f46;
-        box-shadow:none;
-        border:1px solid #666;
-    "></div>
-</div>
-"""
-SUB_ON = "🟢"
-SUB_OFF = "⚫"
 def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
     subtitles = []
@@ -42,25 +14,23 @@ def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dic
     for start, end in vad_timestamps:
         if start > cur:
             subtitles.append({
-                "text": f"Voice Detection: {SUB_OFF}",
                 "timestamp": [cur, start]
             })
         subtitles.append({
-            "text": f"Voice Detection: {SUB_ON}",
             "timestamp": [start, end]
         })
         cur = end
     if cur < length:
         subtitles.append({
-            "text": f"Voice Detection: {SUB_OFF}",
             "timestamp": [cur, length]
         })
     return subtitles
-def render_vad_led(is_speech: bool) -> str:
-    return VAD_ON_HTML if is_speech else VAD_OFF_HTML
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
     """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;

 from PIL import Image
 import io
 import matplotlib.pyplot as plt
+from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
 import pyloudnorm as pyln
+import warnings
 def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
     subtitles = []
     for start, end in vad_timestamps:
         if start > cur:
             subtitles.append({
+                "text": f"Voice Detection: {VAD_OFF}",
                 "timestamp": [cur, start]
             })
         subtitles.append({
+            "text": f"Voice Detection: {VAD_ON}",
             "timestamp": [start, end]
         })
         cur = end
     if cur < length:
         subtitles.append({
+            "text": f"Voice Detection: {VAD_OFF}",
             "timestamp": [cur, length]
         })
     return subtitles
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
     """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;