Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
mariesig committed on
Commit ·
06fe429
1
Parent(s): 7274b79
move VAD constants to separate file
Browse files
- app.py +6 -18
- constants.py +5 -0
- stream_pipeline.py +5 -5
- ui.py +35 -0
- utils.py +5 -35
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import gradio as gr
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
|
|
|
|
| 8 |
from hf_dataset_utils import ALL_FILES, get_transcript
|
| 9 |
|
| 10 |
from stream_pipeline import (
|
|
@@ -21,18 +22,9 @@ from offline_pipeline import (
|
|
| 21 |
load_local_file,
|
| 22 |
run_offline_pipeline_streaming,
|
| 23 |
)
|
| 24 |
-
from utils import spec_image
|
| 25 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 26 |
|
| 27 |
-
# Active light HTML: whole container is the light (gray = warming up, red = ready)
|
| 28 |
-
ACTIVE_LIGHT_GRAY = (
|
| 29 |
-
'<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 30 |
-
'<span class="active-light__label">Warming up...</span></div>'
|
| 31 |
-
)
|
| 32 |
-
ACTIVE_LIGHT_RED = (
|
| 33 |
-
'<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 34 |
-
'<span class="active-light__label">Ready!</span></div>'
|
| 35 |
-
)
|
| 36 |
WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
|
| 37 |
|
| 38 |
|
|
@@ -196,15 +188,11 @@ with gr.Blocks() as demo:
|
|
| 196 |
value=0,
|
| 197 |
label="Input gain (dB)",
|
| 198 |
)
|
| 199 |
-
|
| 200 |
-
with gr.Column(scale=4, min_width=200):
|
| 201 |
-
audio_stream = gr.Audio(
|
| 202 |
-
sources=["microphone"], streaming=True, elem_id="audio_stream"
|
| 203 |
-
)
|
| 204 |
-
with gr.Column(scale=1, min_width=120, elem_classes="active-light-column"):
|
| 205 |
-
active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY)
|
| 206 |
with gr.Group(elem_classes="panel"):
|
| 207 |
-
|
|
|
|
|
|
|
| 208 |
with gr.Column(scale=5, min_width=320):
|
| 209 |
enhanced_text = gr.Textbox(
|
| 210 |
label="Enhanced Transcribed Text", lines=6, autoscroll=False
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
from constants import STREAM_EVERY, WARMUP_SECONDS, APP_TMP_DIR
|
| 8 |
+
from ui import ACTIVE_LIGHT_GRAY, ACTIVE_LIGHT_RED, VAD_OFF_HTML
|
| 9 |
from hf_dataset_utils import ALL_FILES, get_transcript
|
| 10 |
|
| 11 |
from stream_pipeline import (
|
|
|
|
| 22 |
load_local_file,
|
| 23 |
run_offline_pipeline_streaming,
|
| 24 |
)
|
| 25 |
+
from utils import spec_image
|
| 26 |
from clean_up import purge_tmp_directory, cleanup_previous_run
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
WARMUP_TICKS = max(1, int(WARMUP_SECONDS * 2)) # timer ticks every 0.5s
|
| 29 |
|
| 30 |
|
|
|
|
| 188 |
value=0,
|
| 189 |
label="Input gain (dB)",
|
| 190 |
)
|
| 191 |
+
audio_stream = gr.Audio(sources=["microphone"], streaming=True, elem_id="audio_stream")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
with gr.Group(elem_classes="panel"):
|
| 193 |
+
with gr.Row(scale=1, elem_classes="status-row"):
|
| 194 |
+
active_light = gr.HTML(value=ACTIVE_LIGHT_GRAY, label="System Status", show_label=True)
|
| 195 |
+
vad_led = gr.HTML(value=VAD_OFF_HTML, label="Voice Activity Detection", show_label=True)
|
| 196 |
with gr.Column(scale=5, min_width=320):
|
| 197 |
enhanced_text = gr.Textbox(
|
| 198 |
label="Enhanced Transcribed Text", lines=6, autoscroll=False
|
constants.py
CHANGED
|
@@ -30,3 +30,8 @@ STREAMER_CLASSES: Final = {
|
|
| 30 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 31 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 32 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"Deepgram Nova-3 RT": DeepgramStreamer,
|
| 31 |
"Soniox STT-RT v3": SonioxStreamer,
|
| 32 |
}
|
| 33 |
+
|
| 34 |
+
VAD_ON: Final = "🟢"
|
| 35 |
+
VAD_OFF: Final = "⚫"
|
| 36 |
+
|
| 37 |
+
|
stream_pipeline.py
CHANGED
|
@@ -6,8 +6,7 @@ from constants import STREAMER_CLASSES
|
|
| 6 |
from stt_streamers import DeepgramStreamer
|
| 7 |
from sdk import SDKWrapper, SDKParams
|
| 8 |
from typing import Any
|
| 9 |
-
from
|
| 10 |
-
|
| 11 |
# ----------------------------
|
| 12 |
# Global transcript store (UI pulls from this)
|
| 13 |
# ----------------------------
|
|
@@ -66,7 +65,7 @@ def _to_float32_mono(y: np.ndarray) -> np.ndarray:
|
|
| 66 |
def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
|
| 67 |
print("Transcribing")
|
| 68 |
if new_chunk is None or new_chunk[1] is None:
|
| 69 |
-
return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT,
|
| 70 |
sr, y = new_chunk
|
| 71 |
if current_sr != sr:
|
| 72 |
init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
|
|
@@ -88,7 +87,8 @@ def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, inpu
|
|
| 88 |
):
|
| 89 |
Streamer_raw.process_chunk(y)
|
| 90 |
Streamer_enhanced.process_chunk(out_1d)
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def shutdown_streamers(from_stop_recording: bool = False):
|
|
@@ -113,7 +113,7 @@ def shutdown_streamers(from_stop_recording: bool = False):
|
|
| 113 |
|
| 114 |
def on_stop_recording():
|
| 115 |
shutdown_streamers()
|
| 116 |
-
return
|
| 117 |
|
| 118 |
|
| 119 |
def clear_ui():
|
|
|
|
| 6 |
from stt_streamers import DeepgramStreamer
|
| 7 |
from sdk import SDKWrapper, SDKParams
|
| 8 |
from typing import Any
|
| 9 |
+
from ui import VAD_OFF_HTML, VAD_ON_HTML
|
|
|
|
| 10 |
# ----------------------------
|
| 11 |
# Global transcript store (UI pulls from this)
|
| 12 |
# ----------------------------
|
|
|
|
| 65 |
def transcribe_stream(current_sr: int | None, new_chunk, enhancement_level, input_gain_db: float = 0.0, stt_streamer: str = "deepgram") -> tuple[int | None, str, str, Any]:
|
| 66 |
print("Transcribing")
|
| 67 |
if new_chunk is None or new_chunk[1] is None:
|
| 68 |
+
return None, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, VAD_OFF_HTML
|
| 69 |
sr, y = new_chunk
|
| 70 |
if current_sr != sr:
|
| 71 |
init_streaming_sdk(sample_rate=sr, enhancement_level=enhancement_level/100.0)
|
|
|
|
| 87 |
):
|
| 88 |
Streamer_raw.process_chunk(y)
|
| 89 |
Streamer_enhanced.process_chunk(out_1d)
|
| 90 |
+
vad_led = VAD_ON_HTML if vad_detected else VAD_OFF_HTML
|
| 91 |
+
return current_sr, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT, vad_led
|
| 92 |
|
| 93 |
|
| 94 |
def shutdown_streamers(from_stop_recording: bool = False):
|
|
|
|
| 113 |
|
| 114 |
def on_stop_recording():
|
| 115 |
shutdown_streamers()
|
| 116 |
+
return VAD_OFF_HTML, None
|
| 117 |
|
| 118 |
|
| 119 |
def clear_ui():
|
ui.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ACTIVE_LIGHT_GRAY = (
|
| 2 |
+
'<div class="active-light active-light--off" title="Warming up" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 3 |
+
'<span class="active-light__label">Warming up...</span></div>'
|
| 4 |
+
)
|
| 5 |
+
ACTIVE_LIGHT_RED = (
|
| 6 |
+
'<div class="active-light active-light--on" title="Ready" style="display:flex;justify-content:center;align-items:center;width:100%;height:100%;min-height:3rem;">'
|
| 7 |
+
'<span class="active-light__label">Ready!</span></div>'
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
VAD_ON_HTML = """
|
| 12 |
+
<div style="display:flex; align-items:center; gap:10px;">
|
| 13 |
+
<div style="
|
| 14 |
+
width:25px;
|
| 15 |
+
height:25px;
|
| 16 |
+
border-radius:9999px;
|
| 17 |
+
background:#22c55e;
|
| 18 |
+
box-shadow:0 0 16px rgba(34,197,94,0.9);
|
| 19 |
+
border:1px solid #666;
|
| 20 |
+
"></div>
|
| 21 |
+
</div>
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
VAD_OFF_HTML = """
|
| 25 |
+
<div style="display:flex; align-items:center; gap:10px;">
|
| 26 |
+
<div style="
|
| 27 |
+
width:25px;
|
| 28 |
+
height:25px;
|
| 29 |
+
border-radius:9999px;
|
| 30 |
+
background:#3f3f46;
|
| 31 |
+
box-shadow:none;
|
| 32 |
+
border:1px solid #666;
|
| 33 |
+
"></div>
|
| 34 |
+
</div>
|
| 35 |
+
"""
|
utils.py
CHANGED
|
@@ -4,37 +4,9 @@ import librosa
|
|
| 4 |
from PIL import Image
|
| 5 |
import io
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
-
from constants import TARGET_LOUDNESS, TARGET_TP
|
| 8 |
import pyloudnorm as pyln
|
| 9 |
-
|
| 10 |
-
VAD_ON_HTML = """
|
| 11 |
-
<div style="display:flex; align-items:center; gap:10px;">
|
| 12 |
-
<div style="
|
| 13 |
-
width:25px;
|
| 14 |
-
height:25px;
|
| 15 |
-
border-radius:9999px;
|
| 16 |
-
background:#22c55e;
|
| 17 |
-
box-shadow:0 0 16px rgba(34,197,94,0.9);
|
| 18 |
-
border:1px solid #666;
|
| 19 |
-
"></div>
|
| 20 |
-
</div>
|
| 21 |
-
"""
|
| 22 |
-
|
| 23 |
-
VAD_OFF_HTML = """
|
| 24 |
-
<div style="display:flex; align-items:center; gap:10px;">
|
| 25 |
-
<div style="
|
| 26 |
-
width:25px;
|
| 27 |
-
height:25px;
|
| 28 |
-
border-radius:9999px;
|
| 29 |
-
background:#3f3f46;
|
| 30 |
-
box-shadow:none;
|
| 31 |
-
border:1px solid #666;
|
| 32 |
-
"></div>
|
| 33 |
-
</div>
|
| 34 |
-
"""
|
| 35 |
-
|
| 36 |
-
SUB_ON = "🟢"
|
| 37 |
-
SUB_OFF = "⚫"
|
| 38 |
|
| 39 |
def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
|
| 40 |
subtitles = []
|
|
@@ -42,25 +14,23 @@ def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dic
|
|
| 42 |
for start, end in vad_timestamps:
|
| 43 |
if start > cur:
|
| 44 |
subtitles.append({
|
| 45 |
-
"text": f"Voice Detection: {
|
| 46 |
"timestamp": [cur, start]
|
| 47 |
})
|
| 48 |
|
| 49 |
subtitles.append({
|
| 50 |
-
"text": f"Voice Detection: {
|
| 51 |
"timestamp": [start, end]
|
| 52 |
})
|
| 53 |
|
| 54 |
cur = end
|
| 55 |
if cur < length:
|
| 56 |
subtitles.append({
|
| 57 |
-
"text": f"Voice Detection: {
|
| 58 |
"timestamp": [cur, length]
|
| 59 |
})
|
| 60 |
return subtitles
|
| 61 |
|
| 62 |
-
def render_vad_led(is_speech: bool) -> str:
|
| 63 |
-
return VAD_ON_HTML if is_speech else VAD_OFF_HTML
|
| 64 |
|
| 65 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 66 |
"""Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
|
|
|
|
| 4 |
from PIL import Image
|
| 5 |
import io
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
+
from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
|
| 8 |
import pyloudnorm as pyln
|
| 9 |
+
import warnings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
|
| 12 |
subtitles = []
|
|
|
|
| 14 |
for start, end in vad_timestamps:
|
| 15 |
if start > cur:
|
| 16 |
subtitles.append({
|
| 17 |
+
"text": f"Voice Detection: {VAD_OFF}",
|
| 18 |
"timestamp": [cur, start]
|
| 19 |
})
|
| 20 |
|
| 21 |
subtitles.append({
|
| 22 |
+
"text": f"Voice Detection: {VAD_ON}",
|
| 23 |
"timestamp": [start, end]
|
| 24 |
})
|
| 25 |
|
| 26 |
cur = end
|
| 27 |
if cur < length:
|
| 28 |
subtitles.append({
|
| 29 |
+
"text": f"Voice Detection: {VAD_OFF}",
|
| 30 |
"timestamp": [cur, length]
|
| 31 |
})
|
| 32 |
return subtitles
|
| 33 |
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
|
| 36 |
"""Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
|