Spaces:

ai-coustics
/

VoiceFocus

Sleeping

App Files Files Community

mariesig committed on Mar 4

Commit

990f149

1 Parent(s): aabdfb2

Refactor online streaming functionality and enhance documentation

Browse files

Files changed (9) hide show

app.py +27 -14
constants.py +7 -3
docs/dawn_chorus.md +1 -0
docs/intro.md +2 -9
docs/local_file.md +1 -0
docs/online.md +2 -0
online_pipeline.py +68 -70
sdk.py +24 -31
stt_streamers/deepgram_streamer.py +49 -19

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import gradio as gr
 from hf_dataset_utils import ALL_FILES
-from online_pipeline import transcribe_stream, reset_streamers, stop_streaming, change_stt_model
 from offline_pipeline import load_file_from_dataset, load_local_file, denoise_audio, retrieve_audio_information
 from clean_up import purge_tmp_directory, cleanup_previous_run
@@ -33,7 +35,7 @@ with gr.Blocks() as demo:
     )
     # Online STT streamer swap uses the same global control
-    stt_model.change(fn=change_stt_model, inputs=stt_model, outputs=[])
     with gr.Tabs(elem_classes="main-tabs"):
         # =========================
@@ -42,10 +44,14 @@ with gr.Blocks() as demo:
         with gr.Tab("Offline", elem_classes="tab-offline") as offline_tab:
             with gr.Group(elem_classes="panel"):
                 with gr.Tab("Upload local file", elem_classes="upload-tab") as upload_tab:
                     audio_file_upload = gr.Audio(type="filepath", sources=["upload"])
                     enhance_btn_for_upload = gr.Button("Enhance", scale=2)
                 with gr.Tab("Dataset: Dawn Chorus", elem_classes="dataset-tab") as dataset_tab:
                     dataset_dropdown = gr.Dropdown(choices=ALL_FILES,  value=ALL_FILES[0], label="Select a sample from the Dawn Chorus dataset")
                     audio_file_from_dataset = gr.Audio(type="filepath", interactive=False)
                     enhance_btn_for_dataset = gr.Button("Enhance", scale=2)
@@ -114,6 +120,7 @@ with gr.Blocks() as demo:
         # ONLINE TAB
         # =========================
         with gr.Tab("Online", elem_classes="tab-online"):
             with gr.Group(elem_classes="panel"):
                 stream_state = gr.State(None)
                 audio_stream = gr.Audio(sources=["microphone"], streaming=True)
@@ -123,26 +130,32 @@ with gr.Blocks() as demo:
                     with gr.Column(scale=5, min_width=320):
                         raw_text = gr.Textbox(label="Raw Transcribed Text", lines=6)
                 clear_btn = gr.Button("Clear")
-                audio_stream.stream(
                     fn=transcribe_stream,
                     inputs=[stream_state, audio_stream, enhancement_level],
                     outputs=[stream_state, enhanced_text, raw_text],
-                    stream_every=0.05,
                 )
                 clear_btn.click(
-                    fn=reset_streamers,
-                    outputs=[stream_state, enhanced_text, raw_text],
                 )
-        offline_tab.select(
-            fn=stop_streaming,
-            inputs=None,
-            outputs=[audio_stream, stream_state, enhanced_text, raw_text],
-        )
 purge_tmp_directory(max_age_minutes=0, skip_substrings=[])
 demo.launch(allowed_paths=["/tmp", "/"])

 import gradio as gr
+from constants import STREAM_EVERY
 from hf_dataset_utils import ALL_FILES
+from online_pipeline import set_stt_streamer, transcribe_stream,  stop_streaming, set_stt_streamer
 from offline_pipeline import load_file_from_dataset, load_local_file, denoise_audio, retrieve_audio_information
 from clean_up import purge_tmp_directory, cleanup_previous_run
     )
     # Online STT streamer swap uses the same global control
+    stt_model.change(fn=set_stt_streamer, inputs=stt_model, outputs=[])
     with gr.Tabs(elem_classes="main-tabs"):
         # =========================
         with gr.Tab("Offline", elem_classes="tab-offline") as offline_tab:
             with gr.Group(elem_classes="panel"):
                 with gr.Tab("Upload local file", elem_classes="upload-tab") as upload_tab:
+                    with gr.Row():
+                        gr.Markdown(open("docs/local_file.md", "r", encoding="utf-8").read())
                     audio_file_upload = gr.Audio(type="filepath", sources=["upload"])
                     enhance_btn_for_upload = gr.Button("Enhance", scale=2)
                 with gr.Tab("Dataset: Dawn Chorus", elem_classes="dataset-tab") as dataset_tab:
+                    with gr.Row():
+                        gr.Markdown(open("docs/dawn_chorus.md", "r", encoding="utf-8").read())
                     dataset_dropdown = gr.Dropdown(choices=ALL_FILES,  value=ALL_FILES[0], label="Select a sample from the Dawn Chorus dataset")
                     audio_file_from_dataset = gr.Audio(type="filepath", interactive=False)
                     enhance_btn_for_dataset = gr.Button("Enhance", scale=2)
         # ONLINE TAB
         # =========================
         with gr.Tab("Online", elem_classes="tab-online"):
+            gr.Markdown(open("docs/online.md", "r", encoding="utf-8").read())
             with gr.Group(elem_classes="panel"):
                 stream_state = gr.State(None)
                 audio_stream = gr.Audio(sources=["microphone"], streaming=True)
                     with gr.Column(scale=5, min_width=320):
                         raw_text = gr.Textbox(label="Raw Transcribed Text", lines=6)
                 clear_btn = gr.Button("Clear")
+                stream_evt = audio_stream.stream(
                     fn=transcribe_stream,
                     inputs=[stream_state, audio_stream, enhancement_level],
                     outputs=[stream_state, enhanced_text, raw_text],
+                    stream_every=STREAM_EVERY,
+                    time_limit=60*2,
+                    concurrency_limit=1,
                 )
                 clear_btn.click(
+                    fn=stop_streaming,
+                    outputs=[audio_stream,stream_state, enhanced_text, raw_text],
+                    cancels=[stream_evt]
+                ).then(
+                    set_stt_streamer,
+                    inputs=stt_model,
+                    outputs=None,
                 )
+            offline_tab.select(
+                fn=stop_streaming,
+                outputs=[audio_stream,stream_state, enhanced_text, raw_text],
+                cancels=[stream_evt]
+            )
 purge_tmp_directory(max_age_minutes=0, skip_substrings=[])
 demo.launch(allowed_paths=["/tmp", "/"])

constants.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from typing import Final
-import os
-from stt_streamers.soniox_streamer import SONIOX_WEBSOCKET_URL
 CHUNK_SIZE: Final = 1024
 TIMEOUT_FACTOR_MB: Final = 60
@@ -17,6 +17,10 @@ MIX_DIR: Final = "mix"
 SPEECH_DIR: Final = "speech"
 TRANS_DIR: Final = "transcripts"
-# Private access token from Space secrets:
 DEFAULT_SR: Final = 16000

+from re import S
 from typing import Final
+from stt_streamers import DeepgramStreamer, SonioxStreamer
 CHUNK_SIZE: Final = 1024
 TIMEOUT_FACTOR_MB: Final = 60
 SPEECH_DIR: Final = "speech"
 TRANS_DIR: Final = "transcripts"
 DEFAULT_SR: Final = 16000
+STREAM_EVERY: Final = 0.2
+STREAMER_CLASSES: Final = {
+    "Deepgram": DeepgramStreamer,
+    "Soniox": SonioxStreamer,
+}

docs/dawn_chorus.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ Select a sample from our open-source [Dawn Chorus English](https://huggingface.co/datasets/ai-coustics/dawn_chorus_en) dataset, which features challenging cases with background voice activity.

docs/intro.md CHANGED Viewed

@@ -1,9 +1,2 @@
-Welcome! This Space lets you try **ai‑coustics VoiceFocus (Quail Voice Focus)** — a real‑time, STT‑oriented enhancement model that **isolates the foreground speaker** and **suppresses competing voices + background noise**. It’s tuned to keep the phonetic cues speech‑to‑text systems need, so the output isn’t always “prettier” — just cleaner for transcription.
-**Offline:** upload an audio file or pick a sample from the dataset, then listen to the enhanced result and compare raw vs enhanced transcripts.
-**Online:** stream from your microphone and watch raw vs enhanced text update live.
-Use **Enhancement level (0–100)** to dial in the strength, and switch the **STT backend (Deepgram / Soniox)** to see how different engines react to cleaner input.
-Tip: speak close to your mic (near‑field) and keep a steady level for best results. Please don’t upload sensitive or private audio—use test material only.


1	+ Welcome! This Space lets you try ai‑coustics Quail Voice Focus — a real‑time, STT‑oriented enhancement model that isolates the foreground speaker and suppresses competing voices and background noise. For more information visit our [docs](https://docs.ai-coustics.com/guides/models#quail).
2	+ The model is tuned to preserve the phonetic cues needed for speech‑to‑text systems, so the output isn’t always “prettier”—just cleaner for transcription.

docs/local_file.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ Upload an audio file from your computer. For best results, choose a recording with overlapping or background speech to observe how the model handles challenging scenarios.

docs/online.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Use your microphone to stream live audio and see, in real time, how our enhancement technology suppresses voices in the background and excludes them from transcription.
2	+ Tip: speak close to your mic (near‑field) for best results. Please don’t upload sensitive or private audio—use test material only.

online_pipeline.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import numpy as np
 import soxr
-from constants import DEFAULT_SR
-from stt_streamers import DeepgramStreamer, SonioxStreamer
 from sdk import SDKWrapper
-import gradio as gr
 # ----------------------------
 # Global transcript store (UI pulls from this)
 # ----------------------------
@@ -12,7 +12,6 @@ _ENHANCED_TRANSCRIPT: str = ""
 _RAW_TRANSCRIPT: str = ""
 def _set_transcript_enhanced(text: str) -> None:
     """Deepgram callback: update latest transcript text (no printing)."""
     global _ENHANCED_TRANSCRIPT
@@ -23,14 +22,8 @@ def _set_transcript_raw(text: str) -> None:
     global _RAW_TRANSCRIPT
     _RAW_TRANSCRIPT = text
-map_streamer_to_callback = {
-    "Deepgram": DeepgramStreamer,
-    "Soniox": SonioxStreamer,
-}
-# ----------------------------
-# Single global streamer (stays the same)
-# ----------------------------
 global Streamer_enhanced, Streamer_raw, SDK
 Streamer_enhanced = DeepgramStreamer(
@@ -44,82 +37,88 @@ Streamer_raw = DeepgramStreamer(
     on_update=_set_transcript_raw,
 )
-ResampleStream = soxr.ResampleStream(48000, DEFAULT_SR,1,dtype='float32')
 SDK = SDKWrapper()
-SDK.init_processor(sample_rate=DEFAULT_SR, enhancement_level=1.0, allow_variable_frames=True, num_frames=800)  # 100% enhancement for online demo
-# ----------------------------
-# Gradio stream handler
-# ----------------------------
-def transcribe_stream(stream_16k, new_chunk, enhancement_level):
-    """
-    stream_16k: np.ndarray | None  (we store the running buffer in 16 kHz)
-    new_chunk: None OR (sr:int, y:np.ndarray)
-    returns: (stream_16k_state, enhanced_text, raw_text)
-    """
-    # Gradio can send None when stream ends / resets / no audio yet
-    if new_chunk is None:
-        return stream_16k, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
-    sr, y = new_chunk
-    if y is None:
-        return stream_16k, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
     y = np.asarray(y)
-    if y.size == 0:
-        return stream_16k, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
-    # Convert to mono if stereo: y can be (frames,) or (frames, channels)
     if y.ndim > 1:
         y = y.mean(axis=1)
-    # Convert dtype correctly
     if y.dtype == np.int16:
-        y = y.astype(np.float32) / 32768.0
     else:
         y = y.astype(np.float32)
-    if sr != 16000:
-        y_16k = soxr.resample(y, sr, 16000).astype(np.float32)
-    else:
-        y_16k = y
-    # ✅ Save stream in 16 kHz
-    stream_16k = y_16k if stream_16k is None else np.concatenate([stream_16k, y_16k])
-    # Enhance at 16k
     SDK.change_enhancement_level(float(enhancement_level) / 100.0)
-    enhanced_chunk_16k = SDK.process_chunk(y_16k)
-    Streamer_enhanced.process_chunk(enhanced_chunk_16k)
-    Streamer_raw.process_chunk(y_16k)
-    return stream_16k, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
-def reset_streamers():
-    global _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
     try:
-        Streamer_enhanced.clear_text()
-        Streamer_raw.clear_text()
-        ResampleStream.clear()
     except Exception:
         pass
-    _ENHANCED_TRANSCRIPT = ""
-    _RAW_TRANSCRIPT = ""
-    return None, "", ""
-def stop_streaming():
-    reset_streamers()
     return None, None, "", ""
-def change_stt_model(model_name):
-    StreamerCls = map_streamer_to_callback.get(model_name, DeepgramStreamer)
-    print(StreamerCls)
     global Streamer_enhanced, Streamer_raw
     Streamer_enhanced = StreamerCls(
         fs_hz=DEFAULT_SR,
@@ -131,5 +130,4 @@ def change_stt_model(model_name):
         stream_name="raw",
         on_update=_set_transcript_raw,
     )

 import numpy as np
 import soxr
+from constants import DEFAULT_SR, STREAM_EVERY, STREAMER_CLASSES
+from stt_streamers import DeepgramStreamer
 from sdk import SDKWrapper
+from dataclasses import dataclass
 # ----------------------------
 # Global transcript store (UI pulls from this)
 # ----------------------------
 _RAW_TRANSCRIPT: str = ""
 def _set_transcript_enhanced(text: str) -> None:
     """Deepgram callback: update latest transcript text (no printing)."""
     global _ENHANCED_TRANSCRIPT
     global _RAW_TRANSCRIPT
     _RAW_TRANSCRIPT = text
 global Streamer_enhanced, Streamer_raw, SDK
 Streamer_enhanced = DeepgramStreamer(
     on_update=_set_transcript_raw,
 )
 SDK = SDKWrapper()
+SDK.init_processor(
+    sample_rate=DEFAULT_SR,
+    enhancement_level=1.0,
+    allow_variable_frames=False,
+    num_channels=1,
+)
+@dataclass
+class EnhanceSession:
+    pending: np.ndarray        # 1D float32 @ processor sample rate
+    sr: int
+    num_frames: int
+@dataclass
+class StreamSession:
+    # only the state that is actually needed
+    resampler: soxr.ResampleStream | None
+    sr_in: int | None
+    tail_16k: np.ndarray  # ring buffer (e.g. the last 10 s)
+    tail_max: int         # max samples
+def _get_or_init_session(session: StreamSession | None, sr_in: int) -> StreamSession:
+    if session is None or session.sr_in != sr_in:
+        # ResampleStream is intended for real-time (chunk-by-chunk) processing
+        resampler = None if sr_in == 16000 else soxr.ResampleStream(sr_in, 16000, num_channels=1, dtype="float32")
+        return StreamSession(resampler=resampler, sr_in=sr_in, tail_16k=np.zeros((0,), dtype=np.float32), tail_max=10 * 16000)
+    return session
+def _to_float32_mono(y: np.ndarray) -> np.ndarray:
+    # Gradio delivers int16 audio (possibly shaped (samples, channels)).
     y = np.asarray(y)
     if y.ndim > 1:
         y = y.mean(axis=1)
     if y.dtype == np.int16:
+        y = (y.astype(np.float32) / 32768.0)
     else:
         y = y.astype(np.float32)
+    return y
+def transcribe_stream(session: StreamSession | None, new_chunk, enhancement_level):
+    if new_chunk is None or new_chunk[1] is None:
+        return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
+    sr, y = new_chunk
+    y = _to_float32_mono(y)
+    session = _get_or_init_session(session, sr)
     SDK.change_enhancement_level(float(enhancement_level) / 100.0)
+    if session.resampler is not None:
+        y_16k = session.resampler.resample_chunk(y)
+    else:
+        y_16k = y
+    # Ring buffer (do not concatenate indefinitely)
+    if y_16k.size > 0:
+        tail = np.concatenate([session.tail_16k, y_16k])
+        if tail.size > session.tail_max:
+            tail = tail[-session.tail_max:]
+        session.tail_16k = tail
+    enhanced_chunk_16k = SDK.process_sync(y_16k)
+    Streamer_enhanced.process_chunk(enhanced_chunk_16k.flatten())
+    Streamer_raw.process_chunk(y_16k.flatten())
+    return session, _ENHANCED_TRANSCRIPT, _RAW_TRANSCRIPT
+def stop_streaming():
     try:
+        Streamer_enhanced.shutdown()
+    except Exception:
+        pass
+    try:
+        Streamer_raw.shutdown()
     except Exception:
         pass
     return None, None, "", ""
+def set_stt_streamer(model_name):
+    StreamerCls = STREAMER_CLASSES.get(model_name, DeepgramStreamer)
     global Streamer_enhanced, Streamer_raw
     Streamer_enhanced = StreamerCls(
         fs_hz=DEFAULT_SR,
         stream_name="raw",
         on_update=_set_transcript_raw,
     )

sdk.py CHANGED Viewed

@@ -1,10 +1,4 @@
-# sdk_audio.py (or keep inline)
-from __future__ import annotations
 import numpy as np
-import librosa
-import soundfile as sf
 from dotenv import load_dotenv
 import aic_sdk as aic
 import os
@@ -22,16 +16,21 @@ class SDKWrapper:
         model_path = aic.Model.download(model_id, models_dir)
         self.model = aic.Model.from_file(model_path)
-    def init_processor(self, sample_rate: int, enhancement_level: float, allow_variable_frames: bool = False, num_frames: int | None = None):
         self.processor_sample_rate = sample_rate
-        self.processor_optimal_frames = self.model.get_optimal_num_frames(sample_rate)
         config = aic.ProcessorConfig(
             sample_rate=sample_rate,
-            num_channels=1,
-            num_frames=self.processor_optimal_frames if num_frames is None else num_frames,
             allow_variable_frames=allow_variable_frames,
         )
-        processor = aic.Processor(self.model, self.sdk_key, config)
         processor.get_processor_context().set_parameter(
             aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
         )
@@ -44,6 +43,13 @@ class SDKWrapper:
             aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
         )
     def process_sync(
         self,
         audio: np.ndarray,
@@ -51,12 +57,9 @@ class SDKWrapper:
         """
             audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
         """
-        if len(audio.shape) == 1:
-            audio = audio.reshape(1, -1)
-        if audio.shape[0] > 2 or len(audio.shape) != 2:
-            raise ValueError("Expected audio with shape (n, frames)")
         out = np.zeros_like(audio)
-        chunk_size = self.processor_optimal_frames
         n = audio.shape[1]
         for i in range(0, n, chunk_size):
             chunk = audio[:, i : i + chunk_size]
@@ -71,18 +74,8 @@ class SDKWrapper:
             out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
         return out
-    def process_chunk(
-        self,
-        chunk: np.ndarray,
-    ) -> np.ndarray:
-        """
-        Realtime processing: process a single chunk of audio and return enhanced chunk.
-        """
-        if not hasattr(self, "processor"):
-            raise ValueError("Processor not initialized")
-        chunk = np.asarray(chunk, dtype=np.float32).flatten()
-        if chunk.size == 0:
-            return chunk
-        chunk_planar = chunk.reshape(1, -1)
-        enhanced_planar = self.processor.process(chunk_planar)
-        return enhanced_planar.flatten()

 import numpy as np
 from dotenv import load_dotenv
 import aic_sdk as aic
 import os
         model_path = aic.Model.download(model_id, models_dir)
         self.model = aic.Model.from_file(model_path)
+    def init_processor(self, sample_rate: int, enhancement_level: float, allow_variable_frames: bool = False, num_frames: int | None = None,num_channels: int = 1, sync: bool = True):
         self.processor_sample_rate = sample_rate
+        processor_optimal_frames = self.model.get_optimal_num_frames(sample_rate)
+        self.num_frames = num_frames if num_frames else processor_optimal_frames
         config = aic.ProcessorConfig(
             sample_rate=sample_rate,
+            num_channels=num_channels,
+            num_frames=self.num_frames,
             allow_variable_frames=allow_variable_frames,
         )
+        if sync:
+            processor = aic.Processor(self.model, self.sdk_key, config)
+        else:
+            processor = aic.ProcessorAsync(self.model, self.sdk_key, config)
         processor.get_processor_context().set_parameter(
             aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
         )
             aic.ProcessorParameter.EnhancementLevel, float(enhancement_level)
         )
+    def _check_shape(self, audio: np.ndarray) -> np.ndarray:
+        if len(audio.shape) == 1:
+            audio = audio.reshape(1, -1)
+        if audio.shape[0] > 2 or len(audio.shape) != 2:
+            raise ValueError("Expected audio with shape (n, frames)")
+        return audio
     def process_sync(
         self,
         audio: np.ndarray,
         """
             audio_array: 2D NumPy array with shape (num_channels, samples) containing audio data to be enhanced
         """
+        audio = self._check_shape(audio)
         out = np.zeros_like(audio)
+        chunk_size = self.num_frames
         n = audio.shape[1]
         for i in range(0, n, chunk_size):
             chunk = audio[:, i : i + chunk_size]
             out[:, i : i + chunk_size] = enhanced[:, :chunk_size]
         return out
+    def process_chunk(self, audio: np.ndarray) -> np.ndarray:
+        audio = self._check_shape(audio)
+        result = self.processor.process(audio)
+        return result

stt_streamers/deepgram_streamer.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 import os
 import threading
 import urllib.parse
 import numpy as np
 from websockets.sync.client import connect
@@ -33,10 +34,16 @@ class DeepgramStreamer:
         # Deepgram requires the API key in the headers
         headers = {"Authorization": f"Token {api_key}"}
         self.ws = connect(url_with_params, additional_headers=headers)
         # 3. Start the receiving thread
         self.thread = threading.Thread(target=self._receive_loop, daemon=True)
         self.thread.start()
     def stream_array(self, pcm: np.ndarray) -> str:
         """
@@ -86,16 +93,16 @@ class DeepgramStreamer:
         }
     def process_chunk(self, chunk: np.ndarray) -> None:
-        """
-        Converts float32 numpy array to int16 bytes and sends to WebSocket.
-        """
         chunk = np.clip(chunk, -1.0, 1.0)
         chunk_int16 = (chunk * 32767).astype(np.int16)
-        if len(chunk_int16) > 0:
-            try:
                 self.ws.send(chunk_int16.tobytes())
-            except Exception:
-                pass
     def render_tokens(
         self, final_tokens: list[dict], non_final_tokens: list[dict]
@@ -170,18 +177,41 @@ class DeepgramStreamer:
             print(f"Deepgram receive loop error: {e}")
         finally:
             self.finished_event.set()
-    def close(self) -> str:
-        """
-        Sends the specific JSON message Deepgram expects to close the stream.
-        """
-        if hasattr(self, "ws"):
-            try:
-                # Deepgram V1 expects this specific JSON to close the stream
                 self.ws.send(json.dumps({"type": "CloseStream"}))
-            except Exception:
-                pass
-        return self.render_tokens(self.final_tokens, [])
     def _ensure_closed(self) -> None:
         """

 import json
 import os
 import threading
+import time
 import urllib.parse
 import numpy as np
 from websockets.sync.client import connect
         # Deepgram requires the API key in the headers
         headers = {"Authorization": f"Token {api_key}"}
         self.ws = connect(url_with_params, additional_headers=headers)
+        self._send_lock = threading.Lock()
+        self._stop_evt = threading.Event()
+        self._last_send_ts = time.monotonic()
         # 3. Start the receiving thread
         self.thread = threading.Thread(target=self._receive_loop, daemon=True)
         self.thread.start()
+        self.keepalive_thread = threading.Thread(target=self._keepalive_loop, daemon=True)
+        self.keepalive_thread.start()
     def stream_array(self, pcm: np.ndarray) -> str:
         """
         }
     def process_chunk(self, chunk: np.ndarray) -> None:
         chunk = np.clip(chunk, -1.0, 1.0)
         chunk_int16 = (chunk * 32767).astype(np.int16)
+        if len(chunk_int16) == 0:
+            return
+        try:
+            with self._send_lock:
                 self.ws.send(chunk_int16.tobytes())
+            self._last_send_ts = time.monotonic()
+        except Exception as e:
+            print(f"[{self.stream_name}] send failed: {e}")
     def render_tokens(
         self, final_tokens: list[dict], non_final_tokens: list[dict]
             print(f"Deepgram receive loop error: {e}")
         finally:
             self.finished_event.set()
+    def _keepalive_loop(self):
+        # Deepgram: send KeepAlive as a text message
+        while not self._stop_evt.is_set():
+            time.sleep(0.5)
+            if time.monotonic() - self._last_send_ts >= 3.0:
+                try:
+                    with self._send_lock:
+                        self.ws.send(json.dumps({"type": "KeepAlive"}))
+                    self._last_send_ts = time.monotonic()
+                except Exception:
+                    # on error: the loop ends; a later send may be able to reconnect
+                    return
+    def close(self) -> None:
+        # Deepgram CloseStream message: {"type":"CloseStream"}
+        try:
+            with self._send_lock:
                 self.ws.send(json.dumps({"type": "CloseStream"}))
+        except Exception:
+            pass
+    def shutdown(self) -> None:
+        self._stop_evt.set()
+        self.close()
+        try:
+            self.ws.close()
+        except Exception:
+            pass
+        if hasattr(self, "thread") and self.thread.is_alive():
+            self.thread.join(timeout=1.0)
+        if hasattr(self, "keepalive_thread") and self.keepalive_thread.is_alive():
+            self.keepalive_thread.join(timeout=1.0)
     def _ensure_closed(self) -> None:
         """