# Sbboss's picture
# RAG, language updates
# 0b2d478
"""Voice activity detection using Silero VAD (ONNX)."""
from __future__ import annotations
from array import array
from dataclasses import dataclass
from silero_vad_lite import SileroVAD
@dataclass
class VADDecision:
    """Outcome of feeding one chunk of audio to the streaming VAD.

    Attributes:
        speech_started: True when a speech segment began while the chunk
            was being processed.
        speech_ended: True when a speech segment ended while the chunk
            was being processed.
        speech_ms: Speech duration counter, in milliseconds, as it stood
            after the chunk was processed.
        silence_ms: Silence duration counter, in milliseconds, as it stood
            after the chunk was processed.
    """

    speech_started: bool
    speech_ended: bool
    speech_ms: int
    silence_ms: int
class SileroVADStream:
    """Streaming VAD state machine for 16kHz mono PCM.

    Incoming bytes are buffered and consumed in fixed frames of 512 int16
    samples. Each frame is scored by ``SileroVAD.process``; the score is
    smoothed with an exponential moving average and compared against
    ``speech_threshold`` to drive a speech / silence state machine.

    Args:
        sample_rate: Sample rate passed to ``SileroVAD`` and used to derive
            the duration of one frame.
        speech_threshold: Smoothed probability at or above which a frame
            counts as speech.
        min_speech_ms: Accumulated speech required before ``has_speech``
            reports True and before a segment is allowed to end.
        end_silence_ms: Accumulated silence required to end a segment.
        min_speech_frames: Consecutive speech frames required to start a
            segment.
        min_silence_frames: Consecutive silence frames required to end a
            segment.
        prob_smoothing: Weight given to the previous smoothed probability;
            the new frame's probability gets ``1 - prob_smoothing``.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        speech_threshold: float = 0.8,
        min_speech_ms: int = 300,
        end_silence_ms: int = 1400,
        min_speech_frames: int = 1,
        min_silence_frames: int = 1,
        prob_smoothing: float = 0.7,
    ) -> None:
        self._sample_rate = sample_rate
        # NOTE(review): the frame size is fixed at 512 samples; confirm the
        # model accepts this window for sample rates other than 16 kHz.
        self._frame_samples = 512  # 32ms @ 16kHz
        self._frame_bytes = self._frame_samples * 2  # int16
        self._vad = SileroVAD(sample_rate=sample_rate)
        # Derived after the model is built so that an unsupported sample
        # rate is still reported by SileroVAD itself.
        self._frame_ms = self._frame_duration_ms(self._frame_samples, sample_rate)
        self._speech_threshold = speech_threshold
        self._min_speech_ms = min_speech_ms
        self._end_silence_ms = end_silence_ms
        self._min_speech_frames = min_speech_frames
        self._min_silence_frames = min_silence_frames
        self._prob_smoothing = prob_smoothing
        self._buffer = bytearray()
        # Single source of truth for the initial state-machine values.
        self.reset()

    @staticmethod
    def _frame_duration_ms(frame_samples: int, sample_rate: int) -> int:
        """Return the duration of ``frame_samples`` samples in whole milliseconds."""
        # Integer rounding to nearest keeps the counters as ints.
        return (frame_samples * 1000 + sample_rate // 2) // sample_rate

    @staticmethod
    def _pcm16_to_float(frame: bytes | bytearray) -> list[float]:
        """Decode int16 PCM bytes into floats scaled by 1/32768."""
        # NOTE(review): array("h") decodes in the host's native byte order;
        # this assumes the incoming PCM matches it - confirm with callers.
        return [sample / 32768.0 for sample in array("h", frame)]

    def reset(self) -> None:
        """Discard buffered audio and return the state machine to idle.

        Only the attributes of this wrapper are reset; the ``SileroVAD``
        instance is left untouched.
        """
        self._buffer.clear()
        self._in_speech = False
        self._speech_ms = 0
        self._silence_ms = 0
        self._speech_frames = 0
        self._silence_frames = 0
        self._prob_ema = 0.0

    def has_speech(self) -> bool:
        """Return True once accumulated speech has reached ``min_speech_ms``.

        The speech counter is cleared only when a new segment starts or on
        ``reset``, so this stays True after a segment has ended.
        """
        return self._speech_ms >= self._min_speech_ms

    def _advance(self, prob: float) -> tuple[bool, bool]:
        """Apply one frame's speech probability to the state machine.

        Returns:
            ``(speech_started, speech_ended)`` for this single frame.
        """
        started = False
        ended = False
        smoothing = self._prob_smoothing
        self._prob_ema = self._prob_ema * smoothing + prob * (1.0 - smoothing)

        if self._prob_ema >= self._speech_threshold:
            self._speech_frames += 1
            self._silence_frames = 0
            if not self._in_speech and self._speech_frames >= self._min_speech_frames:
                started = True
                self._in_speech = True
                # A new segment starts counting speech from zero.
                self._speech_ms = 0
            if self._in_speech:
                self._speech_ms += self._frame_ms
                # Any speech frame cancels the silence accumulated so far.
                self._silence_ms = 0
        else:
            self._silence_frames += 1
            self._speech_frames = 0
            if self._in_speech:
                self._silence_ms += self._frame_ms
                # A segment ends only after enough speech was heard AND the
                # trailing silence is long enough.
                if (
                    self._speech_ms >= self._min_speech_ms
                    and self._silence_ms >= self._end_silence_ms
                    and self._silence_frames >= self._min_silence_frames
                ):
                    ended = True
                    self._in_speech = False
                    self._silence_ms = 0
        return started, ended

    def update(self, pcm_bytes: bytes) -> VADDecision:
        """Feed PCM bytes and return VAD decision for the latest frames.

        Bytes that do not fill a whole frame stay buffered for the next
        call. The start / end flags are OR-ed over every frame consumed by
        this call; the millisecond counters reflect the state after the
        last consumed frame.
        """
        self._buffer.extend(pcm_bytes)
        speech_started = False
        speech_ended = False
        frame_bytes = self._frame_bytes
        while len(self._buffer) >= frame_bytes:
            frame = self._buffer[:frame_bytes]
            del self._buffer[:frame_bytes]
            prob = self._vad.process(self._pcm16_to_float(frame))
            started, ended = self._advance(prob)
            speech_started = speech_started or started
            speech_ended = speech_ended or ended
        return VADDecision(
            speech_started=speech_started,
            speech_ended=speech_ended,
            speech_ms=self._speech_ms,
            silence_ms=self._silence_ms,
        )