Spaces:
Sleeping
Sleeping
| """Voice activity detection using Silero VAD (ONNX).""" | |
| from __future__ import annotations | |
| from array import array | |
| from dataclasses import dataclass | |
| from silero_vad_lite import SileroVAD | |
@dataclass
class VADDecision:
    """Result of a single ``SileroVADStream.update`` call.

    The class must be a dataclass: ``update`` builds it with keyword
    arguments, which a plain class with bare annotations would reject
    with ``TypeError``.
    """

    # True when a speech segment began while processing this call's frames.
    speech_started: bool
    # True when a speech segment was closed while processing this call's frames.
    speech_ended: bool
    # Milliseconds of speech counted since the most recent speech start.
    speech_ms: int
    # Milliseconds of silence currently trailing the active speech segment.
    silence_ms: int
class SileroVADStream:
    """Streaming VAD state machine for mono 16-bit PCM.

    Incoming PCM bytes are buffered and cut into fixed 32 ms frames. Each
    frame is scored by Silero VAD, the score is smoothed with an exponential
    moving average, and the smoothed value drives a small state machine that
    reports when a speech segment starts and ends.
    """

    # Duration of one analysis frame in milliseconds
    # (512 samples at 16 kHz, 256 samples at 8 kHz).
    _FRAME_MS = 32
    # Input samples are signed 16-bit integers.
    _BYTES_PER_SAMPLE = 2

    def __init__(
        self,
        sample_rate: int = 16000,
        speech_threshold: float = 0.8,
        min_speech_ms: int = 300,
        end_silence_ms: int = 1400,
        min_speech_frames: int = 1,
        min_silence_frames: int = 1,
        prob_smoothing: float = 0.7,
    ) -> None:
        """Create the stream and its underlying Silero VAD instance.

        Args:
            sample_rate: Sample rate of the incoming PCM in Hz; also passed
                to ``SileroVAD``.
            speech_threshold: A frame counts as speech when the smoothed
                probability is at or above this value.
            min_speech_ms: Speech that must accumulate before ``has_speech``
                is true and before a segment is allowed to end.
            end_silence_ms: Trailing silence required to end a segment.
            min_speech_frames: Consecutive speech frames required to start
                a segment.
            min_silence_frames: Consecutive non-speech frames required
                before a segment may end.
            prob_smoothing: Weight kept from the previous smoothed
                probability; the new frame gets ``1 - prob_smoothing``.

        Raises:
            ValueError: If ``sample_rate`` is too low to produce at least
                one sample per frame.
        """
        self._sample_rate = sample_rate
        # Derive the frame length from the sample rate instead of assuming
        # 16 kHz, so frame size and the millisecond accounting stay in sync.
        self._frame_samples = sample_rate * self._FRAME_MS // 1000
        if self._frame_samples < 1:
            # A zero-length frame would make the loop in update() spin forever.
            raise ValueError(
                f"sample_rate {sample_rate!r} is too low for "
                f"{self._FRAME_MS} ms frames"
            )
        self._frame_bytes = self._frame_samples * self._BYTES_PER_SAMPLE
        self._vad = SileroVAD(sample_rate=sample_rate)
        self._speech_threshold = speech_threshold
        self._min_speech_ms = min_speech_ms
        self._end_silence_ms = end_silence_ms
        self._min_speech_frames = min_speech_frames
        self._min_silence_frames = min_silence_frames
        self._prob_smoothing = prob_smoothing
        self._buffer = bytearray()
        # All remaining per-stream state is initialised in one place.
        self.reset()

    def reset(self) -> None:
        """Drop buffered audio and return the state machine to idle."""
        self._buffer.clear()
        self._in_speech = False
        self._speech_ms = 0
        self._silence_ms = 0
        self._speech_frames = 0
        self._silence_frames = 0
        self._prob_ema = 0.0

    def has_speech(self) -> bool:
        """Return True once the counted speech reaches ``min_speech_ms``."""
        return self._speech_ms >= self._min_speech_ms

    def update(self, pcm_bytes: bytes) -> VADDecision:
        """Feed PCM bytes and return the VAD decision for the new frames.

        Bytes that do not fill a whole frame stay buffered for the next
        call. The start/end flags are true if the event happened on any
        frame processed during this call.

        Args:
            pcm_bytes: Raw 16-bit PCM audio; any length is accepted.

        Returns:
            A ``VADDecision`` with the start/end flags for this call and the
            current speech and silence counters in milliseconds.
        """
        self._buffer.extend(pcm_bytes)
        frame_bytes = self._frame_bytes
        speech_started = False
        speech_ended = False

        while len(self._buffer) >= frame_bytes:
            # Slicing copies the frame, so it is safe to drop it from the
            # buffer before decoding.
            samples = array("h", self._buffer[:frame_bytes])
            del self._buffer[:frame_bytes]
            # Scale int16 samples to floats in [-1.0, 1.0).
            prob = self._vad.process([s / 32768.0 for s in samples])
            started, ended = self._advance(prob)
            speech_started = speech_started or started
            speech_ended = speech_ended or ended

        return VADDecision(
            speech_started=speech_started,
            speech_ended=speech_ended,
            speech_ms=self._speech_ms,
            silence_ms=self._silence_ms,
        )

    def _advance(self, prob: float) -> tuple[bool, bool]:
        """Advance the state machine by one frame; return (started, ended)."""
        keep = self._prob_smoothing
        self._prob_ema = self._prob_ema * keep + prob * (1.0 - keep)

        started = False
        ended = False

        if self._prob_ema >= self._speech_threshold:
            self._speech_frames += 1
            self._silence_frames = 0
            if not self._in_speech and self._speech_frames >= self._min_speech_frames:
                started = True
                self._in_speech = True
                # A new segment starts counting from zero.
                self._speech_ms = 0
            if self._in_speech:
                self._speech_ms += self._FRAME_MS
                self._silence_ms = 0
            return started, ended

        self._silence_frames += 1
        self._speech_frames = 0
        if self._in_speech:
            self._silence_ms += self._FRAME_MS
            if (
                self._speech_ms >= self._min_speech_ms
                and self._silence_ms >= self._end_silence_ms
                and self._silence_frames >= self._min_silence_frames
            ):
                ended = True
                self._in_speech = False
                # _speech_ms is kept so has_speech() still reflects the
                # segment that just ended.
                self._silence_ms = 0
        return started, ended