from collections import deque

import webrtcvad


class VADFrameSplitter:
    """Accumulate raw PCM frames and detect the end of an utterance.

    Each frame passed to :meth:`process` is classified as speech or silence
    by ``webrtcvad.Vad`` and appended to an internal buffer.  Once enough
    consecutive silent frames have been seen, :meth:`is_end_of_utterance`
    reports ``True`` and the caller can collect the buffered audio with
    :meth:`flush`.

    The class was written for 20 ms frames; the frame length itself is not
    checked here and is left to ``webrtcvad`` to validate.
    """

    def __init__(self, sample_rate: int = 16000, mode: int = 2, *,
                 end_silence_frames: int = 15,
                 min_speech_frames: int = 0) -> None:
        """Create a splitter.

        Args:
            sample_rate: Sample rate in Hz, forwarded to
                ``Vad.is_speech`` for every frame.
            mode: Aggressiveness mode forwarded to ``webrtcvad.Vad``.
            end_silence_frames: The utterance is considered finished once
                the number of consecutive silent frames is strictly greater
                than this value.  With 20 ms frames the default of 15 means
                more than 300 ms of trailing silence.
            min_speech_frames: Minimum number of speech frames that must
                have been buffered before an end of utterance is reported.
                The default of 0 keeps the historical behaviour, where a
                run of pure silence also counts as an end of utterance.

        Raises:
            ValueError: If ``end_silence_frames`` or ``min_speech_frames``
                is negative.
        """
        if end_silence_frames < 0:
            raise ValueError("end_silence_frames must be >= 0")
        if min_speech_frames < 0:
            raise ValueError("min_speech_frames must be >= 0")

        self.vad = webrtcvad.Vad(mode)
        self.sample_rate = sample_rate
        self.end_silence_frames = end_silence_frames
        self.min_speech_frames = min_speech_frames

        # Every processed frame (speech and silence) since the last flush.
        self.frame_buffer = deque()
        # Total speech frames since the last flush (not reset by silence).
        self.speech_count = 0
        # Consecutive silent frames; reset to 0 by any speech frame.
        self.silence_count = 0

    def process(self, frame: bytes) -> bool:
        """Classify one frame, buffer it, and update the counters.

        Args:
            frame: One frame of raw PCM audio, passed unchanged to
                ``Vad.is_speech`` together with ``sample_rate``.

        Returns:
            ``True`` if the frame was classified as speech, ``False`` if it
            was classified as silence.
        """
        is_speech = self.vad.is_speech(frame, self.sample_rate)

        # Silent frames are buffered too, so flush() returns the audio
        # exactly as it was received, including pauses.
        self.frame_buffer.append(frame)

        if is_speech:
            self.speech_count += 1
            self.silence_count = 0
        else:
            self.silence_count += 1

        return is_speech

    def is_end_of_utterance(self) -> bool:
        """Report whether the current utterance appears to be complete.

        Returns:
            ``True`` when more than ``end_silence_frames`` consecutive
            silent frames have been seen and at least ``min_speech_frames``
            speech frames are buffered.
        """
        if self.speech_count < self.min_speech_frames:
            return False
        return self.silence_count > self.end_silence_frames

    def flush(self) -> bytes:
        """Return all buffered audio and reset the splitter state.

        Returns:
            The buffered frames concatenated in arrival order; ``b""`` if
            nothing has been processed since the last flush.
        """
        audio = b"".join(self.frame_buffer)
        self.frame_buffer.clear()
        self.speech_count = 0
        self.silence_count = 0
        return audio