VOICE_TO_VOICE_AI / app /agent /vad_frames.py
USER
Initial clean voice agent - no venv or git history
2651a17
import webrtcvad
from collections import deque
class VADFrameSplitter:
    """Buffer raw PCM frames and detect when a spoken utterance has ended.

    Every frame passed to :meth:`process` is classified as speech or silence
    by a ``webrtcvad.Vad`` instance and appended to an internal buffer.
    :meth:`is_end_of_utterance` reports when speech has been followed by a
    long enough run of silence, and :meth:`flush` returns the buffered audio
    and resets the detector for the next utterance.

    Thresholds are counted in frames, so their duration depends on the frame
    length the caller feeds in. With the 20 ms frames this class is meant
    for, the default endpoint (more than 15 silent frames) is 320 ms.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        mode: int = 2,
        *,
        end_silence_frames: int = 15,
        pre_speech_frames: int = 15,
    ):
        """Create a splitter.

        Args:
            sample_rate: Sample rate in Hz, forwarded to the VAD with every
                frame. webrtcvad accepts 8000, 16000, 32000 or 48000.
            mode: webrtcvad aggressiveness (0 = least, 3 = most aggressive
                about filtering out non-speech).
            end_silence_frames: An utterance is considered finished once
                more than this many consecutive silent frames follow speech.
            pre_speech_frames: Maximum number of silent frames kept in the
                buffer ahead of the first speech frame. Older leading
                silence is discarded so the buffer cannot grow without
                bound while nobody is talking.

        Raises:
            ValueError: If ``end_silence_frames`` or ``pre_speech_frames``
                is negative.
        """
        if end_silence_frames < 0:
            raise ValueError("end_silence_frames must be >= 0")
        if pre_speech_frames < 0:
            raise ValueError("pre_speech_frames must be >= 0")

        self.vad = webrtcvad.Vad(mode)
        self.sample_rate = sample_rate
        self.end_silence_frames = end_silence_frames
        self.pre_speech_frames = pre_speech_frames
        self.frame_buffer = deque()
        # Speech frames seen since the last flush().
        self.speech_count = 0
        # Length of the current run of consecutive silent frames.
        self.silence_count = 0

    def process(self, frame: bytes) -> bool:
        """Classify one PCM frame and add it to the current utterance.

        Args:
            frame: One frame of 16-bit mono PCM. webrtcvad only accepts
                frames that are 10, 20 or 30 ms long at ``sample_rate`` and
                raises its own error for anything else.

        Returns:
            True if the frame was classified as speech, False if silence.
        """
        is_speech = self.vad.is_speech(frame, self.sample_rate)
        self.frame_buffer.append(frame)

        if is_speech:
            self.speech_count += 1
            self.silence_count = 0
            return is_speech

        self.silence_count += 1
        if self.speech_count == 0:
            # Nothing has been said yet: keep only a short pre-roll of
            # silence instead of accumulating it indefinitely.
            while len(self.frame_buffer) > self.pre_speech_frames:
                self.frame_buffer.popleft()
        return is_speech

    def is_end_of_utterance(self) -> bool:
        """Report whether the buffered utterance is complete.

        Returns:
            True only when at least one speech frame has been seen since the
            last :meth:`flush` and it has been followed by more than
            ``end_silence_frames`` consecutive silent frames. Silence on its
            own never counts as an utterance.
        """
        return (
            self.speech_count > 0
            and self.silence_count > self.end_silence_frames
        )

    def flush(self) -> bytes:
        """Return the buffered audio and reset the detector.

        Returns:
            All buffered frames concatenated in arrival order, or ``b""``
            if nothing is buffered.
        """
        audio = b"".join(self.frame_buffer)
        self.frame_buffer.clear()
        self.speech_count = 0
        self.silence_count = 0
        return audio