Spaces:

rudyByte
/

maya-voice-agent

Paused

rudyByte

fix: implement debounced barge-in check, greeting isolation, and remove aggressive dynamic VAD fast-tracking to eliminate premature filler interruptions and cut-off greetings

61cd1af 6 days ago

raw

history blame contribute delete

12 kB

	"""
	vad_engine.py — Neural Voice Activity Detection for Maya

	Replaces the old RMS energy-based VAD (SPEECH_THRESHOLD = 400)
	with Silero VAD, a 1MB neural model that:
	- Processes one audio chunk in under 1ms on CPU
	- Was trained on 6000+ languages including Hindi and Gujarati
	- Handles background noise, phone audio quality, and soft speech
	- Zero false positives from fans, AC units, or background TV
	- MIT licensed, no API key needed, runs fully offline

	Architecture:
	Each incoming Twilio audio chunk (8kHz µ-law) passes through:
	1. Audio format conversion: µ-law 8kHz → PCM 16kHz float32
	2. Silero VAD inference: returns speech probability 0.0–1.0
	3. Hangover logic: keeps speech "open" for language-appropriate
	silence duration before declaring end-of-turn
	4. Pre-roll buffer: keeps 200ms before speech onset so first
	syllable is never clipped (preserves existing Maya behavior)

	Language-aware silence thresholds (research-backed):
	Gujarati: 1000ms — Gujarati speakers pause longer between phrases
	Hindi: 800ms — Standard South Asian speech rhythm
	English: 600ms — Standard conversational English pause

	Usage:
	vad = SileroVADEngine()
	result = vad.process_chunk(raw_mulaw_bytes, current_language)
	if result.end_of_turn:
	# User has finished speaking — send to STT
	audio_for_stt = result.speech_audio
	"""

	import numpy as np
	import audioop
	from dataclasses import dataclass, field
	from typing import Optional
	from silero_vad import load_silero_vad
	import torch


	# ── SILENCE WINDOW PER LANGUAGE (milliseconds) ──────────────────────────────
	# How long the caller must stay silent before the turn is considered over.
	# Gujarati and Hindi get longer windows than English so that natural
	# inter-phrase pauses are not mistaken for the end of a turn.
	SILENCE_DURATION_MS = {
	    "gujarati": 1000,  # 1.0s, the longest window
	    "hindi": 800,      # 0.8s
	    "english": 600,    # 0.6s
	    "default": 800,    # fallback entry
	}

	# Minimum Silero probability for a frame to count as speech. 0.5 is the
	# usual value; Maya uses 0.75 to reject phone-line noise and echo.
	SPEECH_PROBABILITY_THRESHOLD = 0.75

	# ── Audio format ────────────────────────────────────────────────────────────
	INPUT_SAMPLE_RATE = 8000    # Twilio delivers 8kHz µ-law (G.711 PCMU)
	TARGET_SAMPLE_RATE = 16000  # rate used for Silero VAD and Whisper
	CHUNK_DURATION_MS = 32      # one VAD frame = 32ms of 16kHz audio
	# Samples per VAD frame: 16000 * 32 / 1000 = 512
	CHUNK_SIZE_SAMPLES = TARGET_SAMPLE_RATE * CHUNK_DURATION_MS // 1000

	# ── Pre-roll ────────────────────────────────────────────────────────────────
	# Audio retained from just before speech onset so the first syllable
	# is not clipped.
	PRE_ROLL_MS = 200
	PRE_ROLL_SAMPLES = TARGET_SAMPLE_RATE * PRE_ROLL_MS // 1000  # 3200 samples

	# ── Safety cap ──────────────────────────────────────────────────────────────
	# Longest utterance buffered before a forced flush (15s).
	MAX_SPEECH_DURATION_MS = 15000
	MAX_SPEECH_SAMPLES = TARGET_SAMPLE_RATE * MAX_SPEECH_DURATION_MS // 1000


	@dataclass
	class VADResult:
	    """Outcome of one ``process_chunk`` call on the VAD engine.

	    All fields have defaults, so ``VADResult()`` means "no speech seen,
	    turn still open, nothing to transcribe".
	    """

	    # True when the most recently analysed frame was classified as speech.
	    is_speech: bool = False
	    # True when the caller's turn has ended (silence timeout or safety cap).
	    end_of_turn: bool = False
	    # Buffered utterance for STT: raw float32 PCM samples at 16kHz.
	    # Only populated when end_of_turn is True.
	    speech_audio: bytes = b""
	    # Raw Silero probability of the most recently analysed frame (debugging).
	    speech_prob: float = 0.0


	class SileroVADEngine:
	    """
	    Drop-in replacement for Maya's RMS energy VAD.

	    The Silero model is loaded ONCE as a class-level singleton and reused
	    by every instance. Each instance keeps its own per-call buffers
	    (speech buffer, silence counter, pre-roll buffer, resampler state).

	    NOTE(review): ``reset()`` calls ``reset_states()`` on the *shared*
	    model. If the model keeps recurrent state internally, concurrent calls
	    sharing it can still influence each other — confirm against the
	    silero_vad documentation before relying on per-call isolation.
	    """

	    # ── Class-level singleton so the model is loaded only once ───────────────
	    _shared_model = None

	    @classmethod
	    def _ensure_model_loaded(cls):
	        """Load the shared Silero model on first use; later calls are no-ops."""
	        if cls._shared_model is None:
	            print("[VAD] Loading Silero VAD model...")
	            cls._shared_model = load_silero_vad()
	            cls._shared_model.eval()
	            # Process-wide torch setting: one intra-op CPU thread.
	            torch.set_num_threads(1)
	            print("[VAD] ✅ Silero VAD loaded — 1ms per chunk on CPU")

	    def __init__(self):
	        """Bind the shared model and start with empty per-call state."""
	        SileroVADEngine._ensure_model_loaded()
	        self.model = SileroVADEngine._shared_model
	        # Per-call state — reset on each new call via reset()
	        self._reset_state()

	    def _reset_state(self):
	        """Reset all per-call buffers and counters (does not touch the model)."""
	        self._pre_roll_buffer: list = []      # recent non-speech frames kept for pre-roll
	        self._speech_buffer: list = []        # frames of the utterance in progress
	        self._silence_frames: int = 0         # consecutive non-speech frames while in speech
	        self._in_speech: bool = False
	        self._audio_remainder: bytes = b""    # float32 samples left over (< one frame)
	        # audioop.ratecv converter state, carried from packet to packet so
	        # resampling is continuous across packet boundaries.
	        self._ratecv_state = None

	    def reset(self):
	        """Public reset — call at start of each new phone call."""
	        self._reset_state()
	        # Also clear the model's internal state (shared model object).
	        self.model.reset_states()
	        print("[VAD] State reset for new call")

	    def _convert_mulaw_to_pcm16k(self, mulaw_bytes: bytes) -> np.ndarray:
	        """
	        Convert G.711 µ-law 8kHz audio to 16kHz float32 PCM.

	        Steps:
	        1. µ-law → 16-bit linear PCM (audioop.ulaw2lin)
	        2. 8kHz → 16kHz resample (audioop.ratecv)
	        3. int16 numpy array → float32 scaled by 1/32768

	        Args:
	            mulaw_bytes: µ-law encoded mono audio at INPUT_SAMPLE_RATE.

	        Returns:
	            1-D float32 array of samples at TARGET_SAMPLE_RATE.
	        """
	        # Step 1: µ-law → 16-bit PCM at 8kHz
	        pcm_8k = audioop.ulaw2lin(mulaw_bytes, 2)

	        # Step 2: Resample 8kHz → 16kHz. The converter state returned by
	        # ratecv is fed back into the next call; passing None every time
	        # would restart the converter at every packet boundary.
	        pcm_16k, self._ratecv_state = audioop.ratecv(
	            pcm_8k, 2, 1, INPUT_SAMPLE_RATE, TARGET_SAMPLE_RATE,
	            self._ratecv_state,
	        )

	        # Step 3: Convert to a float32 numpy array
	        audio_int16 = np.frombuffer(pcm_16k, dtype=np.int16)
	        return audio_int16.astype(np.float32) / 32768.0

	    def _get_silence_frames_threshold(self, language: str) -> int:
	        """
	        Return how many consecutive non-speech frames end the turn.

	        One frame is CHUNK_DURATION_MS (32ms) long. Languages missing from
	        SILENCE_DURATION_MS use its "default" entry.
	        """
	        silence_ms = SILENCE_DURATION_MS.get(language,
	                                             SILENCE_DURATION_MS["default"])
	        return silence_ms // CHUNK_DURATION_MS  # e.g. 1000ms // 32ms = 31 frames

	    def _flush_turn(self, result: "VADResult") -> None:
	        """Mark *result* as end-of-turn with the buffered audio, then clear turn state."""
	        result.end_of_turn = True
	        result.speech_audio = self._build_speech_bytes()
	        self._in_speech = False
	        self._speech_buffer = []
	        self._silence_frames = 0

	    def process_chunk(self, mulaw_bytes: bytes, language: str = "gujarati") -> "VADResult":
	        """
	        Process one Twilio audio chunk through Silero VAD.

	        The packet is converted to 16kHz float32, joined with any samples
	        left over from the previous call, and analysed in frames of
	        CHUNK_SIZE_SAMPLES. Processing stops at the first end-of-turn;
	        unprocessed samples are kept for the next call.

	        Args:
	            mulaw_bytes: Raw G.711 µ-law bytes from Twilio 'media' event
	            language: Current detected language of the call session

	        Returns:
	            VADResult — check .end_of_turn to know when to send to STT.
	            .is_speech / .speech_prob describe the last frame analysed.
	        """
	        result = VADResult()

	        # Convert incoming audio to the model's input format
	        audio_float32 = self._convert_mulaw_to_pcm16k(mulaw_bytes)

	        # Prepend samples left over from the previous packet, if any
	        if self._audio_remainder:
	            combined = np.concatenate(
	                [np.frombuffer(self._audio_remainder, dtype=np.float32),
	                 audio_float32]
	            )
	        else:
	            combined = audio_float32

	        # Loop invariants
	        max_pre_roll_frames = PRE_ROLL_SAMPLES // CHUNK_SIZE_SAMPLES
	        # Full per-language silence window — no dynamic shortening, so a
	        # natural pause or breath does not end the turn early.
	        threshold = self._get_silence_frames_threshold(language)

	        # Process in CHUNK_SIZE_SAMPLES (512 sample) windows
	        idx = 0
	        while idx + CHUNK_SIZE_SAMPLES <= len(combined):
	            chunk = combined[idx : idx + CHUNK_SIZE_SAMPLES]
	            idx += CHUNK_SIZE_SAMPLES

	            # Inference only — no autograd bookkeeping needed
	            with torch.no_grad():
	                chunk_tensor = torch.FloatTensor(chunk)
	                speech_prob = self.model(chunk_tensor,
	                                         TARGET_SAMPLE_RATE).item()
	            is_speech = speech_prob >= SPEECH_PROBABILITY_THRESHOLD

	            result.speech_prob = speech_prob
	            result.is_speech = is_speech

	            # ── STATE MACHINE ───────────────────────────────────────────────

	            if not self._in_speech:
	                if is_speech:
	                    # Speech onset detected — start accumulating
	                    self._in_speech = True
	                    self._silence_frames = 0
	                    # Pre-roll holds only frames *before* this one, so the
	                    # onset frame is added exactly once.
	                    self._speech_buffer = self._pre_roll_buffer + [chunk]
	                    # Pre-roll is consumed; leaving it in place would
	                    # prepend stale audio to the next utterance.
	                    self._pre_roll_buffer = []
	                    print(f"[VAD] Speech start detected (prob={speech_prob:.2f},"
	                          f" lang={language})")
	                else:
	                    # Keep the most recent non-speech frames as pre-roll
	                    self._pre_roll_buffer.append(chunk)
	                    if len(self._pre_roll_buffer) > max_pre_roll_frames:
	                        self._pre_roll_buffer.pop(0)
	                continue

	            # We are currently in speech
	            self._speech_buffer.append(chunk)

	            # Safety cap: force flush if speech too long
	            total_samples = len(self._speech_buffer) * CHUNK_SIZE_SAMPLES
	            if total_samples >= MAX_SPEECH_SAMPLES:
	                print("[VAD] Safety cap reached — force flushing")
	                self._flush_turn(result)
	                break

	            if is_speech:
	                # Speech resumed — reset silence counter
	                self._silence_frames = 0
	                continue

	            self._silence_frames += 1
	            if self._silence_frames >= threshold:
	                # User has finished their turn
	                print(f"[VAD] End of turn detected after "
	                      f"{self._silence_frames * CHUNK_DURATION_MS}ms silence"
	                      f" (threshold={threshold * CHUNK_DURATION_MS}ms,"
	                      f" lang={language})")
	                self._flush_turn(result)
	                break

	        # Keep unprocessed samples (partial frame, or frames after an
	        # end-of-turn break) for the next call
	        if len(combined) - idx > 0:
	            self._audio_remainder = combined[idx:].tobytes()
	        else:
	            self._audio_remainder = b""

	        return result

	    def _build_speech_bytes(self) -> bytes:
	        """
	        Concatenate all buffered speech frames into one bytes object
	        (raw float32 samples at 16kHz). Returns b"" when nothing is buffered.
	        """
	        if not self._speech_buffer:
	            return b""
	        combined = np.concatenate(self._speech_buffer)
	        return combined.tobytes()

	    def get_debug_stats(self) -> dict:
	        """Returns current VAD state for logging and debugging."""
	        return {
	            "in_speech": self._in_speech,
	            "silence_frames": self._silence_frames,
	            "speech_buf_secs": len(self._speech_buffer) * CHUNK_DURATION_MS / 1000,
	            "pre_roll_frames": len(self._pre_roll_buffer),
	        }