# voice-agent / app/models.py
# Deployed to Hugging Face Space by RalphThings (commit a8bcb70)
from dataclasses import dataclass, field
import time
import numpy as np
@dataclass
class UtteranceState:
    """Mutable per-connection state for utterance capture and turn-taking.

    Buffers incoming audio frames for the current user utterance, keeps a
    rolling pre-roll buffer so the start of speech is not clipped, and tracks
    the timing counters used for endpointing, backchannel detection, and
    barge-in gating against assistant playback.

    All "until" deadlines are ``time.monotonic()`` timestamps in seconds;
    all ``*_ms`` counters are durations in milliseconds.
    """

    # Sample rate (Hz) of the buffered audio frames.
    sample_rate: int
    # Frames captured since speech started; flushed by finish().
    speech_frames: list[np.ndarray] = field(default_factory=list)
    # Accumulated trailing silence within the current turn.
    silence_ms: float = 0.0
    # Speech observed but not yet confirmed as a turn start.
    pending_speech_ms: float = 0.0
    # True while a user turn is being captured.
    in_speech: bool = False
    # Rolling buffer of recent frames, prepended to a turn on start().
    preroll_frames: list[np.ndarray] = field(default_factory=list)
    # Deadline until which user input is ignored (assistant is speaking).
    assistant_ignore_until: float = 0.0
    # Deadline until which barge-in is suppressed (grace window).
    assistant_barge_until: float = 0.0
    # Speech observed during assistant playback, pending barge-in decision.
    pending_barge_ms: float = 0.0
    # Total speech duration in the current turn.
    turn_speech_ms: float = 0.0
    # Speech duration in the current active segment.
    active_speech_ms: float = 0.0
    # Monotonic timestamp of the last emitted backchannel.
    last_backchannel_at: float = 0.0
    # Number of backchannels emitted during this turn.
    backchannel_count: int = 0
    # Partial-transcript bookkeeping used for dynamic endpointing.
    last_partial_transcript_at: float = 0.0
    last_partial_transcript_text: str = ""
    last_partial_transcript_change_at: float = 0.0
    last_partial_response_text: str = ""
    # Current dynamic endpoint target; 0.0 means "use the default".
    dynamic_endpoint_target_ms: float = 0.0
    # True once the user has successfully barged in over the assistant.
    barge_in_active: bool = False
    # Recently used backchannel phrases (to avoid repetition).
    recent_backchannels: list[str] = field(default_factory=list)
    # Monotonic timestamp when the current turn started; 0.0 when idle.
    turn_started_at: float = 0.0

    def push_preroll(self, frame: np.ndarray, max_samples: int) -> None:
        """Append *frame* to the pre-roll buffer, capped at *max_samples*.

        Oldest frames are dropped whole (no partial trimming), so the buffer
        may hold slightly fewer than ``max_samples`` samples after eviction.
        """
        self.preroll_frames.append(frame)
        total_samples = sum(chunk.size for chunk in self.preroll_frames)
        while self.preroll_frames and total_samples > max_samples:
            total_samples -= self.preroll_frames.pop(0).size

    def start(self) -> None:
        """Begin a user turn, promoting any buffered pre-roll into it."""
        self.in_speech = True
        self.turn_started_at = time.monotonic()
        if self.preroll_frames:
            self.speech_frames.extend(self.preroll_frames)
            self.preroll_frames.clear()

    def append(self, frame: np.ndarray) -> None:
        """Add an audio frame to the current utterance."""
        self.speech_frames.append(frame)

    def reset_input(self) -> None:
        """Discard all captured input, including the pre-roll buffer."""
        self.clear_active_input(preserve_preroll=False)

    def clear_active_input(self, *, preserve_preroll: bool) -> None:
        """Reset every per-turn counter and buffer.

        Args:
            preserve_preroll: When True, keep the pre-roll buffer so the
                next turn can still recover its leading audio.
        """
        self.in_speech = False
        self.silence_ms = 0.0
        self.pending_speech_ms = 0.0
        self.pending_barge_ms = 0.0
        self.turn_speech_ms = 0.0
        self.active_speech_ms = 0.0
        self.backchannel_count = 0
        self.last_partial_transcript_at = 0.0
        self.last_partial_transcript_text = ""
        self.last_partial_transcript_change_at = 0.0
        self.last_partial_response_text = ""
        self.dynamic_endpoint_target_ms = 0.0
        self.barge_in_active = False
        self.turn_started_at = 0.0
        self.recent_backchannels.clear()
        self.speech_frames.clear()
        if not preserve_preroll:
            self.preroll_frames.clear()

    def set_assistant_active(self, duration_ms: float, holdoff_ms: int) -> None:
        """Extend the input-ignore window to cover assistant playback.

        The window is ``duration_ms + holdoff_ms`` from now; negative inputs
        are clamped to zero, and an already-later deadline is never shortened.
        """
        now = time.monotonic()
        total_s = max(duration_ms, 0.0) / 1000.0 + max(holdoff_ms, 0) / 1000.0
        self.assistant_ignore_until = max(self.assistant_ignore_until, now + total_s)

    def interrupt_assistant(self) -> None:
        """Cancel both the ignore and barge-grace windows immediately."""
        self.assistant_ignore_until = 0.0
        self.assistant_barge_until = 0.0

    def should_ignore_input(self) -> bool:
        """Return True while assistant playback makes input ineligible."""
        return time.monotonic() < self.assistant_ignore_until

    def set_barge_grace(self, grace_ms: int) -> None:
        """Suppress barge-in for *grace_ms* from now (never shortens)."""
        grace_s = max(grace_ms, 0) / 1000.0
        self.assistant_barge_until = max(self.assistant_barge_until, time.monotonic() + grace_s)

    def can_barge_in(self) -> bool:
        """Return True once the barge-in grace window has elapsed."""
        return time.monotonic() >= self.assistant_barge_until

    def finish(self) -> np.ndarray:
        """Finalize the current utterance and return its audio.

        Concatenates the captured frames (empty float32 array when none were
        captured), then resets all per-turn state including the pre-roll
        buffer. Delegates the reset to clear_active_input() so new fields
        only need resetting in one place.
        """
        audio = (
            np.concatenate(self.speech_frames)
            if self.speech_frames
            else np.zeros(0, dtype=np.float32)
        )
        self.clear_active_input(preserve_preroll=False)
        return audio

    def current_audio(self) -> np.ndarray:
        """Return the utterance captured so far without consuming it."""
        if not self.speech_frames:
            return np.zeros(0, dtype=np.float32)
        return np.concatenate(self.speech_frames)