Spaces:
Sleeping
Sleeping
import time
from dataclasses import dataclass, field

import numpy as np
@dataclass
class UtteranceState:
    """Per-conversation capture state for a voice pipeline.

    Buffers incoming speech frames (with a bounded pre-roll so the onset of
    an utterance is not clipped), tracks per-utterance timing counters used
    for endpointing, and gates microphone input while the assistant is
    speaking via two monotonic-clock deadlines: an "ignore" window and a
    barge-in grace window.

    NOTE(review): the original paste was missing the ``@dataclass``
    decorator even though it uses ``field(default_factory=...)``; without
    it no ``__init__`` is generated and the field defaults are broken.
    """

    sample_rate: int  # input audio sample rate in Hz (stored for callers; not read here)
    speech_frames: list[np.ndarray] = field(default_factory=list)  # frames of the in-progress utterance
    silence_ms: float = 0.0  # accumulated trailing silence (maintained by callers)
    pending_speech_ms: float = 0.0  # speech observed before an utterance is confirmed
    in_speech: bool = False  # True between start() and finish()/reset
    preroll_frames: list[np.ndarray] = field(default_factory=list)  # bounded buffer of recent frames
    assistant_ignore_until: float = 0.0  # monotonic deadline: drop all input before this
    assistant_barge_until: float = 0.0  # monotonic deadline: no barge-in allowed before this
    pending_barge_ms: float = 0.0  # candidate barge-in speech duration (caller-maintained)
    turn_speech_ms: float = 0.0  # speech duration within the current turn (caller-maintained)
    active_speech_ms: float = 0.0  # active speech duration (caller-maintained)
    last_backchannel_at: float = 0.0  # timestamp of last detected backchannel
    backchannel_count: int = 0  # backchannels seen this turn
    last_partial_transcript_at: float = 0.0  # when the last partial transcript arrived
    last_partial_transcript_text: str = ""  # text of the last partial transcript
    last_partial_transcript_change_at: float = 0.0  # when the partial text last changed
    last_partial_response_text: str = ""  # last partial response generated from a partial transcript
    dynamic_endpoint_target_ms: float = 0.0  # dynamically adjusted endpoint threshold
    barge_in_active: bool = False  # True while a barge-in is in progress
    recent_backchannels: list[str] = field(default_factory=list)  # recent backchannel texts
    turn_started_at: float = 0.0  # monotonic timestamp set by start()

    def push_preroll(self, frame: np.ndarray, max_samples: int) -> None:
        """Append *frame* to the pre-roll buffer, then evict oldest frames
        until the total buffered sample count is at most *max_samples*."""
        self.preroll_frames.append(frame)
        total_samples = sum(chunk.size for chunk in self.preroll_frames)
        while self.preroll_frames and total_samples > max_samples:
            # Evict from the front (oldest); buffer stays small so O(n) pop is fine.
            total_samples -= self.preroll_frames.pop(0).size

    def start(self) -> None:
        """Mark the beginning of an utterance, promoting any buffered
        pre-roll audio into the speech buffer so the onset is preserved."""
        self.in_speech = True
        self.turn_started_at = time.monotonic()
        if self.preroll_frames:
            self.speech_frames.extend(self.preroll_frames)
            self.preroll_frames.clear()

    def append(self, frame: np.ndarray) -> None:
        """Add one captured audio frame to the current utterance."""
        self.speech_frames.append(frame)

    def reset_input(self) -> None:
        """Discard all captured audio and per-utterance counters,
        including the pre-roll buffer."""
        self.clear_active_input(preserve_preroll=False)

    def clear_active_input(self, *, preserve_preroll: bool) -> None:
        """Reset every per-utterance counter and the speech buffer.

        The assistant gating deadlines (``assistant_ignore_until``,
        ``assistant_barge_until``) are deliberately left intact so that
        clearing input does not re-open the microphone mid-playback.

        Args:
            preserve_preroll: when True, keep the pre-roll buffer so the
                next utterance onset is still available.
        """
        self.in_speech = False
        self.silence_ms = 0.0
        self.pending_speech_ms = 0.0
        self.pending_barge_ms = 0.0
        self.turn_speech_ms = 0.0
        self.active_speech_ms = 0.0
        self.backchannel_count = 0
        self.last_partial_transcript_at = 0.0
        self.last_partial_transcript_text = ""
        self.last_partial_transcript_change_at = 0.0
        self.last_partial_response_text = ""
        self.dynamic_endpoint_target_ms = 0.0
        self.barge_in_active = False
        self.turn_started_at = 0.0
        self.recent_backchannels.clear()
        self.speech_frames.clear()
        if not preserve_preroll:
            self.preroll_frames.clear()

    def set_assistant_active(self, duration_ms: float, holdoff_ms: int) -> None:
        """Extend the input-ignore deadline to cover assistant playback of
        *duration_ms* plus *holdoff_ms* of settling time (negative values
        are clamped to zero). Only ever moves the deadline forward."""
        now = time.monotonic()
        total_s = max(duration_ms, 0.0) / 1000.0 + max(holdoff_ms, 0) / 1000.0
        self.assistant_ignore_until = max(self.assistant_ignore_until, now + total_s)

    def interrupt_assistant(self) -> None:
        """Cancel both gating deadlines, immediately re-enabling input."""
        self.assistant_ignore_until = 0.0
        self.assistant_barge_until = 0.0

    def should_ignore_input(self) -> bool:
        """Return True while the assistant-playback ignore window is open."""
        return time.monotonic() < self.assistant_ignore_until

    def set_barge_grace(self, grace_ms: int) -> None:
        """Extend the no-barge-in deadline by *grace_ms* (clamped to >= 0).
        Only ever moves the deadline forward."""
        grace_s = max(grace_ms, 0) / 1000.0
        self.assistant_barge_until = max(self.assistant_barge_until, time.monotonic() + grace_s)

    def can_barge_in(self) -> bool:
        """Return True once the barge-in grace window has elapsed."""
        return time.monotonic() >= self.assistant_barge_until

    def finish(self) -> np.ndarray:
        """Finalize the utterance: return all captured audio as one array
        and reset every per-utterance counter and buffer (including
        pre-roll). Gating deadlines are untouched.

        Returns:
            The concatenated utterance audio, or an empty float32 array
            when no frames were captured.
        """
        # Capture the audio before the reset clears `speech_frames`.
        audio = (
            np.concatenate(self.speech_frames)
            if self.speech_frames
            else np.zeros(0, dtype=np.float32)
        )
        # Reuse the shared reset path instead of duplicating it field-by-field.
        self.reset_input()
        return audio

    def current_audio(self) -> np.ndarray:
        """Return the utterance audio buffered so far without consuming it."""
        if not self.speech_frames:
            return np.zeros(0, dtype=np.float32)
        return np.concatenate(self.speech_frames)