Spaces:
Running
feat: Enhance voice activity detection and turn tracing features
Browse files- Updated VAD parameters in `.env.example` for improved responsiveness and accuracy, including adjustments to `VAD_MIN_SILENCE_DURATION`, `VAD_THRESHOLD`, and endpointing delays.
- Introduced a new `PendingUserUtterance` class in `metrics_collector.py` to manage user utterances more effectively, allowing for better tracking of speech-to-text transitions.
- Refactored `TurnTracer` to support coalescing of user transcripts, enabling the merging of immediate continuations into prior turns for a more seamless conversation flow.
- Updated settings in `settings.py` to reflect new defaults for voice processing and turn detection.
- Enhanced tests to validate the new features and ensure proper functionality of the updated VAD and turn tracing mechanisms.
- .env.example +11 -10
- .gitignore +2 -1
- src/agent/runtime/session.py +25 -3
- src/agent/traces/metrics_collector.py +156 -26
- src/agent/traces/turn_tracer.py +190 -9
- src/core/settings.py +23 -11
- tests/test_langfuse_turn_tracing.py +322 -0
- tests/test_runtime_settings.py +21 -5
- tests/test_session_conn_options.py +53 -4
|
@@ -46,6 +46,7 @@ LANGFUSE_TRACE_FINALIZE_TIMEOUT_MS=8000
|
|
| 46 |
LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS=30000
|
| 47 |
LANGFUSE_MAX_PENDING_TRACE_TASKS=200
|
| 48 |
LANGFUSE_TRACE_FLUSH_TIMEOUT_MS=1000
|
|
|
|
| 49 |
|
| 50 |
# Common LLM Parameters
|
| 51 |
LLM_TEMPERATURE=0.7
|
|
@@ -85,19 +86,19 @@ LIVEKIT_NUM_IDLE_PROCESSES=1 # Use 0-1 locally to reduce memory pressure
|
|
| 85 |
LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC=20.0 # Increase idle worker bootstrap timeout
|
| 86 |
LIVEKIT_JOB_MEMORY_WARN_MB=6144 # Per-job memory warning threshold (6 GB)
|
| 87 |
|
| 88 |
-
# LiveKit
|
| 89 |
LIVEKIT_SAMPLE_RATE=24000
|
| 90 |
LIVEKIT_NUM_CHANNELS=1
|
| 91 |
-
LIVEKIT_FRAME_SIZE_MS=
|
| 92 |
LIVEKIT_PRE_CONNECT_AUDIO=true
|
| 93 |
LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
|
| 94 |
|
| 95 |
-
# Voice Activity Detection (VAD)
|
| 96 |
-
VAD_MIN_SPEECH_DURATION=0.18 # Require 180ms of speech before activation
|
| 97 |
-
VAD_MIN_SILENCE_DURATION=0.
|
| 98 |
-
VAD_THRESHOLD=0.
|
| 99 |
|
| 100 |
-
# Turn endpointing
|
| 101 |
-
MIN_ENDPOINTING_DELAY=0.
|
| 102 |
-
MAX_ENDPOINTING_DELAY=
|
| 103 |
-
PREEMPTIVE_GENERATION=
|
|
|
|
| 46 |
LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS=30000
|
| 47 |
LANGFUSE_MAX_PENDING_TRACE_TASKS=200
|
| 48 |
LANGFUSE_TRACE_FLUSH_TIMEOUT_MS=1000
|
| 49 |
+
LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS=1500 # Merge immediate continuation turns into one trace; 0 disables it
|
| 50 |
|
| 51 |
# Common LLM Parameters
|
| 52 |
LLM_TEMPERATURE=0.7
|
|
|
|
| 86 |
LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC=20.0 # Increase idle worker bootstrap timeout
|
| 87 |
LIVEKIT_JOB_MEMORY_WARN_MB=6144 # Per-job memory warning threshold (6 GB)
|
| 88 |
|
| 89 |
+
# LiveKit audio input configuration
|
| 90 |
LIVEKIT_SAMPLE_RATE=24000
|
| 91 |
LIVEKIT_NUM_CHANNELS=1
|
| 92 |
+
LIVEKIT_FRAME_SIZE_MS=60 # Larger frames slightly reduce responsiveness but avoid over-eager VAD transitions
|
| 93 |
LIVEKIT_PRE_CONNECT_AUDIO=true
|
| 94 |
LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
|
| 95 |
|
| 96 |
+
# Voice Activity Detection (VAD) configuration
|
| 97 |
+
VAD_MIN_SPEECH_DURATION=0.18 # Require 180ms of speech before activation
|
| 98 |
+
VAD_MIN_SILENCE_DURATION=0.55 # Wait longer before treating a pause as end of speech
|
| 99 |
+
VAD_THRESHOLD=0.5 # Silero default; keep balanced sensitivity for speech vs background noise
|
| 100 |
|
| 101 |
+
# Turn endpointing tuning
|
| 102 |
+
MIN_ENDPOINTING_DELAY=0.8 # Add a bit more patience before committing a turn to avoid false splits
|
| 103 |
+
MAX_ENDPOINTING_DELAY=3.0 # Let the detector wait longer when phrasing suggests continuation
|
| 104 |
+
PREEMPTIVE_GENERATION=false # Wait for the committed turn before generating a reply
|
|
@@ -29,4 +29,5 @@ model_cache/
|
|
| 29 |
|
| 30 |
# OS
|
| 31 |
.DS_Store
|
| 32 |
-
codex-skills/
|
|
|
|
|
|
| 29 |
|
| 30 |
# OS
|
| 31 |
.DS_Store
|
| 32 |
+
codex-skills/
|
| 33 |
+
blog/
|
|
@@ -12,7 +12,7 @@ from livekit.agents import AgentServer, AgentSession, room_io
|
|
| 12 |
from livekit.agents.types import APIConnectOptions
|
| 13 |
from livekit.agents.voice.agent_session import SessionConnectOptions
|
| 14 |
from livekit.plugins import noise_cancellation, silero
|
| 15 |
-
from livekit.plugins.turn_detector.
|
| 16 |
|
| 17 |
from src.agent.models.llm_runtime import (
|
| 18 |
build_llm_runtime,
|
|
@@ -71,6 +71,15 @@ def _resolve_stt_metrics_model_name() -> str:
|
|
| 71 |
return settings.stt.NVIDIA_STT_MODEL
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def _build_session_connect_options() -> tuple[APIConnectOptions, SessionConnectOptions]:
|
| 75 |
llm_conn_options = build_api_connect_options(
|
| 76 |
max_retry=settings.llm.LLM_CONN_MAX_RETRY,
|
|
@@ -173,8 +182,21 @@ async def session_handler(ctx: agents.JobContext) -> None:
|
|
| 173 |
model=llm_runtime.model,
|
| 174 |
)
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
session_kwargs: dict[str, Any] = dict(
|
| 177 |
-
stt=
|
| 178 |
llm=llm_runtime.llm,
|
| 179 |
tts=tts_engine,
|
| 180 |
vad=silero.VAD.load(
|
|
@@ -182,7 +204,7 @@ async def session_handler(ctx: agents.JobContext) -> None:
|
|
| 182 |
min_silence_duration=settings.voice.VAD_MIN_SILENCE_DURATION,
|
| 183 |
activation_threshold=settings.voice.VAD_THRESHOLD,
|
| 184 |
),
|
| 185 |
-
turn_detection=
|
| 186 |
min_endpointing_delay=settings.voice.MIN_ENDPOINTING_DELAY,
|
| 187 |
max_endpointing_delay=settings.voice.MAX_ENDPOINTING_DELAY,
|
| 188 |
preemptive_generation=settings.voice.PREEMPTIVE_GENERATION,
|
|
|
|
| 12 |
from livekit.agents.types import APIConnectOptions
|
| 13 |
from livekit.agents.voice.agent_session import SessionConnectOptions
|
| 14 |
from livekit.plugins import noise_cancellation, silero
|
| 15 |
+
from livekit.plugins.turn_detector.english import EnglishModel
|
| 16 |
|
| 17 |
from src.agent.models.llm_runtime import (
|
| 18 |
build_llm_runtime,
|
|
|
|
| 71 |
return settings.stt.NVIDIA_STT_MODEL
|
| 72 |
|
| 73 |
|
| 74 |
+
def _resolve_stt_language() -> str:
|
| 75 |
+
provider = settings.stt.STT_PROVIDER.lower()
|
| 76 |
+
if provider == "moonshine":
|
| 77 |
+
return settings.stt.MOONSHINE_LANGUAGE
|
| 78 |
+
if provider == "deepgram":
|
| 79 |
+
return settings.stt.DEEPGRAM_STT_LANGUAGE
|
| 80 |
+
return settings.stt.NVIDIA_STT_LANGUAGE_CODE
|
| 81 |
+
|
| 82 |
+
|
| 83 |
def _build_session_connect_options() -> tuple[APIConnectOptions, SessionConnectOptions]:
|
| 84 |
llm_conn_options = build_api_connect_options(
|
| 85 |
max_retry=settings.llm.LLM_CONN_MAX_RETRY,
|
|
|
|
| 182 |
model=llm_runtime.model,
|
| 183 |
)
|
| 184 |
|
| 185 |
+
stt_engine = create_stt()
|
| 186 |
+
logger.info(
|
| 187 |
+
"Turn profile: detector=%s stt_provider=%s stt_model=%s stt_language=%s vad_min_silence=%.2fs min_endpointing=%.2fs max_endpointing=%.2fs preemptive_generation=%s",
|
| 188 |
+
"EnglishModel",
|
| 189 |
+
settings.stt.STT_PROVIDER,
|
| 190 |
+
_resolve_stt_metrics_model_name(),
|
| 191 |
+
_resolve_stt_language(),
|
| 192 |
+
settings.voice.VAD_MIN_SILENCE_DURATION,
|
| 193 |
+
settings.voice.MIN_ENDPOINTING_DELAY,
|
| 194 |
+
settings.voice.MAX_ENDPOINTING_DELAY,
|
| 195 |
+
settings.voice.PREEMPTIVE_GENERATION,
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
session_kwargs: dict[str, Any] = dict(
|
| 199 |
+
stt=stt_engine,
|
| 200 |
llm=llm_runtime.llm,
|
| 201 |
tts=tts_engine,
|
| 202 |
vad=silero.VAD.load(
|
|
|
|
| 204 |
min_silence_duration=settings.voice.VAD_MIN_SILENCE_DURATION,
|
| 205 |
activation_threshold=settings.voice.VAD_THRESHOLD,
|
| 206 |
),
|
| 207 |
+
turn_detection=EnglishModel(),
|
| 208 |
min_endpointing_delay=settings.voice.MIN_ENDPOINTING_DELAY,
|
| 209 |
max_endpointing_delay=settings.voice.MAX_ENDPOINTING_DELAY,
|
| 210 |
preemptive_generation=settings.voice.PREEMPTIVE_GENERATION,
|
|
@@ -202,6 +202,17 @@ class TurnState:
|
|
| 202 |
first_audio_monotonic: Optional[float] = None
|
| 203 |
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
# ------------------------------------------------------------------
|
| 206 |
# Facade
|
| 207 |
# ------------------------------------------------------------------
|
|
@@ -241,12 +252,11 @@ class MetricsCollector:
|
|
| 241 |
)
|
| 242 |
|
| 243 |
self._publisher = ChannelPublisher(room)
|
| 244 |
-
self.
|
| 245 |
self._pending_agent_transcripts: deque[str] = deque()
|
| 246 |
self._pending_speech_ids_for_first_audio: deque[str] = deque()
|
| 247 |
self._latest_agent_speech_id: Optional[str] = None
|
| 248 |
self._turns: dict[str, TurnState] = {}
|
| 249 |
-
self._pending_llm_watchdog_ids: deque[str] = deque()
|
| 250 |
self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
|
| 251 |
self._latest_vad_metrics: Optional[VADMetrics] = None
|
| 252 |
self._latest_vad_metric_attributes: Optional[dict[str, Any]] = None
|
|
@@ -318,18 +328,34 @@ class MetricsCollector:
|
|
| 318 |
normalized = transcript.strip()
|
| 319 |
if not normalized:
|
| 320 |
return
|
| 321 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
if not self._first_final_user_turn_logged:
|
| 323 |
self._first_final_user_turn_logged = True
|
| 324 |
logger.info(
|
| 325 |
"First finalized user transcript received: room=%s chars=%s preview=%r",
|
| 326 |
self._room_name,
|
| 327 |
-
len(
|
| 328 |
-
|
| 329 |
)
|
| 330 |
-
self._start_llm_stall_watchdog(transcript=normalized)
|
| 331 |
room_id = await self._resolve_room_id()
|
| 332 |
-
await self._tracer.create_turn(
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
async def on_conversation_item_added(
|
| 335 |
self,
|
|
@@ -345,7 +371,28 @@ class MetricsCollector:
|
|
| 345 |
if not normalized:
|
| 346 |
return
|
| 347 |
if role == "user":
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
return
|
| 350 |
assistant_event_created_at = (
|
| 351 |
item_created_at if item_created_at is not None else event_created_at
|
|
@@ -519,8 +566,10 @@ class MetricsCollector:
|
|
| 519 |
if isinstance(collected_metrics, metrics.STTMetrics):
|
| 520 |
speech_id = collected_metrics.request_id
|
| 521 |
turn_metrics = self._get_or_create_turn(speech_id, role="user")
|
| 522 |
-
|
| 523 |
-
|
|
|
|
|
|
|
| 524 |
turn_metrics.stt = STTMetrics(
|
| 525 |
type=collected_metrics.type,
|
| 526 |
label=collected_metrics.label,
|
|
@@ -605,6 +654,7 @@ class MetricsCollector:
|
|
| 605 |
elif isinstance(collected_metrics, metrics.EOUMetrics):
|
| 606 |
speech_id = collected_metrics.speech_id
|
| 607 |
if speech_id:
|
|
|
|
| 608 |
state = self._get_or_create_state(speech_id)
|
| 609 |
if state.speech_end_monotonic is None:
|
| 610 |
state.speech_end_monotonic = monotonic()
|
|
@@ -788,12 +838,44 @@ class MetricsCollector:
|
|
| 788 |
s for s in self._pending_speech_ids_for_first_audio if s != speech_id
|
| 789 |
)
|
| 790 |
|
| 791 |
-
def
|
| 792 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
return
|
| 794 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
watchdog_id = str(uuid.uuid4())
|
| 796 |
-
self._pending_llm_watchdog_ids.append(watchdog_id)
|
| 797 |
task = asyncio.create_task(
|
| 798 |
self._warn_if_turn_stalled_before_llm(
|
| 799 |
watchdog_id=watchdog_id,
|
|
@@ -807,14 +889,36 @@ class MetricsCollector:
|
|
| 807 |
self._llm_stall_tasks.pop(watchdog_id, None)
|
| 808 |
|
| 809 |
task.add_done_callback(_on_done)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 810 |
|
| 811 |
def _mark_llm_stage_reached(self) -> None:
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
async def _warn_if_turn_stalled_before_llm(
|
| 820 |
self,
|
|
@@ -827,21 +931,21 @@ class MetricsCollector:
|
|
| 827 |
except asyncio.CancelledError:
|
| 828 |
return
|
| 829 |
|
| 830 |
-
|
|
|
|
| 831 |
return
|
| 832 |
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
preview = transcript[:80]
|
| 838 |
logger.warning(
|
| 839 |
"Turn stalled before LLM stage: timeout=%.2fs room=%s transcript_chars=%s transcript_preview=%r",
|
| 840 |
self._llm_stall_timeout_sec,
|
| 841 |
self._room_name,
|
| 842 |
-
len(transcript),
|
| 843 |
preview,
|
| 844 |
)
|
|
|
|
| 845 |
|
| 846 |
async def _resolve_room_id(self) -> str:
|
| 847 |
if self._room_id and self._room_id != self._room_name:
|
|
@@ -881,6 +985,32 @@ def _append_if_new(queue: deque[str], value: str) -> None:
|
|
| 881 |
queue.append(value)
|
| 882 |
|
| 883 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
def _trace_turn_has_tool_activity(trace_turn: Optional[TraceTurn]) -> bool:
|
| 885 |
if trace_turn is None:
|
| 886 |
return False
|
|
|
|
| 202 |
first_audio_monotonic: Optional[float] = None
|
| 203 |
|
| 204 |
|
| 205 |
+
@dataclass
|
| 206 |
+
class PendingUserUtterance:
|
| 207 |
+
"""Logical user utterance that may span multiple final STT chunks."""
|
| 208 |
+
|
| 209 |
+
transcript: str
|
| 210 |
+
committed: bool = False
|
| 211 |
+
stt_observed: bool = False
|
| 212 |
+
llm_started: bool = False
|
| 213 |
+
watchdog_id: Optional[str] = None
|
| 214 |
+
|
| 215 |
+
|
| 216 |
# ------------------------------------------------------------------
|
| 217 |
# Facade
|
| 218 |
# ------------------------------------------------------------------
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
self._publisher = ChannelPublisher(room)
|
| 255 |
+
self._pending_user_utterances: deque[PendingUserUtterance] = deque()
|
| 256 |
self._pending_agent_transcripts: deque[str] = deque()
|
| 257 |
self._pending_speech_ids_for_first_audio: deque[str] = deque()
|
| 258 |
self._latest_agent_speech_id: Optional[str] = None
|
| 259 |
self._turns: dict[str, TurnState] = {}
|
|
|
|
| 260 |
self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
|
| 261 |
self._latest_vad_metrics: Optional[VADMetrics] = None
|
| 262 |
self._latest_vad_metric_attributes: Optional[dict[str, Any]] = None
|
|
|
|
| 328 |
normalized = transcript.strip()
|
| 329 |
if not normalized:
|
| 330 |
return
|
| 331 |
+
utterance = self._current_open_user_utterance()
|
| 332 |
+
if utterance is None:
|
| 333 |
+
utterance = PendingUserUtterance(transcript=normalized)
|
| 334 |
+
utterance.watchdog_id = self._start_llm_stall_watchdog(transcript=normalized)
|
| 335 |
+
self._pending_user_utterances.append(utterance)
|
| 336 |
+
else:
|
| 337 |
+
utterance.transcript = _merge_user_transcripts(
|
| 338 |
+
utterance.transcript,
|
| 339 |
+
normalized,
|
| 340 |
+
)
|
| 341 |
+
if utterance.watchdog_id is not None:
|
| 342 |
+
self._update_llm_stall_watchdog(
|
| 343 |
+
utterance.watchdog_id,
|
| 344 |
+
utterance.transcript,
|
| 345 |
+
)
|
| 346 |
if not self._first_final_user_turn_logged:
|
| 347 |
self._first_final_user_turn_logged = True
|
| 348 |
logger.info(
|
| 349 |
"First finalized user transcript received: room=%s chars=%s preview=%r",
|
| 350 |
self._room_name,
|
| 351 |
+
len(utterance.transcript),
|
| 352 |
+
utterance.transcript[:80],
|
| 353 |
)
|
|
|
|
| 354 |
room_id = await self._resolve_room_id()
|
| 355 |
+
await self._tracer.create_turn(
|
| 356 |
+
user_transcript=utterance.transcript,
|
| 357 |
+
room_id=room_id,
|
| 358 |
+
)
|
| 359 |
|
| 360 |
async def on_conversation_item_added(
|
| 361 |
self,
|
|
|
|
| 371 |
if not normalized:
|
| 372 |
return
|
| 373 |
if role == "user":
|
| 374 |
+
utterance = self._latest_user_utterance()
|
| 375 |
+
if utterance is None:
|
| 376 |
+
utterance = PendingUserUtterance(
|
| 377 |
+
transcript=normalized,
|
| 378 |
+
committed=True,
|
| 379 |
+
)
|
| 380 |
+
self._pending_user_utterances.append(utterance)
|
| 381 |
+
else:
|
| 382 |
+
utterance.transcript = normalized
|
| 383 |
+
utterance.committed = True
|
| 384 |
+
if utterance.watchdog_id is not None:
|
| 385 |
+
self._update_llm_stall_watchdog(
|
| 386 |
+
utterance.watchdog_id,
|
| 387 |
+
utterance.transcript,
|
| 388 |
+
)
|
| 389 |
+
user_event_created_at = (
|
| 390 |
+
item_created_at if item_created_at is not None else event_created_at
|
| 391 |
+
)
|
| 392 |
+
await self._tracer.attach_user_text(
|
| 393 |
+
normalized,
|
| 394 |
+
event_created_at=user_event_created_at,
|
| 395 |
+
)
|
| 396 |
return
|
| 397 |
assistant_event_created_at = (
|
| 398 |
item_created_at if item_created_at is not None else event_created_at
|
|
|
|
| 566 |
if isinstance(collected_metrics, metrics.STTMetrics):
|
| 567 |
speech_id = collected_metrics.request_id
|
| 568 |
turn_metrics = self._get_or_create_turn(speech_id, role="user")
|
| 569 |
+
utterance = self._next_user_utterance_for_stt()
|
| 570 |
+
if utterance is not None:
|
| 571 |
+
turn_metrics.transcript = utterance.transcript
|
| 572 |
+
utterance.stt_observed = True
|
| 573 |
turn_metrics.stt = STTMetrics(
|
| 574 |
type=collected_metrics.type,
|
| 575 |
label=collected_metrics.label,
|
|
|
|
| 654 |
elif isinstance(collected_metrics, metrics.EOUMetrics):
|
| 655 |
speech_id = collected_metrics.speech_id
|
| 656 |
if speech_id:
|
| 657 |
+
self._mark_oldest_open_user_utterance_committed()
|
| 658 |
state = self._get_or_create_state(speech_id)
|
| 659 |
if state.speech_end_monotonic is None:
|
| 660 |
state.speech_end_monotonic = monotonic()
|
|
|
|
| 838 |
s for s in self._pending_speech_ids_for_first_audio if s != speech_id
|
| 839 |
)
|
| 840 |
|
| 841 |
+
def _current_open_user_utterance(self) -> Optional[PendingUserUtterance]:
|
| 842 |
+
utterance = self._latest_user_utterance()
|
| 843 |
+
if utterance is None or utterance.committed:
|
| 844 |
+
return None
|
| 845 |
+
return utterance
|
| 846 |
+
|
| 847 |
+
def _latest_user_utterance(self) -> Optional[PendingUserUtterance]:
|
| 848 |
+
if not self._pending_user_utterances:
|
| 849 |
+
return None
|
| 850 |
+
return self._pending_user_utterances[-1]
|
| 851 |
+
|
| 852 |
+
def _next_user_utterance_for_stt(self) -> Optional[PendingUserUtterance]:
|
| 853 |
+
for utterance in self._pending_user_utterances:
|
| 854 |
+
if utterance.stt_observed:
|
| 855 |
+
continue
|
| 856 |
+
return utterance
|
| 857 |
+
return None
|
| 858 |
+
|
| 859 |
+
def _mark_oldest_open_user_utterance_committed(self) -> None:
|
| 860 |
+
for utterance in self._pending_user_utterances:
|
| 861 |
+
if utterance.committed:
|
| 862 |
+
continue
|
| 863 |
+
utterance.committed = True
|
| 864 |
+
self._prune_resolved_user_utterances()
|
| 865 |
return
|
| 866 |
|
| 867 |
+
def _prune_resolved_user_utterances(self) -> None:
|
| 868 |
+
while self._pending_user_utterances:
|
| 869 |
+
utterance = self._pending_user_utterances[0]
|
| 870 |
+
if not utterance.committed or not utterance.llm_started:
|
| 871 |
+
break
|
| 872 |
+
self._pending_user_utterances.popleft()
|
| 873 |
+
|
| 874 |
+
def _start_llm_stall_watchdog(self, *, transcript: str) -> str | None:
|
| 875 |
+
if self._llm_stall_timeout_sec <= 0:
|
| 876 |
+
return None
|
| 877 |
+
|
| 878 |
watchdog_id = str(uuid.uuid4())
|
|
|
|
| 879 |
task = asyncio.create_task(
|
| 880 |
self._warn_if_turn_stalled_before_llm(
|
| 881 |
watchdog_id=watchdog_id,
|
|
|
|
| 889 |
self._llm_stall_tasks.pop(watchdog_id, None)
|
| 890 |
|
| 891 |
task.add_done_callback(_on_done)
|
| 892 |
+
return watchdog_id
|
| 893 |
+
|
| 894 |
+
def _update_llm_stall_watchdog(self, watchdog_id: str, transcript: str) -> None:
|
| 895 |
+
utterance = self._find_user_utterance_by_watchdog(watchdog_id)
|
| 896 |
+
if utterance is None or utterance.llm_started:
|
| 897 |
+
return
|
| 898 |
+
utterance.transcript = transcript
|
| 899 |
|
| 900 |
def _mark_llm_stage_reached(self) -> None:
|
| 901 |
+
for utterance in self._pending_user_utterances:
|
| 902 |
+
if utterance.llm_started:
|
| 903 |
+
continue
|
| 904 |
+
utterance.llm_started = True
|
| 905 |
+
watchdog_id = utterance.watchdog_id
|
| 906 |
+
utterance.watchdog_id = None
|
| 907 |
+
if watchdog_id is not None:
|
| 908 |
+
task = self._llm_stall_tasks.pop(watchdog_id, None)
|
| 909 |
+
if task:
|
| 910 |
+
task.cancel()
|
| 911 |
+
self._prune_resolved_user_utterances()
|
| 912 |
+
return
|
| 913 |
+
|
| 914 |
+
def _find_user_utterance_by_watchdog(
|
| 915 |
+
self,
|
| 916 |
+
watchdog_id: str,
|
| 917 |
+
) -> Optional[PendingUserUtterance]:
|
| 918 |
+
for utterance in self._pending_user_utterances:
|
| 919 |
+
if utterance.watchdog_id == watchdog_id:
|
| 920 |
+
return utterance
|
| 921 |
+
return None
|
| 922 |
|
| 923 |
async def _warn_if_turn_stalled_before_llm(
|
| 924 |
self,
|
|
|
|
| 931 |
except asyncio.CancelledError:
|
| 932 |
return
|
| 933 |
|
| 934 |
+
utterance = self._find_user_utterance_by_watchdog(watchdog_id)
|
| 935 |
+
if utterance is None or utterance.llm_started:
|
| 936 |
return
|
| 937 |
|
| 938 |
+
utterance.watchdog_id = None
|
| 939 |
+
utterance.llm_started = True
|
| 940 |
+
preview = utterance.transcript[:80] if utterance.transcript else transcript[:80]
|
|
|
|
|
|
|
| 941 |
logger.warning(
|
| 942 |
"Turn stalled before LLM stage: timeout=%.2fs room=%s transcript_chars=%s transcript_preview=%r",
|
| 943 |
self._llm_stall_timeout_sec,
|
| 944 |
self._room_name,
|
| 945 |
+
len(utterance.transcript or transcript),
|
| 946 |
preview,
|
| 947 |
)
|
| 948 |
+
self._prune_resolved_user_utterances()
|
| 949 |
|
| 950 |
async def _resolve_room_id(self) -> str:
|
| 951 |
if self._room_id and self._room_id != self._room_name:
|
|
|
|
| 985 |
queue.append(value)
|
| 986 |
|
| 987 |
|
| 988 |
+
def _merge_user_transcripts(existing: str, incoming: str) -> str:
|
| 989 |
+
left = existing.strip()
|
| 990 |
+
right = incoming.strip()
|
| 991 |
+
if not left:
|
| 992 |
+
return right
|
| 993 |
+
if not right:
|
| 994 |
+
return left
|
| 995 |
+
if left == right:
|
| 996 |
+
return left
|
| 997 |
+
if right.startswith(left):
|
| 998 |
+
return right
|
| 999 |
+
if left.startswith(right):
|
| 1000 |
+
return left
|
| 1001 |
+
|
| 1002 |
+
left_words = left.split()
|
| 1003 |
+
right_words = right.split()
|
| 1004 |
+
max_overlap = min(len(left_words), len(right_words))
|
| 1005 |
+
for overlap in range(max_overlap, 0, -1):
|
| 1006 |
+
left_suffix = [word.casefold() for word in left_words[-overlap:]]
|
| 1007 |
+
right_prefix = [word.casefold() for word in right_words[:overlap]]
|
| 1008 |
+
if left_suffix == right_prefix:
|
| 1009 |
+
merged_words = [*left_words, *right_words[overlap:]]
|
| 1010 |
+
return " ".join(merged_words).strip()
|
| 1011 |
+
return f"{left} {right}".strip()
|
| 1012 |
+
|
| 1013 |
+
|
| 1014 |
def _trace_turn_has_tool_activity(trace_turn: Optional[TraceTurn]) -> bool:
|
| 1015 |
if trace_turn is None:
|
| 1016 |
return False
|
|
@@ -32,6 +32,10 @@ class TraceTurn:
|
|
| 32 |
participant_id: str
|
| 33 |
user_transcript: str
|
| 34 |
prompt_text: str
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
response_text: str = ""
|
| 36 |
assistant_text: str = ""
|
| 37 |
assistant_text_missing: bool = False
|
|
@@ -72,6 +76,9 @@ class TraceTurn:
|
|
| 72 |
tts_updated_order: Optional[int] = None
|
| 73 |
event_counter: int = 0
|
| 74 |
tool_post_response_missing: bool = False
|
|
|
|
|
|
|
|
|
|
| 75 |
trace_id: Optional[str] = None
|
| 76 |
|
| 77 |
|
|
@@ -153,6 +160,7 @@ _DEFAULT_TRACE_FINALIZE_TIMEOUT_MS = 8000.0
|
|
| 153 |
_DEFAULT_POST_TOOL_RESPONSE_TIMEOUT_MS = 30000.0
|
| 154 |
_DEFAULT_MAX_PENDING_TRACE_TASKS = 200
|
| 155 |
_DEFAULT_TRACE_FLUSH_TIMEOUT_SEC = 1.0
|
|
|
|
| 156 |
_TOOL_ERROR_FALLBACK_TEXT = "I couldn't complete that tool request. Please rephrase the query."
|
| 157 |
|
| 158 |
|
|
@@ -233,6 +241,17 @@ class TurnTracer:
|
|
| 233 |
)
|
| 234 |
/ 1000.0
|
| 235 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
# ------------------------------------------------------------------
|
| 238 |
# Session context
|
|
@@ -287,16 +306,60 @@ class TurnTracer:
|
|
| 287 |
|
| 288 |
async def create_turn(self, *, user_transcript: str, room_id: str) -> None:
|
| 289 |
async with self._trace_lock:
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
# ------------------------------------------------------------------
|
| 302 |
# Stage attachment
|
|
@@ -358,6 +421,10 @@ class TurnTracer:
|
|
| 358 |
if turn.stt_duration_ms is None:
|
| 359 |
turn.stt_duration_ms = turn.stt_total_latency_ms
|
| 360 |
turn.eou_attributes = _sanitize_component_attributes(metric_attributes)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
metric_speech_id = _normalize_optional_str(
|
| 362 |
turn.eou_attributes.get("speech_id")
|
| 363 |
)
|
|
@@ -620,6 +687,88 @@ class TurnTracer:
|
|
| 620 |
# Internal helpers
|
| 621 |
# ------------------------------------------------------------------
|
| 622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
def _next_turn_where(
|
| 624 |
self,
|
| 625 |
predicate: Callable[[TraceTurn], bool],
|
|
@@ -2005,6 +2154,11 @@ def _set_root_attributes(
|
|
| 2005 |
"langfuse.trace.metadata.stt_status": turn.stt_status,
|
| 2006 |
"langfuse.trace.metadata.tool_phase_announced": turn.tool_step_announced,
|
| 2007 |
"langfuse.trace.metadata.tool_post_response_missing": turn.tool_post_response_missing,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2008 |
"duration_ms": _total_duration_ms(turn),
|
| 2009 |
"latency_ms.user_input": vals["user_input_duration_ms"],
|
| 2010 |
"latency_ms.vad": vals["vad_duration_ms"],
|
|
@@ -2036,6 +2190,7 @@ def _set_root_attributes(
|
|
| 2036 |
"tool.phase_announced": turn.tool_step_announced,
|
| 2037 |
"tool.post_response_missing": turn.tool_post_response_missing,
|
| 2038 |
"stt_status": turn.stt_status,
|
|
|
|
| 2039 |
}
|
| 2040 |
for key, value in attrs.items():
|
| 2041 |
if value is not None:
|
|
@@ -2100,6 +2255,32 @@ def _stringify_observation(value: Any) -> str:
|
|
| 2100 |
return str(value)
|
| 2101 |
|
| 2102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2103 |
def _emit_component_span(
|
| 2104 |
_tracer: Any,
|
| 2105 |
*,
|
|
|
|
| 32 |
participant_id: str
|
| 33 |
user_transcript: str
|
| 34 |
prompt_text: str
|
| 35 |
+
created_at: float = field(default_factory=time)
|
| 36 |
+
user_turn_committed: bool = False
|
| 37 |
+
user_turn_committed_at: Optional[float] = None
|
| 38 |
+
user_transcript_updated_at: Optional[float] = None
|
| 39 |
response_text: str = ""
|
| 40 |
assistant_text: str = ""
|
| 41 |
assistant_text_missing: bool = False
|
|
|
|
| 76 |
tts_updated_order: Optional[int] = None
|
| 77 |
event_counter: int = 0
|
| 78 |
tool_post_response_missing: bool = False
|
| 79 |
+
coalesced_turn_ids: list[str] = field(default_factory=list)
|
| 80 |
+
coalesced_user_transcripts: list[str] = field(default_factory=list)
|
| 81 |
+
coalesced_fragment_count: int = 0
|
| 82 |
trace_id: Optional[str] = None
|
| 83 |
|
| 84 |
|
|
|
|
| 160 |
_DEFAULT_POST_TOOL_RESPONSE_TIMEOUT_MS = 30000.0
|
| 161 |
_DEFAULT_MAX_PENDING_TRACE_TASKS = 200
|
| 162 |
_DEFAULT_TRACE_FLUSH_TIMEOUT_SEC = 1.0
|
| 163 |
+
_DEFAULT_CONTINUATION_COALESCE_WINDOW_MS = 1500.0
|
| 164 |
_TOOL_ERROR_FALLBACK_TEXT = "I couldn't complete that tool request. Please rephrase the query."
|
| 165 |
|
| 166 |
|
|
|
|
| 241 |
)
|
| 242 |
/ 1000.0
|
| 243 |
)
|
| 244 |
+
self._continuation_coalesce_window_sec = (
|
| 245 |
+
max(
|
| 246 |
+
getattr(
|
| 247 |
+
settings.langfuse,
|
| 248 |
+
"LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
|
| 249 |
+
_DEFAULT_CONTINUATION_COALESCE_WINDOW_MS,
|
| 250 |
+
),
|
| 251 |
+
0.0,
|
| 252 |
+
)
|
| 253 |
+
/ 1000.0
|
| 254 |
+
)
|
| 255 |
|
| 256 |
# ------------------------------------------------------------------
|
| 257 |
# Session context
|
|
|
|
| 306 |
|
| 307 |
async def create_turn(self, *, user_transcript: str, room_id: str) -> None:
|
| 308 |
async with self._trace_lock:
|
| 309 |
+
normalized = user_transcript.strip()
|
| 310 |
+
if not normalized:
|
| 311 |
+
return
|
| 312 |
+
|
| 313 |
+
current_turn = self._latest_turn_where(lambda c: not c.user_turn_committed)
|
| 314 |
+
if current_turn is not None:
|
| 315 |
+
self._update_user_turn_text(current_turn, normalized)
|
| 316 |
+
return
|
| 317 |
+
|
| 318 |
+
new_turn = TraceTurn(
|
| 319 |
+
turn_id=str(uuid.uuid4()),
|
| 320 |
+
session_id=self._session_id,
|
| 321 |
+
room_id=room_id,
|
| 322 |
+
participant_id=self._participant_id,
|
| 323 |
+
user_transcript=normalized,
|
| 324 |
+
prompt_text=normalized,
|
| 325 |
+
)
|
| 326 |
+
new_turn.user_transcript_updated_at = new_turn.created_at
|
| 327 |
+
|
| 328 |
+
coalesced_turn = self._coalesced_turn_candidate()
|
| 329 |
+
if coalesced_turn is not None:
|
| 330 |
+
self._absorb_coalesced_turn_metadata(new_turn, coalesced_turn)
|
| 331 |
+
self._pending_trace_turns.remove(coalesced_turn)
|
| 332 |
+
self._cancel_finalize_timeout(coalesced_turn.turn_id)
|
| 333 |
+
|
| 334 |
+
self._pending_trace_turns.append(new_turn)
|
| 335 |
+
|
| 336 |
+
async def attach_user_text(
|
| 337 |
+
self,
|
| 338 |
+
user_transcript: str,
|
| 339 |
+
*,
|
| 340 |
+
event_created_at: Optional[float] = None,
|
| 341 |
+
) -> Optional[TraceTurn]:
|
| 342 |
+
async with self._trace_lock:
|
| 343 |
+
turn = self._latest_turn_where(lambda c: not c.assistant_text.strip())
|
| 344 |
+
if turn is None:
|
| 345 |
+
turn = self._latest_turn_where(lambda _: True)
|
| 346 |
+
if turn is None:
|
| 347 |
+
return None
|
| 348 |
+
|
| 349 |
+
normalized = user_transcript.strip()
|
| 350 |
+
if not normalized:
|
| 351 |
+
return turn
|
| 352 |
+
|
| 353 |
+
self._update_user_turn_text(
|
| 354 |
+
turn,
|
| 355 |
+
normalized,
|
| 356 |
+
event_created_at=event_created_at,
|
| 357 |
)
|
| 358 |
+
turn.user_turn_committed = True
|
| 359 |
+
turn.user_turn_committed_at = _resolved_event_timestamp(
|
| 360 |
+
_to_optional_float(event_created_at)
|
| 361 |
+
)
|
| 362 |
+
return turn
|
| 363 |
|
| 364 |
# ------------------------------------------------------------------
|
| 365 |
# Stage attachment
|
|
|
|
| 421 |
if turn.stt_duration_ms is None:
|
| 422 |
turn.stt_duration_ms = turn.stt_total_latency_ms
|
| 423 |
turn.eou_attributes = _sanitize_component_attributes(metric_attributes)
|
| 424 |
+
turn.user_turn_committed = True
|
| 425 |
+
turn.user_turn_committed_at = _resolved_event_timestamp(
|
| 426 |
+
_to_optional_float(turn.eou_attributes.get("timestamp"))
|
| 427 |
+
)
|
| 428 |
metric_speech_id = _normalize_optional_str(
|
| 429 |
turn.eou_attributes.get("speech_id")
|
| 430 |
)
|
|
|
|
| 687 |
# Internal helpers
|
| 688 |
# ------------------------------------------------------------------
|
| 689 |
|
| 690 |
+
def _update_user_turn_text(
|
| 691 |
+
self,
|
| 692 |
+
turn: TraceTurn,
|
| 693 |
+
user_transcript: str,
|
| 694 |
+
*,
|
| 695 |
+
event_created_at: Optional[float] = None,
|
| 696 |
+
) -> None:
|
| 697 |
+
normalized = user_transcript.strip()
|
| 698 |
+
if not normalized:
|
| 699 |
+
return
|
| 700 |
+
merged = _merge_user_transcripts(turn.user_transcript, normalized)
|
| 701 |
+
turn.user_transcript = merged
|
| 702 |
+
turn.prompt_text = merged
|
| 703 |
+
turn.user_transcript_updated_at = _resolved_event_timestamp(
|
| 704 |
+
_to_optional_float(event_created_at)
|
| 705 |
+
)
|
| 706 |
+
|
| 707 |
+
def _coalesced_turn_candidate(self) -> Optional[TraceTurn]:
    """Return the most recent pending turn a continuation may merge into.

    Yields ``None`` when coalescing is disabled (window <= 0), when no
    pending turn is eligible, or when the newest eligible turn went quiet
    longer ago than the coalesce window.
    """
    window = self._continuation_coalesce_window_sec
    if window <= 0.0:
        return None

    now = time()
    for candidate in reversed(self._pending_trace_turns):
        if not self._can_coalesce_turn(candidate):
            continue
        last_activity = self._turn_recent_activity_at(candidate)
        if last_activity is None:
            continue
        # Only the newest eligible turn matters: if it is stale, treat the
        # incoming speech as a brand-new turn rather than scanning older ones.
        is_fresh = now - last_activity <= window
        return candidate if is_fresh else None
    return None
|
| 722 |
+
|
| 723 |
+
def _can_coalesce_turn(self, turn: TraceTurn) -> bool:
    """Whether *turn* looks like an aborted fragment safe to absorb.

    Eligible turns have a committed, non-empty user transcript, no visible
    assistant output, no tool activity of any kind, and both LLM and TTS
    stages recorded (i.e. a response was started but never surfaced).
    """
    has_visible_reply = bool(
        turn.assistant_text.strip() or turn.response_text.strip()
    )
    has_tool_activity = bool(
        turn.tool_step_announced
        or turn.tool_executions
        or turn.last_tool_event_order is not None
    )
    return (
        turn.user_turn_committed
        and bool(turn.user_transcript.strip())
        and not has_visible_reply
        and not has_tool_activity
        and bool(turn.llm_calls and turn.tts_calls)
    )
|
| 733 |
+
|
| 734 |
+
def _turn_recent_activity_at(self, turn: TraceTurn) -> Optional[float]:
    """Return the newest known activity timestamp for *turn*, or ``None``.

    Considers assistant/TTS updates, tool events, user-turn commit, the
    last transcript update, and turn creation time; skips unset values.
    """
    return max(
        (
            stamp
            for stamp in (
                turn.assistant_text_updated_at,
                turn.tts_updated_at,
                turn.last_tool_completed_at,
                turn.last_tool_event_at,
                turn.user_turn_committed_at,
                turn.user_transcript_updated_at,
                turn.created_at,
            )
            if stamp is not None
        ),
        default=None,
    )
|
| 748 |
+
|
| 749 |
+
def _absorb_coalesced_turn_metadata(
    self,
    new_turn: TraceTurn,
    absorbed_turn: TraceTurn,
) -> None:
    """Fold an absorbed (aborted) turn's transcript and lineage into *new_turn*.

    The absorbed turn's transcript is prepended to the new turn's input,
    and the coalescing bookkeeping (turn ids, raw fragments, fragment
    count) is carried forward so the emitted trace can report the merge.
    """
    merged_input = _merge_user_transcripts(
        absorbed_turn.user_transcript,
        new_turn.user_transcript,
    )
    new_turn.user_transcript = merged_input
    new_turn.prompt_text = merged_input

    # Carry forward lineage: ids of every turn merged into this one.
    lineage = list(absorbed_turn.coalesced_turn_ids)
    lineage.append(absorbed_turn.turn_id)
    new_turn.coalesced_turn_ids = lineage

    # Keep the raw fragments so the original utterances remain inspectable.
    fragments = list(absorbed_turn.coalesced_user_transcripts)
    fragments.append(absorbed_turn.user_transcript)
    new_turn.coalesced_user_transcripts = fragments

    new_turn.coalesced_fragment_count = absorbed_turn.coalesced_fragment_count + 1
|
| 771 |
+
|
| 772 |
def _next_turn_where(
|
| 773 |
self,
|
| 774 |
predicate: Callable[[TraceTurn], bool],
|
|
|
|
| 2154 |
"langfuse.trace.metadata.stt_status": turn.stt_status,
|
| 2155 |
"langfuse.trace.metadata.tool_phase_announced": turn.tool_step_announced,
|
| 2156 |
"langfuse.trace.metadata.tool_post_response_missing": turn.tool_post_response_missing,
|
| 2157 |
+
"langfuse.trace.metadata.user_turn_committed": turn.user_turn_committed,
|
| 2158 |
+
"langfuse.trace.metadata.coalesced_turn_count": len(turn.coalesced_turn_ids),
|
| 2159 |
+
"langfuse.trace.metadata.coalesced_fragment_count": turn.coalesced_fragment_count,
|
| 2160 |
+
"langfuse.trace.metadata.coalesced_turn_ids": turn.coalesced_turn_ids,
|
| 2161 |
+
"langfuse.trace.metadata.coalesced_inputs": turn.coalesced_user_transcripts,
|
| 2162 |
"duration_ms": _total_duration_ms(turn),
|
| 2163 |
"latency_ms.user_input": vals["user_input_duration_ms"],
|
| 2164 |
"latency_ms.vad": vals["vad_duration_ms"],
|
|
|
|
| 2190 |
"tool.phase_announced": turn.tool_step_announced,
|
| 2191 |
"tool.post_response_missing": turn.tool_post_response_missing,
|
| 2192 |
"stt_status": turn.stt_status,
|
| 2193 |
+
"user_turn.committed": turn.user_turn_committed,
|
| 2194 |
}
|
| 2195 |
for key, value in attrs.items():
|
| 2196 |
if value is not None:
|
|
|
|
| 2255 |
return str(value)
|
| 2256 |
|
| 2257 |
|
| 2258 |
+
def _merge_user_transcripts(existing: str, incoming: str) -> str:
|
| 2259 |
+
left = existing.strip()
|
| 2260 |
+
right = incoming.strip()
|
| 2261 |
+
if not left:
|
| 2262 |
+
return right
|
| 2263 |
+
if not right:
|
| 2264 |
+
return left
|
| 2265 |
+
if left == right:
|
| 2266 |
+
return left
|
| 2267 |
+
if right.startswith(left):
|
| 2268 |
+
return right
|
| 2269 |
+
if left.startswith(right):
|
| 2270 |
+
return left
|
| 2271 |
+
|
| 2272 |
+
left_words = left.split()
|
| 2273 |
+
right_words = right.split()
|
| 2274 |
+
max_overlap = min(len(left_words), len(right_words))
|
| 2275 |
+
for overlap in range(max_overlap, 0, -1):
|
| 2276 |
+
left_suffix = [word.casefold() for word in left_words[-overlap:]]
|
| 2277 |
+
right_prefix = [word.casefold() for word in right_words[:overlap]]
|
| 2278 |
+
if left_suffix == right_prefix:
|
| 2279 |
+
merged_words = [*left_words, *right_words[overlap:]]
|
| 2280 |
+
return " ".join(merged_words).strip()
|
| 2281 |
+
return f"{left} {right}".strip()
|
| 2282 |
+
|
| 2283 |
+
|
| 2284 |
def _emit_component_span(
|
| 2285 |
_tracer: Any,
|
| 2286 |
*,
|
|
@@ -60,7 +60,7 @@ class CoreSettings(BaseSettings):
|
|
| 60 |
|
| 61 |
class VoiceSettings(CoreSettings):
|
| 62 |
TTS_PROVIDER: str = Field(
|
| 63 |
-
default="
|
| 64 |
description="TTS provider: 'pocket', 'deepgram', or 'nvidia'",
|
| 65 |
)
|
| 66 |
DEEPGRAM_API_KEY: Optional[str] = Field(
|
|
@@ -129,7 +129,7 @@ class VoiceSettings(CoreSettings):
|
|
| 129 |
description="Number of audio input channels (1=mono)",
|
| 130 |
)
|
| 131 |
LIVEKIT_FRAME_SIZE_MS: int = Field(
|
| 132 |
-
default=
|
| 133 |
ge=10,
|
| 134 |
le=100,
|
| 135 |
description="Audio frame size in milliseconds (smaller = faster VAD response)",
|
|
@@ -153,31 +153,34 @@ class VoiceSettings(CoreSettings):
|
|
| 153 |
description="Minimum speech duration (seconds) before VAD activation",
|
| 154 |
)
|
| 155 |
VAD_MIN_SILENCE_DURATION: float = Field(
|
| 156 |
-
default=0.
|
| 157 |
ge=0.1,
|
| 158 |
le=2.0,
|
| 159 |
description="Minimum silence duration (seconds) before VAD deactivation",
|
| 160 |
)
|
| 161 |
VAD_THRESHOLD: float = Field(
|
| 162 |
-
default=0.
|
| 163 |
ge=0.0,
|
| 164 |
le=1.0,
|
| 165 |
description="VAD activation threshold (higher = less sensitive, 0.5 is Silero default)",
|
| 166 |
)
|
| 167 |
MIN_ENDPOINTING_DELAY: float = Field(
|
| 168 |
-
default=0.
|
| 169 |
ge=0.0,
|
| 170 |
le=10.0,
|
| 171 |
-
description=
|
|
|
|
|
|
|
|
|
|
| 172 |
)
|
| 173 |
MAX_ENDPOINTING_DELAY: float = Field(
|
| 174 |
-
default=
|
| 175 |
ge=0.1,
|
| 176 |
le=10.0,
|
| 177 |
description="Maximum endpointing delay (seconds) when turn detector expects continuation",
|
| 178 |
)
|
| 179 |
PREEMPTIVE_GENERATION: bool = Field(
|
| 180 |
-
default=
|
| 181 |
description="Enable speculative LLM/TTS generation before final turn commit",
|
| 182 |
)
|
| 183 |
|
|
@@ -280,7 +283,7 @@ class LLMSettings(CoreSettings):
|
|
| 280 |
),
|
| 281 |
)
|
| 282 |
OLLAMA_MODEL: str = Field(
|
| 283 |
-
default= "ministral-3:14b-cloud", #"ministral-3:8b-cloud", #"qwen3-coder-next",#minimax-m2.5 #"ministral-3:8b", #"qwen2.5:7b" #"qwen3:8b" #"qwen3.5:4b",
|
| 284 |
description="Ollama model tag",
|
| 285 |
)
|
| 286 |
OLLAMA_API_KEY: Optional[str] = Field(
|
|
@@ -290,7 +293,7 @@ class LLMSettings(CoreSettings):
|
|
| 290 |
|
| 291 |
# Common LLM parameters
|
| 292 |
LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
|
| 293 |
-
LLM_MAX_TOKENS: int = Field(default=
|
| 294 |
LLM_CONN_TIMEOUT_SEC: float = Field(
|
| 295 |
default=20.0,
|
| 296 |
gt=0.0,
|
|
@@ -349,7 +352,7 @@ class LiveKitSettings(CoreSettings):
|
|
| 349 |
LIVEKIT_API_KEY: Optional[str] = Field(default=None)
|
| 350 |
LIVEKIT_API_SECRET: Optional[str] = Field(default=None)
|
| 351 |
LIVEKIT_AGENT_NAME: str = Field(default="open-voice-agent")
|
| 352 |
-
LIVEKIT_NUM_IDLE_PROCESSES: int = Field(default=
|
| 353 |
LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC: float = Field(
|
| 354 |
default=20.0,
|
| 355 |
gt=0.0,
|
|
@@ -413,6 +416,15 @@ class LangfuseSettings(CoreSettings):
|
|
| 413 |
le=10000.0,
|
| 414 |
description="Best-effort tracer flush timeout in milliseconds",
|
| 415 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
|
| 418 |
class Settings(CoreSettings):
|
|
|
|
| 60 |
|
| 61 |
class VoiceSettings(CoreSettings):
|
| 62 |
TTS_PROVIDER: str = Field(
|
| 63 |
+
default="pocket",
|
| 64 |
description="TTS provider: 'pocket', 'deepgram', or 'nvidia'",
|
| 65 |
)
|
| 66 |
DEEPGRAM_API_KEY: Optional[str] = Field(
|
|
|
|
| 129 |
description="Number of audio input channels (1=mono)",
|
| 130 |
)
|
| 131 |
LIVEKIT_FRAME_SIZE_MS: int = Field(
|
| 132 |
+
default=60,
|
| 133 |
ge=10,
|
| 134 |
le=100,
|
| 135 |
description="Audio frame size in milliseconds (smaller = faster VAD response)",
|
|
|
|
| 153 |
description="Minimum speech duration (seconds) before VAD activation",
|
| 154 |
)
|
| 155 |
VAD_MIN_SILENCE_DURATION: float = Field(
|
| 156 |
+
default=0.55,
|
| 157 |
ge=0.1,
|
| 158 |
le=2.0,
|
| 159 |
description="Minimum silence duration (seconds) before VAD deactivation",
|
| 160 |
)
|
| 161 |
VAD_THRESHOLD: float = Field(
|
| 162 |
+
default=0.5,
|
| 163 |
ge=0.0,
|
| 164 |
le=1.0,
|
| 165 |
description="VAD activation threshold (higher = less sensitive, 0.5 is Silero default)",
|
| 166 |
)
|
| 167 |
MIN_ENDPOINTING_DELAY: float = Field(
|
| 168 |
+
default=0.5,
|
| 169 |
ge=0.0,
|
| 170 |
le=10.0,
|
| 171 |
+
description=(
|
| 172 |
+
"Minimum endpointing delay (seconds) before committing user turn; "
|
| 173 |
+
"slightly higher values reduce false turn splits"
|
| 174 |
+
),
|
| 175 |
)
|
| 176 |
MAX_ENDPOINTING_DELAY: float = Field(
|
| 177 |
+
default=3.0,
|
| 178 |
ge=0.1,
|
| 179 |
le=10.0,
|
| 180 |
description="Maximum endpointing delay (seconds) when turn detector expects continuation",
|
| 181 |
)
|
| 182 |
PREEMPTIVE_GENERATION: bool = Field(
|
| 183 |
+
default=False,
|
| 184 |
description="Enable speculative LLM/TTS generation before final turn commit",
|
| 185 |
)
|
| 186 |
|
|
|
|
| 283 |
),
|
| 284 |
)
|
| 285 |
OLLAMA_MODEL: str = Field(
|
| 286 |
+
default= "ministral-3:14b", #"ministral-3:14b-cloud", #"ministral-3:8b-cloud", #"qwen3-coder-next",#minimax-m2.5 #"ministral-3:8b", #"qwen2.5:7b" #"qwen3:8b" #"qwen3.5:4b",
|
| 287 |
description="Ollama model tag",
|
| 288 |
)
|
| 289 |
OLLAMA_API_KEY: Optional[str] = Field(
|
|
|
|
| 293 |
|
| 294 |
# Common LLM parameters
|
| 295 |
LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
|
| 296 |
+
LLM_MAX_TOKENS: int = Field(default=256, gt=0)
|
| 297 |
LLM_CONN_TIMEOUT_SEC: float = Field(
|
| 298 |
default=20.0,
|
| 299 |
gt=0.0,
|
|
|
|
| 352 |
LIVEKIT_API_KEY: Optional[str] = Field(default=None)
|
| 353 |
LIVEKIT_API_SECRET: Optional[str] = Field(default=None)
|
| 354 |
LIVEKIT_AGENT_NAME: str = Field(default="open-voice-agent")
|
| 355 |
+
LIVEKIT_NUM_IDLE_PROCESSES: int = Field(default=0, ge=0)
|
| 356 |
LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC: float = Field(
|
| 357 |
default=20.0,
|
| 358 |
gt=0.0,
|
|
|
|
| 416 |
le=10000.0,
|
| 417 |
description="Best-effort tracer flush timeout in milliseconds",
|
| 418 |
)
|
| 419 |
+
LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS: float = Field(
|
| 420 |
+
default=1500.0,
|
| 421 |
+
ge=0.0,
|
| 422 |
+
le=10000.0,
|
| 423 |
+
description=(
|
| 424 |
+
"Window to merge an immediately-following continuation into a prior aborted "
|
| 425 |
+
"turn trace; set to 0 to disable"
|
| 426 |
+
),
|
| 427 |
+
)
|
| 428 |
|
| 429 |
|
| 430 |
class Settings(CoreSettings):
|
|
@@ -1676,6 +1676,328 @@ def test_creates_new_trace_for_each_finalized_transcript(
|
|
| 1676 |
assert turn_spans[0].trace_id != turn_spans[1].trace_id
|
| 1677 |
|
| 1678 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1679 |
def test_trace_emits_without_stt_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
|
| 1680 |
import src.agent.traces.metrics_collector as metrics_collector_module
|
| 1681 |
|
|
|
|
| 1676 |
assert turn_spans[0].trace_id != turn_spans[1].trace_id
|
| 1677 |
|
| 1678 |
|
| 1679 |
+
def test_multiple_final_transcripts_are_merged_into_one_turn(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Two back-to-back final STT transcripts should land in a single turn.

    Verifies the pending-utterance merge path: the merged text shows up as
    the trace input and in the STT span, with no trace-level coalescing
    (coalesced_turn_count stays 0 — the merge happened pre-turn).
    """
    import src.agent.traces.metrics_collector as metrics_collector_module

    fake_tracer = _FakeTracer()
    monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)

    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=True,
    )

    async def _run() -> None:
        await collector.on_session_metadata(
            session_id="session-merged-finals",
            participant_id="web-123",
        )
        # Two finals before any downstream metrics: must merge into one utterance.
        await collector.on_user_input_transcribed("What", is_final=True)
        await collector.on_user_input_transcribed(
            "the difference between speech to text and speech recognition?",
            is_final=True,
        )
        await collector.on_metrics_collected(_make_stt_metrics("stt-merged"))
        await collector.on_metrics_collected(
            _make_eou_metrics("speech-merged", delay=0.9, transcription_delay=0.2)
        )
        await collector.on_conversation_item_added(
            role="user",
            content="What the difference between speech to text and speech recognition?",
        )
        await collector.on_metrics_collected(_make_llm_metrics("speech-merged"))
        await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
        await collector.on_metrics_collected(_make_tts_metrics("speech-merged"))
        await collector.wait_for_pending_trace_tasks()

    asyncio.run(_run())

    turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
    assert len(turn_spans) == 1
    root = turn_spans[0]
    stt_span = next(span for span in fake_tracer.spans if span.name == "STTMetrics")
    assert (
        root.attributes["langfuse.trace.input"]
        == "What the difference between speech to text and speech recognition?"
    )
    assert (
        stt_span.attributes["user_transcript"]
        == "What the difference between speech to text and speech recognition?"
    )
    # Pre-turn merging is not trace coalescing, so the counter stays 0.
    assert root.attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
|
| 1735 |
+
|
| 1736 |
+
|
| 1737 |
+
def test_immediate_continuation_coalesces_aborted_prior_turn(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A quick continuation after an aborted reply merges into one trace.

    The first fragment ("What") reaches LLM+TTS but never surfaces an
    assistant message; the follow-up arriving inside the coalesce window
    must absorb it, producing one turn with coalescing metadata set.
    """
    import src.agent.traces.metrics_collector as metrics_collector_module

    fake_tracer = _FakeTracer()
    monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
    monkeypatch.setattr(
        metrics_collector_module.settings.langfuse,
        "LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
        1500.0,
    )

    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=True,
    )

    async def _run() -> None:
        await collector.on_session_metadata(
            session_id="session-coalesce",
            participant_id="web-123",
        )
        # Aborted first fragment: LLM and TTS fire, but no assistant item.
        await collector.on_user_input_transcribed("What", is_final=True)
        await collector.on_metrics_collected(
            _make_eou_metrics("speech-a", delay=0.7, transcription_delay=0.2)
        )
        await collector.on_metrics_collected(_make_llm_metrics("speech-a"))
        await collector.on_metrics_collected(_make_tts_metrics("speech-a"))

        # Immediate continuation: should be merged into the aborted turn.
        await collector.on_user_input_transcribed(
            "the difference between speech to text and speech recognition?",
            is_final=True,
        )
        await collector.on_metrics_collected(
            _make_eou_metrics("speech-b", delay=0.7, transcription_delay=0.2)
        )
        await collector.on_conversation_item_added(
            role="user",
            content="What the difference between speech to text and speech recognition?",
        )
        await collector.on_metrics_collected(_make_llm_metrics("speech-b"))
        await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
        await collector.on_metrics_collected(_make_tts_metrics("speech-b"))
        await collector.wait_for_pending_trace_tasks()

    asyncio.run(_run())

    turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
    assert len(turn_spans) == 1
    root = turn_spans[0]
    assert (
        root.attributes["langfuse.trace.input"]
        == "What the difference between speech to text and speech recognition?"
    )
    # One absorbed turn, one raw fragment, and the fragment text preserved.
    assert root.attributes["langfuse.trace.metadata.coalesced_turn_count"] == 1
    assert root.attributes["langfuse.trace.metadata.coalesced_fragment_count"] == 1
    assert root.attributes["langfuse.trace.metadata.coalesced_inputs"] == ["What"]
|
| 1800 |
+
|
| 1801 |
+
|
| 1802 |
+
def test_visible_assistant_reply_prevents_continuation_coalescing(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Turns that produced a visible assistant reply must not be absorbed.

    Both turns complete normally (assistant item emitted), so the second
    user utterance starts a fresh trace: two turn spans, zero coalescing.
    """
    import src.agent.traces.metrics_collector as metrics_collector_module

    fake_tracer = _FakeTracer()
    monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)

    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=True,
    )

    async def _run() -> None:
        await collector.on_session_metadata(
            session_id="session-no-coalesce-visible-reply",
            participant_id="web-123",
        )
        # First turn: full pipeline including a visible assistant reply.
        await collector.on_user_input_transcribed("first turn", is_final=True)
        await collector.on_metrics_collected(_make_stt_metrics("stt-first-visible"))
        await collector.on_metrics_collected(_make_eou_metrics("speech-first-visible"))
        await collector.on_metrics_collected(_make_llm_metrics("speech-first-visible"))
        await collector.on_conversation_item_added(role="assistant", content="first reply")
        await collector.on_metrics_collected(_make_tts_metrics("speech-first-visible"))

        # Second turn follows immediately but must remain separate.
        await collector.on_user_input_transcribed("second turn", is_final=True)
        await collector.on_metrics_collected(_make_stt_metrics("stt-second-visible"))
        await collector.on_metrics_collected(_make_eou_metrics("speech-second-visible"))
        await collector.on_metrics_collected(_make_llm_metrics("speech-second-visible"))
        await collector.on_conversation_item_added(role="assistant", content="second reply")
        await collector.on_metrics_collected(_make_tts_metrics("speech-second-visible"))
        await collector.wait_for_pending_trace_tasks()

    asyncio.run(_run())

    turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
    assert len(turn_spans) == 2
    assert turn_spans[0].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
    assert turn_spans[1].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
|
| 1846 |
+
|
| 1847 |
+
|
| 1848 |
+
def test_tool_activity_prevents_continuation_coalescing(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A prior turn with tool activity is never absorbed by a follow-up.

    The first turn runs a tool (step announced + function executed) and has
    no visible reply; despite that, the follow-up utterance must open its
    own trace, giving two turn spans with no coalescing on the second.
    """
    import src.agent.traces.metrics_collector as metrics_collector_module

    fake_tracer = _FakeTracer()
    monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)

    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=True,
    )
    # Tighten trace timeouts so the tool-bearing turn finalizes quickly.
    collector._trace_finalize_timeout_sec = 0.05
    collector._trace_post_tool_response_timeout_sec = 0.05

    async def _run() -> None:
        await collector.on_session_metadata(
            session_id="session-no-coalesce-tools",
            participant_id="web-123",
        )
        await collector.on_user_input_transcribed("run tool", is_final=True)
        await collector.on_metrics_collected(_make_eou_metrics("speech-tool-a"))
        await collector.on_metrics_collected(_make_llm_metrics("speech-tool-a"))
        await collector.on_metrics_collected(_make_tts_metrics("speech-tool-a"))
        await collector.on_tool_step_started()
        await collector.on_function_tools_executed(
            function_calls=[
                _FakeFunctionCall(
                    name="search_web",
                    call_id="tool-a",
                    arguments='{"q":"speech models"}',
                    created_at=1.0,
                )
            ],
            function_call_outputs=[
                _FakeFunctionCallOutput(
                    output='{"results":[]}',
                    is_error=False,
                    created_at=1.1,
                )
            ],
            created_at=1.1,
        )

        await collector.on_user_input_transcribed("follow-up turn", is_final=True)
        await collector.on_metrics_collected(_make_eou_metrics("speech-tool-b"))
        await collector.on_metrics_collected(_make_llm_metrics("speech-tool-b"))
        await collector.on_conversation_item_added(role="assistant", content="second reply")
        await collector.on_metrics_collected(_make_tts_metrics("speech-tool-b"))
        # Let the shortened post-tool timeout elapse before draining tasks.
        await asyncio.sleep(0.08)
        await collector.wait_for_pending_trace_tasks()

    asyncio.run(_run())

    turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
    assert len(turn_spans) == 2
    assert turn_spans[1].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
|
| 1910 |
+
|
| 1911 |
+
|
| 1912 |
+
def test_multiple_final_transcripts_share_one_llm_stall_watchdog() -> None:
    """Merged final transcripts keep a single pending utterance and watchdog.

    Two final transcripts should collapse to one ``PendingUserUtterance``
    (with concatenated text) and spawn exactly one LLM-stall task, which is
    cleaned up once LLM metrics arrive.
    """
    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=False,
    )

    async def _run() -> None:
        await collector.on_user_input_transcribed("Search for the most popular", is_final=True)
        await collector.on_user_input_transcribed("test to speech model.", is_final=True)
        assert len(collector._pending_user_utterances) == 1
        assert len(collector._llm_stall_tasks) == 1
        assert (
            collector._pending_user_utterances[0].transcript
            == "Search for the most popular test to speech model."
        )

        # LLM metrics arriving cancels the stall watchdog.
        await collector.on_metrics_collected(_make_llm_metrics("speech-watchdog"))
        await asyncio.sleep(0)

    asyncio.run(_run())

    assert not collector._llm_stall_tasks
|
| 1939 |
+
|
| 1940 |
+
|
| 1941 |
+
def test_continuation_coalescing_can_be_disabled(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Setting the coalesce window to 0 disables turn merging entirely.

    With LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS=0 the aborted first
    fragment and the follow-up must each emit their own turn span with
    their own (unmerged) trace inputs.
    """
    import src.agent.traces.metrics_collector as metrics_collector_module

    fake_tracer = _FakeTracer()
    monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
    monkeypatch.setattr(
        metrics_collector_module.settings.langfuse,
        "LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
        0.0,
    )

    room = _FakeRoom()
    collector = MetricsCollector(
        room=room,  # type: ignore[arg-type]
        model_name="moonshine",
        room_name=room.name,
        room_id="RM123",
        participant_id="web-123",
        langfuse_enabled=True,
    )
    # Short finalize timeout so the aborted first turn flushes on its own.
    collector._trace_finalize_timeout_sec = 0.01

    async def _run() -> None:
        await collector.on_session_metadata(
            session_id="session-no-coalesce-disabled",
            participant_id="web-123",
        )
        await collector.on_user_input_transcribed("What", is_final=True)
        await collector.on_metrics_collected(
            _make_eou_metrics("speech-disabled-a", delay=0.7, transcription_delay=0.2)
        )
        await collector.on_metrics_collected(_make_llm_metrics("speech-disabled-a"))
        await collector.on_metrics_collected(_make_tts_metrics("speech-disabled-a"))
        # Give the finalize timeout a chance to fire before the follow-up.
        await asyncio.sleep(0.03)

        await collector.on_user_input_transcribed(
            "the difference between speech to text and speech recognition?",
            is_final=True,
        )
        await collector.on_metrics_collected(
            _make_eou_metrics("speech-disabled-b", delay=0.7, transcription_delay=0.2)
        )
        await collector.on_metrics_collected(_make_llm_metrics("speech-disabled-b"))
        await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
        await collector.on_metrics_collected(_make_tts_metrics("speech-disabled-b"))
        await collector.wait_for_pending_trace_tasks()

    asyncio.run(_run())

    turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
    assert len(turn_spans) == 2
    assert turn_spans[0].attributes["langfuse.trace.input"] == "What"
    assert (
        turn_spans[1].attributes["langfuse.trace.input"]
        == "the difference between speech to text and speech recognition?"
    )
|
| 1999 |
+
|
| 2000 |
+
|
| 2001 |
def test_trace_emits_without_stt_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
|
| 2002 |
import src.agent.traces.metrics_collector as metrics_collector_module
|
| 2003 |
|
|
@@ -3,7 +3,14 @@ from __future__ import annotations
|
|
| 3 |
import pytest
|
| 4 |
from pydantic import ValidationError
|
| 5 |
|
| 6 |
-
from src.core.settings import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def test_llm_runtime_tuning_defaults_are_declared() -> None:
|
|
@@ -15,7 +22,7 @@ def test_llm_runtime_tuning_defaults_are_declared() -> None:
|
|
| 15 |
assert fields["MCP_SERVER_URL"].default == "https://huggingface.co/mcp"
|
| 16 |
assert fields["MCP_EXTRA_SERVER_URLS"].default == "https://docs.livekit.io/mcp"
|
| 17 |
assert fields["OLLAMA_CLOUD_MODE"].default is True
|
| 18 |
-
assert fields["OLLAMA_MODEL"].default == "ministral-3:14b
|
| 19 |
assert fields["OLLAMA_API_KEY"].default == "ollama"
|
| 20 |
assert fields["LLM_CONN_TIMEOUT_SEC"].default == 20.0
|
| 21 |
assert fields["LLM_CONN_MAX_RETRY"].default == 1
|
|
@@ -28,7 +35,7 @@ def test_llm_runtime_tuning_defaults_are_declared() -> None:
|
|
| 28 |
def test_livekit_runtime_tuning_defaults_are_declared() -> None:
|
| 29 |
fields = LiveKitSettings.model_fields
|
| 30 |
|
| 31 |
-
assert fields["LIVEKIT_NUM_IDLE_PROCESSES"].default ==
|
| 32 |
assert fields["LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC"].default == 20.0
|
| 33 |
assert fields["LIVEKIT_JOB_MEMORY_WARN_MB"].default == 6144
|
| 34 |
|
|
@@ -36,16 +43,25 @@ def test_livekit_runtime_tuning_defaults_are_declared() -> None:
|
|
| 36 |
def test_voice_runtime_tuning_defaults_are_declared() -> None:
|
| 37 |
fields = VoiceSettings.model_fields
|
| 38 |
|
| 39 |
-
assert fields["TTS_PROVIDER"].default == "
|
| 40 |
assert fields["NVIDIA_TTS_VOICE"].default == "Magpie-Multilingual.EN-US.Leo"
|
| 41 |
assert fields["NVIDIA_TTS_USE_SSL"].default is True
|
| 42 |
assert fields["POCKET_TTS_CONN_TIMEOUT_SEC"].default == 45.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def test_stt_runtime_tuning_defaults_are_declared() -> None:
|
| 46 |
fields = STTSettings.model_fields
|
| 47 |
|
| 48 |
-
assert fields["STT_PROVIDER"].default == "
|
| 49 |
assert fields["DEEPGRAM_STT_MODEL"].default == "nova-3"
|
| 50 |
assert fields["DEEPGRAM_STT_LANGUAGE"].default == "en-US"
|
| 51 |
|
|
|
|
| 3 |
import pytest
|
| 4 |
from pydantic import ValidationError
|
| 5 |
|
| 6 |
+
from src.core.settings import (
|
| 7 |
+
LLMSettings,
|
| 8 |
+
LangfuseSettings,
|
| 9 |
+
LiveKitSettings,
|
| 10 |
+
STTSettings,
|
| 11 |
+
Settings,
|
| 12 |
+
VoiceSettings,
|
| 13 |
+
)
|
| 14 |
|
| 15 |
|
| 16 |
def test_llm_runtime_tuning_defaults_are_declared() -> None:
|
|
|
|
| 22 |
assert fields["MCP_SERVER_URL"].default == "https://huggingface.co/mcp"
|
| 23 |
assert fields["MCP_EXTRA_SERVER_URLS"].default == "https://docs.livekit.io/mcp"
|
| 24 |
assert fields["OLLAMA_CLOUD_MODE"].default is True
|
| 25 |
+
assert fields["OLLAMA_MODEL"].default == "ministral-3:14b"
|
| 26 |
assert fields["OLLAMA_API_KEY"].default == "ollama"
|
| 27 |
assert fields["LLM_CONN_TIMEOUT_SEC"].default == 20.0
|
| 28 |
assert fields["LLM_CONN_MAX_RETRY"].default == 1
|
|
|
|
| 35 |
def test_livekit_runtime_tuning_defaults_are_declared() -> None:
|
| 36 |
fields = LiveKitSettings.model_fields
|
| 37 |
|
| 38 |
+
assert fields["LIVEKIT_NUM_IDLE_PROCESSES"].default == 0
|
| 39 |
assert fields["LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC"].default == 20.0
|
| 40 |
assert fields["LIVEKIT_JOB_MEMORY_WARN_MB"].default == 6144
|
| 41 |
|
|
|
|
| 43 |
def test_voice_runtime_tuning_defaults_are_declared() -> None:
|
| 44 |
fields = VoiceSettings.model_fields
|
| 45 |
|
| 46 |
+
assert fields["TTS_PROVIDER"].default == "pocket"
|
| 47 |
assert fields["NVIDIA_TTS_VOICE"].default == "Magpie-Multilingual.EN-US.Leo"
|
| 48 |
assert fields["NVIDIA_TTS_USE_SSL"].default is True
|
| 49 |
assert fields["POCKET_TTS_CONN_TIMEOUT_SEC"].default == 45.0
|
| 50 |
+
assert fields["MIN_ENDPOINTING_DELAY"].default == 0.8
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_langfuse_runtime_tuning_defaults_are_declared() -> None:
|
| 54 |
+
fields = LangfuseSettings.model_fields
|
| 55 |
+
|
| 56 |
+
assert fields["LANGFUSE_TRACE_FINALIZE_TIMEOUT_MS"].default == 8000.0
|
| 57 |
+
assert fields["LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS"].default == 30000.0
|
| 58 |
+
assert fields["LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS"].default == 1500.0
|
| 59 |
|
| 60 |
|
| 61 |
def test_stt_runtime_tuning_defaults_are_declared() -> None:
|
| 62 |
fields = STTSettings.model_fields
|
| 63 |
|
| 64 |
+
assert fields["STT_PROVIDER"].default == "deepgram"
|
| 65 |
assert fields["DEEPGRAM_STT_MODEL"].default == "nova-3"
|
| 66 |
assert fields["DEEPGRAM_STT_LANGUAGE"].default == "en-US"
|
| 67 |
|
|
@@ -2,11 +2,15 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
import types
|
|
|
|
| 5 |
|
| 6 |
from livekit.agents.inference_runner import _InferenceRunner
|
| 7 |
|
| 8 |
from src.agent.runtime import session as runtime_session
|
| 9 |
-
from src.core.settings import settings
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class _FakeJobContext:
|
|
@@ -70,8 +74,8 @@ def test_build_server_uses_livekit_process_initialization_settings(monkeypatch)
|
|
| 70 |
assert server._job_memory_warn_mb == 8192.0
|
| 71 |
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
assert "
|
| 75 |
|
| 76 |
|
| 77 |
def test_resolve_stt_metrics_model_name_uses_deepgram_model(monkeypatch) -> None:
|
|
@@ -119,9 +123,12 @@ def test_session_handler_runs_llm_warmup_before_session_start(monkeypatch) -> No
|
|
| 119 |
monkeypatch.setattr(runtime_session, "install_mcp_generate_reply_guard", lambda *args, **kwargs: None)
|
| 120 |
monkeypatch.setattr(runtime_session, "run_startup_greeting", lambda *args, **kwargs: None)
|
| 121 |
monkeypatch.setattr(runtime_session.silero.VAD, "load", lambda **kwargs: "vad")
|
| 122 |
-
monkeypatch.setattr(runtime_session, "
|
| 123 |
monkeypatch.setattr(runtime_session.room_io, "AudioInputOptions", lambda **kwargs: kwargs)
|
| 124 |
monkeypatch.setattr(runtime_session.room_io, "RoomOptions", lambda **kwargs: kwargs)
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
async def _fake_run_llm_warmup(**kwargs) -> None:
|
| 127 |
order.append("llm")
|
|
@@ -133,3 +140,45 @@ def test_session_handler_runs_llm_warmup_before_session_start(monkeypatch) -> No
|
|
| 133 |
assert order == ["llm", "start"]
|
| 134 |
assert len(created_sessions) == 1
|
| 135 |
assert created_sessions[0].start_calls
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
import types
|
| 5 |
+
from pathlib import Path
|
| 6 |
|
| 7 |
from livekit.agents.inference_runner import _InferenceRunner
|
| 8 |
|
| 9 |
from src.agent.runtime import session as runtime_session
|
| 10 |
+
from src.core.settings import VoiceSettings, settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
ENV_EXAMPLE_PATH = Path(__file__).resolve().parents[1] / ".env.example"
|
| 14 |
|
| 15 |
|
| 16 |
class _FakeJobContext:
|
|
|
|
| 74 |
assert server._job_memory_warn_mb == 8192.0
|
| 75 |
|
| 76 |
|
| 77 |
+
def test_importing_session_registers_english_turn_detector_runner() -> None:
|
| 78 |
+
assert "lk_end_of_utterance_en" in _InferenceRunner.registered_runners
|
| 79 |
|
| 80 |
|
| 81 |
def test_resolve_stt_metrics_model_name_uses_deepgram_model(monkeypatch) -> None:
|
|
|
|
| 123 |
monkeypatch.setattr(runtime_session, "install_mcp_generate_reply_guard", lambda *args, **kwargs: None)
|
| 124 |
monkeypatch.setattr(runtime_session, "run_startup_greeting", lambda *args, **kwargs: None)
|
| 125 |
monkeypatch.setattr(runtime_session.silero.VAD, "load", lambda **kwargs: "vad")
|
| 126 |
+
monkeypatch.setattr(runtime_session, "EnglishModel", lambda: "turn-detector")
|
| 127 |
monkeypatch.setattr(runtime_session.room_io, "AudioInputOptions", lambda **kwargs: kwargs)
|
| 128 |
monkeypatch.setattr(runtime_session.room_io, "RoomOptions", lambda **kwargs: kwargs)
|
| 129 |
+
monkeypatch.setattr(settings.voice, "MIN_ENDPOINTING_DELAY", 1.0)
|
| 130 |
+
monkeypatch.setattr(settings.voice, "MAX_ENDPOINTING_DELAY", 4.0)
|
| 131 |
+
monkeypatch.setattr(settings.voice, "PREEMPTIVE_GENERATION", False)
|
| 132 |
|
| 133 |
async def _fake_run_llm_warmup(**kwargs) -> None:
|
| 134 |
order.append("llm")
|
|
|
|
| 140 |
assert order == ["llm", "start"]
|
| 141 |
assert len(created_sessions) == 1
|
| 142 |
assert created_sessions[0].start_calls
|
| 143 |
+
assert created_sessions[0].kwargs["turn_detection"] == "turn-detector"
|
| 144 |
+
assert created_sessions[0].kwargs["min_endpointing_delay"] == 1.0
|
| 145 |
+
assert created_sessions[0].kwargs["max_endpointing_delay"] == 4.0
|
| 146 |
+
assert created_sessions[0].kwargs["preemptive_generation"] is False
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def test_env_example_turn_profile_matches_voice_defaults() -> None:
|
| 150 |
+
env_values = _parse_env_file(ENV_EXAMPLE_PATH)
|
| 151 |
+
|
| 152 |
+
assert env_values["LIVEKIT_FRAME_SIZE_MS"] == str(
|
| 153 |
+
VoiceSettings.model_fields["LIVEKIT_FRAME_SIZE_MS"].default
|
| 154 |
+
)
|
| 155 |
+
assert env_values["VAD_MIN_SILENCE_DURATION"] == str(
|
| 156 |
+
VoiceSettings.model_fields["VAD_MIN_SILENCE_DURATION"].default
|
| 157 |
+
)
|
| 158 |
+
assert env_values["VAD_THRESHOLD"] == str(
|
| 159 |
+
VoiceSettings.model_fields["VAD_THRESHOLD"].default
|
| 160 |
+
)
|
| 161 |
+
assert env_values["MIN_ENDPOINTING_DELAY"] == str(
|
| 162 |
+
VoiceSettings.model_fields["MIN_ENDPOINTING_DELAY"].default
|
| 163 |
+
)
|
| 164 |
+
assert env_values["MAX_ENDPOINTING_DELAY"] == str(
|
| 165 |
+
VoiceSettings.model_fields["MAX_ENDPOINTING_DELAY"].default
|
| 166 |
+
)
|
| 167 |
+
assert env_values["PREEMPTIVE_GENERATION"] == _env_bool(
|
| 168 |
+
VoiceSettings.model_fields["PREEMPTIVE_GENERATION"].default
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _parse_env_file(path: Path) -> dict[str, str]:
|
| 173 |
+
values: dict[str, str] = {}
|
| 174 |
+
for raw_line in path.read_text(encoding="utf-8").splitlines():
|
| 175 |
+
line = raw_line.strip()
|
| 176 |
+
if not line or line.startswith("#") or "=" not in line:
|
| 177 |
+
continue
|
| 178 |
+
key, value = line.split("=", 1)
|
| 179 |
+
values[key.strip()] = value.split(" #", 1)[0].strip()
|
| 180 |
+
return values
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _env_bool(value: bool) -> str:
|
| 184 |
+
return "true" if value else "false"
|