dvalle08 committed
Commit 4a03ace · 1 Parent(s): f2de5e8

feat: Enhance voice activity detection and turn tracing features


- Retuned the VAD and endpointing parameters in `.env.example` (`VAD_MIN_SILENCE_DURATION`, `VAD_THRESHOLD`, frame size, and the endpointing delays), trading a little responsiveness for fewer premature end-of-turn detections.
- Introduced a `PendingUserUtterance` dataclass in `metrics_collector.py` so consecutive final STT transcripts are merged into one logical user utterance that shares a single LLM-stall watchdog; a standalone sketch of the merge heuristic follows this list.
- Refactored `TurnTracer` to coalesce an immediate user continuation into the prior, still-unanswered turn trace, gated by the new `LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS` setting.
- Updated defaults in `settings.py` for voice processing and turn detection, and switched the session turn detector from the multilingual model to the English-only model.
- Extended the tests to cover transcript merging, continuation coalescing, the shared stall watchdog, and the new defaults.
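For reference, the transcript-merge heuristic added to both `metrics_collector.py` and `turn_tracer.py` can be exercised on its own. The sketch below restates the `_merge_user_transcripts` helper from this commit; the first example mirrors the shared-watchdog test, the second input pair is purely illustrative.

```python
def merge_user_transcripts(existing: str, incoming: str) -> str:
    """Merge two final STT chunks, de-duplicating words where the chunks overlap."""
    left, right = existing.strip(), incoming.strip()
    if not left:
        return right
    if not right:
        return left
    if left == right or left.startswith(right):
        return left
    if right.startswith(left):
        return right

    # Find the longest case-insensitive word overlap between the end of `left`
    # and the start of `right`, and join the chunks on it.
    left_words, right_words = left.split(), right.split()
    for overlap in range(min(len(left_words), len(right_words)), 0, -1):
        if [w.casefold() for w in left_words[-overlap:]] == [
            w.casefold() for w in right_words[:overlap]
        ]:
            return " ".join([*left_words, *right_words[overlap:]])
    return f"{left} {right}"


# No overlap: the chunks are simply concatenated (mirrors the watchdog test case).
assert (
    merge_user_transcripts("Search for the most popular", "test to speech model.")
    == "Search for the most popular test to speech model."
)
# Word overlap: the duplicated words appear only once in the merged transcript.
assert (
    merge_user_transcripts("what is the weather", "the weather in Paris")
    == "what is the weather in Paris"
)
```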

.env.example CHANGED
@@ -46,6 +46,7 @@ LANGFUSE_TRACE_FINALIZE_TIMEOUT_MS=8000
46
  LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS=30000
47
  LANGFUSE_MAX_PENDING_TRACE_TASKS=200
48
  LANGFUSE_TRACE_FLUSH_TIMEOUT_MS=1000
 
49
 
50
  # Common LLM Parameters
51
  LLM_TEMPERATURE=0.7
@@ -85,19 +86,19 @@ LIVEKIT_NUM_IDLE_PROCESSES=1 # Use 0-1 locally to reduce memory pressure
85
  LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC=20.0 # Increase idle worker bootstrap timeout
86
  LIVEKIT_JOB_MEMORY_WARN_MB=6144 # Per-job memory warning threshold (6 GB)
87
 
88
- # LiveKit Audio Input Configuration - OPTIMIZED FOR FALSE DETECTION FIX
89
  LIVEKIT_SAMPLE_RATE=24000
90
  LIVEKIT_NUM_CHANNELS=1
91
- LIVEKIT_FRAME_SIZE_MS=20 # Smaller = faster VAD response, less latency
92
  LIVEKIT_PRE_CONNECT_AUDIO=true
93
  LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
94
 
95
- # Voice Activity Detection (VAD) Configuration - OPTIMIZED FOR FALSE DETECTION FIX
96
- VAD_MIN_SPEECH_DURATION=0.18 # Require 180ms of speech before activation (faster turn pickup)
97
- VAD_MIN_SILENCE_DURATION=0.30 # Require 300ms of silence before deactivation (faster turn end)
98
- VAD_THRESHOLD=0.6 # Higher = less sensitive to noise (0.5 is default)
99
 
100
- # Turn endpointing and responsiveness tuning
101
- MIN_ENDPOINTING_DELAY=0.35 # Minimum wait before end-of-turn commit
102
- MAX_ENDPOINTING_DELAY=1.5 # Upper bound wait when turn detector predicts continuation
103
- PREEMPTIVE_GENERATION=true # Start generating earlier to reduce perceived latency
 
46
  LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS=30000
47
  LANGFUSE_MAX_PENDING_TRACE_TASKS=200
48
  LANGFUSE_TRACE_FLUSH_TIMEOUT_MS=1000
49
+ LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS=1500 # Merge immediate continuation turns into one trace; 0 disables it
50
 
51
  # Common LLM Parameters
52
  LLM_TEMPERATURE=0.7
 
86
  LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC=20.0 # Increase idle worker bootstrap timeout
87
  LIVEKIT_JOB_MEMORY_WARN_MB=6144 # Per-job memory warning threshold (6 GB)
88
 
89
+ # LiveKit audio input configuration
90
  LIVEKIT_SAMPLE_RATE=24000
91
  LIVEKIT_NUM_CHANNELS=1
92
+ LIVEKIT_FRAME_SIZE_MS=60 # Larger frames slightly reduce responsiveness but avoid over-eager VAD transitions
93
  LIVEKIT_PRE_CONNECT_AUDIO=true
94
  LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
95
 
96
+ # Voice Activity Detection (VAD) configuration
97
+ VAD_MIN_SPEECH_DURATION=0.18 # Require 180ms of speech before activation
98
+ VAD_MIN_SILENCE_DURATION=0.55 # Wait longer before treating a pause as end of speech
99
+ VAD_THRESHOLD=0.5 # Silero default; keep balanced sensitivity for speech vs background noise
100
 
101
+ # Turn endpointing tuning
102
+ MIN_ENDPOINTING_DELAY=0.8 # Add a bit more patience before committing a turn to avoid false splits
103
+ MAX_ENDPOINTING_DELAY=3.0 # Let the detector wait longer when phrasing suggests continuation
104
+ PREEMPTIVE_GENERATION=false # Wait for the committed turn before generating a reply
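The `LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS` knob above controls how recently the prior turn must have shown activity for a freshly finalized continuation to be folded into it. A minimal sketch of that gate, assuming a `last_activity_at` timestamp taken from the prior turn (the function name here is illustrative, not part of the repo):

```python
from time import time


def within_coalesce_window(last_activity_at: float, window_ms: float) -> bool:
    """True when a new user turn may still be merged into the prior, unanswered one."""
    window_sec = max(window_ms, 0.0) / 1000.0
    if window_sec <= 0.0:  # a window of 0 ms disables coalescing entirely
        return False
    return (time() - last_activity_at) <= window_sec


# A continuation arriving 0.9 s after the prior turn's last activity is merged under
# the default 1500 ms window; one arriving 2 s later starts a new trace instead.
print(within_coalesce_window(time() - 0.9, 1500.0))  # True
print(within_coalesce_window(time() - 2.0, 1500.0))  # False
```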
.gitignore CHANGED
@@ -29,4 +29,5 @@ model_cache/
29
 
30
  # OS
31
  .DS_Store
32
- codex-skills/
 
 
29
 
30
  # OS
31
  .DS_Store
32
+ codex-skills/
33
+ blog/
src/agent/runtime/session.py CHANGED
@@ -12,7 +12,7 @@ from livekit.agents import AgentServer, AgentSession, room_io
12
  from livekit.agents.types import APIConnectOptions
13
  from livekit.agents.voice.agent_session import SessionConnectOptions
14
  from livekit.plugins import noise_cancellation, silero
15
- from livekit.plugins.turn_detector.multilingual import MultilingualModel
16
 
17
  from src.agent.models.llm_runtime import (
18
  build_llm_runtime,
@@ -71,6 +71,15 @@ def _resolve_stt_metrics_model_name() -> str:
71
  return settings.stt.NVIDIA_STT_MODEL
72
 
73
 
 
 
74
  def _build_session_connect_options() -> tuple[APIConnectOptions, SessionConnectOptions]:
75
  llm_conn_options = build_api_connect_options(
76
  max_retry=settings.llm.LLM_CONN_MAX_RETRY,
@@ -173,8 +182,21 @@ async def session_handler(ctx: agents.JobContext) -> None:
173
  model=llm_runtime.model,
174
  )
175
 
 
 
176
  session_kwargs: dict[str, Any] = dict(
177
- stt=create_stt(),
178
  llm=llm_runtime.llm,
179
  tts=tts_engine,
180
  vad=silero.VAD.load(
@@ -182,7 +204,7 @@ async def session_handler(ctx: agents.JobContext) -> None:
182
  min_silence_duration=settings.voice.VAD_MIN_SILENCE_DURATION,
183
  activation_threshold=settings.voice.VAD_THRESHOLD,
184
  ),
185
- turn_detection=MultilingualModel(),
186
  min_endpointing_delay=settings.voice.MIN_ENDPOINTING_DELAY,
187
  max_endpointing_delay=settings.voice.MAX_ENDPOINTING_DELAY,
188
  preemptive_generation=settings.voice.PREEMPTIVE_GENERATION,
 
12
  from livekit.agents.types import APIConnectOptions
13
  from livekit.agents.voice.agent_session import SessionConnectOptions
14
  from livekit.plugins import noise_cancellation, silero
15
+ from livekit.plugins.turn_detector.english import EnglishModel
16
 
17
  from src.agent.models.llm_runtime import (
18
  build_llm_runtime,
 
71
  return settings.stt.NVIDIA_STT_MODEL
72
 
73
 
74
+ def _resolve_stt_language() -> str:
75
+ provider = settings.stt.STT_PROVIDER.lower()
76
+ if provider == "moonshine":
77
+ return settings.stt.MOONSHINE_LANGUAGE
78
+ if provider == "deepgram":
79
+ return settings.stt.DEEPGRAM_STT_LANGUAGE
80
+ return settings.stt.NVIDIA_STT_LANGUAGE_CODE
81
+
82
+
83
  def _build_session_connect_options() -> tuple[APIConnectOptions, SessionConnectOptions]:
84
  llm_conn_options = build_api_connect_options(
85
  max_retry=settings.llm.LLM_CONN_MAX_RETRY,
 
182
  model=llm_runtime.model,
183
  )
184
 
185
+ stt_engine = create_stt()
186
+ logger.info(
187
+ "Turn profile: detector=%s stt_provider=%s stt_model=%s stt_language=%s vad_min_silence=%.2fs min_endpointing=%.2fs max_endpointing=%.2fs preemptive_generation=%s",
188
+ "EnglishModel",
189
+ settings.stt.STT_PROVIDER,
190
+ _resolve_stt_metrics_model_name(),
191
+ _resolve_stt_language(),
192
+ settings.voice.VAD_MIN_SILENCE_DURATION,
193
+ settings.voice.MIN_ENDPOINTING_DELAY,
194
+ settings.voice.MAX_ENDPOINTING_DELAY,
195
+ settings.voice.PREEMPTIVE_GENERATION,
196
+ )
197
+
198
  session_kwargs: dict[str, Any] = dict(
199
+ stt=stt_engine,
200
  llm=llm_runtime.llm,
201
  tts=tts_engine,
202
  vad=silero.VAD.load(
 
204
  min_silence_duration=settings.voice.VAD_MIN_SILENCE_DURATION,
205
  activation_threshold=settings.voice.VAD_THRESHOLD,
206
  ),
207
+ turn_detection=EnglishModel(),
208
  min_endpointing_delay=settings.voice.MIN_ENDPOINTING_DELAY,
209
  max_endpointing_delay=settings.voice.MAX_ENDPOINTING_DELAY,
210
  preemptive_generation=settings.voice.PREEMPTIVE_GENERATION,
src/agent/traces/metrics_collector.py CHANGED
@@ -202,6 +202,17 @@ class TurnState:
202
  first_audio_monotonic: Optional[float] = None
203
 
204
 
 
 
205
  # ------------------------------------------------------------------
206
  # Facade
207
  # ------------------------------------------------------------------
@@ -241,12 +252,11 @@ class MetricsCollector:
241
  )
242
 
243
  self._publisher = ChannelPublisher(room)
244
- self._pending_transcripts: deque[str] = deque()
245
  self._pending_agent_transcripts: deque[str] = deque()
246
  self._pending_speech_ids_for_first_audio: deque[str] = deque()
247
  self._latest_agent_speech_id: Optional[str] = None
248
  self._turns: dict[str, TurnState] = {}
249
- self._pending_llm_watchdog_ids: deque[str] = deque()
250
  self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
251
  self._latest_vad_metrics: Optional[VADMetrics] = None
252
  self._latest_vad_metric_attributes: Optional[dict[str, Any]] = None
@@ -318,18 +328,34 @@ class MetricsCollector:
318
  normalized = transcript.strip()
319
  if not normalized:
320
  return
321
- self._pending_transcripts.append(normalized)
 
 
322
  if not self._first_final_user_turn_logged:
323
  self._first_final_user_turn_logged = True
324
  logger.info(
325
  "First finalized user transcript received: room=%s chars=%s preview=%r",
326
  self._room_name,
327
- len(normalized),
328
- normalized[:80],
329
  )
330
- self._start_llm_stall_watchdog(transcript=normalized)
331
  room_id = await self._resolve_room_id()
332
- await self._tracer.create_turn(user_transcript=normalized, room_id=room_id)
 
 
 
333
 
334
  async def on_conversation_item_added(
335
  self,
@@ -345,7 +371,28 @@ class MetricsCollector:
345
  if not normalized:
346
  return
347
  if role == "user":
348
- _append_if_new(self._pending_transcripts, normalized)
 
 
349
  return
350
  assistant_event_created_at = (
351
  item_created_at if item_created_at is not None else event_created_at
@@ -519,8 +566,10 @@ class MetricsCollector:
519
  if isinstance(collected_metrics, metrics.STTMetrics):
520
  speech_id = collected_metrics.request_id
521
  turn_metrics = self._get_or_create_turn(speech_id, role="user")
522
- if self._pending_transcripts:
523
- turn_metrics.transcript = self._pending_transcripts.popleft()
 
 
524
  turn_metrics.stt = STTMetrics(
525
  type=collected_metrics.type,
526
  label=collected_metrics.label,
@@ -605,6 +654,7 @@ class MetricsCollector:
605
  elif isinstance(collected_metrics, metrics.EOUMetrics):
606
  speech_id = collected_metrics.speech_id
607
  if speech_id:
 
608
  state = self._get_or_create_state(speech_id)
609
  if state.speech_end_monotonic is None:
610
  state.speech_end_monotonic = monotonic()
@@ -788,12 +838,44 @@ class MetricsCollector:
788
  s for s in self._pending_speech_ids_for_first_audio if s != speech_id
789
  )
790
 
791
- def _start_llm_stall_watchdog(self, *, transcript: str) -> None:
792
- if self._llm_stall_timeout_sec <= 0:
 
 
793
  return
794
 
 
 
795
  watchdog_id = str(uuid.uuid4())
796
- self._pending_llm_watchdog_ids.append(watchdog_id)
797
  task = asyncio.create_task(
798
  self._warn_if_turn_stalled_before_llm(
799
  watchdog_id=watchdog_id,
@@ -807,14 +889,36 @@ class MetricsCollector:
807
  self._llm_stall_tasks.pop(watchdog_id, None)
808
 
809
  task.add_done_callback(_on_done)
 
 
810
 
811
  def _mark_llm_stage_reached(self) -> None:
812
- while self._pending_llm_watchdog_ids:
813
- watchdog_id = self._pending_llm_watchdog_ids.popleft()
814
- task = self._llm_stall_tasks.pop(watchdog_id, None)
815
- if task:
816
- task.cancel()
817
- return
 
 
818
 
819
  async def _warn_if_turn_stalled_before_llm(
820
  self,
@@ -827,21 +931,21 @@ class MetricsCollector:
827
  except asyncio.CancelledError:
828
  return
829
 
830
- if watchdog_id not in self._pending_llm_watchdog_ids:
 
831
  return
832
 
833
- self._pending_llm_watchdog_ids = deque(
834
- wid for wid in self._pending_llm_watchdog_ids if wid != watchdog_id
835
- )
836
-
837
- preview = transcript[:80]
838
  logger.warning(
839
  "Turn stalled before LLM stage: timeout=%.2fs room=%s transcript_chars=%s transcript_preview=%r",
840
  self._llm_stall_timeout_sec,
841
  self._room_name,
842
- len(transcript),
843
  preview,
844
  )
 
845
 
846
  async def _resolve_room_id(self) -> str:
847
  if self._room_id and self._room_id != self._room_name:
@@ -881,6 +985,32 @@ def _append_if_new(queue: deque[str], value: str) -> None:
881
  queue.append(value)
882
 
883
 
 
 
884
  def _trace_turn_has_tool_activity(trace_turn: Optional[TraceTurn]) -> bool:
885
  if trace_turn is None:
886
  return False
 
202
  first_audio_monotonic: Optional[float] = None
203
 
204
 
205
+ @dataclass
206
+ class PendingUserUtterance:
207
+ """Logical user utterance that may span multiple final STT chunks."""
208
+
209
+ transcript: str
210
+ committed: bool = False
211
+ stt_observed: bool = False
212
+ llm_started: bool = False
213
+ watchdog_id: Optional[str] = None
214
+
215
+
216
  # ------------------------------------------------------------------
217
  # Facade
218
  # ------------------------------------------------------------------
 
252
  )
253
 
254
  self._publisher = ChannelPublisher(room)
255
+ self._pending_user_utterances: deque[PendingUserUtterance] = deque()
256
  self._pending_agent_transcripts: deque[str] = deque()
257
  self._pending_speech_ids_for_first_audio: deque[str] = deque()
258
  self._latest_agent_speech_id: Optional[str] = None
259
  self._turns: dict[str, TurnState] = {}
 
260
  self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
261
  self._latest_vad_metrics: Optional[VADMetrics] = None
262
  self._latest_vad_metric_attributes: Optional[dict[str, Any]] = None
 
328
  normalized = transcript.strip()
329
  if not normalized:
330
  return
331
+ utterance = self._current_open_user_utterance()
332
+ if utterance is None:
333
+ utterance = PendingUserUtterance(transcript=normalized)
334
+ utterance.watchdog_id = self._start_llm_stall_watchdog(transcript=normalized)
335
+ self._pending_user_utterances.append(utterance)
336
+ else:
337
+ utterance.transcript = _merge_user_transcripts(
338
+ utterance.transcript,
339
+ normalized,
340
+ )
341
+ if utterance.watchdog_id is not None:
342
+ self._update_llm_stall_watchdog(
343
+ utterance.watchdog_id,
344
+ utterance.transcript,
345
+ )
346
  if not self._first_final_user_turn_logged:
347
  self._first_final_user_turn_logged = True
348
  logger.info(
349
  "First finalized user transcript received: room=%s chars=%s preview=%r",
350
  self._room_name,
351
+ len(utterance.transcript),
352
+ utterance.transcript[:80],
353
  )
 
354
  room_id = await self._resolve_room_id()
355
+ await self._tracer.create_turn(
356
+ user_transcript=utterance.transcript,
357
+ room_id=room_id,
358
+ )
359
 
360
  async def on_conversation_item_added(
361
  self,
 
371
  if not normalized:
372
  return
373
  if role == "user":
374
+ utterance = self._latest_user_utterance()
375
+ if utterance is None:
376
+ utterance = PendingUserUtterance(
377
+ transcript=normalized,
378
+ committed=True,
379
+ )
380
+ self._pending_user_utterances.append(utterance)
381
+ else:
382
+ utterance.transcript = normalized
383
+ utterance.committed = True
384
+ if utterance.watchdog_id is not None:
385
+ self._update_llm_stall_watchdog(
386
+ utterance.watchdog_id,
387
+ utterance.transcript,
388
+ )
389
+ user_event_created_at = (
390
+ item_created_at if item_created_at is not None else event_created_at
391
+ )
392
+ await self._tracer.attach_user_text(
393
+ normalized,
394
+ event_created_at=user_event_created_at,
395
+ )
396
  return
397
  assistant_event_created_at = (
398
  item_created_at if item_created_at is not None else event_created_at
 
566
  if isinstance(collected_metrics, metrics.STTMetrics):
567
  speech_id = collected_metrics.request_id
568
  turn_metrics = self._get_or_create_turn(speech_id, role="user")
569
+ utterance = self._next_user_utterance_for_stt()
570
+ if utterance is not None:
571
+ turn_metrics.transcript = utterance.transcript
572
+ utterance.stt_observed = True
573
  turn_metrics.stt = STTMetrics(
574
  type=collected_metrics.type,
575
  label=collected_metrics.label,
 
654
  elif isinstance(collected_metrics, metrics.EOUMetrics):
655
  speech_id = collected_metrics.speech_id
656
  if speech_id:
657
+ self._mark_oldest_open_user_utterance_committed()
658
  state = self._get_or_create_state(speech_id)
659
  if state.speech_end_monotonic is None:
660
  state.speech_end_monotonic = monotonic()
 
838
  s for s in self._pending_speech_ids_for_first_audio if s != speech_id
839
  )
840
 
841
+ def _current_open_user_utterance(self) -> Optional[PendingUserUtterance]:
842
+ utterance = self._latest_user_utterance()
843
+ if utterance is None or utterance.committed:
844
+ return None
845
+ return utterance
846
+
847
+ def _latest_user_utterance(self) -> Optional[PendingUserUtterance]:
848
+ if not self._pending_user_utterances:
849
+ return None
850
+ return self._pending_user_utterances[-1]
851
+
852
+ def _next_user_utterance_for_stt(self) -> Optional[PendingUserUtterance]:
853
+ for utterance in self._pending_user_utterances:
854
+ if utterance.stt_observed:
855
+ continue
856
+ return utterance
857
+ return None
858
+
859
+ def _mark_oldest_open_user_utterance_committed(self) -> None:
860
+ for utterance in self._pending_user_utterances:
861
+ if utterance.committed:
862
+ continue
863
+ utterance.committed = True
864
+ self._prune_resolved_user_utterances()
865
  return
866
 
867
+ def _prune_resolved_user_utterances(self) -> None:
868
+ while self._pending_user_utterances:
869
+ utterance = self._pending_user_utterances[0]
870
+ if not utterance.committed or not utterance.llm_started:
871
+ break
872
+ self._pending_user_utterances.popleft()
873
+
874
+ def _start_llm_stall_watchdog(self, *, transcript: str) -> str | None:
875
+ if self._llm_stall_timeout_sec <= 0:
876
+ return None
877
+
878
  watchdog_id = str(uuid.uuid4())
 
879
  task = asyncio.create_task(
880
  self._warn_if_turn_stalled_before_llm(
881
  watchdog_id=watchdog_id,
 
889
  self._llm_stall_tasks.pop(watchdog_id, None)
890
 
891
  task.add_done_callback(_on_done)
892
+ return watchdog_id
893
+
894
+ def _update_llm_stall_watchdog(self, watchdog_id: str, transcript: str) -> None:
895
+ utterance = self._find_user_utterance_by_watchdog(watchdog_id)
896
+ if utterance is None or utterance.llm_started:
897
+ return
898
+ utterance.transcript = transcript
899
 
900
  def _mark_llm_stage_reached(self) -> None:
901
+ for utterance in self._pending_user_utterances:
902
+ if utterance.llm_started:
903
+ continue
904
+ utterance.llm_started = True
905
+ watchdog_id = utterance.watchdog_id
906
+ utterance.watchdog_id = None
907
+ if watchdog_id is not None:
908
+ task = self._llm_stall_tasks.pop(watchdog_id, None)
909
+ if task:
910
+ task.cancel()
911
+ self._prune_resolved_user_utterances()
912
+ return
913
+
914
+ def _find_user_utterance_by_watchdog(
915
+ self,
916
+ watchdog_id: str,
917
+ ) -> Optional[PendingUserUtterance]:
918
+ for utterance in self._pending_user_utterances:
919
+ if utterance.watchdog_id == watchdog_id:
920
+ return utterance
921
+ return None
922
 
923
  async def _warn_if_turn_stalled_before_llm(
924
  self,
 
931
  except asyncio.CancelledError:
932
  return
933
 
934
+ utterance = self._find_user_utterance_by_watchdog(watchdog_id)
935
+ if utterance is None or utterance.llm_started:
936
  return
937
 
938
+ utterance.watchdog_id = None
939
+ utterance.llm_started = True
940
+ preview = utterance.transcript[:80] if utterance.transcript else transcript[:80]
 
 
941
  logger.warning(
942
  "Turn stalled before LLM stage: timeout=%.2fs room=%s transcript_chars=%s transcript_preview=%r",
943
  self._llm_stall_timeout_sec,
944
  self._room_name,
945
+ len(utterance.transcript or transcript),
946
  preview,
947
  )
948
+ self._prune_resolved_user_utterances()
949
 
950
  async def _resolve_room_id(self) -> str:
951
  if self._room_id and self._room_id != self._room_name:
 
985
  queue.append(value)
986
 
987
 
988
+ def _merge_user_transcripts(existing: str, incoming: str) -> str:
989
+ left = existing.strip()
990
+ right = incoming.strip()
991
+ if not left:
992
+ return right
993
+ if not right:
994
+ return left
995
+ if left == right:
996
+ return left
997
+ if right.startswith(left):
998
+ return right
999
+ if left.startswith(right):
1000
+ return left
1001
+
1002
+ left_words = left.split()
1003
+ right_words = right.split()
1004
+ max_overlap = min(len(left_words), len(right_words))
1005
+ for overlap in range(max_overlap, 0, -1):
1006
+ left_suffix = [word.casefold() for word in left_words[-overlap:]]
1007
+ right_prefix = [word.casefold() for word in right_words[:overlap]]
1008
+ if left_suffix == right_prefix:
1009
+ merged_words = [*left_words, *right_words[overlap:]]
1010
+ return " ".join(merged_words).strip()
1011
+ return f"{left} {right}".strip()
1012
+
1013
+
1014
  def _trace_turn_has_tool_activity(trace_turn: Optional[TraceTurn]) -> bool:
1015
  if trace_turn is None:
1016
  return False
src/agent/traces/turn_tracer.py CHANGED
@@ -32,6 +32,10 @@ class TraceTurn:
32
  participant_id: str
33
  user_transcript: str
34
  prompt_text: str
 
 
35
  response_text: str = ""
36
  assistant_text: str = ""
37
  assistant_text_missing: bool = False
@@ -72,6 +76,9 @@ class TraceTurn:
72
  tts_updated_order: Optional[int] = None
73
  event_counter: int = 0
74
  tool_post_response_missing: bool = False
 
 
 
75
  trace_id: Optional[str] = None
76
 
77
 
@@ -153,6 +160,7 @@ _DEFAULT_TRACE_FINALIZE_TIMEOUT_MS = 8000.0
153
  _DEFAULT_POST_TOOL_RESPONSE_TIMEOUT_MS = 30000.0
154
  _DEFAULT_MAX_PENDING_TRACE_TASKS = 200
155
  _DEFAULT_TRACE_FLUSH_TIMEOUT_SEC = 1.0
 
156
  _TOOL_ERROR_FALLBACK_TEXT = "I couldn't complete that tool request. Please rephrase the query."
157
 
158
 
@@ -233,6 +241,17 @@ class TurnTracer:
233
  )
234
  / 1000.0
235
  )
 
 
236
 
237
  # ------------------------------------------------------------------
238
  # Session context
@@ -287,16 +306,60 @@ class TurnTracer:
287
 
288
  async def create_turn(self, *, user_transcript: str, room_id: str) -> None:
289
  async with self._trace_lock:
290
- self._pending_trace_turns.append(
291
- TraceTurn(
292
- turn_id=str(uuid.uuid4()),
293
- session_id=self._session_id,
294
- room_id=room_id,
295
- participant_id=self._participant_id,
296
- user_transcript=user_transcript,
297
- prompt_text=user_transcript,
298
- )
 
 
299
  )
 
 
300
 
301
  # ------------------------------------------------------------------
302
  # Stage attachment
@@ -358,6 +421,10 @@ class TurnTracer:
358
  if turn.stt_duration_ms is None:
359
  turn.stt_duration_ms = turn.stt_total_latency_ms
360
  turn.eou_attributes = _sanitize_component_attributes(metric_attributes)
 
 
361
  metric_speech_id = _normalize_optional_str(
362
  turn.eou_attributes.get("speech_id")
363
  )
@@ -620,6 +687,88 @@ class TurnTracer:
620
  # Internal helpers
621
  # ------------------------------------------------------------------
622
 
 
623
  def _next_turn_where(
624
  self,
625
  predicate: Callable[[TraceTurn], bool],
@@ -2005,6 +2154,11 @@ def _set_root_attributes(
2005
  "langfuse.trace.metadata.stt_status": turn.stt_status,
2006
  "langfuse.trace.metadata.tool_phase_announced": turn.tool_step_announced,
2007
  "langfuse.trace.metadata.tool_post_response_missing": turn.tool_post_response_missing,
 
 
2008
  "duration_ms": _total_duration_ms(turn),
2009
  "latency_ms.user_input": vals["user_input_duration_ms"],
2010
  "latency_ms.vad": vals["vad_duration_ms"],
@@ -2036,6 +2190,7 @@ def _set_root_attributes(
2036
  "tool.phase_announced": turn.tool_step_announced,
2037
  "tool.post_response_missing": turn.tool_post_response_missing,
2038
  "stt_status": turn.stt_status,
 
2039
  }
2040
  for key, value in attrs.items():
2041
  if value is not None:
@@ -2100,6 +2255,32 @@ def _stringify_observation(value: Any) -> str:
2100
  return str(value)
2101
 
2102
 
 
 
2103
  def _emit_component_span(
2104
  _tracer: Any,
2105
  *,
 
32
  participant_id: str
33
  user_transcript: str
34
  prompt_text: str
35
+ created_at: float = field(default_factory=time)
36
+ user_turn_committed: bool = False
37
+ user_turn_committed_at: Optional[float] = None
38
+ user_transcript_updated_at: Optional[float] = None
39
  response_text: str = ""
40
  assistant_text: str = ""
41
  assistant_text_missing: bool = False
 
76
  tts_updated_order: Optional[int] = None
77
  event_counter: int = 0
78
  tool_post_response_missing: bool = False
79
+ coalesced_turn_ids: list[str] = field(default_factory=list)
80
+ coalesced_user_transcripts: list[str] = field(default_factory=list)
81
+ coalesced_fragment_count: int = 0
82
  trace_id: Optional[str] = None
83
 
84
 
 
160
  _DEFAULT_POST_TOOL_RESPONSE_TIMEOUT_MS = 30000.0
161
  _DEFAULT_MAX_PENDING_TRACE_TASKS = 200
162
  _DEFAULT_TRACE_FLUSH_TIMEOUT_SEC = 1.0
163
+ _DEFAULT_CONTINUATION_COALESCE_WINDOW_MS = 1500.0
164
  _TOOL_ERROR_FALLBACK_TEXT = "I couldn't complete that tool request. Please rephrase the query."
165
 
166
 
 
241
  )
242
  / 1000.0
243
  )
244
+ self._continuation_coalesce_window_sec = (
245
+ max(
246
+ getattr(
247
+ settings.langfuse,
248
+ "LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
249
+ _DEFAULT_CONTINUATION_COALESCE_WINDOW_MS,
250
+ ),
251
+ 0.0,
252
+ )
253
+ / 1000.0
254
+ )
255
 
256
  # ------------------------------------------------------------------
257
  # Session context
 
306
 
307
  async def create_turn(self, *, user_transcript: str, room_id: str) -> None:
308
  async with self._trace_lock:
309
+ normalized = user_transcript.strip()
310
+ if not normalized:
311
+ return
312
+
313
+ current_turn = self._latest_turn_where(lambda c: not c.user_turn_committed)
314
+ if current_turn is not None:
315
+ self._update_user_turn_text(current_turn, normalized)
316
+ return
317
+
318
+ new_turn = TraceTurn(
319
+ turn_id=str(uuid.uuid4()),
320
+ session_id=self._session_id,
321
+ room_id=room_id,
322
+ participant_id=self._participant_id,
323
+ user_transcript=normalized,
324
+ prompt_text=normalized,
325
+ )
326
+ new_turn.user_transcript_updated_at = new_turn.created_at
327
+
328
+ coalesced_turn = self._coalesced_turn_candidate()
329
+ if coalesced_turn is not None:
330
+ self._absorb_coalesced_turn_metadata(new_turn, coalesced_turn)
331
+ self._pending_trace_turns.remove(coalesced_turn)
332
+ self._cancel_finalize_timeout(coalesced_turn.turn_id)
333
+
334
+ self._pending_trace_turns.append(new_turn)
335
+
336
+ async def attach_user_text(
337
+ self,
338
+ user_transcript: str,
339
+ *,
340
+ event_created_at: Optional[float] = None,
341
+ ) -> Optional[TraceTurn]:
342
+ async with self._trace_lock:
343
+ turn = self._latest_turn_where(lambda c: not c.assistant_text.strip())
344
+ if turn is None:
345
+ turn = self._latest_turn_where(lambda _: True)
346
+ if turn is None:
347
+ return None
348
+
349
+ normalized = user_transcript.strip()
350
+ if not normalized:
351
+ return turn
352
+
353
+ self._update_user_turn_text(
354
+ turn,
355
+ normalized,
356
+ event_created_at=event_created_at,
357
  )
358
+ turn.user_turn_committed = True
359
+ turn.user_turn_committed_at = _resolved_event_timestamp(
360
+ _to_optional_float(event_created_at)
361
+ )
362
+ return turn
363
 
364
  # ------------------------------------------------------------------
365
  # Stage attachment
 
421
  if turn.stt_duration_ms is None:
422
  turn.stt_duration_ms = turn.stt_total_latency_ms
423
  turn.eou_attributes = _sanitize_component_attributes(metric_attributes)
424
+ turn.user_turn_committed = True
425
+ turn.user_turn_committed_at = _resolved_event_timestamp(
426
+ _to_optional_float(turn.eou_attributes.get("timestamp"))
427
+ )
428
  metric_speech_id = _normalize_optional_str(
429
  turn.eou_attributes.get("speech_id")
430
  )
 
687
  # Internal helpers
688
  # ------------------------------------------------------------------
689
 
690
+ def _update_user_turn_text(
691
+ self,
692
+ turn: TraceTurn,
693
+ user_transcript: str,
694
+ *,
695
+ event_created_at: Optional[float] = None,
696
+ ) -> None:
697
+ normalized = user_transcript.strip()
698
+ if not normalized:
699
+ return
700
+ merged = _merge_user_transcripts(turn.user_transcript, normalized)
701
+ turn.user_transcript = merged
702
+ turn.prompt_text = merged
703
+ turn.user_transcript_updated_at = _resolved_event_timestamp(
704
+ _to_optional_float(event_created_at)
705
+ )
706
+
707
+ def _coalesced_turn_candidate(self) -> Optional[TraceTurn]:
708
+ if self._continuation_coalesce_window_sec <= 0.0:
709
+ return None
710
+
711
+ now = time()
712
+ for turn in reversed(self._pending_trace_turns):
713
+ if not self._can_coalesce_turn(turn):
714
+ continue
715
+ activity_at = self._turn_recent_activity_at(turn)
716
+ if activity_at is None:
717
+ continue
718
+ if now - activity_at > self._continuation_coalesce_window_sec:
719
+ return None
720
+ return turn
721
+ return None
722
+
723
+ def _can_coalesce_turn(self, turn: TraceTurn) -> bool:
724
+ if not turn.user_turn_committed:
725
+ return False
726
+ if not turn.user_transcript.strip():
727
+ return False
728
+ if turn.assistant_text.strip() or turn.response_text.strip():
729
+ return False
730
+ if turn.tool_step_announced or turn.tool_executions or turn.last_tool_event_order is not None:
731
+ return False
732
+ return bool(turn.llm_calls and turn.tts_calls)
733
+
734
+ def _turn_recent_activity_at(self, turn: TraceTurn) -> Optional[float]:
735
+ candidates = [
736
+ turn.assistant_text_updated_at,
737
+ turn.tts_updated_at,
738
+ turn.last_tool_completed_at,
739
+ turn.last_tool_event_at,
740
+ turn.user_turn_committed_at,
741
+ turn.user_transcript_updated_at,
742
+ turn.created_at,
743
+ ]
744
+ resolved = [candidate for candidate in candidates if candidate is not None]
745
+ if not resolved:
746
+ return None
747
+ return max(resolved)
748
+
749
+ def _absorb_coalesced_turn_metadata(
750
+ self,
751
+ new_turn: TraceTurn,
752
+ absorbed_turn: TraceTurn,
753
+ ) -> None:
754
+ combined_input = _merge_user_transcripts(
755
+ absorbed_turn.user_transcript,
756
+ new_turn.user_transcript,
757
+ )
758
+ new_turn.user_transcript = combined_input
759
+ new_turn.prompt_text = combined_input
760
+ new_turn.coalesced_turn_ids = [
761
+ *absorbed_turn.coalesced_turn_ids,
762
+ absorbed_turn.turn_id,
763
+ ]
764
+ new_turn.coalesced_user_transcripts = [
765
+ *absorbed_turn.coalesced_user_transcripts,
766
+ absorbed_turn.user_transcript,
767
+ ]
768
+ new_turn.coalesced_fragment_count = (
769
+ absorbed_turn.coalesced_fragment_count + 1
770
+ )
771
+
772
  def _next_turn_where(
773
  self,
774
  predicate: Callable[[TraceTurn], bool],
 
2154
  "langfuse.trace.metadata.stt_status": turn.stt_status,
2155
  "langfuse.trace.metadata.tool_phase_announced": turn.tool_step_announced,
2156
  "langfuse.trace.metadata.tool_post_response_missing": turn.tool_post_response_missing,
2157
+ "langfuse.trace.metadata.user_turn_committed": turn.user_turn_committed,
2158
+ "langfuse.trace.metadata.coalesced_turn_count": len(turn.coalesced_turn_ids),
2159
+ "langfuse.trace.metadata.coalesced_fragment_count": turn.coalesced_fragment_count,
2160
+ "langfuse.trace.metadata.coalesced_turn_ids": turn.coalesced_turn_ids,
2161
+ "langfuse.trace.metadata.coalesced_inputs": turn.coalesced_user_transcripts,
2162
  "duration_ms": _total_duration_ms(turn),
2163
  "latency_ms.user_input": vals["user_input_duration_ms"],
2164
  "latency_ms.vad": vals["vad_duration_ms"],
 
2190
  "tool.phase_announced": turn.tool_step_announced,
2191
  "tool.post_response_missing": turn.tool_post_response_missing,
2192
  "stt_status": turn.stt_status,
2193
+ "user_turn.committed": turn.user_turn_committed,
2194
  }
2195
  for key, value in attrs.items():
2196
  if value is not None:
 
2255
  return str(value)
2256
 
2257
 
2258
+ def _merge_user_transcripts(existing: str, incoming: str) -> str:
2259
+ left = existing.strip()
2260
+ right = incoming.strip()
2261
+ if not left:
2262
+ return right
2263
+ if not right:
2264
+ return left
2265
+ if left == right:
2266
+ return left
2267
+ if right.startswith(left):
2268
+ return right
2269
+ if left.startswith(right):
2270
+ return left
2271
+
2272
+ left_words = left.split()
2273
+ right_words = right.split()
2274
+ max_overlap = min(len(left_words), len(right_words))
2275
+ for overlap in range(max_overlap, 0, -1):
2276
+ left_suffix = [word.casefold() for word in left_words[-overlap:]]
2277
+ right_prefix = [word.casefold() for word in right_words[:overlap]]
2278
+ if left_suffix == right_prefix:
2279
+ merged_words = [*left_words, *right_words[overlap:]]
2280
+ return " ".join(merged_words).strip()
2281
+ return f"{left} {right}".strip()
2282
+
2283
+
2284
  def _emit_component_span(
2285
  _tracer: Any,
2286
  *,
src/core/settings.py CHANGED
@@ -60,7 +60,7 @@ class CoreSettings(BaseSettings):
60
 
61
  class VoiceSettings(CoreSettings):
62
  TTS_PROVIDER: str = Field(
63
- default="nvidia",
64
  description="TTS provider: 'pocket', 'deepgram', or 'nvidia'",
65
  )
66
  DEEPGRAM_API_KEY: Optional[str] = Field(
@@ -129,7 +129,7 @@ class VoiceSettings(CoreSettings):
129
  description="Number of audio input channels (1=mono)",
130
  )
131
  LIVEKIT_FRAME_SIZE_MS: int = Field(
132
- default=20,
133
  ge=10,
134
  le=100,
135
  description="Audio frame size in milliseconds (smaller = faster VAD response)",
@@ -153,31 +153,34 @@ class VoiceSettings(CoreSettings):
153
  description="Minimum speech duration (seconds) before VAD activation",
154
  )
155
  VAD_MIN_SILENCE_DURATION: float = Field(
156
- default=0.30,
157
  ge=0.1,
158
  le=2.0,
159
  description="Minimum silence duration (seconds) before VAD deactivation",
160
  )
161
  VAD_THRESHOLD: float = Field(
162
- default=0.6,
163
  ge=0.0,
164
  le=1.0,
165
  description="VAD activation threshold (higher = less sensitive, 0.5 is Silero default)",
166
  )
167
  MIN_ENDPOINTING_DELAY: float = Field(
168
- default=0.15,
169
  ge=0.0,
170
  le=10.0,
171
- description="Minimum endpointing delay (seconds) before committing user turn",
 
 
 
172
  )
173
  MAX_ENDPOINTING_DELAY: float = Field(
174
- default=1.5,
175
  ge=0.1,
176
  le=10.0,
177
  description="Maximum endpointing delay (seconds) when turn detector expects continuation",
178
  )
179
  PREEMPTIVE_GENERATION: bool = Field(
180
- default=True,
181
  description="Enable speculative LLM/TTS generation before final turn commit",
182
  )
183
 
@@ -280,7 +283,7 @@ class LLMSettings(CoreSettings):
280
  ),
281
  )
282
  OLLAMA_MODEL: str = Field(
283
- default= "ministral-3:14b-cloud", #"ministral-3:8b-cloud", #"qwen3-coder-next",#minimax-m2.5 #"ministral-3:8b", #"qwen2.5:7b" #"qwen3:8b" #"qwen3.5:4b",
284
  description="Ollama model tag",
285
  )
286
  OLLAMA_API_KEY: Optional[str] = Field(
@@ -290,7 +293,7 @@ class LLMSettings(CoreSettings):
290
 
291
  # Common LLM parameters
292
  LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
293
- LLM_MAX_TOKENS: int = Field(default=1024, gt=0)
294
  LLM_CONN_TIMEOUT_SEC: float = Field(
295
  default=20.0,
296
  gt=0.0,
@@ -349,7 +352,7 @@ class LiveKitSettings(CoreSettings):
349
  LIVEKIT_API_KEY: Optional[str] = Field(default=None)
350
  LIVEKIT_API_SECRET: Optional[str] = Field(default=None)
351
  LIVEKIT_AGENT_NAME: str = Field(default="open-voice-agent")
352
- LIVEKIT_NUM_IDLE_PROCESSES: int = Field(default=1, ge=0)
353
  LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC: float = Field(
354
  default=20.0,
355
  gt=0.0,
@@ -413,6 +416,15 @@ class LangfuseSettings(CoreSettings):
413
  le=10000.0,
414
  description="Best-effort tracer flush timeout in milliseconds",
415
  )
 
 
416
 
417
 
418
  class Settings(CoreSettings):
 
60
 
61
  class VoiceSettings(CoreSettings):
62
  TTS_PROVIDER: str = Field(
63
+ default="pocket",
64
  description="TTS provider: 'pocket', 'deepgram', or 'nvidia'",
65
  )
66
  DEEPGRAM_API_KEY: Optional[str] = Field(
 
129
  description="Number of audio input channels (1=mono)",
130
  )
131
  LIVEKIT_FRAME_SIZE_MS: int = Field(
132
+ default=60,
133
  ge=10,
134
  le=100,
135
  description="Audio frame size in milliseconds (smaller = faster VAD response)",
 
153
  description="Minimum speech duration (seconds) before VAD activation",
154
  )
155
  VAD_MIN_SILENCE_DURATION: float = Field(
156
+ default=0.55,
157
  ge=0.1,
158
  le=2.0,
159
  description="Minimum silence duration (seconds) before VAD deactivation",
160
  )
161
  VAD_THRESHOLD: float = Field(
162
+ default=0.5,
163
  ge=0.0,
164
  le=1.0,
165
  description="VAD activation threshold (higher = less sensitive, 0.5 is Silero default)",
166
  )
167
  MIN_ENDPOINTING_DELAY: float = Field(
168
+ default=0.5,
169
  ge=0.0,
170
  le=10.0,
171
+ description=(
172
+ "Minimum endpointing delay (seconds) before committing user turn; "
173
+ "slightly higher values reduce false turn splits"
174
+ ),
175
  )
176
  MAX_ENDPOINTING_DELAY: float = Field(
177
+ default=3.0,
178
  ge=0.1,
179
  le=10.0,
180
  description="Maximum endpointing delay (seconds) when turn detector expects continuation",
181
  )
182
  PREEMPTIVE_GENERATION: bool = Field(
183
+ default=False,
184
  description="Enable speculative LLM/TTS generation before final turn commit",
185
  )
186
 
 
283
  ),
284
  )
285
  OLLAMA_MODEL: str = Field(
286
+ default= "ministral-3:14b", #"ministral-3:14b-cloud", #"ministral-3:8b-cloud", #"qwen3-coder-next",#minimax-m2.5 #"ministral-3:8b", #"qwen2.5:7b" #"qwen3:8b" #"qwen3.5:4b",
287
  description="Ollama model tag",
288
  )
289
  OLLAMA_API_KEY: Optional[str] = Field(
 
293
 
294
  # Common LLM parameters
295
  LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
296
+ LLM_MAX_TOKENS: int = Field(default=256, gt=0)
297
  LLM_CONN_TIMEOUT_SEC: float = Field(
298
  default=20.0,
299
  gt=0.0,
 
352
  LIVEKIT_API_KEY: Optional[str] = Field(default=None)
353
  LIVEKIT_API_SECRET: Optional[str] = Field(default=None)
354
  LIVEKIT_AGENT_NAME: str = Field(default="open-voice-agent")
355
+ LIVEKIT_NUM_IDLE_PROCESSES: int = Field(default=0, ge=0)
356
  LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC: float = Field(
357
  default=20.0,
358
  gt=0.0,
 
416
  le=10000.0,
417
  description="Best-effort tracer flush timeout in milliseconds",
418
  )
419
+ LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS: float = Field(
420
+ default=1500.0,
421
+ ge=0.0,
422
+ le=10000.0,
423
+ description=(
424
+ "Window to merge an immediately-following continuation into a prior aborted "
425
+ "turn trace; set to 0 to disable"
426
+ ),
427
+ )
428
 
429
 
430
  class Settings(CoreSettings):
tests/test_langfuse_turn_tracing.py CHANGED
@@ -1676,6 +1676,328 @@ def test_creates_new_trace_for_each_finalized_transcript(
1676
  assert turn_spans[0].trace_id != turn_spans[1].trace_id
1677
 
1678
 
 
 
1679
  def test_trace_emits_without_stt_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
1680
  import src.agent.traces.metrics_collector as metrics_collector_module
1681
 
 
1676
  assert turn_spans[0].trace_id != turn_spans[1].trace_id
1677
 
1678
 
1679
+ def test_multiple_final_transcripts_are_merged_into_one_turn(
1680
+ monkeypatch: pytest.MonkeyPatch,
1681
+ ) -> None:
1682
+ import src.agent.traces.metrics_collector as metrics_collector_module
1683
+
1684
+ fake_tracer = _FakeTracer()
1685
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
1686
+
1687
+ room = _FakeRoom()
1688
+ collector = MetricsCollector(
1689
+ room=room, # type: ignore[arg-type]
1690
+ model_name="moonshine",
1691
+ room_name=room.name,
1692
+ room_id="RM123",
1693
+ participant_id="web-123",
1694
+ langfuse_enabled=True,
1695
+ )
1696
+
1697
+ async def _run() -> None:
1698
+ await collector.on_session_metadata(
1699
+ session_id="session-merged-finals",
1700
+ participant_id="web-123",
1701
+ )
1702
+ await collector.on_user_input_transcribed("What", is_final=True)
1703
+ await collector.on_user_input_transcribed(
1704
+ "the difference between speech to text and speech recognition?",
1705
+ is_final=True,
1706
+ )
1707
+ await collector.on_metrics_collected(_make_stt_metrics("stt-merged"))
1708
+ await collector.on_metrics_collected(
1709
+ _make_eou_metrics("speech-merged", delay=0.9, transcription_delay=0.2)
1710
+ )
1711
+ await collector.on_conversation_item_added(
1712
+ role="user",
1713
+ content="What the difference between speech to text and speech recognition?",
1714
+ )
1715
+ await collector.on_metrics_collected(_make_llm_metrics("speech-merged"))
1716
+ await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
1717
+ await collector.on_metrics_collected(_make_tts_metrics("speech-merged"))
1718
+ await collector.wait_for_pending_trace_tasks()
1719
+
1720
+ asyncio.run(_run())
1721
+
1722
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
1723
+ assert len(turn_spans) == 1
1724
+ root = turn_spans[0]
1725
+ stt_span = next(span for span in fake_tracer.spans if span.name == "STTMetrics")
1726
+ assert (
1727
+ root.attributes["langfuse.trace.input"]
1728
+ == "What the difference between speech to text and speech recognition?"
1729
+ )
1730
+ assert (
1731
+ stt_span.attributes["user_transcript"]
1732
+ == "What the difference between speech to text and speech recognition?"
1733
+ )
1734
+ assert root.attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
1735
+
1736
+
1737
+ def test_immediate_continuation_coalesces_aborted_prior_turn(
1738
+ monkeypatch: pytest.MonkeyPatch,
1739
+ ) -> None:
1740
+ import src.agent.traces.metrics_collector as metrics_collector_module
1741
+
1742
+ fake_tracer = _FakeTracer()
1743
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
1744
+ monkeypatch.setattr(
1745
+ metrics_collector_module.settings.langfuse,
1746
+ "LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
1747
+ 1500.0,
1748
+ )
1749
+
1750
+ room = _FakeRoom()
1751
+ collector = MetricsCollector(
1752
+ room=room, # type: ignore[arg-type]
1753
+ model_name="moonshine",
1754
+ room_name=room.name,
1755
+ room_id="RM123",
1756
+ participant_id="web-123",
1757
+ langfuse_enabled=True,
1758
+ )
1759
+
1760
+ async def _run() -> None:
1761
+ await collector.on_session_metadata(
1762
+ session_id="session-coalesce",
1763
+ participant_id="web-123",
1764
+ )
1765
+ await collector.on_user_input_transcribed("What", is_final=True)
1766
+ await collector.on_metrics_collected(
1767
+ _make_eou_metrics("speech-a", delay=0.7, transcription_delay=0.2)
1768
+ )
1769
+ await collector.on_metrics_collected(_make_llm_metrics("speech-a"))
1770
+ await collector.on_metrics_collected(_make_tts_metrics("speech-a"))
1771
+
1772
+ await collector.on_user_input_transcribed(
1773
+ "the difference between speech to text and speech recognition?",
1774
+ is_final=True,
1775
+ )
1776
+ await collector.on_metrics_collected(
1777
+ _make_eou_metrics("speech-b", delay=0.7, transcription_delay=0.2)
1778
+ )
1779
+ await collector.on_conversation_item_added(
1780
+ role="user",
1781
+ content="What the difference between speech to text and speech recognition?",
1782
+ )
1783
+ await collector.on_metrics_collected(_make_llm_metrics("speech-b"))
1784
+ await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
1785
+ await collector.on_metrics_collected(_make_tts_metrics("speech-b"))
1786
+ await collector.wait_for_pending_trace_tasks()
1787
+
1788
+ asyncio.run(_run())
1789
+
1790
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
1791
+ assert len(turn_spans) == 1
1792
+ root = turn_spans[0]
1793
+ assert (
1794
+ root.attributes["langfuse.trace.input"]
1795
+ == "What the difference between speech to text and speech recognition?"
1796
+ )
1797
+ assert root.attributes["langfuse.trace.metadata.coalesced_turn_count"] == 1
1798
+ assert root.attributes["langfuse.trace.metadata.coalesced_fragment_count"] == 1
1799
+ assert root.attributes["langfuse.trace.metadata.coalesced_inputs"] == ["What"]
1800
+
1801
+
1802
+ def test_visible_assistant_reply_prevents_continuation_coalescing(
1803
+ monkeypatch: pytest.MonkeyPatch,
1804
+ ) -> None:
1805
+ import src.agent.traces.metrics_collector as metrics_collector_module
1806
+
1807
+ fake_tracer = _FakeTracer()
1808
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
1809
+
1810
+ room = _FakeRoom()
1811
+ collector = MetricsCollector(
1812
+ room=room, # type: ignore[arg-type]
1813
+ model_name="moonshine",
1814
+ room_name=room.name,
1815
+ room_id="RM123",
1816
+ participant_id="web-123",
1817
+ langfuse_enabled=True,
1818
+ )
1819
+
1820
+ async def _run() -> None:
1821
+ await collector.on_session_metadata(
1822
+ session_id="session-no-coalesce-visible-reply",
1823
+ participant_id="web-123",
1824
+ )
1825
+ await collector.on_user_input_transcribed("first turn", is_final=True)
1826
+ await collector.on_metrics_collected(_make_stt_metrics("stt-first-visible"))
1827
+ await collector.on_metrics_collected(_make_eou_metrics("speech-first-visible"))
1828
+ await collector.on_metrics_collected(_make_llm_metrics("speech-first-visible"))
1829
+ await collector.on_conversation_item_added(role="assistant", content="first reply")
1830
+ await collector.on_metrics_collected(_make_tts_metrics("speech-first-visible"))
1831
+
1832
+ await collector.on_user_input_transcribed("second turn", is_final=True)
1833
+ await collector.on_metrics_collected(_make_stt_metrics("stt-second-visible"))
1834
+ await collector.on_metrics_collected(_make_eou_metrics("speech-second-visible"))
1835
+ await collector.on_metrics_collected(_make_llm_metrics("speech-second-visible"))
1836
+ await collector.on_conversation_item_added(role="assistant", content="second reply")
1837
+ await collector.on_metrics_collected(_make_tts_metrics("speech-second-visible"))
1838
+ await collector.wait_for_pending_trace_tasks()
1839
+
1840
+ asyncio.run(_run())
1841
+
1842
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
1843
+ assert len(turn_spans) == 2
1844
+ assert turn_spans[0].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
1845
+ assert turn_spans[1].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
1846
+
1847
+
1848
+ def test_tool_activity_prevents_continuation_coalescing(
1849
+ monkeypatch: pytest.MonkeyPatch,
1850
+ ) -> None:
1851
+ import src.agent.traces.metrics_collector as metrics_collector_module
1852
+
1853
+ fake_tracer = _FakeTracer()
1854
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
1855
+
1856
+ room = _FakeRoom()
1857
+ collector = MetricsCollector(
1858
+ room=room, # type: ignore[arg-type]
1859
+ model_name="moonshine",
1860
+ room_name=room.name,
1861
+ room_id="RM123",
1862
+ participant_id="web-123",
1863
+ langfuse_enabled=True,
1864
+ )
1865
+ collector._trace_finalize_timeout_sec = 0.05
1866
+ collector._trace_post_tool_response_timeout_sec = 0.05
1867
+
1868
+ async def _run() -> None:
1869
+ await collector.on_session_metadata(
1870
+ session_id="session-no-coalesce-tools",
1871
+ participant_id="web-123",
1872
+ )
1873
+ await collector.on_user_input_transcribed("run tool", is_final=True)
1874
+ await collector.on_metrics_collected(_make_eou_metrics("speech-tool-a"))
1875
+ await collector.on_metrics_collected(_make_llm_metrics("speech-tool-a"))
1876
+ await collector.on_metrics_collected(_make_tts_metrics("speech-tool-a"))
1877
+ await collector.on_tool_step_started()
1878
+ await collector.on_function_tools_executed(
1879
+ function_calls=[
1880
+ _FakeFunctionCall(
1881
+ name="search_web",
1882
+ call_id="tool-a",
1883
+ arguments='{"q":"speech models"}',
1884
+ created_at=1.0,
1885
+ )
1886
+ ],
1887
+ function_call_outputs=[
1888
+ _FakeFunctionCallOutput(
1889
+ output='{"results":[]}',
1890
+ is_error=False,
1891
+ created_at=1.1,
1892
+ )
1893
+ ],
1894
+ created_at=1.1,
1895
+ )
1896
+
1897
+ await collector.on_user_input_transcribed("follow-up turn", is_final=True)
1898
+ await collector.on_metrics_collected(_make_eou_metrics("speech-tool-b"))
1899
+ await collector.on_metrics_collected(_make_llm_metrics("speech-tool-b"))
1900
+ await collector.on_conversation_item_added(role="assistant", content="second reply")
1901
+ await collector.on_metrics_collected(_make_tts_metrics("speech-tool-b"))
1902
+ await asyncio.sleep(0.08)
1903
+ await collector.wait_for_pending_trace_tasks()
1904
+
1905
+ asyncio.run(_run())
1906
+
1907
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
1908
+ assert len(turn_spans) == 2
1909
+ assert turn_spans[1].attributes["langfuse.trace.metadata.coalesced_turn_count"] == 0
1910
+
1911
+
1912
+ def test_multiple_final_transcripts_share_one_llm_stall_watchdog() -> None:
1913
+ room = _FakeRoom()
1914
+ collector = MetricsCollector(
1915
+ room=room, # type: ignore[arg-type]
1916
+ model_name="moonshine",
1917
+ room_name=room.name,
1918
+ room_id="RM123",
1919
+ participant_id="web-123",
1920
+ langfuse_enabled=False,
1921
+ )
1922
+
1923
+ async def _run() -> None:
1924
+ await collector.on_user_input_transcribed("Search for the most popular", is_final=True)
1925
+ await collector.on_user_input_transcribed("test to speech model.", is_final=True)
1926
+ assert len(collector._pending_user_utterances) == 1
1927
+ assert len(collector._llm_stall_tasks) == 1
1928
+ assert (
1929
+ collector._pending_user_utterances[0].transcript
1930
+ == "Search for the most popular test to speech model."
1931
+ )
1932
+
1933
+ await collector.on_metrics_collected(_make_llm_metrics("speech-watchdog"))
1934
+ await asyncio.sleep(0)
1935
+
1936
+ asyncio.run(_run())
1937
+
1938
+ assert not collector._llm_stall_tasks
1939
+
1940
+
1941
+ def test_continuation_coalescing_can_be_disabled(
1942
+ monkeypatch: pytest.MonkeyPatch,
1943
+ ) -> None:
1944
+ import src.agent.traces.metrics_collector as metrics_collector_module
1945
+
1946
+ fake_tracer = _FakeTracer()
1947
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
1948
+ monkeypatch.setattr(
1949
+ metrics_collector_module.settings.langfuse,
1950
+ "LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS",
1951
+ 0.0,
1952
+ )
1953
+
1954
+ room = _FakeRoom()
1955
+ collector = MetricsCollector(
1956
+ room=room, # type: ignore[arg-type]
1957
+ model_name="moonshine",
1958
+ room_name=room.name,
1959
+ room_id="RM123",
1960
+ participant_id="web-123",
1961
+ langfuse_enabled=True,
1962
+ )
1963
+ collector._trace_finalize_timeout_sec = 0.01
1964
+
1965
+ async def _run() -> None:
1966
+ await collector.on_session_metadata(
1967
+ session_id="session-no-coalesce-disabled",
1968
+ participant_id="web-123",
1969
+ )
1970
+ await collector.on_user_input_transcribed("What", is_final=True)
1971
+ await collector.on_metrics_collected(
1972
+ _make_eou_metrics("speech-disabled-a", delay=0.7, transcription_delay=0.2)
1973
+ )
1974
+ await collector.on_metrics_collected(_make_llm_metrics("speech-disabled-a"))
1975
+ await collector.on_metrics_collected(_make_tts_metrics("speech-disabled-a"))
1976
+ await asyncio.sleep(0.03)
1977
+
1978
+ await collector.on_user_input_transcribed(
1979
+ "the difference between speech to text and speech recognition?",
1980
+ is_final=True,
1981
+ )
1982
+ await collector.on_metrics_collected(
1983
+ _make_eou_metrics("speech-disabled-b", delay=0.7, transcription_delay=0.2)
1984
+ )
1985
+ await collector.on_metrics_collected(_make_llm_metrics("speech-disabled-b"))
1986
+ await collector.on_conversation_item_added(role="assistant", content="Speech to text writes words down.")
1987
+ await collector.on_metrics_collected(_make_tts_metrics("speech-disabled-b"))
1988
+ await collector.wait_for_pending_trace_tasks()
1989
+
1990
+ asyncio.run(_run())
1991
+
1992
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
1993
+ assert len(turn_spans) == 2
1994
+ assert turn_spans[0].attributes["langfuse.trace.input"] == "What"
1995
+ assert (
1996
+ turn_spans[1].attributes["langfuse.trace.input"]
1997
+ == "the difference between speech to text and speech recognition?"
1998
+ )
1999
+
2000
+
2001
  def test_trace_emits_without_stt_metrics(monkeypatch: pytest.MonkeyPatch) -> None:
2002
  import src.agent.traces.metrics_collector as metrics_collector_module
2003
 
tests/test_runtime_settings.py CHANGED
@@ -3,7 +3,14 @@ from __future__ import annotations
3
  import pytest
4
  from pydantic import ValidationError
5
 
6
- from src.core.settings import LLMSettings, LiveKitSettings, STTSettings, Settings, VoiceSettings
 
 
7
 
8
 
9
  def test_llm_runtime_tuning_defaults_are_declared() -> None:
@@ -15,7 +22,7 @@ def test_llm_runtime_tuning_defaults_are_declared() -> None:
15
  assert fields["MCP_SERVER_URL"].default == "https://huggingface.co/mcp"
16
  assert fields["MCP_EXTRA_SERVER_URLS"].default == "https://docs.livekit.io/mcp"
17
  assert fields["OLLAMA_CLOUD_MODE"].default is True
18
- assert fields["OLLAMA_MODEL"].default == "ministral-3:14b-cloud"
19
  assert fields["OLLAMA_API_KEY"].default == "ollama"
20
  assert fields["LLM_CONN_TIMEOUT_SEC"].default == 20.0
21
  assert fields["LLM_CONN_MAX_RETRY"].default == 1
@@ -28,7 +35,7 @@ def test_llm_runtime_tuning_defaults_are_declared() -> None:
28
  def test_livekit_runtime_tuning_defaults_are_declared() -> None:
29
  fields = LiveKitSettings.model_fields
30
 
31
- assert fields["LIVEKIT_NUM_IDLE_PROCESSES"].default == 1
32
  assert fields["LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC"].default == 20.0
33
  assert fields["LIVEKIT_JOB_MEMORY_WARN_MB"].default == 6144
34
 
@@ -36,16 +43,25 @@ def test_livekit_runtime_tuning_defaults_are_declared() -> None:
36
  def test_voice_runtime_tuning_defaults_are_declared() -> None:
37
  fields = VoiceSettings.model_fields
38
 
39
- assert fields["TTS_PROVIDER"].default == "deepgram"
40
  assert fields["NVIDIA_TTS_VOICE"].default == "Magpie-Multilingual.EN-US.Leo"
41
  assert fields["NVIDIA_TTS_USE_SSL"].default is True
42
  assert fields["POCKET_TTS_CONN_TIMEOUT_SEC"].default == 45.0
 
 
 
43
 
44
 
45
  def test_stt_runtime_tuning_defaults_are_declared() -> None:
46
  fields = STTSettings.model_fields
47
 
48
- assert fields["STT_PROVIDER"].default == "moonshine"
49
  assert fields["DEEPGRAM_STT_MODEL"].default == "nova-3"
50
  assert fields["DEEPGRAM_STT_LANGUAGE"].default == "en-US"
51
 
 
3
  import pytest
4
  from pydantic import ValidationError
5
 
6
+ from src.core.settings import (
7
+ LLMSettings,
8
+ LangfuseSettings,
9
+ LiveKitSettings,
10
+ STTSettings,
11
+ Settings,
12
+ VoiceSettings,
13
+ )
14
 
15
 
16
  def test_llm_runtime_tuning_defaults_are_declared() -> None:
 
22
  assert fields["MCP_SERVER_URL"].default == "https://huggingface.co/mcp"
23
  assert fields["MCP_EXTRA_SERVER_URLS"].default == "https://docs.livekit.io/mcp"
24
  assert fields["OLLAMA_CLOUD_MODE"].default is True
25
+ assert fields["OLLAMA_MODEL"].default == "ministral-3:14b"
26
  assert fields["OLLAMA_API_KEY"].default == "ollama"
27
  assert fields["LLM_CONN_TIMEOUT_SEC"].default == 20.0
28
  assert fields["LLM_CONN_MAX_RETRY"].default == 1
 
35
  def test_livekit_runtime_tuning_defaults_are_declared() -> None:
36
  fields = LiveKitSettings.model_fields
37
 
38
+ assert fields["LIVEKIT_NUM_IDLE_PROCESSES"].default == 0
39
  assert fields["LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC"].default == 20.0
40
  assert fields["LIVEKIT_JOB_MEMORY_WARN_MB"].default == 6144
41
 
 
43
  def test_voice_runtime_tuning_defaults_are_declared() -> None:
44
  fields = VoiceSettings.model_fields
45
 
46
+ assert fields["TTS_PROVIDER"].default == "pocket"
47
  assert fields["NVIDIA_TTS_VOICE"].default == "Magpie-Multilingual.EN-US.Leo"
48
  assert fields["NVIDIA_TTS_USE_SSL"].default is True
49
  assert fields["POCKET_TTS_CONN_TIMEOUT_SEC"].default == 45.0
50
+ assert fields["MIN_ENDPOINTING_DELAY"].default == 0.8
51
+
52
+
53
+ def test_langfuse_runtime_tuning_defaults_are_declared() -> None:
54
+ fields = LangfuseSettings.model_fields
55
+
56
+ assert fields["LANGFUSE_TRACE_FINALIZE_TIMEOUT_MS"].default == 8000.0
57
+ assert fields["LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS"].default == 30000.0
58
+ assert fields["LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS"].default == 1500.0
59
 
60
 
61
  def test_stt_runtime_tuning_defaults_are_declared() -> None:
62
  fields = STTSettings.model_fields
63
 
64
+ assert fields["STT_PROVIDER"].default == "deepgram"
65
  assert fields["DEEPGRAM_STT_MODEL"].default == "nova-3"
66
  assert fields["DEEPGRAM_STT_LANGUAGE"].default == "en-US"
67
 
tests/test_session_conn_options.py CHANGED
@@ -2,11 +2,15 @@ from __future__ import annotations
2
 
3
  import asyncio
4
  import types
 
5
 
6
  from livekit.agents.inference_runner import _InferenceRunner
7
 
8
  from src.agent.runtime import session as runtime_session
9
- from src.core.settings import settings
 
 
 
10
 
11
 
12
  class _FakeJobContext:
@@ -70,8 +74,8 @@ def test_build_server_uses_livekit_process_initialization_settings(monkeypatch)
70
  assert server._job_memory_warn_mb == 8192.0
71
 
72
 
73
- def test_importing_session_registers_multilingual_turn_detector_runner() -> None:
74
- assert "lk_end_of_utterance_multilingual" in _InferenceRunner.registered_runners
75
 
76
 
77
  def test_resolve_stt_metrics_model_name_uses_deepgram_model(monkeypatch) -> None:
@@ -119,9 +123,12 @@ def test_session_handler_runs_llm_warmup_before_session_start(monkeypatch) -> No
119
  monkeypatch.setattr(runtime_session, "install_mcp_generate_reply_guard", lambda *args, **kwargs: None)
120
  monkeypatch.setattr(runtime_session, "run_startup_greeting", lambda *args, **kwargs: None)
121
  monkeypatch.setattr(runtime_session.silero.VAD, "load", lambda **kwargs: "vad")
122
- monkeypatch.setattr(runtime_session, "MultilingualModel", lambda: "turn-detector")
123
  monkeypatch.setattr(runtime_session.room_io, "AudioInputOptions", lambda **kwargs: kwargs)
124
  monkeypatch.setattr(runtime_session.room_io, "RoomOptions", lambda **kwargs: kwargs)
 
 
 
125
 
126
  async def _fake_run_llm_warmup(**kwargs) -> None:
127
  order.append("llm")
@@ -133,3 +140,45 @@ def test_session_handler_runs_llm_warmup_before_session_start(monkeypatch) -> No
133
  assert order == ["llm", "start"]
134
  assert len(created_sessions) == 1
135
  assert created_sessions[0].start_calls
 
 
2
 
3
  import asyncio
4
  import types
5
+ from pathlib import Path
6
 
7
  from livekit.agents.inference_runner import _InferenceRunner
8
 
9
  from src.agent.runtime import session as runtime_session
10
+ from src.core.settings import VoiceSettings, settings
11
+
12
+
13
+ ENV_EXAMPLE_PATH = Path(__file__).resolve().parents[1] / ".env.example"
14
 
15
 
16
  class _FakeJobContext:
 
74
  assert server._job_memory_warn_mb == 8192.0
75
 
76
 
77
+ def test_importing_session_registers_english_turn_detector_runner() -> None:
78
+ assert "lk_end_of_utterance_en" in _InferenceRunner.registered_runners
79
 
80
 
81
  def test_resolve_stt_metrics_model_name_uses_deepgram_model(monkeypatch) -> None:
 
123
  monkeypatch.setattr(runtime_session, "install_mcp_generate_reply_guard", lambda *args, **kwargs: None)
124
  monkeypatch.setattr(runtime_session, "run_startup_greeting", lambda *args, **kwargs: None)
125
  monkeypatch.setattr(runtime_session.silero.VAD, "load", lambda **kwargs: "vad")
126
+ monkeypatch.setattr(runtime_session, "EnglishModel", lambda: "turn-detector")
127
  monkeypatch.setattr(runtime_session.room_io, "AudioInputOptions", lambda **kwargs: kwargs)
128
  monkeypatch.setattr(runtime_session.room_io, "RoomOptions", lambda **kwargs: kwargs)
129
+ monkeypatch.setattr(settings.voice, "MIN_ENDPOINTING_DELAY", 1.0)
130
+ monkeypatch.setattr(settings.voice, "MAX_ENDPOINTING_DELAY", 4.0)
131
+ monkeypatch.setattr(settings.voice, "PREEMPTIVE_GENERATION", False)
132
 
133
  async def _fake_run_llm_warmup(**kwargs) -> None:
134
  order.append("llm")
 
140
  assert order == ["llm", "start"]
141
  assert len(created_sessions) == 1
142
  assert created_sessions[0].start_calls
143
+ assert created_sessions[0].kwargs["turn_detection"] == "turn-detector"
144
+ assert created_sessions[0].kwargs["min_endpointing_delay"] == 1.0
145
+ assert created_sessions[0].kwargs["max_endpointing_delay"] == 4.0
146
+ assert created_sessions[0].kwargs["preemptive_generation"] is False
147
+
148
+
149
+ def test_env_example_turn_profile_matches_voice_defaults() -> None:
150
+ env_values = _parse_env_file(ENV_EXAMPLE_PATH)
151
+
152
+ assert env_values["LIVEKIT_FRAME_SIZE_MS"] == str(
153
+ VoiceSettings.model_fields["LIVEKIT_FRAME_SIZE_MS"].default
154
+ )
155
+ assert env_values["VAD_MIN_SILENCE_DURATION"] == str(
156
+ VoiceSettings.model_fields["VAD_MIN_SILENCE_DURATION"].default
157
+ )
158
+ assert env_values["VAD_THRESHOLD"] == str(
159
+ VoiceSettings.model_fields["VAD_THRESHOLD"].default
160
+ )
161
+ assert env_values["MIN_ENDPOINTING_DELAY"] == str(
162
+ VoiceSettings.model_fields["MIN_ENDPOINTING_DELAY"].default
163
+ )
164
+ assert env_values["MAX_ENDPOINTING_DELAY"] == str(
165
+ VoiceSettings.model_fields["MAX_ENDPOINTING_DELAY"].default
166
+ )
167
+ assert env_values["PREEMPTIVE_GENERATION"] == _env_bool(
168
+ VoiceSettings.model_fields["PREEMPTIVE_GENERATION"].default
169
+ )
170
+
171
+
172
+ def _parse_env_file(path: Path) -> dict[str, str]:
173
+ values: dict[str, str] = {}
174
+ for raw_line in path.read_text(encoding="utf-8").splitlines():
175
+ line = raw_line.strip()
176
+ if not line or line.startswith("#") or "=" not in line:
177
+ continue
178
+ key, value = line.split("=", 1)
179
+ values[key.strip()] = value.split(" #", 1)[0].strip()
180
+ return values
181
+
182
+
183
+ def _env_bool(value: bool) -> str:
184
+ return "true" if value else "false"