dvalle08 commited on
Commit
9ef4d80
·
1 Parent(s): 3d758c5

Enhance metrics handling and add EOU support: Update ChannelPublisher and TurnTracer to include end-of-utterance (EOU) metrics, refactor latency calculations, and improve metric attribute handling. Modify UI tooltips for clarity and adjust test cases to validate new metrics structure.

Browse files
src/agent/_channel_metrics.py CHANGED
@@ -48,7 +48,13 @@ class ChannelPublisher:
48
  "speech_id": speech_id,
49
  "stage": stage,
50
  "role": role,
51
- "metrics": {"stt": None, "llm": None, "tts": None, "vad": None},
 
 
 
 
 
 
52
  "latencies": None,
53
  "diagnostic": diagnostic,
54
  }
@@ -59,6 +65,8 @@ class ChannelPublisher:
59
  **asdict(turn_metrics.stt),
60
  "display_duration": _stt_display_duration(turn_metrics.stt),
61
  }
 
 
62
  if turn_metrics.llm:
63
  payload["metrics"]["llm"] = asdict(turn_metrics.llm)
64
  if turn_metrics.tts:
@@ -154,7 +162,7 @@ def _build_partial_latencies(
154
  if not has_signal:
155
  return None
156
 
157
- baseline = eou_delay + stt_finalization_delay + llm_ttft + tts_ttfb
158
  total = max(baseline, observed_total_latency if observed_total_latency is not None else 0.0)
159
  return {
160
  "total_latency": total,
 
48
  "speech_id": speech_id,
49
  "stage": stage,
50
  "role": role,
51
+ "metrics": {
52
+ "stt": None,
53
+ "eou": None,
54
+ "llm": None,
55
+ "tts": None,
56
+ "vad": None,
57
+ },
58
  "latencies": None,
59
  "diagnostic": diagnostic,
60
  }
 
65
  **asdict(turn_metrics.stt),
66
  "display_duration": _stt_display_duration(turn_metrics.stt),
67
  }
68
+ if turn_metrics.eou:
69
+ payload["metrics"]["eou"] = asdict(turn_metrics.eou)
70
  if turn_metrics.llm:
71
  payload["metrics"]["llm"] = asdict(turn_metrics.llm)
72
  if turn_metrics.tts:
 
162
  if not has_signal:
163
  return None
164
 
165
+ baseline = eou_delay + llm_ttft + tts_ttfb
166
  total = max(baseline, observed_total_latency if observed_total_latency is not None else 0.0)
167
  return {
168
  "total_latency": total,
src/agent/_turn_tracer.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
9
  import asyncio
10
  import uuid
11
  from collections import deque
12
- from dataclasses import dataclass
13
  from time import time_ns
14
  from typing import TYPE_CHECKING, Any, Callable, Optional
15
 
@@ -40,6 +40,7 @@ class TraceTurn:
40
  stt_duration_ms: Optional[float] = None
41
  stt_finalization_ms: Optional[float] = None
42
  stt_total_latency_ms: Optional[float] = None
 
43
  llm_duration_ms: Optional[float] = None
44
  llm_ttft_ms: Optional[float] = None
45
  llm_total_latency_ms: Optional[float] = None
@@ -47,6 +48,11 @@ class TraceTurn:
47
  tts_ttfb_ms: Optional[float] = None
48
  conversational_latency_ms: Optional[float] = None
49
  llm_to_tts_handoff_ms: Optional[float] = None
 
 
 
 
 
50
  trace_id: Optional[str] = None
51
 
52
 
@@ -195,6 +201,7 @@ class TurnTracer:
195
  transcript: str,
196
  duration: float,
197
  fallback_duration: float,
 
198
  ) -> Optional[TraceTurn]:
199
  async with self._trace_lock:
200
  turn = self._next_turn_where(
@@ -213,14 +220,18 @@ class TurnTracer:
213
  else:
214
  turn.stt_duration_ms = measured_ms
215
  turn.stt_status = "measured"
 
216
  _recompute_conversational_latency(turn)
217
  return turn
218
 
219
- async def attach_vad(
220
  self,
221
  *,
222
  duration: float,
223
  transcription_delay: float,
 
 
 
224
  ) -> Optional[TraceTurn]:
225
  async with self._trace_lock:
226
  turn = self._next_turn_where(
@@ -232,10 +243,15 @@ class TurnTracer:
232
  turn.vad_duration_ms = eou_delay_ms
233
  turn.stt_finalization_ms = _duration_to_ms(transcription_delay, 0.0)
234
  turn.stt_total_latency_ms = eou_delay_ms + (turn.stt_finalization_ms or 0.0)
 
 
 
235
  if turn.stt_total_latency_ms > 0:
236
  turn.stt_status = "measured"
237
  if turn.stt_duration_ms is None:
238
  turn.stt_duration_ms = turn.stt_total_latency_ms
 
 
239
  _recompute_conversational_latency(turn)
240
  return turn
241
 
@@ -244,6 +260,7 @@ class TurnTracer:
244
  *,
245
  duration: float,
246
  ttft: float,
 
247
  ) -> Optional[TraceTurn]:
248
  async with self._trace_lock:
249
  turn = self._next_turn_where(
@@ -255,6 +272,7 @@ class TurnTracer:
255
  turn.llm_duration_ms = _duration_to_ms(duration, 0.0)
256
  turn.llm_total_latency_ms = turn.llm_duration_ms
257
  turn.llm_ttft_ms = _duration_to_ms(ttft, 0.0)
 
258
  _recompute_conversational_latency(turn)
259
  return turn
260
 
@@ -265,6 +283,7 @@ class TurnTracer:
265
  fallback_duration: float,
266
  ttfb: float,
267
  observed_total_latency: Optional[float],
 
268
  ) -> Optional[TraceTurn]:
269
  async with self._trace_lock:
270
  turn = self._next_turn_where(
@@ -276,6 +295,7 @@ class TurnTracer:
276
  return None
277
  turn.tts_duration_ms = _duration_to_ms(duration, fallback_duration)
278
  turn.tts_ttfb_ms = _duration_to_ms(ttfb, 0.0)
 
279
  _recompute_conversational_latency(turn)
280
  if observed_total_latency is not None:
281
  observed_ms = observed_total_latency * 1000.0
@@ -287,7 +307,6 @@ class TurnTracer:
287
  turn.llm_to_tts_handoff_ms = _compute_llm_to_tts_handoff_ms(
288
  total_latency_ms=turn.conversational_latency_ms,
289
  vad_duration_ms=turn.vad_duration_ms,
290
- stt_finalization_ms=turn.stt_finalization_ms,
291
  llm_ttft_ms=turn.llm_ttft_ms,
292
  tts_ttfb_ms=turn.tts_ttfb_ms,
293
  )
@@ -508,23 +527,46 @@ class TurnTracer:
508
  attributes={"user_transcript": turn.user_transcript},
509
  observation_input=turn.user_transcript,
510
  )
511
- vad_start_ns = cursor_ns
512
- cursor_ns = _emit_component_span(
513
  _tracer,
514
- name="vad",
515
  context=ctx,
516
- start_ns=cursor_ns,
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  duration_ms=vals["vad_duration_ms"],
518
- attributes={"eou_delay_ms": vals["vad_duration_ms"]},
 
 
 
 
 
 
 
 
 
519
  observation_output=str(vals["vad_duration_ms"]),
520
  )
521
  stt_end_ns = _emit_component_span(
522
  _tracer,
523
- name="stt",
524
  context=ctx,
525
- start_ns=vad_start_ns,
526
  duration_ms=vals["stt_span_duration_ms"],
527
  attributes={
 
528
  "user_transcript": turn.user_transcript,
529
  "stt_status": turn.stt_status,
530
  "stt_processing_ms": vals["stt_processing_ms"],
@@ -533,33 +575,39 @@ class TurnTracer:
533
  },
534
  observation_output=turn.user_transcript,
535
  )
536
- cursor_ns = max(cursor_ns, stt_end_ns)
537
  cursor_ns = _emit_component_span(
538
  _tracer,
539
- name="llm",
540
  context=ctx,
541
  start_ns=cursor_ns,
542
  duration_ms=vals["llm_duration_ms"],
543
- attributes={
544
- "prompt_text": turn.prompt_text,
545
- "response_text": turn.response_text,
546
- "ttft_ms": vals["llm_ttft_ms"],
547
- "llm_total_latency_ms": vals["llm_total_latency_ms"],
548
- },
 
 
 
549
  observation_input=turn.prompt_text,
550
  observation_output=turn.response_text,
551
  )
552
  cursor_ns = _emit_component_span(
553
  _tracer,
554
- name="tts",
555
  context=ctx,
556
  start_ns=cursor_ns,
557
  duration_ms=vals["tts_duration_ms"],
558
- attributes={
559
- "assistant_text": turn.assistant_text,
560
- "assistant_text_missing": turn.assistant_text_missing,
561
- "ttfb_ms": vals["tts_ttfb_ms"],
562
- },
 
 
 
563
  observation_input=turn.assistant_text,
564
  observation_output=turn.assistant_text,
565
  )
@@ -569,12 +617,11 @@ class TurnTracer:
569
  _tracer,
570
  name="conversation_latency",
571
  context=ctx,
572
- start_ns=vad_start_ns,
573
  duration_ms=conv_ms,
574
  attributes={
575
  "speech_end_to_assistant_speech_start_ms": conv_ms,
576
  "eou_delay_ms": vals["vad_duration_ms"],
577
- "stt_finalization_ms": vals["stt_finalization_ms"],
578
  "llm_ttft_ms": vals["llm_ttft_ms"],
579
  "llm_to_tts_handoff_ms": vals["llm_to_tts_handoff_ms"],
580
  "tts_ttfb_ms": vals["tts_ttfb_ms"],
@@ -583,9 +630,8 @@ class TurnTracer:
583
  )
584
  handoff_ms = vals["llm_to_tts_handoff_ms"]
585
  if handoff_ms is not None and handoff_ms > 0:
586
- handoff_start_ns = vad_start_ns + _ms_to_ns(
587
  max(vals["vad_duration_ms"], 0.0)
588
- + max(vals["stt_finalization_ms"] or 0.0, 0.0)
589
  + max(vals["llm_ttft_ms"], 0.0)
590
  )
591
  _emit_component_span(
@@ -598,7 +644,6 @@ class TurnTracer:
598
  "llm_to_tts_handoff_ms": handoff_ms,
599
  "speech_end_to_assistant_speech_start_ms": conv_ms,
600
  "eou_delay_ms": vals["vad_duration_ms"],
601
- "stt_finalization_ms": vals["stt_finalization_ms"],
602
  "llm_ttft_ms": vals["llm_ttft_ms"],
603
  "tts_ttfb_ms": vals["tts_ttfb_ms"],
604
  },
@@ -662,7 +707,6 @@ def _ms_to_ns(ms: float) -> int:
662
  def _recompute_conversational_latency(turn: TraceTurn) -> None:
663
  turn.conversational_latency_ms = _compute_conversational_latency_ms(
664
  vad_duration_ms=turn.vad_duration_ms,
665
- stt_finalization_ms=turn.stt_finalization_ms,
666
  llm_ttft_ms=turn.llm_ttft_ms,
667
  tts_ttfb_ms=turn.tts_ttfb_ms,
668
  )
@@ -671,11 +715,10 @@ def _recompute_conversational_latency(turn: TraceTurn) -> None:
671
  def _compute_conversational_latency_ms(
672
  *,
673
  vad_duration_ms: Optional[float],
674
- stt_finalization_ms: Optional[float],
675
  llm_ttft_ms: Optional[float],
676
  tts_ttfb_ms: Optional[float],
677
  ) -> Optional[float]:
678
- components = (vad_duration_ms, stt_finalization_ms, llm_ttft_ms, tts_ttfb_ms)
679
  if any(c is None for c in components):
680
  return None
681
  return sum(c for c in components if c is not None)
@@ -685,7 +728,6 @@ def _compute_llm_to_tts_handoff_ms(
685
  *,
686
  total_latency_ms: Optional[float],
687
  vad_duration_ms: Optional[float],
688
- stt_finalization_ms: Optional[float],
689
  llm_ttft_ms: Optional[float],
690
  tts_ttfb_ms: Optional[float],
691
  ) -> Optional[float]:
@@ -693,7 +735,6 @@ def _compute_llm_to_tts_handoff_ms(
693
  return None
694
  baseline = _compute_conversational_latency_ms(
695
  vad_duration_ms=vad_duration_ms,
696
- stt_finalization_ms=stt_finalization_ms,
697
  llm_ttft_ms=llm_ttft_ms,
698
  tts_ttfb_ms=tts_ttfb_ms,
699
  )
@@ -703,18 +744,14 @@ def _compute_llm_to_tts_handoff_ms(
703
 
704
 
705
  def _total_duration_ms(turn: TraceTurn) -> float:
706
- stt = (
707
- turn.stt_finalization_ms
708
- if turn.stt_finalization_ms is not None
709
- else (turn.stt_duration_ms if turn.stt_duration_ms is not None else 0.0)
710
- )
711
  llm = (
712
  turn.llm_total_latency_ms
713
  if turn.llm_total_latency_ms is not None
714
  else (turn.llm_duration_ms or 0.0)
715
  )
 
716
  calculated = (
717
- (turn.vad_duration_ms or 0.0) + stt + llm + (turn.tts_duration_ms or 0.0)
718
  )
719
  if turn.conversational_latency_ms is not None:
720
  calculated = max(calculated, turn.conversational_latency_ms)
@@ -725,6 +762,9 @@ def _prepare_span_values(turn: TraceTurn) -> dict[str, Any]:
725
  """Pre-compute derived values used by span emission."""
726
  user_input_duration_ms = 0.0 if turn.user_transcript else None
727
  vad_duration_ms = max(turn.vad_duration_ms or 0.0, 0.0)
 
 
 
728
  stt_processing_ms = (
729
  max(turn.stt_duration_ms, 0.0) if turn.stt_duration_ms is not None else None
730
  )
@@ -739,12 +779,12 @@ def _prepare_span_values(turn: TraceTurn) -> dict[str, Any]:
739
  else None
740
  )
741
  stt_span_duration_ms: Optional[float] = None
742
- if stt_total_latency_ms is not None and stt_total_latency_ms > 0:
743
- stt_span_duration_ms = stt_total_latency_ms
744
  elif stt_finalization_ms is not None and stt_finalization_ms > 0:
745
  stt_span_duration_ms = stt_finalization_ms
746
  else:
747
- stt_span_duration_ms = stt_processing_ms
748
 
749
  llm_duration_ms = max(turn.llm_duration_ms or 0.0, 0.0)
750
  llm_ttft_ms = max(turn.llm_ttft_ms or 0.0, 0.0)
@@ -765,13 +805,20 @@ def _prepare_span_values(turn: TraceTurn) -> dict[str, Any]:
765
  if turn.llm_to_tts_handoff_ms is not None
766
  else None
767
  )
 
 
 
 
 
768
  return {
769
  "user_input_duration_ms": user_input_duration_ms,
770
  "vad_duration_ms": vad_duration_ms,
 
771
  "stt_processing_ms": stt_processing_ms,
772
  "stt_finalization_ms": stt_finalization_ms,
773
  "stt_total_latency_ms": stt_total_latency_ms,
774
  "stt_span_duration_ms": stt_span_duration_ms,
 
775
  "llm_duration_ms": llm_duration_ms,
776
  "llm_ttft_ms": llm_ttft_ms,
777
  "llm_total_latency_ms": llm_total_latency_ms,
@@ -814,6 +861,9 @@ def _set_root_attributes(
814
  "latency_ms.stt_processing": vals["stt_processing_ms"],
815
  "latency_ms.stt_finalization": vals["stt_finalization_ms"],
816
  "latency_ms.stt_total": vals["stt_total_latency_ms"],
 
 
 
817
  "latency_ms.llm": vals["llm_duration_ms"],
818
  "latency_ms.llm_ttft": vals["llm_ttft_ms"],
819
  "latency_ms.llm_total": vals["llm_total_latency_ms"],
@@ -831,6 +881,45 @@ def _set_root_attributes(
831
  span.set_attribute(key, value)
832
 
833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
  def _emit_component_span(
835
  _tracer: Any,
836
  *,
 
9
  import asyncio
10
  import uuid
11
  from collections import deque
12
+ from dataclasses import dataclass, field
13
  from time import time_ns
14
  from typing import TYPE_CHECKING, Any, Callable, Optional
15
 
 
40
  stt_duration_ms: Optional[float] = None
41
  stt_finalization_ms: Optional[float] = None
42
  stt_total_latency_ms: Optional[float] = None
43
+ eou_on_user_turn_completed_ms: Optional[float] = None
44
  llm_duration_ms: Optional[float] = None
45
  llm_ttft_ms: Optional[float] = None
46
  llm_total_latency_ms: Optional[float] = None
 
48
  tts_ttfb_ms: Optional[float] = None
49
  conversational_latency_ms: Optional[float] = None
50
  llm_to_tts_handoff_ms: Optional[float] = None
51
+ stt_attributes: dict[str, Any] = field(default_factory=dict)
52
+ eou_attributes: dict[str, Any] = field(default_factory=dict)
53
+ vad_attributes: dict[str, Any] = field(default_factory=dict)
54
+ llm_attributes: dict[str, Any] = field(default_factory=dict)
55
+ tts_attributes: dict[str, Any] = field(default_factory=dict)
56
  trace_id: Optional[str] = None
57
 
58
 
 
201
  transcript: str,
202
  duration: float,
203
  fallback_duration: float,
204
+ metric_attributes: Optional[dict[str, Any]] = None,
205
  ) -> Optional[TraceTurn]:
206
  async with self._trace_lock:
207
  turn = self._next_turn_where(
 
220
  else:
221
  turn.stt_duration_ms = measured_ms
222
  turn.stt_status = "measured"
223
+ turn.stt_attributes = _sanitize_component_attributes(metric_attributes)
224
  _recompute_conversational_latency(turn)
225
  return turn
226
 
227
+ async def attach_eou(
228
  self,
229
  *,
230
  duration: float,
231
  transcription_delay: float,
232
+ on_user_turn_completed_delay: float = 0.0,
233
+ metric_attributes: Optional[dict[str, Any]] = None,
234
+ vad_metric_attributes: Optional[dict[str, Any]] = None,
235
  ) -> Optional[TraceTurn]:
236
  async with self._trace_lock:
237
  turn = self._next_turn_where(
 
243
  turn.vad_duration_ms = eou_delay_ms
244
  turn.stt_finalization_ms = _duration_to_ms(transcription_delay, 0.0)
245
  turn.stt_total_latency_ms = eou_delay_ms + (turn.stt_finalization_ms or 0.0)
246
+ turn.eou_on_user_turn_completed_ms = _duration_to_ms(
247
+ on_user_turn_completed_delay, 0.0
248
+ )
249
  if turn.stt_total_latency_ms > 0:
250
  turn.stt_status = "measured"
251
  if turn.stt_duration_ms is None:
252
  turn.stt_duration_ms = turn.stt_total_latency_ms
253
+ turn.eou_attributes = _sanitize_component_attributes(metric_attributes)
254
+ turn.vad_attributes = _sanitize_component_attributes(vad_metric_attributes)
255
  _recompute_conversational_latency(turn)
256
  return turn
257
 
 
260
  *,
261
  duration: float,
262
  ttft: float,
263
+ metric_attributes: Optional[dict[str, Any]] = None,
264
  ) -> Optional[TraceTurn]:
265
  async with self._trace_lock:
266
  turn = self._next_turn_where(
 
272
  turn.llm_duration_ms = _duration_to_ms(duration, 0.0)
273
  turn.llm_total_latency_ms = turn.llm_duration_ms
274
  turn.llm_ttft_ms = _duration_to_ms(ttft, 0.0)
275
+ turn.llm_attributes = _sanitize_component_attributes(metric_attributes)
276
  _recompute_conversational_latency(turn)
277
  return turn
278
 
 
283
  fallback_duration: float,
284
  ttfb: float,
285
  observed_total_latency: Optional[float],
286
+ metric_attributes: Optional[dict[str, Any]] = None,
287
  ) -> Optional[TraceTurn]:
288
  async with self._trace_lock:
289
  turn = self._next_turn_where(
 
295
  return None
296
  turn.tts_duration_ms = _duration_to_ms(duration, fallback_duration)
297
  turn.tts_ttfb_ms = _duration_to_ms(ttfb, 0.0)
298
+ turn.tts_attributes = _sanitize_component_attributes(metric_attributes)
299
  _recompute_conversational_latency(turn)
300
  if observed_total_latency is not None:
301
  observed_ms = observed_total_latency * 1000.0
 
307
  turn.llm_to_tts_handoff_ms = _compute_llm_to_tts_handoff_ms(
308
  total_latency_ms=turn.conversational_latency_ms,
309
  vad_duration_ms=turn.vad_duration_ms,
 
310
  llm_ttft_ms=turn.llm_ttft_ms,
311
  tts_ttfb_ms=turn.tts_ttfb_ms,
312
  )
 
527
  attributes={"user_transcript": turn.user_transcript},
528
  observation_input=turn.user_transcript,
529
  )
530
+ speech_end_start_ns = cursor_ns
531
+ _emit_component_span(
532
  _tracer,
533
+ name="VADMetrics",
534
  context=ctx,
535
+ start_ns=speech_end_start_ns,
536
+ duration_ms=vals["vad_metrics_duration_ms"],
537
+ attributes=_merge_component_attributes(
538
+ turn.vad_attributes,
539
+ {
540
+ "eou_delay_ms": vals["vad_duration_ms"],
541
+ },
542
+ ),
543
+ )
544
+ eou_end_ns = _emit_component_span(
545
+ _tracer,
546
+ name="EOUMetrics",
547
+ context=ctx,
548
+ start_ns=speech_end_start_ns,
549
  duration_ms=vals["vad_duration_ms"],
550
+ attributes=_merge_component_attributes(
551
+ turn.eou_attributes,
552
+ {
553
+ "end_of_utterance_delay_ms": vals["vad_duration_ms"],
554
+ "transcription_delay_ms": vals["stt_finalization_ms"],
555
+ "on_user_turn_completed_delay_ms": vals[
556
+ "eou_on_user_turn_completed_ms"
557
+ ],
558
+ },
559
+ ),
560
  observation_output=str(vals["vad_duration_ms"]),
561
  )
562
  stt_end_ns = _emit_component_span(
563
  _tracer,
564
+ name="STTMetrics",
565
  context=ctx,
566
+ start_ns=speech_end_start_ns,
567
  duration_ms=vals["stt_span_duration_ms"],
568
  attributes={
569
+ **turn.stt_attributes,
570
  "user_transcript": turn.user_transcript,
571
  "stt_status": turn.stt_status,
572
  "stt_processing_ms": vals["stt_processing_ms"],
 
575
  },
576
  observation_output=turn.user_transcript,
577
  )
578
+ cursor_ns = max(cursor_ns, eou_end_ns, stt_end_ns)
579
  cursor_ns = _emit_component_span(
580
  _tracer,
581
+ name="LLMMetrics",
582
  context=ctx,
583
  start_ns=cursor_ns,
584
  duration_ms=vals["llm_duration_ms"],
585
+ attributes=_merge_component_attributes(
586
+ turn.llm_attributes,
587
+ {
588
+ "prompt_text": turn.prompt_text,
589
+ "response_text": turn.response_text,
590
+ "ttft_ms": vals["llm_ttft_ms"],
591
+ "llm_total_latency_ms": vals["llm_total_latency_ms"],
592
+ },
593
+ ),
594
  observation_input=turn.prompt_text,
595
  observation_output=turn.response_text,
596
  )
597
  cursor_ns = _emit_component_span(
598
  _tracer,
599
+ name="TTSMetrics",
600
  context=ctx,
601
  start_ns=cursor_ns,
602
  duration_ms=vals["tts_duration_ms"],
603
+ attributes=_merge_component_attributes(
604
+ turn.tts_attributes,
605
+ {
606
+ "assistant_text": turn.assistant_text,
607
+ "assistant_text_missing": turn.assistant_text_missing,
608
+ "ttfb_ms": vals["tts_ttfb_ms"],
609
+ },
610
+ ),
611
  observation_input=turn.assistant_text,
612
  observation_output=turn.assistant_text,
613
  )
 
617
  _tracer,
618
  name="conversation_latency",
619
  context=ctx,
620
+ start_ns=speech_end_start_ns,
621
  duration_ms=conv_ms,
622
  attributes={
623
  "speech_end_to_assistant_speech_start_ms": conv_ms,
624
  "eou_delay_ms": vals["vad_duration_ms"],
 
625
  "llm_ttft_ms": vals["llm_ttft_ms"],
626
  "llm_to_tts_handoff_ms": vals["llm_to_tts_handoff_ms"],
627
  "tts_ttfb_ms": vals["tts_ttfb_ms"],
 
630
  )
631
  handoff_ms = vals["llm_to_tts_handoff_ms"]
632
  if handoff_ms is not None and handoff_ms > 0:
633
+ handoff_start_ns = speech_end_start_ns + _ms_to_ns(
634
  max(vals["vad_duration_ms"], 0.0)
 
635
  + max(vals["llm_ttft_ms"], 0.0)
636
  )
637
  _emit_component_span(
 
644
  "llm_to_tts_handoff_ms": handoff_ms,
645
  "speech_end_to_assistant_speech_start_ms": conv_ms,
646
  "eou_delay_ms": vals["vad_duration_ms"],
 
647
  "llm_ttft_ms": vals["llm_ttft_ms"],
648
  "tts_ttfb_ms": vals["tts_ttfb_ms"],
649
  },
 
707
  def _recompute_conversational_latency(turn: TraceTurn) -> None:
708
  turn.conversational_latency_ms = _compute_conversational_latency_ms(
709
  vad_duration_ms=turn.vad_duration_ms,
 
710
  llm_ttft_ms=turn.llm_ttft_ms,
711
  tts_ttfb_ms=turn.tts_ttfb_ms,
712
  )
 
715
  def _compute_conversational_latency_ms(
716
  *,
717
  vad_duration_ms: Optional[float],
 
718
  llm_ttft_ms: Optional[float],
719
  tts_ttfb_ms: Optional[float],
720
  ) -> Optional[float]:
721
+ components = (vad_duration_ms, llm_ttft_ms, tts_ttfb_ms)
722
  if any(c is None for c in components):
723
  return None
724
  return sum(c for c in components if c is not None)
 
728
  *,
729
  total_latency_ms: Optional[float],
730
  vad_duration_ms: Optional[float],
 
731
  llm_ttft_ms: Optional[float],
732
  tts_ttfb_ms: Optional[float],
733
  ) -> Optional[float]:
 
735
  return None
736
  baseline = _compute_conversational_latency_ms(
737
  vad_duration_ms=vad_duration_ms,
 
738
  llm_ttft_ms=llm_ttft_ms,
739
  tts_ttfb_ms=tts_ttfb_ms,
740
  )
 
744
 
745
 
746
  def _total_duration_ms(turn: TraceTurn) -> float:
 
 
 
 
 
747
  llm = (
748
  turn.llm_total_latency_ms
749
  if turn.llm_total_latency_ms is not None
750
  else (turn.llm_duration_ms or 0.0)
751
  )
752
+ handoff = turn.llm_to_tts_handoff_ms or 0.0
753
  calculated = (
754
+ (turn.vad_duration_ms or 0.0) + llm + handoff + (turn.tts_duration_ms or 0.0)
755
  )
756
  if turn.conversational_latency_ms is not None:
757
  calculated = max(calculated, turn.conversational_latency_ms)
 
762
  """Pre-compute derived values used by span emission."""
763
  user_input_duration_ms = 0.0 if turn.user_transcript else None
764
  vad_duration_ms = max(turn.vad_duration_ms or 0.0, 0.0)
765
+ vad_metrics_duration_ms = _duration_attribute_to_ms(
766
+ turn.vad_attributes.get("inference_duration_total")
767
+ )
768
  stt_processing_ms = (
769
  max(turn.stt_duration_ms, 0.0) if turn.stt_duration_ms is not None else None
770
  )
 
779
  else None
780
  )
781
  stt_span_duration_ms: Optional[float] = None
782
+ if stt_processing_ms is not None and stt_processing_ms > 0:
783
+ stt_span_duration_ms = stt_processing_ms
784
  elif stt_finalization_ms is not None and stt_finalization_ms > 0:
785
  stt_span_duration_ms = stt_finalization_ms
786
  else:
787
+ stt_span_duration_ms = stt_total_latency_ms
788
 
789
  llm_duration_ms = max(turn.llm_duration_ms or 0.0, 0.0)
790
  llm_ttft_ms = max(turn.llm_ttft_ms or 0.0, 0.0)
 
805
  if turn.llm_to_tts_handoff_ms is not None
806
  else None
807
  )
808
+ eou_on_user_turn_completed_ms = (
809
+ max(turn.eou_on_user_turn_completed_ms, 0.0)
810
+ if turn.eou_on_user_turn_completed_ms is not None
811
+ else None
812
+ )
813
  return {
814
  "user_input_duration_ms": user_input_duration_ms,
815
  "vad_duration_ms": vad_duration_ms,
816
+ "vad_metrics_duration_ms": vad_metrics_duration_ms,
817
  "stt_processing_ms": stt_processing_ms,
818
  "stt_finalization_ms": stt_finalization_ms,
819
  "stt_total_latency_ms": stt_total_latency_ms,
820
  "stt_span_duration_ms": stt_span_duration_ms,
821
+ "eou_on_user_turn_completed_ms": eou_on_user_turn_completed_ms,
822
  "llm_duration_ms": llm_duration_ms,
823
  "llm_ttft_ms": llm_ttft_ms,
824
  "llm_total_latency_ms": llm_total_latency_ms,
 
861
  "latency_ms.stt_processing": vals["stt_processing_ms"],
862
  "latency_ms.stt_finalization": vals["stt_finalization_ms"],
863
  "latency_ms.stt_total": vals["stt_total_latency_ms"],
864
+ "latency_ms.eou_on_user_turn_completed": vals[
865
+ "eou_on_user_turn_completed_ms"
866
+ ],
867
  "latency_ms.llm": vals["llm_duration_ms"],
868
  "latency_ms.llm_ttft": vals["llm_ttft_ms"],
869
  "latency_ms.llm_total": vals["llm_total_latency_ms"],
 
881
  span.set_attribute(key, value)
882
 
883
 
884
+ def _sanitize_component_attributes(
885
+ attributes: Optional[dict[str, Any]],
886
+ ) -> dict[str, Any]:
887
+ if not attributes:
888
+ return {}
889
+ sanitized: dict[str, Any] = {}
890
+ for key, value in attributes.items():
891
+ if value is None:
892
+ continue
893
+ sanitized[key] = _safe_attribute_value(value)
894
+ return sanitized
895
+
896
+
897
+ def _merge_component_attributes(
898
+ existing: dict[str, Any],
899
+ extra: dict[str, Any],
900
+ ) -> dict[str, Any]:
901
+ merged = dict(existing)
902
+ for key, value in extra.items():
903
+ if value is None:
904
+ continue
905
+ merged[key] = _safe_attribute_value(value)
906
+ return merged
907
+
908
+
909
+ def _safe_attribute_value(value: Any) -> Any:
910
+ if isinstance(value, (str, bool, int, float)):
911
+ return value
912
+ if isinstance(value, (list, tuple)):
913
+ return [_safe_attribute_value(v) for v in value]
914
+ return str(value)
915
+
916
+
917
+ def _duration_attribute_to_ms(value: Any) -> Optional[float]:
918
+ if isinstance(value, (int, float)):
919
+ return max(float(value), 0.0) * 1000.0
920
+ return None
921
+
922
+
923
  def _emit_component_span(
924
  _tracer: Any,
925
  *,
src/agent/metrics_collector.py CHANGED
@@ -29,37 +29,80 @@ from src.core.settings import settings
29
  class STTMetrics:
30
  """Speech-to-text metrics."""
31
 
 
 
 
 
32
  model_name: str
33
- audio_duration: float
34
  duration: float
 
 
 
35
 
36
 
37
  @dataclass
38
  class LLMMetrics:
39
  """Language model metrics."""
40
 
41
- ttft: float
 
 
 
42
  duration: float
43
- tokens: int
 
 
 
 
 
44
  tokens_per_second: float
 
 
45
 
46
 
47
  @dataclass
48
  class TTSMetrics:
49
  """Text-to-speech metrics."""
50
 
51
- ttfb: float
 
 
 
52
  duration: float
 
53
  audio_duration: float
 
 
 
 
 
 
54
 
55
 
56
  @dataclass
57
  class VADMetrics:
58
  """Voice activity detection metrics."""
59
 
 
 
 
60
  idle_time: float
61
  inference_duration_total: float
62
  inference_count: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  @dataclass
@@ -84,6 +127,7 @@ class TurnMetrics:
84
  role: str
85
  transcript: str = ""
86
  stt: Optional[STTMetrics] = None
 
87
  llm: Optional[LLMMetrics] = None
88
  tts: Optional[TTSMetrics] = None
89
  vad: Optional[VADMetrics] = None
@@ -97,7 +141,7 @@ class TurnMetrics:
97
  ) -> None:
98
  llm_ttft = self.llm.ttft if self.llm else 0.0
99
  tts_ttfb = self.tts.ttfb if self.tts else 0.0
100
- baseline = eou_delay + stt_finalization_delay + llm_ttft + tts_ttfb
101
  observed = observed_total_latency if observed_total_latency is not None else 0.0
102
  total = max(baseline, observed)
103
  self.latencies = Latencies(
@@ -129,6 +173,7 @@ class TurnMetrics:
129
  "transcript": self.transcript,
130
  "metrics": {
131
  "stt": stt_metrics,
 
132
  "llm": asdict(self.llm) if self.llm else None,
133
  "tts": asdict(self.tts) if self.tts else None,
134
  "vad": asdict(self.vad) if self.vad else None,
@@ -150,6 +195,7 @@ class TurnState:
150
  """Per-speech-id state consolidating turn metrics and timing data."""
151
 
152
  metrics: Optional[TurnMetrics] = None
 
153
  eou_delay: float = 0.0
154
  stt_finalization_delay: float = 0.0
155
  speech_end_monotonic: Optional[float] = None
@@ -202,6 +248,8 @@ class MetricsCollector:
202
  self._turns: dict[str, TurnState] = {}
203
  self._pending_llm_watchdog_ids: deque[str] = deque()
204
  self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
 
 
205
  self._llm_stall_timeout_sec = max(
206
  float(
207
  getattr(
@@ -362,7 +410,17 @@ class MetricsCollector:
362
  speech_id = self._latest_agent_speech_id or f"tts-{uuid.uuid4()}"
363
  turn_metrics = self._get_or_create_turn(speech_id, role="agent")
364
  turn_metrics.tts = TTSMetrics(
365
- ttfb=ttfb, duration=duration, audio_duration=audio_duration
 
 
 
 
 
 
 
 
 
 
366
  )
367
  await self._publish_live_update(speech_id=speech_id, stage="tts", turn_metrics=turn_metrics)
368
  logger.debug("TTS fallback metrics collected: speech_id=%s, ttfb=%.3fs", speech_id, ttfb)
@@ -373,6 +431,19 @@ class MetricsCollector:
373
  fallback_duration=audio_duration,
374
  ttfb=ttfb,
375
  observed_total_latency=self._observed_total_latency(speech_id),
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  )
377
  await self._tracer.maybe_finalize(trace_turn)
378
 
@@ -396,9 +467,15 @@ class MetricsCollector:
396
  if self._pending_transcripts:
397
  turn_metrics.transcript = self._pending_transcripts.popleft()
398
  turn_metrics.stt = STTMetrics(
 
 
 
 
399
  model_name=self._model_name,
400
- audio_duration=collected_metrics.audio_duration,
401
  duration=collected_metrics.duration,
 
 
 
402
  )
403
  await self._publish_live_update(speech_id=speech_id, stage="stt", turn_metrics=turn_metrics)
404
  logger.debug("STT metrics collected: request_id=%s, duration=%.3fs", speech_id, collected_metrics.duration)
@@ -406,6 +483,7 @@ class MetricsCollector:
406
  transcript=turn_metrics.transcript,
407
  duration=collected_metrics.duration,
408
  fallback_duration=collected_metrics.audio_duration,
 
409
  )
410
 
411
  elif isinstance(collected_metrics, metrics.LLMMetrics):
@@ -416,29 +494,46 @@ class MetricsCollector:
416
  if self._pending_agent_transcripts and not turn_metrics.transcript:
417
  turn_metrics.transcript = self._pending_agent_transcripts.popleft()
418
  turn_metrics.llm = LLMMetrics(
419
- ttft=collected_metrics.ttft,
 
 
 
420
  duration=collected_metrics.duration,
421
- tokens=collected_metrics.completion_tokens,
422
- tokens_per_second=(
423
- collected_metrics.completion_tokens / collected_metrics.duration
424
- if collected_metrics.duration > 0
425
- else 0.0
426
- ),
 
 
 
427
  )
428
  await self._publish_live_update(speech_id=speech_id, stage="llm", turn_metrics=turn_metrics)
429
  logger.debug("LLM metrics collected: speech_id=%s, ttft=%.3fs", speech_id, collected_metrics.ttft)
430
  trace_turn = await self._tracer.attach_llm(
431
  duration=collected_metrics.duration,
432
  ttft=collected_metrics.ttft,
 
433
  )
434
 
435
  elif isinstance(collected_metrics, metrics.TTSMetrics):
436
  speech_id = collected_metrics.speech_id or collected_metrics.request_id
437
  turn_metrics = self._get_or_create_turn(speech_id, role="agent")
438
  turn_metrics.tts = TTSMetrics(
439
- ttfb=collected_metrics.ttfb,
 
 
 
440
  duration=collected_metrics.duration,
 
441
  audio_duration=collected_metrics.audio_duration,
 
 
 
 
 
 
442
  )
443
  await self._publish_live_update(speech_id=speech_id, stage="tts", turn_metrics=turn_metrics)
444
  logger.debug("TTS metrics collected: speech_id=%s, ttfb=%.3fs", speech_id, collected_metrics.ttfb)
@@ -447,6 +542,7 @@ class MetricsCollector:
447
  fallback_duration=collected_metrics.audio_duration,
448
  ttfb=collected_metrics.ttfb,
449
  observed_total_latency=self._observed_total_latency(speech_id),
 
450
  )
451
 
452
  elif isinstance(collected_metrics, metrics.EOUMetrics):
@@ -457,39 +553,57 @@ class MetricsCollector:
457
  state.speech_end_monotonic = monotonic()
458
  state.eou_delay = collected_metrics.end_of_utterance_delay
459
  state.stt_finalization_delay = collected_metrics.transcription_delay
 
 
 
 
 
 
 
 
 
460
  turn_metrics = state.metrics
 
 
 
 
461
  await self._publish_live_update(
462
  speech_id=speech_id,
463
  stage="eou",
464
  turn_metrics=turn_metrics,
465
  )
466
  logger.debug("EOU metrics collected: speech_id=%s, delay=%.3fs", speech_id, collected_metrics.end_of_utterance_delay)
467
- trace_turn = await self._tracer.attach_vad(
468
  duration=collected_metrics.end_of_utterance_delay,
469
  transcription_delay=collected_metrics.transcription_delay,
 
 
 
470
  )
471
 
472
  elif isinstance(collected_metrics, metrics.VADMetrics):
473
  speech_id = getattr(collected_metrics, "speech_id", None)
 
 
 
 
 
 
 
 
 
 
474
  if speech_id:
475
  state = self._turns.get(speech_id)
476
  turn_metrics = state.metrics if state else None
477
  if speech_id and turn_metrics:
478
- turn_metrics.vad = VADMetrics(
479
- idle_time=collected_metrics.idle_time,
480
- inference_duration_total=collected_metrics.inference_duration_total,
481
- inference_count=collected_metrics.inference_count,
482
- )
483
  await self._publisher.publish_live_update(
484
  speech_id=speech_id,
485
  stage="vad",
486
  role=turn_metrics.role if turn_metrics else None,
487
  turn_metrics=turn_metrics,
488
- vad_metrics=VADMetrics(
489
- idle_time=collected_metrics.idle_time,
490
- inference_duration_total=collected_metrics.inference_duration_total,
491
- inference_count=collected_metrics.inference_count,
492
- ),
493
  diagnostic=not bool(speech_id and turn_metrics),
494
  eou_delay=self._turns[speech_id].eou_delay if speech_id and speech_id in self._turns else 0.0,
495
  stt_finalization_delay=self._turns[speech_id].stt_finalization_delay if speech_id and speech_id in self._turns else 0.0,
@@ -741,3 +855,127 @@ def _extract_text_from_chat_items(chat_items: Any) -> str:
741
  if text.strip():
742
  parts.append(text.strip())
743
  return parts[-1] if parts else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class STTMetrics:
30
  """Speech-to-text metrics."""
31
 
32
+ type: str
33
+ label: str
34
+ request_id: str
35
+ timestamp: float
36
  model_name: str
 
37
  duration: float
38
+ audio_duration: float
39
+ streamed: bool
40
+ metadata: Optional[dict[str, Any]] = None
41
 
42
 
43
  @dataclass
44
  class LLMMetrics:
45
  """Language model metrics."""
46
 
47
+ type: str
48
+ label: str
49
+ request_id: str
50
+ timestamp: float
51
  duration: float
52
+ ttft: float
53
+ cancelled: bool
54
+ completion_tokens: int
55
+ prompt_tokens: int
56
+ prompt_cached_tokens: int
57
+ total_tokens: int
58
  tokens_per_second: float
59
+ speech_id: Optional[str] = None
60
+ metadata: Optional[dict[str, Any]] = None
61
 
62
 
63
  @dataclass
64
  class TTSMetrics:
65
  """Text-to-speech metrics."""
66
 
67
+ type: str
68
+ label: str
69
+ request_id: str
70
+ timestamp: float
71
  duration: float
72
+ ttfb: float
73
  audio_duration: float
74
+ cancelled: bool
75
+ characters_count: int
76
+ streamed: bool
77
+ segment_id: Optional[str] = None
78
+ speech_id: Optional[str] = None
79
+ metadata: Optional[dict[str, Any]] = None
80
 
81
 
82
  @dataclass
83
  class VADMetrics:
84
  """Voice activity detection metrics."""
85
 
86
+ type: str
87
+ label: str
88
+ timestamp: float
89
  idle_time: float
90
  inference_duration_total: float
91
  inference_count: int
92
+ metadata: Optional[dict[str, Any]] = None
93
+
94
+
95
+ @dataclass
96
+ class EOUMetrics:
97
+ """End-of-utterance metrics."""
98
+
99
+ type: str
100
+ timestamp: float
101
+ end_of_utterance_delay: float
102
+ transcription_delay: float
103
+ on_user_turn_completed_delay: float
104
+ speech_id: Optional[str] = None
105
+ metadata: Optional[dict[str, Any]] = None
106
 
107
 
108
  @dataclass
 
127
  role: str
128
  transcript: str = ""
129
  stt: Optional[STTMetrics] = None
130
+ eou: Optional[EOUMetrics] = None
131
  llm: Optional[LLMMetrics] = None
132
  tts: Optional[TTSMetrics] = None
133
  vad: Optional[VADMetrics] = None
 
141
  ) -> None:
142
  llm_ttft = self.llm.ttft if self.llm else 0.0
143
  tts_ttfb = self.tts.ttfb if self.tts else 0.0
144
+ baseline = eou_delay + llm_ttft + tts_ttfb
145
  observed = observed_total_latency if observed_total_latency is not None else 0.0
146
  total = max(baseline, observed)
147
  self.latencies = Latencies(
 
173
  "transcript": self.transcript,
174
  "metrics": {
175
  "stt": stt_metrics,
176
+ "eou": asdict(self.eou) if self.eou else None,
177
  "llm": asdict(self.llm) if self.llm else None,
178
  "tts": asdict(self.tts) if self.tts else None,
179
  "vad": asdict(self.vad) if self.vad else None,
 
195
  """Per-speech-id state consolidating turn metrics and timing data."""
196
 
197
  metrics: Optional[TurnMetrics] = None
198
+ eou_metrics: Optional[EOUMetrics] = None
199
  eou_delay: float = 0.0
200
  stt_finalization_delay: float = 0.0
201
  speech_end_monotonic: Optional[float] = None
 
248
  self._turns: dict[str, TurnState] = {}
249
  self._pending_llm_watchdog_ids: deque[str] = deque()
250
  self._llm_stall_tasks: dict[str, asyncio.Task[None]] = {}
251
+ self._latest_vad_metrics: Optional[VADMetrics] = None
252
+ self._latest_vad_metric_attributes: Optional[dict[str, Any]] = None
253
  self._llm_stall_timeout_sec = max(
254
  float(
255
  getattr(
 
410
  speech_id = self._latest_agent_speech_id or f"tts-{uuid.uuid4()}"
411
  turn_metrics = self._get_or_create_turn(speech_id, role="agent")
412
  turn_metrics.tts = TTSMetrics(
413
+ type="tts_metrics",
414
+ label="tts_fallback",
415
+ request_id=f"fallback-{speech_id}",
416
+ timestamp=time(),
417
+ duration=duration,
418
+ ttfb=ttfb,
419
+ audio_duration=audio_duration,
420
+ cancelled=False,
421
+ characters_count=0,
422
+ streamed=True,
423
+ speech_id=speech_id,
424
  )
425
  await self._publish_live_update(speech_id=speech_id, stage="tts", turn_metrics=turn_metrics)
426
  logger.debug("TTS fallback metrics collected: speech_id=%s, ttfb=%.3fs", speech_id, ttfb)
 
431
  fallback_duration=audio_duration,
432
  ttfb=ttfb,
433
  observed_total_latency=self._observed_total_latency(speech_id),
434
+ metric_attributes={
435
+ "type": "tts_metrics",
436
+ "label": "tts_fallback",
437
+ "request_id": f"fallback-{speech_id}",
438
+ "timestamp": time(),
439
+ "duration": duration,
440
+ "ttfb": ttfb,
441
+ "audio_duration": audio_duration,
442
+ "cancelled": False,
443
+ "characters_count": 0,
444
+ "streamed": True,
445
+ "speech_id": speech_id,
446
+ },
447
  )
448
  await self._tracer.maybe_finalize(trace_turn)
449
 
 
467
  if self._pending_transcripts:
468
  turn_metrics.transcript = self._pending_transcripts.popleft()
469
  turn_metrics.stt = STTMetrics(
470
+ type=collected_metrics.type,
471
+ label=collected_metrics.label,
472
+ request_id=collected_metrics.request_id,
473
+ timestamp=collected_metrics.timestamp,
474
  model_name=self._model_name,
 
475
  duration=collected_metrics.duration,
476
+ audio_duration=collected_metrics.audio_duration,
477
+ streamed=collected_metrics.streamed,
478
+ metadata=_metric_metadata_to_dict(collected_metrics.metadata),
479
  )
480
  await self._publish_live_update(speech_id=speech_id, stage="stt", turn_metrics=turn_metrics)
481
  logger.debug("STT metrics collected: request_id=%s, duration=%.3fs", speech_id, collected_metrics.duration)
 
483
  transcript=turn_metrics.transcript,
484
  duration=collected_metrics.duration,
485
  fallback_duration=collected_metrics.audio_duration,
486
+ metric_attributes=_stt_metric_attributes(collected_metrics),
487
  )
488
 
489
  elif isinstance(collected_metrics, metrics.LLMMetrics):
 
494
  if self._pending_agent_transcripts and not turn_metrics.transcript:
495
  turn_metrics.transcript = self._pending_agent_transcripts.popleft()
496
  turn_metrics.llm = LLMMetrics(
497
+ type=collected_metrics.type,
498
+ label=collected_metrics.label,
499
+ request_id=collected_metrics.request_id,
500
+ timestamp=collected_metrics.timestamp,
501
  duration=collected_metrics.duration,
502
+ ttft=collected_metrics.ttft,
503
+ cancelled=collected_metrics.cancelled,
504
+ completion_tokens=collected_metrics.completion_tokens,
505
+ prompt_tokens=collected_metrics.prompt_tokens,
506
+ prompt_cached_tokens=collected_metrics.prompt_cached_tokens,
507
+ total_tokens=collected_metrics.total_tokens,
508
+ tokens_per_second=collected_metrics.tokens_per_second,
509
+ speech_id=collected_metrics.speech_id,
510
+ metadata=_metric_metadata_to_dict(collected_metrics.metadata),
511
  )
512
  await self._publish_live_update(speech_id=speech_id, stage="llm", turn_metrics=turn_metrics)
513
  logger.debug("LLM metrics collected: speech_id=%s, ttft=%.3fs", speech_id, collected_metrics.ttft)
514
  trace_turn = await self._tracer.attach_llm(
515
  duration=collected_metrics.duration,
516
  ttft=collected_metrics.ttft,
517
+ metric_attributes=_llm_metric_attributes(collected_metrics),
518
  )
519
 
520
  elif isinstance(collected_metrics, metrics.TTSMetrics):
521
  speech_id = collected_metrics.speech_id or collected_metrics.request_id
522
  turn_metrics = self._get_or_create_turn(speech_id, role="agent")
523
  turn_metrics.tts = TTSMetrics(
524
+ type=collected_metrics.type,
525
+ label=collected_metrics.label,
526
+ request_id=collected_metrics.request_id,
527
+ timestamp=collected_metrics.timestamp,
528
  duration=collected_metrics.duration,
529
+ ttfb=collected_metrics.ttfb,
530
  audio_duration=collected_metrics.audio_duration,
531
+ cancelled=collected_metrics.cancelled,
532
+ characters_count=collected_metrics.characters_count,
533
+ streamed=collected_metrics.streamed,
534
+ segment_id=collected_metrics.segment_id,
535
+ speech_id=collected_metrics.speech_id,
536
+ metadata=_metric_metadata_to_dict(collected_metrics.metadata),
537
  )
538
  await self._publish_live_update(speech_id=speech_id, stage="tts", turn_metrics=turn_metrics)
539
  logger.debug("TTS metrics collected: speech_id=%s, ttfb=%.3fs", speech_id, collected_metrics.ttfb)
 
542
  fallback_duration=collected_metrics.audio_duration,
543
  ttfb=collected_metrics.ttfb,
544
  observed_total_latency=self._observed_total_latency(speech_id),
545
+ metric_attributes=_tts_metric_attributes(collected_metrics),
546
  )
547
 
548
  elif isinstance(collected_metrics, metrics.EOUMetrics):
 
553
  state.speech_end_monotonic = monotonic()
554
  state.eou_delay = collected_metrics.end_of_utterance_delay
555
  state.stt_finalization_delay = collected_metrics.transcription_delay
556
+ state.eou_metrics = EOUMetrics(
557
+ type=collected_metrics.type,
558
+ timestamp=collected_metrics.timestamp,
559
+ end_of_utterance_delay=collected_metrics.end_of_utterance_delay,
560
+ transcription_delay=collected_metrics.transcription_delay,
561
+ on_user_turn_completed_delay=collected_metrics.on_user_turn_completed_delay,
562
+ speech_id=collected_metrics.speech_id,
563
+ metadata=_metric_metadata_to_dict(collected_metrics.metadata),
564
+ )
565
  turn_metrics = state.metrics
566
+ if turn_metrics:
567
+ turn_metrics.eou = state.eou_metrics
568
+ if self._latest_vad_metrics and turn_metrics.vad is None:
569
+ turn_metrics.vad = self._latest_vad_metrics
570
  await self._publish_live_update(
571
  speech_id=speech_id,
572
  stage="eou",
573
  turn_metrics=turn_metrics,
574
  )
575
  logger.debug("EOU metrics collected: speech_id=%s, delay=%.3fs", speech_id, collected_metrics.end_of_utterance_delay)
576
+ trace_turn = await self._tracer.attach_eou(
577
  duration=collected_metrics.end_of_utterance_delay,
578
  transcription_delay=collected_metrics.transcription_delay,
579
+ on_user_turn_completed_delay=collected_metrics.on_user_turn_completed_delay,
580
+ metric_attributes=_eou_metric_attributes(collected_metrics),
581
+ vad_metric_attributes=self._latest_vad_metric_attributes,
582
  )
583
 
584
  elif isinstance(collected_metrics, metrics.VADMetrics):
585
  speech_id = getattr(collected_metrics, "speech_id", None)
586
+ self._latest_vad_metrics = VADMetrics(
587
+ type=collected_metrics.type,
588
+ label=collected_metrics.label,
589
+ timestamp=collected_metrics.timestamp,
590
+ idle_time=collected_metrics.idle_time,
591
+ inference_duration_total=collected_metrics.inference_duration_total,
592
+ inference_count=collected_metrics.inference_count,
593
+ metadata=_metric_metadata_to_dict(collected_metrics.metadata),
594
+ )
595
+ self._latest_vad_metric_attributes = _vad_metric_attributes(collected_metrics)
596
  if speech_id:
597
  state = self._turns.get(speech_id)
598
  turn_metrics = state.metrics if state else None
599
  if speech_id and turn_metrics:
600
+ turn_metrics.vad = self._latest_vad_metrics
 
 
 
 
601
  await self._publisher.publish_live_update(
602
  speech_id=speech_id,
603
  stage="vad",
604
  role=turn_metrics.role if turn_metrics else None,
605
  turn_metrics=turn_metrics,
606
+ vad_metrics=self._latest_vad_metrics,
 
 
 
 
607
  diagnostic=not bool(speech_id and turn_metrics),
608
  eou_delay=self._turns[speech_id].eou_delay if speech_id and speech_id in self._turns else 0.0,
609
  stt_finalization_delay=self._turns[speech_id].stt_finalization_delay if speech_id and speech_id in self._turns else 0.0,
 
855
  if text.strip():
856
  parts.append(text.strip())
857
  return parts[-1] if parts else ""
858
+
859
+
860
+ def _metric_metadata_to_dict(metadata: Any) -> Optional[dict[str, Any]]:
861
+ if metadata is None:
862
+ return None
863
+ if hasattr(metadata, "model_dump"):
864
+ dumped = metadata.model_dump(exclude_none=True)
865
+ if isinstance(dumped, dict):
866
+ return dumped
867
+ return {"value": dumped}
868
+ if isinstance(metadata, dict):
869
+ return metadata
870
+ return {"value": str(metadata)}
871
+
872
+
873
+ def _metadata_attributes(metadata: Any) -> dict[str, Any]:
874
+ data = _metric_metadata_to_dict(metadata)
875
+ if not data:
876
+ return {}
877
+ return _flatten_attributes(data, prefix="metadata")
878
+
879
+
880
+ def _flatten_attributes(
881
+ data: dict[str, Any], *, prefix: str = ""
882
+ ) -> dict[str, Any]:
883
+ flattened: dict[str, Any] = {}
884
+ for key, value in data.items():
885
+ full_key = f"{prefix}.{key}" if prefix else str(key)
886
+ if value is None:
887
+ continue
888
+ if isinstance(value, dict):
889
+ flattened.update(_flatten_attributes(value, prefix=full_key))
890
+ continue
891
+ if isinstance(value, (list, tuple)):
892
+ serialized = [_safe_attr_value(v) for v in value]
893
+ flattened[full_key] = serialized
894
+ continue
895
+ flattened[full_key] = _safe_attr_value(value)
896
+ return flattened
897
+
898
+
899
+ def _safe_attr_value(value: Any) -> Any:
900
+ if isinstance(value, (str, bool, int, float)):
901
+ return value
902
+ return str(value)
903
+
904
+
905
+ def _stt_metric_attributes(collected_metrics: metrics.STTMetrics) -> dict[str, Any]:
906
+ attrs = {
907
+ "type": collected_metrics.type,
908
+ "label": collected_metrics.label,
909
+ "request_id": collected_metrics.request_id,
910
+ "timestamp": collected_metrics.timestamp,
911
+ "duration": collected_metrics.duration,
912
+ "audio_duration": collected_metrics.audio_duration,
913
+ "streamed": collected_metrics.streamed,
914
+ }
915
+ attrs.update(_metadata_attributes(collected_metrics.metadata))
916
+ return attrs
917
+
918
+
919
+ def _eou_metric_attributes(collected_metrics: metrics.EOUMetrics) -> dict[str, Any]:
920
+ attrs = {
921
+ "type": collected_metrics.type,
922
+ "timestamp": collected_metrics.timestamp,
923
+ "end_of_utterance_delay": collected_metrics.end_of_utterance_delay,
924
+ "transcription_delay": collected_metrics.transcription_delay,
925
+ "on_user_turn_completed_delay": collected_metrics.on_user_turn_completed_delay,
926
+ "speech_id": collected_metrics.speech_id,
927
+ }
928
+ attrs.update(_metadata_attributes(collected_metrics.metadata))
929
+ return attrs
930
+
931
+
932
+ def _vad_metric_attributes(collected_metrics: metrics.VADMetrics) -> dict[str, Any]:
933
+ attrs = {
934
+ "type": collected_metrics.type,
935
+ "label": collected_metrics.label,
936
+ "timestamp": collected_metrics.timestamp,
937
+ "idle_time": collected_metrics.idle_time,
938
+ "inference_duration_total": collected_metrics.inference_duration_total,
939
+ "inference_count": collected_metrics.inference_count,
940
+ }
941
+ attrs.update(_metadata_attributes(collected_metrics.metadata))
942
+ return attrs
943
+
944
+
945
+ def _llm_metric_attributes(collected_metrics: metrics.LLMMetrics) -> dict[str, Any]:
946
+ attrs = {
947
+ "type": collected_metrics.type,
948
+ "label": collected_metrics.label,
949
+ "request_id": collected_metrics.request_id,
950
+ "timestamp": collected_metrics.timestamp,
951
+ "duration": collected_metrics.duration,
952
+ "ttft": collected_metrics.ttft,
953
+ "cancelled": collected_metrics.cancelled,
954
+ "completion_tokens": collected_metrics.completion_tokens,
955
+ "prompt_tokens": collected_metrics.prompt_tokens,
956
+ "prompt_cached_tokens": collected_metrics.prompt_cached_tokens,
957
+ "total_tokens": collected_metrics.total_tokens,
958
+ "tokens_per_second": collected_metrics.tokens_per_second,
959
+ "speech_id": collected_metrics.speech_id,
960
+ }
961
+ attrs.update(_metadata_attributes(collected_metrics.metadata))
962
+ return attrs
963
+
964
+
965
+ def _tts_metric_attributes(collected_metrics: metrics.TTSMetrics) -> dict[str, Any]:
966
+ attrs = {
967
+ "type": collected_metrics.type,
968
+ "label": collected_metrics.label,
969
+ "request_id": collected_metrics.request_id,
970
+ "timestamp": collected_metrics.timestamp,
971
+ "ttfb": collected_metrics.ttfb,
972
+ "duration": collected_metrics.duration,
973
+ "audio_duration": collected_metrics.audio_duration,
974
+ "cancelled": collected_metrics.cancelled,
975
+ "characters_count": collected_metrics.characters_count,
976
+ "streamed": collected_metrics.streamed,
977
+ "segment_id": collected_metrics.segment_id,
978
+ "speech_id": collected_metrics.speech_id,
979
+ }
980
+ attrs.update(_metadata_attributes(collected_metrics.metadata))
981
+ return attrs
src/ui/index.html CHANGED
@@ -666,7 +666,7 @@
666
  <span>End-to-End Latency</span>
667
  <div class="tooltip tooltip-right">
668
  <button type="button" class="tooltip-trigger" aria-label="What is End-to-End Latency?">i</button>
669
- <span class="tooltip-content" role="tooltip">Full response latency from end of user speech to first assistant audio, including EOU detection, STT finalization, LLM TTFT, and TTS startup.</span>
670
  </div>
671
  </div>
672
  </div>
 
666
  <span>End-to-End Latency</span>
667
  <div class="tooltip tooltip-right">
668
  <button type="button" class="tooltip-trigger" aria-label="What is End-to-End Latency?">i</button>
669
+ <span class="tooltip-content" role="tooltip">Full response latency from end of user speech to first assistant audio: EOU delay (inclusive of transcription delay), LLM TTFT, model-to-TTS handoff, and TTS startup.</span>
670
  </div>
671
  </div>
672
  </div>
src/ui/main.js CHANGED
@@ -730,7 +730,6 @@ function updateLiveMetrics(turn) {
730
 
731
  const hasAllStages = (
732
  isFiniteNumber(liveTurnValues.eouDelay) &&
733
- isFiniteNumber(liveTurnValues.sttFinalizationDelay) &&
734
  isFiniteNumber(liveTurnValues.llmTtft) &&
735
  isFiniteNumber(liveTurnValues.llmToTtsHandoff) &&
736
  isFiniteNumber(liveTurnValues.ttsTtfb)
@@ -739,7 +738,6 @@ function updateLiveMetrics(turn) {
739
  if (hasAllStages) {
740
  const computedTotal =
741
  liveTurnValues.eouDelay +
742
- liveTurnValues.sttFinalizationDelay +
743
  liveTurnValues.llmTtft +
744
  liveTurnValues.llmToTtsHandoff +
745
  liveTurnValues.ttsTtfb;
 
730
 
731
  const hasAllStages = (
732
  isFiniteNumber(liveTurnValues.eouDelay) &&
 
733
  isFiniteNumber(liveTurnValues.llmTtft) &&
734
  isFiniteNumber(liveTurnValues.llmToTtsHandoff) &&
735
  isFiniteNumber(liveTurnValues.ttsTtfb)
 
738
  if (hasAllStages) {
739
  const computedTotal =
740
  liveTurnValues.eouDelay +
 
741
  liveTurnValues.llmTtft +
742
  liveTurnValues.llmToTtsHandoff +
743
  liveTurnValues.ttsTtfb;
tests/test_langfuse_turn_tracing.py CHANGED
@@ -290,9 +290,27 @@ def test_turn_trace_has_required_metadata_and_spans(monkeypatch: pytest.MonkeyPa
290
  asyncio.run(_run())
291
 
292
  span_names = [span.name for span in fake_tracer.spans]
293
- assert span_names == ["turn", "user_input", "vad", "stt", "llm", "tts", "conversation_latency"]
 
 
 
 
 
 
 
 
 
294
 
295
- root, user_input_span, vad_span, stt_span, llm_span, tts_span, conversational_span = fake_tracer.spans
 
 
 
 
 
 
 
 
 
296
  assert root.attributes["session_id"] == "session-abc"
297
  assert root.attributes["room_id"] == "RM123"
298
  assert root.attributes["participant_id"] == "web-123"
@@ -304,15 +322,23 @@ def test_turn_trace_has_required_metadata_and_spans(monkeypatch: pytest.MonkeyPa
304
  assert root.attributes["latency_ms.llm_ttft"] > 0
305
  assert root.attributes["latency_ms.llm_total"] > 0
306
  assert root.attributes["latency_ms.tts_ttfb"] > 0
307
- assert root.attributes["latency_ms.conversational"] == pytest.approx(1600.0)
308
- assert root.attributes["latency_ms.speech_end_to_assistant_speech_start"] == pytest.approx(1600.0)
309
 
310
  assert user_input_span.attributes["user_transcript"] == "hello there"
311
- assert vad_span.attributes["duration_ms"] == pytest.approx(1100.0)
 
 
 
 
 
 
312
 
313
  assert stt_span.attributes["user_transcript"] == "hello there"
314
  assert stt_span.attributes["stt_status"] == "measured"
315
- assert stt_span.attributes["duration_ms"] == pytest.approx(1350.0)
 
 
316
  assert stt_span.attributes["stt_finalization_ms"] == pytest.approx(250.0)
317
  assert stt_span.attributes["stt_total_latency_ms"] == pytest.approx(1350.0)
318
 
@@ -323,20 +349,23 @@ def test_turn_trace_has_required_metadata_and_spans(monkeypatch: pytest.MonkeyPa
323
  assert llm_span.attributes["input"] == "hello there"
324
  assert llm_span.attributes["output"] == "hi, how can I help?"
325
  assert llm_span.attributes["duration_ms"] > 0
 
 
326
 
327
  assert tts_span.attributes["assistant_text"] == "hi, how can I help?"
328
  assert tts_span.attributes["ttfb_ms"] > 0
329
  assert tts_span.attributes["input"] == "hi, how can I help?"
330
  assert tts_span.attributes["output"] == "hi, how can I help?"
331
  assert tts_span.attributes["duration_ms"] > 0
 
 
332
 
333
- assert conversational_span.attributes["duration_ms"] == pytest.approx(1600.0)
334
  assert (
335
  conversational_span.attributes["speech_end_to_assistant_speech_start_ms"]
336
- == pytest.approx(1600.0)
337
  )
338
  assert conversational_span.attributes["eou_delay_ms"] == pytest.approx(1100.0)
339
- assert conversational_span.attributes["stt_finalization_ms"] == pytest.approx(250.0)
340
  assert conversational_span.attributes["llm_ttft_ms"] > 0
341
  assert conversational_span.attributes["tts_ttfb_ms"] > 0
342
  assert all(span.end_count == 1 for span in fake_tracer.spans)
@@ -474,8 +503,16 @@ def test_trace_emits_without_stt_metrics(monkeypatch: pytest.MonkeyPatch) -> Non
474
  asyncio.run(_run())
475
 
476
  span_names = [span.name for span in fake_tracer.spans]
477
- assert span_names == ["turn", "user_input", "vad", "stt", "llm", "tts"]
478
- stt_span = fake_tracer.spans[3]
 
 
 
 
 
 
 
 
479
  assert stt_span.attributes["user_transcript"] == "turn without stt metrics"
480
  assert stt_span.attributes["stt_status"] == "missing"
481
  assert "duration_ms" not in stt_span.attributes
@@ -715,7 +752,7 @@ def test_long_response_latency_accounts_for_llm_to_tts_handoff(
715
  await collector.on_speech_created(_FakeSpeechHandle(chat_items=[], speech_id=speech_id))
716
  await collector.on_user_input_transcribed("Explain neural networks", is_final=True)
717
  await collector.on_metrics_collected(
718
- _make_eou_metrics(speech_id, delay=0.0, transcription_delay=0.0)
719
  )
720
  await collector.on_metrics_collected(
721
  _make_llm_metrics(speech_id, duration=2.0, ttft=0.01)
@@ -741,12 +778,12 @@ def test_long_response_latency_accounts_for_llm_to_tts_handoff(
741
  assert root.attributes["latency_ms.llm_to_tts_handoff"] > 150.0
742
  assert root.attributes["latency_ms.conversational"] == pytest.approx(
743
  root.attributes["latency_ms.eou_delay"]
744
- + root.attributes["latency_ms.stt_finalization"]
745
  + root.attributes["latency_ms.llm_ttft"]
746
  + root.attributes["latency_ms.llm_to_tts_handoff"]
747
  + root.attributes["latency_ms.tts_ttfb"],
748
  abs=5.0,
749
  )
 
750
 
751
  gap_spans = [span for span in fake_tracer.spans if span.name == "llm_to_tts_handoff"]
752
  assert len(gap_spans) == 1
 
290
  asyncio.run(_run())
291
 
292
  span_names = [span.name for span in fake_tracer.spans]
293
+ assert span_names == [
294
+ "turn",
295
+ "user_input",
296
+ "VADMetrics",
297
+ "EOUMetrics",
298
+ "STTMetrics",
299
+ "LLMMetrics",
300
+ "TTSMetrics",
301
+ "conversation_latency",
302
+ ]
303
 
304
+ (
305
+ root,
306
+ user_input_span,
307
+ vad_span,
308
+ eou_span,
309
+ stt_span,
310
+ llm_span,
311
+ tts_span,
312
+ conversational_span,
313
+ ) = fake_tracer.spans
314
  assert root.attributes["session_id"] == "session-abc"
315
  assert root.attributes["room_id"] == "RM123"
316
  assert root.attributes["participant_id"] == "web-123"
 
322
  assert root.attributes["latency_ms.llm_ttft"] > 0
323
  assert root.attributes["latency_ms.llm_total"] > 0
324
  assert root.attributes["latency_ms.tts_ttfb"] > 0
325
+ assert root.attributes["latency_ms.conversational"] == pytest.approx(1350.0)
326
+ assert root.attributes["latency_ms.speech_end_to_assistant_speech_start"] == pytest.approx(1350.0)
327
 
328
  assert user_input_span.attributes["user_transcript"] == "hello there"
329
+ assert vad_span.attributes["eou_delay_ms"] == pytest.approx(1100.0)
330
+
331
+ assert eou_span.attributes["duration_ms"] == pytest.approx(1100.0)
332
+ assert eou_span.attributes["end_of_utterance_delay"] == pytest.approx(1.1)
333
+ assert eou_span.attributes["transcription_delay"] == pytest.approx(0.25)
334
+ assert eou_span.attributes["on_user_turn_completed_delay"] == pytest.approx(0.0)
335
+ assert eou_span.attributes["speech_id"] == "speech-1"
336
 
337
  assert stt_span.attributes["user_transcript"] == "hello there"
338
  assert stt_span.attributes["stt_status"] == "measured"
339
+ assert stt_span.attributes["duration_ms"] == pytest.approx(200.0)
340
+ assert stt_span.attributes["request_id"] == "stt-1"
341
+ assert stt_span.attributes["streamed"] is True
342
  assert stt_span.attributes["stt_finalization_ms"] == pytest.approx(250.0)
343
  assert stt_span.attributes["stt_total_latency_ms"] == pytest.approx(1350.0)
344
 
 
349
  assert llm_span.attributes["input"] == "hello there"
350
  assert llm_span.attributes["output"] == "hi, how can I help?"
351
  assert llm_span.attributes["duration_ms"] > 0
352
+ assert llm_span.attributes["prompt_tokens"] == 12
353
+ assert llm_span.attributes["completion_tokens"] == 24
354
 
355
  assert tts_span.attributes["assistant_text"] == "hi, how can I help?"
356
  assert tts_span.attributes["ttfb_ms"] > 0
357
  assert tts_span.attributes["input"] == "hi, how can I help?"
358
  assert tts_span.attributes["output"] == "hi, how can I help?"
359
  assert tts_span.attributes["duration_ms"] > 0
360
+ assert tts_span.attributes["characters_count"] == 42
361
+ assert tts_span.attributes["streamed"] is True
362
 
363
+ assert conversational_span.attributes["duration_ms"] == pytest.approx(1350.0)
364
  assert (
365
  conversational_span.attributes["speech_end_to_assistant_speech_start_ms"]
366
+ == pytest.approx(1350.0)
367
  )
368
  assert conversational_span.attributes["eou_delay_ms"] == pytest.approx(1100.0)
 
369
  assert conversational_span.attributes["llm_ttft_ms"] > 0
370
  assert conversational_span.attributes["tts_ttfb_ms"] > 0
371
  assert all(span.end_count == 1 for span in fake_tracer.spans)
 
503
  asyncio.run(_run())
504
 
505
  span_names = [span.name for span in fake_tracer.spans]
506
+ assert span_names == [
507
+ "turn",
508
+ "user_input",
509
+ "VADMetrics",
510
+ "EOUMetrics",
511
+ "STTMetrics",
512
+ "LLMMetrics",
513
+ "TTSMetrics",
514
+ ]
515
+ stt_span = fake_tracer.spans[4]
516
  assert stt_span.attributes["user_transcript"] == "turn without stt metrics"
517
  assert stt_span.attributes["stt_status"] == "missing"
518
  assert "duration_ms" not in stt_span.attributes
 
752
  await collector.on_speech_created(_FakeSpeechHandle(chat_items=[], speech_id=speech_id))
753
  await collector.on_user_input_transcribed("Explain neural networks", is_final=True)
754
  await collector.on_metrics_collected(
755
+ _make_eou_metrics(speech_id, delay=0.0, transcription_delay=0.2)
756
  )
757
  await collector.on_metrics_collected(
758
  _make_llm_metrics(speech_id, duration=2.0, ttft=0.01)
 
778
  assert root.attributes["latency_ms.llm_to_tts_handoff"] > 150.0
779
  assert root.attributes["latency_ms.conversational"] == pytest.approx(
780
  root.attributes["latency_ms.eou_delay"]
 
781
  + root.attributes["latency_ms.llm_ttft"]
782
  + root.attributes["latency_ms.llm_to_tts_handoff"]
783
  + root.attributes["latency_ms.tts_ttfb"],
784
  abs=5.0,
785
  )
786
+ assert root.attributes["latency_ms.stt_finalization"] == pytest.approx(200.0)
787
 
788
  gap_spans = [span for span in fake_tracer.spans if span.name == "llm_to_tts_handoff"]
789
  assert len(gap_spans) == 1