dvalle08 commited on
Commit
ef8254e
·
1 Parent(s): d9055df

Enhance VAD configuration and add fallback participant ID support: Update VAD_MIN_SPEECH_DURATION and VAD_MIN_SILENCE_DURATION for faster response times. Introduce fallback participant ID logic in MetricsCollector to handle cases where participant metadata is absent, ensuring accurate trace reporting.

Browse files
.env.example CHANGED
@@ -61,6 +61,6 @@ LIVEKIT_PRE_CONNECT_AUDIO=true
61
  LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
62
 
63
  # Voice Activity Detection (VAD) Configuration - OPTIMIZED FOR FALSE DETECTION FIX
64
- VAD_MIN_SPEECH_DURATION=0.25 # Require 250ms of speech before activation
65
- VAD_MIN_SILENCE_DURATION=0.5 # Require 500ms of silence before deactivation
66
  VAD_THRESHOLD=0.6 # Higher = less sensitive to noise (0.5 is default)
 
61
  LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
62
 
63
  # Voice Activity Detection (VAD) Configuration - OPTIMIZED FOR FALSE DETECTION FIX
64
+ VAD_MIN_SPEECH_DURATION=0.18 # Require 180ms of speech before activation (faster turn pickup)
65
+ VAD_MIN_SILENCE_DURATION=0.30 # Require 300ms of silence before deactivation (faster turn end)
66
  VAD_THRESHOLD=0.6 # Higher = less sensitive to noise (0.5 is default)
src/agent/agent.py CHANGED
@@ -43,6 +43,13 @@ def _fallback_session_prefix() -> str | None:
43
  return None
44
 
45
 
 
 
 
 
 
 
 
46
  def setup_langfuse_tracer() -> TracerProvider | None:
47
  """Configure LiveKit telemetry tracer to export traces to Langfuse."""
48
  global _langfuse_tracer_provider
@@ -169,6 +176,7 @@ async def session_handler(ctx: agents.JobContext) -> None:
169
  room_id=initial_room_id,
170
  participant_id=initial_participant_id,
171
  fallback_session_prefix=_fallback_session_prefix(),
 
172
  langfuse_enabled=trace_provider is not None,
173
  )
174
 
 
43
  return None
44
 
45
 
46
+ def _fallback_participant_prefix() -> str | None:
47
+ """Use console-prefixed fallback participant id when running `... console`."""
48
+ if any(arg == "console" for arg in sys.argv[1:]):
49
+ return "console"
50
+ return None
51
+
52
+
53
  def setup_langfuse_tracer() -> TracerProvider | None:
54
  """Configure LiveKit telemetry tracer to export traces to Langfuse."""
55
  global _langfuse_tracer_provider
 
176
  room_id=initial_room_id,
177
  participant_id=initial_participant_id,
178
  fallback_session_prefix=_fallback_session_prefix(),
179
+ fallback_participant_prefix=_fallback_participant_prefix(),
180
  langfuse_enabled=trace_provider is not None,
181
  )
182
 
src/agent/metrics_collector.py CHANGED
@@ -180,6 +180,7 @@ class MetricsCollector:
180
  room_id: Optional[str] = None,
181
  participant_id: Optional[str] = None,
182
  fallback_session_prefix: Optional[str] = None,
 
183
  langfuse_enabled: bool = False,
184
  ) -> None:
185
  """Initialize metrics collector.
@@ -192,6 +193,8 @@ class MetricsCollector:
192
  participant_id: LiveKit participant identity when available
193
  fallback_session_prefix: Prefix used for generated fallback session id
194
  (e.g. "console" -> "console_<uuid>") when no metadata session id exists
 
 
195
  langfuse_enabled: Enable one-trace-per-turn Langfuse traces
196
  """
197
  self._room = room
@@ -208,7 +211,14 @@ class MetricsCollector:
208
  fallback_session_prefix
209
  )
210
  self._session_id = self._fallback_session_id or self.UNKNOWN_SESSION_ID
211
- self._participant_id = participant_id or self.UNKNOWN_PARTICIPANT_ID
 
 
 
 
 
 
 
212
  self._langfuse_enabled = langfuse_enabled
213
  self._pending_trace_turns: deque[TraceTurn] = deque()
214
  self._trace_lock = asyncio.Lock()
@@ -271,8 +281,10 @@ class MetricsCollector:
271
  ):
272
  turn.session_id = self._session_id
273
  if (
274
- turn.participant_id == self.UNKNOWN_PARTICIPANT_ID
275
- and self._participant_id != self.UNKNOWN_PARTICIPANT_ID
 
 
276
  ):
277
  turn.participant_id = self._participant_id
278
 
@@ -997,11 +1009,12 @@ class MetricsCollector:
997
  root_start_ns = time_ns()
998
  cursor_ns = root_start_ns
999
 
1000
- with tracer.start_as_current_span(
1001
  "turn",
1002
  context=root_context,
1003
  start_time=root_start_ns,
1004
- ) as turn_span:
 
1005
  turn.trace_id = trace.format_trace_id(turn_span.get_span_context().trace_id)
1006
  turn_span.set_attribute("session_id", turn.session_id)
1007
  turn_span.set_attribute("room_id", turn.room_id)
@@ -1127,7 +1140,7 @@ class MetricsCollector:
1127
  },
1128
  observation_output=str(conversational_latency_ms),
1129
  )
1130
-
1131
  self._close_span_at(turn_span, cursor_ns)
1132
  logger.info(
1133
  "Langfuse turn trace emitted: trace_id=%s turn_id=%s session_id=%s room_id=%s participant_id=%s",
@@ -1155,7 +1168,8 @@ class MetricsCollector:
1155
  actual_duration_ms = max(duration_ms, 0.0) if duration_ms is not None else None
1156
  end_ns = start_ns + self._duration_ms_to_ns(actual_duration_ms or 0.0)
1157
 
1158
- with tracer.start_as_current_span(name, context=context, start_time=start_ns) as span:
 
1159
  if actual_duration_ms is not None:
1160
  span.set_attribute("duration_ms", actual_duration_ms)
1161
  if observation_input is not None:
@@ -1168,6 +1182,7 @@ class MetricsCollector:
1168
  if value is None:
1169
  continue
1170
  span.set_attribute(key, value)
 
1171
  self._close_span_at(span, end_ns)
1172
  return end_ns
1173
 
@@ -1261,6 +1276,12 @@ class MetricsCollector:
1261
  return None
1262
  return f"{normalized_prefix}_{uuid.uuid4()}"
1263
 
 
 
 
 
 
 
1264
  async def _resolve_room_id(self) -> str:
1265
  if self._room_id and self._room_id != self._room_name:
1266
  return self._room_id
 
180
  room_id: Optional[str] = None,
181
  participant_id: Optional[str] = None,
182
  fallback_session_prefix: Optional[str] = None,
183
+ fallback_participant_prefix: Optional[str] = None,
184
  langfuse_enabled: bool = False,
185
  ) -> None:
186
  """Initialize metrics collector.
 
193
  participant_id: LiveKit participant identity when available
194
  fallback_session_prefix: Prefix used for generated fallback session id
195
  (e.g. "console" -> "console_<uuid>") when no metadata session id exists
196
+ fallback_participant_prefix: Prefix used for generated fallback participant id
197
+ (e.g. "console" -> "console_<uuid>") when no participant identity exists
198
  langfuse_enabled: Enable one-trace-per-turn Langfuse traces
199
  """
200
  self._room = room
 
211
  fallback_session_prefix
212
  )
213
  self._session_id = self._fallback_session_id or self.UNKNOWN_SESSION_ID
214
+ self._fallback_participant_id = self._build_fallback_participant_id(
215
+ fallback_participant_prefix
216
+ )
217
+ self._participant_id = (
218
+ self._normalize_optional_text(participant_id)
219
+ or self._fallback_participant_id
220
+ or self.UNKNOWN_PARTICIPANT_ID
221
+ )
222
  self._langfuse_enabled = langfuse_enabled
223
  self._pending_trace_turns: deque[TraceTurn] = deque()
224
  self._trace_lock = asyncio.Lock()
 
281
  ):
282
  turn.session_id = self._session_id
283
  if (
284
+ turn.participant_id
285
+ in {self.UNKNOWN_PARTICIPANT_ID, self._fallback_participant_id}
286
+ and self._participant_id
287
+ not in {self.UNKNOWN_PARTICIPANT_ID, self._fallback_participant_id}
288
  ):
289
  turn.participant_id = self._participant_id
290
 
 
1009
  root_start_ns = time_ns()
1010
  cursor_ns = root_start_ns
1011
 
1012
+ turn_span = tracer.start_span(
1013
  "turn",
1014
  context=root_context,
1015
  start_time=root_start_ns,
1016
+ )
1017
+ try:
1018
  turn.trace_id = trace.format_trace_id(turn_span.get_span_context().trace_id)
1019
  turn_span.set_attribute("session_id", turn.session_id)
1020
  turn_span.set_attribute("room_id", turn.room_id)
 
1140
  },
1141
  observation_output=str(conversational_latency_ms),
1142
  )
1143
+ finally:
1144
  self._close_span_at(turn_span, cursor_ns)
1145
  logger.info(
1146
  "Langfuse turn trace emitted: trace_id=%s turn_id=%s session_id=%s room_id=%s participant_id=%s",
 
1168
  actual_duration_ms = max(duration_ms, 0.0) if duration_ms is not None else None
1169
  end_ns = start_ns + self._duration_ms_to_ns(actual_duration_ms or 0.0)
1170
 
1171
+ span = tracer.start_span(name, context=context, start_time=start_ns)
1172
+ try:
1173
  if actual_duration_ms is not None:
1174
  span.set_attribute("duration_ms", actual_duration_ms)
1175
  if observation_input is not None:
 
1182
  if value is None:
1183
  continue
1184
  span.set_attribute(key, value)
1185
+ finally:
1186
  self._close_span_at(span, end_ns)
1187
  return end_ns
1188
 
 
1276
  return None
1277
  return f"{normalized_prefix}_{uuid.uuid4()}"
1278
 
1279
+ def _build_fallback_participant_id(self, prefix: Optional[str]) -> Optional[str]:
1280
+ normalized_prefix = self._normalize_optional_text(prefix)
1281
+ if not normalized_prefix:
1282
+ return None
1283
+ return f"{normalized_prefix}_{uuid.uuid4()}"
1284
+
1285
  async def _resolve_room_id(self) -> str:
1286
  if self._room_id and self._room_id != self._room_name:
1287
  return self._room_id
src/core/settings.py CHANGED
@@ -91,13 +91,13 @@ class VoiceSettings(CoreSettings):
91
 
92
  # Voice Activity Detection Settings
93
  VAD_MIN_SPEECH_DURATION: float = Field(
94
- default=0.25,
95
  ge=0.1,
96
  le=1.0,
97
  description="Minimum speech duration (seconds) before VAD activation",
98
  )
99
  VAD_MIN_SILENCE_DURATION: float = Field(
100
- default=0.5,
101
  ge=0.1,
102
  le=2.0,
103
  description="Minimum silence duration (seconds) before VAD deactivation",
 
91
 
92
  # Voice Activity Detection Settings
93
  VAD_MIN_SPEECH_DURATION: float = Field(
94
+ default=0.18,
95
  ge=0.1,
96
  le=1.0,
97
  description="Minimum speech duration (seconds) before VAD activation",
98
  )
99
  VAD_MIN_SILENCE_DURATION: float = Field(
100
+ default=0.30,
101
  ge=0.1,
102
  le=2.0,
103
  description="Minimum silence duration (seconds) before VAD deactivation",
src/plugins/pocket_tts/tts.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
4
  import contextlib
5
  import logging
6
  import queue
 
7
  import time
8
  from collections.abc import AsyncIterator
9
  from dataclasses import dataclass
@@ -27,6 +28,12 @@ logging.getLogger("pocket_tts.conditioners.text").setLevel(logging.WARNING)
27
 
28
  DEFAULT_VOICE = "alba"
29
  NATIVE_SAMPLE_RATE = 24000
 
 
 
 
 
 
30
 
31
 
32
  class TTSMetricsCallback(Protocol):
@@ -218,6 +225,13 @@ class PocketTTS(tts.TTS):
218
  audio_duration = _bytes_to_duration(total_bytes=total_bytes, sample_rate=self.sample_rate)
219
  return first_chunk_ttfb, generation_duration, audio_duration
220
 
 
 
 
 
 
 
 
221
 
222
  class PocketChunkedStream(tts.ChunkedStream):
223
  def __init__(
@@ -240,19 +254,32 @@ class PocketChunkedStream(tts.ChunkedStream):
240
  stream=False,
241
  )
242
 
243
- (
244
- first_chunk_ttfb,
245
- generation_duration,
246
- audio_duration,
247
- ) = await pocket_tts._push_generated_audio(
248
- text=self._input_text,
249
- conn_options=self._conn_options,
250
- output_emitter=output_emitter,
251
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  output_emitter.flush()
254
 
255
- if pocket_tts._metrics_callback:
256
  pocket_tts._metrics_callback(
257
  ttfb=first_chunk_ttfb,
258
  duration=generation_duration,
@@ -290,33 +317,130 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
290
  async def _flush_text_buffer(
291
  self, *, text_buffer: str, output_emitter: tts.AudioEmitter
292
  ) -> None:
293
- if not text_buffer.strip():
 
 
294
  return
295
 
296
- segment_id = shortuuid("SEG_")
297
- output_emitter.start_segment(segment_id=segment_id)
298
- await self._synthesize_segment(text_buffer, output_emitter)
299
- output_emitter.end_segment()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
- async def _synthesize_segment(self, text: str, output_emitter: tts.AudioEmitter) -> None:
 
 
 
 
 
 
 
 
 
302
  self._mark_started()
303
  pocket_tts = cast(PocketTTS, self._tts)
304
- (
305
- first_chunk_ttfb,
306
- generation_duration,
307
- audio_duration,
308
- ) = await pocket_tts._push_generated_audio(
309
  text=text,
310
  conn_options=self._conn_options,
311
  output_emitter=output_emitter,
312
  )
313
 
314
- if pocket_tts._metrics_callback:
315
- pocket_tts._metrics_callback(
316
- ttfb=first_chunk_ttfb,
317
- duration=generation_duration,
318
- audio_duration=audio_duration,
319
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
 
322
  def _tensor_to_pcm_bytes(
 
4
  import contextlib
5
  import logging
6
  import queue
7
+ import re
8
  import time
9
  from collections.abc import AsyncIterator
10
  from dataclasses import dataclass
 
28
 
29
  DEFAULT_VOICE = "alba"
30
  NATIVE_SAMPLE_RATE = 24000
31
+ MAX_TTS_SEGMENT_CHARS = 220
32
+
33
+ _BULLET_PREFIX_RE = re.compile(r"^\s*(?:[-*+]|(?:\d+[\.\)]))\s+")
34
+ _MARKDOWN_LINK_RE = re.compile(r"\[([^\]]+)\]\((?:[^)]+)\)")
35
+ _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
36
+ _WHITESPACE_RE = re.compile(r"\s+")
37
 
38
 
39
  class TTSMetricsCallback(Protocol):
 
225
  audio_duration = _bytes_to_duration(total_bytes=total_bytes, sample_rate=self.sample_rate)
226
  return first_chunk_ttfb, generation_duration, audio_duration
227
 
228
+ def _prepare_text_segments(self, text: str) -> list[str]:
229
+ """Normalize text for TTS and split into short chunks for lower tail latency."""
230
+ cleaned = _sanitize_tts_text(text)
231
+ if not cleaned:
232
+ return []
233
+ return _chunk_tts_text(cleaned, max_chars=MAX_TTS_SEGMENT_CHARS)
234
+
235
 
236
  class PocketChunkedStream(tts.ChunkedStream):
237
  def __init__(
 
254
  stream=False,
255
  )
256
 
257
+ text_segments = pocket_tts._prepare_text_segments(self._input_text)
258
+ if not text_segments:
259
+ output_emitter.flush()
260
+ return
261
+
262
+ first_chunk_ttfb = -1.0
263
+ generation_duration = 0.0
264
+ audio_duration = 0.0
265
+ for text_segment in text_segments:
266
+ (
267
+ segment_ttfb,
268
+ segment_duration,
269
+ segment_audio_duration,
270
+ ) = await pocket_tts._push_generated_audio(
271
+ text=text_segment,
272
+ conn_options=self._conn_options,
273
+ output_emitter=output_emitter,
274
+ )
275
+ if first_chunk_ttfb < 0 and segment_ttfb >= 0:
276
+ first_chunk_ttfb = segment_ttfb
277
+ generation_duration += segment_duration
278
+ audio_duration += segment_audio_duration
279
 
280
  output_emitter.flush()
281
 
282
+ if pocket_tts._metrics_callback and first_chunk_ttfb >= 0:
283
  pocket_tts._metrics_callback(
284
  ttfb=first_chunk_ttfb,
285
  duration=generation_duration,
 
317
  async def _flush_text_buffer(
318
  self, *, text_buffer: str, output_emitter: tts.AudioEmitter
319
  ) -> None:
320
+ pocket_tts = cast(PocketTTS, self._tts)
321
+ text_segments = pocket_tts._prepare_text_segments(text_buffer)
322
+ if not text_segments:
323
  return
324
 
325
+ # LiveKit expects one segment per flushed text buffer in streaming mode.
326
+ output_emitter.start_segment(segment_id=shortuuid("SEG_"))
327
+ first_chunk_ttfb = -1.0
328
+ generation_duration = 0.0
329
+ audio_duration = 0.0
330
+ try:
331
+ for text_segment in text_segments:
332
+ (
333
+ segment_ttfb,
334
+ segment_duration,
335
+ segment_audio_duration,
336
+ ) = await self._synthesize_segment(text_segment, output_emitter)
337
+ if first_chunk_ttfb < 0 and segment_ttfb >= 0:
338
+ first_chunk_ttfb = segment_ttfb
339
+ generation_duration += segment_duration
340
+ audio_duration += segment_audio_duration
341
+ finally:
342
+ output_emitter.end_segment()
343
 
344
+ if pocket_tts._metrics_callback and first_chunk_ttfb >= 0:
345
+ pocket_tts._metrics_callback(
346
+ ttfb=first_chunk_ttfb,
347
+ duration=generation_duration,
348
+ audio_duration=audio_duration,
349
+ )
350
+
351
+ async def _synthesize_segment(
352
+ self, text: str, output_emitter: tts.AudioEmitter
353
+ ) -> tuple[float, float, float]:
354
  self._mark_started()
355
  pocket_tts = cast(PocketTTS, self._tts)
356
+ return await pocket_tts._push_generated_audio(
 
 
 
 
357
  text=text,
358
  conn_options=self._conn_options,
359
  output_emitter=output_emitter,
360
  )
361
 
362
+
363
def _sanitize_tts_text(text: str) -> str:
    """Strip markdown noise (links, bullets, headers, emphasis, code ticks, table
    pipes) and collapse whitespace so the TTS engine speaks clean prose."""
    if not text:
        return ""

    # Unify line endings, then reduce markdown links to their visible label.
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    unified = _MARKDOWN_LINK_RE.sub(r"\1", unified)

    kept_lines: list[str] = []
    for candidate in unified.split("\n"):
        stripped = candidate.strip()
        if not stripped:
            continue
        stripped = _BULLET_PREFIX_RE.sub("", stripped)
        # Drop leading header/quote markers, then remove inline markdown tokens.
        stripped = stripped.lstrip("#> ").strip()
        for token, replacement in (
            ("**", ""),
            ("__", ""),
            ("`", ""),
            ("*", ""),
            ("|", " "),
        ):
            stripped = stripped.replace(token, replacement)
        kept_lines.append(stripped)

    joined = " ".join(kept_lines)
    return _WHITESPACE_RE.sub(" ", joined).strip()
387
+
388
+
389
def _chunk_tts_text(text: str, *, max_chars: int) -> list[str]:
    """Split *text* into chunks of at most *max_chars*, preferring sentence
    boundaries and greedily packing sentences together while they fit."""
    if not text.strip():
        return []
    if len(text) <= max_chars:
        return [text]

    pieces = [part.strip() for part in _SENTENCE_SPLIT_RE.split(text) if part.strip()]
    if not pieces:
        # No sentence boundary found; treat the whole text as one piece.
        pieces = [text.strip()]

    result: list[str] = []
    buffer = ""
    for piece in pieces:
        # A single sentence longer than the budget is word-split first.
        for fragment in _split_overlong_text(piece, max_chars=max_chars):
            if not buffer:
                buffer = fragment
            elif len(buffer) + 1 + len(fragment) <= max_chars:
                # +1 accounts for the joining space.
                buffer = f"{buffer} {fragment}"
            else:
                result.append(buffer)
                buffer = fragment

    if buffer:
        result.append(buffer)
    return result
416
+
417
+
418
+ def _split_overlong_text(text: str, *, max_chars: int) -> list[str]:
419
+ if len(text) <= max_chars:
420
+ return [text]
421
+
422
+ words = text.split()
423
+ if not words:
424
+ return []
425
+
426
+ chunks: list[str] = []
427
+ current_words: list[str] = []
428
+ current_len = 0
429
+ for word in words:
430
+ additional_len = len(word) if not current_words else len(word) + 1
431
+ if current_words and current_len + additional_len > max_chars:
432
+ chunks.append(" ".join(current_words))
433
+ current_words = [word]
434
+ current_len = len(word)
435
+ continue
436
+
437
+ current_words.append(word)
438
+ current_len += additional_len
439
+
440
+ if current_words:
441
+ chunks.append(" ".join(current_words))
442
+
443
+ return chunks
444
 
445
 
446
  def _tensor_to_pcm_bytes(
tests/test_langfuse_turn_tracing.py CHANGED
@@ -8,6 +8,7 @@ from typing import Any
8
 
9
  import pytest
10
 
 
11
  from livekit.agents import metrics
12
 
13
  from src.agent.metrics_collector import MetricsCollector
@@ -23,6 +24,7 @@ class _FakeSpan:
23
  name: str
24
  trace_id: int
25
  attributes: dict[str, Any] = field(default_factory=dict)
 
26
 
27
  def set_attribute(self, key: str, value: Any) -> None:
28
  self.attributes[key] = value
@@ -30,6 +32,10 @@ class _FakeSpan:
30
  def get_span_context(self) -> _FakeSpanContext:
31
  return _FakeSpanContext(trace_id=self.trace_id)
32
 
 
 
 
 
33
 
34
  class _FakeTracer:
35
  def __init__(self) -> None:
@@ -37,6 +43,22 @@ class _FakeTracer:
37
  self._stack: list[_FakeSpan] = []
38
  self._next_trace_id = 1
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  @contextmanager
41
  def start_as_current_span(self, name: str, **_: Any): # type: ignore[no-untyped-def]
42
  if self._stack:
@@ -55,6 +77,9 @@ class _FakeTracer:
55
 
56
 
57
  class _BrokenTracer:
 
 
 
58
  @contextmanager
59
  def start_as_current_span(self, name: str, **_: Any): # type: ignore[no-untyped-def]
60
  raise RuntimeError(f"broken tracer for {name}")
@@ -255,6 +280,7 @@ def test_turn_trace_has_required_metadata_and_spans(monkeypatch: pytest.MonkeyPa
255
  assert conversational_span.attributes["stt_finalization_ms"] == pytest.approx(250.0)
256
  assert conversational_span.attributes["llm_ttft_ms"] > 0
257
  assert conversational_span.attributes["tts_ttfb_ms"] > 0
 
258
 
259
  payloads = _decode_payloads(room)
260
  trace_updates = [payload for payload in payloads if payload.get("type") == "trace_update"]
@@ -566,3 +592,78 @@ def test_real_session_metadata_overrides_fallback_for_pending_turns(
566
  turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
567
  assert len(turn_spans) == 1
568
  assert turn_spans[0].attributes["session_id"] == "session-real"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  import pytest
10
 
11
+ from opentelemetry import trace as otel_trace
12
  from livekit.agents import metrics
13
 
14
  from src.agent.metrics_collector import MetricsCollector
 
24
  name: str
25
  trace_id: int
26
  attributes: dict[str, Any] = field(default_factory=dict)
27
+ end_count: int = 0
28
 
29
  def set_attribute(self, key: str, value: Any) -> None:
30
  self.attributes[key] = value
 
32
  def get_span_context(self) -> _FakeSpanContext:
33
  return _FakeSpanContext(trace_id=self.trace_id)
34
 
35
+ def end(self, end_time: Any = None) -> None:
36
+ _ = end_time
37
+ self.end_count += 1
38
+
39
 
40
  class _FakeTracer:
41
  def __init__(self) -> None:
 
43
  self._stack: list[_FakeSpan] = []
44
  self._next_trace_id = 1
45
 
46
+ def start_span(self, name: str, **kwargs: Any) -> _FakeSpan:
47
+ trace_id = None
48
+ context = kwargs.get("context")
49
+ if context is not None:
50
+ parent_span = otel_trace.get_current_span(context)
51
+ get_span_context = getattr(parent_span, "get_span_context", None)
52
+ if callable(get_span_context):
53
+ trace_id = get_span_context().trace_id
54
+ if not trace_id:
55
+ trace_id = self._next_trace_id
56
+ self._next_trace_id += 1
57
+
58
+ span = _FakeSpan(name=name, trace_id=trace_id)
59
+ self.spans.append(span)
60
+ return span
61
+
62
  @contextmanager
63
  def start_as_current_span(self, name: str, **_: Any): # type: ignore[no-untyped-def]
64
  if self._stack:
 
77
 
78
 
79
  class _BrokenTracer:
80
+ def start_span(self, name: str, **_: Any) -> Any:
81
+ raise RuntimeError(f"broken tracer for {name}")
82
+
83
  @contextmanager
84
  def start_as_current_span(self, name: str, **_: Any): # type: ignore[no-untyped-def]
85
  raise RuntimeError(f"broken tracer for {name}")
 
280
  assert conversational_span.attributes["stt_finalization_ms"] == pytest.approx(250.0)
281
  assert conversational_span.attributes["llm_ttft_ms"] > 0
282
  assert conversational_span.attributes["tts_ttfb_ms"] > 0
283
+ assert all(span.end_count == 1 for span in fake_tracer.spans)
284
 
285
  payloads = _decode_payloads(room)
286
  trace_updates = [payload for payload in payloads if payload.get("type") == "trace_update"]
 
592
  turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
593
  assert len(turn_spans) == 1
594
  assert turn_spans[0].attributes["session_id"] == "session-real"
595
+
596
+
597
+ def test_fallback_console_participant_id_is_used_when_metadata_absent(
598
+ monkeypatch: pytest.MonkeyPatch,
599
+ ) -> None:
600
+ import src.agent.metrics_collector as metrics_collector_module
601
+
602
+ fake_tracer = _FakeTracer()
603
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
604
+
605
+ room = _FakeRoom()
606
+ collector = MetricsCollector(
607
+ room=room, # type: ignore[arg-type]
608
+ model_name="moonshine",
609
+ room_name=room.name,
610
+ room_id="RM123",
611
+ participant_id=None,
612
+ fallback_session_prefix="console",
613
+ fallback_participant_prefix="console",
614
+ langfuse_enabled=True,
615
+ )
616
+
617
+ async def _run() -> None:
618
+ await collector.on_user_input_transcribed("console participant", is_final=True)
619
+ await collector.on_metrics_collected(_make_llm_metrics("speech-console-participant"))
620
+ await collector.on_conversation_item_added(role="assistant", content="ok")
621
+ await collector.on_metrics_collected(_make_tts_metrics("speech-console-participant"))
622
+ await collector.wait_for_pending_trace_tasks()
623
+
624
+ asyncio.run(_run())
625
+
626
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
627
+ assert len(turn_spans) == 1
628
+ participant_id = turn_spans[0].attributes["participant_id"]
629
+ assert participant_id.startswith("console_")
630
+ assert participant_id != "unknown-participant"
631
+
632
+
633
+ def test_real_participant_metadata_overrides_fallback_for_pending_turns(
634
+ monkeypatch: pytest.MonkeyPatch,
635
+ ) -> None:
636
+ import src.agent.metrics_collector as metrics_collector_module
637
+
638
+ fake_tracer = _FakeTracer()
639
+ monkeypatch.setattr(metrics_collector_module, "tracer", fake_tracer)
640
+
641
+ room = _FakeRoom()
642
+ collector = MetricsCollector(
643
+ room=room, # type: ignore[arg-type]
644
+ model_name="moonshine",
645
+ room_name=room.name,
646
+ room_id="RM123",
647
+ participant_id=None,
648
+ fallback_session_prefix="console",
649
+ fallback_participant_prefix="console",
650
+ langfuse_enabled=True,
651
+ )
652
+
653
+ async def _run() -> None:
654
+ await collector.on_user_input_transcribed("override participant", is_final=True)
655
+ await collector.on_metrics_collected(_make_llm_metrics("speech-override-participant"))
656
+ await collector.on_metrics_collected(_make_tts_metrics("speech-override-participant"))
657
+ await collector.on_session_metadata(
658
+ session_id="session-real-participant",
659
+ participant_id="web-real-participant",
660
+ )
661
+ await collector.on_conversation_item_added(role="assistant", content="reply")
662
+ await collector.wait_for_pending_trace_tasks()
663
+
664
+ asyncio.run(_run())
665
+
666
+ turn_spans = [span for span in fake_tracer.spans if span.name == "turn"]
667
+ assert len(turn_spans) == 1
668
+ assert turn_spans[0].attributes["session_id"] == "session-real-participant"
669
+ assert turn_spans[0].attributes["participant_id"] == "web-real-participant"
tests/test_pocket_tts_plugin.py CHANGED
@@ -27,6 +27,7 @@ def pocket_plugin(monkeypatch: pytest.MonkeyPatch) -> Any:
27
  "raise_on_generate": None,
28
  "active_generations": 0,
29
  "max_active_generations": 0,
 
30
  }
31
 
32
  class _FakeModel:
@@ -46,6 +47,7 @@ def pocket_plugin(monkeypatch: pytest.MonkeyPatch) -> Any:
46
  ) -> Generator[np.ndarray[Any, np.dtype[np.float32]], None, None]:
47
  calls["state"] = state
48
  calls["text"] = text
 
49
  calls["copy_state"] = copy_state
50
  calls["active_generations"] += 1
51
  calls["max_active_generations"] = max(
@@ -203,6 +205,30 @@ def test_stream_emits_before_generation_completes(pocket_plugin: Any) -> None:
203
  asyncio.run(_run())
204
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def test_chunked_generation_serializes_concurrent_requests(pocket_plugin: Any) -> None:
207
  module = pocket_plugin["module"]
208
  pocket_plugin["per_chunk_sleep"] = 0.03
@@ -253,5 +279,55 @@ def test_generation_timeout_is_mapped_to_api_timeout_error(pocket_plugin: Any) -
253
  gate.set()
254
 
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  async def _collect_events(stream: Any) -> list[Any]:
257
  return [event async for event in stream]
 
27
  "raise_on_generate": None,
28
  "active_generations": 0,
29
  "max_active_generations": 0,
30
+ "texts": [],
31
  }
32
 
33
  class _FakeModel:
 
47
  ) -> Generator[np.ndarray[Any, np.dtype[np.float32]], None, None]:
48
  calls["state"] = state
49
  calls["text"] = text
50
+ calls["texts"].append(text)
51
  calls["copy_state"] = copy_state
52
  calls["active_generations"] += 1
53
  calls["max_active_generations"] = max(
 
205
  asyncio.run(_run())
206
 
207
 
208
+ def test_stream_uses_single_segment_for_one_flush(pocket_plugin: Any) -> None:
209
+ module = pocket_plugin["module"]
210
+ tts_v = module.PocketTTS(voice="alba")
211
+ long_text = (
212
+ "First sentence with enough words to trigger internal chunking. " * 6
213
+ + "Second sentence also long enough to split. " * 6
214
+ )
215
+
216
+ async def _run() -> None:
217
+ async with tts_v.stream() as synth_stream:
218
+ synth_stream.push_text(long_text)
219
+ synth_stream.end_input()
220
+ events = await asyncio.wait_for(_collect_events(synth_stream), timeout=3.0)
221
+
222
+ segment_ids = {
223
+ event.segment_id
224
+ for event in events
225
+ if not event.is_final and isinstance(event.segment_id, str) and event.segment_id
226
+ }
227
+ assert len(segment_ids) == 1
228
+
229
+ asyncio.run(_run())
230
+
231
+
232
  def test_chunked_generation_serializes_concurrent_requests(pocket_plugin: Any) -> None:
233
  module = pocket_plugin["module"]
234
  pocket_plugin["per_chunk_sleep"] = 0.03
 
279
  gate.set()
280
 
281
 
282
+ def test_sanitize_tts_text_removes_markdown_noise(pocket_plugin: Any) -> None:
283
+ module = pocket_plugin["module"]
284
+ raw_text = """
285
+ ## Title
286
+ - **Bold** item with [link text](https://example.com)
287
+ 1. `code` item
288
+ """
289
+
290
+ sanitized = module._sanitize_tts_text(raw_text)
291
+ assert "##" not in sanitized
292
+ assert "**" not in sanitized
293
+ assert "`" not in sanitized
294
+ assert "[link text]" not in sanitized
295
+ assert "(https://example.com)" not in sanitized
296
+ assert "link text" in sanitized
297
+ assert "Bold item with" in sanitized
298
+
299
+
300
+ def test_chunk_tts_text_respects_length_limit(pocket_plugin: Any) -> None:
301
+ module = pocket_plugin["module"]
302
+ text = " ".join(["word"] * 80)
303
+
304
+ chunks = module._chunk_tts_text(text, max_chars=40)
305
+ assert len(chunks) > 1
306
+ assert all(len(chunk) <= 40 for chunk in chunks)
307
+ assert " ".join(chunks).replace(" ", " ").strip() == text
308
+
309
+
310
+ def test_chunked_synthesize_sanitizes_and_splits_long_text(pocket_plugin: Any) -> None:
311
+ module = pocket_plugin["module"]
312
+ tts_v = module.PocketTTS(voice="alba")
313
+ text = (
314
+ "## Header\n"
315
+ "- **First** item with [a link](https://example.com).\n"
316
+ + "Second sentence keeps going with enough words to exceed the segment limit. " * 5
317
+ + "Third sentence keeps going with enough words to exceed the segment limit. " * 5
318
+ )
319
+
320
+ async def _run() -> None:
321
+ await _collect_events(tts_v.synthesize(text))
322
+
323
+ asyncio.run(_run())
324
+
325
+ generated_texts = pocket_plugin["texts"]
326
+ assert len(generated_texts) >= 2
327
+ assert all(len(part) <= module.MAX_TTS_SEGMENT_CHARS for part in generated_texts)
328
+ assert all("**" not in part and "`" not in part for part in generated_texts)
329
+ assert all("https://example.com" not in part for part in generated_texts)
330
+
331
+
332
  async def _collect_events(stream: Any) -> list[Any]:
333
  return [event async for event in stream]