dvalle08 commited on
Commit
11c8a27
·
1 Parent(s): af17212

Add Pocket TTS plugin with configuration settings and integration into LLMFactory

Browse files
pyproject.toml CHANGED
@@ -14,4 +14,5 @@ dependencies = [
14
  "livekit-agents[silero,turn-detector]~=1.3",
15
  "livekit-plugins-noise-cancellation~=0.2",
16
  "langgraph>=1.0.8",
 
17
  ]
 
14
  "livekit-agents[silero,turn-detector]~=1.3",
15
  "livekit-plugins-noise-cancellation~=0.2",
16
  "langgraph>=1.0.8",
17
+ "pydantic-settings>=2.12.0",
18
  ]
src/agent/llm_factory.py CHANGED
@@ -3,9 +3,9 @@ from typing import Any, Union
3
 
4
  from huggingface_hub import InferenceClient
5
  from transformers import pipeline
6
- from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
7
 
8
- from kokoro import KPipeline
9
  import torch
10
  from langchain_nvidia_ai_endpoints import ChatNVIDIA
11
 
@@ -32,40 +32,40 @@ class LLMFactory:
32
  max_completion_tokens=max_tokens,
33
  )
34
 
35
- @staticmethod
36
- def create_huggingface_llm(
37
- model_id: str,
38
- provider: str = "auto",
39
- temperature: float = settings.llm.LLM_TEMPERATURE,
40
- max_tokens: int = settings.llm.LLM_MAX_TOKENS,
41
- run_local: bool = False,
42
- ) -> ChatHuggingFace:
43
- if run_local:
44
- logger.info(f"Initializing local HuggingFace LLM: {model_id}")
45
- llm = HuggingFacePipeline.from_model_id(
46
- model_id=model_id,
47
- task="text-generation",
48
- pipeline_kwargs={
49
- "temperature": temperature,
50
- "max_new_tokens": max_tokens,
51
- },
52
- )
53
- return ChatHuggingFace(llm=llm)
54
-
55
- token = (settings.llm.HF_TOKEN or "").strip()
56
- if not token:
57
- raise ValueError("HF_TOKEN must be set to use the HuggingFace LLM provider.")
58
-
59
- logger.info(f"Initializing HuggingFace LLM: {model_id} via provider={provider}")
60
-
61
- llm = HuggingFaceEndpoint(
62
- repo_id=model_id,
63
- provider=provider,
64
- huggingfacehub_api_token=token,
65
- temperature=temperature,
66
- max_new_tokens=max_tokens,
67
- )
68
- return ChatHuggingFace(llm=llm)
69
 
70
  @staticmethod
71
  def create_huggingface_stt(
@@ -126,3 +126,38 @@ class LLMFactory:
126
  logger.info(f"Initializing Moonshine ONNX STT: {model_size}")
127
  from src.plugins.moonshine_stt import MoonshineSTT
128
  return MoonshineSTT(model_size=model_size, language=language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  from huggingface_hub import InferenceClient
5
  from transformers import pipeline
6
+ #from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
7
 
8
+ #from kokoro import KPipeline
9
  import torch
10
  from langchain_nvidia_ai_endpoints import ChatNVIDIA
11
 
 
32
  max_completion_tokens=max_tokens,
33
  )
34
 
35
+ # @staticmethod
36
+ # def create_huggingface_llm(
37
+ # model_id: str,
38
+ # provider: str = "auto",
39
+ # temperature: float = settings.llm.LLM_TEMPERATURE,
40
+ # max_tokens: int = settings.llm.LLM_MAX_TOKENS,
41
+ # run_local: bool = False,
42
+ # ) -> ChatHuggingFace:
43
+ # if run_local:
44
+ # logger.info(f"Initializing local HuggingFace LLM: {model_id}")
45
+ # llm = HuggingFacePipeline.from_model_id(
46
+ # model_id=model_id,
47
+ # task="text-generation",
48
+ # pipeline_kwargs={
49
+ # "temperature": temperature,
50
+ # "max_new_tokens": max_tokens,
51
+ # },
52
+ # )
53
+ # return ChatHuggingFace(llm=llm)
54
+
55
+ # token = (settings.llm.HF_TOKEN or "").strip()
56
+ # if not token:
57
+ # raise ValueError("HF_TOKEN must be set to use the HuggingFace LLM provider.")
58
+
59
+ # logger.info(f"Initializing HuggingFace LLM: {model_id} via provider={provider}")
60
+
61
+ # llm = HuggingFaceEndpoint(
62
+ # repo_id=model_id,
63
+ # provider=provider,
64
+ # huggingfacehub_api_token=token,
65
+ # temperature=temperature,
66
+ # max_new_tokens=max_tokens,
67
+ # )
68
+ # return ChatHuggingFace(llm=llm)
69
 
70
  @staticmethod
71
  def create_huggingface_stt(
 
126
  logger.info(f"Initializing Moonshine ONNX STT: {model_size}")
127
  from src.plugins.moonshine_stt import MoonshineSTT
128
  return MoonshineSTT(model_size=model_size, language=language)
129
+
130
+ @staticmethod
131
+ def create_pocket_tts(
132
+ voice: str | None = None,
133
+ temperature: float | None = None,
134
+ lsd_decode_steps: int | None = None,
135
+ ) -> "PocketTTS":
136
+ """Initialize Pocket TTS plugin.
137
+
138
+ Args:
139
+ voice: Voice name (alba, marius, etc.) or path to audio file.
140
+ If None, uses settings.voice.POCKET_TTS_VOICE
141
+ temperature: Sampling temperature (0.0-2.0).
142
+ If None, uses settings.voice.POCKET_TTS_TEMPERATURE
143
+ lsd_decode_steps: LSD decoding steps for quality.
144
+ If None, uses settings.voice.POCKET_TTS_LSD_DECODE_STEPS
145
+
146
+ Returns:
147
+ PocketTTS plugin instance
148
+ """
149
+ from src.plugins.pocket_tts import PocketTTS
150
+
151
+ if voice is None:
152
+ voice = settings.voice.POCKET_TTS_VOICE
153
+ if temperature is None:
154
+ temperature = settings.voice.POCKET_TTS_TEMPERATURE
155
+ if lsd_decode_steps is None:
156
+ lsd_decode_steps = settings.voice.POCKET_TTS_LSD_DECODE_STEPS
157
+
158
+ logger.info(f"Initializing Pocket TTS: voice={voice}, temp={temperature}, lsd_steps={lsd_decode_steps}")
159
+ return PocketTTS(
160
+ voice=voice,
161
+ temperature=temperature,
162
+ lsd_decode_steps=lsd_decode_steps,
163
+ )
src/core/settings.py CHANGED
@@ -69,6 +69,23 @@ class VoiceSettings(CoreSettings):
69
  description="Moonshine model size: tiny, base, or small"
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  class LLMSettings(CoreSettings):
74
  NVIDIA_API_KEY: Optional[str] = Field(default=None)
 
69
  description="Moonshine model size: tiny, base, or small"
70
  )
71
 
72
+ # TTS (Text-to-Speech) Settings - Pocket TTS
73
+ POCKET_TTS_VOICE: str = Field(
74
+ default="alba",
75
+ description="Default voice (alba, marius, javert, jean, fantine, cosette, eponine, azelma) or path to audio file"
76
+ )
77
+ POCKET_TTS_TEMPERATURE: float = Field(
78
+ default=0.7,
79
+ ge=0.0,
80
+ le=2.0,
81
+ description="Sampling temperature for generation"
82
+ )
83
+ POCKET_TTS_LSD_DECODE_STEPS: int = Field(
84
+ default=1,
85
+ ge=1,
86
+ description="LSD decoding steps (higher = better quality, slower)"
87
+ )
88
+
89
 
90
  class LLMSettings(CoreSettings):
91
  NVIDIA_API_KEY: Optional[str] = Field(default=None)
src/plugins/pocket_tts/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Pocket TTS Plugin - Local text-to-speech using Kyutai's pocket-tts."""
2
+
3
+ from .tts import PocketTTS
4
+
5
+ __all__ = ["PocketTTS"]
src/plugins/pocket_tts/tts.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# tts.py - Pocket TTS Plugin for LiveKit Agents
from __future__ import annotations

import asyncio
import logging
import uuid
from typing import Any

import numpy as np
import torch
from pocket_tts import TTSModel

from livekit.agents import tts
from livekit.agents.types import APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS

from src.core.logger import logger

# Silence the chatty loggers inside the pocket_tts library so synthesis
# does not spam the console.
for _noisy_logger in (
    "pocket_tts",
    "pocket_tts.models.tts_model",
    "pocket_tts.utils.utils",
    "pocket_tts.conditioners.text",
):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
25
class PocketTTS(tts.TTS):
    """LiveKit TTS plugin backed by Kyutai's local pocket-tts model.

    Produces 24 kHz mono PCM audio and supports both batch synthesis and
    progressive streaming via PocketSynthesizeStream.
    """

    def __init__(
        self,
        *,
        voice: str = "alba",
        temperature: float = 0.7,
        lsd_decode_steps: int = 1,
    ) -> None:
        """Initialize Pocket TTS plugin.

        Args:
            voice: Voice name (alba, marius, javert, jean, fantine, cosette, eponine, azelma)
                or path to audio file for custom voice
            temperature: Sampling temperature (0.0-2.0)
            lsd_decode_steps: LSD decoding steps (higher = better quality, slower)

        Raises:
            ValueError: If the model fails to load, if the voice file is
                missing, or if neither the requested voice nor the "alba"
                fallback can be loaded.
        """
        super().__init__(
            capabilities=tts.TTSCapabilities(streaming=True, aligned_transcript=False),
            sample_rate=24000,
            num_channels=1,
        )

        self._voice = voice
        self._temperature = temperature
        self._lsd_decode_steps = lsd_decode_steps

        # Load the model in its own try block: previously a model-load
        # failure fell into the voice-fallback path with self._model unset,
        # surfacing a confusing AttributeError-derived message instead of
        # the real cause.
        try:
            logger.info(f"Loading Pocket TTS model: temp={temperature}, lsd_steps={lsd_decode_steps}")
            self._model = TTSModel.load_model(
                temp=temperature,
                lsd_decode_steps=lsd_decode_steps,
            )
            logger.info("Pocket TTS model loaded successfully")
        except Exception as e:
            raise ValueError(f"Failed to load Pocket TTS model: {e}") from e

        # Voice loading: a missing voice file is a hard error; any other
        # failure falls back to the built-in "alba" voice.
        try:
            logger.info(f"Loading voice state: {voice}")
            self._voice_state = self._model.get_state_for_audio_prompt(voice, truncate=True)
            logger.info(f"Voice state loaded for: {voice}")
        except FileNotFoundError as e:
            raise ValueError(f"Failed to load voice '{voice}': {e}") from e
        except Exception as e:
            logger.warning(f"Failed to load voice '{voice}': {e}, falling back to 'alba'")
            try:
                self._voice = "alba"
                self._voice_state = self._model.get_state_for_audio_prompt("alba", truncate=True)
                logger.info("Fallback to 'alba' voice successful")
            except Exception as fallback_error:
                raise ValueError(
                    f"Failed to load fallback voice 'alba': {fallback_error}"
                ) from fallback_error

    @property
    def model(self) -> str:
        """Model identifier reported to LiveKit."""
        return "pocket-tts"

    @property
    def provider(self) -> str:
        """Provider identifier reported to LiveKit."""
        return "kyutai"

    def synthesize(
        self,
        text: str,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> tts.ChunkedStream:
        """Synthesize text to speech using batch generation.

        Args:
            text: Text to synthesize
            conn_options: API connection options

        Returns:
            ChunkedStream for batch synthesis
        """
        # _synthesize_with_stream is presumably provided by the tts.TTS
        # base class (not visible here) — TODO confirm against livekit-agents.
        return self._synthesize_with_stream(text, conn_options=conn_options)

    def stream(
        self,
        *,
        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
    ) -> tts.SynthesizeStream:
        """Create a streaming synthesis stream.

        Args:
            conn_options: API connection options

        Returns:
            PocketSynthesizeStream for progressive synthesis
        """
        return PocketSynthesizeStream(
            tts=self,
            conn_options=conn_options,
        )
117
+
118
class PocketSynthesizeStream(tts.SynthesizeStream):
    """Progressive synthesis stream for PocketTTS.

    Buffers incoming text, synthesizes a segment on each flush sentinel,
    and pushes 24 kHz mono int16 PCM to the audio emitter.
    """

    def __init__(
        self,
        *,
        tts: PocketTTS,
        conn_options: APIConnectOptions,
    ) -> None:
        """Initialize streaming synthesis stream.

        Args:
            tts: PocketTTS instance
            conn_options: API connection options
        """
        super().__init__(tts=tts, conn_options=conn_options)
        self._tts = tts

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Process input stream and generate audio progressively.

        Args:
            output_emitter: Audio emitter for pushing generated audio
        """
        output_emitter.initialize(
            request_id=str(uuid.uuid4()),
            sample_rate=24000,
            num_channels=1,
            mime_type="audio/pcm",
            stream=True,
        )

        # Segments are opened lazily, only when there is text to speak.
        # The previous version started a segment eagerly and started a new
        # one after every flush, which emitted spurious empty segments when
        # a flush arrived with no following text or at end-of-input.
        segment_open = False
        text_buffer = ""

        def _open_segment() -> None:
            nonlocal segment_open
            if not segment_open:
                output_emitter.start_segment(segment_id=str(uuid.uuid4()))
                segment_open = True

        async for data in self._input_ch:
            if isinstance(data, self._FlushSentinel):
                if text_buffer.strip():
                    _open_segment()
                    await self._synthesize_segment(text_buffer, output_emitter)
                    text_buffer = ""
                if segment_open:
                    output_emitter.end_segment()
                    segment_open = False
                continue

            text_buffer += data

        # Flush any trailing text that arrived without a final sentinel.
        if text_buffer.strip():
            _open_segment()
            await self._synthesize_segment(text_buffer, output_emitter)

        if segment_open:
            output_emitter.end_segment()

    async def _synthesize_segment(
        self,
        text: str,
        output_emitter: tts.AudioEmitter,
    ) -> None:
        """Synthesize a text segment and push audio chunks to emitter.

        Args:
            text: Text segment to synthesize
            output_emitter: Audio emitter for pushing generated audio
        """
        try:
            def _generate_and_push() -> None:
                # copy_state=True keeps the shared voice state immutable so
                # successive segments start from the same prompt.
                for audio_chunk in self._tts._model.generate_audio_stream(
                    self._tts._voice_state,
                    text,
                    copy_state=True,
                ):
                    output_emitter.push(self._tensor_to_pcm_bytes(audio_chunk))

            # Generation is blocking/CPU-bound; run off the event loop.
            await asyncio.to_thread(_generate_and_push)

        except Exception as e:
            logger.error(f"Error synthesizing segment: {e}")
            raise

    def _tensor_to_pcm_bytes(self, audio_tensor: torch.Tensor) -> bytes:
        """Convert audio tensor to PCM bytes.

        Args:
            audio_tensor: Audio tensor with shape [samples] or [channels, samples]

        Returns:
            PCM audio bytes (int16)
        """
        # Downmix multi-channel audio to mono by averaging channels.
        if audio_tensor.ndim > 1:
            audio_tensor = audio_tensor.mean(dim=0)

        # Clamp to [-1, 1] then scale to int16 full range.
        audio_int16 = (audio_tensor.clamp(-1.0, 1.0) * 32767.0).short()

        return audio_int16.cpu().numpy().tobytes()
testing/livekit_custom.py CHANGED
@@ -24,6 +24,7 @@ from huggingface_hub import InferenceClient
24
  import io
25
  import wave
26
  from src.plugins.moonshine_stt import MoonshineSTT
 
27
 
28
  load_dotenv(".env")
29
 
@@ -76,7 +77,7 @@ async def my_agent(ctx: agents.JobContext):
76
  session = AgentSession(
77
  stt=MoonshineSTT(model_id="UsefulSensors/moonshine-streaming-medium"),
78
  llm=langchain.LLMAdapter(create_nvidia_workflow()),
79
- tts="cartesia/sonic-3:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
80
  vad=silero.VAD.load(),
81
  turn_detection=MultilingualModel(),
82
  )
 
24
  import io
25
  import wave
26
  from src.plugins.moonshine_stt import MoonshineSTT
27
+ from src.agent.llm_factory import LLMFactory
28
 
29
  load_dotenv(".env")
30
 
 
77
  session = AgentSession(
78
  stt=MoonshineSTT(model_id="UsefulSensors/moonshine-streaming-medium"),
79
  llm=langchain.LLMAdapter(create_nvidia_workflow()),
80
+ tts=LLMFactory.create_pocket_tts(voice="alba"),
81
  vad=silero.VAD.load(),
82
  turn_detection=MultilingualModel(),
83
  )
uv.lock CHANGED
@@ -2088,6 +2088,7 @@ dependencies = [
2088
  { name = "livekit-agents", extra = ["silero", "turn-detector"] },
2089
  { name = "livekit-plugins-noise-cancellation" },
2090
  { name = "nemo-toolkit", extra = ["asr"] },
 
2091
  { name = "python-dotenv" },
2092
  { name = "torch" },
2093
  { name = "torchaudio" },
@@ -2101,6 +2102,7 @@ requires-dist = [
2101
  { name = "livekit-agents", extras = ["silero", "turn-detector"], specifier = "~=1.3" },
2102
  { name = "livekit-plugins-noise-cancellation", specifier = "~=0.2" },
2103
  { name = "nemo-toolkit", extras = ["asr"] },
 
2104
  { name = "python-dotenv", specifier = ">=1.2.1" },
2105
  { name = "torch", specifier = "==2.10.0" },
2106
  { name = "torchaudio", specifier = "==2.10.0" },
@@ -2650,6 +2652,20 @@ wheels = [
2650
  { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" },
2651
  ]
2652
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2653
  [[package]]
2654
  name = "pydub"
2655
  version = "0.25.1"
 
2088
  { name = "livekit-agents", extra = ["silero", "turn-detector"] },
2089
  { name = "livekit-plugins-noise-cancellation" },
2090
  { name = "nemo-toolkit", extra = ["asr"] },
2091
+ { name = "pydantic-settings" },
2092
  { name = "python-dotenv" },
2093
  { name = "torch" },
2094
  { name = "torchaudio" },
 
2102
  { name = "livekit-agents", extras = ["silero", "turn-detector"], specifier = "~=1.3" },
2103
  { name = "livekit-plugins-noise-cancellation", specifier = "~=0.2" },
2104
  { name = "nemo-toolkit", extras = ["asr"] },
2105
+ { name = "pydantic-settings", specifier = ">=2.12.0" },
2106
  { name = "python-dotenv", specifier = ">=1.2.1" },
2107
  { name = "torch", specifier = "==2.10.0" },
2108
  { name = "torchaudio", specifier = "==2.10.0" },
 
2652
  { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" },
2653
  ]
2654
 
2655
+ [[package]]
2656
+ name = "pydantic-settings"
2657
+ version = "2.12.0"
2658
+ source = { registry = "https://pypi.org/simple" }
2659
+ dependencies = [
2660
+ { name = "pydantic" },
2661
+ { name = "python-dotenv" },
2662
+ { name = "typing-inspection" },
2663
+ ]
2664
+ sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" }
2665
+ wheels = [
2666
+ { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" },
2667
+ ]
2668
+
2669
  [[package]]
2670
  name = "pydub"
2671
  version = "0.25.1"