Spaces:

dvalle08
/

open-voice-agent

Running

App Files Files Community

dvalle08 committed 17 days ago

Commit

5e32359

1 Parent(s): 0bfc688

Refactor .gitignore and enhance Pocket TTS and Moonshine STT: Simplify .gitignore by removing unnecessary entries and adding environment-specific files. Update Pocket TTS to use a configurable sample rate and replace UUID with shortuuid for request IDs. Modify Moonshine STT to use NotGivenOr for language parameter and update stream class inheritance for better clarity.

Browse files

Files changed (4) hide show

.gitignore +23 -182
src/agent/agent.py +1 -0
src/plugins/moonshine_stt/stt.py +6 -7
src/plugins/pocket_tts/tts.py +8 -7

.gitignore CHANGED Viewed

@@ -1,191 +1,32 @@
-# Created by https://www.toptal.com/developers/gitignore/api/python,pythonvanilla
-# Edit at https://www.toptal.com/developers/gitignore?templates=python,pythonvanilla
-### Python ###
-# Byte-compiled / optimized / DLL files
 __pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
 *.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-.pybuilder/
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
 dev/
-nvidia_services/cache/asr/
-nvidia_services/cache/tts/
-.claude/
-.cursor/
 .pytest_cache/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Cython debug symbols
-cython_debug/
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-### Python Patch ###
-# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
-poetry.toml
-# ruff
-.ruff_cache/
-# LSP config files
-pyrightconfig.json
-### PythonVanilla ###
-# Byte-compiled / optimized / DLL files
-# C extensions
-# Distribution / packaging
-# Installer logs
-# Translations
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-# End of https://www.toptal.com/developers/gitignore/api/python,pythonvanillanvidia_services/cache/

+# Environment
+.env
+.venv/
+.streamlit/
+# Python
 __pycache__/
+*.pyc
+*.pyo
 *.egg-info/
+dist/
+build/
+# IDE
+.cursor/
+.cursorignore
+.claude/
+CLAUDE.md
+# Dev files
 dev/
 .pytest_cache/
+# Model weights (never commit)
+*.onnx
+*.pt
+*.bin
+*.safetensors
+model_cache/
+# OS
+.DS_Store

src/agent/agent.py CHANGED Viewed

@@ -91,6 +91,7 @@ async def session_handler(ctx: agents.JobContext) -> None:
         voice=settings.voice.POCKET_TTS_VOICE,
         temperature=settings.voice.POCKET_TTS_TEMPERATURE,
         lsd_decode_steps=settings.voice.POCKET_TTS_LSD_DECODE_STEPS,
         metrics_callback=tts_metrics_callback,
     )

         voice=settings.voice.POCKET_TTS_VOICE,
         temperature=settings.voice.POCKET_TTS_TEMPERATURE,
         lsd_decode_steps=settings.voice.POCKET_TTS_LSD_DECODE_STEPS,
+        sample_rate=settings.voice.SAMPLE_RATE_OUTPUT,
         metrics_callback=tts_metrics_callback,
     )

src/plugins/moonshine_stt/stt.py CHANGED Viewed

@@ -10,8 +10,8 @@ import torch
 from transformers import AutoProcessor, MoonshineStreamingForConditionalGeneration
 from livekit import rtc
 from livekit.agents import stt
-from livekit.agents.types import APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS
-from livekit.agents.utils import AudioBuffer
 @dataclass
@@ -49,7 +49,7 @@ class MoonshineSTT(stt.STT):
         self,
         buffer: AudioBuffer,
         *,
-        language: str | None = None,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
@@ -85,7 +85,7 @@ class MoonshineSTT(stt.STT):
         *,
         language: str | None = None,
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-    ) -> stt.SpeechStream:
         config = self._sanitize_options(language=language)
         return MoonshineSTTStream(
             stt=self,
@@ -98,7 +98,7 @@ class MoonshineSTT(stt.STT):
         )
-class MoonshineSTTStream(stt.SpeechStream):
     def __init__(
         self,
         *,
@@ -148,8 +148,7 @@ class MoonshineSTTStream(stt.SpeechStream):
     async def _finalize_segment(self) -> None:
         # Generate a unique request ID for this segment
-        import uuid
-        request_id = str(uuid.uuid4())
         if len(self._buffer) == 0:
             # Don't emit metrics for empty segments - just return

 from transformers import AutoProcessor, MoonshineStreamingForConditionalGeneration
 from livekit import rtc
 from livekit.agents import stt
+from livekit.agents.types import APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
+from livekit.agents.utils import AudioBuffer, shortuuid
 @dataclass
         self,
         buffer: AudioBuffer,
         *,
+        language: NotGivenOr[str] = NOT_GIVEN,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
         *,
         language: str | None = None,
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> stt.RecognizeStream:
         config = self._sanitize_options(language=language)
         return MoonshineSTTStream(
             stt=self,
         )
+class MoonshineSTTStream(stt.RecognizeStream):
     def __init__(
         self,
         *,
     async def _finalize_segment(self) -> None:
         # Generate a unique request ID for this segment
+        request_id = shortuuid("STT_")
         if len(self._buffer) == 0:
             # Don't emit metrics for empty segments - just return

src/plugins/pocket_tts/tts.py CHANGED Viewed

@@ -4,7 +4,6 @@ from __future__ import annotations
 import asyncio
 import logging
 import time
-import uuid
 from typing import Callable
 import numpy as np
@@ -14,9 +13,9 @@ from scipy import signal
 from livekit.agents import tts
 from livekit.agents.types import APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS
-from src.core.logger import logger
-from src.core.settings import settings
 # Reduce verbosity of pocket_tts library to avoid console spam
 logging.getLogger("pocket_tts").setLevel(logging.WARNING)
@@ -34,6 +33,7 @@ class PocketTTS(tts.TTS):
         voice: str = "alba",
         temperature: float = 0.7,
         lsd_decode_steps: int = 1,
         metrics_callback: OptionalTTSMetricsCallback = None,
     ) -> None:
         """Initialize Pocket TTS plugin.
@@ -43,9 +43,10 @@ class PocketTTS(tts.TTS):
                    or path to audio file for custom voice
             temperature: Sampling temperature (0.0-2.0)
             lsd_decode_steps: LSD decoding steps (higher = better quality, slower)
         """
         # Use the configured output sample rate (default 48000 Hz)
-        self._output_sample_rate = settings.voice.SAMPLE_RATE_OUTPUT
         self._native_sample_rate = 24000  # Pocket TTS native rate
         super().__init__(
@@ -149,7 +150,7 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
         Args:
             output_emitter: Audio emitter for pushing generated audio
         """
-        request_id = str(uuid.uuid4())
         output_emitter.initialize(
             request_id=request_id,
@@ -165,7 +166,7 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
             if isinstance(data, self._FlushSentinel):
                 if text_buffer.strip():
                     # Create a new segment for each text chunk
-                    segment_id = str(uuid.uuid4())
                     output_emitter.start_segment(segment_id=segment_id)
                     await self._synthesize_segment(text_buffer, output_emitter, segment_id)
                     output_emitter.end_segment()
@@ -177,7 +178,7 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
         # Process any remaining text
         if text_buffer.strip():
-            segment_id = str(uuid.uuid4())
             output_emitter.start_segment(segment_id=segment_id)
             await self._synthesize_segment(text_buffer, output_emitter, segment_id)
             output_emitter.end_segment()

 import asyncio
 import logging
 import time
 from typing import Callable
 import numpy as np
 from livekit.agents import tts
 from livekit.agents.types import APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS
+from livekit.agents.utils import shortuuid
+logger = logging.getLogger(__name__)
 # Reduce verbosity of pocket_tts library to avoid console spam
 logging.getLogger("pocket_tts").setLevel(logging.WARNING)
         voice: str = "alba",
         temperature: float = 0.7,
         lsd_decode_steps: int = 1,
+        sample_rate: int = 48000,
         metrics_callback: OptionalTTSMetricsCallback = None,
     ) -> None:
         """Initialize Pocket TTS plugin.
                    or path to audio file for custom voice
             temperature: Sampling temperature (0.0-2.0)
             lsd_decode_steps: LSD decoding steps (higher = better quality, slower)
+            sample_rate: Output sample rate in Hz (default 48000)
         """
         # Use the configured output sample rate (default 48000 Hz)
+        self._output_sample_rate = sample_rate
         self._native_sample_rate = 24000  # Pocket TTS native rate
         super().__init__(
         Args:
             output_emitter: Audio emitter for pushing generated audio
         """
+        request_id = shortuuid("TTS_")
         output_emitter.initialize(
             request_id=request_id,
             if isinstance(data, self._FlushSentinel):
                 if text_buffer.strip():
                     # Create a new segment for each text chunk
+                    segment_id = shortuuid("SEG_")
                     output_emitter.start_segment(segment_id=segment_id)
                     await self._synthesize_segment(text_buffer, output_emitter, segment_id)
                     output_emitter.end_segment()
         # Process any remaining text
         if text_buffer.strip():
+            segment_id = shortuuid("SEG_")
             output_emitter.start_segment(segment_id=segment_id)
             await self._synthesize_segment(text_buffer, output_emitter, segment_id)
             output_emitter.end_segment()