Spaces:

dvalle08
/

open-voice-agent

Running

App Files Files Community

dvalle08 committed on Feb 8

Commit

9af190b

1 Parent(s): ffad511

Refactor Open Voice Agent: Transition to Hatch for build system, restructure agent components, and streamline conversation graph

Browse files

Files changed (23) hide show

.gitignore +3 -1
main.py +0 -84
pyproject.toml +7 -0
src/agent/__init__.py +4 -4
testing/livekit_custom.py → src/agent/agent.py +20 -66
src/agent/graph.py +18 -112
src/agent/llm_factory.py +0 -163
src/agent/prompts.py +0 -41
src/agent/state.py +0 -10
src/api/__init__.py +0 -5
src/api/main.py +0 -69
src/core/__init__.py +4 -0
src/core/settings.py +13 -45
src/models/__init__.py +0 -1
src/models/voice/__init__.py +0 -18
src/models/voice/base.py +0 -53
src/models/voice/types.py +0 -43
src/plugins/pocket_tts/tts.py +0 -5
src/streamlit_app.py +0 -288
testing/asr_moonshine.py +0 -48
testing/nvidia_.py +0 -4
testing/pocket_tts_test.py +0 -13
uv.lock +1 -1

.gitignore CHANGED Viewed

@@ -119,7 +119,9 @@ venv.bak/
 dev/
 nvidia_services/cache/asr/
 nvidia_services/cache/tts/
-test/
 # Spyder project settings
 .spyderproject
 .spyproject

 dev/
 nvidia_services/cache/asr/
 nvidia_services/cache/tts/
+.claude/
+.cursor/
+.pytest_cache/
 # Spyder project settings
 .spyderproject
 .spyproject

main.py DELETED Viewed

@@ -1,84 +0,0 @@
-import argparse
-import multiprocessing
-import subprocess
-import sys
-from src.core.logger import logger
-def run_api():
-    logger.info("Starting FastAPI server...")
-    import uvicorn
-    from src.core.settings import settings
-    uvicorn.run(
-        "src.api.main:app",
-        host=settings.api.API_HOST,
-        port=settings.api.API_PORT,
-        workers=settings.api.API_WORKERS,
-        reload=True,
-        log_level="info",
-    )
-def run_streamlit():
-    logger.info("Starting Streamlit UI...")
-    subprocess.run([
-        sys.executable,
-        "-m",
-        "streamlit",
-        "run",
-        "src/streamlit_app.py",
-        "--server.port=8501",
-        "--server.address=localhost",
-    ])
-def run_both():
-    logger.info("Starting both FastAPI server and Streamlit UI...")
-    api_process = multiprocessing.Process(target=run_api, name="FastAPI")
-    streamlit_process = multiprocessing.Process(target=run_streamlit, name="Streamlit")
-    try:
-        api_process.start()
-        streamlit_process.start()
-        api_process.join()
-        streamlit_process.join()
-    except KeyboardInterrupt:
-        logger.info("Shutting down...")
-        api_process.terminate()
-        streamlit_process.terminate()
-        api_process.join()
-        streamlit_process.join()
-        logger.info("Shutdown complete")
-def main():
-    parser = argparse.ArgumentParser(
-        description="Open Voice Agent - Real-time AI voice conversations"
-    )
-    parser.add_argument(
-        "mode",
-        choices=["api", "streamlit", "both"],
-        default="both",
-        nargs="?",
-        help="Run mode: 'api' (FastAPI server), 'streamlit' (UI), or 'both' (default)",
-    )
-    args = parser.parse_args()
-    logger.info(f"Starting Open Voice Agent in '{args.mode}' mode...")
-    if args.mode == "api":
-        run_api()
-    elif args.mode == "streamlit":
-        run_streamlit()
-    else:
-        run_both()
-if __name__ == "__main__":
-    main()

pyproject.toml CHANGED Viewed

@@ -1,3 +1,10 @@
 [project]
 name = "open-voice-agent"
 version = "0.1.0"

+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src"]
 [project]
 name = "open-voice-agent"
 version = "0.1.0"

src/agent/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
-"""Conversation agent using LangGraph."""
-from src.agent.graph import create_conversation_graph
-from src.agent.state import ConversationState
-__all__ = ["create_conversation_graph", "ConversationState"]

+"""LiveKit voice agent using LangGraph."""
+from src.agent.graph import create_graph
+from src.agent.agent import Assistant
+__all__ = ["create_graph", "Assistant"]

testing/livekit_custom.py → src/agent/agent.py RENAMED Viewed

@@ -1,64 +1,14 @@
-from __future__ import annotations
-from pathlib import Path
-import sys
-project_root = Path(__file__).resolve().parents[1]
-if str(project_root) not in sys.path:
-    sys.path.insert(0, str(project_root))
-from dotenv import load_dotenv
 from livekit import agents, rtc
-from livekit.agents import AgentServer,AgentSession, Agent, room_io
 from livekit.plugins import noise_cancellation, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
-import os
-from langgraph.graph import StateGraph, MessagesState, START, END
-from langchain_nvidia_ai_endpoints import ChatNVIDIA
 from livekit.plugins import langchain
-from livekit.agents import stt, tts
-from huggingface_hub import InferenceClient
-import io
-import wave
 from src.plugins.moonshine_stt import MoonshineSTT
-from src.agent.llm_factory import LLMFactory
-load_dotenv(".env")
-# Simple LangGraph workflow with NVIDIA LLM
-def create_nvidia_workflow():
-    """Create a simple LangGraph workflow with NVIDIA ChatNVIDIA"""
-    # Initialize NVIDIA LLM
-    nvidia_llm = ChatNVIDIA(
-        model="meta/llama-3.1-8b-instruct",
-        api_key=os.getenv("NVIDIA_API_KEY"),
-        temperature=0.7,
-        max_tokens=150
-    )
-    # Define the conversation node
-    def call_model(state: MessagesState):
-        """Simple node that calls the NVIDIA LLM"""
-        response = nvidia_llm.invoke(state["messages"])
-        return {"messages": [response]}
-    # Build the graph
-    workflow = StateGraph(MessagesState)
-    # Add the single node
-    workflow.add_node("agent", call_model)
-    # Define the flow: START -> agent -> END
-    workflow.add_edge(START, "agent")
-    workflow.add_edge("agent", END)
-    # Compile and return
-    return workflow.compile()
 class Assistant(Agent):
@@ -70,32 +20,36 @@ class Assistant(Agent):
             You are curious, friendly, and have a sense of humor.""",
         )
 server = AgentServer()
 @server.rtc_session()
-async def my_agent(ctx: agents.JobContext):
     session = AgentSession(
-        stt=MoonshineSTT(model_id="UsefulSensors/moonshine-streaming-medium"),
-        llm=langchain.LLMAdapter(create_nvidia_workflow()),
-        tts=LLMFactory.create_pocket_tts(voice="alba"),
         vad=silero.VAD.load(),
         turn_detection=MultilingualModel(),
     )
     await session.start(
         room=ctx.room,
         agent=Assistant(),
         room_options=room_io.RoomOptions(
             audio_input=room_io.AudioInputOptions(
-                noise_cancellation=lambda params: noise_cancellation.BVCTelephony() if params.participant.kind == rtc.ParticipantKind.PARTICIPANT_KIND_SIP else noise_cancellation.BVC(),
             ),
         ),
     )
-    await session.generate_reply(
-        instructions="Greet the user and offer your assistance."
-    )
 if __name__ == "__main__":
-    agents.cli.run_app(server)

 from livekit import agents, rtc
+from livekit.agents import AgentServer, AgentSession, Agent, room_io
 from livekit.plugins import noise_cancellation, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
 from livekit.plugins import langchain
+from src.agent.graph import create_graph
 from src.plugins.moonshine_stt import MoonshineSTT
+from src.plugins.pocket_tts import PocketTTS
+from src.core.settings import settings
+from src.core.logger import logger
 class Assistant(Agent):
             You are curious, friendly, and have a sense of humor.""",
         )
 server = AgentServer()
 @server.rtc_session()
+async def session_handler(ctx: agents.JobContext) -> None:
     session = AgentSession(
+        stt=MoonshineSTT(model_id=settings.voice.MOONSHINE_MODEL_ID),
+        llm=langchain.LLMAdapter(create_graph()),
+        tts=PocketTTS(
+            voice=settings.voice.POCKET_TTS_VOICE,
+            temperature=settings.voice.POCKET_TTS_TEMPERATURE,
+            lsd_decode_steps=settings.voice.POCKET_TTS_LSD_DECODE_STEPS,
+        ),
         vad=silero.VAD.load(),
         turn_detection=MultilingualModel(),
     )
     await session.start(
         room=ctx.room,
         agent=Assistant(),
         room_options=room_io.RoomOptions(
             audio_input=room_io.AudioInputOptions(
+                noise_cancellation=lambda params: noise_cancellation.BVCTelephony()
+                if params.participant.kind == rtc.ParticipantKind.PARTICIPANT_KIND_SIP
+                else noise_cancellation.BVC(),
             ),
         ),
     )
+    await session.generate_reply(instructions="Greet the user and offer your assistance.")
 if __name__ == "__main__":
+    agents.cli.run_app(server)

src/agent/graph.py CHANGED Viewed

@@ -1,117 +1,23 @@
-from typing import Literal, Optional
-from langchain_core.language_models import BaseLanguageModel
-from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
-from langgraph.graph import StateGraph, END
-from langgraph.checkpoint.memory import MemorySaver
-from src.agent.llm_factory import LLMFactory
-from src.agent.prompts import get_system_prompt
-from src.agent.state import ConversationState
-from src.core.logger import logger
-def process_user_input(state: ConversationState) -> ConversationState:
-    transcript = state.get("current_transcript", "").strip()
-    if not transcript:
-        logger.debug("No transcript to process")
-        return state
-    logger.info(f"Processing user input: {transcript}")
-    messages = state.get("messages", [])
-    if not messages:
-        messages.append(SystemMessage(content=get_system_prompt()))
-    messages.append(HumanMessage(content=transcript))
-    return {
-        **state,
-        "messages": messages,
-        "current_transcript": "",
-    }
-def generate_response(state: ConversationState, llm: Optional[BaseLanguageModel] = None) -> ConversationState:
-    if llm is None:
-        llm = LLMFactory.create_llm()
-    messages = state.get("messages", [])
-    if not messages:
-        logger.warning("No messages to generate response from")
-        return state
-    logger.info("Generating AI response...")
-    try:
-        response = llm.invoke(messages)
-        if hasattr(response, "content"):
-            content = response.content
-        else:
-            content = str(response)
-        logger.info(f"Generated response: {content[:100]}...")
-        messages.append(AIMessage(content=content))
-        return {
-            **state,
-            "messages": messages,
-        }
-    except Exception as e:
-        logger.error(f"Error generating response: {e}")
-        fallback = "I'm sorry, I encountered an error. Could you please repeat that?"
-        messages.append(AIMessage(content=fallback))
-        return {
-            **state,
-            "messages": messages,
-        }
-def should_respond(state: ConversationState) -> Literal["generate", "wait"]:
-    turn_active = state.get("turn_active", False)
-    current_transcript = state.get("current_transcript", "").strip()
-    if turn_active:
-        logger.debug("Turn still active, waiting...")
-        return "wait"
-    if current_transcript:
-        logger.debug("Turn complete with transcript, generating response")
-        return "generate"
-    logger.debug("No action needed, waiting...")
-    return "wait"
-def create_conversation_graph() -> StateGraph:
-    logger.info("Creating conversation graph...")
-    workflow = StateGraph(ConversationState)
-    workflow.add_node("process_input", process_user_input)
-    workflow.add_node("generate_response", generate_response)
-    workflow.set_entry_point("process_input")
-    workflow.add_conditional_edges(
-        "process_input",
-        should_respond,
-        {
-            "generate": "generate_response",
-            "wait": END,
-        },
-    )
-    workflow.add_edge("generate_response", END)
-    memory = MemorySaver()
-    graph = workflow.compile(checkpointer=memory)
-    logger.info("Conversation graph created successfully")
-    return graph

+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langgraph.graph import StateGraph, MessagesState, START, END
+from src.core.settings import settings
+def create_graph():
+    """Create a single-node LangGraph workflow using NVIDIA ChatNVIDIA."""
+    llm = ChatNVIDIA(
+        model=settings.llm.NVIDIA_MODEL,
+        api_key=settings.llm.NVIDIA_API_KEY,
+        temperature=settings.llm.LLM_TEMPERATURE,
+        max_tokens=settings.llm.LLM_MAX_TOKENS,
+    )
+    def call_model(state: MessagesState) -> dict:
+        return {"messages": [llm.invoke(state["messages"])]}
+    workflow = StateGraph(MessagesState)
+    workflow.add_node("agent", call_model)
+    workflow.add_edge(START, "agent")
+    workflow.add_edge("agent", END)
+    return workflow.compile()

src/agent/llm_factory.py DELETED Viewed

@@ -1,163 +0,0 @@
-from typing import Any, Union
-from huggingface_hub import InferenceClient
-from transformers import pipeline
-#from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFacePipeline
-#from kokoro import KPipeline
-import torch
-from langchain_nvidia_ai_endpoints import ChatNVIDIA
-from src.core.logger import logger
-from src.core.settings import settings
-class LLMFactory:
-    @staticmethod
-    def create_nvidia_llm(
-        model: str = settings.llm.NVIDIA_MODEL,
-        temperature: float = settings.llm.LLM_TEMPERATURE,
-        max_tokens: int = settings.llm.LLM_MAX_TOKENS,
-    ) -> ChatNVIDIA:
-        logger.info(f"Initializing NVIDIA LLM: {model}")
-        if not settings.llm.NVIDIA_API_KEY:
-            raise ValueError("NVIDIA_API_KEY must be set to use the NVIDIA LLM provider.")
-        return ChatNVIDIA(
-            model=model,
-            api_key=settings.llm.NVIDIA_API_KEY,
-            temperature=temperature,
-            max_completion_tokens=max_tokens,
-        )
-    # @staticmethod
-    # def create_huggingface_llm(
-    #     model_id: str,
-    #     provider: str = "auto",
-    #     temperature: float = settings.llm.LLM_TEMPERATURE,
-    #     max_tokens: int = settings.llm.LLM_MAX_TOKENS,
-    #     run_local: bool = False,
-    # ) -> ChatHuggingFace:
-    #     if run_local:
-    #         logger.info(f"Initializing local HuggingFace LLM: {model_id}")
-    #         llm = HuggingFacePipeline.from_model_id(
-    #             model_id=model_id,
-    #             task="text-generation",
-    #             pipeline_kwargs={
-    #                 "temperature": temperature,
-    #                 "max_new_tokens": max_tokens,
-    #             },
-    #         )
-    #         return ChatHuggingFace(llm=llm)
-    #     token = (settings.llm.HF_TOKEN or "").strip()
-    #     if not token:
-    #         raise ValueError("HF_TOKEN must be set to use the HuggingFace LLM provider.")
-    #     logger.info(f"Initializing HuggingFace LLM: {model_id} via provider={provider}")
-    #     llm = HuggingFaceEndpoint(
-    #         repo_id=model_id,
-    #         provider=provider,
-    #         huggingfacehub_api_token=token,
-    #         temperature=temperature,
-    #         max_new_tokens=max_tokens,
-    #     )
-    #     return ChatHuggingFace(llm=llm)
-    @staticmethod
-    def create_huggingface_stt(
-        model_id: str | None = None, run_local: bool = False
-    ) -> Union[InferenceClient, Any]:
-        if run_local:
-            logger.info(f"Initializing local HuggingFace STT: {model_id or 'default'}")
-            return pipeline("automatic-speech-recognition", model=model_id)
-        token = (settings.llm.HF_TOKEN or "").strip()
-        if not token:
-            raise ValueError("HF_TOKEN must be set to use the HuggingFace STT provider.")
-        logger.info(f"Initializing HuggingFace STT: {model_id or 'default'}")
-        return InferenceClient(model=model_id, token=token)
-    @staticmethod
-    def create_huggingface_tts(
-        model_id: str | None = None, run_local: bool = False
-    ) -> Union[InferenceClient, Any]:
-        if run_local:
-            logger.info(f"Initializing local HuggingFace TTS: {model_id or 'default'}")
-            return pipeline("text-to-speech", model=model_id)
-        token = (settings.llm.HF_TOKEN or "").strip()
-        if not token:
-            raise ValueError("HF_TOKEN must be set to use the HuggingFace TTS provider.")
-        logger.info(f"Initializing HuggingFace TTS: {model_id or 'default'}")
-        return InferenceClient(model=model_id, token=token)
-    @staticmethod
-    def create_kokoro_tts(lang_code: str = "a") -> Any:
-        if KPipeline is None:
-            raise ImportError(
-                "kokoro library not found. Please install it (pip install kokoro>=0.9.4) to use Kokoro TTS."
-            )
-        logger.info(f"Initializing Kokoro TTS Pipeline with lang_code: {lang_code}")
-        return KPipeline(lang_code=lang_code, repo_id="hexgrad/Kokoro-82M")
-    @staticmethod
-    def create_moonshine_stt(
-        model_size: str = "base",
-        language: str = "en",
-    ) -> "MoonshineSTT":
-        """Initialize Moonshine ONNX STT plugin.
-        Args:
-            model_size: "tiny" (26MB) or "base" (57MB), or language variants (e.g., "base-es", "tiny-ar")
-            language: Currently only "en" supported
-        Returns:
-            MoonshineSTT plugin instance
-        """
-        logger.info(f"Initializing Moonshine ONNX STT: {model_size}")
-        from src.plugins.moonshine_stt import MoonshineSTT
-        return MoonshineSTT(model_size=model_size, language=language)
-    @staticmethod
-    def create_pocket_tts(
-        voice: str | None = None,
-        temperature: float | None = None,
-        lsd_decode_steps: int | None = None,
-    ) -> "PocketTTS":
-        """Initialize Pocket TTS plugin.
-        Args:
-            voice: Voice name (alba, marius, etc.) or path to audio file.
-                   If None, uses settings.voice.POCKET_TTS_VOICE
-            temperature: Sampling temperature (0.0-2.0).
-                        If None, uses settings.voice.POCKET_TTS_TEMPERATURE
-            lsd_decode_steps: LSD decoding steps for quality.
-                             If None, uses settings.voice.POCKET_TTS_LSD_DECODE_STEPS
-        Returns:
-            PocketTTS plugin instance
-        """
-        from src.plugins.pocket_tts import PocketTTS
-        if voice is None:
-            voice = settings.voice.POCKET_TTS_VOICE
-        if temperature is None:
-            temperature = settings.voice.POCKET_TTS_TEMPERATURE
-        if lsd_decode_steps is None:
-            lsd_decode_steps = settings.voice.POCKET_TTS_LSD_DECODE_STEPS
-        logger.info(f"Initializing Pocket TTS: voice={voice}, temp={temperature}, lsd_steps={lsd_decode_steps}")
-        return PocketTTS(
-            voice=voice,
-            temperature=temperature,
-            lsd_decode_steps=lsd_decode_steps,
-        )

src/agent/prompts.py DELETED Viewed

@@ -1,41 +0,0 @@
-from typing import Any, Optional
-from enum import Enum
-class PromptVersion(str, Enum):
-    V1 = "v1"
-    DEFAULT = "v1"
-class PromptTemplate:
-    def __init__(self, template: str, version: PromptVersion = PromptVersion.DEFAULT):
-        self.template = template
-        self.version = version
-    def render(self, **kwargs: Any) -> str:
-        return self.template.format(**kwargs)
-SYSTEM_PROMPT_V1 = """You are a helpful AI voice assistant. You engage in natural, conversational dialogue with users.
-Guidelines:
-- Keep responses concise and natural for voice interaction
-- Be friendly and engaging
-- Ask clarifying questions when needed
-- Acknowledge what the user says before responding
-- Keep your responses focused and to the point (2-3 sentences typically)
-"""
-SYSTEM_PROMPTS = {
-    PromptVersion.V1: PromptTemplate(SYSTEM_PROMPT_V1, PromptVersion.V1),
-}
-def get_system_prompt(version: Optional[PromptVersion] = None) -> str:
-    version = version or PromptVersion.DEFAULT
-    return SYSTEM_PROMPTS[version].render()
-def get_custom_prompt(template: str, **context: Any) -> str:
-    prompt = PromptTemplate(template)
-    return prompt.render(**context)

src/agent/state.py DELETED Viewed

@@ -1,10 +0,0 @@
-from typing import Any, TypedDict
-from langchain_core.messages import BaseMessage
-class ConversationState(TypedDict):
-    messages: list[BaseMessage]
-    current_transcript: str
-    context: dict[str, Any]
-    turn_active: bool

src/api/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-"""FastAPI application and WebSocket handlers."""
-from src.api.main import app
-__all__ = ["app"]

src/api/main.py DELETED Viewed

@@ -1,69 +0,0 @@
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-from fastapi.middleware.cors import CORSMiddleware
-from src.api.websocket import VoiceWebSocketHandler
-from src.core.logger import logger
-from src.core.settings import settings
-app = FastAPI(
-    title="Open Voice Agent API",
-    description="Real-time voice conversation agent with WebSocket support",
-    version="0.1.0",
-)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.api.API_CORS_ORIGINS,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.get("/health")
-async def health_check():
-    return {
-        "status": "healthy",
-        "service": "open-voice-agent",
-        "version": "0.1.0",
-    }
-@app.websocket("/ws/voice")
-async def websocket_voice_endpoint(websocket: WebSocket):
-    handler = VoiceWebSocketHandler(websocket)
-    try:
-        await handler.connect()
-        await handler.handle_conversation()
-    except WebSocketDisconnect:
-        logger.info("Client disconnected")
-    except Exception as e:
-        logger.error(f"WebSocket error: {e}", exc_info=True)
-        await handler.send_error(str(e))
-    finally:
-        await handler.disconnect()
-@app.on_event("startup")
-async def startup_event():
-    logger.info("Starting Open Voice Agent API...")
-    logger.info(f"Voice provider: {settings.voice.VOICE_PROVIDER}")
-    logger.info(f"LLM provider: {settings.llm.LLM_PROVIDER}")
-@app.on_event("shutdown")
-async def shutdown_event():
-    logger.info("Shutting down Open Voice Agent API...")
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(
-        "src.api.main:app",
-        host=settings.api.API_HOST,
-        port=settings.api.API_PORT,
-        workers=settings.api.API_WORKERS,
-        reload=True,
-    )

src/core/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@

+from src.core.settings import settings
+from src.core.logger import logger
+__all__ = ["settings", "logger"]

src/core/settings.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, Optional
 from pydantic import Field, ValidationError
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -15,10 +15,10 @@ load_dotenv(ENV_FILE, override=True)
 logger.info(f"Loaded environment from: {ENV_FILE}")
-def mask_sensitive_data(data: dict[str, Any]) -> dict[str, Any]:
     masked = {}
     sensitive_keys = ["key", "token", "secret", "password"]
     for key, value in data.items():
         if isinstance(value, dict):
             masked[key] = mask_sensitive_data(value)
@@ -31,7 +31,7 @@ def mask_sensitive_data(data: dict[str, Any]) -> dict[str, Any]:
                 masked[key] = f"{value[:4]}...{value[-4:]}"
         else:
             masked[key] = value
     return masked
@@ -46,80 +46,48 @@ class CoreSettings(BaseSettings):
 class VoiceSettings(CoreSettings):
-    VOICE_PROVIDER: str = Field(default="nvidia")
-    NVIDIA_VOICE_LANGUAGE: str = Field(default="en-US")
-    NVIDIA_VOICE_NAME: str = Field(default="Magpie-Multilingual.EN-US.Aria")
-    NVIDIA_TTS_MODEL: str = Field(default="magpie-tts-multilingual")
-    NVIDIA_TTS_ENDPOINT: str = Field(default="")
-    SAMPLE_RATE_OUTPUT: int = Field(default=48000, gt=0)
-    CHUNK_DURATION_MS: int = Field(default=80, gt=0)
-    VAD_THRESHOLD: float = Field(default=0.5, ge=0.0, le=1.0)
-    VAD_HORIZON_INDEX: int = Field(default=2, ge=0)
-    # STT (Speech-to-Text) Settings
-    STT_PROVIDER: str = Field(
-        default="moonshine",
-        description="STT provider (moonshine, assemblyai, etc)"
-    )
-    MOONSHINE_MODEL_SIZE: str = Field(
-        default="small",
-        description="Moonshine model size: tiny, base, or small"
     )
-    # TTS (Text-to-Speech) Settings - Pocket TTS
     POCKET_TTS_VOICE: str = Field(
         default="alba",
-        description="Default voice (alba, marius, javert, jean, fantine, cosette, eponine, azelma) or path to audio file"
     )
     POCKET_TTS_TEMPERATURE: float = Field(
         default=0.7,
         ge=0.0,
         le=2.0,
-        description="Sampling temperature for generation"
     )
     POCKET_TTS_LSD_DECODE_STEPS: int = Field(
         default=1,
         ge=1,
-        description="LSD decoding steps (higher = better quality, slower)"
     )
 class LLMSettings(CoreSettings):
     NVIDIA_API_KEY: Optional[str] = Field(default=None)
     NVIDIA_MODEL: str = Field(default="meta/llama-3.1-8b-instruct")
-    NVIDIA_BASE_URL: str = Field(default="https://integrate.api.nvidia.com/v1")
-    HF_TOKEN: Optional[str] = Field(default=None)
     LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
     LLM_MAX_TOKENS: int = Field(default=1024, gt=0)
-class APISettings(CoreSettings):
-    API_HOST: str = Field(default="0.0.0.0")
-    API_PORT: int = Field(default=8000, gt=0, lt=65536)
-    API_WORKERS: int = Field(default=1, gt=0)
-    API_CORS_ORIGINS: list[str] = Field(
-        default=["http://localhost:8501", "http://localhost:3000"]
-    )
 class Settings(CoreSettings):
     voice: VoiceSettings = Field(default_factory=VoiceSettings)
     llm: LLMSettings = Field(default_factory=LLMSettings)
-    api: APISettings = Field(default_factory=APISettings)
 try:
     settings = Settings()
     settings_dict = settings.model_dump()
     masked_settings = mask_sensitive_data(settings_dict)
     logger.info(f"Settings loaded: {json.dumps(masked_settings, indent=2)}")
 except ValidationError as e:
     logger.exception(f"Error validating settings: {e.json()}")
     raise

 import json
 from pathlib import Path
+from typing import Optional
 from pydantic import Field, ValidationError
 from pydantic_settings import BaseSettings, SettingsConfigDict
 logger.info(f"Loaded environment from: {ENV_FILE}")
+def mask_sensitive_data(data: dict) -> dict:
     masked = {}
     sensitive_keys = ["key", "token", "secret", "password"]
     for key, value in data.items():
         if isinstance(value, dict):
             masked[key] = mask_sensitive_data(value)
                 masked[key] = f"{value[:4]}...{value[-4:]}"
         else:
             masked[key] = value
     return masked
 class VoiceSettings(CoreSettings):
+    MOONSHINE_MODEL_ID: str = Field(
+        default="usefulsensors/moonshine-streaming-medium",
+        description="Moonshine model size: tiny, base, or small",
     )
     POCKET_TTS_VOICE: str = Field(
         default="alba",
+        description="Default voice (alba, marius, javert, jean, fantine, cosette, eponine, azelma) or path to audio file",
     )
+    SAMPLE_RATE_OUTPUT: int = Field(default=48000, gt=0)
     POCKET_TTS_TEMPERATURE: float = Field(
         default=0.7,
         ge=0.0,
         le=2.0,
+        description="Sampling temperature for generation",
     )
     POCKET_TTS_LSD_DECODE_STEPS: int = Field(
         default=1,
         ge=1,
+        description="LSD decoding steps (higher = better quality, slower)",
     )
 class LLMSettings(CoreSettings):
     NVIDIA_API_KEY: Optional[str] = Field(default=None)
     NVIDIA_MODEL: str = Field(default="meta/llama-3.1-8b-instruct")
     LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
     LLM_MAX_TOKENS: int = Field(default=1024, gt=0)
 class Settings(CoreSettings):
     voice: VoiceSettings = Field(default_factory=VoiceSettings)
     llm: LLMSettings = Field(default_factory=LLMSettings)
 try:
     settings = Settings()
     settings_dict = settings.model_dump()
     masked_settings = mask_sensitive_data(settings_dict)
     logger.info(f"Settings loaded: {json.dumps(masked_settings, indent=2)}")
 except ValidationError as e:
     logger.exception(f"Error validating settings: {e.json()}")
     raise

src/models/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-"""Models package for voice agents and data structures."""

src/models/voice/__init__.py DELETED Viewed

@@ -1,18 +0,0 @@
-"""Voice provider interfaces and implementations."""
-from src.models.voice.base import BaseVoiceProvider, VoiceProviderConfig
-from src.models.voice.types import (
-    AudioFormat,
-    VADInfo,
-    VoiceMessage,
-    TranscriptionResult,
-)
-__all__ = [
-    "BaseVoiceProvider",
-    "VoiceProviderConfig",
-    "AudioFormat",
-    "VADInfo",
-    "VoiceMessage",
-    "TranscriptionResult",
-]

src/models/voice/base.py DELETED Viewed

@@ -1,53 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import AsyncIterator, Optional
-from pydantic import BaseModel
-from src.models.voice.types import TranscriptionResult, VADInfo
-class VoiceProviderConfig(BaseModel):
-    provider_name: str
-    sample_rate_input: int = 24000
-    sample_rate_output: int = 48000
-    chunk_duration_ms: int = 80
-class BaseVoiceProvider(ABC):
-    def __init__(self, config: VoiceProviderConfig):
-        self.config = config
-        self._connected = False
-    @abstractmethod
-    async def connect(self) -> None:
-        pass
-    @abstractmethod
-    async def disconnect(self) -> None:
-        pass
-    @abstractmethod
-    async def text_to_speech(
-        self, text: str, stream: bool = True
-    ) -> AsyncIterator[bytes]:
-        pass
-    async def speech_to_text(
-        self, audio_stream: AsyncIterator[bytes]
-    ) -> AsyncIterator[TranscriptionResult]:
-        raise NotImplementedError("Speech-to-text not supported by this provider")
-    @abstractmethod
-    async def get_vad_info(self) -> Optional[VADInfo]:
-        pass
-    @property
-    def is_connected(self) -> bool:
-        return self._connected
-    async def __aenter__(self):
-        await self.connect()
-        return self
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.disconnect()

src/models/voice/types.py DELETED Viewed

@@ -1,43 +0,0 @@
-from dataclasses import dataclass
-from enum import Enum
-from typing import Optional
-class AudioFormat(str, Enum):
-    PCM = "pcm"
-    WAV = "wav"
-    OPUS = "opus"
-    ULAW_8000 = "ulaw_8000"
-    ALAW_8000 = "alaw_8000"
-    PCM_16000 = "pcm_16000"
-    PCM_24000 = "pcm_24000"
-@dataclass
-class VoiceMessage:
-    type: str
-    content: str | bytes
-    timestamp: Optional[float] = None
-    metadata: Optional[dict] = None
-@dataclass
-class VADInfo:
-    inactivity_prob: float
-    horizon_s: float
-    step_idx: int
-    total_duration_s: float
-    @property
-    def is_turn_complete(self, threshold: float = 0.5) -> bool:
-        return self.inactivity_prob > threshold
-@dataclass
-class TranscriptionResult:
-    text: str
-    start_s: float
-    stop_s: Optional[float] = None
-    is_final: bool = False
-    confidence: Optional[float] = None
-    stream_id: Optional[int] = None

src/plugins/pocket_tts/tts.py CHANGED Viewed

@@ -199,7 +199,6 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
                 ):
                     audio_bytes = self._tensor_to_pcm_bytes(audio_chunk)
                     chunks.append(audio_bytes)
-                    logger.debug(f"Generated chunk: {len(audio_bytes)} bytes")
                 logger.info(f"Total chunks generated: {len(chunks)}")
                 return chunks
@@ -210,10 +209,6 @@ class PocketSynthesizeStream(tts.SynthesizeStream):
             # Push raw PCM bytes to the emitter
             for i, chunk in enumerate(audio_chunks):
-                num_samples = len(chunk) // 2  # int16 = 2 bytes per sample
-                logger.debug(
-                    f"Pushing chunk {i+1}/{len(audio_chunks)}: {len(chunk)} bytes ({num_samples} samples @ {self._tts._output_sample_rate}Hz)"
-                )
                 output_emitter.push(chunk)
             logger.info(f"Successfully pushed all {len(audio_chunks)} chunks")

                 ):
                     audio_bytes = self._tensor_to_pcm_bytes(audio_chunk)
                     chunks.append(audio_bytes)
                 logger.info(f"Total chunks generated: {len(chunks)}")
                 return chunks
             # Push raw PCM bytes to the emitter
             for i, chunk in enumerate(audio_chunks):
                 output_emitter.push(chunk)
             logger.info(f"Successfully pushed all {len(audio_chunks)} chunks")

src/streamlit_app.py DELETED Viewed

@@ -1,288 +0,0 @@
-"""Streamlit UI for voice agent with audio I/O."""
-import asyncio
-import base64
-import json
-from threading import Thread
-from queue import Queue
-from typing import Optional
-import streamlit as st
-import websockets
-from src.core.logger import logger
-# Page configuration
-st.set_page_config(
-    page_title="Open Voice Agent",
-    page_icon="🎙️",
-    layout="wide",
-)
-# Initialize session state
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-if "ws_connected" not in st.session_state:
-    st.session_state.ws_connected = False
-if "current_transcript" not in st.session_state:
-    st.session_state.current_transcript = ""
-if "processing" not in st.session_state:
-    st.session_state.processing = False
-if "response_queue" not in st.session_state:
-    st.session_state.response_queue = Queue()
-def send_audio_to_websocket(ws_url: str, audio_data: str, response_queue: Queue):
-    """Send audio to WebSocket and receive responses in background thread.
-    Args:
-        ws_url: WebSocket URL
-        audio_data: Base64 encoded audio data
-        response_queue: Queue to put responses
-    """
-    async def communicate():
-        try:
-            async with websockets.connect(ws_url) as websocket:
-                # Send audio message
-                await websocket.send(json.dumps({
-                    "type": "audio",
-                    "data": audio_data
-                }))
-                logger.info("Audio sent to WebSocket")
-                # Send end turn signal
-                await websocket.send(json.dumps({"type": "end_turn"}))
-                logger.info("End turn signal sent")
-                # Receive responses
-                while True:
-                    try:
-                        message = await asyncio.wait_for(websocket.recv(), timeout=30.0)
-                        response = json.loads(message)
-                        response_queue.put(response)
-                        # Stop if response is complete
-                        if response.get("type") == "response_complete":
-                            logger.info("Response complete received")
-                            break
-                    except asyncio.TimeoutError:
-                        logger.warning("WebSocket receive timeout")
-                        break
-                    except Exception as e:
-                        logger.error(f"Error receiving message: {e}")
-                        response_queue.put({"type": "error", "message": str(e)})
-                        break
-        except Exception as e:
-            logger.error(f"WebSocket error: {e}")
-            response_queue.put({"type": "error", "message": str(e)})
-    # Run async function
-    asyncio.run(communicate())
-# Title and description
-st.title("🎙️ Open Voice Agent")
-st.markdown(
-    """
-    Voice conversation with AI using NVIDIA API for speech synthesis
-    and LangGraph for conversation management.
-    """
-)
-# Sidebar configuration
-with st.sidebar:
-    st.header("⚙️ Settings")
-    # WebSocket connection settings
-    ws_url = st.text_input(
-        "WebSocket URL",
-        value="ws://localhost:8000/ws/voice",
-        help="URL of the FastAPI WebSocket server",
-    )
-    # Connection status
-    status_color = "🟢" if st.session_state.ws_connected else "🔴"
-    st.markdown(f"**Status:** {status_color} {'Connected' if st.session_state.ws_connected else 'Disconnected'}")
-    st.markdown("**Voice Provider:** NVIDIA API (configured on the server)")
-    # Test connection button
-    if st.button("🔍 Test Connection"):
-        try:
-            import requests
-            response = requests.get("http://localhost:8000/health", timeout=2)
-            if response.status_code == 200:
-                st.success("✅ Server is running!")
-                st.session_state.ws_connected = True
-            else:
-                st.error("❌ Server not responding correctly")
-                st.session_state.ws_connected = False
-        except Exception as e:
-            st.error(f"❌ Cannot connect to server: {e}")
-            st.session_state.ws_connected = False
-    st.divider()
-    if st.button("🗑️ Clear Conversation"):
-        st.session_state.messages = []
-        st.session_state.current_transcript = ""
-        st.rerun()
-    # Download conversation
-    if st.session_state.messages:
-        transcript = "\n\n".join(
-            [f"{msg['role'].upper()}: {msg['content']}" for msg in st.session_state.messages]
-        )
-        st.download_button(
-            label="📥 Download Transcript",
-            data=transcript,
-            file_name="conversation_transcript.txt",
-            mime="text/plain",
-        )
-# Main content area
-col1, col2 = st.columns([2, 1])
-with col1:
-    st.subheader("💬 Conversation")
-    # Chat container
-    chat_container = st.container(height=500)
-    with chat_container:
-        # Display messages
-        for message in st.session_state.messages:
-            with st.chat_message(message["role"]):
-                st.write(message["content"])
-        # Display current transcript being captured
-        if st.session_state.current_transcript:
-            with st.chat_message("user"):
-                st.write(f"*{st.session_state.current_transcript}...*")
-with col2:
-    st.subheader("🎤 Audio Controls")
-    # Check if server is running
-    if not st.session_state.ws_connected:
-        st.warning("⚠️ Please test connection first")
-    # Audio recorder using built-in st.audio_input
-    st.markdown("**Record your message:**")
-    audio_data = st.audio_input(
-        "Click to start/stop recording",
-        key="audio_input",
-        help="Click to start recording, click again to stop"
-    )
-    if audio_data is not None:
-        st.success("✓ Audio recorded!")
-        # Show audio player
-        st.audio(audio_data)
-        # Send button
-        if st.button("📤 Send Audio", disabled=st.session_state.processing):
-            if not st.session_state.ws_connected:
-                st.error("❌ Not connected to server")
-            else:
-                st.session_state.processing = True
-                # Get audio bytes
-                audio_bytes = audio_data.getvalue()
-                # Encode to base64
-                encoded_audio = base64.b64encode(audio_bytes).decode('utf-8')
-                # Show processing indicator
-                with st.spinner("🔊 Processing audio..."):
-                    # Start WebSocket communication in background thread
-                    thread = Thread(
-                        target=send_audio_to_websocket,
-                        args=(ws_url, encoded_audio, st.session_state.response_queue),
-                        daemon=True
-                    )
-                    thread.start()
-                    # Wait for thread to complete (with timeout)
-                    thread.join(timeout=30)
-                    # Process responses from queue
-                    transcript_text = ""
-                    response_text = ""
-                    audio_chunks = []
-                    while not st.session_state.response_queue.empty():
-                        response = st.session_state.response_queue.get()
-                        msg_type = response.get("type")
-                        if msg_type == "transcript":
-                            transcript_text += " " + response.get("text", "")
-                        elif msg_type == "response_text":
-                            response_text = response.get("text", "")
-                        elif msg_type == "audio":
-                            audio_chunks.append(response.get("data", ""))
-                        elif msg_type == "error":
-                            st.error(f"Error: {response.get('message')}")
-                    # Add messages to conversation
-                    if transcript_text.strip():
-                        st.session_state.messages.append({
-                            "role": "user",
-                            "content": transcript_text.strip()
-                        })
-                    if response_text:
-                        st.session_state.messages.append({
-                            "role": "assistant",
-                            "content": response_text
-                        })
-                st.session_state.processing = False
-                st.success("✅ Processing complete!")
-                st.rerun()
-    # Processing indicator
-    if st.session_state.processing:
-        st.info("⏳ Processing your message...")
-    # Instructions
-    with st.expander("📖 How to Use"):
-        st.markdown(
-            """
-            1. **Test connection** first to ensure server is running
-            2. **Click** the audio input to start recording
-            3. **Speak** your message clearly
-            4. **Click again** to stop recording
-            5. **Review** the recorded audio (optional)
-            6. **Send** the audio for processing
-            7. Wait for the AI response (text will appear in chat)
-            **Tips:**
-            - Speak clearly and at a normal pace
-            - Wait for the response before recording again
-            - Keep messages concise for better results
-            - Use headphones to avoid echo
-            """
-        )
-    # System info
-    with st.expander("ℹ️ System Info"):
-        st.markdown(f"""
-        **Voice Provider:** NVIDIA API
-        **WebSocket:** `{ws_url}`
-        **Messages:** {len(st.session_state.messages)}
-        """)
-# Footer
-st.markdown("---")
-st.markdown(
-    """
-    <div style='text-align: center'>
-        <small>Powered by NVIDIA API (Voice) + LangGraph (Conversations) + Streamlit (UI)</small>
-    </div>
-    """,
-    unsafe_allow_html=True,
-)

testing/asr_moonshine.py DELETED Viewed

@@ -1,48 +0,0 @@
-import io
-import math
-import numpy as np
-import soundfile as sf
-from scipy.signal import resample_poly
-import torch
-from transformers import AutoProcessor, MoonshineStreamingForConditionalGeneration
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-model_id = "usefulsensors/moonshine-streaming-small"
-model = MoonshineStreamingForConditionalGeneration.from_pretrained(model_id).to(
-    device, torch_dtype
-)
-processor = AutoProcessor.from_pretrained(model_id)
-# Read audio file
-with open("dev/kokoro_tts.wav", "rb") as f:
-    audio_bytes = f.read()
-# Load audio using soundfile
-audio_np, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32")
-if audio_np.ndim > 1:
-    audio_np = np.mean(audio_np, axis=1)
-if sr != 16000:
-    ratio_gcd = math.gcd(sr, 16000)
-    up = 16000 // ratio_gcd
-    down = sr // ratio_gcd
-    print(f"Resampling from {sr}Hz to 16000Hz")
-    audio_np = resample_poly(audio_np, up=up, down=down)
-inputs = processor(
-    audio_np,
-    return_tensors="pt",
-    sampling_rate=16000,
-).to(device, torch_dtype)
-token_limit_factor = 6.5 / 16000
-max_length = int((inputs.attention_mask.sum() * token_limit_factor).max().item())
-generated_ids = model.generate(**inputs, max_length=max_length)
-transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
-print(f"Transcription: {transcription}")

testing/nvidia_.py DELETED Viewed

@@ -1,4 +0,0 @@
-import nemo.collections.asr as nemo_asr
-asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt-0.6b-v2")
-transcriptions = asr_model.transcribe(["dev/kokoro_tts.wav"])

testing/pocket_tts_test.py DELETED Viewed

@@ -1,13 +0,0 @@
-from pocket_tts import TTSModel
-import scipy.io.wavfile
-tts_model = TTSModel.load_model()
-voice_state = tts_model.get_state_for_audio_prompt(
-    "alba"  # One of the pre-made voices, see above
-    # You can also use any voice file you have locally or from Hugging Face:
-    # "./some_audio.wav"
-    # or "hf://kyutai/tts-voices/expresso/ex01-ex02_default_001_channel2_198s.wav"
-)
-audio = tts_model.generate_audio(voice_state, "Hello world, this is a test.")
-# Audio is a 1D torch tensor containing PCM data.
-scipy.io.wavfile.write("dev/pocket_tts.wav", tts_model.sample_rate, audio.numpy())

uv.lock CHANGED Viewed

@@ -2081,7 +2081,7 @@ wheels = [
 [[package]]
 name = "open-voice-agent"
 version = "0.1.0"
-source = { virtual = "." }
 dependencies = [
     { name = "langgraph" },
     { name = "lhotse" },

 [[package]]
 name = "open-voice-agent"
 version = "0.1.0"
+source = { editable = "." }
 dependencies = [
     { name = "langgraph" },
     { name = "lhotse" },