# open-voice-agent / .env.example
# Revision 04178a2 - feat: Enhance turn tracing and metrics collection
# LLM Provider Selection
# Options: nvidia, ollama
LLM_PROVIDER=ollama
# MCP server settings
MCP_ENABLED=true
MCP_SERVER_URL=https://huggingface.co/mcp
# Comma-separated extra MCP servers (set empty to disable)
MCP_EXTRA_SERVER_URLS=https://docs.livekit.io/mcp
# STT Provider Selection
# Options: moonshine, nvidia, deepgram
STT_PROVIDER=moonshine
# Moonshine STT Settings (local speech-to-text)
MOONSHINE_MODEL_ID=usefulsensors/moonshine-streaming-medium
MOONSHINE_LANGUAGE=en
# Deepgram STT Settings (cloud speech-to-text)
DEEPGRAM_STT_MODEL=nova-3
DEEPGRAM_STT_LANGUAGE=en-US
# NVIDIA STT Settings (cloud speech-to-text)
# Optional: uses NVIDIA_API_KEY if not set
NVIDIA_STT_API_KEY=
NVIDIA_STT_MODEL=parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer
NVIDIA_STT_LANGUAGE_CODE=en-US
# NVIDIA API Key (shared by LLM, STT, and TTS unless NVIDIA_STT_API_KEY or
# NVIDIA_TTS_API_KEY is set for the respective service)
NVIDIA_API_KEY=your_nvidia_api_key_here
# NVIDIA LLM Settings (presumably only read when LLM_PROVIDER=nvidia)
NVIDIA_MODEL=meta/llama-3.1-8b-instruct
# Ollama LLM Settings
# Set OLLAMA_CLOUD_MODE=false to use local Ollama at http://localhost:11434/v1.
OLLAMA_CLOUD_MODE=true
# Note: do not use ":cloud" aliases with the /v1 endpoint.
OLLAMA_MODEL=qwen3-next:80b
# Local mode can keep OLLAMA_API_KEY=ollama if your server ignores auth.
OLLAMA_API_KEY=your_ollama_api_key_here
# Langfuse Tracing Settings (optional)
LANGFUSE_ENABLED=false
LANGFUSE_HOST=https://cloud.langfuse.com
# Optional alternative to LANGFUSE_HOST
LANGFUSE_BASE_URL=
# Required for frontend deep links: project/<project_id>/...
LANGFUSE_PROJECT_ID=
LANGFUSE_PUBLIC_KEY=
LANGFUSE_SECRET_KEY=
# Mark traces public so non-members can open shared links
LANGFUSE_PUBLIC_TRACES=false
# Short wait for assistant text on normal turns (milliseconds)
LANGFUSE_ASSISTANT_TEXT_GRACE_TIMEOUT_MS=500
# Legacy fallback retained for compatibility
LANGFUSE_TRACE_FINALIZE_TIMEOUT_MS=8000
LANGFUSE_POST_TOOL_RESPONSE_TIMEOUT_MS=30000
LANGFUSE_MAX_PENDING_TRACE_TASKS=200
LANGFUSE_TRACE_FLUSH_TIMEOUT_MS=1000
# Merge immediate continuation turns into one trace; 0 disables it
LANGFUSE_CONTINUATION_COALESCE_WINDOW_MS=1500
# Common LLM Parameters
LLM_TEMPERATURE=0.7
LLM_MAX_TOKENS=1024
# LLM/MCP API timeout/retry tuning.
LLM_CONN_TIMEOUT_SEC=20.0
# Timeout for one MCP tool request/response cycle
MCP_CONN_TIMEOUT_SEC=20.0
LLM_CONN_MAX_RETRY=1
LLM_CONN_RETRY_INTERVAL_SEC=1.0
TURN_LLM_STALL_TIMEOUT_SEC=12.0
# Set >0 to force-interrupt slow startup greetings; 0 disables the cutoff
MCP_STARTUP_GREETING_TIMEOUT_SEC=0.0
# TTS Provider Selection
# Options: pocket, deepgram, nvidia
TTS_PROVIDER=pocket
# Required when STT_PROVIDER=deepgram or TTS_PROVIDER=deepgram
DEEPGRAM_API_KEY=
# Optional: uses NVIDIA_API_KEY if not set
NVIDIA_TTS_API_KEY=
# NVIDIA TTS Settings (cloud or self-hosted Riva)
NVIDIA_TTS_VOICE=Magpie-Multilingual.EN-US.Leo
NVIDIA_TTS_LANGUAGE_CODE=en-US
NVIDIA_TTS_SERVER=grpc.nvcf.nvidia.com:443
NVIDIA_TTS_FUNCTION_ID=877104f7-e885-42b9-8de8-f6e4c6303969
# Set false for self-hosted Riva without TLS
NVIDIA_TTS_USE_SSL=true
# Pocket TTS Settings (local text-to-speech)
# PocketTTS output sample rate is fixed to native 24kHz.
POCKET_TTS_VOICE=alba
POCKET_TTS_TEMPERATURE=0.7
POCKET_TTS_LSD_DECODE_STEPS=1
# Timeout for one PocketTTS synthesis attempt
POCKET_TTS_CONN_TIMEOUT_SEC=45.0
# LiveKit Settings
LIVEKIT_URL=wss://your-livekit-server.example.com
LIVEKIT_API_KEY=your_livekit_api_key_here
LIVEKIT_API_SECRET=your_livekit_api_secret_here
# Use a unique name per environment to avoid worker collisions
LIVEKIT_AGENT_NAME=open-voice-agent-local
# Use 0-1 locally to reduce memory pressure
LIVEKIT_NUM_IDLE_PROCESSES=1
# Increase idle worker bootstrap timeout
LIVEKIT_INITIALIZE_PROCESS_TIMEOUT_SEC=20.0
# Per-job memory warning threshold (6 GB)
LIVEKIT_JOB_MEMORY_WARN_MB=6144
# LiveKit audio input configuration
LIVEKIT_SAMPLE_RATE=24000
LIVEKIT_NUM_CHANNELS=1
# Larger frames slightly reduce responsiveness but avoid over-eager VAD transitions
LIVEKIT_FRAME_SIZE_MS=60
LIVEKIT_PRE_CONNECT_AUDIO=true
LIVEKIT_PRE_CONNECT_TIMEOUT=3.0
# Voice Activity Detection (VAD) configuration
# Require 180ms of speech before activation
VAD_MIN_SPEECH_DURATION=0.18
# Wait longer before treating a pause as end of speech
VAD_MIN_SILENCE_DURATION=0.55
# Silero default; keep balanced sensitivity for speech vs background noise
VAD_THRESHOLD=0.5
# Turn endpointing tuning
# Default turn commit delay before endpointing
MIN_ENDPOINTING_DELAY=0.5
# Let the detector wait longer when phrasing suggests continuation
MAX_ENDPOINTING_DELAY=3.0
# Wait for the committed turn before generating a reply
PREEMPTIVE_GENERATION=false