"""Runtime configuration for the voice assistant service.

Defines the ``Settings`` model and instantiates a module-level ``settings``
object at import time.
"""

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

# Default system prompt shared by the OpenRouter and llama chat settings.
# Both fields start from the same text; each can still be overridden
# independently through its own setting.
_SHARED_CHAT_SYSTEM_PROMPT = (
    "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
    "By default, reply in the same language as the user's latest message. "
    "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
    "Use brief conversational acknowledgements when natural, such as 'yeah', 'right', 'mm-hmm', or 'got it', "
    "but do not overuse them. You may occasionally include a single inline paralinguistic tag like [chuckle], "
    "[laugh], [sigh], or [cough] when it clearly fits the moment. Keep answers useful, human, and easy to speak aloud. "
    "When appropriate, add one short follow-up question or gentle prompt so the conversation keeps moving."
)


class Settings(BaseSettings):
    """Typed application settings with built-in defaults.

    Values are loaded by pydantic-settings from environment variables that
    carry the ``VOICE_LAB_`` prefix (for example ``VOICE_LAB_PORT`` for
    ``port``) and from a ``.env`` file; entries that match no field are
    ignored.

    The section comments below group fields by their name prefix. The exact
    meaning of each tunable is defined by the code that consumes it, which
    is not part of this module.
    """

    model_config = SettingsConfigDict(
        env_prefix="VOICE_LAB_",
        env_file=".env",
        extra="ignore",
        # Lets aliased fields also be populated by their attribute name.
        populate_by_name=True,
    )

    # ---- Speech-to-text: backend selection and Whisper options ----
    stt_backend: str = "whisper"
    whisper_model: str = "small"
    whisper_device: str = "cuda"
    whisper_compute_type: str = "int8"
    whisper_beam_size: int = 5
    whisper_best_of: int = 5
    whisper_temperature: float = 0.0
    whisper_condition_on_previous_text: bool = False
    stt_language: str = "auto"
    stt_multilingual_enabled: bool = True

    # ---- Speech-to-text: Parakeet model ----
    parakeet_model_id: str = "nvidia/parakeet-tdt-0.6b-v3"
    parakeet_device: str = "cpu"

    # ---- Text-to-speech: backend selection and Chatterbox (ONNX) ----
    tts_backend: str = "chatterbox-onnx"
    chatterbox_device: str = "cuda"
    chatterbox_onnx_provider: str = "cuda"
    chatterbox_onnx_dtype: str = "fp16"
    tts_gpu_resident_preferred: bool = True
    chatterbox_onnx_site_packages_path: str = ""
    chatterbox_onnx_model_id: str = "ResembleAI/chatterbox-turbo-ONNX"
    # NOTE(review): machine-specific absolute path; on any other host this
    # default most likely has to be overridden.
    chatterbox_onnx_voice_path: str = (
        "/media/rapheal/New WD m2 Udemba Boys/home-overflow/Projects/voice-latency-lab/"
        ".venv/lib/python3.11/site-packages/s3tokenizer/assets/BAC009S0764W0121.wav"
    )
    chatterbox_onnx_max_new_tokens: int = 96
    chatterbox_onnx_repetition_penalty: float = 1.15

    # ---- Text-to-speech: Kokoro ----
    kokoro_repo_id: str = "hexgrad/Kokoro-82M"
    kokoro_lang_code: str = "a"
    kokoro_voice: str = "af_heart"
    kokoro_device: str = "cuda"
    kokoro_speed: float = 1.0

    # ---- Server and audio basics ----
    # NOTE(review): presumably the bind address/port of the web server;
    # "0.0.0.0" would expose it on every interface -- confirm intended.
    host: str = "0.0.0.0"
    port: int = 8000
    static_dir: str = "static"
    sample_rate: int = 16000

    # ---- Voice activity detection ----
    vad_provider: str = "silero"
    parakeet_bypass_silero_vad: bool = False
    parakeet_vad_rms_threshold: float = 0.01
    parakeet_vad_start_ms: int = 80
    parakeet_vad_stop_ms: int = 480
    parakeet_barge_in_rms_threshold: float = 0.02
    parakeet_barge_in_start_ms: int = 180
    silero_vad_threshold: float = 0.14
    silero_vad_start_ms: int = 16
    silero_vad_min_rms: float = 0.00025
    silero_vad_strong_threshold: float = 0.32
    vad_frontend_enabled: bool = True
    vad_frontend_preemphasis: float = 0.97
    vad_frontend_noise_floor_alpha: float = 0.96
    vad_frontend_gate_multiplier: float = 1.08
    vad_frontend_target_rms: float = 0.01
    vad_frontend_max_gain: float = 2.5
    vad_stop_ms: int = 560

    # ---- Dynamic endpointing ----
    dynamic_endpointing_enabled: bool = True
    dynamic_endpointing_min_ms: int = 180
    dynamic_endpointing_max_ms: int = 2600
    dynamic_endpointing_incomplete_bias_ms: int = 1500
    dynamic_endpointing_complete_discount_ms: int = 320
    dynamic_endpointing_question_discount_ms: int = 180
    dynamic_endpointing_short_utterance_bias_ms: int = 220
    dynamic_endpointing_no_partial_short_discount_ms: int = 180
    dynamic_endpointing_no_partial_medium_discount_ms: int = 100
    dynamic_endpointing_stale_partial_ms: int = 3200
    dynamic_endpointing_max_turn_ms: int = 30000

    # ---- Utterance capture and assistant turn timing ----
    preroll_ms: int = 450
    min_utterance_ms: int = 220
    assistant_holdoff_ms: int = 320
    assistant_response_watchdog_ms: int = 3500
    assistant_barge_in_grace_ms: int = 90
    assistant_barge_in_start_ms: int = 60
    assistant_barge_in_min_rms: float = 0.01
    assistant_barge_in_prob_threshold: float = 0.45

    # ---- Transcription input thresholds ----
    transcription_trim_threshold: float = 0.005
    transcription_min_rms: float = 0.002
    transcription_min_peak: float = 0.01

    # ---- Speaker focus ----
    speaker_focus_enabled: bool = True
    speaker_focus_min_utterance_ms: int = 900
    speaker_focus_min_rms: float = 0.015
    speaker_focus_similarity_threshold: float = 0.82
    speaker_focus_profile_alpha: float = 0.18
    speaker_focus_multi_speaker_threshold: float = 0.16
    speaker_focus_reject_mixed: bool = True
    speaker_focus_debug: bool = True

    # ---- Conversation memory ----
    conversation_memory_enabled: bool = True
    conversation_memory_turns: int = 6
    conversation_memory_max_chars: int = 1400

    # ---- Short-utterance handling ----
    short_utterance_ms: int = 700
    short_utterance_keep_edge_ms: int = 140
    short_utterance_min_rms: float = 0.0015
    short_utterance_min_peak: float = 0.008
    short_utterance_pad_ms: int = 220
    short_utterance_min_transcription_ms: int = 650

    # ---- Whisper thresholds and fallback options ----
    whisper_log_prob_threshold: float = -2.0
    whisper_no_speech_threshold: float = 1.0
    whisper_fallback_beam_size: int = 8
    whisper_fallback_best_of: int = 8
    whisper_fallback_log_prob_threshold: float = -3.0
    whisper_fallback_no_speech_threshold: float = 1.0

    # ---- Partial transcripts and partial responses ----
    partial_transcripts_enabled: bool = True
    partial_transcript_min_ms: int = 250
    partial_transcript_interval_ms: int = 180
    partial_response_enabled: bool = True
    partial_response_stable_ms: int = 180
    partial_response_min_words: int = 4
    partial_response_complete_min_words: int = 2
    partial_response_complete_max_words: int = 6
    partial_response_incomplete_min_words: int = 7
    partial_response_complete_stable_ms: int = 120
    partial_response_incomplete_stable_ms: int = 520
    partial_response_min_silence_ms: int = 120

    # ---- Hallucination filtering ----
    hallucination_max_rms: float = 0.02
    hallucination_max_words: int = 3
    # Comma-separated phrase list stored as a single string.
    hallucination_phrases: str = (
        "thank you,thank you very much,all right,alright,thanks,"
        "okay,ok,okay you,ok you,you,yeah,right,got it,mm hmm,uh huh"
    )

    # ---- Startup and assistant backend selection ----
    preload_models: bool = True
    assistant_backend: str = "my-agent-cli"

    # ---- Assistant backend: local Hugging Face model ----
    hf_local_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
    hf_local_device: str = "cpu"
    hf_local_dtype: str = "float32"
    hf_local_max_new_tokens: int = 40
    hf_local_temperature: float = 0.6
    hf_local_top_p: float = 0.95
    hf_local_do_sample: bool = False
    hf_local_hide_thinking: bool = True
    hf_local_requires_hf_access: bool = False
    hf_local_prompt_prefix: str = (
        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "Answer directly, but sound like a person instead of a bot. Keep the conversation flowing. "
        "When it feels natural, add a brief follow-up question or gentle prompt to keep the exchange going. "
        "Do not show your reasoning. Do not output tags. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Do not mention Human, User, Assistant, Thought, or explain your answer style. "
        "Do not add emojis, roleplay markers, or meta commentary."
    )

    # ---- Assistant backend: my-agent CLI ----
    # NOTE(review): both defaults below are absolute paths inside one user's
    # home directory; other hosts will most likely need to override them.
    my_agent_command: str = "/home/rapheal/.cargo/bin/my-agent"
    my_agent_cwd: str = "/home/rapheal"
    my_agent_active_tty_file: str | None = None
    my_agent_force: bool = True
    my_agent_model: str | None = None
    my_agent_voice_preamble: str = (
        "Voice mode. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Sound warm, conversational, and human, not robotic or overly formal. "
        "When it fits, ask one short follow-up question or make one brief conversational observation to keep the exchange going. "
        "Do not use bullet points, numbered lists, headings, or dense datapoints unless the user explicitly asks. "
        "Sound natural, direct, and human. "
        "If the task involves a web page, browser state, or checking something online in a browser, "
        "prefer browser_use_task first, then browser_use_script, and only fall back to older browser_navigate/browser_snapshot/browser_act tools if browser-use fails or low-level control is explicitly needed. "
        "Keep browser-use vision off unless the task clearly requires visual reasoning."
    )

    # ---- Agent progress, notifications and backchannel ----
    agent_progress_speak_min_interval_ms: int = 9000
    notification_max_wait_ms: int = 15000
    backchannel_enabled: bool = False
    backchannel_min_speech_ms: int = 1100
    backchannel_stable_ms: int = 420
    backchannel_min_interval_ms: int = 6000
    backchannel_recent_limit: int = 3

    # ---- TTS chunking, assistant streaming and prefill ----
    tts_first_chunk_max_chars: int = 72
    tts_first_chunk_max_words: int = 14
    tts_auto_ack_prefix_enabled: bool = False
    assistant_stream_chunk_min_chars: int = 32
    assistant_stream_chunk_min_words: int = 6
    assistant_stream_chunk_max_chars: int = 56
    tts_prefill_enabled: bool = False
    tts_prefill_min_chars: int = 48
    # Comma-separated choices stored as a single string.
    tts_prefill_choices: str = "okay,yeah,right,got it"

    # ---- Assistant backend: OpenRouter ----
    # The alias makes this field read from ``OPENROUTER_API_KEY``; per the
    # pydantic-settings documentation, ``env_prefix`` is not applied to
    # aliased fields.
    openrouter_api_key: str | None = Field(default=None, alias="OPENROUTER_API_KEY")
    openrouter_model: str = "liquid/lfm-2.5-1.2b-instruct:free"
    openrouter_base_url: str = "https://openrouter.ai/api/v1"
    openrouter_site_url: str = "https://voice.agent-assistant.org"
    openrouter_app_name: str = "Voice Latency Lab"
    openrouter_system_prompt: str = _SHARED_CHAT_SYSTEM_PROMPT
    openrouter_max_tokens: int = 72
    openrouter_temperature: float = 0.55

    # ---- Assistant backend: llama server ----
    llama_base_url: str = "http://127.0.0.1:8081/v1"
    llama_model: str = "gemma-4-e2b-it"
    llama_api_key: str | None = None
    llama_system_prompt: str = _SHARED_CHAT_SYSTEM_PROMPT
    llama_max_tokens: int = 56
    llama_temperature: float = 1.0
    llama_top_p: float = 0.95
    llama_top_k: int = 64
    llama_repetition_penalty: float = 1.0
    llama_stop_tokens: str = ""
    llama_context_window: int = 8192
    llama_threads: int = 8


# Module-level instance created at import time, so environment variables and
# the ``.env`` file are read when this module is first imported.
settings = Settings()