Spaces:
Sleeping
Sleeping
| from pydantic import Field | |
| from pydantic_settings import BaseSettings, SettingsConfigDict | |
# Default system prompt shared by the OpenRouter and llama chat backends.
# Both fields previously carried byte-identical copies of this text; a single
# definition keeps the two defaults from silently drifting apart. Each field
# can still be overridden independently at runtime.
_DEFAULT_CHAT_SYSTEM_PROMPT = (
    "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
    "By default, reply in the same language as the user's latest message. "
    "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
    "Use brief conversational acknowledgements when natural, such as 'yeah', 'right', 'mm-hmm', or 'got it', "
    "but do not overuse them. You may occasionally include a single inline paralinguistic tag like [chuckle], "
    "[laugh], [sigh], or [cough] when it clearly fits the moment. Keep answers useful, human, and easy to speak aloud. "
    "When appropriate, add one short follow-up question or gentle prompt so the conversation keeps moving."
)


class Settings(BaseSettings):
    """Configuration for the voice application (STT, VAD, TTS, assistant backends).

    Every field has a default, so ``Settings()`` can be built with no
    arguments. Values may be overridden through environment variables that
    carry the ``VOICE_LAB_`` prefix or through a ``.env`` file; unknown keys
    are ignored. ``openrouter_api_key`` is the exception: it declares the
    alias ``OPENROUTER_API_KEY`` and is read under that name.
    """

    # Declared first so the environment contract is visible before the fields.
    # populate_by_name=True lets code pass ``openrouter_api_key=...`` by field
    # name even though the field declares an alias.
    model_config = SettingsConfigDict(
        env_prefix="VOICE_LAB_",
        env_file=".env",
        extra="ignore",
        populate_by_name=True,
    )

    # --- Speech-to-text: backend selection, Whisper and Parakeet options ---
    stt_backend: str = "whisper"
    whisper_model: str = "small"
    whisper_device: str = "cuda"
    whisper_compute_type: str = "int8"
    whisper_beam_size: int = 5
    whisper_best_of: int = 5
    whisper_temperature: float = 0.0
    whisper_condition_on_previous_text: bool = False
    stt_language: str = "auto"
    stt_multilingual_enabled: bool = True
    parakeet_model_id: str = "nvidia/parakeet-tdt-0.6b-v3"
    parakeet_device: str = "cpu"

    # --- Text-to-speech: backend selection and Chatterbox ONNX options ---
    tts_backend: str = "chatterbox-onnx"
    chatterbox_device: str = "cuda"
    chatterbox_onnx_provider: str = "cuda"
    chatterbox_onnx_dtype: str = "fp16"
    tts_gpu_resident_preferred: bool = True
    chatterbox_onnx_site_packages_path: str = ""
    chatterbox_onnx_model_id: str = "ResembleAI/chatterbox-turbo-ONNX"
    # NOTE(review): machine-specific absolute path baked in as the default; it
    # will not exist on other hosts. Override with
    # VOICE_LAB_CHATTERBOX_ONNX_VOICE_PATH when deploying elsewhere.
    chatterbox_onnx_voice_path: str = (
        "/media/rapheal/New WD m2 Udemba Boys/home-overflow/Projects/voice-latency-lab/"
        ".venv/lib/python3.11/site-packages/s3tokenizer/assets/BAC009S0764W0121.wav"
    )
    chatterbox_onnx_max_new_tokens: int = 96
    chatterbox_onnx_repetition_penalty: float = 1.15

    # --- Text-to-speech: Kokoro options ---
    kokoro_repo_id: str = "hexgrad/Kokoro-82M"
    kokoro_lang_code: str = "a"
    kokoro_voice: str = "af_heart"
    kokoro_device: str = "cuda"
    kokoro_speed: float = 1.0

    # --- Server and audio basics ---
    host: str = "0.0.0.0"
    port: int = 8000
    static_dir: str = "static"
    sample_rate: int = 16000

    # --- Voice activity detection (Parakeet RMS gate, Silero, front-end) ---
    vad_provider: str = "silero"
    parakeet_bypass_silero_vad: bool = False
    parakeet_vad_rms_threshold: float = 0.01
    parakeet_vad_start_ms: int = 80
    parakeet_vad_stop_ms: int = 480
    parakeet_barge_in_rms_threshold: float = 0.02
    parakeet_barge_in_start_ms: int = 180
    silero_vad_threshold: float = 0.14
    silero_vad_start_ms: int = 16
    silero_vad_min_rms: float = 0.00025
    silero_vad_strong_threshold: float = 0.32
    vad_frontend_enabled: bool = True
    vad_frontend_preemphasis: float = 0.97
    vad_frontend_noise_floor_alpha: float = 0.96
    vad_frontend_gate_multiplier: float = 1.08
    vad_frontend_target_rms: float = 0.01
    vad_frontend_max_gain: float = 2.5
    vad_stop_ms: int = 560

    # --- Dynamic endpointing ---
    dynamic_endpointing_enabled: bool = True
    dynamic_endpointing_min_ms: int = 180
    dynamic_endpointing_max_ms: int = 2600
    dynamic_endpointing_incomplete_bias_ms: int = 1500
    dynamic_endpointing_complete_discount_ms: int = 320
    dynamic_endpointing_question_discount_ms: int = 180
    dynamic_endpointing_short_utterance_bias_ms: int = 220
    dynamic_endpointing_no_partial_short_discount_ms: int = 180
    dynamic_endpointing_no_partial_medium_discount_ms: int = 100
    dynamic_endpointing_stale_partial_ms: int = 3200
    dynamic_endpointing_max_turn_ms: int = 30000

    # --- Turn timing and assistant barge-in ---
    preroll_ms: int = 450
    min_utterance_ms: int = 220
    assistant_holdoff_ms: int = 320
    assistant_response_watchdog_ms: int = 3500
    assistant_barge_in_grace_ms: int = 90
    assistant_barge_in_start_ms: int = 60
    assistant_barge_in_min_rms: float = 0.01
    assistant_barge_in_prob_threshold: float = 0.45

    # --- Transcription level thresholds ---
    transcription_trim_threshold: float = 0.005
    transcription_min_rms: float = 0.002
    transcription_min_peak: float = 0.01

    # --- Speaker focus ---
    speaker_focus_enabled: bool = True
    speaker_focus_min_utterance_ms: int = 900
    speaker_focus_min_rms: float = 0.015
    speaker_focus_similarity_threshold: float = 0.82
    speaker_focus_profile_alpha: float = 0.18
    speaker_focus_multi_speaker_threshold: float = 0.16
    speaker_focus_reject_mixed: bool = True
    speaker_focus_debug: bool = True

    # --- Conversation memory ---
    conversation_memory_enabled: bool = True
    conversation_memory_turns: int = 6
    conversation_memory_max_chars: int = 1400

    # --- Short-utterance handling ---
    short_utterance_ms: int = 700
    short_utterance_keep_edge_ms: int = 140
    short_utterance_min_rms: float = 0.0015
    short_utterance_min_peak: float = 0.008
    short_utterance_pad_ms: int = 220
    short_utterance_min_transcription_ms: int = 650

    # --- Whisper decoding thresholds and fallback pass ---
    whisper_log_prob_threshold: float = -2.0
    whisper_no_speech_threshold: float = 1.0
    whisper_fallback_beam_size: int = 8
    whisper_fallback_best_of: int = 8
    whisper_fallback_log_prob_threshold: float = -3.0
    whisper_fallback_no_speech_threshold: float = 1.0

    # --- Partial transcripts and partial (early) responses ---
    partial_transcripts_enabled: bool = True
    partial_transcript_min_ms: int = 250
    partial_transcript_interval_ms: int = 180
    partial_response_enabled: bool = True
    partial_response_stable_ms: int = 180
    partial_response_min_words: int = 4
    partial_response_complete_min_words: int = 2
    partial_response_complete_max_words: int = 6
    partial_response_incomplete_min_words: int = 7
    partial_response_complete_stable_ms: int = 120
    partial_response_incomplete_stable_ms: int = 520
    partial_response_min_silence_ms: int = 120

    # --- Hallucination-related limits and phrase list ---
    hallucination_max_rms: float = 0.02
    hallucination_max_words: int = 3
    # Stored as one comma-separated string; presumably split by the consumer
    # (not visible in this file).
    hallucination_phrases: str = (
        "thank you,thank you very much,all right,alright,thanks,"
        "okay,ok,okay you,ok you,you,yeah,right,got it,mm hmm,uh huh"
    )

    # --- Startup and assistant backend selection ---
    preload_models: bool = True
    assistant_backend: str = "my-agent-cli"

    # --- Assistant backend: local Hugging Face model ---
    hf_local_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
    hf_local_device: str = "cpu"
    hf_local_dtype: str = "float32"
    hf_local_max_new_tokens: int = 40
    hf_local_temperature: float = 0.6
    hf_local_top_p: float = 0.95
    hf_local_do_sample: bool = False
    hf_local_hide_thinking: bool = True
    hf_local_requires_hf_access: bool = False
    hf_local_prompt_prefix: str = (
        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "Answer directly, but sound like a person instead of a bot. Keep the conversation flowing. "
        "When it feels natural, add a brief follow-up question or gentle prompt to keep the exchange going. "
        "Do not show your reasoning. Do not output <think> tags. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Do not mention Human, User, Assistant, Thought, or explain your answer style. "
        "Do not add emojis, roleplay markers, or meta commentary."
    )

    # --- Assistant backend: my-agent CLI ---
    # NOTE(review): the command and working-directory defaults are absolute
    # paths under one user's home directory; override them via
    # VOICE_LAB_MY_AGENT_COMMAND / VOICE_LAB_MY_AGENT_CWD on other machines.
    my_agent_command: str = "/home/rapheal/.cargo/bin/my-agent"
    my_agent_cwd: str = "/home/rapheal"
    my_agent_active_tty_file: str | None = None
    my_agent_force: bool = True
    my_agent_model: str | None = None
    my_agent_voice_preamble: str = (
        "Voice mode. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Sound warm, conversational, and human, not robotic or overly formal. "
        "When it fits, ask one short follow-up question or make one brief conversational observation to keep the exchange going. "
        "Do not use bullet points, numbered lists, headings, or dense datapoints unless the user explicitly asks. "
        "Sound natural, direct, and human. "
        "If the task involves a web page, browser state, or checking something online in a browser, "
        "prefer browser_use_task first, then browser_use_script, and only fall back to older browser_navigate/browser_snapshot/browser_act tools if browser-use fails or low-level control is explicitly needed. "
        "Keep browser-use vision off unless the task clearly requires visual reasoning."
    )

    # --- Agent progress and notifications ---
    agent_progress_speak_min_interval_ms: int = 9000
    notification_max_wait_ms: int = 15000

    # --- Backchannel ---
    backchannel_enabled: bool = False
    backchannel_min_speech_ms: int = 1100
    backchannel_stable_ms: int = 420
    backchannel_min_interval_ms: int = 6000
    backchannel_recent_limit: int = 3

    # --- TTS chunking, streaming chunk sizes and prefill ---
    tts_first_chunk_max_chars: int = 72
    tts_first_chunk_max_words: int = 14
    tts_auto_ack_prefix_enabled: bool = False
    assistant_stream_chunk_min_chars: int = 32
    assistant_stream_chunk_min_words: int = 6
    assistant_stream_chunk_max_chars: int = 56
    tts_prefill_enabled: bool = False
    tts_prefill_min_chars: int = 48
    tts_prefill_choices: str = "okay,yeah,right,got it"

    # --- Assistant backend: OpenRouter ---
    # The alias makes this field read OPENROUTER_API_KEY rather than a
    # VOICE_LAB_-prefixed variable.
    openrouter_api_key: str | None = Field(default=None, alias="OPENROUTER_API_KEY")
    openrouter_model: str = "liquid/lfm-2.5-1.2b-instruct:free"
    openrouter_base_url: str = "https://openrouter.ai/api/v1"
    openrouter_site_url: str = "https://voice.agent-assistant.org"
    openrouter_app_name: str = "Voice Latency Lab"
    openrouter_system_prompt: str = _DEFAULT_CHAT_SYSTEM_PROMPT
    openrouter_max_tokens: int = 72
    openrouter_temperature: float = 0.55

    # --- Assistant backend: llama (OpenAI-style base URL) ---
    llama_base_url: str = "http://127.0.0.1:8081/v1"
    llama_model: str = "gemma-4-e2b-it"
    llama_api_key: str | None = None
    llama_system_prompt: str = _DEFAULT_CHAT_SYSTEM_PROMPT
    llama_max_tokens: int = 56
    llama_temperature: float = 1.0
    llama_top_p: float = 0.95
    llama_top_k: int = 64
    llama_repetition_penalty: float = 1.0
    llama_stop_tokens: str = "<turn|>"
    llama_context_window: int = 8192
    llama_threads: int = 8
# Module-level settings instance, constructed once when this module is
# imported. Values come from the field defaults on Settings, overridden by
# VOICE_LAB_-prefixed environment variables and the .env file named in
# Settings.model_config.
settings = Settings()