# voice-agent/app/config.py
# Last commit: "Deploy Hugging Face Space" (5f0a2ac), by RalphThings
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# Default system prompt shared by the OpenRouter and llama backends. Both
# fields previously carried their own byte-identical copy of this literal;
# keeping a single definition stops the two defaults from drifting apart.
_DEFAULT_CHAT_SYSTEM_PROMPT = (
    "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
    "By default, reply in the same language as the user's latest message. "
    "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
    "Use brief conversational acknowledgements when natural, such as 'yeah', 'right', 'mm-hmm', or 'got it', "
    "but do not overuse them. You may occasionally include a single inline paralinguistic tag like [chuckle], "
    "[laugh], [sigh], or [cough] when it clearly fits the moment. Keep answers useful, human, and easy to speak aloud. "
    "When appropriate, add one short follow-up question or gentle prompt so the conversation keeps moving."
)


class Settings(BaseSettings):
    """Typed configuration for the voice agent.

    Every field has an in-code default and can be overridden through the
    environment or a ``.env`` file (see ``model_config`` at the bottom of the
    class). Environment variable names are the field name prefixed with
    ``VOICE_LAB_``; ``openrouter_api_key`` is the exception because it declares
    an explicit alias and is read from ``OPENROUTER_API_KEY``.

    Field order is kept as declared so that dumps of the model stay stable.
    """

    # --- Speech-to-text backend selection and Whisper decoding -------------
    stt_backend: str = "whisper"
    whisper_model: str = "small"
    whisper_device: str = "cuda"
    whisper_compute_type: str = "int8"
    whisper_beam_size: int = 5
    whisper_best_of: int = 5
    whisper_temperature: float = 0.0
    whisper_condition_on_previous_text: bool = False
    stt_language: str = "auto"
    stt_multilingual_enabled: bool = True
    parakeet_model_id: str = "nvidia/parakeet-tdt-0.6b-v3"
    parakeet_device: str = "cpu"

    # --- Text-to-speech: Chatterbox ONNX ------------------------------------
    tts_backend: str = "chatterbox-onnx"
    chatterbox_device: str = "cuda"
    chatterbox_onnx_provider: str = "cuda"
    chatterbox_onnx_dtype: str = "fp16"
    tts_gpu_resident_preferred: bool = True
    chatterbox_onnx_site_packages_path: str = ""
    chatterbox_onnx_model_id: str = "ResembleAI/chatterbox-turbo-ONNX"
    # NOTE(review): this default is an absolute path on a developer machine;
    # deployments presumably need to override it via the environment.
    chatterbox_onnx_voice_path: str = (
        "/media/rapheal/New WD m2 Udemba Boys/home-overflow/Projects/voice-latency-lab/"
        ".venv/lib/python3.11/site-packages/s3tokenizer/assets/BAC009S0764W0121.wav"
    )
    chatterbox_onnx_max_new_tokens: int = 96
    chatterbox_onnx_repetition_penalty: float = 1.15

    # --- Text-to-speech: Kokoro ---------------------------------------------
    kokoro_repo_id: str = "hexgrad/Kokoro-82M"
    kokoro_lang_code: str = "a"
    kokoro_voice: str = "af_heart"
    kokoro_device: str = "cuda"
    kokoro_speed: float = 1.0

    # --- Server and audio basics --------------------------------------------
    host: str = "0.0.0.0"
    port: int = 8000
    static_dir: str = "static"
    sample_rate: int = 16000

    # --- Voice activity detection -------------------------------------------
    vad_provider: str = "silero"
    parakeet_bypass_silero_vad: bool = False
    parakeet_vad_rms_threshold: float = 0.01
    parakeet_vad_start_ms: int = 80
    parakeet_vad_stop_ms: int = 480
    parakeet_barge_in_rms_threshold: float = 0.02
    parakeet_barge_in_start_ms: int = 180
    silero_vad_threshold: float = 0.14
    silero_vad_start_ms: int = 16
    silero_vad_min_rms: float = 0.00025
    silero_vad_strong_threshold: float = 0.32
    vad_frontend_enabled: bool = True
    vad_frontend_preemphasis: float = 0.97
    vad_frontend_noise_floor_alpha: float = 0.96
    vad_frontend_gate_multiplier: float = 1.08
    vad_frontend_target_rms: float = 0.01
    vad_frontend_max_gain: float = 2.5
    vad_stop_ms: int = 560

    # --- Dynamic endpointing --------------------------------------------------
    dynamic_endpointing_enabled: bool = True
    dynamic_endpointing_min_ms: int = 180
    dynamic_endpointing_max_ms: int = 2600
    dynamic_endpointing_incomplete_bias_ms: int = 1500
    dynamic_endpointing_complete_discount_ms: int = 320
    dynamic_endpointing_question_discount_ms: int = 180
    dynamic_endpointing_short_utterance_bias_ms: int = 220
    dynamic_endpointing_no_partial_short_discount_ms: int = 180
    dynamic_endpointing_no_partial_medium_discount_ms: int = 100
    dynamic_endpointing_stale_partial_ms: int = 3200
    dynamic_endpointing_max_turn_ms: int = 30000

    # --- Turn timing and barge-in ---------------------------------------------
    preroll_ms: int = 450
    min_utterance_ms: int = 220
    assistant_holdoff_ms: int = 320
    assistant_response_watchdog_ms: int = 3500
    assistant_barge_in_grace_ms: int = 90
    assistant_barge_in_start_ms: int = 60
    assistant_barge_in_min_rms: float = 0.01
    assistant_barge_in_prob_threshold: float = 0.45

    # --- Transcription input thresholds ---------------------------------------
    transcription_trim_threshold: float = 0.005
    transcription_min_rms: float = 0.002
    transcription_min_peak: float = 0.01

    # --- Speaker focus ----------------------------------------------------------
    speaker_focus_enabled: bool = True
    speaker_focus_min_utterance_ms: int = 900
    speaker_focus_min_rms: float = 0.015
    speaker_focus_similarity_threshold: float = 0.82
    speaker_focus_profile_alpha: float = 0.18
    speaker_focus_multi_speaker_threshold: float = 0.16
    speaker_focus_reject_mixed: bool = True
    speaker_focus_debug: bool = True

    # --- Conversation memory ----------------------------------------------------
    conversation_memory_enabled: bool = True
    conversation_memory_turns: int = 6
    conversation_memory_max_chars: int = 1400

    # --- Short-utterance handling -----------------------------------------------
    short_utterance_ms: int = 700
    short_utterance_keep_edge_ms: int = 140
    short_utterance_min_rms: float = 0.0015
    short_utterance_min_peak: float = 0.008
    short_utterance_pad_ms: int = 220
    short_utterance_min_transcription_ms: int = 650

    # --- Whisper acceptance thresholds and fallback decode ----------------------
    whisper_log_prob_threshold: float = -2.0
    whisper_no_speech_threshold: float = 1.0
    whisper_fallback_beam_size: int = 8
    whisper_fallback_best_of: int = 8
    whisper_fallback_log_prob_threshold: float = -3.0
    whisper_fallback_no_speech_threshold: float = 1.0

    # --- Partial transcripts and early responses --------------------------------
    partial_transcripts_enabled: bool = True
    partial_transcript_min_ms: int = 250
    partial_transcript_interval_ms: int = 180
    partial_response_enabled: bool = True
    partial_response_stable_ms: int = 180
    partial_response_min_words: int = 4
    partial_response_complete_min_words: int = 2
    partial_response_complete_max_words: int = 6
    partial_response_incomplete_min_words: int = 7
    partial_response_complete_stable_ms: int = 120
    partial_response_incomplete_stable_ms: int = 520
    partial_response_min_silence_ms: int = 120

    # --- Hallucination filter -----------------------------------------------------
    hallucination_max_rms: float = 0.02
    hallucination_max_words: int = 3
    # Comma-separated phrase list stored as a single string.
    hallucination_phrases: str = (
        "thank you,thank you very much,all right,alright,thanks,"
        "okay,ok,okay you,ok you,you,yeah,right,got it,mm hmm,uh huh"
    )

    preload_models: bool = True

    # --- Assistant backend selection ----------------------------------------------
    assistant_backend: str = "my-agent-cli"

    # --- Local Hugging Face model backend -----------------------------------------
    hf_local_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
    hf_local_device: str = "cpu"
    hf_local_dtype: str = "float32"
    hf_local_max_new_tokens: int = 40
    hf_local_temperature: float = 0.6
    hf_local_top_p: float = 0.95
    hf_local_do_sample: bool = False
    hf_local_hide_thinking: bool = True
    hf_local_requires_hf_access: bool = False
    hf_local_prompt_prefix: str = (
        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "Answer directly, but sound like a person instead of a bot. Keep the conversation flowing. "
        "When it feels natural, add a brief follow-up question or gentle prompt to keep the exchange going. "
        "Do not show your reasoning. Do not output <think> tags. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Do not mention Human, User, Assistant, Thought, or explain your answer style. "
        "Do not add emojis, roleplay markers, or meta commentary."
    )

    # --- my-agent CLI backend -------------------------------------------------------
    # NOTE(review): the command and cwd defaults are absolute paths under one
    # user's home directory; other hosts presumably need to override them.
    my_agent_command: str = "/home/rapheal/.cargo/bin/my-agent"
    my_agent_cwd: str = "/home/rapheal"
    my_agent_active_tty_file: str | None = None
    my_agent_force: bool = True
    my_agent_model: str | None = None
    my_agent_voice_preamble: str = (
        "Voice mode. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
        "By default, reply in the same language as the user's latest message. "
        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
        "Sound warm, conversational, and human, not robotic or overly formal. "
        "When it fits, ask one short follow-up question or make one brief conversational observation to keep the exchange going. "
        "Do not use bullet points, numbered lists, headings, or dense datapoints unless the user explicitly asks. "
        "Sound natural, direct, and human. "
        "If the task involves a web page, browser state, or checking something online in a browser, "
        "prefer browser_use_task first, then browser_use_script, and only fall back to older browser_navigate/browser_snapshot/browser_act tools if browser-use fails or low-level control is explicitly needed. "
        "Keep browser-use vision off unless the task clearly requires visual reasoning."
    )

    # --- Progress speech, notifications and backchannels ----------------------------
    agent_progress_speak_min_interval_ms: int = 9000
    notification_max_wait_ms: int = 15000
    backchannel_enabled: bool = False
    backchannel_min_speech_ms: int = 1100
    backchannel_stable_ms: int = 420
    backchannel_min_interval_ms: int = 6000
    backchannel_recent_limit: int = 3

    # --- TTS chunking and prefill -----------------------------------------------------
    tts_first_chunk_max_chars: int = 72
    tts_first_chunk_max_words: int = 14
    tts_auto_ack_prefix_enabled: bool = False
    assistant_stream_chunk_min_chars: int = 32
    assistant_stream_chunk_min_words: int = 6
    assistant_stream_chunk_max_chars: int = 56
    tts_prefill_enabled: bool = False
    tts_prefill_min_chars: int = 48
    # Comma-separated choice list stored as a single string.
    tts_prefill_choices: str = "okay,yeah,right,got it"

    # --- OpenRouter backend -------------------------------------------------------------
    # The explicit alias means this field is read from OPENROUTER_API_KEY rather
    # than from a VOICE_LAB_-prefixed variable.
    openrouter_api_key: str | None = Field(default=None, alias="OPENROUTER_API_KEY")
    openrouter_model: str = "liquid/lfm-2.5-1.2b-instruct:free"
    openrouter_base_url: str = "https://openrouter.ai/api/v1"
    openrouter_site_url: str = "https://voice.agent-assistant.org"
    openrouter_app_name: str = "Voice Latency Lab"
    openrouter_system_prompt: str = _DEFAULT_CHAT_SYSTEM_PROMPT
    openrouter_max_tokens: int = 72
    openrouter_temperature: float = 0.55

    # --- llama (OpenAI-style local endpoint) backend ------------------------------------
    llama_base_url: str = "http://127.0.0.1:8081/v1"
    llama_model: str = "gemma-4-e2b-it"
    llama_api_key: str | None = None
    llama_system_prompt: str = _DEFAULT_CHAT_SYSTEM_PROMPT
    llama_max_tokens: int = 56
    llama_temperature: float = 1.0
    llama_top_p: float = 0.95
    llama_top_k: int = 64
    llama_repetition_penalty: float = 1.0
    llama_stop_tokens: str = "<turn|>"
    llama_context_window: int = 8192
    llama_threads: int = 8

    # Settings sources: VOICE_LAB_-prefixed environment variables and a local
    # ".env" file. Unknown keys are ignored instead of raising, and
    # populate_by_name lets aliased fields also be set by their field name.
    model_config = SettingsConfigDict(
        env_prefix="VOICE_LAB_",
        env_file=".env",
        extra="ignore",
        populate_by_name=True,
    )
# Module-level settings instance, constructed once when this module is first
# imported; configuration is therefore resolved at import time.
settings = Settings()