Spaces:

RalphThings
/

voice-agent

Sleeping

App Files Files Community

voice-agent / app /config.py

RalphThings

Deploy Hugging Face Space

5f0a2ac 8 days ago

raw

history blame contribute delete

11.2 kB

	from pydantic import Field
	from pydantic_settings import BaseSettings, SettingsConfigDict


	class Settings(BaseSettings):
	    """Typed configuration for the voice agent, loaded through pydantic-settings.

	    Every attribute is a field with a default value. As configured in
	    ``model_config`` at the bottom of the class, values can be overridden by
	    environment variables carrying the ``VOICE_LAB_`` prefix or by entries in
	    a ``.env`` file, and unrecognised keys are ignored.

	    Notes for maintainers:
	        * Several defaults are absolute paths on one developer's machine
	          (``chatterbox_onnx_voice_path``, ``my_agent_command``,
	          ``my_agent_cwd``); they must be overridden anywhere else.
	        * Fields ending in ``_ms`` are presumably millisecond durations;
	          confirm against the code that consumes them.
	        * Fields holding several items in one ``str`` (for example
	          ``hallucination_phrases``) use comma-separated text.
	    """

	    # --- Speech-to-text: backend choice, Whisper and Parakeet options ---
	    stt_backend: str = "whisper"
	    whisper_model: str = "small"
	    whisper_device: str = "cuda"
	    whisper_compute_type: str = "int8"
	    whisper_beam_size: int = 5
	    whisper_best_of: int = 5
	    whisper_temperature: float = 0.0
	    whisper_condition_on_previous_text: bool = False
	    stt_language: str = "auto"
	    stt_multilingual_enabled: bool = True
	    parakeet_model_id: str = "nvidia/parakeet-tdt-0.6b-v3"
	    parakeet_device: str = "cpu"

	    # --- Text-to-speech: backend choice, Chatterbox ONNX and Kokoro options ---
	    tts_backend: str = "chatterbox-onnx"
	    chatterbox_device: str = "cuda"
	    chatterbox_onnx_provider: str = "cuda"
	    chatterbox_onnx_dtype: str = "fp16"
	    tts_gpu_resident_preferred: bool = True
	    chatterbox_onnx_site_packages_path: str = ""
	    chatterbox_onnx_model_id: str = "ResembleAI/chatterbox-turbo-ONNX"
	    # NOTE(review): default is an absolute path inside a local virtualenv on a
	    # developer machine; override it via the environment when deploying.
	    chatterbox_onnx_voice_path: str = (
	        "/media/rapheal/New WD m2 Udemba Boys/home-overflow/Projects/voice-latency-lab/"
	        ".venv/lib/python3.11/site-packages/s3tokenizer/assets/BAC009S0764W0121.wav"
	    )
	    chatterbox_onnx_max_new_tokens: int = 96
	    chatterbox_onnx_repetition_penalty: float = 1.15
	    kokoro_repo_id: str = "hexgrad/Kokoro-82M"
	    kokoro_lang_code: str = "a"
	    kokoro_voice: str = "af_heart"
	    kokoro_device: str = "cuda"
	    kokoro_speed: float = 1.0

	    # --- Server and audio basics ---
	    host: str = "0.0.0.0"
	    port: int = 8000
	    static_dir: str = "static"
	    sample_rate: int = 16000

	    # --- Voice activity detection (Parakeet RMS gate, Silero, front-end) ---
	    vad_provider: str = "silero"
	    parakeet_bypass_silero_vad: bool = False
	    parakeet_vad_rms_threshold: float = 0.01
	    parakeet_vad_start_ms: int = 80
	    parakeet_vad_stop_ms: int = 480
	    parakeet_barge_in_rms_threshold: float = 0.02
	    parakeet_barge_in_start_ms: int = 180
	    silero_vad_threshold: float = 0.14
	    silero_vad_start_ms: int = 16
	    silero_vad_min_rms: float = 0.00025
	    silero_vad_strong_threshold: float = 0.32
	    vad_frontend_enabled: bool = True
	    vad_frontend_preemphasis: float = 0.97
	    vad_frontend_noise_floor_alpha: float = 0.96
	    vad_frontend_gate_multiplier: float = 1.08
	    vad_frontend_target_rms: float = 0.01
	    vad_frontend_max_gain: float = 2.5
	    vad_stop_ms: int = 560

	    # --- Dynamic endpointing ---
	    dynamic_endpointing_enabled: bool = True
	    dynamic_endpointing_min_ms: int = 180
	    dynamic_endpointing_max_ms: int = 2600
	    dynamic_endpointing_incomplete_bias_ms: int = 1500
	    dynamic_endpointing_complete_discount_ms: int = 320
	    dynamic_endpointing_question_discount_ms: int = 180
	    dynamic_endpointing_short_utterance_bias_ms: int = 220
	    dynamic_endpointing_no_partial_short_discount_ms: int = 180
	    dynamic_endpointing_no_partial_medium_discount_ms: int = 100
	    dynamic_endpointing_stale_partial_ms: int = 3200
	    dynamic_endpointing_max_turn_ms: int = 30000

	    # --- Turn timing and assistant barge-in ---
	    preroll_ms: int = 450
	    min_utterance_ms: int = 220
	    assistant_holdoff_ms: int = 320
	    assistant_response_watchdog_ms: int = 3500
	    assistant_barge_in_grace_ms: int = 90
	    assistant_barge_in_start_ms: int = 60
	    assistant_barge_in_min_rms: float = 0.01
	    assistant_barge_in_prob_threshold: float = 0.45

	    # --- Transcription input level thresholds ---
	    transcription_trim_threshold: float = 0.005
	    transcription_min_rms: float = 0.002
	    transcription_min_peak: float = 0.01

	    # --- Speaker focus ---
	    speaker_focus_enabled: bool = True
	    speaker_focus_min_utterance_ms: int = 900
	    speaker_focus_min_rms: float = 0.015
	    speaker_focus_similarity_threshold: float = 0.82
	    speaker_focus_profile_alpha: float = 0.18
	    speaker_focus_multi_speaker_threshold: float = 0.16
	    speaker_focus_reject_mixed: bool = True
	    speaker_focus_debug: bool = True

	    # --- Conversation memory ---
	    conversation_memory_enabled: bool = True
	    conversation_memory_turns: int = 6
	    conversation_memory_max_chars: int = 1400

	    # --- Short-utterance handling ---
	    short_utterance_ms: int = 700
	    short_utterance_keep_edge_ms: int = 140
	    short_utterance_min_rms: float = 0.0015
	    short_utterance_min_peak: float = 0.008
	    short_utterance_pad_ms: int = 220
	    short_utterance_min_transcription_ms: int = 650

	    # --- Whisper thresholds, primary and fallback ---
	    whisper_log_prob_threshold: float = -2.0
	    whisper_no_speech_threshold: float = 1.0
	    whisper_fallback_beam_size: int = 8
	    whisper_fallback_best_of: int = 8
	    whisper_fallback_log_prob_threshold: float = -3.0
	    whisper_fallback_no_speech_threshold: float = 1.0

	    # --- Partial transcripts and partial (early) responses ---
	    partial_transcripts_enabled: bool = True
	    partial_transcript_min_ms: int = 250
	    partial_transcript_interval_ms: int = 180
	    partial_response_enabled: bool = True
	    partial_response_stable_ms: int = 180
	    partial_response_min_words: int = 4
	    partial_response_complete_min_words: int = 2
	    partial_response_complete_max_words: int = 6
	    partial_response_incomplete_min_words: int = 7
	    partial_response_complete_stable_ms: int = 120
	    partial_response_incomplete_stable_ms: int = 520
	    partial_response_min_silence_ms: int = 120

	    # --- Hallucination filter ---
	    hallucination_max_rms: float = 0.02
	    hallucination_max_words: int = 3
	    # Comma-separated phrases held in a single string.
	    hallucination_phrases: str = (
	        "thank you,thank you very much,all right,alright,thanks,"
	        "okay,ok,okay you,ok you,you,yeah,right,got it,mm hmm,uh huh"
	    )

	    # --- Start-up and assistant backend selection ---
	    preload_models: bool = True
	    assistant_backend: str = "my-agent-cli"

	    # --- Local Hugging Face model backend ---
	    hf_local_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
	    hf_local_device: str = "cpu"
	    hf_local_dtype: str = "float32"
	    hf_local_max_new_tokens: int = 40
	    hf_local_temperature: float = 0.6
	    hf_local_top_p: float = 0.95
	    hf_local_do_sample: bool = False
	    hf_local_hide_thinking: bool = True
	    hf_local_requires_hf_access: bool = False
	    hf_local_prompt_prefix: str = (
	        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
	        "Answer directly, but sound like a person instead of a bot. Keep the conversation flowing. "
	        "When it feels natural, add a brief follow-up question or gentle prompt to keep the exchange going. "
	        "Do not show your reasoning. Do not output <think> tags. "
	        "By default, reply in the same language as the user's latest message. "
	        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
	        "Do not mention Human, User, Assistant, Thought, or explain your answer style. "
	        "Do not add emojis, roleplay markers, or meta commentary."
	    )

	    # --- my-agent CLI backend ---
	    # NOTE(review): the command and working-directory defaults are absolute
	    # paths under one user's home directory; override them on other machines.
	    my_agent_command: str = "/home/rapheal/.cargo/bin/my-agent"
	    my_agent_cwd: str = "/home/rapheal"
	    my_agent_active_tty_file: str | None = None
	    my_agent_force: bool = True
	    my_agent_model: str | None = None
	    my_agent_voice_preamble: str = (
	        "Voice mode. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
	        "By default, reply in the same language as the user's latest message. "
	        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
	        "Sound warm, conversational, and human, not robotic or overly formal. "
	        "When it fits, ask one short follow-up question or make one brief conversational observation to keep the exchange going. "
	        "Do not use bullet points, numbered lists, headings, or dense datapoints unless the user explicitly asks. "
	        "Sound natural, direct, and human. "
	        "If the task involves a web page, browser state, or checking something online in a browser, "
	        "prefer browser_use_task first, then browser_use_script, and only fall back to older browser_navigate/browser_snapshot/browser_act tools if browser-use fails or low-level control is explicitly needed. "
	        "Keep browser-use vision off unless the task clearly requires visual reasoning."
	    )

	    # --- Progress announcements, notifications and backchannel ---
	    agent_progress_speak_min_interval_ms: int = 9000
	    notification_max_wait_ms: int = 15000
	    backchannel_enabled: bool = False
	    backchannel_min_speech_ms: int = 1100
	    backchannel_stable_ms: int = 420
	    backchannel_min_interval_ms: int = 6000
	    backchannel_recent_limit: int = 3

	    # --- TTS chunking, streaming and prefill ---
	    tts_first_chunk_max_chars: int = 72
	    tts_first_chunk_max_words: int = 14
	    tts_auto_ack_prefix_enabled: bool = False
	    assistant_stream_chunk_min_chars: int = 32
	    assistant_stream_chunk_min_words: int = 6
	    assistant_stream_chunk_max_chars: int = 56
	    tts_prefill_enabled: bool = False
	    tts_prefill_min_chars: int = 48
	    # Comma-separated choices held in a single string.
	    tts_prefill_choices: str = "okay,yeah,right,got it"

	    # --- OpenRouter backend ---
	    # This field declares an explicit alias, so it is looked up as
	    # OPENROUTER_API_KEY; pydantic-settings does not apply ``env_prefix`` to
	    # aliased fields. ``populate_by_name=True`` below also lets code pass it
	    # by its field name.
	    openrouter_api_key: str | None = Field(default=None, alias="OPENROUTER_API_KEY")
	    openrouter_model: str = "liquid/lfm-2.5-1.2b-instruct:free"
	    openrouter_base_url: str = "https://openrouter.ai/api/v1"
	    openrouter_site_url: str = "https://voice.agent-assistant.org"
	    openrouter_app_name: str = "Voice Latency Lab"
	    openrouter_system_prompt: str = (
	        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
	        "By default, reply in the same language as the user's latest message. "
	        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
	        "Use brief conversational acknowledgements when natural, such as 'yeah', 'right', 'mm-hmm', or 'got it', "
	        "but do not overuse them. You may occasionally include a single inline paralinguistic tag like [chuckle], "
	        "[laugh], [sigh], or [cough] when it clearly fits the moment. Keep answers useful, human, and easy to speak aloud. "
	        "When appropriate, add one short follow-up question or gentle prompt so the conversation keeps moving."
	    )
	    openrouter_max_tokens: int = 72
	    openrouter_temperature: float = 0.55

	    # --- llama (OpenAI-compatible local server) backend ---
	    llama_base_url: str = "http://127.0.0.1:8081/v1"
	    llama_model: str = "gemma-4-e2b-it"
	    llama_api_key: str | None = None
	    # The default text is identical to ``openrouter_system_prompt``; keep the
	    # two in sync if either is edited.
	    llama_system_prompt: str = (
	        "You are a warm, natural voice assistant. Reply in spoken language, usually one or two sentences and sometimes a short third sentence when it helps. "
	        "By default, reply in the same language as the user's latest message. "
	        "If the user explicitly asks for translation, comparison, or output in two or more languages, include all requested languages. "
	        "Use brief conversational acknowledgements when natural, such as 'yeah', 'right', 'mm-hmm', or 'got it', "
	        "but do not overuse them. You may occasionally include a single inline paralinguistic tag like [chuckle], "
	        "[laugh], [sigh], or [cough] when it clearly fits the moment. Keep answers useful, human, and easy to speak aloud. "
	        "When appropriate, add one short follow-up question or gentle prompt so the conversation keeps moving."
	    )
	    llama_max_tokens: int = 56
	    llama_temperature: float = 1.0
	    llama_top_p: float = 0.95
	    llama_top_k: int = 64
	    llama_repetition_penalty: float = 1.0
	    llama_stop_tokens: str = "<turn|>"
	    llama_context_window: int = 8192
	    llama_threads: int = 8

	    # Loading rules: prefix for environment variable names, optional ".env"
	    # file, unknown keys ignored, and aliased fields also settable by name.
	    model_config = SettingsConfigDict(
	        env_prefix="VOICE_LAB_",
	        env_file=".env",
	        extra="ignore",
	        populate_by_name=True,
	    )


	# Module-level Settings instance, constructed once when this module is imported.
	# NOTE(review): construction goes through pydantic-settings, so importing this
	# module reads the environment and ".env" as a side effect — confirm callers
	# expect that.
	settings = Settings()