# File: lesson-agent-dev / voice_models.yaml
# Last commit: "Feat/sprint last 2hours (#22)" (aac5f23)
# Voice model presets for EchoCoach (ASR + TTS).
# Override defaults via ECHOCOACH_ASR_PRESET / ECHOCOACH_TTS_PRESET in .env
# Presets and models used when nothing overrides them.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# stripped - confirm against the loader's schema.
defaults:
  # Each preset value matches an entry name in this file
  # (whisper-cpp-base under `asr`, piper-multilingual under `tts`).
  asr_preset: whisper-cpp-base
  tts_preset: piper-multilingual
  # Realtime streaming TTS for TeacherVoice VoiceOut (set ECHOCOACH_TTS_PRESET to match)
  realtime_tts_preset: vibevoice-realtime-0.5b
  coach_model: minicpm5-1b-language-lesson-hub
  # Presumably tried in order when coach_model is unavailable - confirm with the loader.
  coach_fallbacks:
    - minicpm5-1b-language-lesson-lora
    - minicpm5-1b
  # NOTE(review): assumed to be a default rather than a top-level key - confirm.
  max_seconds: 30
# Languages offered by the app, one mapping per entry:
#   code  - short language code; the same codes are used as keys under
#           tts.piper-multilingual.voices and in the per-preset `languages` lists
#   label - human-readable name
languages:
  - code: en
    label: English
  - code: fr
    label: French
  - code: de
    label: German
  - code: es
    label: Spanish
  - code: it
    label: Italian
  - code: pt
    label: Portuguese
  - code: nl
    label: Dutch
  - code: pl
    label: Polish
  - code: el
    label: Greek
  - code: ar
    label: Arabic
  - code: ja
    label: Japanese
  - code: zh
    label: Chinese (Mandarin)
  - code: vi
    label: Vietnamese
  - code: ko
    label: Korean
# Speech-recognition (ASR) presets, keyed by preset name.
# defaults.asr_preset selects one of these names.
# Each preset has a `label`, a `backend`, and a backend-specific model
# selector (`model_id` for cohere, `model_size` for whisper_cpp).
asr:
  cohere-transcribe:
    label: Cohere Transcribe 2B (14 languages)
    backend: cohere
    model_id: CohereLabs/cohere-transcribe-03-2026
  whisper-cpp-tiny:
    label: Whisper.cpp tiny (CPU, fast)
    backend: whisper_cpp
    model_size: tiny
  whisper-cpp-base:
    label: Whisper.cpp base (CPU, better WER)
    backend: whisper_cpp
    model_size: base
# Optional omni speech-in/speech-out (GPU; set ECHOCOACH_VOICE_PROFILE=omni)
# Presets are keyed by name; `languages` lists the language codes the preset covers.
# NOTE(review): assumed to be a top-level section (sibling of `asr` and `tts`);
# the copy this was restored from had no indentation - confirm.
omni:
  minicpm-o-4.5:
    label: MiniCPM-o 4.5 (GPU omni falls back to pipeline until wired)
    model_id: openbmb/MiniCPM-o-4_5
    languages: [en, zh]
# Text-to-speech (TTS) presets, keyed by preset name.
# defaults.tts_preset and defaults.realtime_tts_preset each select one of these names.
tts:
  piper-multilingual:
    label: Piper TTS (local VoiceOut)
    backend: piper
    # One voice id per language code; the keys mirror the `languages` list.
    # NOTE(review): verify the ids against the Piper voice catalogue. `vi` is the
    # only entry without a quality suffix (-low/-medium) - confirm it is correct.
    voices:
      en: en_US-lessac-medium
      fr: fr_FR-siwis-medium
      de: de_DE-thorsten-medium
      es: es_ES-sharvard-medium
      it: it_IT-riccardo-medium
      pt: pt_BR-faber-medium
      nl: nl_NL-mls-medium
      pl: pl_PL-darkman-medium
      el: el_GR-rapunzelina-low
      ar: ar_JO-kareem-medium
      ja: ja_JP-natsuki-medium
      zh: zh_CN-huayan-medium
      vi: vi_VN-25hours-single
      ko: ko_KR-kss-medium
    # Presumably used when a language has no entry in `voices` - confirm with the loader.
    # NOTE(review): assumed to be a sibling of `voices`, not a key inside it.
    fallback_voice: en_US-lessac-medium
  # Microsoft VibeVoice Realtime — streaming TTS, ~300ms to first audio (GPU recommended).
  # https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B
  # English-first; also supports de, fr, it, es, pt, nl, pl, ja, ko (experimental).
  # Requires transformers + torch; falls back to Piper until fully wired in echocoach.
  vibevoice-realtime-0.5b:
    label: VibeVoice Realtime 0.5B (streaming, ~300ms)
    backend: vibevoice
    model_id: microsoft/VibeVoice-Realtime-0.5B
    streaming: true
    realtime: true
    languages: [en, de, fr, it, es, pt, nl, pl, ja, ko]
    # Presumably the language used for codes outside `languages` - confirm with the loader.
    fallback_language: en