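# NOTE(review): the three items below are file-viewer residue from the page this
# file was copied from, not part of the module -- presumably the uploader's
# avatar caption, the commit message and the commit hash (confirm, then drop):
#   flozi00's picture
#   english
#   d0b9ec6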
"""
Chatterbox Multilingual TTS Backend with Voice Cloning support.
This is the default backend for the Phone Announcements engine.
"""
from typing import Optional
import numpy as np
from loguru import logger
from .base import BackendConfig, TTSBackend, TTSResult
# Default voice prompts per language (high-quality reference samples).
# Every sample sits under the same bucket prefix, so only the part of the path
# that differs per language is listed; the full URLs are assembled below.
_VOICE_PROMPT_ROOT = (
    "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts"
)
_VOICE_PROMPT_FILES = (
    ("ar", "ar_f/ar_prompts2.flac"),
    ("da", "da_m1.flac"),
    ("de", "de_f1.flac"),
    ("el", "el_m.flac"),
    ("en", "en_f1.flac"),
    ("es", "es_f1.flac"),
    ("fi", "fi_m.flac"),
    ("fr", "fr_f1.flac"),
    ("he", "he_m1.flac"),
    ("hi", "hi_f1.flac"),
    ("it", "it_m1.flac"),
    ("ja", "ja/ja_prompts1.flac"),
    ("ko", "ko_f.flac"),
    ("ms", "ms_f.flac"),
    ("nl", "nl_m.flac"),
    ("no", "no_f1.flac"),
    ("pl", "pl_m.flac"),
    ("pt", "pt_m1.flac"),
    ("ru", "ru_m.flac"),
    ("sv", "sv_f.flac"),
    ("sw", "sw_m.flac"),
    ("tr", "tr_m.flac"),
    ("zh", "zh_f2.flac"),
)
# Language code -> absolute URL of the reference recording for that language.
DEFAULT_VOICE_PROMPTS = {
    code: f"{_VOICE_PROMPT_ROOT}/{relative_path}"
    for code, relative_path in _VOICE_PROMPT_FILES
}
class ChatterboxBackend(TTSBackend):
    """
    Chatterbox Multilingual TTS Backend.

    Features:
    - 23 language support
    - High-quality voice cloning
    - Expressive speech synthesis

    This backend uses the ResembleAI Chatterbox model for synthesis.

    The model is loaded lazily: ``generate`` calls ``load`` on first use and
    ``unload`` drops it again.
    """

    # Optimal defaults for phone announcements (clear, professional)
    DEFAULT_EXAGGERATION = (
        0.35  # Slightly less expressive for professional announcements
    )
    DEFAULT_TEMPERATURE = 0.7  # Balanced randomness
    DEFAULT_CFG_WEIGHT = 0.5  # Standard guidance

    # Language code -> display name. ``generate`` rejects any code not listed here.
    SUPPORTED_LANGUAGES = {
        "ar": "Arabic",
        "da": "Danish",
        "de": "German",
        "el": "Greek",
        "en": "English",
        "es": "Spanish",
        "fi": "Finnish",
        "fr": "French",
        "he": "Hebrew",
        "hi": "Hindi",
        "it": "Italian",
        "ja": "Japanese",
        "ko": "Korean",
        "ms": "Malay",
        "nl": "Dutch",
        "no": "Norwegian",
        "pl": "Polish",
        "pt": "Portuguese",
        "ru": "Russian",
        "sv": "Swedish",
        "sw": "Swahili",
        "tr": "Turkish",
        "zh": "Chinese",
    }

    def __init__(self, config: Optional[BackendConfig] = None):
        """Create the backend without loading the model.

        Args:
            config: Backend configuration, forwarded unchanged to the base
                class constructor.
        """
        # NOTE(review): ``self.config`` and ``self._is_loaded`` are read below
        # but never initialised in this class -- presumably the base class
        # constructor sets them; confirm against ``TTSBackend``.
        super().__init__(config)
        self._model = None  # model instance once load() has succeeded
        self._device = None  # result of config.resolve_device(), set by load()

    @property
    def name(self) -> str:
        """Human-readable backend name."""
        return "Chatterbox Multilingual"

    @property
    def supports_voice_cloning(self) -> bool:
        """Always True: ``generate`` accepts a reference audio prompt."""
        return True

    @property
    def supported_languages(self) -> dict[str, str]:
        """Return a copy of the code -> name table so callers cannot mutate it."""
        return self.SUPPORTED_LANGUAGES.copy()

    def load(self) -> None:
        """Load the Chatterbox model onto the configured device.

        Only logs and returns when the model is already loaded.

        Raises:
            Exception: Whatever the lazy import, device resolution or
                ``from_pretrained`` raises; the error is logged and re-raised
                unchanged.
        """
        if self._is_loaded:
            logger.info("Chatterbox model already loaded")
            return

        logger.info("Loading Chatterbox Multilingual model...")
        try:
            # Imported inside the try so a missing or broken model package is
            # logged like every other load failure instead of escaping silently.
            from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS

            self._device = self.config.resolve_device()
            logger.info(f"Using device: {self._device}")
            self._model = ChatterboxMultilingualTTS.from_pretrained(self._device)
        except Exception as e:
            logger.error(f"Failed to load Chatterbox model: {e}")
            raise

        # Only flag the backend as loaded once the model object really exists.
        self._is_loaded = True
        logger.info("Chatterbox model loaded successfully")

    def unload(self) -> None:
        """Unload the model to free memory.

        Does nothing when no model is held.
        """
        if self._model is None:
            return

        import gc

        del self._model
        self._model = None
        self._is_loaded = False

        # Collect first: tensors that are only kept alive by reference cycles
        # would otherwise still occupy the CUDA cache when it is emptied.
        gc.collect()
        if self._uses_cuda():
            import torch

            torch.cuda.empty_cache()

        logger.info("Chatterbox model unloaded")

    def get_default_voice(self, language: str) -> Optional[str]:
        """Get the default voice prompt URL for a language.

        Args:
            language: Language code; matched case-insensitively.

        Returns:
            The URL from ``DEFAULT_VOICE_PROMPTS``, or None for an unknown code.
        """
        return DEFAULT_VOICE_PROMPTS.get(language.lower())

    def generate(
        self,
        text: str,
        language: str = "de",
        voice_audio_path: Optional[str] = None,
        exaggeration: Optional[float] = None,
        temperature: Optional[float] = None,
        cfg_weight: Optional[float] = None,
        seed: Optional[int] = None,
        **kwargs,
    ) -> TTSResult:
        """
        Generate speech from text using Chatterbox.

        The model is loaded on demand if ``load`` has not been called yet.

        Args:
            text: Text to synthesize
            language: Language code (default: "de" for German); matched
                case-insensitively against ``SUPPORTED_LANGUAGES``
            voice_audio_path: Path/URL to reference audio for voice cloning;
                when None or empty, the language's default prompt is used
            exaggeration: Speech expressiveness (0.25-2.0, default: 0.35)
            temperature: Generation randomness (0.05-5.0, default: 0.7)
            cfg_weight: CFG guidance weight (0.2-1.0, default: 0.5)
            seed: Random seed for reproducibility; None or 0 leaves the
                random generators untouched (i.e. random output)
            **kwargs: Accepted and ignored by this backend

        The numeric ranges above are guidance only; they are not validated
        here and the values are passed to the model as given.

        Returns:
            TTSResult with audio waveform and sample rate

        Raises:
            ValueError: If ``language`` is not a supported language code.
            Exception: Any error from loading or running the model is logged
                and re-raised unchanged.
        """
        # Validate first: an unsupported language must not cost a model load
        # or reseed the process-wide random generators.
        lang_code = language.lower()
        if lang_code not in self.SUPPORTED_LANGUAGES:
            available = ", ".join(sorted(self.SUPPORTED_LANGUAGES.keys()))
            raise ValueError(
                f"Unsupported language '{language}'. Available: {available}"
            )

        if not self._is_loaded:
            self.load()

        # 0 is treated like None, i.e. "do not seed".
        if seed is not None and seed != 0:
            self._apply_seed(seed)

        # Use defaults for unspecified parameters. ``is not None`` (rather than
        # ``or``) keeps an explicit 0.0 from being replaced by the default.
        exaggeration = (
            exaggeration if exaggeration is not None else self.DEFAULT_EXAGGERATION
        )
        temperature = (
            temperature if temperature is not None else self.DEFAULT_TEMPERATURE
        )
        cfg_weight = cfg_weight if cfg_weight is not None else self.DEFAULT_CFG_WEIGHT

        # Resolve voice prompt: caller-supplied reference wins over the default.
        audio_prompt = voice_audio_path or self.get_default_voice(lang_code)

        logger.info(f"Generating speech: lang={lang_code}, text='{text[:50]}...'")
        try:
            wav = self._model.generate(
                text=text,
                language_id=lang_code,
                audio_prompt_path=audio_prompt,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            # Convert to numpy array. detach()/cpu() are no-ops for a plain CPU
            # tensor but are required if the model ever hands back a tensor
            # that lives on an accelerator or is attached to the autograd graph.
            audio_np = wav.detach().cpu().squeeze().numpy()
            return TTSResult(audio=audio_np, sample_rate=self._model.sr)
        except Exception as e:
            logger.error(f"TTS generation failed: {e}")
            raise

    def _uses_cuda(self) -> bool:
        """Return True when the resolved device names a CUDA device.

        Works on the string form so that "cuda", "cuda:0" and device objects
        whose ``str()`` starts with "cuda" are all recognised; an unset device
        (None) yields False.
        """
        return str(self._device).startswith("cuda")

    def _apply_seed(self, seed: int) -> None:
        """Seed the Python, NumPy and torch random generators with ``seed``."""
        import random

        import torch

        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        if self._uses_cuda():
            torch.cuda.manual_seed_all(seed)