"""
Chatterbox Multilingual TTS Backend with Voice Cloning support.

This is the default backend for the Phone Announcements engine.
"""

from typing import Optional

import numpy as np
from loguru import logger

from .base import BackendConfig, TTSBackend, TTSResult

# Default voice prompts per language (high-quality reference samples).
# Keys are lower-case language codes and must stay in sync with
# ChatterboxBackend.SUPPORTED_LANGUAGES.
DEFAULT_VOICE_PROMPTS = {
    "ar": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
    "da": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/da_m1.flac",
    "de": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
    "el": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/el_m.flac",
    "en": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
    "es": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
    "fi": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fi_m.flac",
    "fr": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
    "he": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/he_m1.flac",
    "hi": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
    "it": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/it_m1.flac",
    "ja": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
    "ko": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ko_f.flac",
    "ms": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ms_f.flac",
    "nl": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/nl_m.flac",
    "no": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/no_f1.flac",
    "pl": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pl_m.flac",
    "pt": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pt_m1.flac",
    "ru": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ru_m.flac",
    "sv": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sv_f.flac",
    "sw": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sw_m.flac",
    "tr": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/tr_m.flac",
    "zh": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
}


class ChatterboxBackend(TTSBackend):
    """
    Chatterbox Multilingual TTS Backend.

    Features:
    - 23 language support
    - High-quality voice cloning
    - Expressive speech synthesis

    This backend uses the ResembleAI Chatterbox model for synthesis.
    The model is loaded lazily: either explicitly via ``load()`` or on the
    first call to ``generate()``.
    """

    # Optimal defaults for phone announcements (clear, professional)
    DEFAULT_EXAGGERATION = (
        0.35  # Slightly less expressive for professional announcements
    )
    DEFAULT_TEMPERATURE = 0.7  # Balanced randomness
    DEFAULT_CFG_WEIGHT = 0.5  # Standard guidance

    # Language code -> human-readable language name.
    SUPPORTED_LANGUAGES = {
        "ar": "Arabic",
        "da": "Danish",
        "de": "German",
        "el": "Greek",
        "en": "English",
        "es": "Spanish",
        "fi": "Finnish",
        "fr": "French",
        "he": "Hebrew",
        "hi": "Hindi",
        "it": "Italian",
        "ja": "Japanese",
        "ko": "Korean",
        "ms": "Malay",
        "nl": "Dutch",
        "no": "Norwegian",
        "pl": "Polish",
        "pt": "Portuguese",
        "ru": "Russian",
        "sv": "Swedish",
        "sw": "Swahili",
        "tr": "Turkish",
        "zh": "Chinese",
    }

    def __init__(self, config: Optional[BackendConfig] = None):
        """Initialise the backend without loading the model.

        Args:
            config: Optional backend configuration, forwarded to the base class.
        """
        super().__init__(config)
        self._model = None
        self._device = None

    @property
    def name(self) -> str:
        """Human-readable backend name."""
        return "Chatterbox Multilingual"

    @property
    def supports_voice_cloning(self) -> bool:
        """This backend always supports voice cloning."""
        return True

    @property
    def supported_languages(self) -> dict[str, str]:
        """Return a copy of the code -> name mapping of supported languages."""
        # A copy is returned so callers cannot mutate the class-level table.
        return self.SUPPORTED_LANGUAGES.copy()

    def load(self) -> None:
        """Load the Chatterbox model.

        Does nothing (apart from logging) when the model is already loaded.

        Raises:
            Exception: Any error raised while loading the pretrained model is
                logged and re-raised unchanged.
        """
        if self._is_loaded:
            logger.info("Chatterbox model already loaded")
            return

        logger.info("Loading Chatterbox Multilingual model...")

        # Imported lazily so that importing this module stays cheap.
        from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS

        self._device = self.config.resolve_device()
        logger.info(f"Using device: {self._device}")

        try:
            self._model = ChatterboxMultilingualTTS.from_pretrained(self._device)
            self._is_loaded = True
            logger.info("Chatterbox model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load Chatterbox model: {e}")
            raise

    def unload(self) -> None:
        """Unload the model to free memory.

        Drops the model reference and, when the model lived on a CUDA device,
        asks torch to release its cached GPU memory.
        """
        if self._model is not None:
            import torch

            del self._model
            self._model = None

            # Prefix match so that "cuda", "cuda:0" and torch.device("cuda")
            # all trigger the cache release.
            if str(self._device).startswith("cuda"):
                torch.cuda.empty_cache()

        self._is_loaded = False
        logger.info("Chatterbox model unloaded")

    def get_default_voice(self, language: str) -> Optional[str]:
        """Get the default voice prompt URL for a language.

        Args:
            language: Language code; matched case-insensitively.

        Returns:
            The prompt URL, or ``None`` when no default exists for the language.
        """
        return DEFAULT_VOICE_PROMPTS.get(language.lower())

    def generate(
        self,
        text: str,
        language: str = "de",
        voice_audio_path: Optional[str] = None,
        exaggeration: Optional[float] = None,
        temperature: Optional[float] = None,
        cfg_weight: Optional[float] = None,
        seed: Optional[int] = None,
        **kwargs,
    ) -> TTSResult:
        """
        Generate speech from text using Chatterbox.

        Args:
            text: Text to synthesize
            language: Language code (default: "de" for German)
            voice_audio_path: Path/URL to reference audio for voice cloning;
                falls back to the language's default prompt when empty
            exaggeration: Speech expressiveness (0.25-2.0, default: 0.35)
            temperature: Generation randomness (0.05-5.0, default: 0.7)
            cfg_weight: CFG guidance weight (0.2-1.0, default: 0.5)
            seed: Random seed for reproducibility (default: None = random;
                a seed of 0 is also treated as "no seed")
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            TTSResult with audio waveform and sample rate

        Raises:
            ValueError: If ``language`` is not a supported language code.
            Exception: Any error raised by the model during generation is
                logged and re-raised unchanged.
        """
        # Validate the language first so an invalid request fails fast,
        # without paying for a model load or touching global RNG state.
        lang_code = language.lower()
        if lang_code not in self.SUPPORTED_LANGUAGES:
            available = ", ".join(sorted(self.SUPPORTED_LANGUAGES.keys()))
            raise ValueError(
                f"Unsupported language '{language}'. Available: {available}"
            )

        if not self._is_loaded:
            self.load()

        import random

        import torch

        # Apply seed if provided
        if seed is not None and seed != 0:
            torch.manual_seed(seed)
            random.seed(seed)
            # NumPy's legacy seeding only accepts values in [0, 2**32 - 1];
            # fold larger or negative seeds into range instead of raising.
            np.random.seed(seed % (2**32))
            if self._device == "cuda":
                torch.cuda.manual_seed_all(seed)

        # Use defaults for unspecified parameters
        if exaggeration is None:
            exaggeration = self.DEFAULT_EXAGGERATION
        if temperature is None:
            temperature = self.DEFAULT_TEMPERATURE
        if cfg_weight is None:
            cfg_weight = self.DEFAULT_CFG_WEIGHT

        # Resolve voice prompt: an explicit reference wins, otherwise use the
        # per-language default (which may be None).
        audio_prompt = voice_audio_path or self.get_default_voice(lang_code)

        logger.info(f"Generating speech: lang={lang_code}, text='{text[:50]}...'")

        try:
            wav = self._model.generate(
                text=text,
                language_id=lang_code,
                audio_prompt_path=audio_prompt,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )

            # Convert to numpy array. detach()/cpu() are no-ops for a plain
            # CPU tensor but keep the conversion valid if the model returns a
            # tensor that lives on the GPU or tracks gradients.
            audio_np = wav.squeeze().detach().cpu().numpy()

            return TTSResult(audio=audio_np, sample_rate=self._model.sr)

        except Exception as e:
            logger.error(f"TTS generation failed: {e}")
            raise