Spaces:
Sleeping
Sleeping
| """ | |
| Chatterbox Multilingual TTS Backend with Voice Cloning support. | |
| This is the default backend for the Phone Announcements engine. | |
| """ | |
| from typing import Optional | |
| import numpy as np | |
| from loguru import logger | |
| from .base import BackendConfig, TTSBackend, TTSResult | |
# Default voice prompts per language (high-quality reference samples).
# All samples share one bucket prefix, so only the path relative to that
# prefix is listed per language; the full URLs are assembled below.
_VOICE_PROMPT_BASE_URL = (
    "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/"
)

# (language code, sample path relative to _VOICE_PROMPT_BASE_URL)
_VOICE_PROMPT_FILES = (
    ("ar", "ar_f/ar_prompts2.flac"),
    ("da", "da_m1.flac"),
    ("de", "de_f1.flac"),
    ("el", "el_m.flac"),
    ("en", "en_f1.flac"),
    ("es", "es_f1.flac"),
    ("fi", "fi_m.flac"),
    ("fr", "fr_f1.flac"),
    ("he", "he_m1.flac"),
    ("hi", "hi_f1.flac"),
    ("it", "it_m1.flac"),
    ("ja", "ja/ja_prompts1.flac"),
    ("ko", "ko_f.flac"),
    ("ms", "ms_f.flac"),
    ("nl", "nl_m.flac"),
    ("no", "no_f1.flac"),
    ("pl", "pl_m.flac"),
    ("pt", "pt_m1.flac"),
    ("ru", "ru_m.flac"),
    ("sv", "sv_f.flac"),
    ("sw", "sw_m.flac"),
    ("tr", "tr_m.flac"),
    ("zh", "zh_f2.flac"),
)

# Maps a lowercase language code to the URL of its reference sample.
DEFAULT_VOICE_PROMPTS = {
    code: _VOICE_PROMPT_BASE_URL + relative_path
    for code, relative_path in _VOICE_PROMPT_FILES
}
class ChatterboxBackend(TTSBackend):
    """
    Chatterbox Multilingual TTS Backend.

    Features:
    - 23 language support
    - High-quality voice cloning
    - Expressive speech synthesis

    This backend uses the ResembleAI Chatterbox model for synthesis.
    The model is loaded lazily: either explicitly through ``load()`` or on
    the first call to ``generate()``.
    """

    # Optimal defaults for phone announcements (clear, professional)
    # Slightly less expressive for professional announcements
    DEFAULT_EXAGGERATION = 0.35
    DEFAULT_TEMPERATURE = 0.7  # Balanced randomness
    DEFAULT_CFG_WEIGHT = 0.5  # Standard guidance

    # Language code -> human-readable language name.
    SUPPORTED_LANGUAGES = {
        "ar": "Arabic",
        "da": "Danish",
        "de": "German",
        "el": "Greek",
        "en": "English",
        "es": "Spanish",
        "fi": "Finnish",
        "fr": "French",
        "he": "Hebrew",
        "hi": "Hindi",
        "it": "Italian",
        "ja": "Japanese",
        "ko": "Korean",
        "ms": "Malay",
        "nl": "Dutch",
        "no": "Norwegian",
        "pl": "Polish",
        "pt": "Portuguese",
        "ru": "Russian",
        "sv": "Swedish",
        "sw": "Swahili",
        "tr": "Turkish",
        "zh": "Chinese",
    }

    def __init__(self, config: Optional[BackendConfig] = None):
        """Create the backend without loading the model.

        Args:
            config: Backend configuration, forwarded to the base class.
        """
        # NOTE(review): ``self.config`` and ``self._is_loaded`` are used below
        # but never assigned in this class; presumably the base initialiser
        # sets them. Confirm against TTSBackend.
        super().__init__(config)
        self._model = None
        self._device = None

    def name(self) -> str:
        """Return the display name of this backend."""
        return "Chatterbox Multilingual"

    def supports_voice_cloning(self) -> bool:
        """Return True: this backend accepts a reference audio prompt."""
        return True

    def supported_languages(self) -> dict[str, str]:
        """Return a copy of the language-code -> language-name mapping."""
        # A copy is returned so callers cannot mutate the class-level table.
        return self.SUPPORTED_LANGUAGES.copy()

    def _uses_cuda(self) -> bool:
        """Return True when the resolved device is a CUDA device."""
        # A prefix match also covers indexed devices such as "cuda:0" and
        # torch.device objects, which an exact comparison with "cuda" misses.
        return str(self._device).startswith("cuda")

    def load(self) -> None:
        """Load the Chatterbox model.

        Does nothing if the model is already loaded.

        Raises:
            Exception: Whatever ``from_pretrained`` raises; the error is
                logged and re-raised unchanged.
        """
        if self._is_loaded:
            logger.info("Chatterbox model already loaded")
            return

        logger.info("Loading Chatterbox Multilingual model...")
        # Imported here rather than at module level so the model package is
        # only imported once a model is actually requested.
        from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS

        self._device = self.config.resolve_device()
        logger.info(f"Using device: {self._device}")

        try:
            self._model = ChatterboxMultilingualTTS.from_pretrained(self._device)
        except Exception as e:
            logger.error(f"Failed to load Chatterbox model: {e}")
            raise

        self._is_loaded = True
        logger.info("Chatterbox model loaded successfully")

    def unload(self) -> None:
        """Unload the model to free memory."""
        if self._model is None:
            # Nothing to release; keep the flag consistent with the state.
            self._is_loaded = False
            return

        import torch

        # Dropping the only reference held here lets the model be collected.
        self._model = None
        if self._uses_cuda():
            torch.cuda.empty_cache()

        self._is_loaded = False
        logger.info("Chatterbox model unloaded")

    def get_default_voice(self, language: str) -> Optional[str]:
        """Get the default voice prompt URL for a language.

        Args:
            language: Language code; matched case-insensitively.

        Returns:
            The prompt URL, or None when no default exists for the language.
        """
        return DEFAULT_VOICE_PROMPTS.get(language.lower())

    def generate(
        self,
        text: str,
        language: str = "de",
        voice_audio_path: Optional[str] = None,
        exaggeration: Optional[float] = None,
        temperature: Optional[float] = None,
        cfg_weight: Optional[float] = None,
        seed: Optional[int] = None,
        **kwargs,
    ) -> TTSResult:
        """
        Generate speech from text using Chatterbox.

        The model is loaded on demand if it has not been loaded yet.

        Args:
            text: Text to synthesize
            language: Language code (default: "de" for German); matched
                case-insensitively
            voice_audio_path: Path/URL to reference audio for voice cloning;
                when empty or None, the default prompt for the language is used
            exaggeration: Speech expressiveness (0.25-2.0, default: 0.35)
            temperature: Generation randomness (0.05-5.0, default: 0.7)
            cfg_weight: CFG guidance weight (0.2-1.0, default: 0.5)
            seed: Random seed for reproducibility. None and 0 both mean
                "do not seed" (random output).
            **kwargs: Accepted for interface compatibility; ignored here.

        The numeric ranges above are documentation only; this method does not
        validate them.

        Returns:
            TTSResult with audio waveform and sample rate

        Raises:
            ValueError: If ``language`` is not in SUPPORTED_LANGUAGES.
            Exception: Whatever the model raises during generation; the error
                is logged and re-raised unchanged.
        """
        # Validate first so an invalid request fails before the model is
        # loaded or the global RNG state is touched.
        lang_code = language.lower()
        if lang_code not in self.SUPPORTED_LANGUAGES:
            available = ", ".join(sorted(self.SUPPORTED_LANGUAGES))
            raise ValueError(
                f"Unsupported language '{language}'. Available: {available}"
            )

        if not self._is_loaded:
            self.load()

        import random

        import torch

        # Apply seed if provided (0 is treated like None: no seeding).
        if seed is not None and seed != 0:
            torch.manual_seed(seed)
            random.seed(seed)
            np.random.seed(seed)
            if self._uses_cuda():
                torch.cuda.manual_seed_all(seed)

        # Use defaults for unspecified parameters
        if exaggeration is None:
            exaggeration = self.DEFAULT_EXAGGERATION
        if temperature is None:
            temperature = self.DEFAULT_TEMPERATURE
        if cfg_weight is None:
            cfg_weight = self.DEFAULT_CFG_WEIGHT

        # Resolve voice prompt
        audio_prompt = voice_audio_path or self.get_default_voice(lang_code)

        logger.info(f"Generating speech: lang={lang_code}, text='{text[:50]}...'")

        try:
            wav = self._model.generate(
                text=text,
                language_id=lang_code,
                audio_prompt_path=audio_prompt,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            # Convert to numpy array. detach() and cpu() make the conversion
            # safe for tensors that require grad or live on an accelerator;
            # both are no-ops for a plain CPU tensor.
            audio_np = wav.squeeze().detach().cpu().numpy()
            return TTSResult(audio=audio_np, sample_rate=self._model.sr)
        except Exception as e:
            logger.error(f"TTS generation failed: {e}")
            raise