Chatterbox-Multilingual-TTS

Sleeping

App Files Files Community

Chatterbox-Multilingual-TTS / engine /backends /chatterbox_backend.py

flozi00

english

d0b9ec6 3 months ago

raw

history blame contribute delete

8.12 kB

	"""
	Chatterbox Multilingual TTS Backend with Voice Cloning support.
	This is the default backend for the Phone Announcements engine.
	"""

	from typing import Optional

	import numpy as np
	from loguru import logger

	from .base import BackendConfig, TTSBackend, TTSResult

	# Default voice prompts per language (high-quality reference samples).
	# Every sample lives under the same bucket prefix, so only the relative
	# path is listed per language code and the full URL is assembled below.
	_VOICE_PROMPT_BASE_URL = (
	    "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/"
	)
	_VOICE_PROMPT_FILES = (
	    ("ar", "ar_f/ar_prompts2.flac"),
	    ("da", "da_m1.flac"),
	    ("de", "de_f1.flac"),
	    ("el", "el_m.flac"),
	    ("en", "en_f1.flac"),
	    ("es", "es_f1.flac"),
	    ("fi", "fi_m.flac"),
	    ("fr", "fr_f1.flac"),
	    ("he", "he_m1.flac"),
	    ("hi", "hi_f1.flac"),
	    ("it", "it_m1.flac"),
	    ("ja", "ja/ja_prompts1.flac"),
	    ("ko", "ko_f.flac"),
	    ("ms", "ms_f.flac"),
	    ("nl", "nl_m.flac"),
	    ("no", "no_f1.flac"),
	    ("pl", "pl_m.flac"),
	    ("pt", "pt_m1.flac"),
	    ("ru", "ru_m.flac"),
	    ("sv", "sv_f.flac"),
	    ("sw", "sw_m.flac"),
	    ("tr", "tr_m.flac"),
	    ("zh", "zh_f2.flac"),
	)
	# Maps lowercase language code -> absolute URL of its reference sample.
	DEFAULT_VOICE_PROMPTS = {
	    code: _VOICE_PROMPT_BASE_URL + rel_path
	    for code, rel_path in _VOICE_PROMPT_FILES
	}


	class ChatterboxBackend(TTSBackend):
	    """
	    Chatterbox Multilingual TTS Backend.

	    Features:
	    - 23 language support
	    - High-quality voice cloning
	    - Expressive speech synthesis

	    This backend uses the ResembleAI Chatterbox model for synthesis. The
	    model class is imported and instantiated lazily, either by an explicit
	    ``load()`` call or by the first ``generate()`` call.
	    """

	    # Optimal defaults for phone announcements (clear, professional)
	    DEFAULT_EXAGGERATION = (
	        0.35  # Slightly less expressive for professional announcements
	    )
	    DEFAULT_TEMPERATURE = 0.7  # Balanced randomness
	    DEFAULT_CFG_WEIGHT = 0.5  # Standard guidance

	    # Language code -> human-readable language name.
	    SUPPORTED_LANGUAGES = {
	        "ar": "Arabic",
	        "da": "Danish",
	        "de": "German",
	        "el": "Greek",
	        "en": "English",
	        "es": "Spanish",
	        "fi": "Finnish",
	        "fr": "French",
	        "he": "Hebrew",
	        "hi": "Hindi",
	        "it": "Italian",
	        "ja": "Japanese",
	        "ko": "Korean",
	        "ms": "Malay",
	        "nl": "Dutch",
	        "no": "Norwegian",
	        "pl": "Polish",
	        "pt": "Portuguese",
	        "ru": "Russian",
	        "sv": "Swedish",
	        "sw": "Swahili",
	        "tr": "Turkish",
	        "zh": "Chinese",
	    }

	    def __init__(self, config: Optional[BackendConfig] = None):
	        """
	        Create the backend without loading the model.

	        Args:
	            config: Backend configuration, passed unchanged to the base class.
	        """
	        super().__init__(config)
	        self._model = None  # Set by load(), cleared by unload()
	        self._device = None  # Result of config.resolve_device(), set in load()

	    @property
	    def name(self) -> str:
	        """Human-readable backend name."""
	        return "Chatterbox Multilingual"

	    @property
	    def supports_voice_cloning(self) -> bool:
	        """This backend always reports voice cloning support."""
	        return True

	    @property
	    def supported_languages(self) -> dict[str, str]:
	        """Return a copy of the code -> name map so callers cannot mutate it."""
	        return self.SUPPORTED_LANGUAGES.copy()

	    def _uses_cuda(self) -> bool:
	        """
	        Tell whether the resolved device refers to CUDA.

	        The string form is compared so that indexed devices such as
	        "cuda:0" and non-string device objects match as well as plain
	        "cuda". Returns False while no device has been resolved.
	        """
	        return str(self._device).startswith("cuda")

	    def load(self) -> None:
	        """
	        Load the Chatterbox model.

	        Does nothing (apart from logging) when the model is already loaded.

	        Raises:
	            Exception: Whatever ``from_pretrained`` raises; it is logged and
	                re-raised unchanged.
	        """
	        # NOTE(review): _is_loaded and config are presumably initialised by
	        # the TTSBackend base class - confirm against .base.
	        if self._is_loaded:
	            logger.info("Chatterbox model already loaded")
	            return

	        logger.info("Loading Chatterbox Multilingual model...")

	        # Imported here so the model package is only needed once a load
	        # is actually requested.
	        from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS

	        self._device = self.config.resolve_device()
	        logger.info(f"Using device: {self._device}")

	        try:
	            self._model = ChatterboxMultilingualTTS.from_pretrained(self._device)
	            self._is_loaded = True
	            logger.info("Chatterbox model loaded successfully")
	        except Exception as e:
	            logger.error(f"Failed to load Chatterbox model: {e}")
	            raise

	    def unload(self) -> None:
	        """Unload the model to free memory. No-op when nothing is loaded."""
	        if self._model is None:
	            return

	        import torch

	        # Drop the reference held by this backend so the model can be freed.
	        self._model = None
	        if self._uses_cuda():
	            # Release cached GPU blocks that the freed model occupied.
	            torch.cuda.empty_cache()
	        self._is_loaded = False
	        logger.info("Chatterbox model unloaded")

	    def get_default_voice(self, language: str) -> Optional[str]:
	        """
	        Get the default voice prompt URL for a language.

	        Args:
	            language: Language code; matched case-insensitively.

	        Returns:
	            The prompt URL, or None when no default exists for the code.
	        """
	        return DEFAULT_VOICE_PROMPTS.get(language.lower())

	    def generate(
	        self,
	        text: str,
	        language: str = "de",
	        voice_audio_path: Optional[str] = None,
	        exaggeration: Optional[float] = None,
	        temperature: Optional[float] = None,
	        cfg_weight: Optional[float] = None,
	        seed: Optional[int] = None,
	        **kwargs,
	    ) -> TTSResult:
	        """
	        Generate speech from text using Chatterbox.

	        The language is validated before anything else, so an unsupported
	        code neither triggers a model load nor reseeds the global RNGs.

	        Args:
	            text: Text to synthesize
	            language: Language code (default: "de" for German),
	                matched case-insensitively
	            voice_audio_path: Path/URL to reference audio for voice cloning;
	                when empty or None the language's default prompt is used
	            exaggeration: Speech expressiveness (0.25-2.0, default: 0.35)
	            temperature: Generation randomness (0.05-5.0, default: 0.7)
	            cfg_weight: CFG guidance weight (0.2-1.0, default: 0.5)
	            seed: Random seed for reproducibility. None and 0 both mean
	                "do not seed" (random output)
	            **kwargs: Accepted for interface compatibility and ignored

	        Returns:
	            TTSResult with audio waveform and sample rate

	        Raises:
	            ValueError: If ``language`` is not in SUPPORTED_LANGUAGES.
	            Exception: Any error raised by model loading or generation is
	                logged and re-raised unchanged.

	        Note:
	            The documented value ranges are not enforced by this method.
	        """
	        # Validate language first: cheap, and avoids side effects on bad input
	        lang_code = language.lower()
	        if lang_code not in self.SUPPORTED_LANGUAGES:
	            available = ", ".join(sorted(self.SUPPORTED_LANGUAGES.keys()))
	            raise ValueError(
	                f"Unsupported language '{language}'. Available: {available}"
	            )

	        if not self._is_loaded:
	            self.load()

	        import random

	        import torch

	        # Apply seed if provided (0 is deliberately treated as "unseeded")
	        if seed is not None and seed != 0:
	            torch.manual_seed(seed)
	            random.seed(seed)
	            np.random.seed(seed)
	            if self._uses_cuda():
	                torch.cuda.manual_seed_all(seed)

	        # Use defaults for unspecified parameters
	        if exaggeration is None:
	            exaggeration = self.DEFAULT_EXAGGERATION
	        if temperature is None:
	            temperature = self.DEFAULT_TEMPERATURE
	        if cfg_weight is None:
	            cfg_weight = self.DEFAULT_CFG_WEIGHT

	        # Resolve voice prompt: explicit reference wins over the default
	        audio_prompt = voice_audio_path or DEFAULT_VOICE_PROMPTS.get(lang_code)

	        logger.info(f"Generating speech: lang={lang_code}, text='{text[:50]}...'")

	        try:
	            wav = self._model.generate(
	                text=text,
	                language_id=lang_code,
	                audio_prompt_path=audio_prompt,
	                exaggeration=exaggeration,
	                temperature=temperature,
	                cfg_weight=cfg_weight,
	            )

	            # Convert to numpy array. detach()/cpu() make the conversion
	            # valid even if the tensor tracks gradients or lives on a GPU;
	            # both are no-ops for a plain CPU tensor.
	            audio_np = wav.detach().cpu().squeeze().numpy()

	            return TTSResult(audio=audio_np, sample_rate=self._model.sr)

	        except Exception as e:
	            logger.error(f"TTS generation failed: {e}")
	            raise