# Upstream source: flozi00 — "chatterbox only", commit 0849031 (Hugging Face Hub page residue, kept as a comment so the file parses)
"""Phone Speaker TTS - Gradio Application.
UI requirements:
- Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi")
- Provide a dropdown to choose a voice
- Include a "Voice cloning" option; when selected, show reference-audio upload
and use Chatterbox (voice cloning capable) backend.
"""
import os
import random
from pathlib import Path
import gradio as gr
import numpy as np
import torch
try:
    import spaces

    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

    # Fallback when the Hugging Face Spaces helper package is unavailable:
    # provide a no-op stand-in so `@spaces.GPU` decorations still work.
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            """No-op replacement for `spaces.GPU`.

            Supports both bare usage (`@spaces.GPU`) and parameterized
            usage (`@spaces.GPU(duration=60)`), mirroring the real API.
            The original dummy only handled the bare form.
            """
            if callable(func):
                return func
            # Called with configuration kwargs: return an identity decorator.
            return lambda f: f
from loguru import logger
from engine import TTSEngine
from engine.audio_processor import AudioProcessor
from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
# --- Configuration ---
# Pick the best available accelerator: CUDA GPU first, then Apple Silicon
# (MPS), with CPU as the final fallback.
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
# NOTE(review): the emoji below is mojibake (UTF-8 decoded in a legacy
# codepage) — confirm the file is handled as UTF-8.
logger.info(f"๐Ÿš€ Running on device: {DEVICE}")
# Language display configuration
# Maps ISO 639-1 language codes to dropdown display labels (flag emoji +
# English language name). NOTE(review): the flag emoji appear mojibake-
# encoded in this file — confirm UTF-8 handling; values kept byte-identical.
LANGUAGE_DISPLAY = {
    "de": "๐Ÿ‡ฉ๐Ÿ‡ช German",
    "en": "๐Ÿ‡ฌ๐Ÿ‡ง English",
    "fr": "๐Ÿ‡ซ๐Ÿ‡ท French",
    "es": "๐Ÿ‡ช๐Ÿ‡ธ Spanish",
    "it": "๐Ÿ‡ฎ๐Ÿ‡น Italian",
    "nl": "๐Ÿ‡ณ๐Ÿ‡ฑ Dutch",
    "pl": "๐Ÿ‡ต๐Ÿ‡ฑ Polish",
    "pt": "๐Ÿ‡ต๐Ÿ‡น Portuguese",
    "ru": "๐Ÿ‡ท๐Ÿ‡บ Russian",
    "tr": "๐Ÿ‡น๐Ÿ‡ท Turkish",
    "ar": "๐Ÿ‡ธ๐Ÿ‡ฆ Arabic",
    "zh": "๐Ÿ‡จ๐Ÿ‡ณ Chinese",
    "ja": "๐Ÿ‡ฏ๐Ÿ‡ต Japanese",
    "ko": "๐Ÿ‡ฐ๐Ÿ‡ท Korean",
    "hi": "๐Ÿ‡ฎ๐Ÿ‡ณ Hindi",
    "da": "๐Ÿ‡ฉ๐Ÿ‡ฐ Danish",
    "el": "๐Ÿ‡ฌ๐Ÿ‡ท Greek",
    "fi": "๐Ÿ‡ซ๐Ÿ‡ฎ Finnish",
    "he": "๐Ÿ‡ฎ๐Ÿ‡ฑ Hebrew",
    "ms": "๐Ÿ‡ฒ๐Ÿ‡พ Malay",
    "no": "๐Ÿ‡ณ๐Ÿ‡ด Norwegian",
    "sv": "๐Ÿ‡ธ๐Ÿ‡ช Swedish",
    "sw": "๐Ÿ‡ฐ๐Ÿ‡ช Swahili",
}
# Example texts per language
# Default announcement text shown in the textbox for each language code;
# used via get_example_text() with English as the fallback.
# NOTE(review): non-Latin entries appear mojibake-encoded — confirm UTF-8
# handling; values kept byte-identical.
EXAMPLE_TEXTS = {
    "de": "Herzlich willkommen. Sie sind mit unserem Kundenservice verbunden. Bitte haben Sie einen Moment Geduld, wir sind gleich fรผr Sie da.",
    "en": "Welcome to our customer service. Please hold the line, one of our representatives will be with you shortly.",
    "fr": "Bienvenue sur notre service client. Veuillez patienter, un conseiller va prendre votre appel.",
    "es": "Bienvenido a nuestro servicio de atenciรณn al cliente. Por favor, espere un momento.",
    "it": "Benvenuto nel nostro servizio clienti. La preghiamo di attendere in linea.",
    "nl": "Welkom bij onze klantenservice. Een moment geduld alstublieft.",
    "pl": "Witamy w naszej obsล‚udze klienta. Proszฤ™ czekaฤ‡ na poล‚ฤ…czenie.",
    "pt": "Bem-vindo ao nosso serviรงo de apoio ao cliente. Por favor, aguarde um momento.",
    "ru": "ะ”ะพะฑั€ะพ ะฟะพะถะฐะปะพะฒะฐั‚ัŒ ะฒ ัะปัƒะถะฑัƒ ะฟะพะดะดะตั€ะถะบะธ. ะŸะพะถะฐะปัƒะนัั‚ะฐ, ะพัั‚ะฐะฒะฐะนั‚ะตััŒ ะฝะฐ ะปะธะฝะธะธ.",
    "tr": "MรผลŸteri hizmetlerimize hoลŸ geldiniz. Lรผtfen hatta kalฤฑn.",
    "ar": "ู…ุฑุญุจุงู‹ ุจูƒู… ููŠ ุฎุฏู…ุฉ ุงู„ุนู…ู„ุงุก. ูŠุฑุฌู‰ ุงู„ุงู†ุชุธุงุฑ ุนู„ู‰ ุงู„ุฎุท.",
    "zh": "ๆฌข่ฟŽ่‡ด็”ตๅฎขๆˆทๆœๅŠกไธญๅฟƒใ€‚่ฏท็จๅ€™๏ผŒๆˆ‘ไปฌ็š„ๅฎขๆœไปฃ่กจๅฐ†ๅพˆๅฟซไธบๆ‚จๆœๅŠกใ€‚",
    "ja": "ใŠ้›ป่ฉฑใ‚ใ‚ŠใŒใจใ†ใ”ใ–ใ„ใพใ™ใ€‚ๆ‹…ๅฝ“่€…ใซใŠใคใชใŽใ—ใพใ™ใฎใงใ€ๅฐ‘ใ€…ใŠๅพ…ใกใใ ใ•ใ„ใ€‚",
    "ko": "๊ณ ๊ฐ ์„œ๋น„์Šค์— ์˜ค์‹  ๊ฒƒ์„ ํ™˜์˜ํ•ฉ๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค ์ฃผ์„ธ์š”.",
    "hi": "เคนเคฎเคพเคฐเฅ€ เค—เฅเคฐเคพเคนเค• เคธเฅ‡เคตเคพ เคฎเฅ‡เค‚ เค†เคชเค•เคพ เคธเฅเคตเคพเค—เคค เคนเฅˆเฅค เค•เฅƒเคชเคฏเคพ เคชเฅเคฐเคคเฅ€เค•เฅเคทเคพ เค•เคฐเฅ‡เค‚เฅค",
    "da": "Velkommen til vores kundeservice. Vent venligst.",
    "el": "ฮšฮฑฮปฯŽฯ‚ ฮฎฯฮธฮฑฯ„ฮต ฯƒฯ„ฮทฮฝ ฮตฮพฯ…ฯ€ฮทฯฮญฯ„ฮทฯƒฮท ฯ€ฮตฮปฮฑฯ„ฯŽฮฝ. ฮ ฮฑฯฮฑฮบฮฑฮปฯŽ ฯ€ฮตฯฮนฮผฮญฮฝฮตฯ„ฮต.",
    "fi": "Tervetuloa asiakaspalveluumme. Odottakaa hetki.",
    "he": "ื‘ืจื•ื›ื™ื ื”ื‘ืื™ื ืœืฉื™ืจื•ืช ื”ืœืงื•ื—ื•ืช ืฉืœื ื•. ืื ื ื”ืžืชื™ื ื• ืขืœ ื”ืงื•.",
    "ms": "Selamat datang ke perkhidmatan pelanggan kami. Sila tunggu sebentar.",
    "no": "Velkommen til vรฅr kundeservice. Vennligst vent.",
    "sv": "Vรคlkommen till vรฅr kundtjรคnst. Vรคnligen vรคnta.",
    "sw": "Karibu kwa huduma yetu ya wateja. Tafadhali subiri.",
}
# --- Global Engine ---
# Lazily created singleton TTS engine; see get_engine().
ENGINE = None
# Sentinel dropdown entry that switches the UI into voice-cloning mode.
VOICE_CLONING_OPTION = "Voice cloning"
def _get_voices_dir() -> Path:
env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR")
if env_dir and str(env_dir).strip():
return Path(env_dir).expanduser()
return Path(__file__).parent / "voices"
def _list_default_voices() -> dict[str, Path]:
    """Map voice names to their .wav prompt files.

    A file `voices/flozi.wav` yields the voice "flozi" (filename stem,
    stripped). Returns an empty dict when the folder is absent.
    """
    folder = _get_voices_dir()
    if not (folder.exists() and folder.is_dir()):
        return {}
    return {
        path.stem.strip(): path
        for path in sorted(folder.glob("*.wav"))
        if path.stem.strip()
    }
def _has_default_voices() -> bool:
    """Return True when at least one default voice .wav file exists."""
    return bool(_list_default_voices())
def get_engine() -> TTSEngine:
    """Get or lazily initialize the global TTS engine singleton.

    Selects the Chatterbox backend but does not force-load its model:
    Chatterbox is heavy and should load on demand.
    """
    global ENGINE
    if ENGINE is None:
        # Fix: the redundant local `from engine import TTSEngine` (already
        # imported at module level) was removed; EngineConfig stays a lazy
        # import to keep module import light.
        from engine.tts_engine import EngineConfig

        logger.info("Initializing TTS Engine...")
        ENGINE = TTSEngine(
            EngineConfig(
                default_backend="chatterbox",
                device=DEVICE,
                default_language="de",
            )
        )
        # Do not force-load models on startup; Chatterbox is heavy and
        # should load on demand.
        ENGINE.set_backend("chatterbox")
        logger.info("TTS Engine ready!")
    return ENGINE
# Initialize on startup so the first request does not pay the setup cost;
# on failure we only log — get_engine() will retry lazily on first use.
try:
    get_engine()
except Exception as e:
    logger.error(f"Failed to initialize engine on startup: {e}")
# --- Helper Functions ---
def get_language_choices() -> list[tuple[str, str]]:
    """Get (display, code) language choices for the dropdown.

    Display labels come from LANGUAGE_DISPLAY, falling back to
    "<name> (<code>)" for unmapped codes. Sorted by display name with
    German pinned first.
    """
    engine = get_engine()
    supported = engine.get_supported_languages()
    # Idiom fix: iterate items() instead of keys() + per-key lookup.
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    # Sort by display name, but put German first.
    choices.sort(key=lambda item: (item[1] != "de", item[0]))
    return choices
def get_language_choices_for_backend(backend: str) -> list[tuple[str, str]]:
    """Get (display, code) language choices supported by *backend*.

    Same formatting and ordering as get_language_choices(): display labels
    from LANGUAGE_DISPLAY with a "<name> (<code>)" fallback, sorted by
    display name with German pinned first.
    """
    engine = get_engine()
    supported = engine.get_supported_languages(backend=backend)
    # Idiom fix: iterate items() instead of keys() + per-key lookup.
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    choices.sort(key=lambda item: (item[1] != "de", item[0]))
    return choices
def get_example_text(language: str) -> str:
    """Return the sample announcement for *language*, English as fallback."""
    try:
        return EXAMPLE_TEXTS[language]
    except KeyError:
        return EXAMPLE_TEXTS["en"]
def get_default_voice(language: str) -> str | None:
    """Get the default voice prompt URL for *language*.

    Returns None when no per-language prompt is configured (the previous
    `-> str` annotation was inaccurate for that case).
    """
    return DEFAULT_VOICE_PROMPTS.get(language)
def get_voice_choices() -> list[str]:
    """Build the voice dropdown entries.

    Local .wav prompts from the voices folder come first, followed by the
    special "Voice cloning" entry (Chatterbox + user-provided reference).
    With no default voices available, cloning is the only option.
    """
    names = list(_list_default_voices())
    if not names:
        # No default voices: force voice cloning.
        return [VOICE_CLONING_OPTION]
    names.append(VOICE_CLONING_OPTION)
    return names
def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
return "chatterbox"
def get_background_music_choices() -> list[tuple[str, str]]:
    """List background-music dropdown options as (display, filename) pairs.

    The first entry disables music (empty value); the rest come from the
    AudioProcessor music library with prettified display names.
    """
    processor = AudioProcessor()
    available = processor.list_available_music()
    logger.info(f"Background music files found: {available}")
    choices = [("๐Ÿ”‡ No background music", "")]
    choices += [
        (f"๐ŸŽต {name.replace('_', ' ').replace('-', ' ').title()}", name)
        for name in available
    ]
    logger.info(f"Background music choices: {len(choices) - 1} options available")
    return choices
# --- Main Generation Function ---
@spaces.GPU
def generate_announcement(
    text: str,
    language: str,
    voice_choice: str,
    voice_audio: str | None = None,
    background_music: str = "",
    custom_music: str | None = None,
    music_volume: float = -15.0,
    fade_in: float = 0.5,
    fade_out: float = 0.5,
    seed: int = 0,
) -> tuple[int, np.ndarray]:
    """
    Generate a phone announcement.

    Args:
        text: Text to synthesize (supports long text with automatic sentence splitting)
        language: Language code
        voice_choice: Selected voice name, or the special "Voice cloning" entry
        voice_audio: Optional path to reference audio for voice cloning
        background_music: Name of preset background music file ("" = none)
        custom_music: Path to custom uploaded background music (takes priority)
        music_volume: Volume of background music in dB (default: -15)
        fade_in: Fade in duration in seconds
        fade_out: Fade out duration in seconds
        seed: Random seed (0 = random)

    Returns:
        Tuple of (sample_rate, audio_array) for Gradio audio component

    Raises:
        gr.Error: When the chosen voice is unknown, or cloning is required
            but no reference audio was provided.
    """
    engine = get_engine()
    # Select backend based on voice choice (currently always Chatterbox).
    backend_name = _resolve_backend_for_voice_choice(voice_choice)
    engine.set_backend(backend_name)
    # Set seed on all RNGs for reproducibility; 0 leaves generation random.
    if seed != 0:
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        if DEVICE == "cuda":
            torch.cuda.manual_seed_all(seed)
    # Voice resolution:
    # - Default voice: use voices/<name>.wav (local prompt)
    # - Voice cloning: use uploaded reference audio
    default_voices = _list_default_voices()
    if voice_choice != VOICE_CLONING_OPTION:
        if voice_choice not in default_voices:
            raise gr.Error(
                f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'."
            )
        voice_audio = str(default_voices[voice_choice])
    else:
        # Force voice cloning when there are no default voices.
        if not _has_default_voices():
            if not voice_audio or not str(voice_audio).strip():
                raise gr.Error(
                    f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning."
                )
        # If default voices exist, keep previous behavior: fall back to a
        # per-language prompt when no reference audio was uploaded.
        if (
            voice_audio is None or not str(voice_audio).strip()
        ) and _has_default_voices():
            voice_audio = get_default_voice(language)
    # Determine which background music to use (custom upload takes priority).
    music_path = None
    if custom_music and str(custom_music).strip():
        music_path = custom_music
        logger.info(f"Using custom background music: {music_path}")
    elif background_music and str(background_music).strip():
        music_path = background_music
        logger.info(f"Using preset background music: {music_path}")
    logger.info(
        f"Generating: lang={language}, text='{text[:50]}...' ({len(text)} chars)"
    )
    # Generate audio (engine handles sentence splitting automatically).
    # If we have background music, we need to post-process the raw audio.
    if music_path:
        # Generate raw audio first (with sentence splitting for long texts).
        result = engine.generate_raw(
            text=text,
            language=language,
            voice_audio=voice_audio,
            split_sentences=True,
        )
        # Process with background music.
        from engine.audio_processor import AudioProcessingConfig, AudioProcessor

        processor = AudioProcessor(
            AudioProcessingConfig(
                background_music_path=music_path,
                music_volume_db=music_volume,
                fade_in_ms=int(fade_in * 1000),
                fade_out_ms=int(fade_out * 1000),
                padding_start_ms=int(
                    fade_in * 1000 * 1.2
                ),  # Slightly longer padding for fades
                padding_end_ms=int(fade_out * 1000 * 1.2),
            )
        )
        # Process and get encoded bytes (MP3, per the decode below).
        processed_bytes = processor.process(
            audio=result.audio,
            sample_rate=result.sample_rate,
        )
        # Convert back to numpy for Gradio.
        import io
        from pydub import AudioSegment

        audio_segment = AudioSegment.from_mp3(io.BytesIO(processed_bytes))
        samples = np.array(audio_segment.get_array_of_samples())
        # Convert to float32 normalized; assumes 16-bit samples from pydub
        # (hence /32768) — TODO confirm sample width.
        samples = samples.astype(np.float32) / 32768.0
        return (audio_segment.frame_rate, samples)
    else:
        # No background music, use direct generation.
        result = engine.generate(
            text=text,
            language=language,
            voice_audio=voice_audio,
            split_sentences=True,
        )
        return result
def on_language_change(language: str, voice_choice: str):
    """Handle language selection: refresh the example text.

    In voice-cloning mode the reference-audio upload is also cleared, since
    the previous sample likely no longer matches the new language.
    """
    example = get_example_text(language)
    if voice_choice == VOICE_CLONING_OPTION:
        return example, gr.update(value=None)
    return example, gr.update()
def on_voice_choice_change(voice_choice: str):
    """Switch UI elements depending on voice selection.

    Returns updates for (language dropdown, reference-audio widget, example
    text box). The reference-audio upload is visible only for the
    "Voice cloning" entry and is always cleared on a voice switch.
    """
    language_choices = get_language_choices_for_backend("chatterbox")
    # Prefer German when supported; otherwise the first available language.
    default_language = (
        "de"
        if any(code == "de" for _, code in language_choices)
        else (language_choices[0][1] if language_choices else "en")
    )
    show_voice_audio = voice_choice == VOICE_CLONING_OPTION
    return (
        gr.update(choices=language_choices, value=default_language),
        # Fix: the original `None if show_voice_audio else None` was a dead
        # conditional — both branches yield None, so just clear the value.
        gr.update(visible=show_voice_audio, value=None),
        gr.update(value=get_example_text(default_language)),
    )
# --- Gradio Interface ---
def create_interface():
    """Create the Gradio Blocks interface.

    Layout: a left column with voice/language/text inputs plus optional
    voice-cloning, background-music and advanced accordions; a right column
    with the generated audio and usage notes. Event handlers wire voice and
    language changes to UI updates and the button to generation.

    Fix: the announcement textbox now starts with the example text for the
    default language "de"; previously it showed the English example while
    the language dropdown defaulted to German.
    """
    with gr.Blocks(
        title="Phone Announcements Generator",
        theme=gr.themes.Soft(),
        css="""
        .main-title { text-align: center; margin-bottom: 1rem; }
        .generate-btn { min-height: 50px; font-size: 1.1rem; }
        """,
    ) as demo:
        gr.Markdown(
            """
            # ๐Ÿ“ž Phone Announcements Generator

            Create professional phone announcements with AI-powered speech synthesis.
            Supports 23 languages with optional voice cloning.

            ---
            """,
            elem_classes=["main-title"],
        )
        voices_dir = _get_voices_dir()
        gr.Markdown(
            f"""
            **Default voices folder:** `{voices_dir}`
            Put `.wav` files there named like `flozi.wav` โ†’ voice `flozi`.
            If the folder has no `.wav` files, the UI will force **Voice cloning**.
            """
        )
        with gr.Row():
            # Left column - Input
            with gr.Column(scale=1):
                voice_choices = get_voice_choices()
                default_voice_choice = (
                    voice_choices[0] if voice_choices else VOICE_CLONING_OPTION
                )
                voice_choice = gr.Dropdown(
                    choices=voice_choices,
                    value=default_voice_choice,
                    label="๐Ÿ—ฃ๏ธ Voice",
                    info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.",
                )
                language = gr.Dropdown(
                    choices=get_language_choices_for_backend("chatterbox"),
                    value="de",
                    label="๐ŸŒ Language",
                    info="Choose the language of the announcement",
                )
                text = gr.Textbox(
                    # Keep the initial example consistent with the default
                    # language above.
                    value=get_example_text("de"),
                    label="๐Ÿ“ Announcement Text",
                    placeholder="Enter your phone announcement text here...",
                    lines=5,
                    max_lines=15,
                    info="Long texts will be automatically split into sentences",
                )
                with gr.Accordion("๐ŸŽค Voice Settings (Optional)", open=False):
                    voice_audio = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Reference audio for voice cloning",
                        visible=(default_voice_choice == VOICE_CLONING_OPTION),
                        value=None,
                    )
                    gr.Markdown(
                        """
                        ๐Ÿ’ก **Tip:** Upload a short audio sample to clone a voice.
                        The default voice will be used if no sample is provided.
                        """
                    )
                with gr.Accordion("๐ŸŽต Background Music (Optional)", open=False):
                    background_music = gr.Dropdown(
                        choices=get_background_music_choices(),
                        value="",
                        label="Preset music",
                        info="Choose background music from the library",
                    )
                    custom_music = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Or upload custom music",
                        elem_id="custom_music",
                    )
                    music_volume = gr.Slider(
                        minimum=-30,
                        maximum=0,
                        value=-15,
                        step=1,
                        label="๐Ÿ”Š Music volume (dB)",
                        info="Background music volume relative to speech",
                    )
                    with gr.Row():
                        fade_in = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โซ Fade In (sec.)",
                            info="Fade-In duration",
                        )
                        fade_out = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โฌ Fade Out (sec.)",
                            info="Fade-Out duration",
                        )
                    gr.Markdown(
                        """
                        ๐Ÿ’ก **Note:** Uploaded custom music takes precedence over the selection.
                        Music will be automatically looped and trimmed to the announcement length.
                        """
                    )
                with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                    seed = gr.Number(
                        value=0,
                        label="Random seed",
                        info="0 = random, other values for reproducibility",
                        precision=0,
                    )
                generate_btn = gr.Button(
                    "๐ŸŽ™๏ธ Generate Announcement",
                    variant="primary",
                    elem_classes=["generate-btn"],
                )
            # Right column - Output
            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="๐Ÿ“ข Generated Announcement", type="numpy", interactive=False
                )
                gr.Markdown(
                    """
                    ### โ„น๏ธ Notes
                    - Generation can take a few seconds
                    - Long texts will be automatically split into sentences
                    - Reference audio should be 5-15 seconds long
                    - Background music will be looped automatically

                    ---

                    **Supported languages:** German, English, French, Spanish,
                    Italian, Dutch, Polish, Portuguese, Russian,
                    Turkish, Arabic, Chinese, Japanese, Korean, Hindi,
                    Danish, Greek, Finnish, Hebrew, Malay, Norwegian,
                    Swedish, Swahili
                    """
                )
        # Event handlers
        voice_choice.change(
            fn=on_voice_choice_change,
            inputs=[voice_choice],
            outputs=[language, voice_audio, text],
            show_progress=False,
        )
        language.change(
            fn=on_language_change,
            inputs=[language, voice_choice],
            outputs=[text, voice_audio],
            show_progress=False,
        )
        generate_btn.click(
            fn=generate_announcement,
            inputs=[
                text,
                language,
                voice_choice,
                voice_audio,
                background_music,
                custom_music,
                music_volume,
                fade_in,
                fade_out,
                seed,
            ],
            outputs=[audio_output],
        )
    return demo
# --- Main ---
if __name__ == "__main__":
    # Launch the app listening on all interfaces (e.g. inside a container)
    # on the standard Gradio port, without a public share link.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)