"""Phone Speaker TTS - Gradio Application. UI requirements: - Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi") - Provide a dropdown to choose a voice - Include a "Voice cloning" option; when selected, show reference-audio upload and use Chatterbox (voice cloning capable) backend. """ import os import random from pathlib import Path import gradio as gr import numpy as np import torch try: import spaces HAS_SPACES = True except ImportError: HAS_SPACES = False # Create a dummy decorator class spaces: @staticmethod def GPU(func): return func from loguru import logger from engine import TTSEngine from engine.audio_processor import AudioProcessor from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS # --- Configuration --- DEVICE = ( "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" ) logger.info(f"🚀 Running on device: {DEVICE}") # Language display configuration LANGUAGE_DISPLAY = { "de": "🇩🇪 German", "en": "🇬🇧 English", "fr": "🇫🇷 French", "es": "🇪🇸 Spanish", "it": "🇮🇹 Italian", "nl": "🇳🇱 Dutch", "pl": "🇵🇱 Polish", "pt": "🇵🇹 Portuguese", "ru": "🇷🇺 Russian", "tr": "🇹🇷 Turkish", "ar": "🇸🇦 Arabic", "zh": "🇨🇳 Chinese", "ja": "🇯🇵 Japanese", "ko": "🇰🇷 Korean", "hi": "🇮🇳 Hindi", "da": "🇩🇰 Danish", "el": "🇬🇷 Greek", "fi": "🇫🇮 Finnish", "he": "🇮🇱 Hebrew", "ms": "🇲🇾 Malay", "no": "🇳🇴 Norwegian", "sv": "🇸🇪 Swedish", "sw": "🇰🇪 Swahili", } # Example texts per language EXAMPLE_TEXTS = { "de": "Herzlich willkommen. Sie sind mit unserem Kundenservice verbunden. Bitte haben Sie einen Moment Geduld, wir sind gleich für Sie da.", "en": "Welcome to our customer service. Please hold the line, one of our representatives will be with you shortly.", "fr": "Bienvenue sur notre service client. Veuillez patienter, un conseiller va prendre votre appel.", "es": "Bienvenido a nuestro servicio de atención al cliente. Por favor, espere un momento.", "it": "Benvenuto nel nostro servizio clienti. La preghiamo di attendere in linea.", "nl": "Welkom bij onze klantenservice. Een moment geduld alstublieft.", "pl": "Witamy w naszej obsłudze klienta. Proszę czekać na połączenie.", "pt": "Bem-vindo ao nosso serviço de apoio ao cliente. Por favor, aguarde um momento.", "ru": "Добро пожаловать в службу поддержки. Пожалуйста, оставайтесь на линии.", "tr": "Müşteri hizmetlerimize hoş geldiniz. Lütfen hatta kalın.", "ar": "مرحباً بكم في خدمة العملاء. يرجى الانتظار على الخط.", "zh": "欢迎致电客户服务中心。请稍候,我们的客服代表将很快为您服务。", "ja": "お電話ありがとうございます。担当者におつなぎしますので、少々お待ちください。", "ko": "고객 서비스에 오신 것을 환영합니다. 잠시만 기다려 주세요.", "hi": "हमारी ग्राहक सेवा में आपका स्वागत है। कृपया प्रतीक्षा करें।", "da": "Velkommen til vores kundeservice. Vent venligst.", "el": "Καλώς ήρθατε στην εξυπηρέτηση πελατών. Παρακαλώ περιμένετε.", "fi": "Tervetuloa asiakaspalveluumme. Odottakaa hetki.", "he": "ברוכים הבאים לשירות הלקוחות שלנו. אנא המתינו על הקו.", "ms": "Selamat datang ke perkhidmatan pelanggan kami. Sila tunggu sebentar.", "no": "Velkommen til vår kundeservice. Vennligst vent.", "sv": "Välkommen till vår kundtjänst. Vänligen vänta.", "sw": "Karibu kwa huduma yetu ya wateja. Tafadhali subiri.", } # --- Global Engine --- ENGINE = None VOICE_CLONING_OPTION = "Voice cloning" def _get_voices_dir() -> Path: env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR") if env_dir and str(env_dir).strip(): return Path(env_dir).expanduser() return Path(__file__).parent / "voices" def _list_default_voices() -> dict[str, Path]: voices_dir = _get_voices_dir() if not voices_dir.exists() or not voices_dir.is_dir(): return {} voices: dict[str, Path] = {} for wav_path in sorted(voices_dir.glob("*.wav")): name = wav_path.stem.strip() if name: voices[name] = wav_path return voices def _has_default_voices() -> bool: return len(_list_default_voices()) > 0 def get_engine() -> TTSEngine: """Get or initialize the TTS engine.""" global ENGINE if ENGINE is None: from engine import TTSEngine from engine.tts_engine import EngineConfig logger.info("Initializing TTS Engine...") ENGINE = TTSEngine( EngineConfig( default_backend="chatterbox", device=DEVICE, default_language="de", ) ) # Do not force-load models on startup; Chatterbox is heavy and should load on demand. ENGINE.set_backend("chatterbox") logger.info("TTS Engine ready!") return ENGINE # Initialize on startup try: get_engine() except Exception as e: logger.error(f"Failed to initialize engine on startup: {e}") # --- Helper Functions --- def get_language_choices() -> list[tuple[str, str]]: """Get language choices for dropdown.""" engine = get_engine() supported = engine.get_supported_languages() choices = [] for code in supported.keys(): display = LANGUAGE_DISPLAY.get(code, f"{supported[code]} ({code})") choices.append((display, code)) # Sort by display name, but put German first choices.sort(key=lambda x: (x[1] != "de", x[0])) return choices def get_language_choices_for_backend(backend: str) -> list[tuple[str, str]]: engine = get_engine() supported = engine.get_supported_languages(backend=backend) choices = [] for code in supported.keys(): display = LANGUAGE_DISPLAY.get(code, f"{supported[code]} ({code})") choices.append((display, code)) choices.sort(key=lambda x: (x[1] != "de", x[0])) return choices def get_example_text(language: str) -> str: """Get example text for a language.""" return EXAMPLE_TEXTS.get(language, EXAMPLE_TEXTS["en"]) def get_default_voice(language: str) -> str: """Get default voice prompt URL for a language.""" return DEFAULT_VOICE_PROMPTS.get(language) def get_voice_choices() -> list[str]: """Get voice dropdown choices. - Standard voices: local .wav prompts from voices folder - Special entry: Voice cloning (uses Chatterbox + user provided reference) """ voices = list(_list_default_voices().keys()) if voices: voices.append(VOICE_CLONING_OPTION) return voices # If there are no default voices, force voice cloning. return [VOICE_CLONING_OPTION] def _resolve_backend_for_voice_choice(voice_choice: str) -> str: return "chatterbox" def get_background_music_choices() -> list[tuple[str, str]]: """Get available background music choices.""" processor = AudioProcessor() music_files = processor.list_available_music() logger.info(f"Background music files found: {music_files}") # Create choices with display names choices = [("🔇 No background music", "")] for name in music_files: # Create a nicer display name display = name.replace("_", " ").replace("-", " ").title() choices.append((f"🎵 {display}", name)) logger.info(f"Background music choices: {len(choices) - 1} options available") return choices # --- Main Generation Function --- @spaces.GPU def generate_announcement( text: str, language: str, voice_choice: str, voice_audio: str = None, background_music: str = "", custom_music: str = None, music_volume: float = -15.0, fade_in: float = 0.5, fade_out: float = 0.5, seed: int = 0, ) -> tuple[int, np.ndarray]: """ Generate a phone announcement. Args: text: Text to synthesize (supports long text with automatic sentence splitting) language: Language code voice_audio: Optional path to reference audio for voice cloning background_music: Name of preset background music file custom_music: Path to custom uploaded background music music_volume: Volume of background music in dB (default: -15) fade_in: Fade in duration in seconds fade_out: Fade out duration in seconds seed: Random seed (0 = random) Returns: Tuple of (sample_rate, audio_array) for Gradio audio component """ engine = get_engine() # Select backend based on voice choice backend_name = _resolve_backend_for_voice_choice(voice_choice) engine.set_backend(backend_name) # Set seed for reproducibility if seed != 0: torch.manual_seed(seed) random.seed(seed) np.random.seed(seed) if DEVICE == "cuda": torch.cuda.manual_seed_all(seed) # Voice resolution: # - Default voice: use voices/.wav (local prompt) # - Voice cloning: use uploaded reference audio default_voices = _list_default_voices() if voice_choice != VOICE_CLONING_OPTION: if voice_choice not in default_voices: raise gr.Error( f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'." ) voice_audio = str(default_voices[voice_choice]) else: # Force voice cloning when there are no default voices. if not _has_default_voices(): if not voice_audio or not str(voice_audio).strip(): raise gr.Error( f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning." ) # If default voices exist, keep previous behavior: fall back to a per-language prompt. if ( voice_audio is None or not str(voice_audio).strip() ) and _has_default_voices(): voice_audio = get_default_voice(language) # Determine which background music to use (custom upload takes priority) music_path = None if custom_music and str(custom_music).strip(): music_path = custom_music logger.info(f"Using custom background music: {music_path}") elif background_music and str(background_music).strip(): music_path = background_music logger.info(f"Using preset background music: {music_path}") logger.info( f"Generating: lang={language}, text='{text[:50]}...' ({len(text)} chars)" ) # Generate audio (engine handles sentence splitting automatically) # If we have background music, we need to process the audio if music_path: # Generate raw audio first (with sentence splitting for long texts) result = engine.generate_raw( text=text, language=language, voice_audio=voice_audio, split_sentences=True, ) # Process with background music from engine.audio_processor import AudioProcessingConfig, AudioProcessor processor = AudioProcessor( AudioProcessingConfig( background_music_path=music_path, music_volume_db=music_volume, fade_in_ms=int(fade_in * 1000), fade_out_ms=int(fade_out * 1000), padding_start_ms=int( fade_in * 1000 * 1.2 ), # Slightly longer padding for fades padding_end_ms=int(fade_out * 1000 * 1.2), ) ) # Process and get bytes processed_bytes = processor.process( audio=result.audio, sample_rate=result.sample_rate, ) # Convert back to numpy for Gradio import io from pydub import AudioSegment audio_segment = AudioSegment.from_mp3(io.BytesIO(processed_bytes)) samples = np.array(audio_segment.get_array_of_samples()) # Convert to float32 normalized samples = samples.astype(np.float32) / 32768.0 return (audio_segment.frame_rate, samples) else: # No background music, use direct generation result = engine.generate( text=text, language=language, voice_audio=voice_audio, split_sentences=True, ) return result def on_language_change(language: str, voice_choice: str): """Handle language selection change.""" # Only update reference-audio default for voice cloning. if voice_choice == VOICE_CLONING_OPTION: return get_example_text(language), gr.update(value=None) return get_example_text(language), gr.update() def on_voice_choice_change(voice_choice: str): """Switch UI elements depending on voice selection.""" language_choices = get_language_choices_for_backend("chatterbox") default_language = ( "de" if any(v == "de" for _, v in language_choices) else (language_choices[0][1] if language_choices else "en") ) show_voice_audio = voice_choice == VOICE_CLONING_OPTION return ( gr.update(choices=language_choices, value=default_language), gr.update(visible=show_voice_audio, value=None if show_voice_audio else None), gr.update(value=get_example_text(default_language)), ) # --- Gradio Interface --- def create_interface(): """Create the Gradio interface.""" with gr.Blocks( title="Phone Announcements Generator", theme=gr.themes.Soft(), css=""" .main-title { text-align: center; margin-bottom: 1rem; } .generate-btn { min-height: 50px; font-size: 1.1rem; } """, ) as demo: gr.Markdown( """ # 📞 Phone Announcements Generator Create professional phone announcements with AI-powered speech synthesis. Supports 23 languages with optional voice cloning. --- """, elem_classes=["main-title"], ) voices_dir = _get_voices_dir() gr.Markdown( f""" **Default voices folder:** `{voices_dir}` Put `.wav` files there named like `flozi.wav` → voice `flozi`. If the folder has no `.wav` files, the UI will force **Voice cloning**. """ ) with gr.Row(): # Left column - Input with gr.Column(scale=1): voice_choices = get_voice_choices() default_voice_choice = ( voice_choices[0] if voice_choices else VOICE_CLONING_OPTION ) voice_choice = gr.Dropdown( choices=voice_choices, value=default_voice_choice, label="🗣️ Voice", info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.", ) language = gr.Dropdown( choices=get_language_choices_for_backend("chatterbox"), value="de", label="🌍 Language", info="Choose the language of the announcement", ) text = gr.Textbox( value=EXAMPLE_TEXTS["en"], label="📝 Announcement Text", placeholder="Enter your phone announcement text here...", lines=5, max_lines=15, info="Long texts will be automatically split into sentences", ) with gr.Accordion("🎤 Voice Settings (Optional)", open=False): voice_audio = gr.Audio( sources=["upload", "microphone"], type="filepath", label="Reference audio for voice cloning", visible=(default_voice_choice == VOICE_CLONING_OPTION), value=None, ) gr.Markdown( """ 💡 **Tip:** Upload a short audio sample to clone a voice. The default voice will be used if no sample is provided. """ ) with gr.Accordion("🎵 Background Music (Optional)", open=False): background_music = gr.Dropdown( choices=get_background_music_choices(), value="", label="Preset music", info="Choose background music from the library", ) custom_music = gr.Audio( sources=["upload"], type="filepath", label="Or upload custom music", elem_id="custom_music", ) music_volume = gr.Slider( minimum=-30, maximum=0, value=-15, step=1, label="🔊 Music volume (dB)", info="Background music volume relative to speech", ) with gr.Row(): fade_in = gr.Slider( minimum=0, maximum=3, value=0.5, step=0.1, label="⏫ Fade In (sec.)", info="Fade-In duration", ) fade_out = gr.Slider( minimum=0, maximum=3, value=0.5, step=0.1, label="⏬ Fade Out (sec.)", info="Fade-Out duration", ) gr.Markdown( """ 💡 **Note:** Uploaded custom music takes precedence over the selection. Music will be automatically looped and trimmed to the announcement length. """ ) with gr.Accordion("⚙️ Advanced Settings", open=False): seed = gr.Number( value=0, label="Random seed", info="0 = random, other values for reproducibility", precision=0, ) generate_btn = gr.Button( "🎙️ Generate Announcement", variant="primary", elem_classes=["generate-btn"], ) # Right column - Output with gr.Column(scale=1): audio_output = gr.Audio( label="📢 Generated Announcement", type="numpy", interactive=False ) gr.Markdown( """ ### ℹ️ Notes - Generation can take a few seconds - Long texts will be automatically split into sentences - Reference audio should be 5-15 seconds long - Background music will be looped automatically --- **Supported languages:** German, English, French, Spanish, Italian, Dutch, Polish, Portuguese, Russian, Turkish, Arabic, Chinese, Japanese, Korean, Hindi, Danish, Greek, Finnish, Hebrew, Malay, Norwegian, Swedish, Swahili """ ) # Event handlers voice_choice.change( fn=on_voice_choice_change, inputs=[voice_choice], outputs=[language, voice_audio, text], show_progress=False, ) language.change( fn=on_language_change, inputs=[language, voice_choice], outputs=[text, voice_audio], show_progress=False, ) generate_btn.click( fn=generate_announcement, inputs=[ text, language, voice_choice, voice_audio, background_music, custom_music, music_volume, fade_in, fade_out, seed, ], outputs=[audio_output], ) return demo # --- Main --- if __name__ == "__main__": demo = create_interface() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)