Spaces:
Sleeping
Sleeping
| """Phone Speaker TTS - Gradio Application. | |
| UI requirements: | |
| - Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi") | |
| - Provide a dropdown to choose a voice | |
| - Include a "Voice cloning" option; when selected, show reference-audio upload | |
| and use Chatterbox (voice cloning capable) backend. | |
| """ | |
| import os | |
| import random | |
| from pathlib import Path | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
try:
    import spaces

    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

    class spaces:
        """No-op stand-in used when the optional ``spaces`` package is missing."""

        @staticmethod
        def GPU(func=None, **_options):
            """Return the decorated function unchanged.

            Usable both as ``@spaces.GPU`` and as ``@spaces.GPU(duration=...)``
            so code written against the real decorator keeps working locally.
            """
            if func is None:
                # Called with options only: hand back a pass-through decorator.
                return lambda wrapped: wrapped
            return func
| from loguru import logger | |
| from engine import TTSEngine | |
| from engine.audio_processor import AudioProcessor | |
| from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS | |
# --- Configuration ---
# Pick the torch device: CUDA when available, then Apple MPS, otherwise CPU.
# NOTE(review): the first character of the log message looks like a
# mis-decoded emoji; restore it from the original UTF-8 source if available.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
logger.info(f"๐ Running on device: {DEVICE}")
# Language display configuration
def _flag(country_code: str) -> str:
    """Return the flag emoji for a two-letter ISO 3166-1 country code.

    The emoji is built from ASCII letters at runtime so the labels survive
    any re-encoding of this source file.
    """
    base = 0x1F1E6  # REGIONAL INDICATOR SYMBOL LETTER A
    return "".join(
        chr(base + ord(letter) - ord("A")) for letter in country_code.upper()
    )


# (language code, country code used for the flag, English language name)
_LANGUAGE_TABLE = (
    ("de", "DE", "German"),
    ("en", "GB", "English"),
    ("fr", "FR", "French"),
    ("es", "ES", "Spanish"),
    ("it", "IT", "Italian"),
    ("nl", "NL", "Dutch"),
    ("pl", "PL", "Polish"),
    ("pt", "PT", "Portuguese"),
    ("ru", "RU", "Russian"),
    ("tr", "TR", "Turkish"),
    ("ar", "SA", "Arabic"),
    ("zh", "CN", "Chinese"),
    ("ja", "JP", "Japanese"),
    ("ko", "KR", "Korean"),
    ("hi", "IN", "Hindi"),
    ("da", "DK", "Danish"),
    ("el", "GR", "Greek"),
    ("fi", "FI", "Finnish"),
    ("he", "IL", "Hebrew"),
    ("ms", "MY", "Malay"),
    ("no", "NO", "Norwegian"),
    ("sv", "SE", "Swedish"),
    ("sw", "KE", "Swahili"),
)

# Maps a language code to its dropdown label: "<flag> <language name>".
LANGUAGE_DISPLAY = {
    language_code: f"{_flag(country_code)} {language_name}"
    for language_code, country_code, language_name in _LANGUAGE_TABLE
}
# Example texts per language
# NOTE(review): the non-ASCII sentences were restored from a mis-decoded copy
# (UTF-8 bytes shown as Thai characters); each one matches the garbled byte
# pattern, but please confirm against the original file or a native speaker.
EXAMPLE_TEXTS = {
    "de": "Herzlich willkommen. Sie sind mit unserem Kundenservice verbunden. Bitte haben Sie einen Moment Geduld, wir sind gleich für Sie da.",
    "en": "Welcome to our customer service. Please hold the line, one of our representatives will be with you shortly.",
    "fr": "Bienvenue sur notre service client. Veuillez patienter, un conseiller va prendre votre appel.",
    "es": "Bienvenido a nuestro servicio de atención al cliente. Por favor, espere un momento.",
    "it": "Benvenuto nel nostro servizio clienti. La preghiamo di attendere in linea.",
    "nl": "Welkom bij onze klantenservice. Een moment geduld alstublieft.",
    "pl": "Witamy w naszej obsłudze klienta. Proszę czekać na połączenie.",
    "pt": "Bem-vindo ao nosso serviço de apoio ao cliente. Por favor, aguarde um momento.",
    "ru": "Добро пожаловать в службу поддержки. Пожалуйста, оставайтесь на линии.",
    "tr": "Müşteri hizmetlerimize hoş geldiniz. Lütfen hatta kalın.",
    "ar": "مرحباً بكم في خدمة العملاء. يرجى الانتظار على الخط.",
    "zh": "欢迎致电客户服务中心。请稍候，我们的客服代表将很快为您服务。",
    "ja": "お電話ありがとうございます。担当者におつなぎしますので、少々お待ちください。",
    "ko": "고객 서비스에 오신 것을 환영합니다. 잠시만 기다려 주세요.",
    "hi": "हमारी ग्राहक सेवा में आपका स्वागत है। कृपया प्रतीक्षा करें।",
    "da": "Velkommen til vores kundeservice. Vent venligst.",
    "el": "Καλώς ήρθατε στην εξυπηρέτηση πελατών. Παρακαλώ περιμένετε.",
    "fi": "Tervetuloa asiakaspalveluumme. Odottakaa hetki.",
    "he": "ברוכים הבאים לשירות הלקוחות שלנו. אנא המתינו על הקו.",
    "ms": "Selamat datang ke perkhidmatan pelanggan kami. Sila tunggu sebentar.",
    "no": "Velkommen til vår kundeservice. Vennligst vent.",
    "sv": "Välkommen till vår kundtjänst. Vänligen vänta.",
    "sw": "Karibu kwa huduma yetu ya wateja. Tafadhali subiri.",
}
# --- Global Engine ---
# Module-wide engine instance; stays None until get_engine() creates it.
ENGINE = None
# Dropdown entry that selects reference-audio based voice cloning.
VOICE_CLONING_OPTION = "Voice cloning"
| def _get_voices_dir() -> Path: | |
| env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR") | |
| if env_dir and str(env_dir).strip(): | |
| return Path(env_dir).expanduser() | |
| return Path(__file__).parent / "voices" | |
| def _list_default_voices() -> dict[str, Path]: | |
| voices_dir = _get_voices_dir() | |
| if not voices_dir.exists() or not voices_dir.is_dir(): | |
| return {} | |
| voices: dict[str, Path] = {} | |
| for wav_path in sorted(voices_dir.glob("*.wav")): | |
| name = wav_path.stem.strip() | |
| if name: | |
| voices[name] = wav_path | |
| return voices | |
| def _has_default_voices() -> bool: | |
| return len(_list_default_voices()) > 0 | |
def get_engine() -> TTSEngine:
    """Return the shared TTS engine, creating it on the first call."""
    global ENGINE
    if ENGINE is not None:
        return ENGINE

    from engine import TTSEngine
    from engine.tts_engine import EngineConfig

    logger.info("Initializing TTS Engine...")
    config = EngineConfig(
        default_backend="chatterbox",
        device=DEVICE,
        default_language="de",
    )
    ENGINE = TTSEngine(config)
    # Only the backend is selected here; models are not force-loaded at
    # startup because Chatterbox is heavy and should load on demand.
    ENGINE.set_backend("chatterbox")
    logger.info("TTS Engine ready!")
    return ENGINE
# Initialize on startup
# Best-effort warm-up: a failure is only logged so the module still imports;
# the handlers below call get_engine() again when they need the engine.
try:
    get_engine()
except Exception as e:
    logger.error(f"Failed to initialize engine on startup: {e}")
| # --- Helper Functions --- | |
def _build_language_choices(supported) -> list[tuple[str, str]]:
    """Turn a ``{code: name}`` mapping into sorted ``(label, code)`` choices.

    Codes without an entry in ``LANGUAGE_DISPLAY`` fall back to the label
    ``"<name> (<code>)"``.
    """
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    # German first, everything else alphabetically by label.
    choices.sort(key=lambda choice: (choice[1] != "de", choice[0]))
    return choices


def get_language_choices() -> list[tuple[str, str]]:
    """Get language choices for the dropdown using the engine's current backend."""
    return _build_language_choices(get_engine().get_supported_languages())


def get_language_choices_for_backend(backend: str) -> list[tuple[str, str]]:
    """Get language choices for the dropdown for the named backend."""
    return _build_language_choices(
        get_engine().get_supported_languages(backend=backend)
    )
def get_example_text(language: str) -> str:
    """Return the sample announcement for ``language``, or the English one if none exists."""
    fallback = EXAMPLE_TEXTS["en"]
    if language in EXAMPLE_TEXTS:
        return EXAMPLE_TEXTS[language]
    return fallback
def get_default_voice(language: str) -> "str | None":
    """Get default voice prompt URL for a language.

    Returns None when ``DEFAULT_VOICE_PROMPTS`` has no entry for ``language``.
    """
    return DEFAULT_VOICE_PROMPTS.get(language)
def get_voice_choices() -> list[str]:
    """Build the entries of the voice dropdown.

    The names of the local ``.wav`` prompts come first, followed by the
    special "Voice cloning" entry (Chatterbox with a user-provided reference).
    """
    # With an empty voices folder this collapses to just the cloning entry,
    # which forces voice cloning.
    return [*_list_default_voices(), VOICE_CLONING_OPTION]
| def _resolve_backend_for_voice_choice(voice_choice: str) -> str: | |
| return "chatterbox" | |
def get_background_music_choices() -> list[tuple[str, str]]:
    """Return ``(label, value)`` pairs for the background-music dropdown.

    The first entry has an empty value and stands for "no music"; the others
    use the name reported by ``AudioProcessor.list_available_music()`` as the
    value and a title-cased version of it as the label.
    """

    def _pretty(name: str) -> str:
        # "soft_piano-loop" -> "Soft Piano Loop"
        return name.replace("_", " ").replace("-", " ").title()

    music_files = AudioProcessor().list_available_music()
    logger.info(f"Background music files found: {music_files}")

    choices = [("๐ No background music", "")]
    choices.extend((f"๐ต {_pretty(name)}", name) for name in music_files)

    logger.info(f"Background music choices: {len(choices) - 1} options available")
    return choices
| # --- Main Generation Function --- | |
def _audio_segment_to_float32(audio_segment) -> np.ndarray:
    """Convert a pydub ``AudioSegment`` to float32 samples in [-1.0, 1.0)."""
    samples = np.array(audio_segment.get_array_of_samples())
    channels = audio_segment.channels
    if channels > 1:
        # pydub returns the channels interleaved in one flat array; reshape to
        # (frames, channels) so multi-channel audio is not played back as a
        # mono stream of twice the length.
        samples = samples.reshape((-1, channels))
    # Scale by the full-scale value of the actual sample width (32768 for 16-bit).
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    return samples.astype(np.float32) / full_scale


def generate_announcement(
    text: str,
    language: str,
    voice_choice: str,
    voice_audio: str = None,
    background_music: str = "",
    custom_music: str = None,
    music_volume: float = -15.0,
    fade_in: float = 0.5,
    fade_out: float = 0.5,
    seed: int = 0,
) -> tuple[int, np.ndarray]:
    """
    Generate a phone announcement.

    Args:
        text: Text to synthesize (supports long text with automatic sentence splitting)
        language: Language code
        voice_choice: Name of a default voice from the voices folder, or the
            "Voice cloning" entry to use ``voice_audio`` as the reference
        voice_audio: Optional path to reference audio for voice cloning
        background_music: Name of preset background music file
        custom_music: Path to custom uploaded background music
        music_volume: Volume of background music in dB (default: -15)
        fade_in: Fade in duration in seconds
        fade_out: Fade out duration in seconds
        seed: Random seed (0 or empty = random)

    Returns:
        Tuple of (sample_rate, audio_array) for Gradio audio component

    Raises:
        gr.Error: If the text is empty, the selected voice is unknown, or voice
            cloning is required but no reference audio was provided.
    """
    if not text or not str(text).strip():
        raise gr.Error("Please enter the announcement text.")

    engine = get_engine()

    # Select backend based on voice choice
    backend_name = _resolve_backend_for_voice_choice(voice_choice)
    engine.set_backend(backend_name)

    # Set seed for reproducibility. A cleared number field arrives as None,
    # which is treated like 0 (no fixed seed).
    seed = int(seed) if seed else 0
    if seed != 0:
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        if DEVICE == "cuda":
            torch.cuda.manual_seed_all(seed)

    # Voice resolution:
    # - Default voice: use voices/<name>.wav (local prompt)
    # - Voice cloning: use uploaded reference audio
    default_voices = _list_default_voices()  # scan the folder once per request
    if voice_choice != VOICE_CLONING_OPTION:
        if voice_choice not in default_voices:
            raise gr.Error(
                f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'."
            )
        voice_audio = str(default_voices[voice_choice])
    elif not voice_audio or not str(voice_audio).strip():
        if not default_voices:
            # Force voice cloning when there are no default voices.
            raise gr.Error(
                f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning."
            )
        # Default voices exist: fall back to the per-language prompt.
        voice_audio = get_default_voice(language)

    # Determine which background music to use (custom upload takes priority)
    music_path = None
    if custom_music and str(custom_music).strip():
        music_path = custom_music
        logger.info(f"Using custom background music: {music_path}")
    elif background_music and str(background_music).strip():
        music_path = background_music
        logger.info(f"Using preset background music: {music_path}")

    logger.info(
        f"Generating: lang={language}, text='{text[:50]}...' ({len(text)} chars)"
    )

    if not music_path:
        # No background music, use direct generation
        # (the engine handles sentence splitting automatically).
        return engine.generate(
            text=text,
            language=language,
            voice_audio=voice_audio,
            split_sentences=True,
        )

    # Generate raw audio first (with sentence splitting for long texts)
    result = engine.generate_raw(
        text=text,
        language=language,
        voice_audio=voice_audio,
        split_sentences=True,
    )

    # Process with background music
    from engine.audio_processor import AudioProcessingConfig, AudioProcessor

    processor = AudioProcessor(
        AudioProcessingConfig(
            background_music_path=music_path,
            music_volume_db=music_volume,
            fade_in_ms=int(fade_in * 1000),
            fade_out_ms=int(fade_out * 1000),
            # Slightly longer padding than the fades themselves
            padding_start_ms=int(fade_in * 1000 * 1.2),
            padding_end_ms=int(fade_out * 1000 * 1.2),
        )
    )
    processed_bytes = processor.process(
        audio=result.audio,
        sample_rate=result.sample_rate,
    )

    # Convert the encoded bytes back to numpy for Gradio
    import io

    from pydub import AudioSegment

    audio_segment = AudioSegment.from_mp3(io.BytesIO(processed_bytes))
    return (audio_segment.frame_rate, _audio_segment_to_float32(audio_segment))
def on_language_change(language: str, voice_choice: str):
    """React to a new language: refresh the example text and, in voice-cloning mode, clear the reference audio."""
    example_text = get_example_text(language)
    if voice_choice == VOICE_CLONING_OPTION:
        # Only voice cloning resets the reference-audio component.
        audio_update = gr.update(value=None)
    else:
        audio_update = gr.update()
    return example_text, audio_update
def on_voice_choice_change(voice_choice: str):
    """Switch UI elements depending on voice selection.

    Returns updates for, in order: the language dropdown (choices reset and
    default language selected), the reference-audio input (visible only for
    voice cloning, always cleared), and the text box (example text for the
    default language).
    """
    language_choices = get_language_choices_for_backend("chatterbox")
    available_codes = [code for _, code in language_choices]
    if "de" in available_codes:
        default_language = "de"
    elif available_codes:
        default_language = available_codes[0]
    else:
        default_language = "en"

    show_voice_audio = voice_choice == VOICE_CLONING_OPTION
    return (
        gr.update(choices=language_choices, value=default_language),
        gr.update(visible=show_voice_audio, value=None),
        gr.update(value=get_example_text(default_language)),
    )
| # --- Gradio Interface --- | |
def create_interface():
    """Create the Gradio interface.

    Builds a two-column layout (inputs on the left, generated audio on the
    right), wires the change/click handlers and returns the ``gr.Blocks`` app.

    NOTE(review): the leading characters of several labels/headings look like
    mis-decoded emoji; they are kept as found and should be restored from the
    original UTF-8 source.
    """
    # Resolve the initial language once so the dropdown and the example text
    # always agree (German when available, else the first supported language).
    language_choices = get_language_choices_for_backend("chatterbox")
    available_codes = [code for _, code in language_choices]
    if "de" in available_codes:
        default_language = "de"
    elif available_codes:
        default_language = available_codes[0]
    else:
        default_language = "en"

    with gr.Blocks(
        title="Phone Announcements Generator",
        theme=gr.themes.Soft(),
        css="""
        .main-title { text-align: center; margin-bottom: 1rem; }
        .generate-btn { min-height: 50px; font-size: 1.1rem; }
        """,
    ) as demo:
        # The blank line before '---' keeps it a horizontal rule instead of
        # turning the preceding paragraph into a heading.
        gr.Markdown(
            """
            # ๐ Phone Announcements Generator
            Create professional phone announcements with AI-powered speech synthesis.
            Supports 23 languages with optional voice cloning.

            ---
            """,
            elem_classes=["main-title"],
        )
        voices_dir = _get_voices_dir()
        gr.Markdown(
            f"""
            **Default voices folder:** `{voices_dir}`
            Put `.wav` files there named like `flozi.wav` โ voice `flozi`.
            If the folder has no `.wav` files, the UI will force **Voice cloning**.
            """
        )
        with gr.Row():
            # Left column - Input
            with gr.Column(scale=1):
                voice_choices = get_voice_choices()
                default_voice_choice = (
                    voice_choices[0] if voice_choices else VOICE_CLONING_OPTION
                )
                voice_choice = gr.Dropdown(
                    choices=voice_choices,
                    value=default_voice_choice,
                    label="๐ฃ๏ธ Voice",
                    info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.",
                )
                language = gr.Dropdown(
                    choices=language_choices,
                    value=default_language,
                    label="๐ Language",
                    info="Choose the language of the announcement",
                )
                text = gr.Textbox(
                    value=get_example_text(default_language),
                    label="๐ Announcement Text",
                    placeholder="Enter your phone announcement text here...",
                    lines=5,
                    max_lines=15,
                    info="Long texts will be automatically split into sentences",
                )
                with gr.Accordion("๐ค Voice Settings (Optional)", open=False):
                    voice_audio = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Reference audio for voice cloning",
                        visible=(default_voice_choice == VOICE_CLONING_OPTION),
                        value=None,
                    )
                    gr.Markdown(
                        """
                        ๐ก **Tip:** Upload a short audio sample to clone a voice.
                        The default voice will be used if no sample is provided.
                        """
                    )
                with gr.Accordion("๐ต Background Music (Optional)", open=False):
                    background_music = gr.Dropdown(
                        choices=get_background_music_choices(),
                        value="",
                        label="Preset music",
                        info="Choose background music from the library",
                    )
                    custom_music = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Or upload custom music",
                        elem_id="custom_music",
                    )
                    music_volume = gr.Slider(
                        minimum=-30,
                        maximum=0,
                        value=-15,
                        step=1,
                        label="๐ Music volume (dB)",
                        info="Background music volume relative to speech",
                    )
                    with gr.Row():
                        fade_in = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โซ Fade In (sec.)",
                            info="Fade-In duration",
                        )
                        fade_out = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โฌ Fade Out (sec.)",
                            info="Fade-Out duration",
                        )
                    gr.Markdown(
                        """
                        ๐ก **Note:** Uploaded custom music takes precedence over the selection.
                        Music will be automatically looped and trimmed to the announcement length.
                        """
                    )
                with gr.Accordion("โ๏ธ Advanced Settings", open=False):
                    seed = gr.Number(
                        value=0,
                        label="Random seed",
                        info="0 = random, other values for reproducibility",
                        precision=0,
                    )
                generate_btn = gr.Button(
                    "๐๏ธ Generate Announcement",
                    variant="primary",
                    elem_classes=["generate-btn"],
                )
            # Right column - Output
            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="๐ข Generated Announcement", type="numpy", interactive=False
                )
                gr.Markdown(
                    """
                    ### โน๏ธ Notes
                    - Generation can take a few seconds
                    - Long texts will be automatically split into sentences
                    - Reference audio should be 5-15 seconds long
                    - Background music will be looped automatically

                    ---
                    **Supported languages:** German, English, French, Spanish,
                    Italian, Dutch, Polish, Portuguese, Russian,
                    Turkish, Arabic, Chinese, Japanese, Korean, Hindi,
                    Danish, Greek, Finnish, Hebrew, Malay, Norwegian,
                    Swedish, Swahili
                    """
                )
        # Event handlers
        voice_choice.change(
            fn=on_voice_choice_change,
            inputs=[voice_choice],
            outputs=[language, voice_audio, text],
            show_progress=False,
        )
        language.change(
            fn=on_language_change,
            inputs=[language, voice_choice],
            outputs=[text, voice_audio],
            show_progress=False,
        )
        # The input order must match the positional parameters of
        # generate_announcement.
        generate_btn.click(
            fn=generate_announcement,
            inputs=[
                text,
                language,
                voice_choice,
                voice_audio,
                background_music,
                custom_music,
                music_volume,
                fade_in,
                fade_out,
                seed,
            ],
            outputs=[audio_output],
        )
    return demo
# --- Main ---
if __name__ == "__main__":
    demo = create_interface()
    # Listen on all network interfaces on port 7860; no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)