# Upstream source: flozi00 — "chatterbox only", commit 0849031 (Hugging Face Hub page residue, kept as a comment so the file parses)
"""Phone Speaker TTS - Gradio Application.
UI requirements:
- Load default voices from a folder of .wav files (e.g. voices/flozi.wav -> "flozi")
- Provide a dropdown to choose a voice
- Include a "Voice cloning" option; when selected, show reference-audio upload
and use Chatterbox (voice cloning capable) backend.
"""
import os
import random
from pathlib import Path
import gradio as gr
import numpy as np
import torch
try:
    import spaces

    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

    # Fallback when the Hugging Face Spaces helper package is unavailable:
    # provide a no-op stand-in so `@spaces.GPU` decorations still work.
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            """No-op replacement for `spaces.GPU`.

            Supports both bare usage (`@spaces.GPU`) and parameterized
            usage (`@spaces.GPU(duration=60)`), mirroring the real API.
            The original dummy only handled the bare form.
            """
            if callable(func):
                return func
            # Called with configuration kwargs: return an identity decorator.
            return lambda f: f
from loguru import logger
from engine import TTSEngine
from engine.audio_processor import AudioProcessor
from engine.backends.chatterbox_backend import DEFAULT_VOICE_PROMPTS
# --- Configuration ---
# Pick the best available accelerator: CUDA GPU first, then Apple Silicon
# (MPS), with CPU as the final fallback.
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
# NOTE(review): the emoji below is mojibake (UTF-8 decoded in a legacy
# codepage) — confirm the file is handled as UTF-8.
logger.info(f"๐Ÿš€ Running on device: {DEVICE}")
# Language display configuration
# Maps ISO 639-1 language codes to dropdown display labels (flag emoji +
# English language name). NOTE(review): the flag emoji appear mojibake-
# encoded in this file — confirm UTF-8 handling; values kept byte-identical.
LANGUAGE_DISPLAY = {
    "de": "๐Ÿ‡ฉ๐Ÿ‡ช German",
    "en": "๐Ÿ‡ฌ๐Ÿ‡ง English",
    "fr": "๐Ÿ‡ซ๐Ÿ‡ท French",
    "es": "๐Ÿ‡ช๐Ÿ‡ธ Spanish",
    "it": "๐Ÿ‡ฎ๐Ÿ‡น Italian",
    "nl": "๐Ÿ‡ณ๐Ÿ‡ฑ Dutch",
    "pl": "๐Ÿ‡ต๐Ÿ‡ฑ Polish",
    "pt": "๐Ÿ‡ต๐Ÿ‡น Portuguese",
    "ru": "๐Ÿ‡ท๐Ÿ‡บ Russian",
    "tr": "๐Ÿ‡น๐Ÿ‡ท Turkish",
    "ar": "๐Ÿ‡ธ๐Ÿ‡ฆ Arabic",
    "zh": "๐Ÿ‡จ๐Ÿ‡ณ Chinese",
    "ja": "๐Ÿ‡ฏ๐Ÿ‡ต Japanese",
    "ko": "๐Ÿ‡ฐ๐Ÿ‡ท Korean",
    "hi": "๐Ÿ‡ฎ๐Ÿ‡ณ Hindi",
    "da": "๐Ÿ‡ฉ๐Ÿ‡ฐ Danish",
    "el": "๐Ÿ‡ฌ๐Ÿ‡ท Greek",
    "fi": "๐Ÿ‡ซ๐Ÿ‡ฎ Finnish",
    "he": "๐Ÿ‡ฎ๐Ÿ‡ฑ Hebrew",
    "ms": "๐Ÿ‡ฒ๐Ÿ‡พ Malay",
    "no": "๐Ÿ‡ณ๐Ÿ‡ด Norwegian",
    "sv": "๐Ÿ‡ธ๐Ÿ‡ช Swedish",
    "sw": "๐Ÿ‡ฐ๐Ÿ‡ช Swahili",
}
# Example texts per language
# Default announcement text shown in the textbox for each language code;
# used via get_example_text() with English as the fallback.
# NOTE(review): non-Latin entries appear mojibake-encoded — confirm UTF-8
# handling; values kept byte-identical.
EXAMPLE_TEXTS = {
    "de": "Herzlich willkommen. Sie sind mit unserem Kundenservice verbunden. Bitte haben Sie einen Moment Geduld, wir sind gleich fรผr Sie da.",
    "en": "Welcome to our customer service. Please hold the line, one of our representatives will be with you shortly.",
    "fr": "Bienvenue sur notre service client. Veuillez patienter, un conseiller va prendre votre appel.",
    "es": "Bienvenido a nuestro servicio de atenciรณn al cliente. Por favor, espere un momento.",
    "it": "Benvenuto nel nostro servizio clienti. La preghiamo di attendere in linea.",
    "nl": "Welkom bij onze klantenservice. Een moment geduld alstublieft.",
    "pl": "Witamy w naszej obsล‚udze klienta. Proszฤ™ czekaฤ‡ na poล‚ฤ…czenie.",
    "pt": "Bem-vindo ao nosso serviรงo de apoio ao cliente. Por favor, aguarde um momento.",
    "ru": "ะ”ะพะฑั€ะพ ะฟะพะถะฐะปะพะฒะฐั‚ัŒ ะฒ ัะปัƒะถะฑัƒ ะฟะพะดะดะตั€ะถะบะธ. ะŸะพะถะฐะปัƒะนัั‚ะฐ, ะพัั‚ะฐะฒะฐะนั‚ะตััŒ ะฝะฐ ะปะธะฝะธะธ.",
    "tr": "MรผลŸteri hizmetlerimize hoลŸ geldiniz. Lรผtfen hatta kalฤฑn.",
    "ar": "ู…ุฑุญุจุงู‹ ุจูƒู… ููŠ ุฎุฏู…ุฉ ุงู„ุนู…ู„ุงุก. ูŠุฑุฌู‰ ุงู„ุงู†ุชุธุงุฑ ุนู„ู‰ ุงู„ุฎุท.",
    "zh": "ๆฌข่ฟŽ่‡ด็”ตๅฎขๆˆทๆœๅŠกไธญๅฟƒใ€‚่ฏท็จๅ€™๏ผŒๆˆ‘ไปฌ็š„ๅฎขๆœไปฃ่กจๅฐ†ๅพˆๅฟซไธบๆ‚จๆœๅŠกใ€‚",
    "ja": "ใŠ้›ป่ฉฑใ‚ใ‚ŠใŒใจใ†ใ”ใ–ใ„ใพใ™ใ€‚ๆ‹…ๅฝ“่€…ใซใŠใคใชใŽใ—ใพใ™ใฎใงใ€ๅฐ‘ใ€…ใŠๅพ…ใกใใ ใ•ใ„ใ€‚",
    "ko": "๊ณ ๊ฐ ์„œ๋น„์Šค์— ์˜ค์‹  ๊ฒƒ์„ ํ™˜์˜ํ•ฉ๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค ์ฃผ์„ธ์š”.",
    "hi": "เคนเคฎเคพเคฐเฅ€ เค—เฅเคฐเคพเคนเค• เคธเฅ‡เคตเคพ เคฎเฅ‡เค‚ เค†เคชเค•เคพ เคธเฅเคตเคพเค—เคค เคนเฅˆเฅค เค•เฅƒเคชเคฏเคพ เคชเฅเคฐเคคเฅ€เค•เฅเคทเคพ เค•เคฐเฅ‡เค‚เฅค",
    "da": "Velkommen til vores kundeservice. Vent venligst.",
    "el": "ฮšฮฑฮปฯŽฯ‚ ฮฎฯฮธฮฑฯ„ฮต ฯƒฯ„ฮทฮฝ ฮตฮพฯ…ฯ€ฮทฯฮญฯ„ฮทฯƒฮท ฯ€ฮตฮปฮฑฯ„ฯŽฮฝ. ฮ ฮฑฯฮฑฮบฮฑฮปฯŽ ฯ€ฮตฯฮนฮผฮญฮฝฮตฯ„ฮต.",
    "fi": "Tervetuloa asiakaspalveluumme. Odottakaa hetki.",
    "he": "ื‘ืจื•ื›ื™ื ื”ื‘ืื™ื ืœืฉื™ืจื•ืช ื”ืœืงื•ื—ื•ืช ืฉืœื ื•. ืื ื ื”ืžืชื™ื ื• ืขืœ ื”ืงื•.",
    "ms": "Selamat datang ke perkhidmatan pelanggan kami. Sila tunggu sebentar.",
    "no": "Velkommen til vรฅr kundeservice. Vennligst vent.",
    "sv": "Vรคlkommen till vรฅr kundtjรคnst. Vรคnligen vรคnta.",
    "sw": "Karibu kwa huduma yetu ya wateja. Tafadhali subiri.",
}
# --- Global Engine ---
# Lazily created singleton TTS engine; see get_engine().
ENGINE = None
# Sentinel dropdown entry that switches the UI into voice-cloning mode.
VOICE_CLONING_OPTION = "Voice cloning"
def _get_voices_dir() -> Path:
env_dir = os.environ.get("PHONE_SPEAKER_TTS_VOICES_DIR")
if env_dir and str(env_dir).strip():
return Path(env_dir).expanduser()
return Path(__file__).parent / "voices"
def _list_default_voices() -> dict[str, Path]:
    """Map voice names to their .wav prompt files.

    A file `voices/flozi.wav` yields the voice "flozi" (filename stem,
    stripped). Returns an empty dict when the folder is absent.
    """
    folder = _get_voices_dir()
    if not (folder.exists() and folder.is_dir()):
        return {}
    return {
        path.stem.strip(): path
        for path in sorted(folder.glob("*.wav"))
        if path.stem.strip()
    }
def _has_default_voices() -> bool:
    """Return True when at least one default voice .wav file exists."""
    return bool(_list_default_voices())
def get_engine() -> TTSEngine:
    """Get or lazily initialize the global TTS engine singleton.

    Selects the Chatterbox backend but does not force-load its model:
    Chatterbox is heavy and should load on demand.
    """
    global ENGINE
    if ENGINE is None:
        # Fix: the redundant local `from engine import TTSEngine` (already
        # imported at module level) was removed; EngineConfig stays a lazy
        # import to keep module import light.
        from engine.tts_engine import EngineConfig

        logger.info("Initializing TTS Engine...")
        ENGINE = TTSEngine(
            EngineConfig(
                default_backend="chatterbox",
                device=DEVICE,
                default_language="de",
            )
        )
        # Do not force-load models on startup; Chatterbox is heavy and
        # should load on demand.
        ENGINE.set_backend("chatterbox")
        logger.info("TTS Engine ready!")
    return ENGINE
# Initialize on startup so the first request does not pay the setup cost;
# on failure we only log — get_engine() will retry lazily on first use.
try:
    get_engine()
except Exception as e:
    logger.error(f"Failed to initialize engine on startup: {e}")
# --- Helper Functions ---
def get_language_choices() -> list[tuple[str, str]]:
    """Get (display, code) language choices for the dropdown.

    Display labels come from LANGUAGE_DISPLAY, falling back to
    "<name> (<code>)" for unmapped codes. Sorted by display name with
    German pinned first.
    """
    engine = get_engine()
    supported = engine.get_supported_languages()
    # Idiom fix: iterate items() instead of keys() + per-key lookup.
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    # Sort by display name, but put German first.
    choices.sort(key=lambda item: (item[1] != "de", item[0]))
    return choices
def get_language_choices_for_backend(backend: str) -> list[tuple[str, str]]:
    """Get (display, code) language choices supported by *backend*.

    Same formatting and ordering as get_language_choices(): display labels
    from LANGUAGE_DISPLAY with a "<name> (<code>)" fallback, sorted by
    display name with German pinned first.
    """
    engine = get_engine()
    supported = engine.get_supported_languages(backend=backend)
    # Idiom fix: iterate items() instead of keys() + per-key lookup.
    choices = [
        (LANGUAGE_DISPLAY.get(code, f"{name} ({code})"), code)
        for code, name in supported.items()
    ]
    choices.sort(key=lambda item: (item[1] != "de", item[0]))
    return choices
def get_example_text(language: str) -> str:
    """Return the sample announcement for *language*, English as fallback."""
    try:
        return EXAMPLE_TEXTS[language]
    except KeyError:
        return EXAMPLE_TEXTS["en"]
def get_default_voice(language: str) -> str | None:
    """Get the default voice prompt URL for *language*.

    Returns None when no per-language prompt is configured (the previous
    `-> str` annotation was inaccurate for that case).
    """
    return DEFAULT_VOICE_PROMPTS.get(language)
def get_voice_choices() -> list[str]:
    """Build the voice dropdown entries.

    Local .wav prompts from the voices folder come first, followed by the
    special "Voice cloning" entry (Chatterbox + user-provided reference).
    With no default voices available, cloning is the only option.
    """
    names = list(_list_default_voices())
    if not names:
        # No default voices: force voice cloning.
        return [VOICE_CLONING_OPTION]
    names.append(VOICE_CLONING_OPTION)
    return names
def _resolve_backend_for_voice_choice(voice_choice: str) -> str:
return "chatterbox"
def get_background_music_choices() -> list[tuple[str, str]]:
    """List background-music dropdown options as (display, filename) pairs.

    The first entry disables music (empty value); the rest come from the
    AudioProcessor music library with prettified display names.
    """
    processor = AudioProcessor()
    available = processor.list_available_music()
    logger.info(f"Background music files found: {available}")
    choices = [("๐Ÿ”‡ No background music", "")]
    choices += [
        (f"๐ŸŽต {name.replace('_', ' ').replace('-', ' ').title()}", name)
        for name in available
    ]
    logger.info(f"Background music choices: {len(choices) - 1} options available")
    return choices
# --- Main Generation Function ---
@spaces.GPU
def generate_announcement(
    text: str,
    language: str,
    voice_choice: str,
    voice_audio: str | None = None,
    background_music: str = "",
    custom_music: str | None = None,
    music_volume: float = -15.0,
    fade_in: float = 0.5,
    fade_out: float = 0.5,
    seed: int = 0,
) -> tuple[int, np.ndarray]:
    """
    Generate a phone announcement.

    Args:
        text: Text to synthesize (supports long text with automatic sentence splitting)
        language: Language code
        voice_choice: Selected voice name, or the special "Voice cloning" entry
        voice_audio: Optional path to reference audio for voice cloning
        background_music: Name of preset background music file ("" = none)
        custom_music: Path to custom uploaded background music (takes priority)
        music_volume: Volume of background music in dB (default: -15)
        fade_in: Fade in duration in seconds
        fade_out: Fade out duration in seconds
        seed: Random seed (0 = random)

    Returns:
        Tuple of (sample_rate, audio_array) for Gradio audio component

    Raises:
        gr.Error: When the chosen voice is unknown, or cloning is required
            but no reference audio was provided.
    """
    engine = get_engine()
    # Select backend based on voice choice (currently always Chatterbox).
    backend_name = _resolve_backend_for_voice_choice(voice_choice)
    engine.set_backend(backend_name)
    # Set seed on all RNGs for reproducibility; 0 leaves generation random.
    if seed != 0:
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        if DEVICE == "cuda":
            torch.cuda.manual_seed_all(seed)
    # Voice resolution:
    # - Default voice: use voices/<name>.wav (local prompt)
    # - Voice cloning: use uploaded reference audio
    default_voices = _list_default_voices()
    if voice_choice != VOICE_CLONING_OPTION:
        if voice_choice not in default_voices:
            raise gr.Error(
                f"Unknown voice '{voice_choice}'. Add '{voice_choice}.wav' to '{_get_voices_dir()}' or select '{VOICE_CLONING_OPTION}'."
            )
        voice_audio = str(default_voices[voice_choice])
    else:
        # Force voice cloning when there are no default voices.
        if not _has_default_voices():
            if not voice_audio or not str(voice_audio).strip():
                raise gr.Error(
                    f"No default voices found in '{_get_voices_dir()}'. Please upload a reference audio sample for voice cloning."
                )
        # If default voices exist, keep previous behavior: fall back to a
        # per-language prompt when no reference audio was uploaded.
        if (
            voice_audio is None or not str(voice_audio).strip()
        ) and _has_default_voices():
            voice_audio = get_default_voice(language)
    # Determine which background music to use (custom upload takes priority).
    music_path = None
    if custom_music and str(custom_music).strip():
        music_path = custom_music
        logger.info(f"Using custom background music: {music_path}")
    elif background_music and str(background_music).strip():
        music_path = background_music
        logger.info(f"Using preset background music: {music_path}")
    logger.info(
        f"Generating: lang={language}, text='{text[:50]}...' ({len(text)} chars)"
    )
    # Generate audio (engine handles sentence splitting automatically).
    # If we have background music, we need to post-process the raw audio.
    if music_path:
        # Generate raw audio first (with sentence splitting for long texts).
        result = engine.generate_raw(
            text=text,
            language=language,
            voice_audio=voice_audio,
            split_sentences=True,
        )
        # Process with background music.
        from engine.audio_processor import AudioProcessingConfig, AudioProcessor

        processor = AudioProcessor(
            AudioProcessingConfig(
                background_music_path=music_path,
                music_volume_db=music_volume,
                fade_in_ms=int(fade_in * 1000),
                fade_out_ms=int(fade_out * 1000),
                padding_start_ms=int(
                    fade_in * 1000 * 1.2
                ),  # Slightly longer padding for fades
                padding_end_ms=int(fade_out * 1000 * 1.2),
            )
        )
        # Process and get encoded bytes (MP3, per the decode below).
        processed_bytes = processor.process(
            audio=result.audio,
            sample_rate=result.sample_rate,
        )
        # Convert back to numpy for Gradio.
        import io
        from pydub import AudioSegment

        audio_segment = AudioSegment.from_mp3(io.BytesIO(processed_bytes))
        samples = np.array(audio_segment.get_array_of_samples())
        # Convert to float32 normalized; assumes 16-bit samples from pydub
        # (hence /32768) — TODO confirm sample width.
        samples = samples.astype(np.float32) / 32768.0
        return (audio_segment.frame_rate, samples)
    else:
        # No background music, use direct generation.
        result = engine.generate(
            text=text,
            language=language,
            voice_audio=voice_audio,
            split_sentences=True,
        )
        return result
def on_language_change(language: str, voice_choice: str):
    """Handle language selection: refresh the example text.

    In voice-cloning mode the reference-audio upload is also cleared, since
    the previous sample likely no longer matches the new language.
    """
    example = get_example_text(language)
    if voice_choice == VOICE_CLONING_OPTION:
        return example, gr.update(value=None)
    return example, gr.update()
def on_voice_choice_change(voice_choice: str):
    """Switch UI elements depending on voice selection.

    Returns updates for (language dropdown, reference-audio widget, example
    text box). The reference-audio upload is visible only for the
    "Voice cloning" entry and is always cleared on a voice switch.
    """
    language_choices = get_language_choices_for_backend("chatterbox")
    # Prefer German when supported; otherwise the first available language.
    default_language = (
        "de"
        if any(code == "de" for _, code in language_choices)
        else (language_choices[0][1] if language_choices else "en")
    )
    show_voice_audio = voice_choice == VOICE_CLONING_OPTION
    return (
        gr.update(choices=language_choices, value=default_language),
        # Fix: the original `None if show_voice_audio else None` was a dead
        # conditional — both branches yield None, so just clear the value.
        gr.update(visible=show_voice_audio, value=None),
        gr.update(value=get_example_text(default_language)),
    )
# --- Gradio Interface ---
def create_interface():
    """Create the Gradio Blocks interface.

    Layout: a left column with voice/language/text inputs plus optional
    voice-cloning, background-music and advanced accordions; a right column
    with the generated audio and usage notes. Event handlers wire voice and
    language changes to UI updates and the button to generation.

    Fix: the announcement textbox now starts with the example text for the
    default language "de"; previously it showed the English example while
    the language dropdown defaulted to German.
    """
    with gr.Blocks(
        title="Phone Announcements Generator",
        theme=gr.themes.Soft(),
        css="""
        .main-title { text-align: center; margin-bottom: 1rem; }
        .generate-btn { min-height: 50px; font-size: 1.1rem; }
        """,
    ) as demo:
        gr.Markdown(
            """
            # ๐Ÿ“ž Phone Announcements Generator

            Create professional phone announcements with AI-powered speech synthesis.
            Supports 23 languages with optional voice cloning.

            ---
            """,
            elem_classes=["main-title"],
        )
        voices_dir = _get_voices_dir()
        gr.Markdown(
            f"""
            **Default voices folder:** `{voices_dir}`
            Put `.wav` files there named like `flozi.wav` โ†’ voice `flozi`.
            If the folder has no `.wav` files, the UI will force **Voice cloning**.
            """
        )
        with gr.Row():
            # Left column - Input
            with gr.Column(scale=1):
                voice_choices = get_voice_choices()
                default_voice_choice = (
                    voice_choices[0] if voice_choices else VOICE_CLONING_OPTION
                )
                voice_choice = gr.Dropdown(
                    choices=voice_choices,
                    value=default_voice_choice,
                    label="๐Ÿ—ฃ๏ธ Voice",
                    info="Default voices come from the voices folder. 'Voice cloning' uses uploaded reference audio.",
                )
                language = gr.Dropdown(
                    choices=get_language_choices_for_backend("chatterbox"),
                    value="de",
                    label="๐ŸŒ Language",
                    info="Choose the language of the announcement",
                )
                text = gr.Textbox(
                    # Keep the initial example consistent with the default
                    # language above.
                    value=get_example_text("de"),
                    label="๐Ÿ“ Announcement Text",
                    placeholder="Enter your phone announcement text here...",
                    lines=5,
                    max_lines=15,
                    info="Long texts will be automatically split into sentences",
                )
                with gr.Accordion("๐ŸŽค Voice Settings (Optional)", open=False):
                    voice_audio = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Reference audio for voice cloning",
                        visible=(default_voice_choice == VOICE_CLONING_OPTION),
                        value=None,
                    )
                    gr.Markdown(
                        """
                        ๐Ÿ’ก **Tip:** Upload a short audio sample to clone a voice.
                        The default voice will be used if no sample is provided.
                        """
                    )
                with gr.Accordion("๐ŸŽต Background Music (Optional)", open=False):
                    background_music = gr.Dropdown(
                        choices=get_background_music_choices(),
                        value="",
                        label="Preset music",
                        info="Choose background music from the library",
                    )
                    custom_music = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Or upload custom music",
                        elem_id="custom_music",
                    )
                    music_volume = gr.Slider(
                        minimum=-30,
                        maximum=0,
                        value=-15,
                        step=1,
                        label="๐Ÿ”Š Music volume (dB)",
                        info="Background music volume relative to speech",
                    )
                    with gr.Row():
                        fade_in = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โซ Fade In (sec.)",
                            info="Fade-In duration",
                        )
                        fade_out = gr.Slider(
                            minimum=0,
                            maximum=3,
                            value=0.5,
                            step=0.1,
                            label="โฌ Fade Out (sec.)",
                            info="Fade-Out duration",
                        )
                    gr.Markdown(
                        """
                        ๐Ÿ’ก **Note:** Uploaded custom music takes precedence over the selection.
                        Music will be automatically looped and trimmed to the announcement length.
                        """
                    )
                with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                    seed = gr.Number(
                        value=0,
                        label="Random seed",
                        info="0 = random, other values for reproducibility",
                        precision=0,
                    )
                generate_btn = gr.Button(
                    "๐ŸŽ™๏ธ Generate Announcement",
                    variant="primary",
                    elem_classes=["generate-btn"],
                )
            # Right column - Output
            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="๐Ÿ“ข Generated Announcement", type="numpy", interactive=False
                )
                gr.Markdown(
                    """
                    ### โ„น๏ธ Notes
                    - Generation can take a few seconds
                    - Long texts will be automatically split into sentences
                    - Reference audio should be 5-15 seconds long
                    - Background music will be looped automatically

                    ---

                    **Supported languages:** German, English, French, Spanish,
                    Italian, Dutch, Polish, Portuguese, Russian,
                    Turkish, Arabic, Chinese, Japanese, Korean, Hindi,
                    Danish, Greek, Finnish, Hebrew, Malay, Norwegian,
                    Swedish, Swahili
                    """
                )
        # Event handlers
        voice_choice.change(
            fn=on_voice_choice_change,
            inputs=[voice_choice],
            outputs=[language, voice_audio, text],
            show_progress=False,
        )
        language.change(
            fn=on_language_change,
            inputs=[language, voice_choice],
            outputs=[text, voice_audio],
            show_progress=False,
        )
        generate_btn.click(
            fn=generate_announcement,
            inputs=[
                text,
                language,
                voice_choice,
                voice_audio,
                background_music,
                custom_music,
                music_volume,
                fade_in,
                fade_out,
                seed,
            ],
            outputs=[audio_output],
        )
    return demo
# --- Main ---
if __name__ == "__main__":
    # Launch the app listening on all interfaces (e.g. inside a container)
    # on the standard Gradio port, without a public share link.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)