Spaces:
Sleeping
Sleeping
UI changes: support only 2 languages, hard-lock the exaggeration setting, and apply formatting updates.
Browse files
- app.py +46 -53
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
import random
|
|
|
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
import torch
|
| 4 |
-
|
| 5 |
-
import
|
| 6 |
|
| 7 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 8 |
print(f"🚀 Running on device: {DEVICE}")
|
|
@@ -12,19 +14,17 @@ MODEL = None
|
|
| 12 |
|
| 13 |
LANGUAGE_CONFIG = {
|
| 14 |
"da": {
|
| 15 |
-
"audio_options": {
|
| 16 |
-
"mic": "voices/mic.wav",
|
| 17 |
-
"nic": "voices/nic.wav"
|
| 18 |
-
},
|
| 19 |
"default_audio": "voices/mic.wav", # Default to mic
|
| 20 |
-
"text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
|
| 21 |
},
|
| 22 |
"en": {
|
| 23 |
"audio": "voices/en_f1.flac",
|
| 24 |
-
"text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
|
| 25 |
},
|
| 26 |
}
|
| 27 |
|
|
|
|
| 28 |
# --- UI Helpers ---
|
| 29 |
def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
|
| 30 |
config = LANGUAGE_CONFIG.get(lang, {})
|
|
@@ -47,12 +47,12 @@ def get_supported_languages_display() -> str:
|
|
| 47 |
language_items = []
|
| 48 |
for code, name in sorted(SUPPORTED_LANGUAGES.items()):
|
| 49 |
language_items.append(f"**{name}** (`{code}`)")
|
| 50 |
-
|
| 51 |
# Split into 2 lines
|
| 52 |
mid = len(language_items) // 2
|
| 53 |
line1 = " • ".join(language_items[:mid])
|
| 54 |
line2 = " • ".join(language_items[mid:])
|
| 55 |
-
|
| 56 |
return f"""
|
| 57 |
### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
|
| 58 |
{line1}
|
|
@@ -69,7 +69,7 @@ def get_or_load_model():
|
|
| 69 |
print("Model not loaded, initializing...")
|
| 70 |
try:
|
| 71 |
MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
|
| 72 |
-
if hasattr(MODEL,
|
| 73 |
MODEL.to(DEVICE)
|
| 74 |
print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
|
| 75 |
except Exception as e:
|
|
@@ -77,12 +77,14 @@ def get_or_load_model():
|
|
| 77 |
raise
|
| 78 |
return MODEL
|
| 79 |
|
|
|
|
| 80 |
# Attempt to load the model at startup.
|
| 81 |
try:
|
| 82 |
get_or_load_model()
|
| 83 |
except Exception as e:
|
| 84 |
print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
|
| 85 |
|
|
|
|
| 86 |
def set_seed(seed: int):
|
| 87 |
"""Sets the random seed for reproducibility across torch, numpy, and random."""
|
| 88 |
torch.manual_seed(seed)
|
|
@@ -91,7 +93,8 @@ def set_seed(seed: int):
|
|
| 91 |
torch.cuda.manual_seed_all(seed)
|
| 92 |
random.seed(seed)
|
| 93 |
np.random.seed(seed)
|
| 94 |
-
|
|
|
|
| 95 |
def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
|
| 96 |
"""
|
| 97 |
Decide which audio prompt to use:
|
|
@@ -112,14 +115,14 @@ def generate_tts_audio(
|
|
| 112 |
exaggeration_input: float = 0.5,
|
| 113 |
temperature_input: float = 0.8,
|
| 114 |
seed_num_input: int = 0,
|
| 115 |
-
cfgw_input: float = 0.5
|
| 116 |
) -> tuple[int, np.ndarray]:
|
| 117 |
"""
|
| 118 |
Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
|
| 119 |
Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
|
| 120 |
-
|
| 121 |
-
This tool synthesizes natural-sounding speech from input text. When a reference audio file
|
| 122 |
-
is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
|
| 123 |
maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
|
| 124 |
|
| 125 |
Args:
|
|
@@ -129,7 +132,7 @@ def generate_tts_audio(
|
|
| 129 |
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
| 130 |
temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
|
| 131 |
seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
|
| 132 |
-
cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
|
| 133 |
|
| 134 |
Returns:
|
| 135 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
|
@@ -143,7 +146,7 @@ def generate_tts_audio(
|
|
| 143 |
set_seed(int(seed_num_input))
|
| 144 |
|
| 145 |
print(f"Generating audio for text: '{text_input[:50]}...'")
|
| 146 |
-
|
| 147 |
# Handle optional audio prompt
|
| 148 |
chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
|
| 149 |
|
|
@@ -157,71 +160,64 @@ def generate_tts_audio(
|
|
| 157 |
print(f"Using audio prompt: {chosen_prompt}")
|
| 158 |
else:
|
| 159 |
print("No audio prompt provided; using default voice.")
|
| 160 |
-
|
| 161 |
wav = current_model.generate(
|
| 162 |
text_input[:300], # Truncate text to max chars
|
| 163 |
language_id=language_id,
|
| 164 |
-
**generate_kwargs
|
| 165 |
)
|
| 166 |
print("Audio generation complete.")
|
| 167 |
return (current_model.sr, wav.squeeze(0).numpy())
|
| 168 |
|
|
|
|
| 169 |
with gr.Blocks() as demo:
|
| 170 |
gr.Markdown(
|
| 171 |
"""
|
| 172 |
# Chatterbox Multilingual Demo
|
| 173 |
-
Generate high-quality
|
| 174 |
"""
|
| 175 |
)
|
| 176 |
-
|
| 177 |
# Display supported languages
|
| 178 |
gr.Markdown(get_supported_languages_display())
|
| 179 |
with gr.Row():
|
| 180 |
with gr.Column():
|
| 181 |
initial_lang = "da"
|
| 182 |
-
text = gr.Textbox(
|
| 183 |
-
|
| 184 |
-
label="Text to synthesize (max chars 300)",
|
| 185 |
-
max_lines=5
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
language_id = gr.Dropdown(
|
| 189 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 190 |
value=initial_lang,
|
| 191 |
label="Language",
|
| 192 |
-
info="Select the language for text-to-speech synthesis"
|
| 193 |
)
|
| 194 |
-
|
| 195 |
danish_voice = gr.Dropdown(
|
| 196 |
choices=get_danish_voice_options(),
|
| 197 |
value="mic",
|
| 198 |
label="Danish Voice Selection",
|
| 199 |
info="Choose between different Danish voice options",
|
| 200 |
-
visible=(initial_lang == "da")
|
| 201 |
)
|
| 202 |
-
|
| 203 |
ref_wav = gr.Audio(
|
| 204 |
sources=["upload", "microphone"],
|
| 205 |
type="filepath",
|
| 206 |
label="Reference Audio File (Optional)",
|
| 207 |
-
value=default_audio_for_ui(initial_lang)
|
| 208 |
)
|
| 209 |
-
|
| 210 |
gr.Markdown(
|
| 211 |
"💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
|
| 212 |
-
elem_classes=["audio-note"]
|
| 213 |
-
)
|
| 214 |
-
|
| 215 |
-
exaggeration = gr.Slider(
|
| 216 |
-
0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
|
| 217 |
-
)
|
| 218 |
-
cfg_weight = gr.Slider(
|
| 219 |
-
0.2, 1, step=.05, label="CFG/Pace", value=0.5
|
| 220 |
)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
| 222 |
with gr.Accordion("More options", open=False):
|
| 223 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 224 |
-
temp = gr.Slider(0.05, 5, step
|
| 225 |
|
| 226 |
run_btn = gr.Button("Generate", variant="primary")
|
| 227 |
|
|
@@ -229,13 +225,13 @@ with gr.Blocks() as demo:
|
|
| 229 |
audio_output = gr.Audio(label="Output Audio")
|
| 230 |
|
| 231 |
def on_language_change(lang, current_ref, current_text):
|
| 232 |
-
is_danish =
|
| 233 |
danish_voice_val = "mic" if is_danish else "mic" # Default to mic
|
| 234 |
return (
|
| 235 |
-
default_audio_for_ui(lang, danish_voice_val),
|
| 236 |
-
default_text_for_ui(lang),
|
| 237 |
gr.update(visible=is_danish), # Update Danish voice dropdown visibility
|
| 238 |
-
danish_voice_val
|
| 239 |
)
|
| 240 |
|
| 241 |
def on_danish_voice_change(lang, danish_voice_val):
|
|
@@ -247,14 +243,11 @@ with gr.Blocks() as demo:
|
|
| 247 |
fn=on_language_change,
|
| 248 |
inputs=[language_id, ref_wav, text],
|
| 249 |
outputs=[ref_wav, text, danish_voice, danish_voice],
|
| 250 |
-
show_progress=False
|
| 251 |
)
|
| 252 |
|
| 253 |
danish_voice.change(
|
| 254 |
-
fn=on_danish_voice_change,
|
| 255 |
-
inputs=[language_id, danish_voice],
|
| 256 |
-
outputs=[ref_wav],
|
| 257 |
-
show_progress=False
|
| 258 |
)
|
| 259 |
|
| 260 |
run_btn.click(
|
|
@@ -272,4 +265,4 @@ with gr.Blocks() as demo:
|
|
| 272 |
outputs=[audio_output],
|
| 273 |
)
|
| 274 |
|
| 275 |
-
demo.launch()
|
|
|
|
| 1 |
import random
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
import numpy as np
|
| 5 |
import torch
|
| 6 |
+
|
| 7 |
+
from src.chatterbox.mtl_tts import SUPPORTED_LANGUAGES, ChatterboxMultilingualTTS
|
| 8 |
|
| 9 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 10 |
print(f"🚀 Running on device: {DEVICE}")
|
|
|
|
| 14 |
|
| 15 |
LANGUAGE_CONFIG = {
|
| 16 |
"da": {
|
| 17 |
+
"audio_options": {"mic": "voices/mic.wav", "nic": "voices/nic.wav"},
|
|
|
|
|
|
|
|
|
|
| 18 |
"default_audio": "voices/mic.wav", # Default to mic
|
| 19 |
+
"text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal.",
|
| 20 |
},
|
| 21 |
"en": {
|
| 22 |
"audio": "voices/en_f1.flac",
|
| 23 |
+
"text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
|
| 24 |
},
|
| 25 |
}
|
| 26 |
|
| 27 |
+
|
| 28 |
# --- UI Helpers ---
|
| 29 |
def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
|
| 30 |
config = LANGUAGE_CONFIG.get(lang, {})
|
|
|
|
| 47 |
language_items = []
|
| 48 |
for code, name in sorted(SUPPORTED_LANGUAGES.items()):
|
| 49 |
language_items.append(f"**{name}** (`{code}`)")
|
| 50 |
+
|
| 51 |
# Split into 2 lines
|
| 52 |
mid = len(language_items) // 2
|
| 53 |
line1 = " • ".join(language_items[:mid])
|
| 54 |
line2 = " • ".join(language_items[mid:])
|
| 55 |
+
|
| 56 |
return f"""
|
| 57 |
### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
|
| 58 |
{line1}
|
|
|
|
| 69 |
print("Model not loaded, initializing...")
|
| 70 |
try:
|
| 71 |
MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
|
| 72 |
+
if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
|
| 73 |
MODEL.to(DEVICE)
|
| 74 |
print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
|
| 75 |
except Exception as e:
|
|
|
|
| 77 |
raise
|
| 78 |
return MODEL
|
| 79 |
|
| 80 |
+
|
| 81 |
# Attempt to load the model at startup.
|
| 82 |
try:
|
| 83 |
get_or_load_model()
|
| 84 |
except Exception as e:
|
| 85 |
print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
|
| 86 |
|
| 87 |
+
|
| 88 |
def set_seed(seed: int):
|
| 89 |
"""Sets the random seed for reproducibility across torch, numpy, and random."""
|
| 90 |
torch.manual_seed(seed)
|
|
|
|
| 93 |
torch.cuda.manual_seed_all(seed)
|
| 94 |
random.seed(seed)
|
| 95 |
np.random.seed(seed)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
|
| 99 |
"""
|
| 100 |
Decide which audio prompt to use:
|
|
|
|
| 115 |
exaggeration_input: float = 0.5,
|
| 116 |
temperature_input: float = 0.8,
|
| 117 |
seed_num_input: int = 0,
|
| 118 |
+
cfgw_input: float = 0.5,
|
| 119 |
) -> tuple[int, np.ndarray]:
|
| 120 |
"""
|
| 121 |
Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
|
| 122 |
Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
|
| 123 |
+
|
| 124 |
+
This tool synthesizes natural-sounding speech from input text. When a reference audio file
|
| 125 |
+
is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
|
| 126 |
maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
|
| 127 |
|
| 128 |
Args:
|
|
|
|
| 132 |
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
| 133 |
temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
|
| 134 |
seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
|
| 135 |
+
cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
|
| 136 |
|
| 137 |
Returns:
|
| 138 |
tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
|
|
|
|
| 146 |
set_seed(int(seed_num_input))
|
| 147 |
|
| 148 |
print(f"Generating audio for text: '{text_input[:50]}...'")
|
| 149 |
+
|
| 150 |
# Handle optional audio prompt
|
| 151 |
chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
|
| 152 |
|
|
|
|
| 160 |
print(f"Using audio prompt: {chosen_prompt}")
|
| 161 |
else:
|
| 162 |
print("No audio prompt provided; using default voice.")
|
| 163 |
+
|
| 164 |
wav = current_model.generate(
|
| 165 |
text_input[:300], # Truncate text to max chars
|
| 166 |
language_id=language_id,
|
| 167 |
+
**generate_kwargs,
|
| 168 |
)
|
| 169 |
print("Audio generation complete.")
|
| 170 |
return (current_model.sr, wav.squeeze(0).numpy())
|
| 171 |
|
| 172 |
+
|
| 173 |
with gr.Blocks() as demo:
|
| 174 |
gr.Markdown(
|
| 175 |
"""
|
| 176 |
# Chatterbox Multilingual Demo
|
| 177 |
+
Generate high-quality danish speech from text with reference audio styling.
|
| 178 |
"""
|
| 179 |
)
|
| 180 |
+
|
| 181 |
# Display supported languages
|
| 182 |
gr.Markdown(get_supported_languages_display())
|
| 183 |
with gr.Row():
|
| 184 |
with gr.Column():
|
| 185 |
initial_lang = "da"
|
| 186 |
+
text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text to synthesize (max chars 300)", max_lines=5)
|
| 187 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
language_id = gr.Dropdown(
|
| 189 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 190 |
value=initial_lang,
|
| 191 |
label="Language",
|
| 192 |
+
info="Select the language for text-to-speech synthesis",
|
| 193 |
)
|
| 194 |
+
|
| 195 |
danish_voice = gr.Dropdown(
|
| 196 |
choices=get_danish_voice_options(),
|
| 197 |
value="mic",
|
| 198 |
label="Danish Voice Selection",
|
| 199 |
info="Choose between different Danish voice options",
|
| 200 |
+
visible=(initial_lang == "da"),
|
| 201 |
)
|
| 202 |
+
|
| 203 |
ref_wav = gr.Audio(
|
| 204 |
sources=["upload", "microphone"],
|
| 205 |
type="filepath",
|
| 206 |
label="Reference Audio File (Optional)",
|
| 207 |
+
value=default_audio_for_ui(initial_lang),
|
| 208 |
)
|
| 209 |
+
|
| 210 |
gr.Markdown(
|
| 211 |
"💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
|
| 212 |
+
elem_classes=["audio-note"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
)
|
| 214 |
|
| 215 |
+
exaggeration = 0.5
|
| 216 |
+
cfg_weight = gr.Slider(0.2, 1, step=0.05, label="CFG/Pace", value=0.5)
|
| 217 |
+
|
| 218 |
with gr.Accordion("More options", open=False):
|
| 219 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 220 |
+
temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
|
| 221 |
|
| 222 |
run_btn = gr.Button("Generate", variant="primary")
|
| 223 |
|
|
|
|
| 225 |
audio_output = gr.Audio(label="Output Audio")
|
| 226 |
|
| 227 |
def on_language_change(lang, current_ref, current_text):
|
| 228 |
+
is_danish = lang == "da"
|
| 229 |
danish_voice_val = "mic" if is_danish else "mic" # Default to mic
|
| 230 |
return (
|
| 231 |
+
default_audio_for_ui(lang, danish_voice_val),
|
| 232 |
+
default_text_for_ui(lang),
|
| 233 |
gr.update(visible=is_danish), # Update Danish voice dropdown visibility
|
| 234 |
+
danish_voice_val,
|
| 235 |
)
|
| 236 |
|
| 237 |
def on_danish_voice_change(lang, danish_voice_val):
|
|
|
|
| 243 |
fn=on_language_change,
|
| 244 |
inputs=[language_id, ref_wav, text],
|
| 245 |
outputs=[ref_wav, text, danish_voice, danish_voice],
|
| 246 |
+
show_progress=False,
|
| 247 |
)
|
| 248 |
|
| 249 |
danish_voice.change(
|
| 250 |
+
fn=on_danish_voice_change, inputs=[language_id, danish_voice], outputs=[ref_wav], show_progress=False
|
|
|
|
|
|
|
|
|
|
| 251 |
)
|
| 252 |
|
| 253 |
run_btn.click(
|
|
|
|
| 265 |
outputs=[audio_output],
|
| 266 |
)
|
| 267 |
|
| 268 |
+
demo.launch() # mcp_server=True
|
requirements.txt
CHANGED
|
@@ -9,4 +9,4 @@ omegaconf==2.3.0
|
|
| 9 |
resemble-perth==1.0.1
|
| 10 |
silero-vad==5.1.2
|
| 11 |
conformer==0.3.2
|
| 12 |
-
safetensors
|
|
|
|
| 9 |
resemble-perth==1.0.1
|
| 10 |
silero-vad==5.1.2
|
| 11 |
conformer==0.3.2
|
| 12 |
+
safetensors
|