Spaces:
Running
Running
fixed saved voice persistence on reload
Browse files- app.py +95 -122
- voices.json +1 -1
app.py
CHANGED
|
@@ -16,6 +16,10 @@ import nltk
|
|
| 16 |
matplotlib.use("Agg") # Use non-interactive backend
|
| 17 |
import matplotlib.pyplot as plt
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
from text2speech import tts_randomized, parse_speed, tts_with_style_vector
|
| 20 |
|
| 21 |
# Constants and Paths
|
|
@@ -47,7 +51,6 @@ print(f"Using device: {device}")
|
|
| 47 |
# LOAD PCA MODEL AND ANNOTATED FEATURES
|
| 48 |
##############################################################################
|
| 49 |
|
| 50 |
-
# Load PCA model and annotated features
|
| 51 |
try:
|
| 52 |
pca = joblib.load(PCA_MODEL_PATH)
|
| 53 |
print("PCA model loaded successfully.")
|
|
@@ -90,12 +93,7 @@ def save_voices_json(data, path=VOICES_JSON_PATH):
|
|
| 90 |
def update_sliders(voice_name):
|
| 91 |
"""
|
| 92 |
Update slider values based on the selected predefined voice using reverse PCA.
|
| 93 |
-
|
| 94 |
-
Args:
|
| 95 |
-
voice_name (str): The name of the selected voice.
|
| 96 |
-
|
| 97 |
-
Returns:
|
| 98 |
-
list: A list of PCA component values to set the sliders.
|
| 99 |
"""
|
| 100 |
if not voice_name:
|
| 101 |
# Return default slider values (e.g., zeros) if no voice is selected
|
|
@@ -124,24 +122,16 @@ def update_sliders(voice_name):
|
|
| 124 |
def generate_audio_with_voice(text, voice_key, speed_val):
|
| 125 |
"""
|
| 126 |
Generate audio using the style vector of the selected predefined voice.
|
| 127 |
-
|
| 128 |
-
Args:
|
| 129 |
-
text (str): The text to synthesize.
|
| 130 |
-
voice_key (str): The name of the selected voice.
|
| 131 |
-
speed_val (float): The speed multiplier.
|
| 132 |
-
|
| 133 |
-
Returns:
|
| 134 |
-
tuple: (audio_tuple, style_vector)
|
| 135 |
"""
|
| 136 |
try:
|
| 137 |
# Load voices data
|
| 138 |
voices_data = load_voices_json()
|
| 139 |
-
|
| 140 |
if voice_key not in voices_data:
|
| 141 |
-
|
| 142 |
-
|
|
|
|
| 143 |
|
| 144 |
-
# Retrieve the style vector for the selected voice
|
| 145 |
style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
|
| 146 |
print(f"Selected Voice: {voice_key}")
|
| 147 |
print(f"Style Vector (First 6): {style_vector[0][:6]}")
|
|
@@ -149,7 +139,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
| 149 |
# Convert to torch tensor and move to device
|
| 150 |
style_vec_torch = torch.from_numpy(style_vector).float().to(device)
|
| 151 |
|
| 152 |
-
# Generate audio
|
| 153 |
audio_np = tts_with_style_vector(
|
| 154 |
text,
|
| 155 |
style_vec=style_vec_torch,
|
|
@@ -161,14 +151,12 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
| 161 |
)
|
| 162 |
|
| 163 |
if audio_np is None:
|
| 164 |
-
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
-
|
| 168 |
-
sr = 24000 # Adjust based on your actual sampling rate
|
| 169 |
audio_tuple = (sr, audio_np)
|
| 170 |
-
|
| 171 |
-
# Return audio, image, and style vector
|
| 172 |
return audio_tuple, style_vector.tolist()
|
| 173 |
|
| 174 |
except Exception as e:
|
|
@@ -177,7 +165,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
|
|
| 177 |
|
| 178 |
|
| 179 |
def build_modified_vector(voice_key, top6_values):
|
| 180 |
-
"""
|
| 181 |
voices_data = load_voices_json()
|
| 182 |
if voice_key not in voices_data:
|
| 183 |
print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
|
|
@@ -189,7 +177,6 @@ def build_modified_vector(voice_key, top6_values):
|
|
| 189 |
return None
|
| 190 |
|
| 191 |
try:
|
| 192 |
-
# Reconstruct the style vector using inverse PCA
|
| 193 |
pca_components = np.array(top6_values).reshape(1, -1)
|
| 194 |
reconstructed_vec = pca.inverse_transform(pca_components)[0]
|
| 195 |
return reconstructed_vec
|
|
@@ -198,39 +185,18 @@ def build_modified_vector(voice_key, top6_values):
|
|
| 198 |
return None
|
| 199 |
|
| 200 |
|
| 201 |
-
def
|
| 202 |
-
"""
|
| 203 |
-
Reconstruct the 256-dimensional style vector from PCA components.
|
| 204 |
-
"""
|
| 205 |
-
if pca is None:
|
| 206 |
-
print("PCA model is not loaded.")
|
| 207 |
-
return None
|
| 208 |
-
try:
|
| 209 |
-
return pca.inverse_transform([pca_components])[0]
|
| 210 |
-
except Exception as e:
|
| 211 |
-
print(f"Error during inverse PCA transform: {e}")
|
| 212 |
-
return None
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values):
|
| 216 |
"""
|
| 217 |
-
Generate audio
|
| 218 |
-
Returns
|
| 219 |
-
- audio tuple (sr, np_array) for Gradio's Audio
|
| 220 |
-
- a PIL Image representing the style vector plot
|
| 221 |
-
- the final style vector as a list for State
|
| 222 |
"""
|
| 223 |
try:
|
| 224 |
-
speed_val = parse_speed(speed_str)
|
| 225 |
-
print(f"Parsed speed: {speed_val}")
|
| 226 |
-
|
| 227 |
if randomize:
|
| 228 |
# Generate randomized style vector
|
| 229 |
audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
|
| 230 |
if random_style_vec is None:
|
| 231 |
print("Failed to generate randomized style vector.")
|
| 232 |
-
return None, None
|
| 233 |
-
# Ensure the style vector is flat and on device
|
| 234 |
final_vec = (
|
| 235 |
random_style_vec.cpu().numpy().flatten()
|
| 236 |
if isinstance(random_style_vec, torch.Tensor)
|
|
@@ -238,20 +204,15 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
|
|
| 238 |
)
|
| 239 |
print("Randomized Style Vector (First 6):", final_vec[:6])
|
| 240 |
else:
|
| 241 |
-
# Reconstruct
|
| 242 |
reconstructed_vec = build_modified_vector(voice_key, slider_values)
|
| 243 |
if reconstructed_vec is None:
|
| 244 |
-
print(
|
| 245 |
-
|
| 246 |
-
)
|
| 247 |
-
return None, None, None
|
| 248 |
|
| 249 |
-
# Convert to torch tensor and move to device
|
| 250 |
style_vec_torch = (
|
| 251 |
torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
|
| 252 |
)
|
| 253 |
-
|
| 254 |
-
# Generate audio with the reconstructed style vector
|
| 255 |
audio_np = tts_with_style_vector(
|
| 256 |
text,
|
| 257 |
style_vec=style_vec_torch,
|
|
@@ -266,22 +227,22 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
|
|
| 266 |
|
| 267 |
if audio_np is None:
|
| 268 |
print("Audio generation failed.")
|
| 269 |
-
return None, None
|
| 270 |
|
| 271 |
-
|
| 272 |
-
sr = 24000 # Adjust based on your actual sampling rate
|
| 273 |
audio_tuple = (sr, audio_np)
|
| 274 |
-
|
| 275 |
-
# Return audio, image, and style vector
|
| 276 |
return audio_tuple, final_vec.tolist()
|
| 277 |
|
| 278 |
except Exception as e:
|
| 279 |
-
print(f"Error generating audio and style
|
| 280 |
-
return None, None
|
| 281 |
|
| 282 |
|
| 283 |
def save_style_to_json(style_data, style_name):
|
| 284 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 285 |
if not style_name.strip():
|
| 286 |
return "Please enter a new style name before saving."
|
| 287 |
|
|
@@ -291,37 +252,37 @@ def save_style_to_json(style_data, style_name):
|
|
| 291 |
f"Style name '{style_name}' already exists. Please choose a different name."
|
| 292 |
)
|
| 293 |
|
| 294 |
-
# Ensure the style_data has the correct length
|
| 295 |
if len(style_data) != VECTOR_DIMENSION:
|
| 296 |
return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
|
| 297 |
|
| 298 |
-
# Save the style vector
|
| 299 |
voices_data[style_name] = style_data
|
| 300 |
save_voices_json(voices_data)
|
| 301 |
return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
|
| 302 |
|
| 303 |
|
| 304 |
-
# Gradio Interface Functions
|
| 305 |
-
|
| 306 |
-
|
| 307 |
def rearrange_voices(new_order):
|
| 308 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 309 |
voices_data = load_voices_json()
|
| 310 |
new_order_list = [name.strip() for name in new_order.split(",")]
|
| 311 |
if not all(name in voices_data for name in new_order_list):
|
| 312 |
return "Error: New order contains invalid voice names.", list(
|
| 313 |
voices_data.keys()
|
| 314 |
)
|
|
|
|
| 315 |
ordered_data = OrderedDict()
|
| 316 |
for name in new_order_list:
|
| 317 |
ordered_data[name] = voices_data[name]
|
|
|
|
| 318 |
save_voices_json(ordered_data)
|
| 319 |
print(f"Voices rearranged: {list(ordered_data.keys())}")
|
| 320 |
return "Voices rearranged successfully.", list(ordered_data.keys())
|
| 321 |
|
| 322 |
|
| 323 |
def delete_voice(selected):
|
| 324 |
-
"""Delete voices from the voices.json."""
|
| 325 |
if not selected:
|
| 326 |
return "No voices selected for deletion.", list(load_voices_json().keys())
|
| 327 |
voices_data = load_voices_json()
|
|
@@ -334,14 +295,15 @@ def delete_voice(selected):
|
|
| 334 |
|
| 335 |
|
| 336 |
def upload_new_voices(uploaded_file):
|
| 337 |
-
"""Upload new voices from a JSON file."""
|
| 338 |
if uploaded_file is None:
|
| 339 |
return "No file uploaded.", list(load_voices_json().keys())
|
| 340 |
try:
|
| 341 |
uploaded_data = json.load(uploaded_file)
|
| 342 |
if not isinstance(uploaded_data, dict):
|
| 343 |
-
return
|
| 344 |
-
|
|
|
|
| 345 |
)
|
| 346 |
voices_data = load_voices_json()
|
| 347 |
voices_data.update(uploaded_data)
|
|
@@ -352,10 +314,13 @@ def upload_new_voices(uploaded_file):
|
|
| 352 |
return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
|
| 353 |
|
| 354 |
|
| 355 |
-
#
|
|
|
|
|
|
|
| 356 |
|
| 357 |
|
| 358 |
def create_combined_interface():
|
|
|
|
| 359 |
voices_data = load_voices_json()
|
| 360 |
voice_choices = list(voices_data.keys())
|
| 361 |
default_voice = voice_choices[0] if voice_choices else None
|
|
@@ -367,16 +332,12 @@ def create_combined_interface():
|
|
| 367 |
}
|
| 368 |
"""
|
| 369 |
|
| 370 |
-
def refresh_voices():
|
| 371 |
-
"""Refresh the voices by reloading the JSON."""
|
| 372 |
-
new_choices = list(load_voices_json().keys())
|
| 373 |
-
print(f"Voices refreshed: {new_choices}")
|
| 374 |
-
return gr.Dropdown(choices=new_choices)
|
| 375 |
-
|
| 376 |
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
|
| 377 |
gr.Markdown("# StyleTTS2 Studio - Build custom voices")
|
| 378 |
|
| 379 |
-
#
|
|
|
|
|
|
|
| 380 |
with gr.Tab("Text-to-Speech"):
|
| 381 |
gr.Markdown("### Generate Speech with Predefined Voices")
|
| 382 |
|
|
@@ -399,28 +360,29 @@ def create_combined_interface():
|
|
| 399 |
label="Speed (%)",
|
| 400 |
value=120,
|
| 401 |
)
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
audio_output = gr.Audio(label="Synthesized Audio")
|
| 406 |
|
| 407 |
-
# Generate
|
| 408 |
def on_generate_tts(text, voice, speed):
|
| 409 |
if not voice:
|
| 410 |
return None, "No voice selected."
|
| 411 |
speed_val = speed / 100 # Convert percentage to multiplier
|
| 412 |
-
|
| 413 |
-
if
|
| 414 |
-
return None,
|
| 415 |
-
return
|
| 416 |
|
| 417 |
generate_btn.click(
|
| 418 |
fn=on_generate_tts,
|
| 419 |
inputs=[text_input, voice_dropdown, speed_slider],
|
| 420 |
-
outputs=[audio_output,
|
| 421 |
)
|
| 422 |
|
| 423 |
-
#
|
|
|
|
|
|
|
| 424 |
with gr.Tab("Voice Studio"):
|
| 425 |
gr.Markdown("### Customize and Create New Voices")
|
| 426 |
|
|
@@ -463,18 +425,16 @@ def create_combined_interface():
|
|
| 463 |
# State to hold the last style vector
|
| 464 |
style_vector_state_studio = gr.State()
|
| 465 |
|
| 466 |
-
# Generate
|
| 467 |
def on_generate_studio(text, voice, speed, *pca_values):
|
| 468 |
if not voice:
|
| 469 |
return None, "No voice selected.", None
|
| 470 |
-
speed_val = speed / 100
|
| 471 |
-
|
| 472 |
text, voice, False, speed_val, *pca_values
|
| 473 |
)
|
| 474 |
-
if
|
| 475 |
return None, "Failed to generate audio.", None
|
| 476 |
-
audio_tuple, style_vector = result
|
| 477 |
-
style_vector_state_studio.value = style_vector
|
| 478 |
return audio_tuple, "Audio generated successfully.", style_vector
|
| 479 |
|
| 480 |
generate_btn_studio.click(
|
|
@@ -484,43 +444,56 @@ def create_combined_interface():
|
|
| 484 |
outputs=[audio_output_studio, status_text, style_vector_state_studio],
|
| 485 |
)
|
| 486 |
|
|
|
|
| 487 |
def on_save_style_studio(style_vector, style_name):
|
| 488 |
-
|
|
|
|
| 489 |
return (
|
| 490 |
-
"Please enter a name for the new voice!",
|
| 491 |
-
gr.
|
| 492 |
-
|
| 493 |
-
), # Return a new Dropdown instance with empty choices
|
| 494 |
-
gr.Dropdown(
|
| 495 |
-
choices=[]
|
| 496 |
-
), # Return a new Dropdown instance with empty choices
|
| 497 |
)
|
|
|
|
| 498 |
result = save_style_to_json(style_vector, style_name)
|
|
|
|
| 499 |
new_choices = list(load_voices_json().keys())
|
| 500 |
-
|
|
|
|
| 501 |
return (
|
| 502 |
-
gr.
|
| 503 |
-
|
| 504 |
-
),
|
| 505 |
-
gr.Dropdown(
|
| 506 |
-
choices=new_choices
|
| 507 |
-
), # Return a new Dropdown instance with updated choices
|
| 508 |
-
result, # Status message
|
| 509 |
)
|
| 510 |
|
| 511 |
save_btn_studio.click(
|
| 512 |
fn=on_save_style_studio,
|
| 513 |
inputs=[style_vector_state_studio, new_style_name],
|
| 514 |
-
|
|
|
|
| 515 |
)
|
| 516 |
|
| 517 |
-
#
|
| 518 |
voice_dropdown_studio.change(
|
| 519 |
fn=update_sliders,
|
| 520 |
inputs=voice_dropdown_studio,
|
| 521 |
outputs=pca_sliders,
|
| 522 |
)
|
| 523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
gr.Markdown(
|
| 525 |
"#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
|
| 526 |
)
|
|
@@ -531,6 +504,6 @@ def create_combined_interface():
|
|
| 531 |
if __name__ == "__main__":
|
| 532 |
try:
|
| 533 |
interface = create_combined_interface()
|
| 534 |
-
interface.launch(share=False)
|
| 535 |
except Exception as e:
|
| 536 |
print(f"An error occurred while launching the interface: {e}")
|
|
|
|
| 16 |
matplotlib.use("Agg") # Use non-interactive backend
|
| 17 |
import matplotlib.pyplot as plt
|
| 18 |
|
| 19 |
+
# -------------------------------------------------------------------
|
| 20 |
+
# IMPORT OR DEFINE YOUR TEXT-TO-SPEECH FUNCTIONS
|
| 21 |
+
# (Adjust these imports to match your local TTS code)
|
| 22 |
+
# -------------------------------------------------------------------
|
| 23 |
from text2speech import tts_randomized, parse_speed, tts_with_style_vector
|
| 24 |
|
| 25 |
# Constants and Paths
|
|
|
|
| 51 |
# LOAD PCA MODEL AND ANNOTATED FEATURES
|
| 52 |
##############################################################################
|
| 53 |
|
|
|
|
| 54 |
try:
|
| 55 |
pca = joblib.load(PCA_MODEL_PATH)
|
| 56 |
print("PCA model loaded successfully.")
|
|
|
|
| 93 |
def update_sliders(voice_name):
|
| 94 |
"""
|
| 95 |
Update slider values based on the selected predefined voice using reverse PCA.
|
| 96 |
+
Returns a list of PCA component values to set the sliders.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
"""
|
| 98 |
if not voice_name:
|
| 99 |
# Return default slider values (e.g., zeros) if no voice is selected
|
|
|
|
| 122 |
def generate_audio_with_voice(text, voice_key, speed_val):
|
| 123 |
"""
|
| 124 |
Generate audio using the style vector of the selected predefined voice.
|
| 125 |
+
Returns (audio_tuple, style_vector) or (None, error_message).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
"""
|
| 127 |
try:
|
| 128 |
# Load voices data
|
| 129 |
voices_data = load_voices_json()
|
|
|
|
| 130 |
if voice_key not in voices_data:
|
| 131 |
+
msg = f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}."
|
| 132 |
+
print(msg)
|
| 133 |
+
return None, msg
|
| 134 |
|
|
|
|
| 135 |
style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
|
| 136 |
print(f"Selected Voice: {voice_key}")
|
| 137 |
print(f"Style Vector (First 6): {style_vector[0][:6]}")
|
|
|
|
| 139 |
# Convert to torch tensor and move to device
|
| 140 |
style_vec_torch = torch.from_numpy(style_vector).float().to(device)
|
| 141 |
|
| 142 |
+
# Generate audio
|
| 143 |
audio_np = tts_with_style_vector(
|
| 144 |
text,
|
| 145 |
style_vec=style_vec_torch,
|
|
|
|
| 151 |
)
|
| 152 |
|
| 153 |
if audio_np is None:
|
| 154 |
+
msg = "Audio generation failed."
|
| 155 |
+
print(msg)
|
| 156 |
+
return None, msg
|
| 157 |
|
| 158 |
+
sr = 24000
|
|
|
|
| 159 |
audio_tuple = (sr, audio_np)
|
|
|
|
|
|
|
| 160 |
return audio_tuple, style_vector.tolist()
|
| 161 |
|
| 162 |
except Exception as e:
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
def build_modified_vector(voice_key, top6_values):
|
| 168 |
+
"""Reconstruct a style vector by applying inverse PCA on the given 6 slider values."""
|
| 169 |
voices_data = load_voices_json()
|
| 170 |
if voice_key not in voices_data:
|
| 171 |
print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
|
|
|
|
| 177 |
return None
|
| 178 |
|
| 179 |
try:
|
|
|
|
| 180 |
pca_components = np.array(top6_values).reshape(1, -1)
|
| 181 |
reconstructed_vec = pca.inverse_transform(pca_components)[0]
|
| 182 |
return reconstructed_vec
|
|
|
|
| 185 |
return None
|
| 186 |
|
| 187 |
|
| 188 |
+
def generate_custom_audio(text, voice_key, randomize, speed_val, *slider_values):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
"""
|
| 190 |
+
Generate audio with either a random style vector or a reconstructed vector
|
| 191 |
+
from the 6 PCA sliders. Returns (audio_tuple, style_vector) or (None, None).
|
|
|
|
|
|
|
|
|
|
| 192 |
"""
|
| 193 |
try:
|
|
|
|
|
|
|
|
|
|
| 194 |
if randomize:
|
| 195 |
# Generate randomized style vector
|
| 196 |
audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
|
| 197 |
if random_style_vec is None:
|
| 198 |
print("Failed to generate randomized style vector.")
|
| 199 |
+
return None, None
|
|
|
|
| 200 |
final_vec = (
|
| 201 |
random_style_vec.cpu().numpy().flatten()
|
| 202 |
if isinstance(random_style_vec, torch.Tensor)
|
|
|
|
| 204 |
)
|
| 205 |
print("Randomized Style Vector (First 6):", final_vec[:6])
|
| 206 |
else:
|
| 207 |
+
# Reconstruct vector from PCA sliders
|
| 208 |
reconstructed_vec = build_modified_vector(voice_key, slider_values)
|
| 209 |
if reconstructed_vec is None:
|
| 210 |
+
print("No reconstructed vector. Skipping audio generation.")
|
| 211 |
+
return None, None
|
|
|
|
|
|
|
| 212 |
|
|
|
|
| 213 |
style_vec_torch = (
|
| 214 |
torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
|
| 215 |
)
|
|
|
|
|
|
|
| 216 |
audio_np = tts_with_style_vector(
|
| 217 |
text,
|
| 218 |
style_vec=style_vec_torch,
|
|
|
|
| 227 |
|
| 228 |
if audio_np is None:
|
| 229 |
print("Audio generation failed.")
|
| 230 |
+
return None, None
|
| 231 |
|
| 232 |
+
sr = 24000
|
|
|
|
| 233 |
audio_tuple = (sr, audio_np)
|
|
|
|
|
|
|
| 234 |
return audio_tuple, final_vec.tolist()
|
| 235 |
|
| 236 |
except Exception as e:
|
| 237 |
+
print(f"Error generating audio and style: {e}")
|
| 238 |
+
return None, None
|
| 239 |
|
| 240 |
|
| 241 |
def save_style_to_json(style_data, style_name):
|
| 242 |
+
"""
|
| 243 |
+
Saves the provided style_data (list of floats) into voices.json under style_name.
|
| 244 |
+
Returns a status message.
|
| 245 |
+
"""
|
| 246 |
if not style_name.strip():
|
| 247 |
return "Please enter a new style name before saving."
|
| 248 |
|
|
|
|
| 252 |
f"Style name '{style_name}' already exists. Please choose a different name."
|
| 253 |
)
|
| 254 |
|
|
|
|
| 255 |
if len(style_data) != VECTOR_DIMENSION:
|
| 256 |
return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
|
| 257 |
|
|
|
|
| 258 |
voices_data[style_name] = style_data
|
| 259 |
save_voices_json(voices_data)
|
| 260 |
return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
|
| 261 |
|
| 262 |
|
|
|
|
|
|
|
|
|
|
| 263 |
def rearrange_voices(new_order):
    """
    Rearrange the voices in voices.json based on the comma-separated `new_order`.

    Names listed in `new_order` are placed first, in the order given. Voices
    that exist but are not listed keep their current relative order and are
    appended afterwards, so passing a partial list never deletes a voice.
    Blank entries (e.g. from a trailing comma) and repeated names are ignored.

    Args:
        new_order: Comma-separated voice names, e.g. "alice, bob".

    Returns:
        (status_msg, updated_list_of_voices). On invalid input the stored
        order is left untouched and the current list of voices is returned.
    """
    voices_data = load_voices_json()

    # Tolerate None, stray whitespace and empty tokens; collapse duplicates
    # while keeping the first occurrence of each name.
    tokens = (name.strip() for name in (new_order or "").split(","))
    requested = list(OrderedDict.fromkeys(name for name in tokens if name))

    if not requested or any(name not in voices_data for name in requested):
        return "Error: New order contains invalid voice names.", list(
            voices_data.keys()
        )

    ordered_data = OrderedDict()
    for name in requested:
        ordered_data[name] = voices_data[name]

    # Keep every voice the caller did not mention; dropping them here would
    # silently erase saved voices when the reordered mapping is written back.
    for name, vector in voices_data.items():
        if name not in ordered_data:
            ordered_data[name] = vector

    save_voices_json(ordered_data)
    print(f"Voices rearranged: {list(ordered_data.keys())}")
    return "Voices rearranged successfully.", list(ordered_data.keys())
|
| 282 |
|
| 283 |
|
| 284 |
def delete_voice(selected):
|
| 285 |
+
"""Delete voices from the voices.json. Returns (status_msg, updated_list_of_voices)."""
|
| 286 |
if not selected:
|
| 287 |
return "No voices selected for deletion.", list(load_voices_json().keys())
|
| 288 |
voices_data = load_voices_json()
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
def upload_new_voices(uploaded_file):
|
| 298 |
+
"""Upload new voices from a JSON file. Returns (status_msg, updated_list_of_voices)."""
|
| 299 |
if uploaded_file is None:
|
| 300 |
return "No file uploaded.", list(load_voices_json().keys())
|
| 301 |
try:
|
| 302 |
uploaded_data = json.load(uploaded_file)
|
| 303 |
if not isinstance(uploaded_data, dict):
|
| 304 |
+
return (
|
| 305 |
+
"Invalid JSON format. Expected a dictionary of voices.",
|
| 306 |
+
list(load_voices_json().keys()),
|
| 307 |
)
|
| 308 |
voices_data = load_voices_json()
|
| 309 |
voices_data.update(uploaded_data)
|
|
|
|
| 314 |
return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
|
| 315 |
|
| 316 |
|
| 317 |
+
# -------------------------------------------------------------------
|
| 318 |
+
# GRADIO INTERFACE
|
| 319 |
+
# -------------------------------------------------------------------
|
| 320 |
|
| 321 |
|
| 322 |
def create_combined_interface():
|
| 323 |
+
# We'll initially load the voices to get a default set for the dropdown
|
| 324 |
voices_data = load_voices_json()
|
| 325 |
voice_choices = list(voices_data.keys())
|
| 326 |
default_voice = voice_choices[0] if voice_choices else None
|
|
|
|
| 332 |
}
|
| 333 |
"""
|
| 334 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
|
| 336 |
gr.Markdown("# StyleTTS2 Studio - Build custom voices")
|
| 337 |
|
| 338 |
+
# -------------------------------------------------------
|
| 339 |
+
# 1) Text-to-Speech Tab
|
| 340 |
+
# -------------------------------------------------------
|
| 341 |
with gr.Tab("Text-to-Speech"):
|
| 342 |
gr.Markdown("### Generate Speech with Predefined Voices")
|
| 343 |
|
|
|
|
| 360 |
label="Speed (%)",
|
| 361 |
value=120,
|
| 362 |
)
|
| 363 |
+
generate_btn = gr.Button("Generate Audio")
|
| 364 |
+
status_tts = gr.Textbox(label="Status", visible=False)
|
|
|
|
| 365 |
audio_output = gr.Audio(label="Synthesized Audio")
|
| 366 |
|
| 367 |
+
# Generate TTS callback
|
| 368 |
def on_generate_tts(text, voice, speed):
    """
    Click handler for the Text-to-Speech tab.

    Converts the percentage `speed` into a multiplier, delegates to
    generate_audio_with_voice, and returns (audio, status_message).
    The audio slot is None when no voice is selected or synthesis failed.
    """
    if voice:
        # The slider is expressed in percent; the generator takes a multiplier.
        audio_result, detail = generate_audio_with_voice(text, voice, speed / 100)
        if audio_result is not None:
            return audio_result, "Audio generated successfully."
        # On failure the second element carries the error text.
        return None, detail
    return None, "No voice selected."
|
| 376 |
|
| 377 |
generate_btn.click(
|
| 378 |
fn=on_generate_tts,
|
| 379 |
inputs=[text_input, voice_dropdown, speed_slider],
|
| 380 |
+
outputs=[audio_output, status_tts],
|
| 381 |
)
|
| 382 |
|
| 383 |
+
# -------------------------------------------------------
|
| 384 |
+
# 2) Voice Studio Tab
|
| 385 |
+
# -------------------------------------------------------
|
| 386 |
with gr.Tab("Voice Studio"):
|
| 387 |
gr.Markdown("### Customize and Create New Voices")
|
| 388 |
|
|
|
|
| 425 |
# State to hold the last style vector
|
| 426 |
style_vector_state_studio = gr.State()
|
| 427 |
|
| 428 |
+
# Generate customized audio callback
|
| 429 |
def on_generate_studio(text, voice, speed, *pca_values):
    """
    Click handler for the Voice Studio tab.

    Synthesizes `text` with a style vector rebuilt from the PCA slider values
    (randomization is always disabled here) and returns
    (audio, status_message, style_vector). Audio and style vector are None
    when no voice is selected or generation failed.
    """
    if voice:
        multiplier = speed / 100  # percent -> multiplier
        audio_tuple, style_vector = generate_custom_audio(
            text, voice, False, multiplier, *pca_values
        )
        if audio_tuple is not None:
            return audio_tuple, "Audio generated successfully.", style_vector
        return None, "Failed to generate audio.", None
    return None, "No voice selected.", None
|
| 439 |
|
| 440 |
generate_btn_studio.click(
|
|
|
|
| 444 |
outputs=[audio_output_studio, status_text, style_vector_state_studio],
|
| 445 |
)
|
| 446 |
|
| 447 |
+
# Save customized voice callback
|
| 448 |
def on_save_style_studio(style_vector, style_name):
    """
    Save the last generated style vector under `style_name`, then refresh
    both voice dropdowns.

    Args:
        style_vector: The style vector held in the studio state; empty/None
            when no audio has been generated yet.
        style_name: The name typed by the user for the new voice.

    Returns:
        A 3-tuple of component updates, in the order of the click handler's
        outputs: (status text, TTS voice dropdown, studio voice dropdown).
    """
    # Report the actual missing input: telling the user to "enter a name"
    # when nothing has been generated yet sends them down the wrong path.
    if not style_vector:
        return (
            gr.update(value="Please generate audio before saving a voice."),
            gr.update(),
            gr.update(),
        )
    if not style_name:
        return (
            gr.update(value="Please enter a name for the new voice!"),
            gr.update(),
            gr.update(),
        )

    # Save the style; the returned string is the status shown to the user
    # (success or the reason the save was rejected).
    result = save_style_to_json(style_vector, style_name)

    # Reload from disk so the dropdowns reflect what was actually persisted.
    new_choices = list(load_voices_json().keys())

    return (
        gr.update(value=result),
        gr.update(choices=new_choices),
        gr.update(choices=new_choices),
    )
|
| 467 |
|
| 468 |
save_btn_studio.click(
|
| 469 |
fn=on_save_style_studio,
|
| 470 |
inputs=[style_vector_state_studio, new_style_name],
|
| 471 |
+
# We update: status_text, voice_dropdown, voice_dropdown_studio
|
| 472 |
+
outputs=[status_text, voice_dropdown, voice_dropdown_studio],
|
| 473 |
)
|
| 474 |
|
| 475 |
+
# Update sliders callback
|
| 476 |
voice_dropdown_studio.change(
|
| 477 |
fn=update_sliders,
|
| 478 |
inputs=voice_dropdown_studio,
|
| 479 |
outputs=pca_sliders,
|
| 480 |
)
|
| 481 |
|
| 482 |
+
# -------------------------------------------------------
|
| 483 |
+
# Optionally: Reload voices on page load
|
| 484 |
+
# -------------------------------------------------------
|
| 485 |
+
def on_page_load():
    """
    Re-read the saved voices and push the current names into both voice
    dropdowns. Returns a mapping of component -> update.
    """
    new_choices = list(load_voices_json().keys())
    # One independent update object per dropdown.
    return {
        dropdown: gr.update(choices=new_choices)
        for dropdown in (voice_dropdown, voice_dropdown_studio)
    }
|
| 491 |
+
|
| 492 |
+
# This automatically refreshes dropdowns every time the user loads/refreshes the page
|
| 493 |
+
demo.load(
|
| 494 |
+
on_page_load, inputs=None, outputs=[voice_dropdown, voice_dropdown_studio]
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
gr.Markdown(
|
| 498 |
"#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
|
| 499 |
)
|
|
|
|
| 504 |
if __name__ == "__main__":
|
| 505 |
try:
|
| 506 |
interface = create_combined_interface()
|
| 507 |
+
interface.launch(share=False) # or share=True if you want a public share link
|
| 508 |
except Exception as e:
|
| 509 |
print(f"An error occurred while launching the interface: {e}")
|
voices.json
CHANGED
|
@@ -2837,4 +2837,4 @@
|
|
| 2837 |
0.057131367030820654,
|
| 2838 |
-0.0762246848122452
|
| 2839 |
]
|
| 2840 |
-
}
|
|
|
|
| 2837 |
0.057131367030820654,
|
| 2838 |
-0.0762246848122452
|
| 2839 |
]
|
| 2840 |
+
}
|