# StyleTTS2 Studio — Gradio app (Hugging Face Space)
| #!/usr/bin/env python3 | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import json | |
| import io | |
| import soundfile as sf | |
| from PIL import Image | |
| import matplotlib | |
| import joblib | |
| from sklearn.decomposition import PCA | |
| from collections import OrderedDict | |
| import nltk | |
| matplotlib.use("Agg") # Use non-interactive backend | |
| import matplotlib.pyplot as plt | |
# -------------------------------------------------------------------
# IMPORT OR DEFINE YOUR TEXT-TO-SPEECH FUNCTIONS
# (Adjust these imports to match your local TTS code)
# -------------------------------------------------------------------
from text2speech import tts_randomized, parse_speed, tts_with_style_vector

# Constants and Paths
VOICES_JSON_PATH = "voices.json"  # persisted registry: voice name -> style vector
PCA_MODEL_PATH = "pca_model.pkl"  # fitted sklearn PCA mapping style vector <-> 6 sliders
ANNOTATED_FEATURES_PATH = "annotated_features.npy"  # loaded below; not otherwise referenced in this file
VECTOR_DIMENSION = 256  # expected length of every style vector
# Labels for the 6 Voice Studio sliders (one per PCA component)
ANNOTATED_FEATURES_NAMES = ["Gender", "Tone", "Quality", "Enunciation", "Pace", "Style"]
# Descriptive endpoints for each feature axis (order of pairing with slider
# extremes is presumably left|right — not verified in this file)
ANNOTATED_FEATURES_INFO = [
    "Male | Female",
    "High | Low",
    "Noisy | Clean",
    "Clear | Unclear",
    "Rapid | Slow",
    "Colloquial | Formal",
]

# Download necessary NLTK data (presumably required by the TTS text
# frontend's sentence tokenization — confirm against text2speech)
nltk.download("punkt_tab")

##############################################################################
# DEVICE CONFIGURATION
##############################################################################
# Detect if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
##############################################################################
# LOAD PCA MODEL AND ANNOTATED FEATURES
##############################################################################
# Both loads are best-effort: a missing file is logged and the variable is
# set to None; downstream helpers guard against a None PCA model.
try:
    pca = joblib.load(PCA_MODEL_PATH)
    print("PCA model loaded successfully.")
except FileNotFoundError:
    print(f"Error: PCA model file '{PCA_MODEL_PATH}' not found.")
    pca = None  # checked by update_sliders / build_modified_vector

try:
    annotated_features = np.load(ANNOTATED_FEATURES_PATH)
    print("Annotated features loaded successfully.")
except FileNotFoundError:
    print(f"Error: Annotated features file '{ANNOTATED_FEATURES_PATH}' not found.")
    annotated_features = None
| ############################################################################## | |
| # UTILITY FUNCTIONS | |
| ############################################################################## | |
def load_voices_json():
    """Load the saved voices from VOICES_JSON_PATH.

    Returns:
        OrderedDict mapping voice name -> style vector (list of floats),
        preserving the on-disk key order. Returns an empty OrderedDict when
        the file is missing or contains invalid JSON.
    """
    try:
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) can corrupt non-ASCII voice names.
        with open(VOICES_JSON_PATH, "r", encoding="utf-8") as f:
            return json.load(f, object_pairs_hook=OrderedDict)
    except FileNotFoundError:
        print(f"Warning: {VOICES_JSON_PATH} not found. Creating a new one.")
        return OrderedDict()
    except json.JSONDecodeError:
        print(f"Warning: {VOICES_JSON_PATH} is not valid JSON.")
        return OrderedDict()
def save_voices_json(data, path=VOICES_JSON_PATH):
    """Serialize *data* (mapping: voice name -> style vector) as JSON to *path*.

    Overwrites the file atomically from the caller's perspective only in the
    sense of a single write; prints a confirmation on success.
    """
    # Explicit UTF-8 avoids locale-dependent encodings on write, matching
    # the encoding used by load_voices_json's reader.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Voices saved to '{path}'.")
def update_sliders(voice_name):
    """Map a saved voice's style vector back to the 6 PCA slider values.

    Returns a list of floats, one per annotated feature. Falls back to all
    zeros when no voice is selected, the voice is unknown, the PCA model is
    unavailable, or the transform fails.
    """
    zeros = [0.0] * len(ANNOTATED_FEATURES_NAMES)
    if not voice_name:
        # Nothing selected -> neutral slider positions.
        return zeros
    voices_data = load_voices_json()
    if voice_name not in voices_data:
        print(f"Voice '{voice_name}' not found in {VOICES_JSON_PATH}.")
        return zeros
    style_vector = np.array(voices_data[voice_name], dtype=np.float32).reshape(1, -1)
    if pca is None:
        print("PCA model is not loaded.")
        return zeros
    try:
        # Forward PCA: 256-dim style vector -> 6 component values.
        return pca.transform(style_vector)[0].tolist()
    except Exception as e:
        print(f"Error transforming style vector to PCA components: {e}")
        return zeros
def generate_audio_with_voice(text, voice_key, speed_val):
    """Synthesize `text` using the stored style vector of a predefined voice.

    Returns ((sample_rate, audio_array), style_vector_as_list) on success,
    or (None, error_message) on failure.
    """
    try:
        voices_data = load_voices_json()
        if voice_key not in voices_data:
            msg = f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}."
            print(msg)
            return None, msg

        style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
        print(f"Selected Voice: {voice_key}")
        print(f"Style Vector (First 6): {style_vector[0][:6]}")

        # Hand the style vector to the TTS backend on the active device.
        style_vec_torch = torch.from_numpy(style_vector).float().to(device)
        audio_np = tts_with_style_vector(
            text,
            style_vec=style_vec_torch,
            speed=speed_val,
            alpha=0.3,
            beta=0.7,
            diffusion_steps=7,
            embedding_scale=1.0,
        )
        if audio_np is None:
            msg = "Audio generation failed."
            print(msg)
            return None, msg

        # 24 kHz is the sample rate the TTS backend emits — confirm against
        # text2speech if the model changes.
        return (24000, audio_np), style_vector.tolist()
    except Exception as e:
        print(f"Error in generate_audio_with_voice: {e}")
        return None, "An error occurred during audio generation."
def build_modified_vector(voice_key, top6_values):
    """Reconstruct a 256-dim style vector via inverse PCA from 6 slider values.

    Args:
        voice_key: name of an existing voice (used only for validation that
            the base voice exists and has the expected dimensionality).
        top6_values: sequence of 6 PCA component values from the sliders.

    Returns:
        The reconstructed 1-D numpy vector, or None on any failure.
    """
    voices_data = load_voices_json()
    if voice_key not in voices_data:
        print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
        return None
    arr = np.array(voices_data[voice_key], dtype=np.float32).squeeze()
    if arr.ndim != 1 or arr.shape[0] != VECTOR_DIMENSION:
        print(f"Voice '{voice_key}' has invalid shape {arr.shape}. Expected (256,).")
        return None
    if pca is None:
        # Bug fix: previously a missing PCA model raised AttributeError inside
        # the try below and was reported as a generic reconstruction error.
        print("PCA model is not loaded.")
        return None
    try:
        pca_components = np.array(top6_values).reshape(1, -1)
        return pca.inverse_transform(pca_components)[0]
    except Exception as e:
        print(f"Error reconstructing style vector: {e}")
        return None
def generate_custom_audio(text, voice_key, randomize, speed_val, *slider_values):
    """Synthesize `text` from either a randomized or a slider-built style vector.

    When `randomize` is truthy, a fresh random style vector comes from the TTS
    backend; otherwise the 6 PCA slider values are inverse-transformed into a
    full style vector based on `voice_key`.

    Returns ((sample_rate, audio_array), style_vector_list) on success,
    (None, None) on any failure.
    """
    try:
        if randomize:
            audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
            if random_style_vec is None:
                print("Failed to generate randomized style vector.")
                return None, None
            if isinstance(random_style_vec, torch.Tensor):
                final_vec = random_style_vec.cpu().numpy().flatten()
            else:
                final_vec = np.array(random_style_vec).flatten()
            print("Randomized Style Vector (First 6):", final_vec[:6])
        else:
            reconstructed_vec = build_modified_vector(voice_key, slider_values)
            if reconstructed_vec is None:
                print("No reconstructed vector. Skipping audio generation.")
                return None, None
            style_vec_torch = (
                torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
            )
            audio_np = tts_with_style_vector(
                text,
                style_vec=style_vec_torch,
                speed=speed_val,
                alpha=0.3,
                beta=0.7,
                diffusion_steps=7,
                embedding_scale=1.0,
            )
            final_vec = reconstructed_vec
            print("Reconstructed Style Vector (First 6):", final_vec[:6])

        if audio_np is None:
            print("Audio generation failed.")
            return None, None
        return (24000, audio_np), final_vec.tolist()
    except Exception as e:
        print(f"Error generating audio and style: {e}")
        return None, None
def save_style_to_json(style_data, style_name):
    """Persist `style_data` under `style_name` in voices.json.

    Validates that the name is non-blank and unused, and that the vector has
    exactly VECTOR_DIMENSION entries. Returns a human-readable status string.
    """
    if not style_name.strip():
        return "Please enter a new style name before saving."

    registry = load_voices_json()
    if style_name in registry:
        return f"Style name '{style_name}' already exists. Please choose a different name."
    if len(style_data) != VECTOR_DIMENSION:
        return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."

    registry[style_name] = style_data
    save_voices_json(registry)
    return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
def rearrange_voices(new_order):
    """Reorder voices.json according to the comma-separated `new_order`.

    Bug fix: voices omitted from `new_order` are now appended at the end in
    their original relative order — previously they were silently DELETED
    from voices.json. Empty names (e.g. from a trailing comma) are ignored
    instead of triggering a spurious error.

    Returns:
        (status_msg, updated_list_of_voices)
    """
    voices_data = load_voices_json()
    new_order_list = [name.strip() for name in new_order.split(",") if name.strip()]
    if not all(name in voices_data for name in new_order_list):
        return "Error: New order contains invalid voice names.", list(
            voices_data.keys()
        )
    ordered_data = OrderedDict()
    for name in new_order_list:
        ordered_data[name] = voices_data[name]
    # Preserve any voices the user did not mention rather than dropping them.
    for name, vec in voices_data.items():
        if name not in ordered_data:
            ordered_data[name] = vec
    save_voices_json(ordered_data)
    print(f"Voices rearranged: {list(ordered_data.keys())}")
    return "Voices rearranged successfully.", list(ordered_data.keys())
def delete_voice(selected):
    """Remove each voice named in `selected` from voices.json.

    Unknown names are skipped silently. Returns
    (status_msg, updated_list_of_voices).
    """
    if not selected:
        return "No voices selected for deletion.", list(load_voices_json().keys())

    registry = load_voices_json()
    for name in selected:
        if name in registry:
            del registry[name]
            print(f"Voice '{name}' deleted.")
    save_voices_json(registry)
    return "Deleted selected voices successfully.", list(registry.keys())
def upload_new_voices(uploaded_file):
    """Merge voices from an uploaded JSON file into voices.json.

    Existing entries with the same name are overwritten by the upload.
    Returns (status_msg, updated_list_of_voices).
    """
    if uploaded_file is None:
        return "No file uploaded.", list(load_voices_json().keys())
    try:
        incoming = json.load(uploaded_file)
    except json.JSONDecodeError:
        return "Uploaded file is not valid JSON.", list(load_voices_json().keys())

    if not isinstance(incoming, dict):
        return (
            "Invalid JSON format. Expected a dictionary of voices.",
            list(load_voices_json().keys()),
        )

    registry = load_voices_json()
    registry.update(incoming)
    save_voices_json(registry)
    print(f"Voices uploaded: {list(incoming.keys())}")
    return "Voices uploaded successfully.", list(registry.keys())
| # ------------------------------------------------------------------- | |
| # GRADIO INTERFACE | |
| # ------------------------------------------------------------------- | |
def create_combined_interface():
    """Build and return the Gradio Blocks app.

    Two tabs: "Text-to-Speech" (synthesize with a saved voice) and
    "Voice Studio" (tweak a voice via 6 PCA sliders and save the result).
    Component wiring is order-dependent: components referenced by callbacks
    must already exist when the callback is registered.
    """
    # We'll initially load the voices to get a default set for the dropdown
    voices_data = load_voices_json()
    voice_choices = list(voices_data.keys())
    default_voice = voice_choices[0] if voice_choices else None
    # Center h4 headings (used by the footer Markdown at the bottom).
    css = """
    h4 {
    text-align: center;
    display:block;
    }
    """
    with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
        gr.Markdown("# StyleTTS2 Studio - Build custom voices")
        # -------------------------------------------------------
        # 1) Text-to-Speech Tab
        # -------------------------------------------------------
        with gr.Tab("Text-to-Speech"):
            gr.Markdown("### Generate Speech with Predefined Voices")
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    value="How much wood could a woodchuck chuck if a woodchuck could chuck wood?",
                    lines=3,
                )
                voice_dropdown = gr.Dropdown(
                    choices=voice_choices,
                    label="Select Base Voice",
                    value=default_voice,
                    interactive=True,
                )
                speed_slider = gr.Slider(
                    minimum=50,
                    maximum=200,
                    step=1,
                    label="Speed (%)",
                    value=120,
                )
                generate_btn = gr.Button("Generate Audio")
                status_tts = gr.Textbox(label="Status", visible=False)
                audio_output = gr.Audio(label="Synthesized Audio")

            # Generate TTS callback: returns (audio, status_message).
            def on_generate_tts(text, voice, speed):
                if not voice:
                    return None, "No voice selected."
                speed_val = speed / 100  # Convert percentage to multiplier
                audio_result, msg = generate_audio_with_voice(text, voice, speed_val)
                if audio_result is None:
                    return None, msg
                return audio_result, "Audio generated successfully."

            generate_btn.click(
                fn=on_generate_tts,
                inputs=[text_input, voice_dropdown, speed_slider],
                outputs=[audio_output, status_tts],
            )
        # -------------------------------------------------------
        # 2) Voice Studio Tab
        # -------------------------------------------------------
        with gr.Tab("Voice Studio"):
            gr.Markdown("### Customize and Create New Voices")
            with gr.Column():
                text_input_studio = gr.Textbox(
                    label="Text to Synthesize",
                    value="Use the sliders to customize a voice!",
                    lines=3,
                )
                voice_dropdown_studio = gr.Dropdown(
                    choices=voice_choices,
                    label="Select Base Voice",
                    value=default_voice,
                )
                speed_slider_studio = gr.Slider(
                    minimum=50,
                    maximum=200,
                    step=1,
                    label="Speed (%)",
                    value=120,
                )
                # Sliders for PCA components (6 sliders, one per annotated feature)
                pca_sliders = [
                    gr.Slider(
                        minimum=-2.0,
                        maximum=2.0,
                        value=0.0,
                        step=0.1,
                        label=feature,
                    )
                    for feature in ANNOTATED_FEATURES_NAMES
                ]
                generate_btn_studio = gr.Button("Generate Customized Audio")
                audio_output_studio = gr.Audio(label="Customized Synthesized Audio")
                new_style_name = gr.Textbox(label="New Style Name", value="")
                save_btn_studio = gr.Button("Save Customized Voice")
                status_text = gr.Textbox(label="Status", visible=True)
            # State to hold the last generated style vector so "Save" can
            # persist exactly what the user last heard.
            style_vector_state_studio = gr.State()

            # Generate customized audio callback: returns
            # (audio, status_message, style_vector_for_state).
            def on_generate_studio(text, voice, speed, *pca_values):
                if not voice:
                    return None, "No voice selected.", None
                speed_val = speed / 100
                audio_tuple, style_vector = generate_custom_audio(
                    text, voice, False, speed_val, *pca_values
                )
                if audio_tuple is None:
                    return None, "Failed to generate audio.", None
                return audio_tuple, "Audio generated successfully.", style_vector

            generate_btn_studio.click(
                fn=on_generate_studio,
                inputs=[text_input_studio, voice_dropdown_studio, speed_slider_studio]
                + pca_sliders,
                outputs=[audio_output_studio, status_text, style_vector_state_studio],
            )

            # Save customized voice callback
            def on_save_style_studio(style_vector, style_name):
                """Save the new style, then update the dropdown choices."""
                if not style_vector or not style_name:
                    return (
                        gr.update(value="Please enter a name for the new voice!"),
                        gr.update(),
                        gr.update(),
                    )
                # Save the style
                result = save_style_to_json(style_vector, style_name)
                # Reload the voices to get the new list
                new_choices = list(load_voices_json().keys())
                # Return dictionary updates to existing components
                return (
                    gr.update(value=result),
                    gr.update(choices=new_choices),
                    gr.update(choices=new_choices),
                )

            save_btn_studio.click(
                fn=on_save_style_studio,
                inputs=[style_vector_state_studio, new_style_name],
                # We update: status_text, voice_dropdown, voice_dropdown_studio
                outputs=[status_text, voice_dropdown, voice_dropdown_studio],
            )
            # Update sliders callback: selecting a base voice repositions all
            # 6 sliders to that voice's PCA coordinates (reverse transform).
            voice_dropdown_studio.change(
                fn=update_sliders,
                inputs=voice_dropdown_studio,
                outputs=pca_sliders,
            )
        # -------------------------------------------------------
        # Optionally: Reload voices on page load
        # -------------------------------------------------------
        def on_page_load():
            new_choices = list(load_voices_json().keys())
            return {
                voice_dropdown: gr.update(choices=new_choices),
                voice_dropdown_studio: gr.update(choices=new_choices),
            }

        # This automatically refreshes dropdowns every time the user loads/refreshes the page
        demo.load(
            on_page_load, inputs=None, outputs=[voice_dropdown, voice_dropdown_studio]
        )
        gr.Markdown(
            "#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
        )
    return demo
if __name__ == "__main__":
    # Build the UI and serve it locally; set share=True for a public link.
    try:
        app = create_combined_interface()
        app.launch(share=False)
    except Exception as e:
        print(f"An error occurred while launching the interface: {e}")