Spaces:

Wismut
/

StyleTTS2_Studio

Running

App Files Community

Wismut committed on Dec 28, 2024

Commit

4d3cdf4

1 Parent(s): 2d36cf6

fixed saved voice persistence on reload

Browse files

Files changed (2) hide show

app.py +95 -122
voices.json +1 -1

app.py CHANGED Viewed

@@ -16,6 +16,10 @@ import nltk
 matplotlib.use("Agg")  # Use non-interactive backend
 import matplotlib.pyplot as plt
 from text2speech import tts_randomized, parse_speed, tts_with_style_vector
 # Constants and Paths
@@ -47,7 +51,6 @@ print(f"Using device: {device}")
 # LOAD PCA MODEL AND ANNOTATED FEATURES
 ##############################################################################
-# Load PCA model and annotated features
 try:
     pca = joblib.load(PCA_MODEL_PATH)
     print("PCA model loaded successfully.")
@@ -90,12 +93,7 @@ def save_voices_json(data, path=VOICES_JSON_PATH):
 def update_sliders(voice_name):
     """
     Update slider values based on the selected predefined voice using reverse PCA.
-    Args:
-        voice_name (str): The name of the selected voice.
-    Returns:
-        list: A list of PCA component values to set the sliders.
     """
     if not voice_name:
         # Return default slider values (e.g., zeros) if no voice is selected
@@ -124,24 +122,16 @@ def update_sliders(voice_name):
 def generate_audio_with_voice(text, voice_key, speed_val):
     """
     Generate audio using the style vector of the selected predefined voice.
-    Args:
-        text (str): The text to synthesize.
-        voice_key (str): The name of the selected voice.
-        speed_val (float): The speed multiplier.
-    Returns:
-        tuple: (audio_tuple, style_vector)
     """
     try:
         # Load voices data
         voices_data = load_voices_json()
         if voice_key not in voices_data:
-            print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
-            return None, None, "Selected voice not found."
-        # Retrieve the style vector for the selected voice
         style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
         print(f"Selected Voice: {voice_key}")
         print(f"Style Vector (First 6): {style_vector[0][:6]}")
@@ -149,7 +139,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
         # Convert to torch tensor and move to device
         style_vec_torch = torch.from_numpy(style_vector).float().to(device)
-        # Generate audio using the TTS model
         audio_np = tts_with_style_vector(
             text,
             style_vec=style_vec_torch,
@@ -161,14 +151,12 @@ def generate_audio_with_voice(text, voice_key, speed_val):
         )
         if audio_np is None:
-            print("Audio generation failed.")
-            return None, None, "Audio generation failed."
-        # Prepare audio for Gradio
-        sr = 24000  # Adjust based on your actual sampling rate
         audio_tuple = (sr, audio_np)
-        # Return audio, image, and style vector
         return audio_tuple, style_vector.tolist()
     except Exception as e:
@@ -177,7 +165,7 @@ def generate_audio_with_voice(text, voice_key, speed_val):
 def build_modified_vector(voice_key, top6_values):
-    """Build a modified style vector by updating top 6 PCA components."""
     voices_data = load_voices_json()
     if voice_key not in voices_data:
         print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
@@ -189,7 +177,6 @@ def build_modified_vector(voice_key, top6_values):
         return None
     try:
-        # Reconstruct the style vector using inverse PCA
         pca_components = np.array(top6_values).reshape(1, -1)
         reconstructed_vec = pca.inverse_transform(pca_components)[0]
         return reconstructed_vec
@@ -198,39 +185,18 @@ def build_modified_vector(voice_key, top6_values):
         return None
-def reconstruct_style_vector(pca_components):
-    """
-    Reconstruct the 256-dimensional style vector from PCA components.
-    """
-    if pca is None:
-        print("PCA model is not loaded.")
-        return None
-    try:
-        return pca.inverse_transform([pca_components])[0]
-    except Exception as e:
-        print(f"Error during inverse PCA transform: {e}")
-        return None
-def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values):
     """
-    Generate audio and produce a matplotlib plot of the style vector.
-    Returns:
-      - audio tuple (sr, np_array) for Gradio's Audio
-      - a PIL Image representing the style vector plot
-      - the final style vector as a list for State
     """
     try:
-        speed_val = parse_speed(speed_str)
-        print(f"Parsed speed: {speed_val}")
         if randomize:
             # Generate randomized style vector
             audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
             if random_style_vec is None:
                 print("Failed to generate randomized style vector.")
-                return None, None, None
-            # Ensure the style vector is flat and on device
             final_vec = (
                 random_style_vec.cpu().numpy().flatten()
                 if isinstance(random_style_vec, torch.Tensor)
@@ -238,20 +204,15 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
             )
             print("Randomized Style Vector (First 6):", final_vec[:6])
         else:
-            # Reconstruct the style vector from slider values using inverse PCA
             reconstructed_vec = build_modified_vector(voice_key, slider_values)
             if reconstructed_vec is None:
-                print(
-                    "No reconstructed vector could be constructed, skipping audio generation."
-                )
-                return None, None, None
-            # Convert to torch tensor and move to device
             style_vec_torch = (
                 torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
             )
-            # Generate audio with the reconstructed style vector
             audio_np = tts_with_style_vector(
                 text,
                 style_vec=style_vec_torch,
@@ -266,22 +227,22 @@ def generate_custom_audio(text, voice_key, randomize, speed_str, *slider_values)
         if audio_np is None:
             print("Audio generation failed.")
-            return None, None, None
-        # Prepare audio for Gradio
-        sr = 24000  # Adjust based on your actual sampling rate
         audio_tuple = (sr, audio_np)
-        # Return audio, image, and style vector
         return audio_tuple, final_vec.tolist()
     except Exception as e:
-        print(f"Error generating audio and style plot: {e}")
-        return None, None, None
 def save_style_to_json(style_data, style_name):
-    """Saves the provided style_data (list of floats) into voices.json under style_name."""
     if not style_name.strip():
         return "Please enter a new style name before saving."
@@ -291,37 +252,37 @@ def save_style_to_json(style_data, style_name):
             f"Style name '{style_name}' already exists. Please choose a different name."
         )
-    # Ensure the style_data has the correct length
     if len(style_data) != VECTOR_DIMENSION:
         return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
-    # Save the style vector
     voices_data[style_name] = style_data
     save_voices_json(voices_data)
     return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
-# Gradio Interface Functions
 def rearrange_voices(new_order):
-    """Rearrange the voices based on the new_order list."""
     voices_data = load_voices_json()
     new_order_list = [name.strip() for name in new_order.split(",")]
     if not all(name in voices_data for name in new_order_list):
         return "Error: New order contains invalid voice names.", list(
             voices_data.keys()
         )
     ordered_data = OrderedDict()
     for name in new_order_list:
         ordered_data[name] = voices_data[name]
     save_voices_json(ordered_data)
     print(f"Voices rearranged: {list(ordered_data.keys())}")
     return "Voices rearranged successfully.", list(ordered_data.keys())
 def delete_voice(selected):
-    """Delete voices from the voices.json."""
     if not selected:
         return "No voices selected for deletion.", list(load_voices_json().keys())
     voices_data = load_voices_json()
@@ -334,14 +295,15 @@ def delete_voice(selected):
 def upload_new_voices(uploaded_file):
-    """Upload new voices from a JSON file."""
     if uploaded_file is None:
         return "No file uploaded.", list(load_voices_json().keys())
     try:
         uploaded_data = json.load(uploaded_file)
         if not isinstance(uploaded_data, dict):
-            return "Invalid JSON format. Expected a dictionary of voices.", list(
-                load_voices_json().keys()
             )
         voices_data = load_voices_json()
         voices_data.update(uploaded_data)
@@ -352,10 +314,13 @@ def upload_new_voices(uploaded_file):
         return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
-# Create Gradio Interface with Tabs
 def create_combined_interface():
     voices_data = load_voices_json()
     voice_choices = list(voices_data.keys())
     default_voice = voice_choices[0] if voice_choices else None
@@ -367,16 +332,12 @@ def create_combined_interface():
     }
     """
-    def refresh_voices():
-        """Refresh the voices by reloading the JSON."""
-        new_choices = list(load_voices_json().keys())
-        print(f"Voices refreshed: {new_choices}")
-        return gr.Dropdown(choices=new_choices)
     with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
         gr.Markdown("# StyleTTS2 Studio - Build custom voices")
-        # ----------- Text-to-Speech Tab -----------
         with gr.Tab("Text-to-Speech"):
             gr.Markdown("### Generate Speech with Predefined Voices")
@@ -399,28 +360,29 @@ def create_combined_interface():
                     label="Speed (%)",
                     value=120,
                 )
-                with gr.Row():
-                    generate_btn = gr.Button("Generate Audio")
             audio_output = gr.Audio(label="Synthesized Audio")
-            # Generate button functionality
             def on_generate_tts(text, voice, speed):
                 if not voice:
                     return None, "No voice selected."
                 speed_val = speed / 100  # Convert percentage to multiplier
-                audio, style_vector = generate_audio_with_voice(text, voice, speed_val)
-                if audio is None:
-                    return None, style_vector  # style_vector contains the error message
-                return audio, "Audio generated successfully."
             generate_btn.click(
                 fn=on_generate_tts,
                 inputs=[text_input, voice_dropdown, speed_slider],
-                outputs=[audio_output, gr.Textbox(label="Status", visible=False)],
             )
-        # ----------- Voice Studio Tab -----------
         with gr.Tab("Voice Studio"):
             gr.Markdown("### Customize and Create New Voices")
@@ -463,18 +425,16 @@ def create_combined_interface():
             # State to hold the last style vector
             style_vector_state_studio = gr.State()
-            # Generate button functionality
             def on_generate_studio(text, voice, speed, *pca_values):
                 if not voice:
                     return None, "No voice selected.", None
-                speed_val = speed / 100  # Convert percentage to multiplier
-                result = generate_custom_audio(
                     text, voice, False, speed_val, *pca_values
                 )
-                if result is None:
                     return None, "Failed to generate audio.", None
-                audio_tuple, style_vector = result
-                style_vector_state_studio.value = style_vector
                 return audio_tuple, "Audio generated successfully.", style_vector
             generate_btn_studio.click(
@@ -484,43 +444,56 @@ def create_combined_interface():
                 outputs=[audio_output_studio, status_text, style_vector_state_studio],
             )
             def on_save_style_studio(style_vector, style_name):
-                if not style_name:
                     return (
-                        "Please enter a name for the new voice!",
-                        gr.Dropdown(
-                            choices=[]
-                        ),  # Return a new Dropdown instance with empty choices
-                        gr.Dropdown(
-                            choices=[]
-                        ),  # Return a new Dropdown instance with empty choices
                     )
                 result = save_style_to_json(style_vector, style_name)
                 new_choices = list(load_voices_json().keys())
-                # Return multiple values to update both dropdowns and show status
                 return (
-                    gr.Dropdown(
-                        choices=new_choices
-                    ),  # Return a new Dropdown instance with updated choices
-                    gr.Dropdown(
-                        choices=new_choices
-                    ),  # Return a new Dropdown instance with updated choices
-                    result,  # Status message
                 )
             save_btn_studio.click(
                 fn=on_save_style_studio,
                 inputs=[style_vector_state_studio, new_style_name],
-                outputs=[voice_dropdown, voice_dropdown_studio, status_text],
             )
-            # Add callback to update sliders when a voice is selected
             voice_dropdown_studio.change(
                 fn=update_sliders,
                 inputs=voice_dropdown_studio,
                 outputs=pca_sliders,
             )
         gr.Markdown(
             "#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
         )
@@ -531,6 +504,6 @@ def create_combined_interface():
 if __name__ == "__main__":
     try:
         interface = create_combined_interface()
-        interface.launch(share=False)
     except Exception as e:
         print(f"An error occurred while launching the interface: {e}")

 matplotlib.use("Agg")  # Use non-interactive backend
 import matplotlib.pyplot as plt
+# -------------------------------------------------------------------
+# IMPORT OR DEFINE YOUR TEXT-TO-SPEECH FUNCTIONS
+# (Adjust these imports to match your local TTS code)
+# -------------------------------------------------------------------
 from text2speech import tts_randomized, parse_speed, tts_with_style_vector
 # Constants and Paths
 # LOAD PCA MODEL AND ANNOTATED FEATURES
 ##############################################################################
 try:
     pca = joblib.load(PCA_MODEL_PATH)
     print("PCA model loaded successfully.")
 def update_sliders(voice_name):
     """
     Update slider values based on the selected predefined voice using reverse PCA.
+    Returns a list of PCA component values to set the sliders.
     """
     if not voice_name:
         # Return default slider values (e.g., zeros) if no voice is selected
 def generate_audio_with_voice(text, voice_key, speed_val):
     """
     Generate audio using the style vector of the selected predefined voice.
+    Returns (audio_tuple, style_vector) or (None, error_message).
     """
     try:
         # Load voices data
         voices_data = load_voices_json()
         if voice_key not in voices_data:
+            msg = f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}."
+            print(msg)
+            return None, msg
         style_vector = np.array(voices_data[voice_key], dtype=np.float32).reshape(1, -1)
         print(f"Selected Voice: {voice_key}")
         print(f"Style Vector (First 6): {style_vector[0][:6]}")
         # Convert to torch tensor and move to device
         style_vec_torch = torch.from_numpy(style_vector).float().to(device)
+        # Generate audio
         audio_np = tts_with_style_vector(
             text,
             style_vec=style_vec_torch,
         )
         if audio_np is None:
+            msg = "Audio generation failed."
+            print(msg)
+            return None, msg
+        sr = 24000
         audio_tuple = (sr, audio_np)
         return audio_tuple, style_vector.tolist()
     except Exception as e:
 def build_modified_vector(voice_key, top6_values):
+    """Reconstruct a style vector by applying inverse PCA on the given 6 slider values."""
     voices_data = load_voices_json()
     if voice_key not in voices_data:
         print(f"Voice '{voice_key}' not found in {VOICES_JSON_PATH}.")
         return None
     try:
         pca_components = np.array(top6_values).reshape(1, -1)
         reconstructed_vec = pca.inverse_transform(pca_components)[0]
         return reconstructed_vec
         return None
+def generate_custom_audio(text, voice_key, randomize, speed_val, *slider_values):
     """
+    Generate audio with either a random style vector or a reconstructed vector
+    from the 6 PCA sliders. Returns (audio_tuple, style_vector) or (None, None).
     """
     try:
         if randomize:
             # Generate randomized style vector
             audio_np, random_style_vec = tts_randomized(text, speed=speed_val)
             if random_style_vec is None:
                 print("Failed to generate randomized style vector.")
+                return None, None
             final_vec = (
                 random_style_vec.cpu().numpy().flatten()
                 if isinstance(random_style_vec, torch.Tensor)
             )
             print("Randomized Style Vector (First 6):", final_vec[:6])
         else:
+            # Reconstruct vector from PCA sliders
             reconstructed_vec = build_modified_vector(voice_key, slider_values)
             if reconstructed_vec is None:
+                print("No reconstructed vector. Skipping audio generation.")
+                return None, None
             style_vec_torch = (
                 torch.from_numpy(reconstructed_vec).float().unsqueeze(0).to(device)
             )
             audio_np = tts_with_style_vector(
                 text,
                 style_vec=style_vec_torch,
         if audio_np is None:
             print("Audio generation failed.")
+            return None, None
+        sr = 24000
         audio_tuple = (sr, audio_np)
         return audio_tuple, final_vec.tolist()
     except Exception as e:
+        print(f"Error generating audio and style: {e}")
+        return None, None
 def save_style_to_json(style_data, style_name):
+    """
+    Saves the provided style_data (list of floats) into voices.json under style_name.
+    Returns a status message.
+    """
     if not style_name.strip():
         return "Please enter a new style name before saving."
             f"Style name '{style_name}' already exists. Please choose a different name."
         )
     if len(style_data) != VECTOR_DIMENSION:
         return f"Style vector length mismatch. Expected {VECTOR_DIMENSION}, got {len(style_data)}."
     voices_data[style_name] = style_data
     save_voices_json(voices_data)
     return f"Saved style as '{style_name}' in {VOICES_JSON_PATH}."
 def rearrange_voices(new_order):
+    """
+    Rearrange the voices in voices.json based on the comma-separated `new_order`.
+    Returns (status_msg, updated_list_of_voices).
+    """
     voices_data = load_voices_json()
     new_order_list = [name.strip() for name in new_order.split(",")]
     if not all(name in voices_data for name in new_order_list):
         return "Error: New order contains invalid voice names.", list(
             voices_data.keys()
         )
     ordered_data = OrderedDict()
     for name in new_order_list:
         ordered_data[name] = voices_data[name]
     save_voices_json(ordered_data)
     print(f"Voices rearranged: {list(ordered_data.keys())}")
     return "Voices rearranged successfully.", list(ordered_data.keys())
 def delete_voice(selected):
+    """Delete voices from the voices.json. Returns (status_msg, updated_list_of_voices)."""
     if not selected:
         return "No voices selected for deletion.", list(load_voices_json().keys())
     voices_data = load_voices_json()
 def upload_new_voices(uploaded_file):
+    """Upload new voices from a JSON file. Returns (status_msg, updated_list_of_voices)."""
     if uploaded_file is None:
         return "No file uploaded.", list(load_voices_json().keys())
     try:
         uploaded_data = json.load(uploaded_file)
         if not isinstance(uploaded_data, dict):
+            return (
+                "Invalid JSON format. Expected a dictionary of voices.",
+                list(load_voices_json().keys()),
             )
         voices_data = load_voices_json()
         voices_data.update(uploaded_data)
         return "Uploaded file is not valid JSON.", list(load_voices_json().keys())
+# -------------------------------------------------------------------
+# GRADIO INTERFACE
+# -------------------------------------------------------------------
 def create_combined_interface():
+    # We'll initially load the voices to get a default set for the dropdown
     voices_data = load_voices_json()
     voice_choices = list(voices_data.keys())
     default_voice = voice_choices[0] if voice_choices else None
     }
     """
     with gr.Blocks(theme=gr.themes.Ocean(), css=css) as demo:
         gr.Markdown("# StyleTTS2 Studio - Build custom voices")
+        # -------------------------------------------------------
+        # 1) Text-to-Speech Tab
+        # -------------------------------------------------------
         with gr.Tab("Text-to-Speech"):
             gr.Markdown("### Generate Speech with Predefined Voices")
                     label="Speed (%)",
                     value=120,
                 )
+                generate_btn = gr.Button("Generate Audio")
+                status_tts = gr.Textbox(label="Status", visible=False)
             audio_output = gr.Audio(label="Synthesized Audio")
+            # Generate TTS callback
             def on_generate_tts(text, voice, speed):
                 if not voice:
                     return None, "No voice selected."
                 speed_val = speed / 100  # Convert percentage to multiplier
+                audio_result, msg = generate_audio_with_voice(text, voice, speed_val)
+                if audio_result is None:
+                    return None, msg
+                return audio_result, "Audio generated successfully."
             generate_btn.click(
                 fn=on_generate_tts,
                 inputs=[text_input, voice_dropdown, speed_slider],
+                outputs=[audio_output, status_tts],
             )
+        # -------------------------------------------------------
+        # 2) Voice Studio Tab
+        # -------------------------------------------------------
         with gr.Tab("Voice Studio"):
             gr.Markdown("### Customize and Create New Voices")
             # State to hold the last style vector
             style_vector_state_studio = gr.State()
+            # Generate customized audio callback
             def on_generate_studio(text, voice, speed, *pca_values):
                 if not voice:
                     return None, "No voice selected.", None
+                speed_val = speed / 100
+                audio_tuple, style_vector = generate_custom_audio(
                     text, voice, False, speed_val, *pca_values
                 )
+                if audio_tuple is None:
                     return None, "Failed to generate audio.", None
                 return audio_tuple, "Audio generated successfully.", style_vector
             generate_btn_studio.click(
                 outputs=[audio_output_studio, status_text, style_vector_state_studio],
             )
+            # Save customized voice callback
             def on_save_style_studio(style_vector, style_name):
+                """Save the new style, then update the dropdown choices."""
+                if not style_vector or not style_name:
                     return (
+                        gr.update(value="Please enter a name for the new voice!"),
+                        gr.update(),
+                        gr.update(),
                     )
+                # Save the style
                 result = save_style_to_json(style_vector, style_name)
+                # Reload the voices to get the new list
                 new_choices = list(load_voices_json().keys())
+                # Return dictionary updates to existing components
                 return (
+                    gr.update(value=result),
+                    gr.update(choices=new_choices),
+                    gr.update(choices=new_choices),
                 )
             save_btn_studio.click(
                 fn=on_save_style_studio,
                 inputs=[style_vector_state_studio, new_style_name],
+                # We update: status_text, voice_dropdown, voice_dropdown_studio
+                outputs=[status_text, voice_dropdown, voice_dropdown_studio],
             )
+            # Update sliders callback
             voice_dropdown_studio.change(
                 fn=update_sliders,
                 inputs=voice_dropdown_studio,
                 outputs=pca_sliders,
             )
+        # -------------------------------------------------------
+        # Optionally: Reload voices on page load
+        # -------------------------------------------------------
+        def on_page_load():
+            new_choices = list(load_voices_json().keys())
+            return {
+                voice_dropdown: gr.update(choices=new_choices),
+                voice_dropdown_studio: gr.update(choices=new_choices),
+            }
+        # This automatically refreshes dropdowns every time the user loads/refreshes the page
+        demo.load(
+            on_page_load, inputs=None, outputs=[voice_dropdown, voice_dropdown_studio]
+        )
         gr.Markdown(
             "#### Based on [StyleTTS2](https://github.com/yl4579/StyleTTS2) and [artificial StyleTTS2](https://huggingface.co/dkounadis/artificial-styletts2/tree/main)"
         )
 if __name__ == "__main__":
     try:
         interface = create_combined_interface()
+        interface.launch(share=False)  # or share=True if you want a public share link
     except Exception as e:
         print(f"An error occurred while launching the interface: {e}")

voices.json CHANGED Viewed

@@ -2837,4 +2837,4 @@
     0.057131367030820654,
     -0.0762246848122452
   ]
-}

     0.057131367030820654,
     -0.0762246848122452
   ]
+}