Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import os | |
| from kokoro import generate | |
| from models import build_model | |
# Initialize model and device: use the GPU when torch reports one, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = build_model('kokoro-v0_19.pth', device)

# Names of the voice packs stored as voices/<name>.pt. The tuple order is the
# insertion order of the `voices` dict built below.
VOICE_NAMES = (
    'af',
    'af_bella',
    'af_sarah',
    'am_adam',
    'am_michael',
    'bf_emma',
    'bf_isabella',
    'bm_george',
    'bm_lewis',
    'af_nicole',
    'af_sky',
)

# Load the voice models once at import time, keyed by voice name.
# They are not moved to `device` here; get_new_voice() does that for the
# blended result.
voices = {
    name: torch.load(f"voices/{name}.pt", weights_only=True)
    for name in VOICE_NAMES
}
def parse_voice_formula(formula, voice_table=None):
    """Parse a voice formula string and return the weighted sum of voices.

    The formula is a '+'-separated list of terms, each written as
    "weight * voice_name", e.g. "0.333 * af + 0.667 * af_bella".

    Args:
        formula: The formula string to parse.
        voice_table: Optional mapping of voice name to voice value. Defaults
            to the module-level ``voices`` dict.

    Returns:
        The sum of ``weight * voice_table[voice_name]`` over all terms.

    Raises:
        ValueError: If the formula is empty, a term is not of the form
            "weight * voice_name", a weight is not a number, or a voice name
            is not present in the table.
    """
    if voice_table is None:
        voice_table = voices

    if not formula.strip():
        raise ValueError("Empty voice formula")

    weighted_sum = None
    for term in formula.split('+'):
        # partition() never raises, so a missing '*' can be reported clearly
        # instead of surfacing as a tuple-unpacking error.
        weight_text, separator, voice_name = term.partition('*')
        weight_text = weight_text.strip()
        voice_name = voice_name.strip()
        if not separator or not weight_text or not voice_name:
            raise ValueError(
                f"Malformed term {term.strip()!r}: expected 'weight * voice_name'"
            )

        try:
            weight = float(weight_text)
        except ValueError:
            raise ValueError(
                f"Invalid weight {weight_text!r} in term {term.strip()!r}"
            ) from None

        if voice_name not in voice_table:
            raise ValueError(f"Unknown voice: {voice_name}")

        # Build a new value on every step so the stored voices are never
        # modified in place.
        contribution = weight * voice_table[voice_name]
        if weighted_sum is None:
            weighted_sum = contribution
        else:
            weighted_sum = weighted_sum + contribution

    return weighted_sum
def get_new_voice(formula):
    """Build the blended voice described by ``formula`` and move it to ``device``.

    Args:
        formula: Voice formula string accepted by parse_voice_formula().

    Returns:
        The combined voice tensor, placed on the module-level ``device``.

    Raises:
        gr.Error: If the formula cannot be parsed or the tensor cannot be
            moved; the message carries the underlying error text.
    """
    try:
        # Parse the formula and get the combined voice tensor
        weighted_voices = parse_voice_formula(formula)
        # Move the tensor directly instead of saving it to a fixed file name
        # and loading it back: a shared file lets concurrent requests read
        # each other's voice, and reloading with weights_only=False would
        # unpickle arbitrary objects for no benefit.
        return weighted_voices.to(device)
    except Exception as e:
        raise gr.Error(f"Failed to create voice: {str(e)}") from e
def text_to_speech(text, formula):
    """Synthesize ``text`` with the voice blend described by ``formula``.

    Args:
        text: The text to speak.
        formula: Voice formula string, e.g. "0.4 * af + 0.6 * af_bella".

    Returns:
        A ``(24000, audio)`` tuple, where ``audio`` is what ``generate``
        returns for the synthesized speech.

    Raises:
        gr.Error: If either input is blank, if the voice cannot be built, or
            if generation fails.
    """
    # Validate outside the try block so these messages reach the user as
    # written rather than wrapped in "Failed to generate speech: ...".
    if not text or not text.strip():
        raise gr.Error("Please enter some text")
    if not formula or not formula.strip():
        raise gr.Error("Please select at least one voice")

    try:
        # Get the combined voice
        voicepack = get_new_voice(formula)
        # Generate audio; the phoneme output is not used here.
        audio, _phonemes = generate(MODEL, text, voicepack, lang='a')
    except gr.Error:
        # Already a user-facing error (e.g. from get_new_voice); do not
        # wrap it a second time.
        raise
    except Exception as e:
        raise gr.Error(f"Failed to generate speech: {str(e)}") from e

    return (24000, audio)
# Stylesheet handed to gr.Blocks(css=...). It styles the heading/description
# banner, lays the per-voice columns (.container-wrap / .vert-group) out in a
# flex row, rotates each range input by 90 degrees, and themes the
# #generate-btn button.
# NOTE(review): .slider_input_container, .min_value, .max_value and
# .tab-like-container are not assigned anywhere in this file; presumably they
# are class names from Gradio's own slider markup — confirm against the
# installed Gradio version.
# NOTE(review): `.heading` and `.description` are each declared twice; the
# second `.description` rule sets `color` and `margin-bottom` again with
# !important and presumably wins over the first — confirm this is intended.
custom_css = """
/* Main title */
.heading {
color: rgb(76, 175, 147) !important;
font-size: 2em !important;
font-weight: 600 !important;
text-align: center !important;
margin: 20px 0 10px 0 !important;
width: 100% !important;
}
/* Description text - Dark mode */
.description {
color: var(--body-text-color, rgba(76, 175, 147, 0.7)) !important;
text-align: center !important;
max-width: 800px !important;
margin: 0 auto 30px auto !important;
font-size: 0.9em !important;
line-height: 1.6 !important;
}
/* Description text - Light mode override */
.light-mode .description,
[data-theme="light"] .description {
color: rgba(55, 65, 81, 0.9) !important;
}
/* Description text - Bold elements in light mode */
.light-mode .description b,
[data-theme="light"] .description b {
color: rgb(55, 65, 81) !important;
font-weight: 600 !important;
}
.container-wrap {
display: flex !important;
gap: 5px !important;
justify-content: center !important;
margin: 0 auto !important;
max-width: 1400px !important; /* Increased max-width */
}
.vert-group {
min-width: 100px !important; /* Increased from 80px */
width: 105px !important; /* Increased from 90px */
flex: 0 0 auto !important;
}
.vert-group label {
white-space: nowrap !important;
overflow: visible !important;
width: auto !important;
font-size: 0.85em !important; /* Slightly increased font size */
transform-origin: left center !important;
transform: rotate(0deg) translateX(-50%) !important;
position: relative !important;
left: 50% !important;
display: inline-block !important;
text-align: center !important;
margin-bottom: 5px !important;
padding: 0 1px !important; /* Added padding */
}
.vert-group .wrap label {
text-align: center !important;
width: 100% !important;
display: block !important;
}
/* Hover effect */
.vert-group:hover {
transform: translateY(-5px) !important;
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2) !important;
}
.slider_input_container {
height: 200px !important;
position: relative !important;
width: 50px !important; /* Increased from 40px */
margin: 0 auto !important;
overflow: hidden !important;
}
.slider_input_container input[type="range"] {
position: absolute !important;
width: 200px !important;
left: -75px !important; /* Adjusted from -80px */
top: 100px !important;
transform: rotate(90deg) !important;
}
.min_value {
position: absolute !important;
bottom: 0 !important;
left: 10px !important;
}
.max_value {
position: absolute !important;
top: 0 !important;
left: 10px !important;
}
.tab-like-container {
transform: scale(0.8) !important;
}
.gradio-row, .gradio-column {
background: none !important;
border: none !important;
min-width: unset !important;
}
.heading {
text-align: center !important;
margin-bottom: 1rem !important;
}
.description {
text-align: center !important;
margin-bottom: 2rem !important;
color: rgba(255, 255, 255, 0.7) !important;
}
/* Generate button */
#generate-btn {
background: linear-gradient(90deg, rgb(76, 175, 147), rgb(76, 147, 175)) !important;
border: none !important;
border-radius: 8px !important;
padding: 12px 24px !important;
color: white !important;
font-weight: 600 !important;
transition: transform 0.2s, box-shadow 0.2s !important;
}
#generate-btn:hover {
transform: translateY(-2px) !important;
box-shadow: 0 5px 15px rgba(76, 175, 147, 0.3) !important;
}
"""
# Build the UI: one checkbox + slider column per voice, a read-only formula
# box derived from them, a text input, a generate button and an audio output.
with gr.Blocks(css=custom_css, theme="ocean") as demo:
    gr.HTML(
        """
        <div class="heading">🎙️ AI Voice Mixer Studio - Kokoro TTS</div>
        <div class="description">
        <b>Mix and match different voices to create your perfect text-to-speech voice.<br>Each slider represents a
        unique voice with distinct characteristics. This app lets you combine multiple voices with different weights
        to create custom voice combinations. Select voices using checkboxes and adjust their weights using the sliders below!</b>
        </div>
        """
    )

    with gr.Row(variant="default", equal_height=True, elem_classes="container-wrap"):
        checkboxes = []
        sliders = []
        # Slider configurations as (voice name used in formulas, display label).
        slider_configs = [
            ("af", "Default 👩🦰"),
            ("af_bella", "Bella 👩🦰🇺🇸"),
            ("af_sarah", "Sarah 👩🦰🇺🇸"),
            ("af_nicole", "Nicole 👩🦰🇺🇸"),
            ("af_sky", "Sky 👩🦰🇺🇸"),
            ("am_adam", "Adam 👨🇺🇸"),
            ("am_michael", "Michael 👨🇺🇸"),
            ("bf_emma", "Emma 👩🦰🇬🇧"),
            ("bf_isabella", "Isabella 👩🦰🇬🇧"),
            ("bm_george", "George 👨🇬🇧"),
            ("bm_lewis", "Lewis 👨🇬🇧"),
        ]
        # Create one column per voice. Sliders start disabled at 0 and are
        # enabled by their checkbox (see check_box below).
        for value, label in slider_configs:
            with gr.Column(min_width=70, scale=1, variant="default", elem_classes="vert-group"):
                checkbox = gr.Checkbox(label='')
                slider = gr.Slider(label=label, minimum=0, maximum=1, interactive=False, value=0, step=0.01)
                checkboxes.append(checkbox)
                sliders.append(slider)

    # Add voice combination formula display
    with gr.Row(equal_height=True):
        formula_display = gr.Textbox(
            label="Voice Combination Formula",
            value="",
            lines=2,
            scale=4,
            interactive=False,
            placeholder="This will begin to display immediately once any of the voice checkboxes is selected",
            info="Slider values are normalized to create this voice formula. Use the Sliders to intuitively increase or decrease a voice effect."
        )
        input_text = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech",
            lines=2,
            scale=4
        )
        button_tts = gr.Button("🎙️ Generate Voice", scale=2, min_width=100, elem_id="generate-btn")

    # Generate speech from the selected custom voice
    with gr.Row(equal_height=True):
        kokoro_tts = gr.Audio(label="Generated Speech", type="numpy", autoplay=True)

    def generate_voice_formula(*values):
        """
        Generate a formatted string showing the voice combination.

        ``values`` holds every checkbox value followed by every slider value,
        in ``slider_configs`` order. Only voices whose checkbox is ticked take
        part. A single voice keeps its raw slider value; several voices are
        normalized so their weights sum to 1.

        Returns: String like "0.6 * voice1" or "0.4 * voice1 + 0.6 * voice2",
        or "" when nothing is selected or all selected weights are 0.
        """
        n = len(values) // 2
        checkbox_values = values[:n]
        slider_values = values[n:]

        # Pair each ticked voice's weight with the name used in formulas.
        active_pairs = [
            (weight, voice_name)
            for checked, weight, (voice_name, _label)
            in zip(checkbox_values, slider_values, slider_configs)
            if checked
        ]

        # A zero total (including "nothing selected") would describe an
        # all-zero voice, so show no formula instead.
        total_sum = sum(weight for weight, _ in active_pairs)
        if total_sum == 0:
            return ""

        # If only one voice is selected, use its actual value
        if len(active_pairs) == 1:
            weight, voice_name = active_pairs[0]
            return f"{weight:.3f} * {voice_name}"

        # Generate normalized formula for multiple voices
        return " + ".join(
            f"{weight / total_sum:.3f} * {voice_name}"
            for weight, voice_name in active_pairs
        )

    def check_box(checkbox):
        """Enable the paired slider at full weight when ticked; reset and lock it otherwise."""
        if checkbox:
            return gr.Slider(interactive=True, value=1.0)
        return gr.Slider(interactive=False, value=0)

    # Every checkbox followed by every slider: the argument order
    # generate_voice_formula expects.
    all_inputs = checkboxes + sliders

    # Update on checkbox changes
    for checkbox, slider in zip(checkboxes, sliders):
        checkbox.change(
            fn=check_box,
            inputs=[checkbox],
            outputs=[slider]
        )
        # Update formula on checkbox changes
        checkbox.change(
            fn=generate_voice_formula,
            inputs=all_inputs,
            outputs=[formula_display]
        )

    # Update formula on slider changes
    for slider in sliders:
        slider.change(
            fn=generate_voice_formula,
            inputs=all_inputs,
            outputs=[formula_display]
        )

    button_tts.click(
        fn=text_to_speech,
        inputs=[input_text, formula_display],
        outputs=[kokoro_tts]
    )
# Start the Gradio app only when this file is run as a script, not when it is
# imported.
if __name__ == "__main__":
    demo.launch()