#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
| import math | |
| import torch | |
| import gradio as gr | |
| torch.set_num_threads(1) | |
| torch.set_num_interop_threads(1) | |
| from config import ( | |
| AVAILABLE_VOICES, | |
| DEFAULT_VOICE, | |
| DEFAULT_MODEL_VARIANT, | |
| DEFAULT_TEMPERATURE, | |
| DEFAULT_LSD_DECODE_STEPS, | |
| DEFAULT_EOS_THRESHOLD, | |
| DEFAULT_NOISE_CLAMP, | |
| DEFAULT_FRAMES_AFTER_EOS, | |
| MAXIMUM_INPUT_LENGTH, | |
| VOICE_MODE_PRESET, | |
| VOICE_MODE_CLONE, | |
| EXAMPLE_PROMPTS | |
| ) | |
| from src.core.authentication import authenticate_huggingface | |
| authenticate_huggingface() | |
| from src.core.memory import start_background_cleanup_thread | |
| start_background_cleanup_thread() | |
| from src.generation.handler import ( | |
| perform_speech_generation, | |
| request_generation_stop | |
| ) | |
| from src.ui.state import ( | |
| check_generate_button_state, | |
| calculate_character_count_display, | |
| determine_clear_button_visibility, | |
| update_voice_mode_visibility | |
| ) | |
| from src.ui.handlers import ( | |
| switch_to_generating_state, | |
| switch_to_idle_state, | |
| perform_clear_action, | |
| create_example_handler, | |
| format_example_button_label | |
| ) | |
| from assets.css.styles import CSS | |
| from assets.static.title import TITLE | |
| from assets.static.header import HEADER | |
| from assets.static.footer import FOOTER | |
| from assets.static.sidebar import SIDEBAR | |
| with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app: | |
| ui_state = gr.State({"generating": False}) | |
| with gr.Sidebar(): | |
| gr.HTML(SIDEBAR()) | |
| with gr.Column(elem_classes="header-section"): | |
| gr.HTML(TITLE()) | |
| gr.HTML(HEADER()) | |
| with gr.Row(): | |
| with gr.Column(): | |
| audio_output_component = gr.Audio( | |
| label="Generated Speech Output", | |
| type="filepath", | |
| interactive=False, | |
| autoplay=False | |
| ) | |
| with gr.Accordion("Voice Selection", open=True): | |
| voice_mode_radio = gr.Radio( | |
| label="Voice Mode", | |
| choices=[ | |
| VOICE_MODE_PRESET, | |
| VOICE_MODE_CLONE | |
| ], | |
| value=VOICE_MODE_PRESET, | |
| info="Choose between preset voices or clone a voice from uploaded audio", | |
| elem_id="voice-mode" | |
| ) | |
| with gr.Column(visible=True) as preset_voice_container: | |
| voice_preset_dropdown = gr.Dropdown( | |
| label="Select Preset Voice", | |
| choices=AVAILABLE_VOICES, | |
| value=DEFAULT_VOICE | |
| ) | |
| with gr.Column(visible=False) as clone_voice_container: | |
| voice_clone_audio_input = gr.Audio( | |
| label="Upload Audio for Voice Cloning", | |
| type="filepath" | |
| ) | |
| with gr.Accordion("Model Parameters", open=False): | |
| with gr.Row(): | |
| temperature_slider = gr.Slider( | |
| label="Temperature", | |
| minimum=0.1, | |
| maximum=2.0, | |
| step=0.05, | |
| value=DEFAULT_TEMPERATURE, | |
| info="Higher values produce more expressive speech" | |
| ) | |
| lsd_decode_steps_slider = gr.Slider( | |
| label="LSD Decode Steps", | |
| minimum=1, | |
| maximum=20, | |
| step=1, | |
| value=DEFAULT_LSD_DECODE_STEPS, | |
| info="More steps may improve quality but slower" | |
| ) | |
| with gr.Row(): | |
| noise_clamp_slider = gr.Slider( | |
| label="Noise Clamp", | |
| minimum=0.0, | |
| maximum=2.0, | |
| step=0.05, | |
| value=DEFAULT_NOISE_CLAMP, | |
| info="Maximum noise sampling value (0 = disabled)" | |
| ) | |
| eos_threshold_slider = gr.Slider( | |
| label="End of Sequence Threshold", | |
| minimum=-10.0, | |
| maximum=0.0, | |
| step=0.25, | |
| value=DEFAULT_EOS_THRESHOLD, | |
| info="Smaller values cause earlier completion" | |
| ) | |
| with gr.Accordion("Advanced Settings", open=False): | |
| model_variant_textbox = gr.Textbox( | |
| label="Model Variant Identifier", | |
| value=DEFAULT_MODEL_VARIANT, | |
| info="Model signature for generation" | |
| ) | |
| with gr.Row(): | |
| enable_custom_frames_checkbox = gr.Checkbox( | |
| label="Enable Custom Frames After EOS", | |
| value=False, | |
| info="Manually control post-EOS frame generation" | |
| ) | |
| frames_after_eos_slider = gr.Slider( | |
| label="Frames After EOS", | |
| minimum=0, | |
| maximum=100, | |
| step=1, | |
| value=DEFAULT_FRAMES_AFTER_EOS, | |
| info="Additional frames after end-of-sequence (80ms per frame)" | |
| ) | |
| with gr.Column(scale=1): | |
| text_input_component = gr.Textbox( | |
| label="Prompt", | |
| placeholder="Enter the text you want to convert to speech...", | |
| lines=2, | |
| max_lines=20, | |
| max_length=MAXIMUM_INPUT_LENGTH, | |
| autoscroll=True | |
| ) | |
| character_count_display = gr.HTML( | |
| f"<div style='text-align: right; padding: 4px 0;'><span style='color: var(--body-text-color-subdued); font-size: 0.85em;'>0 / {MAXIMUM_INPUT_LENGTH}</span></div>", | |
| visible=False | |
| ) | |
| generate_button = gr.Button( | |
| "Generate", | |
| variant="primary", | |
| size="lg", | |
| interactive=False | |
| ) | |
| stop_button = gr.Button( | |
| "Stop", | |
| variant="stop", | |
| size="lg", | |
| visible=False | |
| ) | |
| clear_button = gr.Button( | |
| "Clear", | |
| variant="secondary", | |
| size="lg", | |
| visible=False | |
| ) | |
| gr.HTML( | |
| """ | |
| <div style="padding: 16px 0 8px 0;"> | |
| <h3 style="margin: 0 0 8px 0; font-size: 1.1em;">Example Prompts</h3> | |
| <p style="margin: 0; opacity: 0.7; font-size: 0.9em;">Click any example to generate speech with its assigned voice</p> | |
| </div> | |
| """ | |
| ) | |
| example_buttons_list = [] | |
| num_examples = len(EXAMPLE_PROMPTS) | |
| examples_per_row = 2 | |
| num_rows = math.ceil(num_examples / examples_per_row) | |
| for row_idx in range(num_rows): | |
| with gr.Row(): | |
| start_idx = row_idx * examples_per_row | |
| end_idx = min(start_idx + examples_per_row, num_examples) | |
| for i in range(start_idx, end_idx): | |
| btn = gr.Button( | |
| format_example_button_label( | |
| EXAMPLE_PROMPTS[i]["text"], | |
| EXAMPLE_PROMPTS[i]["voice"] | |
| ), | |
| size="sm", | |
| variant="secondary" | |
| ) | |
| example_buttons_list.append(btn) | |
| gr.HTML(FOOTER()) | |
| generation_inputs = [ | |
| text_input_component, | |
| voice_mode_radio, | |
| voice_preset_dropdown, | |
| voice_clone_audio_input, | |
| model_variant_textbox, | |
| lsd_decode_steps_slider, | |
| temperature_slider, | |
| noise_clamp_slider, | |
| eos_threshold_slider, | |
| frames_after_eos_slider, | |
| enable_custom_frames_checkbox | |
| ] | |
| voice_mode_radio.change( | |
| fn=update_voice_mode_visibility, | |
| inputs=[voice_mode_radio], | |
| outputs=[ | |
| preset_voice_container, | |
| clone_voice_container | |
| ] | |
| ) | |
| text_input_component.change( | |
| fn=calculate_character_count_display, | |
| inputs=[text_input_component], | |
| outputs=[character_count_display] | |
| ) | |
| text_input_component.change( | |
| fn=check_generate_button_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[generate_button] | |
| ) | |
| text_input_component.change( | |
| fn=determine_clear_button_visibility, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[clear_button] | |
| ) | |
| generate_button.click( | |
| fn=switch_to_generating_state, | |
| inputs=[ui_state], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ).then( | |
| fn=perform_speech_generation, | |
| inputs=generation_inputs, | |
| outputs=[audio_output_component] | |
| ).then( | |
| fn=switch_to_idle_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ) | |
| stop_button.click( | |
| fn=request_generation_stop, | |
| outputs=[stop_button] | |
| ) | |
| clear_button.click( | |
| fn=perform_clear_action, | |
| outputs=[ | |
| text_input_component, | |
| audio_output_component, | |
| clear_button, | |
| voice_mode_radio, | |
| voice_preset_dropdown, | |
| voice_clone_audio_input | |
| ] | |
| ) | |
| for button_index, example_button in enumerate(example_buttons_list): | |
| example_text = EXAMPLE_PROMPTS[button_index]["text"] | |
| example_voice = EXAMPLE_PROMPTS[button_index]["voice"] | |
| example_button.click( | |
| fn=switch_to_generating_state, | |
| inputs=[ui_state], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ).then( | |
| fn=create_example_handler(example_text, example_voice), | |
| outputs=[ | |
| text_input_component, | |
| voice_mode_radio, | |
| voice_preset_dropdown | |
| ] | |
| ).then( | |
| fn=perform_speech_generation, | |
| inputs=generation_inputs, | |
| outputs=[audio_output_component] | |
| ).then( | |
| fn=switch_to_idle_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ) | |
| app.launch(server_name="0.0.0.0") |