#
# SPDX-FileCopyrightText: Hadad
# SPDX-License-Identifier: Apache-2.0
#
"""Gradio entry point for the text-to-speech demo.

Builds the Blocks UI (voice selection, model parameters, advanced settings,
example prompts), wires the generation/stop/clear event handlers, and
launches the server.  All heavy lifting is delegated to ``src.*`` modules.
"""

import atexit
import math

import torch
import gradio as gr

from config import (
    AVAILABLE_VOICES,
    DEFAULT_VOICE,
    DEFAULT_MODEL_VARIANT,
    DEFAULT_TEMPERATURE,
    DEFAULT_LSD_DECODE_STEPS,
    DEFAULT_EOS_THRESHOLD,
    DEFAULT_NOISE_CLAMP,
    DEFAULT_FRAMES_AFTER_EOS,
    MAXIMUM_INPUT_LENGTH,
    VOICE_MODE_PRESET,
    VOICE_MODE_CLONE,
    EXAMPLE_PROMPTS,
    ACCELERATOR_ENABLED,
    PYTORCH_COMPUTATION_THREADS,
    PYTORCH_INTEROP_THREADS
)

# Pin PyTorch thread pools before any model code runs; interop threads in
# particular must be set before the first parallel region is created.
torch.set_num_threads(PYTORCH_COMPUTATION_THREADS)
torch.set_num_interop_threads(PYTORCH_INTEROP_THREADS)

from src.core.authentication import authenticate_huggingface

# Authenticate with the Hugging Face Hub before any model download.
authenticate_huggingface()

if ACCELERATOR_ENABLED:
    from src.accelerator.client import start_accelerator_daemon, stop_accelerator_daemon

    # Best-effort startup: a missing daemon falls back to the Python path.
    accelerator_started = start_accelerator_daemon()
    if accelerator_started:
        print("Accelerator daemon started successfully", flush=True)
    else:
        print("Accelerator daemon not available, using Python fallback", flush=True)
    # Ensure the daemon is torn down on interpreter exit.
    atexit.register(stop_accelerator_daemon)

from src.core.memory import start_background_cleanup_thread

start_background_cleanup_thread()

from src.generation.handler import (
    perform_speech_generation,
    request_generation_stop
)
from src.ui.state import (
    check_generate_button_state,
    calculate_character_count_display,
    determine_clear_button_visibility,
    update_voice_mode_visibility
)
from src.ui.handlers import (
    switch_to_generating_state,
    switch_to_idle_state,
    perform_clear_action,
    create_example_handler,
    format_example_button_label
)
from assets.css.styles import CSS
from assets.static.title import TITLE
from assets.static.header import HEADER
from assets.static.footer import FOOTER
from assets.static.sidebar import SIDEBAR

with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:
    # Per-session flag used by the state helpers to gate the buttons.
    ui_state = gr.State({"generating": False})

    with gr.Sidebar():
        gr.HTML(SIDEBAR())

    with gr.Column(elem_classes="header-section"):
        gr.HTML(TITLE())
        gr.HTML(HEADER())

    with gr.Row():
        # Left column: audio output plus all generation controls.
        with gr.Column():
            audio_output_component = gr.Audio(
                label="Generated Speech Output",
                type="filepath",
                interactive=False
            )
            with gr.Accordion("Voice Selection", open=True):
                voice_mode_radio = gr.Radio(
                    label="Voice Mode",
                    choices=[
                        VOICE_MODE_PRESET,
                        VOICE_MODE_CLONE
                    ],
                    value=VOICE_MODE_PRESET,
                    info="Choose between preset voices or clone a voice from uploaded audio",
                    elem_id="voice-mode"
                )
                # Exactly one of these two containers is visible at a time;
                # visibility is toggled by update_voice_mode_visibility.
                with gr.Column(visible=True) as preset_voice_container:
                    voice_preset_dropdown = gr.Dropdown(
                        label="Select Preset Voice",
                        choices=AVAILABLE_VOICES,
                        value=DEFAULT_VOICE
                    )
                with gr.Column(visible=False) as clone_voice_container:
                    voice_clone_audio_input = gr.Audio(
                        label="Upload Audio for Voice Cloning",
                        type="filepath"
                    )
            with gr.Accordion("Model Parameters", open=False):
                with gr.Row():
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        minimum=0.1,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_TEMPERATURE,
                        info="Higher values produce more expressive speech"
                    )
                    lsd_decode_steps_slider = gr.Slider(
                        label="LSD Decode Steps",
                        minimum=1,
                        maximum=20,
                        step=1,
                        value=DEFAULT_LSD_DECODE_STEPS,
                        info="More steps may improve quality but slower"
                    )
                with gr.Row():
                    noise_clamp_slider = gr.Slider(
                        label="Noise Clamp",
                        minimum=0.0,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_NOISE_CLAMP,
                        info="Maximum noise sampling value (0 = disabled)"
                    )
                    eos_threshold_slider = gr.Slider(
                        label="End of Sequence Threshold",
                        minimum=-10.0,
                        maximum=0.0,
                        step=0.25,
                        value=DEFAULT_EOS_THRESHOLD,
                        info="Smaller values cause earlier completion"
                    )
            with gr.Accordion("Advanced Settings", open=False):
                model_variant_textbox = gr.Textbox(
                    label="Model Variant Identifier",
                    value=DEFAULT_MODEL_VARIANT,
                    info="Model signature for generation"
                )
                with gr.Row():
                    enable_custom_frames_checkbox = gr.Checkbox(
                        label="Enable Custom Frames After EOS",
                        value=False,
                        info="Manually control post-EOS frame generation"
                    )
                    frames_after_eos_slider = gr.Slider(
                        label="Frames After EOS",
                        minimum=0,
                        maximum=100,
                        step=1,
                        value=DEFAULT_FRAMES_AFTER_EOS,
                        info="Additional frames after end-of-sequence (80ms per frame)"
                    )
        # Right column: prompt entry, action buttons, and example prompts.
        with gr.Column(scale=1):
            text_input_component = gr.Textbox(
                label="Prompt",
                placeholder="Enter the text you want to convert to speech...",
                lines=2,
                max_lines=20,
                max_length=MAXIMUM_INPUT_LENGTH,
                autoscroll=True
            )
            # NOTE(review): markup inside these HTML strings appears stripped
            # in the recovered source — verify against the original assets.
            character_count_display = gr.HTML(
                f"""
0 / {MAXIMUM_INPUT_LENGTH}
""",
                visible=False
            )
            generate_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                interactive=False
            )
            stop_button = gr.Button(
                "Stop",
                variant="stop",
                size="lg",
                visible=False
            )
            clear_button = gr.Button(
                "Clear",
                variant="secondary",
                size="lg",
                visible=False
            )
            gr.HTML(
                """

Example Prompts

Click any example to generate speech with its assigned voice

"""
            )
            # Lay the example buttons out two per row.
            example_buttons_list = []
            num_examples = len(EXAMPLE_PROMPTS)
            examples_per_row = 2
            num_rows = math.ceil(num_examples / examples_per_row)
            for row_idx in range(num_rows):
                with gr.Row():
                    start_idx = row_idx * examples_per_row
                    end_idx = min(start_idx + examples_per_row, num_examples)
                    for i in range(start_idx, end_idx):
                        btn = gr.Button(
                            format_example_button_label(
                                EXAMPLE_PROMPTS[i]["text"],
                                EXAMPLE_PROMPTS[i]["voice"]
                            ),
                            size="sm",
                            variant="secondary"
                        )
                        example_buttons_list.append(btn)

    gr.HTML(FOOTER())

    # Inputs shared by the main Generate flow and every example button.
    generation_inputs = [
        text_input_component,
        voice_mode_radio,
        voice_preset_dropdown,
        voice_clone_audio_input,
        model_variant_textbox,
        lsd_decode_steps_slider,
        temperature_slider,
        noise_clamp_slider,
        eos_threshold_slider,
        frames_after_eos_slider,
        enable_custom_frames_checkbox
    ]

    # Swap the preset/clone containers when the voice mode changes.
    voice_mode_radio.change(
        fn=update_voice_mode_visibility,
        inputs=[voice_mode_radio],
        outputs=[
            preset_voice_container,
            clone_voice_container
        ]
    )
    # Keep the character counter, Generate, and Clear buttons in sync
    # with the prompt text.
    text_input_component.change(
        fn=calculate_character_count_display,
        inputs=[text_input_component],
        outputs=[character_count_display]
    )
    text_input_component.change(
        fn=check_generate_button_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[generate_button]
    )
    text_input_component.change(
        fn=determine_clear_button_visibility,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[clear_button]
    )
    # Generate flow: lock the UI, run generation, then restore the UI.
    generate_button.click(
        fn=switch_to_generating_state,
        inputs=[ui_state],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    ).then(
        fn=perform_speech_generation,
        inputs=generation_inputs,
        outputs=[audio_output_component]
    ).then(
        fn=switch_to_idle_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    )
    stop_button.click(
        fn=request_generation_stop,
        outputs=[stop_button]
    )
    clear_button.click(
        fn=perform_clear_action,
        outputs=[
            text_input_component,
            audio_output_component,
            clear_button,
            voice_mode_radio,
            voice_preset_dropdown,
            voice_clone_audio_input
        ]
    )
    # Each example button fills the prompt/voice (via a handler closed over
    # this example's text and voice) and then runs the same generate flow.
    for button_index, example_button in enumerate(example_buttons_list):
        example_text = EXAMPLE_PROMPTS[button_index]["text"]
        example_voice = EXAMPLE_PROMPTS[button_index]["voice"]
        example_button.click(
            fn=switch_to_generating_state,
            inputs=[ui_state],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        ).then(
            fn=create_example_handler(example_text, example_voice),
            outputs=[
                text_input_component,
                voice_mode_radio,
                voice_preset_dropdown
            ]
        ).then(
            fn=perform_speech_generation,
            inputs=generation_inputs,
            outputs=[audio_output_component]
        ).then(
            fn=switch_to_idle_state,
            inputs=[
                text_input_component,
                ui_state
            ],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        )

app.launch(
    server_name="0.0.0.0",
    max_file_size="1mb"
)