|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Application bootstrap -------------------------------------------------
# NOTE: statement order in this file is deliberate. Thread limits are applied
# immediately after importing torch, and Hugging Face authentication runs
# before any of the model-loading modules (src.generation.*) are imported.

import atexit
import math

import torch
import gradio as gr

from config import (
    AVAILABLE_VOICES,
    DEFAULT_VOICE,
    DEFAULT_MODEL_VARIANT,
    DEFAULT_TEMPERATURE,
    DEFAULT_LSD_DECODE_STEPS,
    DEFAULT_EOS_THRESHOLD,
    DEFAULT_NOISE_CLAMP,
    DEFAULT_FRAMES_AFTER_EOS,
    MAXIMUM_INPUT_LENGTH,
    VOICE_MODE_PRESET,
    VOICE_MODE_CLONE,
    EXAMPLE_PROMPTS,
    ACCELERATOR_ENABLED,
    PYTORCH_COMPUTATION_THREADS,
    PYTORCH_INTEROP_THREADS
)

# Cap PyTorch's intra-op and inter-op thread pools up front.
# set_num_interop_threads must be called before any inter-op parallel work
# starts, hence its placement directly after the torch import.
torch.set_num_threads(PYTORCH_COMPUTATION_THREADS)
torch.set_num_interop_threads(PYTORCH_INTEROP_THREADS)

# Authenticate with the Hugging Face Hub before importing modules that may
# download gated model weights.
from src.core.authentication import authenticate_huggingface

authenticate_huggingface()
|
|
# Optionally start the native accelerator daemon. If it cannot be started,
# generation falls back to the pure-Python path; either way the import and
# startup happen before the generation handlers are imported below.
if ACCELERATOR_ENABLED:
    from src.accelerator.client import start_accelerator_daemon, stop_accelerator_daemon

    accelerator_started = start_accelerator_daemon()

    if accelerator_started:
        print("Accelerator daemon started successfully", flush=True)
    else:
        print("Accelerator daemon not available, using Python fallback", flush=True)

    # Guarantee the daemon is shut down when this process exits.
    atexit.register(stop_accelerator_daemon)

# Daemon thread that periodically reclaims memory in the background for the
# lifetime of the process.
from src.core.memory import start_background_cleanup_thread

start_background_cleanup_thread()
|
|
# Generation entry points (imported after accelerator/auth setup above).
from src.generation.handler import (
    perform_speech_generation,
    request_generation_stop
)

# Pure UI-state helper functions (button enablement, counters, visibility).
from src.ui.state import (
    check_generate_button_state,
    calculate_character_count_display,
    determine_clear_button_visibility,
    update_voice_mode_visibility
)

# UI event handlers for generating/idle transitions, clearing, and examples.
from src.ui.handlers import (
    switch_to_generating_state,
    switch_to_idle_state,
    perform_clear_action,
    create_example_handler,
    format_example_button_label
)

# Static presentation assets (CSS plus HTML fragment factories).
from assets.css.styles import CSS
from assets.static.title import TITLE
from assets.static.header import HEADER
from assets.static.footer import FOOTER
from assets.static.sidebar import SIDEBAR
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI definition. Component construction order determines on-screen
# layout; the wiring section afterwards binds events to the handlers imported
# above. `app` is launched at the bottom of the file.
# ---------------------------------------------------------------------------
with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:

    # Per-session state; "generating" is flipped by switch_to_generating_state
    # / switch_to_idle_state to gate button interactivity during synthesis.
    ui_state = gr.State({"generating": False})

    with gr.Sidebar():
        gr.HTML(SIDEBAR())

    with gr.Column(elem_classes="header-section"):
        gr.HTML(TITLE())
        gr.HTML(HEADER())

    with gr.Row():
        # Left column: audio output plus voice / model / advanced settings.
        with gr.Column():
            audio_output_component = gr.Audio(
                label="Generated Speech Output",
                type="filepath",
                interactive=False
            )

            with gr.Accordion("Voice Selection", open=True):
                voice_mode_radio = gr.Radio(
                    label="Voice Mode",
                    choices=[
                        VOICE_MODE_PRESET,
                        VOICE_MODE_CLONE
                    ],
                    value=VOICE_MODE_PRESET,
                    info="Choose between preset voices or clone a voice from uploaded audio",
                    elem_id="voice-mode"
                )

                # Exactly one of the two containers below is visible at a
                # time; update_voice_mode_visibility toggles them.
                with gr.Column(visible=True) as preset_voice_container:
                    voice_preset_dropdown = gr.Dropdown(
                        label="Select Preset Voice",
                        choices=AVAILABLE_VOICES,
                        value=DEFAULT_VOICE
                    )

                with gr.Column(visible=False) as clone_voice_container:
                    voice_clone_audio_input = gr.Audio(
                        label="Upload Audio for Voice Cloning",
                        type="filepath"
                    )

            with gr.Accordion("Model Parameters", open=False):
                with gr.Row():
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        minimum=0.1,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_TEMPERATURE,
                        info="Higher values produce more expressive speech"
                    )

                    lsd_decode_steps_slider = gr.Slider(
                        label="LSD Decode Steps",
                        minimum=1,
                        maximum=20,
                        step=1,
                        value=DEFAULT_LSD_DECODE_STEPS,
                        info="More steps may improve quality but slower"
                    )

                with gr.Row():
                    noise_clamp_slider = gr.Slider(
                        label="Noise Clamp",
                        minimum=0.0,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_NOISE_CLAMP,
                        info="Maximum noise sampling value (0 = disabled)"
                    )

                    eos_threshold_slider = gr.Slider(
                        label="End of Sequence Threshold",
                        minimum=-10.0,
                        maximum=0.0,
                        step=0.25,
                        value=DEFAULT_EOS_THRESHOLD,
                        info="Smaller values cause earlier completion"
                    )

            with gr.Accordion("Advanced Settings", open=False):
                model_variant_textbox = gr.Textbox(
                    label="Model Variant Identifier",
                    value=DEFAULT_MODEL_VARIANT,
                    info="Model signature for generation"
                )

                with gr.Row():
                    enable_custom_frames_checkbox = gr.Checkbox(
                        label="Enable Custom Frames After EOS",
                        value=False,
                        info="Manually control post-EOS frame generation"
                    )

                    frames_after_eos_slider = gr.Slider(
                        label="Frames After EOS",
                        minimum=0,
                        maximum=100,
                        step=1,
                        value=DEFAULT_FRAMES_AFTER_EOS,
                        info="Additional frames after end-of-sequence (80ms per frame)"
                    )

        # Right column: prompt entry, action buttons, and example prompts.
        with gr.Column(scale=1):
            text_input_component = gr.Textbox(
                label="Prompt",
                placeholder="Enter the text you want to convert to speech...",
                lines=2,
                max_lines=20,
                max_length=MAXIMUM_INPUT_LENGTH,
                autoscroll=True
            )

            # Hidden until text is entered; content is regenerated by
            # calculate_character_count_display on every change.
            character_count_display = gr.HTML(
                f"""
                <div class="character-count">
                <span>0 / {MAXIMUM_INPUT_LENGTH}</span>
                </div>
                """,
                visible=False
            )

            # Generate starts disabled; it is enabled once the prompt is
            # non-empty (see check_generate_button_state wiring below).
            generate_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                interactive=False
            )

            # Stop/Clear are hidden by default and shown by the state
            # handlers while generating / when there is content to clear.
            stop_button = gr.Button(
                "Stop",
                variant="stop",
                size="lg",
                visible=False
            )

            clear_button = gr.Button(
                "Clear",
                variant="secondary",
                size="lg",
                visible=False
            )

            gr.HTML(
                """
                <div class="example-prompts">
                <h3>Example Prompts</h3>
                <p>Click any example to generate speech with its assigned voice</p>
                </div>
                """
            )

            # Lay the example-prompt buttons out two per row.
            example_buttons_list = []
            num_examples = len(EXAMPLE_PROMPTS)
            examples_per_row = 2
            num_rows = math.ceil(num_examples / examples_per_row)

            for row_idx in range(num_rows):
                with gr.Row():
                    start_idx = row_idx * examples_per_row
                    end_idx = min(start_idx + examples_per_row, num_examples)
                    for i in range(start_idx, end_idx):
                        btn = gr.Button(
                            format_example_button_label(
                                EXAMPLE_PROMPTS[i]["text"],
                                EXAMPLE_PROMPTS[i]["voice"]
                            ),
                            size="sm",
                            variant="secondary"
                        )
                        example_buttons_list.append(btn)

    gr.HTML(FOOTER())

    # Ordered argument list consumed by perform_speech_generation; the order
    # here must match that handler's parameter order.
    generation_inputs = [
        text_input_component,
        voice_mode_radio,
        voice_preset_dropdown,
        voice_clone_audio_input,
        model_variant_textbox,
        lsd_decode_steps_slider,
        temperature_slider,
        noise_clamp_slider,
        eos_threshold_slider,
        frames_after_eos_slider,
        enable_custom_frames_checkbox
    ]

    # Toggle between the preset-voice and clone-voice containers.
    voice_mode_radio.change(
        fn=update_voice_mode_visibility,
        inputs=[voice_mode_radio],
        outputs=[
            preset_voice_container,
            clone_voice_container
        ]
    )

    # Live character counter under the prompt box.
    text_input_component.change(
        fn=calculate_character_count_display,
        inputs=[text_input_component],
        outputs=[character_count_display]
    )

    # Enable/disable Generate based on prompt content and generating state.
    text_input_component.change(
        fn=check_generate_button_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[generate_button]
    )

    # Show/hide Clear based on prompt content and generating state.
    text_input_component.change(
        fn=determine_clear_button_visibility,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[clear_button]
    )

    # Generate flow: lock the UI, synthesize, then unlock.
    # NOTE(review): .then() steps run regardless of whether the previous step
    # raised, so the idle state is restored even after a failed generation —
    # this looks intentional; confirm against the Gradio version in use.
    generate_button.click(
        fn=switch_to_generating_state,
        inputs=[ui_state],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    ).then(
        fn=perform_speech_generation,
        inputs=generation_inputs,
        outputs=[audio_output_component]
    ).then(
        fn=switch_to_idle_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    )

    # Signal the in-flight generation to stop early.
    stop_button.click(
        fn=request_generation_stop,
        outputs=[stop_button]
    )

    # Reset the prompt, audio output, and voice-selection controls.
    clear_button.click(
        fn=perform_clear_action,
        outputs=[
            text_input_component,
            audio_output_component,
            clear_button,
            voice_mode_radio,
            voice_preset_dropdown,
            voice_clone_audio_input
        ]
    )

    # Each example button: lock the UI, fill in its prompt and voice (the
    # handler is created per button via closure over text/voice), generate,
    # then restore the idle state.
    for button_index, example_button in enumerate(example_buttons_list):
        example_text = EXAMPLE_PROMPTS[button_index]["text"]
        example_voice = EXAMPLE_PROMPTS[button_index]["voice"]

        example_button.click(
            fn=switch_to_generating_state,
            inputs=[ui_state],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        ).then(
            fn=create_example_handler(example_text, example_voice),
            outputs=[
                text_input_component,
                voice_mode_radio,
                voice_preset_dropdown
            ]
        ).then(
            fn=perform_speech_generation,
            inputs=generation_inputs,
            outputs=[audio_output_component]
        ).then(
            fn=switch_to_idle_state,
            inputs=[
                text_input_component,
                ui_state
            ],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        )

# Bind on all interfaces (container-friendly); the 1 MB cap limits uploads
# such as the voice-cloning audio input.
app.launch(
    server_name="0.0.0.0",
    max_file_size="1mb"
)