# tts/app.py
# Pocket TTS: Implement safe and efficient processing mechanisms. (commit 02b5975)
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
# Module setup. NOTE: ordering below is deliberate — thread limits and
# authentication run between import groups, before any model work begins.
import atexit
import math
import torch
import gradio as gr
from config import (
    AVAILABLE_VOICES,
    DEFAULT_VOICE,
    DEFAULT_MODEL_VARIANT,
    DEFAULT_TEMPERATURE,
    DEFAULT_LSD_DECODE_STEPS,
    DEFAULT_EOS_THRESHOLD,
    DEFAULT_NOISE_CLAMP,
    DEFAULT_FRAMES_AFTER_EOS,
    MAXIMUM_INPUT_LENGTH,
    VOICE_MODE_PRESET,
    VOICE_MODE_CLONE,
    EXAMPLE_PROMPTS,
    ACCELERATOR_ENABLED,
    PYTORCH_COMPUTATION_THREADS,
    PYTORCH_INTEROP_THREADS
)
# Cap PyTorch's intra-op and inter-op thread pools from config. Interop thread
# count can only be set before parallel work starts, hence this runs
# immediately after the config import and before the project modules load.
torch.set_num_threads(PYTORCH_COMPUTATION_THREADS)
torch.set_num_interop_threads(PYTORCH_INTEROP_THREADS)
# Authenticate with the Hugging Face Hub up front (presumably so later model
# downloads succeed — side effects live in src.core.authentication).
from src.core.authentication import authenticate_huggingface
authenticate_huggingface()
if ACCELERATOR_ENABLED:
    # Optional accelerator daemon; when it cannot be started the app keeps
    # working via a Python fallback. atexit registration happens regardless of
    # start success, so shutdown is always attempted on interpreter exit.
    from src.accelerator.client import start_accelerator_daemon, stop_accelerator_daemon
    accelerator_started = start_accelerator_daemon()
    if accelerator_started:
        print("Accelerator daemon started successfully", flush=True)
    else:
        print("Accelerator daemon not available, using Python fallback", flush=True)
    atexit.register(stop_accelerator_daemon)
# Background memory cleanup thread runs for the whole process lifetime.
from src.core.memory import start_background_cleanup_thread
start_background_cleanup_thread()
# Generation entry points and UI callback helpers used by the Blocks app below.
from src.generation.handler import (
    perform_speech_generation,
    request_generation_stop
)
from src.ui.state import (
    check_generate_button_state,
    calculate_character_count_display,
    determine_clear_button_visibility,
    update_voice_mode_visibility
)
from src.ui.handlers import (
    switch_to_generating_state,
    switch_to_idle_state,
    perform_clear_action,
    create_example_handler,
    format_example_button_label
)
# Static page assets: stylesheet and prebuilt HTML fragments.
from assets.css.styles import CSS
from assets.static.title import TITLE
from assets.static.header import HEADER
from assets.static.footer import FOOTER
from assets.static.sidebar import SIDEBAR
# Assemble the Gradio UI. The nesting of the layout context managers defines
# the rendered page; event wiring follows the layout so every component it
# references already exists.
with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:
    # Per-session flag read/written by the state helpers to coordinate the
    # generate/stop/clear buttons while synthesis is in flight.
    ui_state = gr.State({"generating": False})
    with gr.Sidebar():
        gr.HTML(SIDEBAR())
    with gr.Column(elem_classes="header-section"):
        gr.HTML(TITLE())
        gr.HTML(HEADER())
    with gr.Row():
        # Left column: audio output plus all voice and model controls.
        with gr.Column():
            audio_output_component = gr.Audio(
                label="Generated Speech Output",
                type="filepath",
                interactive=False
            )
            with gr.Accordion("Voice Selection", open=True):
                voice_mode_radio = gr.Radio(
                    label="Voice Mode",
                    choices=[
                        VOICE_MODE_PRESET,
                        VOICE_MODE_CLONE
                    ],
                    value=VOICE_MODE_PRESET,
                    info="Choose between preset voices or clone a voice from uploaded audio",
                    elem_id="voice-mode"
                )
                # Exactly one of these two containers is visible at a time;
                # update_voice_mode_visibility (wired below) toggles them.
                with gr.Column(visible=True) as preset_voice_container:
                    voice_preset_dropdown = gr.Dropdown(
                        label="Select Preset Voice",
                        choices=AVAILABLE_VOICES,
                        value=DEFAULT_VOICE
                    )
                with gr.Column(visible=False) as clone_voice_container:
                    voice_clone_audio_input = gr.Audio(
                        label="Upload Audio for Voice Cloning",
                        type="filepath"
                    )
            with gr.Accordion("Model Parameters", open=False):
                with gr.Row():
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        minimum=0.1,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_TEMPERATURE,
                        info="Higher values produce more expressive speech"
                    )
                    lsd_decode_steps_slider = gr.Slider(
                        label="LSD Decode Steps",
                        minimum=1,
                        maximum=20,
                        step=1,
                        value=DEFAULT_LSD_DECODE_STEPS,
                        info="More steps may improve quality but slower"
                    )
                with gr.Row():
                    noise_clamp_slider = gr.Slider(
                        label="Noise Clamp",
                        minimum=0.0,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_NOISE_CLAMP,
                        info="Maximum noise sampling value (0 = disabled)"
                    )
                    eos_threshold_slider = gr.Slider(
                        label="End of Sequence Threshold",
                        minimum=-10.0,
                        maximum=0.0,
                        step=0.25,
                        value=DEFAULT_EOS_THRESHOLD,
                        info="Smaller values cause earlier completion"
                    )
            with gr.Accordion("Advanced Settings", open=False):
                model_variant_textbox = gr.Textbox(
                    label="Model Variant Identifier",
                    value=DEFAULT_MODEL_VARIANT,
                    info="Model signature for generation"
                )
                with gr.Row():
                    enable_custom_frames_checkbox = gr.Checkbox(
                        label="Enable Custom Frames After EOS",
                        value=False,
                        info="Manually control post-EOS frame generation"
                    )
                    frames_after_eos_slider = gr.Slider(
                        label="Frames After EOS",
                        minimum=0,
                        maximum=100,
                        step=1,
                        value=DEFAULT_FRAMES_AFTER_EOS,
                        info="Additional frames after end-of-sequence (80ms per frame)"
                    )
        # Right column: prompt entry, action buttons, and example prompts.
        with gr.Column(scale=1):
            text_input_component = gr.Textbox(
                label="Prompt",
                placeholder="Enter the text you want to convert to speech...",
                lines=2,
                max_lines=20,
                max_length=MAXIMUM_INPUT_LENGTH,
                autoscroll=True
            )
            # Live character counter; hidden initially and refreshed by
            # calculate_character_count_display on every text change.
            character_count_display = gr.HTML(
                f"""
                <div class="character-count">
                    <span>0 / {MAXIMUM_INPUT_LENGTH}</span>
                </div>
                """,
                visible=False
            )
            # Generate starts disabled until text is entered; Stop and Clear
            # start hidden and are revealed by the state handlers.
            generate_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                interactive=False
            )
            stop_button = gr.Button(
                "Stop",
                variant="stop",
                size="lg",
                visible=False
            )
            clear_button = gr.Button(
                "Clear",
                variant="secondary",
                size="lg",
                visible=False
            )
            gr.HTML(
                """
                <div class="example-prompts">
                    <h3>Example Prompts</h3>
                    <p>Click any example to generate speech with its assigned voice</p>
                </div>
                """
            )
            # Lay the example buttons out examples_per_row per row; buttons are
            # collected in order so index i maps back to EXAMPLE_PROMPTS[i].
            example_buttons_list = []
            num_examples = len(EXAMPLE_PROMPTS)
            examples_per_row = 2
            num_rows = math.ceil(num_examples / examples_per_row)
            for row_idx in range(num_rows):
                with gr.Row():
                    start_idx = row_idx * examples_per_row
                    end_idx = min(start_idx + examples_per_row, num_examples)
                    for i in range(start_idx, end_idx):
                        btn = gr.Button(
                            format_example_button_label(
                                EXAMPLE_PROMPTS[i]["text"],
                                EXAMPLE_PROMPTS[i]["voice"]
                            ),
                            size="sm",
                            variant="secondary"
                        )
                        example_buttons_list.append(btn)
    gr.HTML(FOOTER())
    # Positional argument list passed to perform_speech_generation; the order
    # here presumably mirrors the handler's signature — keep them in sync.
    generation_inputs = [
        text_input_component,
        voice_mode_radio,
        voice_preset_dropdown,
        voice_clone_audio_input,
        model_variant_textbox,
        lsd_decode_steps_slider,
        temperature_slider,
        noise_clamp_slider,
        eos_threshold_slider,
        frames_after_eos_slider,
        enable_custom_frames_checkbox
    ]
    # Swap the preset/clone containers when the voice mode changes.
    voice_mode_radio.change(
        fn=update_voice_mode_visibility,
        inputs=[voice_mode_radio],
        outputs=[
            preset_voice_container,
            clone_voice_container
        ]
    )
    # Three independent reactions to prompt edits: character counter,
    # Generate-button enablement, and Clear-button visibility.
    text_input_component.change(
        fn=calculate_character_count_display,
        inputs=[text_input_component],
        outputs=[character_count_display]
    )
    text_input_component.change(
        fn=check_generate_button_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[generate_button]
    )
    text_input_component.change(
        fn=determine_clear_button_visibility,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[clear_button]
    )
    # Generate flow: flip the UI into its busy state, run synthesis, then
    # restore the idle state (.then chains run sequentially after the click).
    generate_button.click(
        fn=switch_to_generating_state,
        inputs=[ui_state],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    ).then(
        fn=perform_speech_generation,
        inputs=generation_inputs,
        outputs=[audio_output_component]
    ).then(
        fn=switch_to_idle_state,
        inputs=[
            text_input_component,
            ui_state
        ],
        outputs=[
            generate_button,
            stop_button,
            clear_button,
            ui_state
        ]
    )
    # Stop asks the generation handler to abort; it takes no inputs.
    stop_button.click(
        fn=request_generation_stop,
        outputs=[stop_button]
    )
    # Clear resets the prompt, audio output, and voice-selection widgets.
    clear_button.click(
        fn=perform_clear_action,
        outputs=[
            text_input_component,
            audio_output_component,
            clear_button,
            voice_mode_radio,
            voice_preset_dropdown,
            voice_clone_audio_input
        ]
    )
    # Each example button: busy state -> fill prompt/voice fields ->
    # synthesize -> idle state. create_example_handler receives the example's
    # text and voice at wiring time, so there is no late-binding closure issue.
    for button_index, example_button in enumerate(example_buttons_list):
        example_text = EXAMPLE_PROMPTS[button_index]["text"]
        example_voice = EXAMPLE_PROMPTS[button_index]["voice"]
        example_button.click(
            fn=switch_to_generating_state,
            inputs=[ui_state],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        ).then(
            fn=create_example_handler(example_text, example_voice),
            outputs=[
                text_input_component,
                voice_mode_radio,
                voice_preset_dropdown
            ]
        ).then(
            fn=perform_speech_generation,
            inputs=generation_inputs,
            outputs=[audio_output_component]
        ).then(
            fn=switch_to_idle_state,
            inputs=[
                text_input_component,
                ui_state
            ],
            outputs=[
                generate_button,
                stop_button,
                clear_button,
                ui_state
            ]
        )
# Serve on all interfaces (required in containerized deployments) and cap
# uploads at 1 MB to bound the size of voice-cloning audio files.
app.launch(
    max_file_size="1mb",
    server_name="0.0.0.0"
)