#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
| import math | |
| import torch | |
| import gradio as gr | |
| torch.set_num_threads(1) | |
| torch.set_num_interop_threads(1) | |
| from config import ( | |
| AVAILABLE_VOICES, | |
| DEFAULT_VOICE, | |
| DEFAULT_MODEL_VARIANT, | |
| DEFAULT_TEMPERATURE, | |
| DEFAULT_LSD_DECODE_STEPS, | |
| DEFAULT_EOS_THRESHOLD, | |
| DEFAULT_NOISE_CLAMP, | |
| DEFAULT_FRAMES_AFTER_EOS, | |
| MAXIMUM_INPUT_LENGTH, | |
| VOICE_MODE_PRESET, | |
| VOICE_MODE_CLONE, | |
| EXAMPLE_PROMPTS | |
| ) | |
| from src.core.authentication import authenticate_huggingface | |
| authenticate_huggingface() | |
| from src.core.memory import start_background_cleanup_thread | |
| start_background_cleanup_thread() | |
| from src.generation.handler import ( | |
| perform_speech_generation, | |
| request_generation_stop | |
| ) | |
| from src.ui.state import ( | |
| check_generate_button_state, | |
| calculate_character_count_display, | |
| determine_clear_button_visibility, | |
| update_voice_mode_visibility | |
| ) | |
| from src.ui.handlers import ( | |
| switch_to_generating_state, | |
| switch_to_idle_state, | |
| perform_clear_action, | |
| create_example_handler, | |
| format_example_button_label | |
| ) | |
| from assets.css.styles import CSS | |
| from assets.static.title import TITLE | |
| from assets.static.header import HEADER | |
| from assets.static.footer import FOOTER | |
| from assets.static.sidebar import SIDEBAR | |
| with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app: | |
| ui_state = gr.State({"generating": False}) | |
| with gr.Sidebar(): | |
| gr.HTML(SIDEBAR()) | |
| with gr.Column(elem_classes="header-section"): | |
| gr.HTML(TITLE()) | |
| gr.HTML(HEADER()) | |
| with gr.Row(): | |
| with gr.Column(): | |
| audio_output_component = gr.Audio( | |
| label="Generated Speech Output", | |
| type="filepath", | |
| interactive=False, | |
| autoplay=False | |
| ) | |
| with gr.Accordion("Voice Selection", open=True): | |
| voice_mode_radio = gr.Radio( | |
| label="Voice Mode", | |
| choices=[ | |
| VOICE_MODE_PRESET, | |
| VOICE_MODE_CLONE | |
| ], | |
| value=VOICE_MODE_PRESET, | |
| info="Choose between preset voices or clone a voice from uploaded audio", | |
| elem_id="voice-mode" | |
| ) | |
| with gr.Column(visible=True) as preset_voice_container: | |
| voice_preset_dropdown = gr.Dropdown( | |
| label="Select Preset Voice", | |
| choices=AVAILABLE_VOICES, | |
| value=DEFAULT_VOICE | |
| ) | |
| with gr.Column(visible=False) as clone_voice_container: | |
| voice_clone_audio_input = gr.Audio( | |
| label="Upload Audio for Voice Cloning", | |
| type="filepath" | |
| ) | |
| with gr.Accordion("Model Parameters", open=False): | |
| with gr.Row(): | |
| temperature_slider = gr.Slider( | |
| label="Temperature", | |
| minimum=0.1, | |
| maximum=2.0, | |
| step=0.05, | |
| value=DEFAULT_TEMPERATURE, | |
| info="Higher values produce more expressive speech" | |
| ) | |
| lsd_decode_steps_slider = gr.Slider( | |
| label="LSD Decode Steps", | |
| minimum=1, | |
| maximum=20, | |
| step=1, | |
| value=DEFAULT_LSD_DECODE_STEPS, | |
| info="More steps may improve quality but slower" | |
| ) | |
| with gr.Row(): | |
| noise_clamp_slider = gr.Slider( | |
| label="Noise Clamp", | |
| minimum=0.0, | |
| maximum=2.0, | |
| step=0.05, | |
| value=DEFAULT_NOISE_CLAMP, | |
| info="Maximum noise sampling value (0 = disabled)" | |
| ) | |
| eos_threshold_slider = gr.Slider( | |
| label="End of Sequence Threshold", | |
| minimum=-10.0, | |
| maximum=0.0, | |
| step=0.25, | |
| value=DEFAULT_EOS_THRESHOLD, | |
| info="Smaller values cause earlier completion" | |
| ) | |
| with gr.Accordion("Advanced Settings", open=False): | |
| model_variant_textbox = gr.Textbox( | |
| label="Model Variant Identifier", | |
| value=DEFAULT_MODEL_VARIANT, | |
| info="Model signature for generation" | |
| ) | |
| with gr.Row(): | |
| enable_custom_frames_checkbox = gr.Checkbox( | |
| label="Enable Custom Frames After EOS", | |
| value=False, | |
| info="Manually control post-EOS frame generation" | |
| ) | |
| frames_after_eos_slider = gr.Slider( | |
| label="Frames After EOS", | |
| minimum=0, | |
| maximum=100, | |
| step=1, | |
| value=DEFAULT_FRAMES_AFTER_EOS, | |
| info="Additional frames after end-of-sequence (80ms per frame)" | |
| ) | |
| with gr.Column(scale=1): | |
| text_input_component = gr.Textbox( | |
| label="Prompt", | |
| placeholder="Enter the text you want to convert to speech...", | |
| lines=2, | |
| max_lines=20, | |
| max_length=MAXIMUM_INPUT_LENGTH, | |
| autoscroll=True | |
| ) | |
| character_count_display = gr.HTML( | |
| f"<div style='text-align: right; padding: 4px 0;'><span style='color: var(--body-text-color-subdued); font-size: 0.85em;'>0 / {MAXIMUM_INPUT_LENGTH}</span></div>", | |
| visible=False | |
| ) | |
| generate_button = gr.Button( | |
| "Generate", | |
| variant="primary", | |
| size="lg", | |
| interactive=False | |
| ) | |
| stop_button = gr.Button( | |
| "Stop", | |
| variant="stop", | |
| size="lg", | |
| visible=False | |
| ) | |
| clear_button = gr.Button( | |
| "Clear", | |
| variant="secondary", | |
| size="lg", | |
| visible=False | |
| ) | |
| gr.HTML( | |
| """ | |
| <div style="padding: 16px 0 8px 0;"> | |
| <h3 style="margin: 0 0 8px 0; font-size: 1.1em;">Example Prompts</h3> | |
| <p style="margin: 0; opacity: 0.7; font-size: 0.9em;">Click any example to generate speech with its assigned voice</p> | |
| </div> | |
| """ | |
| ) | |
| example_buttons_list = [] | |
| num_examples = len(EXAMPLE_PROMPTS) | |
| examples_per_row = 2 | |
| num_rows = math.ceil(num_examples / examples_per_row) | |
| for row_idx in range(num_rows): | |
| with gr.Row(): | |
| start_idx = row_idx * examples_per_row | |
| end_idx = min(start_idx + examples_per_row, num_examples) | |
| for i in range(start_idx, end_idx): | |
| btn = gr.Button( | |
| format_example_button_label( | |
| EXAMPLE_PROMPTS[i]["text"], | |
| EXAMPLE_PROMPTS[i]["voice"] | |
| ), | |
| size="sm", | |
| variant="secondary" | |
| ) | |
| example_buttons_list.append(btn) | |
| gr.HTML(FOOTER()) | |
| generation_inputs = [ | |
| text_input_component, | |
| voice_mode_radio, | |
| voice_preset_dropdown, | |
| voice_clone_audio_input, | |
| model_variant_textbox, | |
| lsd_decode_steps_slider, | |
| temperature_slider, | |
| noise_clamp_slider, | |
| eos_threshold_slider, | |
| frames_after_eos_slider, | |
| enable_custom_frames_checkbox | |
| ] | |
| voice_mode_radio.change( | |
| fn=update_voice_mode_visibility, | |
| inputs=[voice_mode_radio], | |
| outputs=[ | |
| preset_voice_container, | |
| clone_voice_container | |
| ] | |
| ) | |
| text_input_component.change( | |
| fn=calculate_character_count_display, | |
| inputs=[text_input_component], | |
| outputs=[character_count_display] | |
| ) | |
| text_input_component.change( | |
| fn=check_generate_button_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[generate_button] | |
| ) | |
| text_input_component.change( | |
| fn=determine_clear_button_visibility, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[clear_button] | |
| ) | |
| generate_button.click( | |
| fn=switch_to_generating_state, | |
| inputs=[ui_state], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ).then( | |
| fn=perform_speech_generation, | |
| inputs=generation_inputs, | |
| outputs=[audio_output_component] | |
| ).then( | |
| fn=switch_to_idle_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ) | |
| stop_button.click( | |
| fn=request_generation_stop, | |
| outputs=[stop_button] | |
| ) | |
| clear_button.click( | |
| fn=perform_clear_action, | |
| outputs=[ | |
| text_input_component, | |
| audio_output_component, | |
| clear_button, | |
| voice_mode_radio, | |
| voice_preset_dropdown, | |
| voice_clone_audio_input | |
| ] | |
| ) | |
| for button_index, example_button in enumerate(example_buttons_list): | |
| example_text = EXAMPLE_PROMPTS[button_index]["text"] | |
| example_voice = EXAMPLE_PROMPTS[button_index]["voice"] | |
| example_button.click( | |
| fn=switch_to_generating_state, | |
| inputs=[ui_state], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ).then( | |
| fn=create_example_handler(example_text, example_voice), | |
| outputs=[ | |
| text_input_component, | |
| voice_mode_radio, | |
| voice_preset_dropdown | |
| ] | |
| ).then( | |
| fn=perform_speech_generation, | |
| inputs=generation_inputs, | |
| outputs=[audio_output_component] | |
| ).then( | |
| fn=switch_to_idle_state, | |
| inputs=[ | |
| text_input_component, | |
| ui_state | |
| ], | |
| outputs=[ | |
| generate_button, | |
| stop_button, | |
| clear_button, | |
| ui_state | |
| ] | |
| ) | |
| app.launch(server_name="0.0.0.0") |