tts

Running

App Files Files Community

tts / app.py

Xlnk

Update app.py

babc541 verified 9 days ago

raw

history blame contribute delete

21.2 kB

	import gradio as gr
	import numpy as np
	import os
	import time
	import torch
	import tempfile
	import threading
	import scipy.io.wavfile
	import traceback
	from huggingface_hub import login
	from pocket_tts import TTSModel

	# Configure PyTorch threading behavior for CPU optimization
	torch.set_num_threads(1)
	torch.set_num_interop_threads(1)

	# HF Token for gated models in Spaces
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	print("HF_TOKEN found, logging in...")
	login(token=hf_token)

	VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']

	# Default configuration values
	DEFAULT_VOICE = "alba"
	DEFAULT_MODEL_VARIANT = "b6369a24"
	DEFAULT_TEMPERATURE = 0.1
	DEFAULT_LSD_DECODE_STEPS = 1
	DEFAULT_EOS_THRESHOLD = -4.0
	DEFAULT_NOISE_CLAMP = 0.0
	DEFAULT_FRAMES_AFTER_EOS = 10
	MAXIMUM_INPUT_LENGTH = 111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
	TEMPORARY_FILE_LIFETIME_SECONDS = 7200 # 2 hours

	generation_state_lock = threading.Lock()
	is_currently_generating = False
	stop_generation_requested = False

	temporary_files_registry = {}
	temporary_files_lock = threading.Lock()


	class TextToSpeechManager:
	"""
	Manages TTS model lifecycle and speech generation operations.
	Implements lazy loading and caching strategies for performance.
	"""

	def __init__(self):
	self.loaded_model = None
	self.current_configuration = {}
	self.voice_state_cache = {}

	def load_or_get_model(
	self,
	model_variant,
	temperature,
	lsd_decode_steps,
	noise_clamp,
	eos_threshold
	):
	"""Load a TTS model or return cached instance if configuration matches."""
	processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
	processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
	processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
	processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None
	processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD

	requested_configuration = {
	"variant": processed_variant,
	"temp": processed_temperature,
	"lsd_decode_steps": processed_lsd_steps,
	"noise_clamp": processed_noise_clamp,
	"eos_threshold": processed_eos_threshold
	}

	if self.loaded_model is None or self.current_configuration != requested_configuration:
	print(f"Loading model with config: {requested_configuration}")
	self.loaded_model = TTSModel.load_model(**requested_configuration)
	self.current_configuration = requested_configuration
	self.voice_state_cache = {}
	print("Model loaded.")

	return self.loaded_model

	def get_voice_state_for_preset(self, voice_name):
	"""Get or compute voice state for a preset voice with caching."""
	validated_voice = voice_name if voice_name in VOICES else DEFAULT_VOICE

	if validated_voice not in self.voice_state_cache:
	self.voice_state_cache[validated_voice] = self.loaded_model.get_state_for_audio_prompt(
	audio_conditioning=validated_voice,
	truncate=False
	)

	return self.voice_state_cache[validated_voice]

	def get_voice_state_for_clone(self, audio_file_path):
	"""Compute voice state from uploaded audio file for voice cloning."""
	return self.loaded_model.get_state_for_audio_prompt(
	audio_conditioning=audio_file_path,
	truncate=False
	)

	def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
	"""Generate speech audio from text using the specified voice state."""
	processed_frames = int(frames_after_eos) if enable_custom_frames else None

	return self.loaded_model.generate_audio(
	model_state=voice_state,
	text_to_generate=text_content,
	frames_after_eos=processed_frames,
	copy_state=True
	)

	def save_audio_to_file(self, audio_tensor):
	"""Save generated audio tensor to a temporary WAV file."""
	audio_numpy_data = audio_tensor.numpy()
	audio_sample_rate = self.loaded_model.sample_rate

	output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
	scipy.io.wavfile.write(output_file.name, audio_sample_rate, audio_numpy_data)

	with temporary_files_lock:
	temporary_files_registry[output_file.name] = time.time()

	return output_file.name


	# Create global TTS manager instance
	tts_manager = TextToSpeechManager()

	# Load model at startup with default parameters
	print("Loading PocketTTS model with default parameters...")
	tts_manager.load_or_get_model(
	DEFAULT_MODEL_VARIANT,
	DEFAULT_TEMPERATURE,
	DEFAULT_LSD_DECODE_STEPS,
	DEFAULT_NOISE_CLAMP,
	DEFAULT_EOS_THRESHOLD
	)
	print("Model ready!")

	def cleanup_expired_temporary_files():
	"""Remove temporary files that have exceeded their lifetime."""
	current_timestamp = time.time()
	expired_files = []

	with temporary_files_lock:
	for file_path, creation_timestamp in list(temporary_files_registry.items()):
	if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
	expired_files.append(file_path)

	for file_path in expired_files:
	try:
	if os.path.exists(file_path):
	os.remove(file_path)
	del temporary_files_registry[file_path]
	except Exception:
	pass


	def validate_text_input(text_content):
	"""Validate and clean text input for speech generation."""
	if not text_content or not isinstance(text_content, str):
	return False, ""

	cleaned_text = text_content.strip()

	if not cleaned_text:
	return False, ""

	if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
	return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."

	return True, cleaned_text


	def request_generation_stop():
	"""Signal a request to stop the current generation."""
	global stop_generation_requested
	stop_generation_requested = True
	return gr.update(interactive=False)

	# Speech generation function
	def generate_speech(
	text,
	voice_mode,
	voice_dropdown,
	voice_upload,
	temperature,
	lsd_decode_steps,
	noise_clamp,
	eos_threshold,
	frames_after_eos,
	enable_custom_frames
	):
	"""Perform the complete speech generation workflow with thread safety."""
	global is_currently_generating, stop_generation_requested

	cleanup_expired_temporary_files()

	is_valid, validation_result = validate_text_input(text)
	if not is_valid:
	if validation_result:
	raise gr.Error(validation_result)
	raise gr.Error("Please enter valid text to generate speech.")

	if voice_mode == "Voice Cloning" and not voice_upload:
	raise gr.Error("Please upload an audio file for voice cloning.")

	with generation_state_lock:
	if is_currently_generating:
	raise gr.Error("A generation is already in progress. Please wait.")
	is_currently_generating = True
	stop_generation_requested = False

	try:
	tts_manager.load_or_get_model(
	DEFAULT_MODEL_VARIANT,
	temperature,
	lsd_decode_steps,
	noise_clamp,
	eos_threshold
	)

	if stop_generation_requested:
	return None

	if voice_mode == "Voice Cloning":
	voice_state = tts_manager.get_voice_state_for_clone(voice_upload)
	else:
	voice_state = tts_manager.get_voice_state_for_preset(voice_dropdown)

	if stop_generation_requested:
	return None

	print(f"Generating with voice mode: {voice_mode}, temp: {temperature}, lsd_steps: {lsd_decode_steps}")

	generated_audio = tts_manager.generate_audio(
	validation_result,
	voice_state,
	frames_after_eos,
	enable_custom_frames
	)

	if stop_generation_requested:
	return None

	output_file_path = tts_manager.save_audio_to_file(generated_audio)
	return output_file_path

	except gr.Error:
	raise
	except Exception as e:
	full_error = traceback.format_exc()
	print(f"Unexpected error: {full_error}")
	raise gr.Error(f"An unexpected error occurred: {str(e)}")
	finally:
	with generation_state_lock:
	is_currently_generating = False
	stop_generation_requested = False


	# Load custom theme with fallback
	try:
	theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
	except Exception as e:
	print(f"Warning: Could not load custom theme: {e}. Using default Soft theme.")
	theme = gr.themes.Soft()

	css = """
	footer {visibility: hidden}
	.gradio-container {
	max-width: 100% !important;
	padding: 0 !important;
	}
	@media (min-width: 768px) {
	.gradio-container {
	padding-left: 2% !important;
	padding-right: 2% !important;
	}
	}
	.header-section {
	text-align: left;
	margin-bottom: 1.5rem;
	}
	.main-title {
	color: #10b981;
	font-weight: 800;
	font-size: 1.8rem;
	margin: 5px 0;
	}
	@media (min-width: 768px) {
	.main-title {
	font-size: 2.2rem;
	}
	}
	.logo-container {
	display: flex;
	justify-content: flex-start;
	align-items: center;
	gap: 10px;
	margin-bottom: 10px;
	}
	.logo-img {
	height: 40px;
	border-radius: 8px;
	}
	@media (min-width: 768px) {
	.logo-img {
	height: 50px;
	}
	.logo-container {
	gap: 15px;
	}
	}
	.description {
	max-width: 900px;
	margin: 10px 0;
	font-size: 0.95rem;
	line-height: 1.5;
	color: #4b5563;
	}
	.links-row {
	display: flex;
	flex-wrap: wrap;
	justify-content: flex-start;
	gap: 8px;
	margin: 10px 0;
	font-size: 0.85rem;
	}
	@media (min-width: 768px) {
	.links-row {
	gap: 10px;
	font-size: 0.9rem;
	}
	}
	.links-row a {
	color: #10b981;
	text-decoration: none;
	padding: 3px 10px;
	border: 1px solid #10b981;
	border-radius: 15px;
	transition: all 0.2s;
	white-space: nowrap;
	}
	.links-row a:hover {
	background-color: #10b981;
	color: white;
	}
	.social-handles {
	display: flex;
	justify-content: center;
	gap: 20px;
	margin: 15px 0;
	}
	.social-icon {
	width: 28px;
	height: 28px;
	transition: all 0.3s ease;
	}
	.social-icon:hover {
	transform: scale(1.1) translateY(-3px);
	}
	.disclaimer {
	text-align: center;
	font-size: 0.8rem;
	color: #9ca3af;
	margin-top: 30px;
	padding: 15px;
	border-top: 1px solid #f3f4f6;
	}
	@media (min-width: 768px) {
	.disclaimer {
	margin-top: 40px;
	padding: 20px;
	}
	}
	#voice-mode .wrap {
	display: flex !important;
	flex-direction: row !important;
	width: 100% !important;
	}

	#voice-mode .wrap label {
	flex: 1 !important;
	justify-content: center !important;
	text-align: center !important;
	}
	"""

	with gr.Blocks() as demo:
	with gr.Column(elem_classes="header-section"):
	with gr.Row():
	with gr.Column(scale=4):
	gr.HTML("""
	<div class="logo-container">
	<img src="https://me.xo.je/icon.svg" class="logo-img" alt="Kyutai Logo">

	<h1 class='main-title'>XlnkTTS</h1>
	</div>
	""")
	gr.HTML("""

	""")
	gr.HTML("""

	""")

	with gr.Row():
	with gr.Column(scale=1):
	text_input = gr.Textbox(
	label="Text to Speak",
	placeholder="Enter text here...",
	value="Hello! Welcome to Pocket TTS. This lightweight text to speech model runs entirely on your CPU. Try changing the voice or adjusting the generation parameters below.",
	lines=9,
	elem_id="text-input"
	)
	voice_mode = gr.Radio(
	choices=["Kyutai Voices", "Voice Cloning"],
	value="Kyutai Voices",
	label="Voice Mode",
	elem_id="voice-mode"
	)

	with gr.Column(visible=True) as standard_voice_col:
	voice_select = gr.Dropdown(
	choices=VOICES,
	value="alba",
	label="Select from Kyutai Voices",
	elem_id="voice-select"
	)

	with gr.Column(visible=False) as cloning_voice_col:
	voice_upload = gr.Audio(
	label="Upload Voice for Cloning (WAV/MP3)",
	type="filepath",
	elem_id="voice-upload"
	)

	# Generation Parameters Accordion
	with gr.Accordion("⚙️ Generation Parameters", open=False):
	with gr.Row():
	temperature_slider = gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=2.0,
	step=0.05,
	value=DEFAULT_TEMPERATURE,
	info="Higher values produce more expressive speech"
	)
	lsd_decode_steps_slider = gr.Slider(
	label="LSD Decode Steps",
	minimum=1,
	maximum=20,
	step=1,
	value=DEFAULT_LSD_DECODE_STEPS,
	info="More steps may improve quality but slower"
	)

	with gr.Row():
	noise_clamp_slider = gr.Slider(
	label="Noise Clamp",
	minimum=0.0,
	maximum=2.0,
	step=0.05,
	value=DEFAULT_NOISE_CLAMP,
	info="Maximum noise sampling value (0 = disabled)"
	)
	eos_threshold_slider = gr.Slider(
	label="End of Sequence Threshold",
	minimum=-10.0,
	maximum=0.0,
	step=0.25,
	value=DEFAULT_EOS_THRESHOLD,
	info="Smaller values cause earlier completion"
	)

	with gr.Row():
	enable_custom_frames_checkbox = gr.Checkbox(
	label="Enable Custom Frames After EOS",
	value=False,
	info="Manually control post-EOS frame generation"
	)
	frames_after_eos_slider = gr.Slider(
	label="Frames After EOS",
	minimum=0,
	maximum=100,
	step=1,
	value=DEFAULT_FRAMES_AFTER_EOS,
	info="Additional frames after end-of-sequence (80ms per frame)"
	)

	with gr.Row():
	clear_btn = gr.Button("🗑️ Clear", variant="secondary")
	generate_btn = gr.Button("⚡ Generate", variant="primary")
	stop_btn = gr.Button("🔴 Stop", variant="stop", visible=False)

	with gr.Column(scale=1):
	audio_output = gr.Audio(
	label="Audio Output",
	autoplay=True,
	elem_id="audio-output"
	)
	gr.Examples(
	examples=[
	["On Tuesday, the seventeenth of October, two thousand twenty-five, at exactly six forty-five in the morning, the outdoor temperature dropped to twelve point eight degrees Celsius. The forecast predicts a high of twenty-two degrees by noon.", "alba"],
	["Welcome to Station Forty-Seven. Your train to Platform Nineteen B will arrive in approximately fifteen minutes. Please have your tickets ready for inspection.", "marius"],
	["You dare defy me? I have spent twenty long years hunting you across every shadow and every corner of this wretched kingdom. There is no escape. Justice will find you, and when it does, you will kneel before me and beg for mercy that will never come!", "javert"],
	["Flight Seven Ninety-Two to London Heathrow is now boarding at Gate Twenty-Three A. Final call for passengers Smith and Johnson. Departure is scheduled for fourteen thirty hours.", "jean"],
	["Our quarterly revenue reached four point seven million dollars, up eighteen percent from last year. The board meeting is scheduled for the twenty-fifth of November at two fifteen in the afternoon.", "fantine"],
	["The recipe calls for three hundred fifty grams of flour, two hundred milliliters of milk, and one point five teaspoons of vanilla extract. Bake at one hundred eighty degrees for forty-five minutes.", "cosette"],
	["Chapter Fourteen, Page Two Hundred Thirty-Seven. The mysterious traveler arrived at the inn precisely at midnight. He carried nothing but a worn leather satchel and spoke with an accent no one could place.", "eponine"],
	["Exercise routine: Run five kilometers in under thirty minutes. Complete three sets of fifteen push-ups. Rest for ninety seconds between each set. Cool down with ten minutes of stretching.", "azelma"]
	],
	inputs=[text_input, voice_select],
	)

	gr.HTML("""
	<div class="disclaimer">

	""")

	# Visibility Toggling
	def update_voice_ui(mode):
	if mode == "Kyutai Voices":
	return gr.update(visible=True), gr.update(visible=False)
	else:
	return gr.update(visible=False), gr.update(visible=True)

	voice_mode.change(
	fn=update_voice_ui,
	inputs=[voice_mode],
	outputs=[standard_voice_col, cloning_voice_col]
	)

	# Define generation inputs list
	generation_inputs = [
	text_input,
	voice_mode,
	voice_select,
	voice_upload,
	temperature_slider,
	lsd_decode_steps_slider,
	noise_clamp_slider,
	eos_threshold_slider,
	frames_after_eos_slider,
	enable_custom_frames_checkbox
	]

	# UI state management functions
	def switch_to_generating_state():
	return (
	gr.update(visible=False), # Hide generate button
	gr.update(visible=True, interactive=True) # Show stop button
	)

	def switch_to_idle_state():
	return (
	gr.update(visible=True), # Show generate button
	gr.update(visible=False) # Hide stop button
	)

	# Event handlers
	generate_btn.click(
	fn=switch_to_generating_state,
	outputs=[generate_btn, stop_btn]
	).then(
	fn=generate_speech,
	inputs=generation_inputs,
	outputs=audio_output
	).then(
	fn=switch_to_idle_state,
	outputs=[generate_btn, stop_btn]
	)

	text_input.submit(
	fn=switch_to_generating_state,
	outputs=[generate_btn, stop_btn]
	).then(
	fn=generate_speech,
	inputs=generation_inputs,
	outputs=audio_output
	).then(
	fn=switch_to_idle_state,
	outputs=[generate_btn, stop_btn]
	)

	# Stop button handler
	stop_btn.click(
	fn=request_generation_stop,
	outputs=[stop_btn]
	)

	# Clear button handler - also reset generation parameters
	def perform_clear_action():
	return (
	"", # text_input
	"Kyutai Voices", # voice_mode
	"alba", # voice_select
	None, # voice_upload
	None, # audio_output
	DEFAULT_TEMPERATURE, # temperature_slider
	DEFAULT_LSD_DECODE_STEPS, # lsd_decode_steps_slider
	DEFAULT_NOISE_CLAMP, # noise_clamp_slider
	DEFAULT_EOS_THRESHOLD, # eos_threshold_slider
	DEFAULT_FRAMES_AFTER_EOS, # frames_after_eos_slider
	False # enable_custom_frames_checkbox
	)

	clear_btn.click(
	fn=perform_clear_action,
	outputs=[
	text_input,
	voice_mode,
	voice_select,
	voice_upload,
	audio_output,
	temperature_slider,
	lsd_decode_steps_slider,
	noise_clamp_slider,
	eos_threshold_slider,
	frames_after_eos_slider,
	enable_custom_frames_checkbox
	]
	)

	if __name__ == "__main__":
	demo.queue().launch(theme=theme, css=css)