|
|
import os |
|
|
import subprocess |
|
|
import sys |
|
|
|
|
|
|
|
|
os.environ["OMP_NUM_THREADS"] = "4" |
|
|
|
|
|
|
|
|
def setup_dependencies():
    """Install the development build of transformers from GitHub, once.

    A marker file under /tmp records a successful install so that repeated
    container startups skip the (slow) reinstall. Any failure is logged and
    swallowed so the app can still attempt to start with whatever version
    is already present.
    """
    marker = '/tmp/deps_installed'
    try:
        # Already installed on a previous startup — nothing to do.
        if os.path.exists(marker):
            return

        print("Installing transformers dev version...")
        pip_cmd = [
            sys.executable, "-m", "pip", "install",
            "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ]
        subprocess.check_call(pip_cmd)

        # Only written after check_call succeeds, so a failed install
        # is retried on the next startup.
        with open(marker, 'w') as f:
            f.write('done')

    except Exception as e:
        # Best-effort: report and continue rather than crash the app.
        print(f"Dependencies setup error: {e}")
|
|
|
|
|
|
|
|
# Best-effort install of the bleeding-edge transformers build BEFORE the
# model-related imports below, so they pick up the freshly installed version.
setup_dependencies()
|
|
|
|
|
import spaces |
|
|
import gradio as gr |
|
|
from util import Config, NemoAudioPlayer, KaniModel, Demo |
|
|
import numpy as np |
|
|
import torch |
|
|
|
|
|
|
|
|
# Hugging Face access token (may be None) used to download model weights.
token_ = os.getenv('HF_TOKEN')


# One Config per selectable voice. 'base' keeps the library defaults
# (random voices, per the dropdown hint below); the other two point at
# fine-tuned checkpoints on the Hub.
models_configs = {
    'base': Config(),
    'female': Config(
        model_name='nineninesix/kani-tts-450m-0.2-ft',
    ),
    'male': Config(
        model_name='nineninesix/kani-tts-450m-0.1-ft',
    )
}


# A single audio player instance shared by every model wrapper.
player = NemoAudioPlayer(Config())
# Eagerly load all models at import time so the first request is fast.
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
|
|
|
|
|
|
|
|
@spaces.GPU
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the module-level ``models`` dict.
        t: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, message)`` describing the problem on failure.
    """
    # Guard against None as well as empty/whitespace-only input: Gradio can
    # pass None for an untouched textbox, and None.strip() would raise.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    # Fail with a clear message instead of a raw KeyError traceback.
    if model_choice not in models:
        return None, f"❌ Unknown model: {model_choice}"

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)

        # NOTE(review): hard-coded output rate; presumably matches the
        # NemoAudioPlayer decode rate — confirm against util.Config.
        sample_rate = 22050
        print("Speech generation completed!")

        # Gradio's numpy Audio component expects a (rate, ndarray) tuple.
        return (sample_rate, audio), time_report

    except Exception as e:
        # Surface the error in the UI rather than crashing the worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
|
|
|
|
|
|
|
|
# ---- Gradio UI ----------------------------------------------------------
# Two-column layout: inputs + sampling settings on the left, generated
# audio and a timing report on the right.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        # Left column: model selection, text input, generation controls.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            # Sampling hyper-parameters, collapsed by default. Defaults
            # mirror the values used in the example rows below.
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        # Right column: synthesized audio and generation timing report.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Wire the Run button to the GPU-backed generation handler.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    with gr.Row():
        # Each example row: [text, model, temp, top_p, rep. penalty, max tokens].
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 0.6, 0.95, 1.1, 1200],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 0.6, 0.95, 1.1, 1200],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 0.6, 0.95, 1.1, 1200],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 0.6, 0.95, 1.1, 1200],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male", 0.6, 0.95, 1.1, 1200],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 0.6, 0.95, 1.1, 1200],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female", 0.6, 0.95, 1.1, 1200],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 0.6, 0.95, 1.1, 1200],
        ]

        # cache_examples=True pre-runs every example at build time so
        # clicking an example plays cached audio instead of regenerating.
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on the standard Spaces port so the app is
    # reachable from outside the container; show_error surfaces tracebacks
    # in the browser instead of a generic failure page.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)