# KaniTTS / app.py
# HuggingFace Space by ylankgz — "enable settings", revision 088ca61 (6.51 kB)
import os
import subprocess
import sys
# Fix OMP_NUM_THREADS issue before any imports: this must be set before
# numpy/torch (imported further down) initialize their OpenMP runtime,
# or the setting has no effect. NOTE(review): presumably works around an
# OMP oversubscription/crash on Spaces hardware — confirm.
os.environ["OMP_NUM_THREADS"] = "4"
def setup_dependencies():
try:
# Check if already installed
if os.path.exists('/tmp/deps_installed'):
return
print("Installing transformers dev version...")
subprocess.check_call([
sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
"git+https://github.com/huggingface/transformers.git"
])
# Mark as installed
with open('/tmp/deps_installed', 'w') as f:
f.write('done')
except Exception as e:
print(f"Dependencies setup error: {e}")
# Run setup: install the transformers dev build BEFORE importing anything
# that depends on it (util imports model classes).
setup_dependencies()
import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch
# Get HuggingFace token — used by KaniModel to download (possibly gated)
# model weights.
token_ = os.getenv('HF_TOKEN')
# Model configurations: each key is a UI dropdown choice. 'base' uses the
# default model_name baked into Config(); the other two pin Expresso
# voice checkpoints.
models_configs = {
    'base': Config(),
    'female': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
    ),
    'male': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
    )
}
# Global variables for models (loaded once): models are loaded eagerly at
# import time so the first request doesn't pay the load cost; the audio
# player/decoder instance is shared by all models.
player = NemoAudioPlayer(Config())
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
@spaces.GPU
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the global ``models`` dict
            ('base' / 'female' / 'male').
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, error_message)`` on invalid input or failure.
    """
    # Guard against None as well as empty/whitespace text: Gradio can
    # deliver None for a cleared textbox, and None.strip() would raise.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."
    # Reject missing or unknown model choices up front instead of letting
    # the KeyError surface as a generic generation error below.
    if not model_choice or model_choice not in models:
        return None, "Please select a model."
    try:
        # Check GPU availability (informational only; the model decides
        # its own device placement).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        # Get selected model
        selected_model = models[model_choice]
        # Generate audio
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
        # NOTE(review): assumes the codec decodes at 22.05 kHz — confirm
        # against NemoAudioPlayer's actual output rate.
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        # Surface the failure in the UI rather than crashing the worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# Create Gradio interface. Layout: two columns — inputs/settings on the
# left, generated audio and timing report on the right.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")
    with gr.Row():
        # Left column: model choice, text input, sampling settings, run button.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            # Sampling hyper-parameters, collapsed by default. Defaults here
            # match the values used in the examples below.
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")
        # Right column: outputs.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # GPU generation event: slider values are passed positionally, matching
    # the generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok)
    # signature.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    with gr.Row():
        # Each example: [text, model, temperature, top_p, rep. penalty, max tokens].
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 0.6, 0.95, 1.1, 1200],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 0.6, 0.95, 1.1, 1200],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 0.6, 0.95, 1.1, 1200],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 0.6, 0.95, 1.1, 1200],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male", 0.6, 0.95, 1.1, 1200],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 0.6, 0.95, 1.1, 1200],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female", 0.6, 0.95, 1.1, 1200],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 0.6, 0.95, 1.1, 1200],
        ]
        # NOTE(review): cache_examples=True makes Gradio run
        # generate_speech_gpu on every example at startup to pre-render
        # outputs — confirm the startup cost is intended.
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port; show_error
    # surfaces Python tracebacks in the web UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)