|
|
import os |
|
|
import subprocess |
|
|
import sys |
|
|
|
|
|
|
|
|
os.environ["OMP_NUM_THREADS"] = "4" |
|
|
|
|
|
|
|
|
def setup_dependencies():
    """Install the development build of transformers from GitHub, once.

    A marker file under /tmp records a successful install so that repeated
    container startups skip the (slow) reinstall. Any failure is logged and
    swallowed so the app can still attempt to start with whatever version
    is already present.
    """
    marker = '/tmp/deps_installed'
    try:
        # Already installed on a previous startup — nothing to do.
        if os.path.exists(marker):
            return

        print("Installing transformers dev version...")
        pip_cmd = [
            sys.executable, "-m", "pip", "install",
            "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ]
        subprocess.check_call(pip_cmd)

        # Only written after check_call succeeds, so a failed install
        # is retried on the next startup.
        with open(marker, 'w') as f:
            f.write('done')

    except Exception as e:
        # Best-effort: report and continue rather than crash the app.
        print(f"Dependencies setup error: {e}")
|
|
|
|
|
|
|
|
# Best-effort install of the bleeding-edge transformers build BEFORE the
# model-related imports below, so they pick up the freshly installed version.
setup_dependencies()
|
|
|
|
|
import spaces |
|
|
import gradio as gr |
|
|
from util import Config, NemoAudioPlayer, KaniModel, Demo |
|
|
import numpy as np |
|
|
import torch |
|
|
|
|
|
|
|
|
# Hugging Face access token (may be None) used to download model weights.
token_ = os.getenv('HF_TOKEN')


# One Config per selectable voice. 'base' keeps the library defaults
# (random voices, per the dropdown hint below); the other two point at
# fine-tuned checkpoints on the Hub.
models_configs = {
    'base': Config(),
    'female': Config(
        model_name='nineninesix/kani-tts-450m-0.2-ft',
    ),
    'male': Config(
        model_name='nineninesix/kani-tts-450m-0.1-ft',
    )
}


# A single audio player instance shared by every model wrapper.
player = NemoAudioPlayer(Config())
# Eagerly load all models at import time so the first request is fast.
models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
|
|
|
|
|
|
|
|
@spaces.GPU
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the module-level ``models`` dict.
        t: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, message)`` describing the problem on failure.
    """
    # Guard against None as well as empty/whitespace-only input: Gradio can
    # pass None for an untouched textbox, and None.strip() would raise.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    # Fail with a clear message instead of a raw KeyError traceback.
    if model_choice not in models:
        return None, f"❌ Unknown model: {model_choice}"

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)

        # NOTE(review): hard-coded output rate; presumably matches the
        # NemoAudioPlayer decode rate — confirm against util.Config.
        sample_rate = 22050
        print("Speech generation completed!")

        # Gradio's numpy Audio component expects a (rate, ndarray) tuple.
        return (sample_rate, audio), time_report

    except Exception as e:
        # Surface the error in the UI rather than crashing the worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
|
|
|
|
|
|
|
|
# ---- Gradio UI ----------------------------------------------------------
# Two-column layout: inputs + sampling settings on the left, generated
# audio and a timing report on the right.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        # Left column: model selection, text input, generation controls.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            # Sampling hyper-parameters, collapsed by default. Defaults
            # mirror the values used in the example rows below.
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        # Right column: synthesized audio and generation timing report.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Wire the Run button to the GPU-backed generation handler.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    with gr.Row():
        # Each example row: [text, model, temp, top_p, rep. penalty, max tokens].
        examples = [
            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 0.6, 0.95, 1.1, 1200],
            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 0.6, 0.95, 1.1, 1200],
            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 0.6, 0.95, 1.1, 1200],
            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 0.6, 0.95, 1.1, 1200],
            ["Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male", 0.6, 0.95, 1.1, 1200],
            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 0.6, 0.95, 1.1, 1200],
            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female", 0.6, 0.95, 1.1, 1200],
            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 0.6, 0.95, 1.1, 1200],
        ]

        # cache_examples=True pre-runs every example at build time so
        # clicking an example plays cached audio instead of regenerating.
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on the standard Spaces port so the app is
    # reachable from outside the container; show_error surfaces tracebacks
    # in the browser instead of a generic failure page.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)