|
|
# Bootstrap: install/verify runtime dependencies *before* the heavy imports
# below (spaces, gradio, torch) so they are importable on first use.
from create_env import setup_dependencies

setup_dependencies()
|
|
|
|
|
import spaces |
|
|
import gradio as gr |
|
|
from util import NemoAudioPlayer, InitModels, load_config, Examples |
|
|
import numpy as np |
|
|
import torch |
|
|
import os |
|
|
|
|
|
|
|
|
# Hugging Face access token for gated model downloads; None when unset.
token_ = os.getenv('HF_TOKEN')
|
|
|
|
|
# Load model and audio-player configuration from YAML.
config = load_config("./model_config.yaml")
models_configs = config.models          # per-model settings, keyed by model display name
nemo_player_cfg = config.nemo_player    # settings for the NeMo audio player/codec

# Build the example rows shown in the gr.Examples widget below.
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()

# Instantiate the shared audio player and eagerly load every configured
# model at startup (InitModels returns a dict keyed like models_configs).
player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()
|
|
|
|
|
|
|
|
@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize; may be None/empty when the textbox is cleared.
        model_choice: Key into the module-level ``models`` / ``models_configs`` dicts.
        speaker_display: Human-readable speaker name (empty for base models).
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, message)`` for invalid input or a generation failure.
    """
    # Guard against None as well as whitespace-only input: Gradio can pass
    # None for a cleared textbox, and None.strip() would raise AttributeError.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]
        cfg = models_configs.get(model_choice)
        # Map the display name back to the model's internal speaker id;
        # models without a 'speaker_id' section get speaker_id=None.
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)

        # NOTE(review): output rate assumed to be 22.05 kHz for every model —
        # confirm against the NemoAudioPlayer codec configuration.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report

    except Exception as e:
        # Surface the failure in the UI instead of crashing the Space.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
|
|
|
|
|
|
|
|
# --- UI definition: two-column layout with inputs/settings on the left and
# --- audio output + timing report on the right.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )

            # Union of speaker names across all models, used to pre-populate
            # the dropdown; per-model filtering happens in update_speakers().
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(set(all_speakers))
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1000, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    def update_speakers(model_choice):
        """Show the speaker dropdown with the selected model's speakers,
        or hide it entirely for models that have no speaker map."""
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    # Refresh the speaker list whenever the model changes, and once on page
    # load so the initial model's speakers are shown.
    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    with gr.Row():
        # Pre-cached example generations (rows built from examples.yaml above).
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
|
|
|
|
|
def _main() -> None:
    """Start the Gradio server on all interfaces, port 7860."""
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)


if __name__ == "__main__":
    _main()