|
|
import os

import gradio as gr
import spaces
import torch
|
|
|
|
|
|
|
|
# Hugging Face Hub repo id of the TTS checkpoint loaded by load_models().
MODEL_ID = "nineninesix/Kani-TTS-370m"
|
|
|
|
|
@spaces.GPU
def generate_speech(text: str, model_choice: str, speaker_display: str):
    """Synthesize speech for *text* with the selected model and speaker.

    Args:
        text: Input text to synthesize; blank input is rejected.
        model_choice: Key into the module-level MODELS registry.
        speaker_display: Display name of the speaker (may be empty/None).

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(error_message, None)`` on failure.
    """
    if not text.strip():
        return "Please enter text for speech generation.", None

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        if model_choice not in MODELS:
            return f"Model '{model_choice}' not found.", None

        selected_model = MODELS[model_choice]

        # Resolve the human-readable speaker name to the model's speaker id;
        # fall back to None (model default) when no mapping applies.
        cfg = selected_model[1]
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None

        print(f"Generating speech with {model_choice}...")

        model_to_generate = selected_model[0]
        audio, _, time_report = model_to_generate.run_model(
            text=text,
            speaker_id=speaker_id,
            temperature=0.7,
            repetition_penalty=1.2,
            max_tokens=1024
        )

        # NOTE(review): sample rate is hard-coded; presumably matches the
        # model's native output rate — confirm against the model config.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report
    except Exception as e:
        # BUG FIX: the original `try:` had no except/finally clause, which is
        # a SyntaxError (the file could not even be imported). Report the
        # failure through the UI instead of crashing the worker.
        print(f"Error during generation: {e}")
        return f"Error during generation: {e}", None
|
|
|
|
|
def load_models():
    """Load the TTS model and its config into the module-level MODELS registry.

    Idempotent: if the registry is already populated, it is returned as-is.

    Returns:
        dict mapping display name -> (model, config) tuples.
    """
    global MODELS
    # BUG FIX: the original `if not MODELS:` raised NameError on the very
    # first call, because the global is only ever assigned from this
    # function's return value (`MODELS = load_models()` at module level).
    if not globals().get("MODELS"):
        print("Loading models into GPU memory...")
        # BUG FIX: AutoConfig was used below but never imported.
        from transformers import AutoConfig, AutoModel
        model_path = MODEL_ID

        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

        MODELS = {
            "Kani TTS 370M": (model, config)
        }

        print(f"Models loaded. Available speakers: {list(config.speaker_id.keys()) if config.speaker_id else []}")
    return MODELS
|
|
|
|
|
|
|
|
# Eagerly populate the model registry at import time so the UI can list
# models and speakers when the Blocks layout is built.
MODELS = load_models()


def _speakers_for(model_name):
    """Return the speaker display names for *model_name*, or [] if none."""
    cfg = MODELS[model_name][1] if model_name in MODELS else None
    speaker_map = cfg.speaker_id if cfg is not None and cfg.speaker_id else None
    return list(speaker_map.keys()) if speaker_map else []


with gr.Blocks(title="😻 KaniTTS - Text to Speech") as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")

    model_names = list(MODELS.keys())
    model_dropdown = gr.Dropdown(
        choices=model_names,
        # Guard against an empty registry instead of raising IndexError.
        value=model_names[0] if model_names else None,
        label="Selected Model"
    )

    all_speakers = _speakers_for(model_names[0]) if model_names else []
    speaker_dropdown = gr.Dropdown(
        choices=all_speakers,
        value=None,
        label="Speaker",
        visible=True,
        allow_custom_value=True
    )

    text_input = gr.Textbox(label="Text", lines=5)

    generate_btn = gr.Button("Generate Speech", variant="primary")

    audio_output = gr.Audio(label="Generated Audio", type="numpy")
    # BUG FIX: generate_speech returns two values (audio, time_report), but
    # the click handler previously declared only one output component, which
    # makes Gradio raise "too many output values" on every generation.
    time_output = gr.Textbox(label="Generation Time Report")

    # Refresh the speaker list whenever the selected model changes; hide the
    # dropdown entirely for models that expose no speakers.
    model_dropdown.change(
        fn=lambda choice: (
            gr.update(choices=_speakers_for(choice), value=None, visible=True)
            if _speakers_for(choice)
            else gr.update(visible=False)
        ),
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, model_dropdown, speaker_dropdown],
        outputs=[audio_output, time_output]
    )


demo.queue().launch(show_api=True)
|
|
|