| import gradio as gr |
| import numpy as np |
| import torch |
| from TTS.tts.configs.xtts_config import XttsConfig |
| from TTS.tts.models.xtts import Xtts |
|
|
# Pick the torch device once at import time: the first CUDA GPU when torch
# reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
|
|
def load_model():
    """Build the XTTS model from the files under ``model/``.

    Reads ``model/config.json`` into a fresh ``XttsConfig``, initialises an
    ``Xtts`` instance from it, loads the checkpoint and vocabulary (DeepSpeed
    disabled), and moves the model to the module-level ``device``.

    Returns:
        The loaded ``Xtts`` model instance.
    """
    xtts_config = XttsConfig()
    xtts_config.load_json("model/config.json")

    xtts = Xtts.init_from_config(xtts_config)
    xtts.load_checkpoint(
        xtts_config,
        checkpoint_path="model/model.pth",
        vocab_path="model/vocab.json",
        use_deepspeed=False,
    )
    xtts.to(device)
    return xtts
|
|
# Load the model once at import time; `predict` reads this module-level
# instance on every call instead of reloading the checkpoint.
model = load_model()
|
|
def predict(sentence, language, reference_clip):
    """Synthesize ``sentence`` in the voice of ``reference_clip``.

    Args:
        sentence: Text to speak.
        language: Language code forwarded unchanged to the model.
        reference_clip: Path to an ``.mp3`` or ``.wav`` file used to compute
            the speaker conditioning. Assumed to be a plain path string, as
            the extension check relies on ``str`` methods.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
        array, or ``None`` when an input is missing, the reference clip has
        an unsupported extension, or the model yielded no audio.
    """
    # Nothing to synthesize without text or a language; an unset dropdown
    # arrives as None and would otherwise fail deep inside the model.
    if not sentence or not sentence.strip() or not language:
        return None

    # Compare the extension case-insensitively so "clip.WAV" is accepted too.
    if not reference_clip or not reference_clip.lower().endswith((".mp3", ".wav")):
        return None

    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_clip,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    stream = model.inference_stream(
        text=sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks = [chunk for chunk in stream if chunk is not None]

    # torch.cat raises on an empty list, so bail out if nothing was generated.
    if not wav_chunks:
        return None

    waveform = torch.cat(wav_chunks, dim=0)

    # XTTS configs keep the synthesized-audio rate in audio.output_sample_rate
    # (separate from the conditioning-audio rate); read it from the loaded
    # config instead of hard-coding a value so playback speed is correct.
    sample_rate = model.config.audio.output_sample_rate

    # The model may live on the GPU; Tensor.numpy() only works on CPU tensors.
    return sample_rate, waveform.detach().cpu().numpy()
|
|
# Gradio UI: a text box for the sentence, a dropdown of language codes and a
# file upload for the reference clip, wired to `predict`, with one audio
# player for the result.
demo = gr.Interface(
    fn=predict,
    inputs=[
        "text",
        gr.Dropdown(
            [
                "et", "en", "es", "fr", "de", "it",
                "pt", "pl", "tr", "ru", "nl", "cs",
                "ar", "zh-cn", "hu", "ko", "ja", "hi",
            ]
        ),
        gr.File(),
    ],
    outputs=[gr.Audio()],
    title="XTTSv2-est Demo",
    description="To get the best results, provide a reference clip around the same length as the output sentence you want.",
)


if __name__ == "__main__":
    # Start the web app only when run as a script, not when imported.
    demo.launch()