tartuNLP
/

XTTS-v2-multi

Model card Files Files and versions

XTTS-v2-multi / app_local.py

rlellep's picture

Upload folder using huggingface_hub

99341ef verified about 2 months ago

history blame contribute delete

1.84 kB

	import gradio as gr
	import numpy as np
	import torch
	from TTS.tts.configs.xtts_config import XttsConfig
	from TTS.tts.models.xtts import Xtts

	# Run on the first CUDA GPU when torch reports one, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	def load_model(model_dir: str = "model"):
	    """Build the XTTS model from the files in ``model_dir`` and move it to ``device``.

	    Args:
	        model_dir: Directory containing ``config.json``, ``model.pth`` and
	            ``vocab.json``. Defaults to ``"model"``, the location this
	            script has always used.

	    Returns:
	        The object produced by ``Xtts.init_from_config`` after its
	        checkpoint has been loaded and it has been moved to ``device``.
	    """
	    config = XttsConfig()
	    config.load_json(f"{model_dir}/config.json")

	    xtts = Xtts.init_from_config(config)
	    xtts.load_checkpoint(
	        config,
	        checkpoint_path=f"{model_dir}/model.pth",
	        vocab_path=f"{model_dir}/vocab.json",
	        use_deepspeed=False,
	    )
	    xtts.to(device)
	    return xtts

	# Load the model once at import time; predict() reuses this module-level instance.
	model = load_model()

	def predict(sentence, language, reference_clip):
	    """Synthesize ``sentence`` conditioned on the voice in ``reference_clip``.

	    Args:
	        sentence: Text to synthesize.
	        language: Language code passed through to the model.
	        reference_clip: Path to an ``.mp3`` or ``.wav`` file used to compute
	            the conditioning latents and speaker embedding.

	    Returns:
	        A ``(sample_rate, samples)`` tuple with ``samples`` as a NumPy array,
	        or ``None`` when the reference clip is missing, has an unsupported
	        extension, or the model yielded no audio chunks.
	    """
	    # Compare the extension case-insensitively so "clip.WAV" is accepted too.
	    if not reference_clip or not reference_clip.lower().endswith((".mp3", ".wav")):
	        return None

	    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
	        audio_path=reference_clip,
	        gpt_cond_len=model.config.gpt_cond_len,
	        max_ref_length=model.config.max_ref_len,
	        sound_norm_refs=model.config.sound_norm_refs,
	    )

	    stream = model.inference_stream(
	        text=sentence,
	        language=language,
	        gpt_cond_latent=gpt_cond_latent,
	        speaker_embedding=speaker_embedding,
	        temperature=0.1,
	        length_penalty=1.0,
	        repetition_penalty=10.0,
	        top_k=10,
	        top_p=0.3,
	    )
	    wav_chunks = [chunk for chunk in stream if chunk is not None]
	    if not wav_chunks:
	        # torch.cat raises on an empty list; report "no audio" the same way
	        # an invalid reference clip is reported.
	        return None

	    # Copy to host memory before converting: Tensor.numpy() cannot be called
	    # on a tensor that lives on a CUDA device.
	    samples = torch.cat(wav_chunks, dim=0).cpu().numpy()

	    # NOTE(review): the sample rate is hard-coded. Coqui's XTTS examples write
	    # output at 24000 Hz -- confirm 22050 is correct for this checkpoint.
	    return (22050, samples)

	# Language codes offered in the dropdown of the demo UI.
	_LANGUAGE_CHOICES = [
	    "et", "en", "es", "fr", "de", "it", "pt", "pl", "tr",
	    "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi",
	]

	# Gradio UI: text box, language dropdown and reference-clip upload feed
	# predict(); its return value is rendered by an audio component.
	demo = gr.Interface(
	    fn=predict,
	    inputs=["text", gr.Dropdown(_LANGUAGE_CHOICES), gr.File()],
	    outputs=[gr.Audio()],
	    title="XTTSv2-est Demo",
	    description="To get the best results, provide a reference clip around the same length as the output sentence you want.",
	)

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()