|
|
import os |
|
|
import torch |
|
|
import torchaudio |
|
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
|
from TTS.tts.models.xtts import Xtts |
|
|
import time |
|
|
|
|
|
|
|
|
# Artifacts produced by the XTTS v2 fine-tuning run (Colab paths).
# Model config exported alongside the original model files.
CONFIG_PATH = "/content/fine-tunning-result/speakerjj393_50/run/training/XTTS_v2.0_original_model_files/config.json"

# BPE tokenizer vocabulary matching the checkpoint.
TOKENIZER_PATH = "/content/fine-tunning-result/speakerjj393_50/run/training/XTTS_v2.0_original_model_files/vocab.json"

# Best checkpoint selected during fine-tuning.
XTTS_CHECKPOINT = "/content/fine-tunning-result/speakerjj393_50/run/training/GPT_XTTS_FT-December-13-2023_10+17AM-c99e885c/best_model.pth"

# Reference audio clip of the target speaker; used below to compute the
# conditioning latents for voice cloning.
SPEAKER_REFERENCE = "/content/fine-tunning-result/speakerjj393_50/dataset/wavs/say_set1_collectorjj71_speakerjj393_46_0_19_00000000.wav"

# NOTE(review): this module-level value is never updated — voice_inference()
# assigns a *local* variable of the same name and returns it instead.
OUTPUT_WAV_PATH = ""
|
|
|
|
|
print("Loading model...")

# Build the model from its JSON config, then load the fine-tuned weights.
config = XttsConfig()
config.load_json(CONFIG_PATH)

model = Xtts.init_from_config(config)
# NOTE(review): speaker_file_path is a literal placeholder
# ("/path/to/speaker/file.pth") — confirm whether this checkpoint needs a real
# speakers file or whether the argument can be dropped.
model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENIZER_PATH, use_deepspeed=False, speaker_file_path="/path/to/speaker/file.pth")

# Move the model to GPU; this script assumes a CUDA device is available.
model.cuda()

print("Computing speaker latents...")

# Conditioning latents derived once from the reference clip and reused for
# every synthesis call in voice_inference().
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def voice_inference(sentence):
    """Synthesize `sentence` with the fine-tuned XTTS model and save it as WAV.

    Uses the module-level `model`, `gpt_cond_latent` and `speaker_embedding`
    computed at import time.

    Args:
        sentence: Text to synthesize ("ko" language code is passed, so Korean
            text is expected).

    Returns:
        str: Path of the timestamped WAV file written under static/wav/.
    """
    print("Inference...")

    out = model.inference(
        sentence,
        "ko",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.7,
        top_k=55,
    )

    # model.inference returns the waveform as a 1-D array-like; torchaudio
    # expects a (channels, samples) tensor, hence the unsqueeze.
    wav_tensor = torch.tensor(out["wav"]).unsqueeze(0)

    # Timestamped output path, e.g. static/wav/2023-12-13-10-17-00.wav.
    n = time.localtime()
    OUTPUT_WAV_PATH = 'static/wav/%04d-%02d-%02d-%02d-%02d-%02d.wav' % (n.tm_year, n.tm_mon, n.tm_mday, n.tm_hour, n.tm_min, n.tm_sec)

    # Fix: create the target directory if missing — torchaudio.save raises
    # when static/wav/ does not exist.
    os.makedirs(os.path.dirname(OUTPUT_WAV_PATH), exist_ok=True)

    # assumes the model output is sampled at 24 kHz — TODO confirm against the
    # loaded config instead of hard-coding.
    torchaudio.save(OUTPUT_WAV_PATH, wav_tensor, 24000)

    # Fix: the former `while not os.path.exists(...): time.sleep(1)` loop was
    # removed — torchaudio.save is synchronous, so the file exists once it
    # returns; the loop only risked spinning forever if saving had failed.

    print(f"WAV 파일이 생성되었습니다: {OUTPUT_WAV_PATH}")

    return OUTPUT_WAV_PATH