# File size: 2,330 Bytes (revision 8eb2cb0)
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import time
# Path to the XTTS config.json that belongs to the fine-tuned model.
CONFIG_PATH = "/content/fine-tunning-result/speakerjj393_50/run/training/XTTS_v2.0_original_model_files/config.json"
# Vocabulary (tokenizer) file that was used to train the model.
TOKENIZER_PATH = "/content/fine-tunning-result/speakerjj393_50/run/training/XTTS_v2.0_original_model_files/vocab.json"
# Checkpoint to run inference with.
XTTS_CHECKPOINT = "/content/fine-tunning-result/speakerjj393_50/run/training/GPT_XTTS_FT-December-13-2023_10+17AM-c99e885c/best_model.pth"
# Reference recording of the target speaker; the conditioning latents below
# are computed from this file.
SPEAKER_REFERENCE = "/content/fine-tunning-result/speakerjj393_50/dataset/wavs/say_set1_collectorjj71_speakerjj393_46_0_19_00000000.wav"
# Output wav path. voice_inference() returns the path it writes instead of
# updating this module-level value, so it stays empty.
OUTPUT_WAV_PATH = ""

print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
# NOTE(review): speaker_file_path is still the template placeholder; confirm
# that load_checkpoint tolerates a missing file or point it at a real one.
model.load_checkpoint(
    config,
    checkpoint_path=XTTS_CHECKPOINT,
    vocab_path=TOKENIZER_PATH,
    use_deepspeed=False,
    speaker_file_path="/path/to/speaker/file.pth",
)
# Only move the model to the GPU when one is present: an unconditional
# model.cuda() raises on machines without CUDA, which would make the whole
# module fail at import time.
if torch.cuda.is_available():
    model.cuda()
else:
    print("CUDA is not available; running the model on CPU.")

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
# Synthesize the speech, then save it as a WAV file (the original note names static/result.wav; the code writes under static/wav/).
def voice_inference(sentence):
    """Synthesize ``sentence`` as Korean speech and save it as a WAV file.

    Runs ``model.inference`` with the module-level speaker latents, writes
    the resulting waveform under ``static/wav/`` using a timestamp-based
    file name, and returns the path of the written file.

    Args:
        sentence: Text to synthesize; passed to the model with the language
            code ``"ko"``.

    Returns:
        Relative path of the saved WAV file, e.g.
        ``static/wav/2023-12-13-10-17-05.wav``. When a file with that name
        already exists, a numeric suffix (``-1``, ``-2``, ...) is appended.

    Raises:
        FileNotFoundError: If the file is still missing after saving and a
            bounded wait of about ten seconds.
    """
    print("Inference...")
    out = model.inference(
        sentence,
        "ko",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.7,  # Add custom parameters here
        top_k=55,  # default is 50; lower values give more similar outputs
    )
    # torchaudio.save expects a 2-D tensor, so add a leading channel axis.
    waveform = torch.tensor(out["wav"]).unsqueeze(0)

    # Build a file name that does not collide with an existing file. The
    # timestamp only has one-second resolution, so two calls within the same
    # second would otherwise overwrite each other.
    output_dir = "static/wav"
    os.makedirs(output_dir, exist_ok=True)  # saving fails if the dir is missing
    stem = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    wav_path = f"{output_dir}/{stem}.wav"
    suffix = 1
    while os.path.exists(wav_path):
        wav_path = f"{output_dir}/{stem}-{suffix}.wav"
        suffix += 1

    # Save the audio file. The 24000 Hz sample rate is hard-coded here;
    # presumably it matches the model's output rate - TODO confirm against
    # the loaded config.
    torchaudio.save(wav_path, waveform, 24000)

    # Wait until the file is visible on disk, but never hang forever: give up
    # after a bounded wait instead of looping indefinitely.
    deadline = time.monotonic() + 10
    while not os.path.exists(wav_path):
        if time.monotonic() >= deadline:
            raise FileNotFoundError(f"WAV file was not created: {wav_path}")
        time.sleep(1)  # wait one second, then check again

    # Message text restored from mojibake ("The WAV file has been created").
    print(f"WAV 파일이 생성되었습니다: {wav_path}")
    return wav_path