# styletts2-ver2 / datnt_predictor.py
# hieuducle's picture
# Upload full StyleTTS2_custom folder
# 1b242be verified
import torch
import torchaudio
import librosa
import yaml
# from nltk.tokenize import word_tokenize
import phonemizer
import time
# Limit PyTorch to 4 intra-op CPU threads.
torch.set_num_threads(4)

# Setup
# Inference is pinned to the GPU. The automatic fallback is currently disabled:
#   device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = "cuda"

# Vietnamese espeak phonemizer. For English, use instead:
#   phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='vi',
    preserve_punctuation=True,
    language_switch='remove-flags',
    with_stress=True,
)

# Load model (only once)
# Read the config inside a context manager so the file handle is closed
# as soon as parsing is done.
with open("/workspace/trainTTS/StyleTTS2_custom/Configs/config_ft.yml") as config_file:
    config = yaml.safe_load(config_file)

from models import *
from utils import *
from text_utils import TextCleaner

# NOTE: the misspelled name is kept on purpose; clone_voice() refers to it.
textclenaer = TextCleaner()

text_aligner = load_ASR_models(config['ASR_path'], config['ASR_config'])
pitch_extractor = load_F0_models(config['F0_path'])

from Utils.PLBERT.util import load_plbert

plbert = load_plbert(config['PLBERT_dir'])

model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. The tensors are mapped onto the same device used for inference.
params = torch.load(
    "/workspace/trainTTS/StyleTTS2_custom/Models/mix5voice/ver2/best_model.pth",
    map_location=device,
)['net']
# Other checkpoints that were tried:
#   /workspace/trainTTS/StyleTTS2_custom/Models/mix5voice/merge_model.pth
#   /workspace/trainTTS/epochs_2nd_00020_universal.pth
#   /u01/colombo/hungnt/hieuld/tts/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020_no_bert.pth
# Copy the checkpoint weights into every sub-module of the model.
for component_name in model:
    checkpoint_weights = params[component_name]
    # Drop a leading "module." from parameter names when it is present.
    cleaned_weights = {
        (name[len("module."):] if name.startswith("module.") else name): tensor
        for name, tensor in checkpoint_weights.items()
    }
    component = model[component_name]
    component.load_state_dict(cleaned_weights, strict=True)
    # Switch to evaluation mode and move the sub-module to the inference device.
    component.eval().to(device)

from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

# Diffusion sampler used by clone_voice() to draw a style vector.
sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0),
    clamp=False,
)
# Voice-cloning function
def _reference_style(reference_audio_path):
    """Encode the reference recording into a single style tensor.

    The audio is loaded at 24 kHz, trimmed with ``librosa.effects.trim``
    (``top_db=30``), turned into a normalised log-mel spectrogram and fed to
    ``model.style_encoder`` and ``model.predictor_encoder``; both outputs are
    concatenated along dim 1.
    """
    wave, _ = librosa.load(reference_audio_path, sr=24000)
    audio, _ = librosa.effects.trim(wave, top_db=30)
    to_mel = torchaudio.transforms.MelSpectrogram(
        n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
    mel = to_mel(torch.from_numpy(audio).float())
    # Log-compress, then shift by -4 and scale by 4.
    mel = (torch.log(1e-5 + mel.unsqueeze(0)) - (-4)) / 4
    mel = mel.to(device).unsqueeze(1)
    with torch.no_grad():
        ref_s = model.style_encoder(mel)
        ref_p = model.predictor_encoder(mel)
        return torch.cat([ref_s, ref_p], dim=1)


def _alignment_matrix(pred_dur):
    """Build the 0/1 token-to-frame alignment from per-token frame counts.

    Row ``i`` contains ones over the consecutive run of ``pred_dur[i]`` frames
    assigned to token ``i`` and zeros elsewhere. ``pred_dur`` must be 1-D.
    """
    frame_counts = [int(count) for count in pred_dur.tolist()]
    alignment = torch.zeros(len(frame_counts), sum(frame_counts))
    start = 0
    for row, n_frames in enumerate(frame_counts):
        alignment[row, start:start + n_frames] = 1
        start += n_frames
    return alignment


def clone_voice(text, reference_audio_path, *, alpha=0.3, beta=0.7,
                diffusion_steps=5):
    """Synthesise ``text`` in the voice of a reference recording.

    Args:
        text: Text to speak. It is stripped and phonemized with the
            module-level ``global_phonemizer``.
        reference_audio_path: Path of the audio file the style is taken from.
        alpha: Weight of the diffusion-sampled style in the first 128 style
            channels (the part handed to ``model.decoder``); the reference
            style gets ``1 - alpha``.
        beta: Weight of the diffusion-sampled style in the remaining channels
            (the part handed to ``model.predictor``); the reference style
            gets ``1 - beta``.
        diffusion_steps: Number of sampler steps used to draw the style.

    Returns:
        The decoder output as a NumPy array with its last 50 samples removed
        (presumably a 1-D waveform at 24 kHz -- TODO confirm).

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise ValueError("clone_voice() needs non-empty text to synthesise")

    # Extract the style from the reference audio.
    ref_style = _reference_style(reference_audio_path)

    # Synthesize
    ps = global_phonemizer.phonemize([text.strip()])[0]
    # Swap through a temporary placeholder: "t̪" becomes "t" while every
    # other "t" becomes "tʰ".
    ps = ps.replace("t̪", "vhv.vn").replace("t", "tʰ").replace("vhv.vn", "t")
    tokens = torch.LongTensor(textclenaer(ps)).to(device).unsqueeze(0)
    # Prepend token id 0 to the sequence.
    tokens = torch.cat([torch.LongTensor([0]).to(device).unsqueeze(0), tokens], dim=-1)
    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
    # True marks positions at or beyond the sequence length.
    text_mask = torch.arange(input_lengths.max()).unsqueeze(0).expand(input_lengths.shape[0], -1).type_as(input_lengths)
    text_mask = torch.gt(text_mask + 1, input_lengths.unsqueeze(1)).to(device)

    with torch.no_grad():
        t_en = model.text_encoder(tokens, input_lengths, text_mask)
        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

        s_pred = sampler(noise=torch.randn((1, 256)).unsqueeze(1).to(device),
                         embedding=bert_dur, features=ref_style,
                         num_steps=diffusion_steps).squeeze(1)
        # Blend the sampled style with the reference style.
        s = beta * s_pred[:, 128:] + (1 - beta) * ref_style[:, 128:]
        ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_style[:, :128]

        d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
        x, _ = model.predictor.lstm(d)
        duration = torch.sigmoid(model.predictor.duration_proj(x)).sum(axis=-1)
        # reshape(-1) keeps the durations 1-D even for a single token, where
        # squeeze() would collapse them to a 0-d tensor that cannot be indexed.
        pred_dur = torch.round(duration.reshape(-1)).clamp(min=1)

        # Build the alignment once and reuse it for both projections.
        alignment = _alignment_matrix(pred_dur).unsqueeze(0).to(device)

        en = d.transpose(-1, -2) @ alignment
        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
        asr = t_en @ alignment
        out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))

    # Drop the final 50 samples of the generated audio.
    return out.squeeze().cpu().numpy()[..., :-50]
# list_texts = ["Vietnam – Philippines: Unraveling the phenomenon in the SEA Games 33 semi-finals against Thailand.","Vietnam – Philippines: Unraveling the phenomenon in the SEA Games 33 semi-finals against Thailand. Vietnams next obstacle on its journey to the SEA Games 33 final is the Philippines, a major surprise at the tournament, reaching the semi-finals for the first time in 34 years. * Vietnam – Philippines: 3:30 PM, Monday, December 15th, on VnExpress. The last time the Philippines reached the SEA Games semi-finals was in 1991 on home soil. That was a time when a win only counted for two points; the Philippines drew with Vietnam 2-2, won against Malaysia 1-0, and lost to Indonesia 1-2 to advance as runners-up in their group. But in the semi-finals, they dropped their mask, suffering a crushing 2-6 defeat against Thailand, and then losing 0-2 to Singapore in the bronze medal match. 34 years later, the Philippines are once again enjoying the atmosphere of a SEA Games semi-final. Yesterday, their girls also overcame host Thailand after a penalty shootout, reaching the womens football final for the first time and facing Vietnam again. Philippine football is truly thriving at this SEA Games and they are dreaming of reaching both finals. Nguyen Dinh Bac celebrates the equalizer goal in the 2-1 victory against the Philippines in the semi-final of the 2025 Southeast Asian U23 Championship, at Gelora Bung Karno Stadium in Jakarta, Indonesia on July 25, 2025. Photo: Hai Tu. The Philippines were in a tough Group C, but they defeated Myanmar 2-0 and then the defending champions Indonesia 1-0. Coach Garrath McPherson affirmed that this was not a matter of luck or chance. His players showed enthusiasm and confidence, along with disciplined play, a willingness to hold onto the ball, and a readiness to compete physically. We wont change our approach. 
# This is an opportunity for the young players to show themselves against a strong opponent, McPherson said at the press conference on the afternoon of December 14th. Vietnam is physically strong. We are too, and we mustnt let their midfielders and defenders have free rein with the ball."]
# list_texts = ["Vietnam – Philippines: Unraveling the phenomenon in the SEA Games 33 semi-finals against Thailand. Vietnams next obstacle on its journey to the SEA Games 33 final is the Philippines, a major surprise at the tournament, reaching the semi-finals for the first time in 34 years. * Vietnam – Philippines: 3:30 PM, Monday, December 15th, on VnExpress. The last time the Philippines reached the SEA Games semi-finals was in 1991 on home soil. That was a time when a win only counted for two points; the Philippines drew with Vietnam 2-2, won against Malaysia 1-0, and lost to Indonesia 1-2 to advance as runners-up in their group. But in the semi-finals, they dropped their mask, suffering a crushing 2-6 defeat against Thailand, and then losing 0-2 to Singapore in the bronze medal match. 34 years later, the Philippines are once again enjoying the atmosphere of a SEA Games semi-final. Yesterday, their girls also overcame host Thailand after a penalty shootout, reaching the womens football final for the first time and facing Vietnam again. Philippine football is truly thriving at this SEA Games and they are dreaming of reaching both finals. Nguyen Dinh Bac celebrates the equalizer goal in the 2-1 victory against the Philippines in the semi-final of the 2025 Southeast Asian U23 Championship, at Gelora Bung Karno Stadium in Jakarta, Indonesia on July 25, 2025. Photo: Hai Tu. The Philippines were in a tough Group C, but they defeated Myanmar 2-0 and then the defending champions Indonesia 1-0. Coach Garrath McPherson affirmed that this was not a matter of luck or chance. His players showed enthusiasm and confidence, along with disciplined play, a willingness to hold onto the ball, and a readiness to compete physically. We wont change our approach. This is an opportunity for the young players to show themselves against a strong opponent, McPherson said at the press conference on the afternoon of December 14th. Vietnam is physically strong. 
# We are too, and we mustnt let their midfielders and defenders have free rein with the ball."]
# list_texts = ["luôn luôn trân trọng những gì đã trải qua, bạn ơi tôi đánh giá bạn rất cao, ngày hôm nay rất đẹp"]
# list_texts = ["Trao giải cuộc thi Báo chí viết về bảo vệ môi trường ngành Công Thương 2025 Chiều 15/12, tại Hà Nội, Bộ Công Thương long trọng tổ chức Lễ trao giải cuộc thi Báo chí viết về bảo vệ môi trường ngành Công Thương năm 2025. Cuộc thi nhằm tôn vinh những tác phẩm báo chí tiêu biểu, có đóng góp quan trọng đối với công tác bảo vệ môi trường ngành Công Thương."]
import os
import subprocess

import numpy as np
import soundfile as sf

# Texts to synthesise; each entry is split into sentences on '.'.
list_texts = ["Hà Nội, trái tim của Việt Nam, là một thành phố ngàn năm văn hiến với bề dày lịch sử và văn hóa độc đáo. Bước chân trên những con phố cổ kính quanh Hồ Hoàn Kiếm, du khách như được du hành ngược thời gian, chiêm ngưỡng kiến trúc Pháp cổ điển hòa quyện với nét kiến trúc truyền thống Việt Nam"]

# Recording whose voice is cloned.
reference_audio = "/workspace/trainTTS/StyleTTS2_custom/sangnq_original.wav"

for text in list_texts:
    st = time.time()
    wavs = []
    for sen in text.split('.'):
        # Skip empty and whitespace-only fragments (e.g. after a trailing
        # ". "); they contain nothing to synthesise.
        if not sen.strip():
            continue
        # Cut 1000 samples from the start and 4000 from the end of each clip.
        wavs.append(clone_voice(sen, reference_audio)[1000:-4000])
    all_wav = np.concatenate(wavs)
    print('Time cloning voice = ', time.time()-st)

# Save or play
# NOTE: the output path is fixed, so the file holds the audio of the last
# entry in list_texts.
output = './test_voice_clone/sangnq_original_clone.wav'
mp3_output = os.path.splitext(output)[0] + '.mp3'
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(output), exist_ok=True)
sf.write(output, all_wav, 24000)

# 2. Convert to MP3 with the ffmpeg command-line tool
command = [
    'ffmpeg',
    '-y',             # overwrite the file if it already exists
    '-i', output,     # input file
    '-b:a', '192k',   # audio bitrate; 128k or 320k are alternatives
    mp3_output,       # output file
]
try:
    # capture_output=True keeps ffmpeg's log out of the terminal.
    subprocess.run(command, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
    print("Lỗi FFmpeg!")
    # errors='replace' so undecodable bytes in stderr cannot raise here.
    print(f"Chi tiết lỗi: {e.stderr.decode(errors='replace')}")
except FileNotFoundError:
    print("Lỗi: Không tìm thấy lệnh 'ffmpeg'. Hãy chắc chắn bạn đã cài đặt và thêm vào PATH.")
else:
    print(f"Đã chuyển đổi thành công: {mp3_output}")