Initial upload from script

5273c6f verified 2 months ago

9.96 kB

	# Fetch the NLTK tokenizer resources at start-up (word_tokenize is imported further down).
	import nltk
	nltk.download('punkt_tab') # <--- added: fetch the 'punkt_tab' resource
	nltk.download('punkt') # <--- also fetch 'punkt', to be on the safe side

	# Seed torch with 0 and switch cuDNN to deterministic mode for reproducible runs.
	import torch
	torch.manual_seed(0)
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True

	# Seed Python's random module with 0.
	import random
	random.seed(0)

	# Seed NumPy's global RNG with 0.
	import numpy as np
	np.random.seed(0)

	import time
	import random
	import yaml
	from munch import Munch
	import numpy as np
	import torch
	from torch import nn
	import torch.nn.functional as F
	import torchaudio
	import librosa
	from nltk.tokenize import word_tokenize

	from models import *
	from utils import *
	from text_utils import TextCleaner

	import soundfile as sf
	import os
	# Shared TextCleaner instance; LFinference calls it on the phonemized text to obtain the token list.
	textclenaer = TextCleaner()

	# Mel-spectrogram transform (80 mel bins) that preprocess() applies to the waveform.
	to_mel = torchaudio.transforms.MelSpectrogram(
	n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
	# Constants preprocess() uses to normalise log-mel values: (log_mel - mean) / std.
	mean, std = -4, 4

	def length_to_mask(lengths):
	    """Build a boolean padding mask from a 1-D tensor of sequence lengths.

	    Args:
	        lengths: 1-D tensor with one length per sequence.

	    Returns:
	        Bool tensor of shape ``(len(lengths), lengths.max())`` that is True
	        at positions at or beyond each sequence's length and False elsewhere.
	    """
	    # 0-based positions, cast to the dtype/device of ``lengths``.
	    positions = torch.arange(lengths.max()).type_as(lengths)
	    # Broadcast (1, T) against (B, 1): position p is padding when p + 1 > length.
	    return (positions[None, :] + 1) > lengths[:, None]

	def preprocess(wave):
	    """Convert a NumPy waveform into a normalised log-mel tensor.

	    Args:
	        wave: NumPy array holding the audio samples (fed to
	            ``torch.from_numpy``).

	    Returns:
	        The output of the module-level ``to_mel`` transform with a new
	        leading dimension, log-compressed with a ``1e-5`` floor and
	        normalised with the module-level ``mean`` and ``std``.
	    """
	    mel = to_mel(torch.from_numpy(wave).float()).unsqueeze(0)
	    log_mel = torch.log(1e-5 + mel)
	    return (log_mel - mean) / std

	def compute_style(path):
	    """Encode a reference recording into a style vector.

	    The file is loaded at 24 kHz, passed through ``librosa.effects.trim``
	    (``top_db=30``), converted to a normalised log-mel tensor with
	    ``preprocess`` and fed to the model's style and predictor encoders.

	    Args:
	        path: Path of the reference audio file.

	    Returns:
	        The outputs of ``model.style_encoder`` and
	        ``model.predictor_encoder`` concatenated along dimension 1.
	    """
	    # librosa.load already resamples to the requested rate, so the returned
	    # rate is always 24000 and no separate resampling step is needed.
	    wave, _ = librosa.load(path, sr=24000)
	    audio, _ = librosa.effects.trim(wave, top_db=30)
	    mel_tensor = preprocess(audio).to(device)

	    with torch.no_grad():
	        # Both encoders receive the mel tensor with an extra dimension at index 1.
	        mel_input = mel_tensor.unsqueeze(1)
	        ref_s = model.style_encoder(mel_input)
	        ref_p = model.predictor_encoder(mel_input)

	    return torch.cat([ref_s, ref_p], dim=1)

	# Run on the GPU when CUDA is available, otherwise on the CPU.
	device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# load phonemizer (Vietnamese voice; use language='en-us' for English text)
	import phonemizer
	global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True, with_stress=True,language_switch="remove-flags")

	# Read the experiment config inside a context manager so the file handle is
	# closed right away instead of being left to the garbage collector.
	with open("./Configs/config_libritts.yml", encoding="utf-8") as config_file:
	    config = yaml.safe_load(config_file)

	# load pretrained ASR model
	ASR_config = config.get('ASR_config', False)
	ASR_path = config.get('ASR_path', False)
	text_aligner = load_ASR_models(ASR_path, ASR_config)

	# load pretrained F0 model
	F0_path = config.get('F0_path', False)
	pitch_extractor = load_F0_models(F0_path)

	# load BERT model
	from Utils.PLBERT.util import load_plbert
	BERT_path = config.get('PLBERT_dir', False)
	plbert = load_plbert(BERT_path)

	# Assemble the model from the three pretrained components, then put every
	# sub-module into eval mode and move it to the selected device.
	model_params = recursive_munch(config['model_params'])
	model = build_model(model_params, text_aligner, pitch_extractor, plbert)
	for key in model:
	    model[key].eval()
	    model[key].to(device)

	from collections import OrderedDict

	print("Loading pretrained model from HF...")
	# NOTE(review): torch.load unpickles the file, so only point this at a
	# checkpoint from a trusted source.
	params_whole = torch.load("/workspace/StyleTTS2/Models/LibriTTS/model_iter_00002300.pth", map_location='cpu')

	# Per-sub-module state dicts are stored under the 'net' key.
	params = params_whole['net']

	for key in model:
	    if key not in params:
	        continue
	    print('%s loaded' % key)
	    try:
	        model[key].load_state_dict(params[key])
	    except RuntimeError:
	        # load_state_dict raises RuntimeError on key/shape mismatches.
	        # Retry with a leading `module.` removed from the parameter names,
	        # touching only the keys that actually carry that prefix.
	        prefix = 'module.'
	        new_state_dict = OrderedDict(
	            (k[len(prefix):] if k.startswith(prefix) else k, v)
	            for k, v in params[key].items()
	        )
	        # strict=False: load whatever matches and ignore the rest.
	        model[key].load_state_dict(new_state_dict, strict=False)

	# Make sure every sub-module is in eval mode after loading the weights.
	for key in model:
	    model[key].eval()

	from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

	# Diffusion sampler wrapped around model.diffusion.diffusion; LFinference calls it
	# once per sentence to draw the style vector (s_pred).
	sampler = DiffusionSampler(
	model.diffusion.diffusion,
	sampler=ADPM2Sampler(),
	sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
	clamp=False
	)

	# Text to synthesise; further down it is split on '.' and each piece is passed to LFinference.
	passage = '''Lý do ông Putin chưa chấp nhận kế hoạch hòa bình từ Mỹ, Tổng thống Putin không vội chấp nhận đề xuất hòa bình Ukraine của Mỹ, khi Nga có lợi thế đàm phán nhờ đà tiến trên chiến trường, trong khi phương Tây lục đục nội bộ. Chính quyền Tổng thống Mỹ Donald Trump hồi tháng 11 xây dựng kế hoạch hòa bình 28 điểm để chấm dứt chiến sự Nga - Ukraine, sau đó tiến hành chiến dịch ngoại giao con thoi để thuyết phục hai bên chấp nhận. Ukraine cùng châu Âu đã nhanh chóng phản đối kế hoạch, do nó có nhiều đề xuất hoàn toàn có lợi cho Nga'''
	# passage = '''Kumapaterado là ai và làm gì'''
	def _shift_right_one_frame(features):
	    """Return ``features`` delayed by one step along the last axis.

	    The first frame is repeated and the last frame is dropped, so the
	    shape is unchanged.
	    """
	    shifted = torch.zeros_like(features)
	    shifted[:, :, 0] = features[:, :, 0]
	    shifted[:, :, 1:] = features[:, :, 0:-1]
	    return shifted


	def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):
	    """Synthesize one sentence, carrying the style over from the previous one.

	    Args:
	        text: Sentence to synthesize; surrounding whitespace is stripped.
	        s_prev: Style tensor returned by the previous call, or ``None`` for
	            the first sentence.
	        ref_s: Reference style tensor; columns ``[:128]`` and ``[128:]``
	            are blended with the sampled style separately.
	        alpha: Weight of the sampled style in columns ``[:128]``; the
	            reference gets ``1 - alpha``.
	        beta: Weight of the sampled style in columns ``[128:]``; the
	            reference gets ``1 - beta``.
	        t: Weight of ``s_prev`` when it is mixed with the freshly sampled
	            style (``t * s_prev + (1 - t) * s_pred``).
	        diffusion_steps: ``num_steps`` passed to the diffusion sampler.
	        embedding_scale: ``embedding_scale`` passed to the diffusion sampler.

	    Returns:
	        A tuple ``(wav, s_pred)``: the decoder output as a NumPy array with
	        its last 100 samples removed, and the blended style tensor to pass
	        as ``s_prev`` to the next call.
	    """
	    text = text.strip()
	    ps = global_phonemizer.phonemize([text])
	    ps = ' '.join(word_tokenize(ps[0]))
	    # word_tokenize rewrites double quotes as `` and ''; turn them back into ".
	    ps = ps.replace('``', '"').replace("''", '"')
	    # Map plain 't' to 'tʰ' and 't̪' to 't'; U+FFFF is a temporary placeholder
	    # so the second replacement does not touch the first one's result.
	    ps = ps.replace('t̪', '\uFFFF').replace('t', 'tʰ').replace('\uFFFF', 't')

	    tokens = textclenaer(ps)
	    tokens.insert(0, 0)  # prepend token id 0
	    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
	    n_tokens = tokens.shape[-1]

	    with torch.no_grad():
	        input_lengths = torch.LongTensor([n_tokens]).to(device)
	        text_mask = length_to_mask(input_lengths).to(device)

	        t_en = model.text_encoder(tokens, input_lengths, text_mask)
	        bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
	        d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

	        s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
	                         embedding=bert_dur,
	                         embedding_scale=embedding_scale,
	                         features=ref_s, # reference from the same speaker as the embedding
	                         num_steps=diffusion_steps).squeeze(1)

	        if s_prev is not None:
	            # convex combination of previous and current style
	            s_pred = t * s_prev + (1 - t) * s_pred

	        s = s_pred[:, 128:]
	        ref = s_pred[:, :128]

	        # Blend each half of the sampled style with the reference style.
	        ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
	        s = beta * s + (1 - beta) * ref_s[:, 128:]

	        s_pred = torch.cat([ref, s], dim=-1)

	        d = model.predictor.text_encoder(d_en,
	                                         s, input_lengths, text_mask)

	        x, _ = model.predictor.lstm(d)
	        duration = model.predictor.duration_proj(x)

	        duration = torch.sigmoid(duration).sum(axis=-1)
	        # reshape(-1) instead of squeeze(): the result stays 1-D even when
	        # there is a single token, so per-token indexing below cannot fail.
	        pred_dur = torch.round(duration.reshape(-1)).clamp(min=1)
	        frames_per_token = [int(n) for n in pred_dur.tolist()]

	        # Hard alignment matrix: row i is 1 over the frames assigned to token i.
	        pred_aln_trg = torch.zeros(n_tokens, sum(frames_per_token))
	        c_frame = 0
	        for i, n_frames in enumerate(frames_per_token):
	            pred_aln_trg[i, c_frame:c_frame + n_frames] = 1
	            c_frame += n_frames
	        alignment = pred_aln_trg.unsqueeze(0).to(device)

	        is_hifigan = model_params.decoder.type == "hifigan"

	        # encode prosody
	        en = d.transpose(-1, -2) @ alignment
	        if is_hifigan:
	            en = _shift_right_one_frame(en)

	        F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

	        asr = t_en @ alignment
	        if is_hifigan:
	            asr = _shift_right_one_frame(asr)

	        out = model.decoder(asr,
	                            F0_pred, N_pred, ref.squeeze().unsqueeze(0))

	    # weird pulse at the end of the model, need to be fixed later
	    return out.squeeze().cpu().numpy()[..., :-100], s_pred

	# os.makedirs("outputs", exist_ok=True)

	# Reference recording of the speaker to imitate (an unseen speaker).
	path = "/workspace/StyleTTS2/audio_ref/sangnq.wav"
	s_ref = compute_style(path)

	# Naive sentence segmentation: cut the passage at every full stop.
	sentences = passage.split('.')
	wavs = []
	s_prev = None
	print("Synthesizing...")
	for text in sentences:
	    # Skip pieces that are empty or whitespace only.
	    if not text.strip():
	        continue
	    # Restore the full stop removed by the split.
	    text = text + '.'

	    # alpha=0 and beta=0 give the sampled style zero weight in LFinference's
	    # blending step, so both style halves come from the reference (s_ref).
	    wav, s_prev = LFinference(
	        text,
	        s_prev,
	        s_ref,
	        alpha=0,
	        beta=0,
	        t=0.7,
	        diffusion_steps=5,
	        embedding_scale=1,
	    )
	    wavs.append(wav)
	# Concatenate the per-sentence waveforms into a single clip.
	final_wav = np.concatenate(wavs)
	# splitext drops only the extension, so dots inside the file name survive.
	name_audio = os.path.splitext(os.path.basename(path))[0]

	# The result is written as an MP3 file; create the target folder first so
	# the save does not fail on a fresh checkout.
	out_path = f"./audio_clone/{name_audio}_clone.mp3"
	os.makedirs(os.path.dirname(out_path), exist_ok=True)

	# final_wav is a NumPy array: convert it to a tensor and add a leading
	# channel dimension, giving shape (1, time), before handing it to torchaudio.
	torchaudio.save(out_path, torch.from_numpy(final_wav).unsqueeze(0), 24000, format="mp3")

	print("Saved synthesized audio to:", out_path)
	print("Reference audio:", path)