import os import sys sys.path.append( os.path.abspath( os.path.join(os.path.dirname(__file__), "../bigvgan_v2_24khz_100band_256x/") ) ) import bigvgan import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from pydub import AudioSegment from tqdm import tqdm from config import config from .flow_matching import BASECFM from .utilities import denormalize_tacotron_mel, normalize_tacotron_mel def infer(model, timeshapes, code_embs, ref_mels, epoch=0): os.makedirs("Samples/" + config.model_name + "/S2A/", exist_ok=True) FM = BASECFM() device = next(model.parameters()).device hifi = bigvgan.BigVGAN.from_pretrained( "nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False ) hifi.remove_weight_norm() hifi = hifi.eval().to(device) audio_paths = [] mels = [] for n, (timeshape, code_emb, ref_mel) in enumerate( zip(timeshapes, code_embs, ref_mels) ): with torch.no_grad(): mel = FM( model, code_emb.unsqueeze(0).to(device), (1, 100, timeshape), ref_mel.unsqueeze(0).to(device), n_timesteps=20, temperature=1.0, ) mel = denormalize_tacotron_mel(mel) mels.append(mel) audio = hifi(mel) audio = audio.squeeze(0).detach().cpu() audio = audio * 32767.0 audio = audio.numpy().reshape(-1).astype(np.int16) audio_path = ( "../Samples/" + config.model_name + "/S2A/" + str(epoch) + "_" + str(n) + ".wav" ) AudioSegment( audio.tobytes(), frame_rate=24000, sample_width=audio.dtype.itemsize, channels=1, ).export(audio_path, format="wav") audio_paths.append(audio_path) return audio_paths, mels