Spaces:

FireRedTeam
/

FireRedTTS-1S

Sleeping

FireRedTTS-1S / fireredtts /modules /semantic_tokenizer /semantic_tokenizer.py

Shen Feiyu

add 1s

faadabf 5 months ago

27.7 kB

	from functools import reduce
	from tokenize import Triple
	from torch import nn
	from torch.autograd import Function
	from torch.nn import functional as F
	from torch.nn.utils import spectral_norm, weight_norm
	from torch.utils.checkpoint import checkpoint

	import einops
	import math
	import numpy as np
	import os
	import random
	import torch
	import torchaudio
	import typing as tp
	import warnings

	from .audio import TorchMelSpectrogram
	from .ecapa_tdnn import ECAPA_TDNN
	from .hubert import HuBERT
	from ..acoustic_codec.vector_quantization import VectorQuantization


	# Names of the normalization schemes accepted by the conv wrappers in this module.
	CONV_NORMALIZATIONS = frozenset(
	    {
	        "none",
	        "weight_norm",
	        "spectral_norm",
	        "time_layer_norm",
	        "layer_norm",
	        "time_group_norm",
	    }
	)
	# Normalization name passed as `norm=` by the residual/encoder/decoder blocks of this file.
	NORM = "weight_norm"


	def get_mask_from_lengths(lengths, max_len=None):
	    """Return a boolean mask that is True at padded positions.

	    Args:
	        lengths: 1-D tensor of per-sequence lengths.
	        max_len: Width of the mask; defaults to the largest entry of `lengths`.

	    Returns:
	        Bool tensor of shape (len(lengths), max_len) where entry [b, t] is True
	        when `t >= lengths[b]`.
	    """
	    if max_len is None:
	        max_len = torch.max(lengths).item()
	    positions = torch.arange(0, max_len).to(lengths.device)
	    # Compare every position against each sequence length (broadcast over rows).
	    is_valid = (positions < lengths.unsqueeze(1)).bool()
	    return ~is_valid


	class ConvLayerNorm(nn.LayerNorm):
	    """LayerNorm for convolution-style tensors whose last axis is time.

	    The time axis is moved right after the batch axis before normalizing, so
	    the normalization runs over the remaining (channel) axes, and is moved
	    back to the end afterwards.
	    """

	    def __init__(
	        self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs
	    ):
	        super().__init__(normalized_shape, **kwargs)

	    def forward(self, x):
	        """Normalize `x` of layout (batch, ..., time) and return the same layout."""
	        # (b, ..., t) -> (b, t, ...): equivalent to the einops pattern "b ... t -> b t ...".
	        x = x.movedim(-1, 1)
	        x = super().forward(x)
	        # (b, t, ...) -> (b, ..., t)
	        x = x.movedim(1, -1)
	        # The normalized tensor must be returned; a bare `return` hands None
	        # to whatever layer consumes this module's output.
	        return x


	def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
	    """Wrap `module` with the weight re-parametrization selected by `norm`.

	    Args:
	        module: Module to wrap.
	        norm: One of `CONV_NORMALIZATIONS`.

	    Returns:
	        `weight_norm(module)` or `spectral_norm(module)` for those two names,
	        otherwise `module` itself.

	    Raises:
	        AssertionError: If `norm` is not in `CONV_NORMALIZATIONS`.
	    """
	    assert norm in CONV_NORMALIZATIONS
	    reparametrizations = {
	        "weight_norm": weight_norm,
	        "spectral_norm": spectral_norm,
	    }
	    wrap = reparametrizations.get(norm)
	    if wrap is None:
	        # Membership was already checked above, so every remaining choice
	        # needs no re-parametrization.
	        return module
	    return wrap(module)


	def get_norm_module(
	    module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs
	) -> nn.Module:
	    """Build the normalization layer to apply after `module`.

	    Args:
	        module: Convolution whose `out_channels` sizes the norm layer.
	        causal: Whether the surrounding convolution is causal.
	        norm: One of `CONV_NORMALIZATIONS`.
	        **norm_kwargs: Forwarded to the norm layer constructor.

	    Returns:
	        `ConvLayerNorm` for "layer_norm", a single-group `nn.GroupNorm` for
	        "time_group_norm", and `nn.Identity` for every other accepted name.

	    Raises:
	        AssertionError: If `norm` is unknown, or `module` is not a conv layer
	            for the two norms that need `out_channels`.
	        ValueError: If "time_group_norm" is requested together with `causal`.
	    """
	    assert norm in CONV_NORMALIZATIONS
	    if norm not in ("layer_norm", "time_group_norm"):
	        return nn.Identity()

	    if norm == "layer_norm":
	        assert isinstance(module, nn.modules.conv._ConvNd)
	        return ConvLayerNorm(module.out_channels, **norm_kwargs)

	    # norm == "time_group_norm"
	    if causal:
	        raise ValueError("GroupNorm doesn't support causal evaluation.")
	    assert isinstance(module, nn.modules.conv._ConvNd)
	    return nn.GroupNorm(1, module.out_channels, **norm_kwargs)


	def get_extra_padding_for_conv1d(
	    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
	) -> int:
	    """Return how many extra samples make the last conv window complete.

	    Args:
	        x: Input whose last axis is the one being convolved.
	        kernel_size: (Effective) kernel size of the convolution.
	        stride: Stride of the convolution.
	        padding_total: Fixed padding that will be added besides the extra one.

	    Returns:
	        Difference between the length needed for a whole number of frames and
	        the current length of `x`.
	    """
	    current_length = x.shape[-1]
	    # Fractional frame count; rounding it up gives the number of frames we want.
	    frame_count = math.ceil((current_length - kernel_size + padding_total) / stride + 1)
	    required_length = (frame_count - 1) * stride + (kernel_size - padding_total)
	    return required_length - current_length


	def pad_for_conv1d(
	    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
	):
	    """Right-pad `x` so a convolution with these settings ends on a full window.

	    The amount comes from `get_extra_padding_for_conv1d`; padding is added
	    only at the end of the last axis.
	    """
	    right_pad = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
	    padded = F.pad(x, (0, right_pad))
	    return padded


	def pad1d(
	    x: torch.Tensor,
	    paddings: tp.Tuple[int, int],
	    mode: str = "zero",
	    value: float = 0.0,
	):
	    """Pad the last axis of `x`, tolerating reflect padding on short inputs.

	    Args:
	        x: Tensor to pad along its last axis.
	        paddings: `(left, right)` amounts, both non-negative.
	        mode: Padding mode forwarded to `F.pad`. The legacy name "zero" (the
	            default) is translated to "constant".
	        value: Fill value forwarded to `F.pad`.

	    Returns:
	        The padded tensor; its last axis grows by `left + right`.

	    Raises:
	        AssertionError: If either padding amount is negative.
	    """
	    length = x.shape[-1]
	    padding_left, padding_right = paddings
	    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
	    if mode == "zero":
	        # `F.pad` has no "zero" mode, so the default call would raise;
	        # constant padding with `value` (0.0 by default) is what it means.
	        mode = "constant"
	    if mode == "reflect":
	        # Reflect padding needs the input to be longer than the pad amount, so
	        # temporarily extend short inputs with zeros and trim them afterwards.
	        max_pad = max(padding_left, padding_right)
	        extra_pad = 0
	        if length <= max_pad:
	            extra_pad = max_pad - length + 1
	            x = F.pad(x, (0, extra_pad))
	        padded = F.pad(x, paddings, mode, value)
	        end = padded.shape[-1] - extra_pad
	        return padded[..., :end]
	    return F.pad(x, paddings, mode, value)


	def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
	    """Strip `(left, right)` samples from the two ends of the last axis of `x`.

	    Raises:
	        AssertionError: If an amount is negative or the two amounts together
	            exceed the length of the last axis.
	    """
	    left, right = paddings
	    assert left >= 0 and right >= 0, (left, right)
	    total = x.shape[-1]
	    assert (left + right) <= total
	    return x[..., left : total - right]


	class NormConv1d(nn.Module):
	    """`nn.Conv1d` combined with the normalization selected by `norm`.

	    Positional and remaining keyword arguments are forwarded to `nn.Conv1d`.

	    Args:
	        causal: Forwarded to `get_norm_module`.
	        norm: Normalization name handled by `apply_parametrization_norm`
	            and `get_norm_module`.
	        norm_kwargs: Extra keyword arguments for the norm layer (only read).
	    """

	    def __init__(
	        self,
	        *args,
	        causal: bool = False,
	        norm: str = "none",
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	        **kwargs,
	    ):
	        super().__init__()
	        # Unpack so that each positional/keyword argument reaches nn.Conv1d
	        # individually instead of as one tuple plus the dict's keys.
	        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
	        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
	        self.norm_type = norm

	    def forward(self, x):
	        """Apply the convolution, then the norm layer."""
	        x = self.conv(x)
	        x = self.norm(x)
	        return x


	class NormConv2d(nn.Module):
	    """`nn.Conv2d` combined with the normalization selected by `norm`.

	    Positional and remaining keyword arguments are forwarded to `nn.Conv2d`.

	    Args:
	        norm: Normalization name handled by `apply_parametrization_norm`
	            and `get_norm_module`.
	        norm_kwargs: Extra keyword arguments for the norm layer (only read).
	    """

	    def __init__(
	        self,
	        *args,
	        norm: str = "none",
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	        **kwargs,
	    ):
	        super().__init__()
	        # Unpack so that each positional/keyword argument reaches nn.Conv2d
	        # individually instead of as one tuple plus the dict's keys.
	        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
	        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
	        self.norm_type = norm

	    def forward(self, x):
	        """Apply the convolution, then the norm layer."""
	        x = self.conv(x)
	        x = self.norm(x)
	        return x


	class NormConvTranspose1d(nn.Module):
	    """`nn.ConvTranspose1d` combined with the normalization selected by `norm`.

	    Positional and remaining keyword arguments are forwarded to
	    `nn.ConvTranspose1d`.

	    Args:
	        causal: Forwarded to `get_norm_module`.
	        norm: Normalization name handled by `apply_parametrization_norm`
	            and `get_norm_module`.
	        norm_kwargs: Extra keyword arguments for the norm layer (only read).
	    """

	    def __init__(
	        self,
	        *args,
	        causal: bool = False,
	        norm: str = "none",
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	        **kwargs,
	    ):
	        super().__init__()
	        # Unpack so that each positional/keyword argument reaches
	        # nn.ConvTranspose1d individually.
	        self.convtr = apply_parametrization_norm(
	            nn.ConvTranspose1d(*args, **kwargs), norm
	        )
	        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
	        self.norm_type = norm

	    def forward(self, x):
	        """Apply the transposed convolution, then the norm layer."""
	        x = self.convtr(x)
	        x = self.norm(x)
	        return x


	class NormConvTranspose2d(nn.Module):
	    """`nn.ConvTranspose2d` combined with the normalization selected by `norm`.

	    Positional and remaining keyword arguments are forwarded to
	    `nn.ConvTranspose2d`.

	    Args:
	        norm: Normalization name handled by `apply_parametrization_norm`
	            and `get_norm_module`.
	        norm_kwargs: Extra keyword arguments for the norm layer (only read).
	    """

	    def __init__(
	        self,
	        *args,
	        norm: str = "none",
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	        **kwargs,
	    ):
	        super().__init__()
	        # Unpack so that each positional/keyword argument reaches
	        # nn.ConvTranspose2d individually.
	        self.convtr = apply_parametrization_norm(
	            nn.ConvTranspose2d(*args, **kwargs), norm
	        )
	        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
	        # Recorded for parity with the other Norm* wrappers in this file.
	        self.norm_type = norm

	    def forward(self, x):
	        """Apply the transposed convolution, then the norm layer."""
	        x = self.convtr(x)
	        x = self.norm(x)
	        return x


	class SConv1d(nn.Module):
	    """1-D convolution that pads its own input (causal or asymmetric).

	    The padding is derived from the effective (dilated) kernel size and the
	    stride, plus whatever extra right padding completes the last window.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        out_channels: int,
	        kernel_size: int,
	        stride: int = 1,
	        dilation: int = 1,
	        groups: int = 1,
	        bias: bool = True,
	        causal: bool = False,
	        norm: str = "weight_norm",
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	        pad_mode: str = "reflect",
	    ):
	        super().__init__()
	        # Striding and dilating at once is unusual enough to deserve a warning.
	        if stride > 1 and dilation > 1:
	            warnings.warn(
	                "SConv1d has been initialized with stride > 1 and dilation > 1"
	                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
	            )
	        conv_options = dict(
	            dilation=dilation,
	            groups=groups,
	            bias=bias,
	            causal=causal,
	            norm=norm,
	            norm_kwargs=norm_kwargs,
	        )
	        self.conv = NormConv1d(
	            in_channels, out_channels, kernel_size, stride, **conv_options
	        )
	        self.causal = causal
	        self.pad_mode = pad_mode

	    def forward(self, x):
	        """Pad `x` along its last axis and run the wrapped convolution."""
	        # Unpacking three values also rejects inputs that are not 3-D.
	        batch, channels, steps = x.shape
	        inner = self.conv.conv
	        stride = inner.stride[0]
	        # Effective kernel size once dilation is taken into account.
	        effective_kernel = (inner.kernel_size[0] - 1) * inner.dilation[0] + 1
	        padding_total = effective_kernel - stride
	        extra_padding = get_extra_padding_for_conv1d(
	            x, effective_kernel, stride, padding_total
	        )
	        if self.causal:
	            # Causal: all of the fixed padding goes on the left.
	            paddings = (padding_total, extra_padding)
	        else:
	            # Non-causal: split the fixed padding, the left side taking the
	            # larger half when the total is odd.
	            right = padding_total // 2
	            paddings = (padding_total - right, right + extra_padding)
	        padded = pad1d(x, paddings, mode=self.pad_mode)
	        return self.conv(padded)


	class SConvTranspose1d(nn.Module):
	    """1-D transposed convolution that trims the fixed padding from its output.

	    Args:
	        trim_right_ratio: Share of the fixed padding removed on the right in
	            causal mode; must stay 1.0 for non-causal use and lie in [0, 1].
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        out_channels: int,
	        kernel_size: int,
	        stride: int = 1,
	        causal: bool = False,
	        norm: str = "weight_norm",
	        trim_right_ratio: float = 1.0,
	        norm_kwargs: tp.Dict[str, tp.Any] = {},
	    ):
	        super().__init__()
	        norm_options = dict(causal=causal, norm=norm, norm_kwargs=norm_kwargs)
	        self.convtr = NormConvTranspose1d(
	            in_channels, out_channels, kernel_size, stride, **norm_options
	        )
	        self.causal = causal
	        self.trim_right_ratio = trim_right_ratio
	        assert (
	            self.causal or self.trim_right_ratio == 1.0
	        ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
	        assert 0.0 <= self.trim_right_ratio <= 1.0

	    def forward(self, x):
	        """Run the transposed convolution and remove the fixed padding."""
	        inner = self.convtr.convtr
	        padding_total = inner.kernel_size[0] - inner.stride[0]

	        y = self.convtr(x)

	        # Only the fixed padding is trimmed here. Extra padding added by
	        # `pad_for_conv1d` is left in place: removing it would need the input
	        # length of the matching encoder layer, so it is dropped at the very
	        # end when the output is cut to the right length.
	        if self.causal:
	            # Share of the padding removed on the right; 1.0 trims it all there.
	            padding_right = math.ceil(padding_total * self.trim_right_ratio)
	        else:
	            # Non-causal: split the padding, the left side taking the larger
	            # half when the total is odd.
	            padding_right = padding_total // 2
	        padding_left = padding_total - padding_right
	        return unpad1d(y, (padding_left, padding_right))


	class SLSTM(nn.Module):
	    """LSTM over tensors whose last axis is time, with an optional skip connection.

	    In bidirectional mode each direction gets `dimension // 2` hidden units;
	    otherwise the hidden size equals `dimension`.
	    """

	    def __init__(
	        self,
	        dimension: int,
	        num_layers: int = 2,
	        bidirectional: bool = False,
	        skip: bool = True,
	    ):
	        super().__init__()
	        self.bidirectional = bidirectional
	        self.skip = skip
	        if bidirectional:
	            hidden_size = dimension // 2
	            direction_options = {"bidirectional": bidirectional}
	        else:
	            hidden_size = dimension
	            direction_options = {}
	        self.lstm = nn.LSTM(dimension, hidden_size, num_layers, **direction_options)

	    def forward(self, x):
	        """Run the LSTM on `x` and return a tensor in the input's axis order."""
	        # Move the last axis to the front for the (non batch-first) LSTM.
	        sequence = x.permute(2, 0, 1)
	        output, _ = self.lstm(sequence)
	        if self.skip:
	            output = output + sequence
	        # Undo the permutation applied above.
	        return output.permute(1, 2, 0)


	class Swish(nn.Module):
	    """Swish activation: the input multiplied by its own sigmoid."""

	    def forward(self, x):
	        gate = torch.sigmoid(x)
	        return x * gate


	class ResidualUnit(nn.Module):
	    """Two `SConv1d` layers with a Swish in between, added back onto the input.

	    The first convolution maps to `out_channels // 2` channels and the second
	    back to `out_channels`. The residual sum in `forward` adds the result to
	    the unmodified input.
	    """

	    def __init__(self, in_channels, out_channels, kernel_size=3, groups=1):
	        super().__init__()

	        bottleneck = out_channels // 2
	        shared = dict(kernel_size=kernel_size, groups=groups, norm=NORM)
	        squeeze = SConv1d(in_channels=in_channels, out_channels=bottleneck, **shared)
	        expand = SConv1d(in_channels=bottleneck, out_channels=out_channels, **shared)
	        self.layers = nn.Sequential(squeeze, Swish(), expand)

	    def forward(self, x):
	        """Return `x` plus the output of the two-convolution branch."""
	        branch = self.layers(x)
	        return x + branch


	class EncoderBlock(nn.Module):
	    """Two residual units followed by a strided `SConv1d` (kernel `2 * stride`).

	    The channel count stays at `out_channels` throughout the block.
	    """

	    def __init__(self, out_channels, stride):
	        super().__init__()

	        stages = []
	        for _ in range(2):
	            stages.append(
	                ResidualUnit(in_channels=out_channels, out_channels=out_channels)
	            )
	            stages.append(Swish())
	        stages.append(
	            SConv1d(
	                in_channels=out_channels,
	                out_channels=out_channels,
	                kernel_size=2 * stride,
	                stride=stride,
	                norm=NORM,
	            )
	        )
	        self.layers = nn.Sequential(*stages)

	    def forward(self, x):
	        """Apply the residual units and the strided convolution in order."""
	        return self.layers(x)


	class DecoderBlock(nn.Module):
	    """Strided `SConvTranspose1d` (kernel `2 * stride`) followed by two residual units.

	    The channel count stays at `in_channels` throughout the block.
	    """

	    def __init__(self, in_channels, stride):
	        super().__init__()
	        out_channels = in_channels
	        upsample = SConvTranspose1d(
	            in_channels=in_channels,
	            out_channels=out_channels,
	            kernel_size=2 * stride,
	            stride=stride,
	            norm=NORM,
	        )
	        stages = [upsample]
	        for _ in range(2):
	            stages.append(Swish())
	            stages.append(
	                ResidualUnit(in_channels=out_channels, out_channels=out_channels)
	            )
	        self.layers = nn.Sequential(*stages)

	    def forward(self, x):
	        """Apply the transposed convolution and the residual units in order."""
	        return self.layers(x)


	class Encoder(nn.Module):
	    """Convolutional encoder: input conv, one `EncoderBlock` per stride, conv + LSTM.

	    Args:
	        C: Number of input channels.
	        D: Number of channels used by every layer after the first one.
	        strides: Stride of each `EncoderBlock`; their product is stored as
	            `downsample_scale`.
	        checkpointing: Run the layer stack through `torch.utils.checkpoint`.
	    """

	    def __init__(self, C, D, strides=[2, 2], checkpointing=True):
	        super().__init__()
	        self.checkpointing = checkpointing

	        # Last element of the cumulative product == product of all strides.
	        self.downsample_scale = np.cumprod(np.asarray(strides))[-1]

	        stack = [
	            SConv1d(in_channels=C, out_channels=D, kernel_size=3, norm=NORM),
	            Swish(),
	        ]
	        for stride in strides:
	            stack.extend([EncoderBlock(out_channels=D, stride=stride), Swish()])
	        stack.extend(
	            [
	                SConv1d(in_channels=D, out_channels=D, kernel_size=3, norm=NORM),
	                SLSTM(D, num_layers=1, bidirectional=True),
	            ]
	        )
	        self.layers = nn.Sequential(*stack)

	    def forward(self, x):
	        """Encode `x`; axes 1 and 2 are swapped before and after the layer stack."""
	        features = x.transpose(1, 2)
	        if self.checkpointing:
	            encoded = checkpoint(self.layers, features, use_reentrant=False)
	        else:
	            encoded = self.layers(features)
	        return encoded.transpose(1, 2)


	class Decoder(nn.Module):
	    """Convolutional decoder: conv + LSTM, one `DecoderBlock` per stride, output conv.

	    Args:
	        C: Number of output channels.
	        D: Number of input channels.
	        H: Number of hidden channels used between input and output layers.
	        strides: Stride of each `DecoderBlock`.
	        checkpointing: Run the forward pass through `torch.utils.checkpoint`.
	    """

	    def __init__(self, C, D, H, strides=[2, 2], checkpointing=True):
	        super().__init__()
	        self.checkpointing = checkpointing

	        self.in_layer = nn.Sequential(
	            SConv1d(in_channels=D, out_channels=H, kernel_size=3, norm=NORM),
	            SLSTM(H, num_layers=1, bidirectional=True),
	        )
	        self.layers = nn.ModuleList()
	        for stride in strides:
	            self.layers.append(
	                nn.Sequential(DecoderBlock(in_channels=H, stride=stride), Swish())
	            )
	        self.out_layer = SConv1d(
	            in_channels=H, out_channels=C, kernel_size=3, norm=NORM
	        )

	    def forward(self, x, g=None):
	        """Decode `x`, optionally conditioned on `g`.

	        Args:
	            x: Input features; axes 1 and 2 are swapped before the first layer.
	            g: Optional conditioning tensor added to the hidden state before
	                every decoder block (presumably (batch, H) - TODO confirm).
	                When None, no conditioning is added.

	        Returns:
	            Tuple `(y, h)`: the output of the final conv and the last hidden
	            state, both with axes 1 and 2 swapped back.
	        """
	        if self.checkpointing:
	            y = checkpoint(self._forward, x, g, use_reentrant=False)
	        else:
	            y = self._forward(x, g)
	        return y

	    def _forward(self, x, g=None):
	        """Un-checkpointed implementation of `forward`."""
	        h = self.in_layer(x.transpose(1, 2))

	        # `g` defaults to None, so only build the conditioning term when it
	        # is given; the trailing singleton axis broadcasts over the last axis
	        # of `h`, whose length changes after every block.
	        cond = None if g is None else g.unsqueeze(-1)
	        for layer in self.layers:
	            if cond is not None:
	                h = h + cond
	            h = layer(h)

	        y = self.out_layer(h)

	        return y.transpose(1, 2), h.transpose(1, 2)


	class TimeRegulator(nn.Module):
	    """Changes the resolution of axis 1 by an integer `scale`.

	    Downsampling averages groups of `scale` steps; upsampling repeats every
	    step `scale` times.

	    NOTE(review): with `learnable=True`, `upsample` reads `self.upsampler`,
	    which this class never creates - confirm where it is meant to come from.
	    `in_dim` is accepted but not used here.
	    """

	    def __init__(self, in_dim, scale, learnable=False):
	        super().__init__()
	        self.scale = scale
	        self.learnable = learnable

	    def forward(self, x, x_len, downsample=True):
	        """Dispatch to `downsample` (returns `(x, x_len)`) or `upsample` (returns `x`)."""
	        if downsample:
	            return self.downsample(x, x_len)
	        return self.upsample(x, x_len)

	    def downsample(self, x, x_len):
	        """Average-pool axis 1 of `x` by `scale` and return it with the rounded-up lengths."""
	        # avg_pool1d pools the last axis, hence the transposes around it;
	        # ceil_mode keeps a trailing partial window.
	        pooled = F.avg_pool1d(
	            x.transpose(1, 2), self.scale, stride=self.scale, ceil_mode=True
	        )
	        pooled_len = torch.ceil(x_len / self.scale)
	        return pooled.transpose(1, 2), pooled_len

	    def upsample(self, x, x_len):
	        """Expand axis 1 of `x` by `scale`; `x_len` is not used."""
	        if self.learnable:
	            return self.upsampler(x.transpose(1, 2)).transpose(1, 2)
	        return torch.repeat_interleave(x, self.scale, dim=1)


	class TreeVectorQuantization(nn.Module):
	    """Residual quantizer with one `VectorQuantization` + `TimeRegulator` per level.

	    Each entry of `tree_config` describes one level. Keys read here:
	    "downsample_rate" (required), "n_groups" (default 1),
	    "dropout_rate_per_group" (default 0), "ordered" (default False) and
	    "learnable_time_regulator" (default False).

	    Args:
	        in_dim: Feature size passed to every quantizer and time regulator.
	        vq_class: Accepted for configuration compatibility; not used here.
	        vq_config: Keyword arguments shared by all quantizers. Must contain
	            "codebook_size" and "codebook_dim".
	        tree_config: Iterable of per-level config dicts.
	    """

	    def __init__(
	        self,
	        in_dim,
	        vq_class="VectorQuantization",
	        vq_config={},
	        tree_config={},
	    ):
	        super().__init__()
	        self.vq_config = vq_config
	        self.tree_config = tree_config

	        self.quantizers = nn.ModuleList()
	        self.time_regulators = nn.ModuleList()
	        for config in self.tree_config:
	            # One default for "n_groups" used both for the codebook lists
	            # and for the quantizer argument, so a level that omits the key
	            # behaves as a single group instead of raising KeyError.
	            n_groups = config.get("n_groups", 1)

	            vq_config = self.vq_config.copy()
	            if not isinstance(vq_config["codebook_size"], (tuple, list)):
	                vq_config["codebook_size"] = [vq_config["codebook_size"]]
	                vq_config["codebook_dim"] = [vq_config["codebook_dim"]]
	            # Repeat the per-group settings once per group.
	            vq_config["codebook_size"] = vq_config["codebook_size"] * n_groups
	            vq_config["codebook_dim"] = vq_config["codebook_dim"] * n_groups
	            self.quantizers.append(
	                VectorQuantization(
	                    in_dim,
	                    n_groups=n_groups,
	                    dropout_rate_per_group=config.get("dropout_rate_per_group", 0),
	                    ordered=config.get("ordered", False),
	                    **vq_config,
	                )
	            )
	            self.time_regulators.append(
	                TimeRegulator(
	                    in_dim,
	                    config["downsample_rate"],
	                    config.get("learnable_time_regulator", False),
	                )
	            )

	    def forward(
	        self, inp, inp_len, enable_vq=True, update_codebook=True, return_pre_quant=False
	    ):
	        """Quantize `inp` and average the per-level losses.

	        Returns:
	            `(output, (quants, loss, embed_inds))` where `output` is what
	            `quantize` returns first and `loss` is the mean of the level losses.
	        """
	        output, (quants, losses, embed_inds) = self.quantize(
	            inp,
	            inp_len,
	            enable_vq=enable_vq,
	            update_codebook=update_codebook,
	            return_pre_quant=return_pre_quant,
	        )
	        loss = sum(losses) / len(losses)
	        return output, (quants, loss, embed_inds)

	    def quantize(
	        self, inp, inp_len, enable_vq=True, update_codebook=True, return_pre_quant=False
	    ):
	        """Run every level on the residual left by the previous ones.

	        Returns:
	            `(quant_output, (quants, losses, embed_inds))`, or with
	            `return_pre_quant` `((pre_quant_output, quant_output), (...))`.
	            The three lists hold one entry per level.
	        """
	        quants, losses, embed_inds = [], [], []

	        pre_quant_output, quant_output, residual = 0, 0, inp
	        for _, quantizer, regulator in zip(
	            self.tree_config, self.quantizers, self.time_regulators
	        ):
	            # Downsample the current residual to this level's rate.
	            x, x_len = regulator(residual, inp_len, True)

	            # Quantization
	            q, diff, embed_ind = quantizer(
	                x,
	                x_len,
	                enable_vq=enable_vq,
	                update_codebook=update_codebook,
	                return_pre_quant=return_pre_quant,
	            )
	            if return_pre_quant:
	                # The quantizer then returns a (pre-quant, quant) pair.
	                pq, q = q

	            # Upsample back and crop to the residual's length along axis 1
	            # (ceil-mode pooling can make the upsampled signal longer).
	            x = regulator(q, x_len, False)[:, : residual.shape[1]]

	            residual = residual - x
	            quant_output = quant_output + x

	            if return_pre_quant:
	                pq = regulator(pq, x_len, False)[:, : residual.shape[1]]
	                pre_quant_output = pre_quant_output + pq

	            quants.append(q)
	            losses.append(diff)
	            embed_inds.append(embed_ind)

	        if return_pre_quant:
	            return (pre_quant_output, quant_output), (quants, losses, embed_inds)
	        return quant_output, (quants, losses, embed_inds)

	    def decode(self, seqs, seq_lens=None):
	        """Turn token sequences back into the summed quantized features.

	        Args:
	            seqs: Either a list/tuple with one token tensor per level, or a
	                single serialized tensor that is first passed to `deserialize`.
	            seq_lens: Lengths matching `seqs`.

	        Returns:
	            `(quant_output, token_lens)`.
	        """
	        if not isinstance(seqs, (tuple, list)):
	            tokens, token_lens = self.deserialize(seqs, seq_lens)
	        else:
	            tokens, token_lens = seqs, seq_lens

	        quant_output = 0
	        for token, quantizer, regulator in zip(
	            tokens, self.quantizers, self.time_regulators
	        ):
	            x = quantizer.decode(token).transpose(1, 2)
	            x = regulator(x, None, False)
	            # From the second level on, crop to the length accumulated so far.
	            if torch.is_tensor(quant_output):
	                x = x[:, : quant_output.size(1)]
	            quant_output = quant_output + x

	        return quant_output, token_lens

	    def serialize(self, tokens, token_lens):
	        """Interleave the token streams of one or two levels into one sequence.

	        Args:
	            tokens: List with one token tensor per level (at most two).
	            token_lens: Tensor of lengths; it is rounded up to a multiple of
	                the first level's downsample rate before use.

	        Returns:
	            `(seq_cat, seq_cat_lens)`: the flattened sequence and its lengths.
	        """
	        assert len(tokens) <= 2, "we only support 1 or 2-scale sequences now..."

	        scale = self.tree_config[0]["downsample_rate"]
	        token_lens = ((token_lens.float() / scale).ceil() * scale).int()

	        seq1 = tokens[0].unsqueeze(-1)

	        if len(tokens) == 1:
	            seq_cat = seq1.view(seq1.shape[0], -1)
	            seq_cat_lens = (token_lens / scale * seq1.shape[2]).int()
	        elif len(tokens) == 2:
	            seq2 = F.pad(
	                tokens[1], (0, token_lens.max() - tokens[1].size(1)), "replicate"
	            )
	            # Group every `scale` consecutive second-level tokens so each
	            # group lines up with one first-level token.
	            seq2 = torch.stack([seq2[:, i::scale] for i in range(scale)], dim=-1)
	            seq_cat = torch.cat((seq1, seq2), dim=-1).view(seq1.shape[0], -1)
	            seq_cat_lens = (token_lens / scale + token_lens).int()

	        return seq_cat, seq_cat_lens

	    def deserialize(self, seqs, seq_lens):
	        """Split a serialized sequence back into per-level token tensors.

	        With a single level the input is returned unchanged (wrapped in a
	        list). Otherwise the sequence is cut to a whole number of groups of
	        `sum(downsample rates)` tokens and de-interleaved.

	        Returns:
	            `([seq1, seq2], token_lens)` for the multi-level case.
	        """
	        if len(self.tree_config) == 1:
	            return [seqs], seq_lens

	        max_scale = max(config["downsample_rate"] for config in self.tree_config)
	        total_scale = sum(config["downsample_rate"] for config in self.tree_config)

	        # Cut for aligning: keep only whole groups of `total_scale` tokens.
	        if seq_lens is None:
	            seq_lens = torch.full([seqs.shape[0]], seqs.shape[1]).to(seqs.device)
	        seq_lens = (seq_lens / total_scale).int() * total_scale
	        token_lens = (seq_lens / total_scale).int() * max_scale
	        seqs = seqs[:, : seq_lens.max()]

	        # Separate: slot 0 of every group is the first stream, the rest the second.
	        tokens = torch.stack(
	            [seqs[:, i::total_scale] for i in range(total_scale)], dim=-1
	        )
	        seq1 = tokens[..., 0]
	        seq2 = tokens[..., 1:].contiguous().view(tokens.shape[0], -1)

	        return [seq1, seq2], token_lens


	class SemanticVQVAE(nn.Module):
	    """Semantic tokenizer: SSL features -> encoder -> tree quantizer, plus a speaker encoder.

	    Args:
	        in_dim: Channel count of the SSL features fed to the encoder.
	        out_dim: Input feature size of the speaker encoder (fed with mels).
	        n_model_size: Hidden size of encoder, quantizer and speaker embedding.
	        downsample_scales: Strides of the encoder blocks.
	        upsample_scales, ssl_config, online_extraction: Accepted for
	            configuration compatibility; not used in this class.
	        mel_config: Keyword arguments for `TorchMelSpectrogram`.
	        vq_class, vq_config, tree_config: Forwarded to `TreeVectorQuantization`.
	        checkpointing: Forwarded to the encoder.
	        dual_decoding: Passed to the quantizer as `return_pre_quant`.
	        n_samples_per_token: Waveforms are zero-padded to a multiple of this.
	        ssl_extractor: Optional module called as `ssl_extractor(wav, wav_length)`
	            when `forward` is not given precomputed `ssl` features. When None,
	            the attribute is left unset and must be attached by the caller
	            before `forward` needs it.
	    """

	    def __init__(
	        self,
	        in_dim,
	        out_dim,
	        n_model_size,
	        downsample_scales=[1, 2],
	        upsample_scales=[[2, 1], [2, 1]],
	        mel_config={},
	        ssl_config={},
	        # Quantization
	        vq_class="VectorQuantization",
	        vq_config={},
	        tree_config={},
	        # Training
	        checkpointing=True,
	        dual_decoding=False,
	        n_samples_per_token=640,
	        online_extraction=True,
	        ssl_extractor=None,
	    ):
	        super(SemanticVQVAE, self).__init__()
	        self.in_dim = in_dim
	        self.n_model_size = n_model_size
	        self.mel_config = mel_config
	        self.dual_decoding = dual_decoding
	        self.vq_config = vq_config
	        self.tree_config = tree_config
	        self.output_feature = "mel"
	        self.n_samples_per_token = n_samples_per_token
	        self.checkpointing = checkpointing

	        self.mel_spectrogram = TorchMelSpectrogram(**mel_config)

	        # `forward` reads `self.ssl_extractor`, so keep the extractor when one
	        # is supplied. With the default None nothing is attached, which leaves
	        # the module's attributes and state dict exactly as before.
	        if ssl_extractor is not None:
	            self.ssl_extractor = ssl_extractor

	        # Speaker encoder
	        self.speaker_encoder = ECAPA_TDNN(
	            out_dim,
	            n_model_size,
	            channels=[512, 512, 512, 512, 1536],
	            kernel_sizes=[5, 3, 3, 3, 1],
	            dilations=[1, 2, 3, 4, 1],
	            attention_channels=128,
	            res2net_scale=4,
	            se_channels=128,
	            global_context=True,
	            batch_norm=True,
	        )

	        # Encoder
	        self.encoder = Encoder(
	            in_dim, n_model_size, downsample_scales, checkpointing=checkpointing
	        )

	        # Quantization
	        self.quantizer = TreeVectorQuantization(
	            n_model_size,
	            vq_class=vq_class,
	            vq_config=vq_config,
	            tree_config=tree_config,
	        )

	    def forward(
	        self,
	        wav,
	        wav_length,
	        enable_vq=True,
	        decode=True,
	        extract_spk=True,
	        shuffle=False,
	        **kwargs,
	    ):
	        """Extract features from `wav`, encode and quantize them.

	        Args:
	            wav: Waveform batch; axis 1 is padded to a multiple of
	                `n_samples_per_token` (presumably (batch, samples) - TODO confirm).
	            wav_length: Per-item lengths. The caller's tensor is not modified.
	            enable_vq: Forwarded to the quantizer.
	            decode, shuffle: Accepted but not used by this method.
	            extract_spk: Also compute the speaker embedding from the mel.
	            **kwargs: Optional precomputed "mel"/"mel_length" and
	                "ssl"/"ssl_length"; missing ones are computed from `wav`.

	        Returns:
	            Dict with "mel", "mel_length", "ssl", "ssl_length", "quants",
	            "token", "token_length", "encoder_diffs" and, when `extract_spk`
	            is set, "spk".
	        """
	        output_dict = {}

	        with torch.no_grad():
	            # Pad waveform to a whole number of tokens.
	            remainder = wav.shape[1] % self.n_samples_per_token
	            if remainder > 0:
	                pad_size = self.n_samples_per_token - remainder
	                wav = F.pad(wav, (0, pad_size), value=0)
	                # Out-of-place add: `+=` would silently change the tensor
	                # the caller passed in.
	                wav_length = wav_length + pad_size

	            # Extract mel & ssl
	            mel, mel_length = kwargs.get("mel", None), kwargs.get("mel_length", None)
	            if mel is None:
	                mel, mel_length = self.mel_spectrogram(wav, wav_length)
	            output_dict.update({"mel": mel, "mel_length": mel_length})

	            ssl, ssl_length = kwargs.get("ssl", None), kwargs.get("ssl_length", None)
	            if ssl is None:
	                ssl, ssl_length = self.ssl_extractor(wav, wav_length)
	            output_dict.update({"ssl": ssl.float(), "ssl_length": ssl_length})

	        # SSL features feed the encoder; the mel feeds the speaker encoder.
	        encoder_input, encoder_input_length = ssl, ssl_length

	        encoder_outputs = self.encoder(encoder_input)
	        quant_length = torch.ceil(encoder_input_length / self.encoder.downsample_scale)
	        quant_length = quant_length.clamp(max=encoder_outputs.shape[1])

	        _, (quants, diff, embed_ind) = self.quantizer(
	            encoder_outputs,
	            quant_length,
	            enable_vq=enable_vq,
	            update_codebook=True,
	            return_pre_quant=self.dual_decoding,
	        )

	        output_dict.update(
	            {
	                "quants": quants,
	                "token": embed_ind,
	                "token_length": quant_length.int(),
	                "encoder_diffs": diff,
	            }
	        )

	        # Speaker
	        if extract_spk:
	            output_dict["spk"] = self.speaker_encoder(mel, mel_length)

	        return output_dict

	    @torch.no_grad()
	    def extract_speech_tokens(
	        self, wav, wav_length, serialize=True, extract_spk=True, shuffle=False
	    ):
	        """Run `forward` and post-process the tokens of every quantizer level.

	        Token lengths are rounded up to a multiple of the first level's
	        downsample rate and every level's tokens are replicate-padded to match.

	        Returns:
	            The `forward` dict with "embed" added and "token"/"token_length"
	            replaced. For a single level these are that level's tensor and
	            lengths. For several levels they are the serialized sequence and
	            its lengths when `serialize` is set, otherwise per-level lists.
	        """
	        output_dict = self.forward(
	            wav, wav_length, True, False, extract_spk=extract_spk, shuffle=shuffle
	        )
	        token_seqs, token_length = output_dict["token"], output_dict["token_length"]

	        # Align sequences
	        scale = self.tree_config[0]["downsample_rate"]
	        token_length = (torch.ceil(token_length / scale) * scale).int()

	        new_token_seqs, new_token_lens = [], []
	        for i, token_seq in enumerate(token_seqs):
	            # discrete-continuous tokens arrive as a (tokens, residual) pair
	            residual = None
	            if isinstance(token_seq, (tuple, list)):
	                token_seq, residual = token_seq

	            scale = self.tree_config[i]["downsample_rate"]
	            new_token_len = token_length // scale
	            pad = int(new_token_len.max()) - token_seq.shape[1]
	            token_seq = F.pad(
	                token_seq,
	                (0, pad) if len(token_seq.shape) == 2 else (0, 0, 0, pad),
	                "replicate",
	            )

	            if residual is not None:
	                token_seq = (token_seq, residual)
	            new_token_seqs.append(token_seq)
	            new_token_lens.append(new_token_len)

	        if len(new_token_seqs) == 1:
	            new_token_seqs, new_token_lens = new_token_seqs[0], new_token_lens[0]
	        elif serialize:
	            # `serialize` expects one tensor of aligned base-rate lengths
	            # (it calls `.float()` on it and derives per-level lengths
	            # itself), not the per-level Python list built above.
	            new_token_seqs, new_token_lens = self.quantizer.serialize(
	                new_token_seqs, token_length
	            )

	        output_dict.update(
	            {
	                "embed": output_dict["quants"],
	                "token": new_token_seqs,
	                "token_length": new_token_lens,
	            }
	        )

	        return output_dict

	    @torch.no_grad()
	    def code_to_latent(self, token, mel=None):
	        """Decode tokens to quantized features and add the speaker embedding.

	        Args:
	            token: Tokens accepted by `TreeVectorQuantization.decode`.
	            mel: Features passed to the speaker encoder.

	        Returns:
	            Dict with "latents": the decoded features plus the speaker
	            embedding repeated along axis 1.
	        """
	        quant, _ = self.quantizer.decode(token, None)
	        speaker_embedding = self.speaker_encoder(mel)
	        latents = quant + speaker_embedding.unsqueeze(1).repeat(1, quant.shape[1], 1)
	        return {
	            "latents": latents,
	        }