Spaces:
Sleeping
Sleeping
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from torch.nn.utils import weight_norm | |
| from collections import OrderedDict | |
| import os | |
| from .attentions import ResidualBlock1D, APTx | |
| from .quantizer import FSQ | |
def sequence_mask(max_length, x_lengths):
    """
    Build a boolean padding mask for a batch of variable-length sequences.

    :param max_length: int, number of columns (max sequence length).
    :param x_lengths: integer Tensor (batch,) with each sequence's valid length.
    :return: BoolTensor (batch, max_length) where True marks PADDED positions
             and False marks valid ones.
    """
    # Create the position indices directly on the target device instead of
    # allocating on CPU and copying with .to(...).
    positions = torch.arange(max_length, device=x_lengths.device)
    positions = positions.expand(len(x_lengths), max_length)
    # A position is padding once it reaches or passes the sequence length.
    return positions >= x_lengths.unsqueeze(1)
| class ConvBlock2D(nn.Module): | |
| """ | |
| 2-D convolutional block that supports: | |
| • weight-norm wrapping | |
| • regular or depth-wise-separable conv | |
| • boolean padding mask (B, 1, H, W) – keeps padded pixels at 0 | |
| Forward signature | |
| ----------------- | |
| y = block(x, x_mask=None) | |
| If x_mask is provided (True = padded), the block applies | |
| `out = out.masked_fill(mask_expanded, 0)` right *before* the | |
| non-linearity. This mirrors the masking strategy used in | |
| ResidualBlock2D. | |
| """ | |
| def __init__( | |
| self, | |
| in_channels: int, | |
| out_channels: int, | |
| kernel_size: int | tuple[int, int] = 3, | |
| stride: int | tuple[int, int] = 1, | |
| dilation: int | tuple[int, int] = 1, | |
| *, | |
| depthwise: bool = False, | |
| use_weight_norm: bool = True, | |
| act: str = "relu", | |
| dropout: float = 0.1, | |
| bias: bool = True, | |
| ): | |
| super().__init__() | |
| # ------ util ------ # | |
| def _make_conv(cin, cout, k, s, d, groups=1): | |
| padding = ( | |
| d * (k // 2) if isinstance(k, int) | |
| else (d[0] * (k[0] // 2), d[1] * (k[1] // 2)) | |
| ) | |
| conv = nn.Conv2d( | |
| cin, cout, k, stride=s, padding=padding, | |
| dilation=d, groups=groups, bias=bias | |
| ) | |
| return weight_norm(conv) if use_weight_norm else conv | |
| # ------ conv path ------ # | |
| if depthwise: | |
| self.dw = _make_conv(in_channels, in_channels, kernel_size, stride, dilation, | |
| groups=in_channels) # depth-wise | |
| self.pw = _make_conv(in_channels, out_channels, 1, 1, 1) # point-wise | |
| else: | |
| self.conv = _make_conv(in_channels, out_channels, kernel_size, stride, dilation) | |
| # ------ activation ------ # | |
| if act.lower() == "gelu": | |
| self.activation = nn.GELU() | |
| elif act.lower() == "aptx": | |
| self.activation = APTx() | |
| else: | |
| self.activation = nn.ReLU(inplace=True) | |
| self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() | |
| self.depthwise = depthwise # store flag for forward() | |
| self.conv_out = nn.Conv2d(out_channels, 1, 1) | |
| # --------------------------------------------------------------------- # | |
| def _apply_mask(self, tensor: torch.Tensor, mask: torch.Tensor | None) -> torch.Tensor: | |
| if mask is not None: | |
| tensor = tensor.masked_fill(mask.expand_as(tensor), 0.0) | |
| return tensor | |
| # --------------------------------------------------------------------- # | |
| def forward(self, x: torch.Tensor, x_mask: torch.Tensor | None = None) -> torch.Tensor: | |
| """ | |
| x : (B, Cin, H, W) | |
| x_mask : (B, 1, H, W) boolean, True = padding | |
| """ | |
| # (B, H, W) | |
| x = x.unsqueeze(1) | |
| x_mask = x_mask.unsqueeze(1) | |
| if self.depthwise: | |
| out = self.dw(x) | |
| out = self._apply_mask(out, x_mask) | |
| out = self.pw(out) | |
| else: | |
| out = self.conv(x) | |
| out = self._apply_mask(out, x_mask) | |
| out = self.activation(out) | |
| out = self.dropout(out) | |
| out = self.conv_out(out) | |
| return out.squeeze(1) | |
class PreEncoder(nn.Module):
    """
    Spectrogram Pre-Encoder.

    ResNet-based autoencoder with an FSQ quantization bottleneck:
    linear in-projection -> ConvBlock2D -> encoder ResidualBlock1D stack
    -> FSQ -> causal decoder ResidualBlock1D stack -> ConvBlock2D ->
    linear out-projection.
    """

    def __init__(self, mel_channels, channels, kernel_sizes, fsq_levels=None, dropout=0.1):
        """
        Parameters:
        - mel_channels (int): number of channels in the input spectrogram.
        - channels (list of ints): channel dimensions for encoder blocks.
            * The first element is the projected input dimension.
            * The last element is the latent dimension.
        - kernel_sizes (list of ints): kernel sizes for each ResidualBlock1D.
            Length should be len(channels) - 1. The decoder uses these
            lists in reverse.
        - fsq_levels (list of ints, optional): FSQ quantization levels per
            latent dimension. Defaults to [8, 8, 5, 5, 5].
        - dropout (float): dropout probability for the residual blocks.
        """
        super().__init__()
        # Avoid the shared-mutable-default pitfall: build the default list
        # per call instead of using a mutable default argument.
        if fsq_levels is None:
            fsq_levels = [8, 8, 5, 5, 5]

        # Project input from mel_channels to channels[0].
        self.proj = nn.Linear(mel_channels, channels[0])
        self.pre = ConvBlock2D(1, channels[0], kernel_size=5, depthwise=True, act="aptx")
        self.quantizer_dim = len(fsq_levels)

        # Encoder: a sequence of ResidualBlock1D modules.
        self.encoder_blocks = nn.ModuleList([
            ResidualBlock1D(channels[i], channels[i + 1],
                            kernel_size=kernel_sizes[i], dropout=dropout,
                            act="taptx", norm="weight")
            for i in range(len(channels) - 1)
        ])

        # Quantization stage: latent dim is the last element of channels.
        latent_dim = channels[-1]
        self.q_in_proj = nn.Linear(latent_dim, self.quantizer_dim)
        self.quantizer = FSQ(levels=fsq_levels)
        self.q_out_proj = nn.Linear(self.quantizer_dim, latent_dim)

        # TODO: dyn calculate this.
        # NOTE(review): these constants assume the default levels
        # [8, 8, 5, 5, 5] (prod = 8000 codes, + special tokens); they are
        # NOT recomputed when fsq_levels differs — confirm before changing
        # levels in trained setups.
        self.codebook_size = 8010
        self.bos_token_id = 8001
        self.eos_token_id = 8002

        # Decoder: reversed lists so the decoder mirrors the encoder.
        rev_channels = list(reversed(channels))
        rev_kernel_sizes = list(reversed(kernel_sizes))
        self.decoder_blocks = nn.ModuleList([
            ResidualBlock1D(rev_channels[i], rev_channels[i + 1],
                            kernel_size=rev_kernel_sizes[i], dropout=dropout,
                            act="taptx", causal=True, norm="weight")
            for i in range(len(rev_channels) - 1)
        ])
        self.post = ConvBlock2D(1, channels[0], kernel_size=5, depthwise=True, act="aptx")

        # Output projection: map the decoder's final channel (channels[0])
        # back to mel_channels.
        self.out_proj = nn.Linear(channels[0], mel_channels)

    def forward(self, x, x_lengths):
        """
        Full autoencoding pass (encode -> quantize -> decode).

        Parameters:
        - x: Tensor of shape (batch, mel_len, mel_channels).
        - x_lengths: integer Tensor (batch,) with valid sequence lengths.

        Returns:
        - Reconstructed tensor of shape (batch, mel_len, mel_channels).
        """
        # Project input to channel dimension channels[0].
        x = self.proj(x)            # (batch, mel_len, channels[0])
        # Permute to (batch, channels[0], mel_len) for 1D convolutions.
        x = x.permute(0, 2, 1)
        x_mask = sequence_mask(x.size(2), x_lengths)
        x_mask = x_mask.unsqueeze(1)  # (B, 1, T), True = padding
        x = self.pre(x, x_mask)

        # Encoder stack.
        for block in self.encoder_blocks:
            x = block(x, x_mask=x_mask)

        # Permute back to (batch, mel_len, latent_dim) for the projections.
        x = x.permute(0, 2, 1)
        x = self.q_in_proj(x)
        xhat, indices = self.quantizer(x)
        x = self.q_out_proj(xhat)

        # Permute for the decoder.
        x = x.permute(0, 2, 1)
        for block in self.decoder_blocks:
            x = block(x, x_mask=x_mask)
        x = self.post(x, x_mask)

        # Permute back to (batch, mel_len, channels[0]) and project out.
        x = x.permute(0, 2, 1)
        x = self.out_proj(x)
        return x

    def encode(self, x, x_mask=None):
        """
        Encodes the input spectrogram into discrete latent indices.

        Args:
            x (torch.Tensor): Input of shape (batch, mel_len, mel_channels).
            x_mask (torch.Tensor, optional): (batch, 1, mel_len) bool mask,
                True at padded positions. Defaults to an all-valid mask.

        Returns:
            indices (torch.LongTensor): Discrete token indices from the
            FSQ quantizer.
        """
        x = self.proj(x)
        # (batch, channels[0], mel_len) for convolutional operations.
        x = x.permute(0, 2, 1)
        if x_mask is None:
            # No padding info: treat every position as valid.
            x_mask = torch.zeros((x.size(0), 1, x.size(2)), device=x.device).bool()
        x = self.pre(x, x_mask)
        for block in self.encoder_blocks:
            x = block(x, x_mask=x_mask)
        # Back to (batch, mel_len, latent_dim).
        x = x.permute(0, 2, 1)
        # Project to the quantizer input dimension.
        x = self.q_in_proj(x)
        _, indices = self.quantizer(x)
        # Cast to long so downstream cross-entropy losses accept them.
        return indices.long()

    def decode(self, indices, x_mask=None, return_hidden=False):
        """
        Decodes discrete latent indices into a reconstructed spectrogram.

        Args:
            indices (torch.Tensor): Discrete token indices from the quantizer.
            x_mask (torch.Tensor, optional): (batch, 1, mel_len) bool mask,
                True at padded positions. Defaults to an all-valid mask.
            return_hidden (bool): if True, also return the decoder-stack
                output captured *before* the post conv block.

        Returns:
            x (torch.Tensor): Reconstructed spectrogram of shape
            (batch, mel_len, mel_channels); with return_hidden=True, a
            tuple (x, last_hid).
        """
        # Indices -> quantized latent codes, shape (batch, mel_len, quantizer_dim).
        xhat = self.quantizer.indices_to_codes(indices)
        # Project quantized representation back to latent_dim.
        x = self.q_out_proj(xhat)
        # (batch, latent_dim, mel_len) for convolutional operations.
        x = x.permute(0, 2, 1)
        if x_mask is None:
            x_mask = torch.zeros((x.size(0), 1, x.size(2)), device=x.device).bool()
        for block in self.decoder_blocks:
            x = block(x, x_mask=x_mask)
        if return_hidden:
            # Snapshot before the post conv so callers get the raw decoder state.
            last_hid = x.clone()
        x = self.post(x, x_mask)
        # Back to (batch, mel_len, channels[0]) and project to mel_channels.
        x = x.permute(0, 2, 1)
        x = self.out_proj(x)
        if return_hidden:
            return x, last_hid
        return x
def get_pre_encoder(model_path: str, device: "str | torch.device",
                    channels=None, kernel_sizes=None, mel_channels=88):
    """
    Loads a PreEncoder model from a checkpoint file.

    Assumes the checkpoint was saved with the training script's structure,
    containing 'model_state_dict'.

    Args:
        model_path (str): Path to the .pth checkpoint file.
        device (str | torch.device): Device to load the model onto
            ('cpu', 'cuda', etc.).
        channels (list of ints, optional): Encoder channel dims.
            Defaults to [384, 512, 768].
        kernel_sizes (list of ints, optional): Encoder kernel sizes.
            Defaults to [7, 5, 3].
        mel_channels (int): Number of input spectrogram channels.

    Returns:
        nn.Module: The loaded PreEncoder instance, moved to the given
        device and set to eval mode.

    Raises:
        FileNotFoundError: If model_path does not exist.
        KeyError: If 'model_state_dict' is missing from the checkpoint.
        RuntimeError: If instantiation or load_state_dict fails
            (e.g. architecture mismatch).
    """
    # Mutable-default fix: build fresh lists per call.
    if channels is None:
        channels = [384, 512, 768]
    if kernel_sizes is None:
        kernel_sizes = [7, 5, 3]

    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Checkpoint file not found: {model_path}")

    print(f"Loading checkpoint from: {model_path}")
    # NOTE(review): weights_only=False unpickles arbitrary objects — only
    # load checkpoints from trusted sources.
    checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)  # Load to CPU first

    # --- Instantiate model --- #
    try:
        model = PreEncoder(mel_channels=mel_channels, channels=channels,
                           kernel_sizes=kernel_sizes, dropout=0.0,
                           fsq_levels=[8, 5, 5, 5])
    except Exception as e:
        # Chain the cause so the original traceback stays visible.
        raise RuntimeError(f"Failed to instantiate PreEncoder with loaded config: {e}") from e

    # --- Load weights --- #
    if 'model_state_dict' not in checkpoint:
        raise KeyError("Checkpoint missing 'model_state_dict' key containing weights.")

    pretrained_weights = checkpoint['model_state_dict']
    print("Found weights under 'model_state_dict' key.")

    # Strip the 'module.' prefix left by DataParallel/DDP saves, if present.
    clean_weights = OrderedDict()
    has_module_prefix = False
    for key, value in pretrained_weights.items():
        if key.startswith('module.'):
            has_module_prefix = True
            clean_weights[key[len('module.'):]] = value
        else:
            clean_weights[key] = value
    if has_module_prefix:
        print("Removed 'module.' prefix from weight keys.")
    pretrained_weights = clean_weights

    # strict=True: require an exact architecture match.
    try:
        model.load_state_dict(pretrained_weights, strict=True)
        print("Successfully loaded model weights.")
    except RuntimeError as e:
        print(f"Error loading state_dict (likely architecture mismatch): {e}")
        raise

    # --- Final steps --- #
    model.to(device)
    model.eval()
    print(f"Model loaded onto {device} and set to evaluation mode.")
    return model