NeMo_Canary / nemo /collections /diffusion /vae /autoencoder.py

Upload folder using huggingface_hub

b386992 verified 7 months ago

11.1 kB

	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from dataclasses import dataclass

	import numpy as np
	import torch
	from torch import Tensor, nn

	from nemo.collections.diffusion.vae.blocks import Downsample, Normalize, ResnetBlock, Upsample, make_attn


	# pylint: disable=C0116
	@dataclass
	class AutoEncoderConfig:
	ch_mult: list[int]
	attn_resolutions: list[int]
	resolution: int = 256
	in_channels: int = 3
	ch: int = 128
	out_ch: int = 3
	num_res_blocks: int = 2
	z_channels: int = 16
	scale_factor: float = 0.3611
	shift_factor: float = 0.1159
	attn_type: str = 'vanilla'
	double_z: bool = True
	dropout: float = 0.0
	ckpt: str = None


	def nonlinearity(x):
	# swish
	return torch.nn.functional.silu(x)


	class Encoder(nn.Module):
	def __init__(
	self,
	*,
	ch: int,
	out_ch: int,
	ch_mult: list[int],
	num_res_blocks: int,
	attn_resolutions: list[int],
	in_channels: int,
	resolution: int,
	z_channels: int,
	dropout=0.0,
	resamp_with_conv=True,
	double_z=True,
	use_linear_attn=False,
	attn_type="vanilla",
	):
	super().__init__()
	if use_linear_attn:
	attn_type = "linear"
	self.ch = ch
	self.temb_ch = 0
	self.num_resolutions = len(ch_mult)
	self.num_res_blocks = num_res_blocks
	self.resolution = resolution
	self.in_channels = in_channels

	# downsampling
	self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)

	curr_res = resolution
	in_ch_mult = (1,) + tuple(ch_mult)
	self.in_ch_mult = in_ch_mult
	self.down = nn.ModuleList()
	for i_level in range(self.num_resolutions):
	block = nn.ModuleList()
	attn = nn.ModuleList()
	block_in = ch * in_ch_mult[i_level]
	block_out = ch * ch_mult[i_level]
	for i_block in range(self.num_res_blocks):
	block.append(
	ResnetBlock(
	in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout
	)
	)
	block_in = block_out
	if curr_res in attn_resolutions:
	attn.append(make_attn(block_in, attn_type=attn_type))
	down = nn.Module()
	down.block = block
	down.attn = attn
	if i_level != self.num_resolutions - 1:
	down.downsample = Downsample(block_in, resamp_with_conv)
	curr_res = curr_res // 2
	self.down.append(down)

	# middle
	self.mid = nn.Module()
	self.mid.block_1 = ResnetBlock(
	in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout
	)
	self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
	self.mid.block_2 = ResnetBlock(
	in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout
	)

	# end
	self.norm_out = Normalize(block_in)
	self.conv_out = torch.nn.Conv2d(
	block_in, 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1
	)

	def forward(self, x):
	# timestep embedding
	temb = None

	# downsampling
	hs = [self.conv_in(x)]
	for i_level in range(self.num_resolutions):
	for i_block in range(self.num_res_blocks):
	h = self.down[i_level].block[i_block](hs[-1], temb)
	if len(self.down[i_level].attn) > 0:
	h = self.down[i_level].attn[i_block](h)
	hs.append(h)
	if i_level != self.num_resolutions - 1:
	hs.append(self.down[i_level].downsample(hs[-1]))

	# middle
	h = hs[-1]
	h = self.mid.block_1(h, temb)
	h = self.mid.attn_1(h)
	h = self.mid.block_2(h, temb)

	# end
	h = self.norm_out(h)
	h = nonlinearity(h)
	h = self.conv_out(h)
	return h


	class Decoder(nn.Module):
	def __init__(
	self,
	*,
	ch: int,
	out_ch: int,
	ch_mult: list[int],
	num_res_blocks: int,
	attn_resolutions: list[int],
	in_channels: int,
	resolution: int,
	z_channels: int,
	dropout=0.0,
	resamp_with_conv=True,
	give_pre_end=False,
	tanh_out=False,
	use_linear_attn=False,
	attn_type="vanilla",
	**ignorekwargs,
	):
	super().__init__()
	if use_linear_attn:
	attn_type = "linear"
	self.ch = ch
	self.temb_ch = 0
	self.num_resolutions = len(ch_mult)
	self.num_res_blocks = num_res_blocks
	self.resolution = resolution
	self.in_channels = in_channels
	self.give_pre_end = give_pre_end
	self.tanh_out = tanh_out

	# compute in_ch_mult, block_in and curr_res at lowest res
	in_ch_mult = (1,) + tuple(ch_mult)
	block_in = ch * ch_mult[self.num_resolutions - 1]
	curr_res = resolution // 2 ** (self.num_resolutions - 1)
	self.z_shape = (1, z_channels, curr_res, curr_res)
	print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))

	# z to block_in
	self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)

	# middle
	self.mid = nn.Module()
	self.mid.block_1 = ResnetBlock(
	in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout
	)
	self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
	self.mid.block_2 = ResnetBlock(
	in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout
	)

	# upsampling
	self.up = nn.ModuleList()
	for i_level in reversed(range(self.num_resolutions)):
	block = nn.ModuleList()
	attn = nn.ModuleList()
	block_out = ch * ch_mult[i_level]
	for i_block in range(self.num_res_blocks + 1):
	block.append(
	ResnetBlock(
	in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout
	)
	)
	block_in = block_out
	if curr_res in attn_resolutions:
	attn.append(make_attn(block_in, attn_type=attn_type))
	up = nn.Module()
	up.block = block
	up.attn = attn
	if i_level != 0:
	up.upsample = Upsample(block_in, resamp_with_conv)
	curr_res = curr_res * 2
	self.up.insert(0, up) # prepend to get consistent order

	# end
	self.norm_out = Normalize(block_in)
	self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)

	def forward(self, z):
	# assert z.shape[1:] == self.z_shape[1:]
	self.last_z_shape = z.shape

	# timestep embedding
	temb = None

	# z to block_in
	h = self.conv_in(z)

	# middle
	h = self.mid.block_1(h, temb)
	h = self.mid.attn_1(h)
	h = self.mid.block_2(h, temb)

	# upsampling
	for i_level in reversed(range(self.num_resolutions)):
	for i_block in range(self.num_res_blocks + 1):
	h = self.up[i_level].block[i_block](h, temb)
	if len(self.up[i_level].attn) > 0:
	h = self.up[i_level].attn[i_block](h)
	if i_level != 0:
	h = self.up[i_level].upsample(h)

	# end
	if self.give_pre_end:
	return h

	h = self.norm_out(h)
	h = nonlinearity(h)
	h = self.conv_out(h)
	if self.tanh_out:
	h = torch.tanh(h)
	return h


	class DiagonalGaussian(nn.Module):
	def __init__(self, sample: bool = True, chunk_dim: int = 1):
	super().__init__()
	self.sample = sample
	self.chunk_dim = chunk_dim

	def forward(self, z: Tensor) -> Tensor:
	mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
	if self.sample:
	std = torch.exp(0.5 * logvar)
	return mean + std * torch.randn_like(mean)
	else:
	return mean


	class AutoEncoder(nn.Module):
	def __init__(self, params: AutoEncoderConfig):
	super().__init__()
	self.encoder = Encoder(
	resolution=params.resolution,
	in_channels=params.in_channels,
	ch=params.ch,
	ch_mult=params.ch_mult,
	num_res_blocks=params.num_res_blocks,
	z_channels=params.z_channels,
	double_z=params.double_z,
	attn_type=params.attn_type,
	dropout=params.dropout,
	out_ch=params.out_ch,
	attn_resolutions=params.attn_resolutions,
	)
	self.decoder = Decoder(
	resolution=params.resolution,
	in_channels=params.in_channels,
	ch=params.ch,
	out_ch=params.out_ch,
	ch_mult=params.ch_mult,
	num_res_blocks=params.num_res_blocks,
	z_channels=params.z_channels,
	double_z=params.double_z,
	attn_type=params.attn_type,
	dropout=params.dropout,
	attn_resolutions=params.attn_resolutions,
	)
	self.reg = DiagonalGaussian()

	self.scale_factor = params.scale_factor
	self.shift_factor = params.shift_factor
	self.params = params

	if params.ckpt is not None:
	self.load_from_checkpoint(params.ckpt)

	def encode(self, x: Tensor) -> Tensor:
	z = self.reg(self.encoder(x))
	z = self.scale_factor * (z - self.shift_factor)
	return z

	def decode(self, z: Tensor) -> Tensor:
	z = z / self.scale_factor + self.shift_factor
	return self.decoder(z)

	def forward(self, x: Tensor) -> Tensor:
	return self.decode(self.encode(x))

	def load_from_checkpoint(self, ckpt_path):
	from safetensors.torch import load_file as load_sft

	state_dict = load_sft(ckpt_path)
	missing, unexpected = self.load_state_dict(state_dict)
	if len(missing) > 0:
	logger.warning(f"Following keys are missing from checkpoint loaded: {missing}")


	# pylint: disable=C0116