final upload

Browse files

Files changed (9) hide show

models/diffloss.py +248 -0
models/mar.py +355 -0
models/vae.py +67 -0
taming/modules/autoencoder/lpips/vgg.pth +3 -0
util/crop.py +23 -0
util/download.py +62 -0
util/loader.py +56 -0
util/lr_sched.py +21 -0
util/misc.py +340 -0

models/diffloss.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+import math
+from diffusion import create_diffusion
+class DiffLoss(nn.Module):
+    """Diffusion Loss.
+    Holds a small AdaLN-conditioned MLP denoiser (``SimpleMLPAdaLN``) and two
+    diffusion processes built by ``create_diffusion``: ``train_diffusion``
+    (empty ``timestep_respacing``) for the training objective and
+    ``gen_diffusion`` (respaced to ``num_sampling_steps``) for sampling.
+    :param target_channels: channel count of the tokens being modelled.
+    :param z_channels: channel count of the conditioning vector ``z``.
+    :param depth: number of residual blocks in the denoising MLP.
+    :param width: hidden width of the denoising MLP.
+    :param num_sampling_steps: ``timestep_respacing`` value used for sampling.
+    :param grad_checkpointing: forwarded to the denoising MLP.
+    """
+    def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps, grad_checkpointing=False):
+        super(DiffLoss, self).__init__()
+        self.in_channels = target_channels
+        self.net = SimpleMLPAdaLN(
+            in_channels=target_channels,
+            model_channels=width,
+            out_channels=target_channels * 2,  # for vlb loss
+            z_channels=z_channels,
+            num_res_blocks=depth,
+            grad_checkpointing=grad_checkpointing
+        )
+        self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="cosine")
+        self.gen_diffusion = create_diffusion(timestep_respacing=num_sampling_steps, noise_schedule="cosine")
+    def forward(self, target, z, mask=None):
+        """Return the scalar diffusion training loss for ``target`` conditioned on ``z``.
+        One timestep per row of ``target`` is drawn uniformly from
+        ``[0, train_diffusion.num_timesteps)``.
+        :param target: tensor whose first dimension is the batch (presumably [N x C] -- confirm with caller).
+        :param z: conditioning passed to the denoiser as ``c``.
+        :param mask: optional per-row weights; when given the loss is
+                     ``sum(loss * mask) / sum(mask)``. An all-zero mask divides by zero.
+        :return: a 0-dim loss tensor.
+        """
+        t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
+        model_kwargs = dict(c=z)
+        loss_dict = self.train_diffusion.training_losses(self.net, target, t, model_kwargs)
+        loss = loss_dict["loss"]
+        if mask is not None:
+            loss = (loss * mask).sum() / mask.sum()
+        return loss.mean()
+    def sample(self, z, temperature=1.0, cfg=1.0):
+        """Sample token latents with ``gen_diffusion``, conditioned on ``z``.
+        :param z: conditioning; with classifier-free guidance (``cfg != 1``) its first
+                  dimension is expected to hold two equally sized halves that share
+                  the same starting noise.
+        :param temperature: forwarded to ``p_sample_loop``.
+        :param cfg: guidance scale; ``1.0`` disables guidance.
+        :return: whatever ``gen_diffusion.p_sample_loop`` returns for the noise shape used.
+        """
+        if not cfg == 1.0:
+            noise = torch.randn(z.shape[0] // 2, self.in_channels)
+            noise = torch.cat([noise, noise], dim=0)
+            model_kwargs = dict(c=z, cfg_scale=cfg)
+            sample_fn = self.net.forward_with_cfg
+        else:
+            noise = torch.randn(z.shape[0], self.in_channels)
+            model_kwargs = dict(c=z)
+            sample_fn = self.net.forward
+        # The noise is still drawn on the CPU (same random stream as before) but is
+        # moved to wherever ``z`` lives instead of unconditionally to the default
+        # CUDA device, so CPU and non-default-GPU inference work.
+        noise = noise.to(z.device)
+        sampled_token_latent = self.gen_diffusion.p_sample_loop(
+            sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs, progress=False,
+            temperature=temperature
+        )
+        return sampled_token_latent
+def modulate(x, shift, scale):
+    """Scale ``x`` by ``1 + scale`` and then add ``shift`` (adaLN-style modulation)."""
+    scaled = x * (1 + scale)
+    return scaled + shift
+class TimestepEmbedder(nn.Module):
+    """
+    Turns scalar timesteps into vectors: a sinusoidal frequency encoding
+    followed by a two-layer MLP with a SiLU in between.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        proj_in = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
+        proj_out = nn.Linear(hidden_size, hidden_size, bias=True)
+        self.mlp = nn.Sequential(proj_in, nn.SiLU(), proj_out)
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Build sinusoidal embeddings for a batch of timesteps.
+        :param t: a 1-D Tensor of N indices, one per batch element (may be fractional).
+        :param dim: width of the returned embedding.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, dim) Tensor laid out as [cos | sin], zero-padded by one
+                 column when ``dim`` is odd.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        freqs = torch.exp(exponent).to(device=t.device)
+        args = t[:, None].float() * freqs[None]
+        parts = [torch.cos(args), torch.sin(args)]
+        if dim % 2:
+            # odd widths get a single trailing column of zeros
+            parts.append(torch.zeros_like(args[:, :1]))
+        return torch.cat(parts, dim=-1)
+    def forward(self, t):
+        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
+class ResBlock(nn.Module):
+    """
+    A gated residual MLP block with adaLN conditioning; input and output
+    widths are both ``channels``.
+    :param channels: the number of input (and output) channels.
+    """
+    def __init__(
+        self,
+        channels
+    ):
+        super().__init__()
+        self.channels = channels
+        self.in_ln = nn.LayerNorm(channels, eps=1e-6)
+        fc_in = nn.Linear(channels, channels, bias=True)
+        fc_out = nn.Linear(channels, channels, bias=True)
+        self.mlp = nn.Sequential(fc_in, nn.SiLU(), fc_out)
+        # one projection produces shift, scale and gate, hence 3 * channels
+        to_modulation = nn.Linear(channels, 3 * channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_modulation)
+    def forward(self, x, y):
+        """Return ``x + gate * mlp(modulate(layer_norm(x), shift, scale))`` with shift/scale/gate derived from ``y``."""
+        modulation = self.adaLN_modulation(y)
+        shift_mlp, scale_mlp, gate_mlp = modulation.chunk(3, dim=-1)
+        normed = self.in_ln(x)
+        h = self.mlp(modulate(normed, shift_mlp, scale_mlp))
+        return x + gate_mlp * h
+class FinalLayer(nn.Module):
+    """
+    Output head adopted from DiT: affine-free LayerNorm, adaLN shift/scale
+    from the conditioning, then a linear projection to ``out_channels``.
+    """
+    def __init__(self, model_channels, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(model_channels, out_channels, bias=True)
+        # a single projection yields both shift and scale, hence 2 * model_channels
+        to_modulation = nn.Linear(model_channels, 2 * model_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), to_modulation)
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
+        normed = self.norm_final(x)
+        return self.linear(modulate(normed, shift, scale))
+class SimpleMLPAdaLN(nn.Module):
+    """
+    Residual MLP denoiser used by the Diffusion Loss. The timestep embedding
+    and the projected condition are summed and drive the adaLN modulation of
+    every residual block and of the final layer.
+    :param in_channels: channels in the input Tensor.
+    :param model_channels: hidden width shared by all blocks.
+    :param out_channels: channels in the output Tensor.
+    :param z_channels: channels in the condition.
+    :param num_res_blocks: number of residual blocks stacked in the network.
+    :param grad_checkpointing: if True, run each residual block under ``checkpoint``.
+    """
+    def __init__(
+        self,
+        in_channels,
+        model_channels,
+        out_channels,
+        z_channels,
+        num_res_blocks,
+        grad_checkpointing=False
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.grad_checkpointing = grad_checkpointing
+        self.time_embed = TimestepEmbedder(model_channels)
+        self.cond_embed = nn.Linear(z_channels, model_channels)
+        self.input_proj = nn.Linear(in_channels, model_channels)
+        self.res_blocks = nn.ModuleList([ResBlock(model_channels) for _ in range(num_res_blocks)])
+        self.final_layer = FinalLayer(model_channels, out_channels)
+        self.initialize_weights()
+    def initialize_weights(self):
+        """Xavier-init every Linear, re-init the timestep MLP, then zero the adaLN and output layers."""
+        def _xavier_linear(layer):
+            if not isinstance(layer, nn.Linear):
+                return
+            torch.nn.init.xavier_uniform_(layer.weight)
+            if layer.bias is not None:
+                nn.init.constant_(layer.bias, 0)
+        self.apply(_xavier_linear)
+        # Timestep embedding MLP: normal init on both Linear layers
+        for idx in (0, 2):
+            nn.init.normal_(self.time_embed.mlp[idx].weight, std=0.02)
+        # Layers that start at exactly zero: every adaLN projection and the output head
+        zeroed = [block.adaLN_modulation[-1] for block in self.res_blocks]
+        zeroed.append(self.final_layer.adaLN_modulation[-1])
+        zeroed.append(self.final_layer.linear)
+        for layer in zeroed:
+            nn.init.constant_(layer.weight, 0)
+            nn.init.constant_(layer.bias, 0)
+    def forward(self, x, t, c):
+        """
+        Run the denoiser on a batch.
+        :param x: an [N x C] Tensor of inputs.
+        :param t: a 1-D batch of timesteps.
+        :param c: conditioning from AR transformer.
+        :return: an [N x out_channels] Tensor of outputs.
+        """
+        h = self.input_proj(x)
+        cond = self.time_embed(t) + self.cond_embed(c)
+        use_ckpt = self.grad_checkpointing and not torch.jit.is_scripting()
+        for block in self.res_blocks:
+            h = checkpoint(block, h, cond) if use_ckpt else block(h, cond)
+        return self.final_layer(h, cond)
+    def forward_with_cfg(self, x, t, c, cfg_scale):
+        """Classifier-free-guided forward: the first half of ``x`` is duplicated, and the
+        first ``in_channels`` output columns are mixed as ``uncond + cfg_scale * (cond - uncond)``."""
+        n_half = len(x) // 2
+        half = x[:n_half]
+        model_out = self.forward(torch.cat([half, half], dim=0), t, c)
+        eps = model_out[:, :self.in_channels]
+        rest = model_out[:, self.in_channels:]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        return torch.cat([torch.cat([guided, guided], dim=0), rest], dim=1)

models/mar.py ADDED Viewed

	@@ -0,0 +1,355 @@

+from functools import partial
+import numpy as np
+from tqdm import tqdm
+import scipy.stats as stats
+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from timm.models.vision_transformer import Block
+from models.diffloss import DiffLoss
+def mask_by_order(mask_len, order, bsz, seq_len):
+    """Return a boolean [bsz, seq_len] mask that is True at the first ``mask_len`` positions of each row of ``order``.
+    :param mask_len: number of leading entries of ``order`` to mark; a one-element tensor or a plain number.
+    :param order: integer index tensor of shape [bsz, >= mask_len].
+    :param bsz: batch size of the returned mask.
+    :param seq_len: sequence length of the returned mask.
+    """
+    # Allocate on the same device as ``order`` rather than assuming CUDA.
+    masking = torch.zeros(bsz, seq_len, device=order.device)
+    index = order[:, :int(mask_len)]
+    masking = torch.scatter(masking, dim=-1, index=index, src=torch.ones_like(masking)).bool()
+    return masking
+class MAR(nn.Module):
+    """ Masked Autoencoder with VisionTransformer backbone
+    """
+    def __init__(self, img_size=256, vae_stride=16, patch_size=1,
+                 encoder_embed_dim=1024, encoder_depth=16, encoder_num_heads=16,
+                 decoder_embed_dim=1024, decoder_depth=16, decoder_num_heads=16,
+                 mlp_ratio=4., norm_layer=nn.LayerNorm,
+                 vae_embed_dim=16,
+                 mask_ratio_min=0.7,
+                 label_drop_prob=0.1,
+                 attn_dropout=0.1,
+                 proj_dropout=0.1,
+                 buffer_size=64,
+                 diffloss_d=3,
+                 diffloss_w=1024,
+                 num_sampling_steps='100',
+                 diffusion_batch_mul=4,
+                 grad_checkpointing=False,
+                 ):
+        """Build the encoder, decoder and diffusion-loss head.
+        The token grid is ``img_size // vae_stride // patch_size`` per side, and each
+        token has ``vae_embed_dim * patch_size**2`` channels. All learned position
+        embeddings are sized for ``2 * seq_len`` tokens.
+        ``label_drop_prob`` and ``buffer_size`` are stored, but in this file they are
+        only referenced from commented-out code.
+        """
+        super().__init__()
+        # --------------------------------------------------------------------------
+        # VAE and patchify specifics
+        self.vae_embed_dim = vae_embed_dim
+        self.img_size = img_size
+        self.vae_stride = vae_stride
+        self.patch_size = patch_size
+        self.seq_h = self.seq_w = img_size // vae_stride // patch_size
+        self.seq_len = self.seq_h * self.seq_w
+        self.token_embed_dim = vae_embed_dim * patch_size**2
+        self.grad_checkpointing = grad_checkpointing
+        # --------------------------------------------------------------------------
+        # image drop
+        self.label_drop_prob = label_drop_prob
+        # Fake class embedding for CFG's unconditional generation
+        # NOTE(review): the assignment below is disabled, yet sample_tokens still reads
+        # self.fake_latent (and self.class_emb) -- confirm whether sampling is meant to work.
+        # self.fake_latent = nn.Parameter(torch.zeros(1, encoder_embed_dim))
+        # --------------------------------------------------------------------------
+        # MAR variant masking ratio, a left-half truncated Gaussian centered at 100% masking ratio with std 0.25
+        self.mask_ratio_generator = stats.truncnorm((mask_ratio_min - 1.0) / 0.25, 0, loc=1.0, scale=0.25)
+        # --------------------------------------------------------------------------
+        # MAR encoder specifics
+        # two separate input projections: z_proj1 for the `x` tokens, z_proj2 for the `y` tokens
+        self.z_proj1 = nn.Linear(self.token_embed_dim, encoder_embed_dim, bias=True)
+        self.z_proj2 = nn.Linear(self.token_embed_dim, encoder_embed_dim, bias=True)
+        self.z_proj_ln = nn.LayerNorm(encoder_embed_dim, eps=1e-6)
+        self.buffer_size = buffer_size
+        self.encoder_pos_embed_learned = nn.Parameter(torch.zeros(1, 2 * self.seq_len, encoder_embed_dim))
+        self.encoder_blocks = nn.ModuleList([
+            Block(encoder_embed_dim, encoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
+                  proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(encoder_depth)])
+        self.encoder_norm = norm_layer(encoder_embed_dim)
+        # --------------------------------------------------------------------------
+        # MAR decoder specifics
+        self.decoder_embed = nn.Linear(encoder_embed_dim, decoder_embed_dim, bias=True)
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
+        self.decoder_pos_embed_learned = nn.Parameter(torch.zeros(1, 2 * self.seq_len, decoder_embed_dim))
+        self.decoder_blocks = nn.ModuleList([
+            Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer,
+                  proj_drop=proj_dropout, attn_drop=attn_dropout) for _ in range(decoder_depth)])
+        self.decoder_norm = norm_layer(decoder_embed_dim)
+        self.diffusion_pos_embed_learned = nn.Parameter(torch.zeros(1, 2*self.seq_len, decoder_embed_dim))
+        # runs before self.diffloss exists, so the apply() inside does not touch the diffusion head
+        self.initialize_weights()
+        # --------------------------------------------------------------------------
+        # Diffusion Loss
+        self.diffloss = DiffLoss(
+            target_channels=self.token_embed_dim,
+            z_channels=decoder_embed_dim,
+            width=diffloss_w,
+            depth=diffloss_d,
+            num_sampling_steps=num_sampling_steps,
+            grad_checkpointing=grad_checkpointing
+        )
+        self.diffusion_batch_mul = diffusion_batch_mul
+    def initialize_weights(self):
+        """Normal-init (std 0.02) the mask token and the three position embeddings, then apply ``_init_weights`` to all submodules."""
+        # parameters
+        # torch.nn.init.normal_(self.class_emb.weight, std=.02)
+        # torch.nn.init.normal_(self.fake_latent, std=.02)
+        learned_params = (
+            self.mask_token,
+            self.encoder_pos_embed_learned,
+            self.decoder_pos_embed_learned,
+            self.diffusion_pos_embed_learned,
+        )
+        for param in learned_params:
+            torch.nn.init.normal_(param, std=.02)
+        # initialize nn.Linear and nn.LayerNorm
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        """Per-module initializer: Xavier-uniform weight / zero bias for Linear, unit weight / zero bias for LayerNorm."""
+        if isinstance(m, nn.Linear):
+            # we use xavier_uniform following official JAX ViT:
+            torch.nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+            return
+        if not isinstance(m, nn.LayerNorm):
+            return
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+        if m.weight is not None:
+            nn.init.constant_(m.weight, 1.0)
+    def patchify(self, x):
+        """Split a [n, c, h, w] tensor into non-overlapping ``patch_size`` x ``patch_size`` patches and flatten each to ``c * p**2`` values."""
+        n, c, h, w = x.shape
+        p = self.patch_size
+        rows, cols = h // p, w // p
+        grid = x.reshape(n, c, rows, p, cols, p)
+        # (n, c, rows, p, cols, q) -> (n, rows, cols, c, p, q)
+        grid = grid.permute(0, 2, 4, 1, 3, 5)
+        return grid.reshape(n, rows * cols, c * p ** 2)  # [n, l, d]
+    def unpatchify(self, x):
+        """Inverse of ``patchify`` for a full ``seq_h`` x ``seq_w`` token grid: [n, l, d] back to [n, c, h, w]."""
+        n = x.shape[0]
+        p, c = self.patch_size, self.vae_embed_dim
+        rows, cols = self.seq_h, self.seq_w
+        grid = x.reshape(n, rows, cols, c, p, p)
+        # (n, rows, cols, c, p, q) -> (n, c, rows, p, cols, q)
+        grid = grid.permute(0, 3, 1, 4, 2, 5)
+        return grid.reshape(n, c, rows * p, cols * p)  # [n, c, h, w]
+    def sample_orders(self, bsz, device=None):
+        """Generate ``bsz`` independent random permutations of ``range(seq_len)``.
+        :param bsz: number of orders to draw.
+        :param device: where to place the result; defaults to the device of the
+                       module's first parameter (CPU if it has none) instead of
+                       unconditionally using CUDA.
+        :return: a long tensor of shape [bsz, seq_len].
+        """
+        if device is None:
+            first_param = next(self.parameters(), None)
+            device = first_param.device if first_param is not None else torch.device("cpu")
+        orders = []
+        for _ in range(bsz):
+            order = np.arange(self.seq_len)
+            np.random.shuffle(order)
+            orders.append(order)
+        # build the index tensor directly as int64 rather than round-tripping through float32
+        orders = torch.as_tensor(np.array(orders).reshape(bsz, self.seq_len), dtype=torch.long)
+        return orders.to(device)
+    def random_masking(self, x, orders):
+        """Draw one masking ratio for the whole batch and mark the first ``ceil(seq_len * ratio)`` entries of each order as masked (1.0)."""
+        bsz, seq_len, _ = x.shape
+        mask_rate = self.mask_ratio_generator.rvs(1)[0]
+        num_masked_tokens = int(np.ceil(seq_len * mask_rate))
+        masked_idx = orders[:, :num_masked_tokens]
+        ones = torch.ones(bsz, seq_len, device=x.device)
+        return torch.zeros(bsz, seq_len, device=x.device).scatter(-1, masked_idx, ones)
+    def forward_mae_encoder(self, x, mask, y):
+        """Encode the unmasked tokens.
+        ``x`` and ``y`` are projected by ``z_proj1`` / ``z_proj2``, concatenated along
+        the token dimension (``x`` first), offset by the learned position embedding,
+        layer-normalised, and then only positions where ``mask == 0`` are kept and
+        run through the encoder blocks.
+        :param x: token tensor fed to ``z_proj1``.
+        :param mask: tensor over the concatenated sequence; 0 keeps a token, 1 drops it.
+                     Every row must keep the same number of tokens for the reshape below to work.
+        :param y: token tensor fed to ``z_proj2``.
+        :return: encoded kept tokens after ``encoder_norm``.
+        """
+        x = self.z_proj1(x)
+        y = self.z_proj2(y)
+        # note: seq_len here is y's token count and is not used below
+        bsz, seq_len, embed_dim = y.shape
+        # concat buffer
+        x = torch.cat([x, y], dim=1)
+        # the mask is used as given; an earlier variant that prepended a zero block here is disabled:
+        mask_with_buffer = mask #torch.cat([torch.zeros(y.size(0), self.seq_len, device=y.device), mask], dim=1)
+        # # random drop class embedding during training
+        # if self.training:
+        #     drop_latent_mask = torch.rand(bsz) < self.label_drop_prob
+        #     drop_latent_mask = drop_latent_mask.unsqueeze(-1).cuda().to(x.dtype)
+        #     class_embedding = drop_latent_mask * self.fake_latent + (1 - drop_latent_mask) * class_embedding
+        # x[:, :self.buffer_size] = class_embedding.unsqueeze(1)
+        # encoder position embedding
+        x = x + self.encoder_pos_embed_learned
+        x = self.z_proj_ln(x)
+        # dropping: gather the positions whose mask value is 0 and regroup them per sample
+        x = x[(1-mask_with_buffer).nonzero(as_tuple=True)].reshape(bsz, -1, embed_dim)
+        # apply Transformer blocks
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            for block in self.encoder_blocks:
+                x = checkpoint(block, x)
+        else:
+            for block in self.encoder_blocks:
+                x = block(x)
+        x = self.encoder_norm(x)
+        return x
+    def forward_mae_decoder(self, x, mask):
+        """Decode the full sequence from the encoded kept tokens.
+        The encoder output is projected by ``decoder_embed`` and written back into the
+        positions where ``mask == 0``; every other position holds the learned
+        ``mask_token``. The decoder position embedding is added before the blocks and
+        ``diffusion_pos_embed_learned`` is added after ``decoder_norm``.
+        :param x: encoder output for the kept tokens.
+        :param mask: the same mask that was given to ``forward_mae_encoder`` (0 = kept).
+        :return: one conditioning vector per position of ``mask``.
+        """
+        x = self.decoder_embed(x)
+        # the mask is used as given; an earlier variant that prepended a zero block here is disabled:
+        mask_with_buffer = mask# torch.cat([torch.zeros(x.size(0), self.seq_len, device=x.device), mask], dim=1)
+        # pad mask tokens
+        mask_tokens = self.mask_token.repeat(mask_with_buffer.shape[0], mask_with_buffer.shape[1], 1).to(x.dtype)
+        x_after_pad = mask_tokens.clone()
+        # scatter the kept tokens back to their original positions
+        x_after_pad[(1 - mask_with_buffer).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
+        # decoder position embedding
+        x = x_after_pad + self.decoder_pos_embed_learned
+        # apply Transformer blocks
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            for block in self.decoder_blocks:
+                x = checkpoint(block, x)
+        else:
+            for block in self.decoder_blocks:
+                x = block(x)
+        x = self.decoder_norm(x)
+        # x = x [:, self.seq_len:]
+        x = x + self.diffusion_pos_embed_learned
+        return x
+    def forward_loss(self, z, target, mask):
+        """Flatten batch and sequence into one axis, tile everything ``diffusion_batch_mul`` times, and evaluate the diffusion loss."""
+        bsz, seq_len, _ = target.shape
+        n_tokens = bsz * seq_len
+        mul = self.diffusion_batch_mul
+        flat_target = target.reshape(n_tokens, -1).repeat(mul, 1)
+        flat_z = z.reshape(n_tokens, -1).repeat(mul, 1)
+        flat_mask = mask.reshape(n_tokens).repeat(mul)
+        return self.diffloss(z=flat_z, target=flat_target, mask=flat_mask)
+    def forward(self, imgs, labels):
+        """Compute the training loss for one batch.
+        Both inputs go through ``patchify``, so despite its name ``labels`` must be a
+        4-D tensor just like ``imgs``. Only tokens of ``imgs`` are randomly masked; the
+        ``labels`` tokens get a mask of zeros, and they are concatenated after the
+        ``imgs`` tokens in both the encoder input and the loss target.
+        NOTE(review): the zero block is prepended to the mask while the tokens are
+        concatenated as [imgs, labels] -- confirm this ordering is intended.
+        :param imgs: tensor of shape [n, c, h, w].
+        :param labels: tensor of shape [n, c, h, w].
+        :return: the scalar diffusion loss.
+        """
+        # class embed
+        # class_embedding = self.class_emb(labels)
+        # patchify and mask (drop) tokens
+        x = self.patchify(imgs)
+        y = self.patchify(labels)
+        gt_latents = torch.cat([x, y], dim=1).clone().detach()
+        orders = self.sample_orders(bsz=y.size(0))
+        mask = self.random_masking(x, orders)
+        # allocate the zero block next to the random mask instead of hard-coding CUDA
+        unmasked = torch.zeros(y.size(0), self.seq_len, device=mask.device, dtype=mask.dtype)
+        mask = torch.cat([unmasked, mask], dim=1)
+        # mask = torch.cat([torch.zeros(y.size(0), self.seq_len), torch.ones(y.size(0), self.seq_len)], dim=1)
+        # mae encoder
+        x = self.forward_mae_encoder(x, mask, y)
+        # mae decoder
+        z = self.forward_mae_decoder(x, mask)
+        # diffloss
+        loss = self.forward_loss(z=z, target=gt_latents, mask=mask)
+        return loss
+    def sample_tokens(self, bsz, num_iter=64, cfg=1.0, cfg_schedule="linear", labels=None, temperature=1.0, progress=False):
+        """Iteratively generate token latents over ``num_iter`` steps and return them unpatchified.
+        Each step encodes/decodes the current tokens, picks the positions to predict
+        according to a cosine masking schedule and the sampled order, and fills them
+        with samples from the diffusion head.
+        NOTE(review): this method reads ``self.class_emb`` and ``self.fake_latent``, which
+        are not assigned in the ``__init__`` shown in this file (only commented out), and
+        it builds a mask of width ``seq_len`` while ``forward`` uses ``2 * seq_len``.
+        It looks like it has not been adapted to the current encoder inputs -- confirm
+        before relying on it.
+        :param bsz: number of samples to generate.
+        :param num_iter: number of generation steps.
+        :param cfg: classifier-free guidance scale; 1.0 disables guidance.
+        :param cfg_schedule: "linear" or "constant"; anything else raises NotImplementedError.
+        :param labels: optional input for ``self.class_emb``.
+        :param temperature: forwarded to ``self.diffloss.sample``.
+        :param progress: wrap the step loop in tqdm.
+        """
+        # init and sample generation orders
+        mask = torch.ones(bsz, self.seq_len).cuda()
+        tokens = torch.zeros(bsz, self.seq_len, self.token_embed_dim).cuda()
+        orders = self.sample_orders(bsz)
+        indices = list(range(num_iter))
+        if progress:
+            indices = tqdm(indices)
+        # generate latents
+        for step in indices:
+            cur_tokens = tokens.clone()
+            # class embedding and CFG
+            if labels is not None:
+                class_embedding = self.class_emb(labels)
+            else:
+                class_embedding = self.fake_latent.repeat(bsz, 1)
+            if not cfg == 1.0:
+                # duplicate the batch: first half conditional, second half unconditional
+                tokens = torch.cat([tokens, tokens], dim=0)
+                class_embedding = torch.cat([class_embedding, self.fake_latent.repeat(bsz, 1)], dim=0)
+                mask = torch.cat([mask, mask], dim=0)
+            # mae encoder
+            x = self.forward_mae_encoder(tokens, mask, class_embedding)
+            # mae decoder
+            z = self.forward_mae_decoder(x, mask)
+            # mask ratio for the next round, following MaskGIT and MAGE.
+            mask_ratio = np.cos(math.pi / 2. * (step + 1) / num_iter)
+            mask_len = torch.Tensor([np.floor(self.seq_len * mask_ratio)]).cuda()
+            # masks out at least one for the next iteration
+            mask_len = torch.maximum(torch.Tensor([1]).cuda(),
+                                     torch.minimum(torch.sum(mask, dim=-1, keepdims=True) - 1, mask_len))
+            # get masking for next iteration and locations to be predicted in this iteration
+            mask_next = mask_by_order(mask_len[0], orders, bsz, self.seq_len)
+            if step >= num_iter - 1:
+                # last step: predict everything that is still masked
+                mask_to_pred = mask[:bsz].bool()
+            else:
+                # predict the positions that are masked now but not in the next round
+                mask_to_pred = torch.logical_xor(mask[:bsz].bool(), mask_next.bool())
+            mask = mask_next
+            if not cfg == 1.0:
+                mask_to_pred = torch.cat([mask_to_pred, mask_to_pred], dim=0)
+            # sample token latents for this step
+            z = z[mask_to_pred.nonzero(as_tuple=True)]
+            # cfg schedule follow Muse
+            if cfg_schedule == "linear":
+                cfg_iter = 1 + (cfg - 1) * (self.seq_len - mask_len[0]) / self.seq_len
+            elif cfg_schedule == "constant":
+                cfg_iter = cfg
+            else:
+                raise NotImplementedError
+            sampled_token_latent = self.diffloss.sample(z, temperature, cfg_iter)
+            if not cfg == 1.0:
+                sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0)  # Remove null class samples
+                mask_to_pred, _ = mask_to_pred.chunk(2, dim=0)
+            cur_tokens[mask_to_pred.nonzero(as_tuple=True)] = sampled_token_latent
+            tokens = cur_tokens.clone()
+        # unpatchify
+        tokens = self.unpatchify(tokens)
+        return tokens
+def mar_base(**kwargs):
+    """Build the base-size MAR (width 768, depth 12, 12 heads for both encoder and decoder)."""
+    width, depth, heads = 768, 12, 12
+    return MAR(
+        encoder_embed_dim=width, encoder_depth=depth, encoder_num_heads=heads,
+        decoder_embed_dim=width, decoder_depth=depth, decoder_num_heads=heads,
+        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
+def mar_large(**kwargs):
+    """Build the large-size MAR (width 1024, depth 16, 16 heads for both encoder and decoder)."""
+    width, depth, heads = 1024, 16, 16
+    return MAR(
+        encoder_embed_dim=width, encoder_depth=depth, encoder_num_heads=heads,
+        decoder_embed_dim=width, decoder_depth=depth, decoder_num_heads=heads,
+        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
+def mar_huge(**kwargs):
+    """Build the huge-size MAR (width 1280, depth 20, 16 heads for both encoder and decoder)."""
+    width, depth, heads = 1280, 20, 16
+    return MAR(
+        encoder_embed_dim=width, encoder_depth=depth, encoder_num_heads=heads,
+        decoder_embed_dim=width, decoder_depth=depth, decoder_num_heads=heads,
+        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)

models/vae.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ldm.modules.diffusionmodules.model import Encoder, Decoder
+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+from ldm.util import instantiate_from_config
+class AutoencoderKL(nn.Module):
+    """KL autoencoder: ``Encoder`` -> ``quant_conv`` -> diagonal Gaussian posterior -> ``post_quant_conv`` -> ``Decoder``."""
+    def __init__(self,
+                 ddconfig,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=(),
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 ):
+        """
+        :param ddconfig: keyword arguments for ``Encoder`` / ``Decoder``; must have a truthy
+                         ``double_z`` and provide ``z_channels``.
+        :param embed_dim: channel count of the latent ``z``.
+        :param ckpt_path: optional checkpoint to restore from (see ``init_from_ckpt``).
+        :param ignore_keys: state-dict key prefixes to drop when restoring.
+        :param image_key: stored on the instance as ``self.image_key``.
+        :param colorize_nlabels: if given, registers a random ``colorize`` buffer of that many labels.
+        :param monitor: if given, stored on the instance as ``self.monitor``.
+        """
+        super().__init__()
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        assert ddconfig["double_z"]
+        # the encoder emits mean and log-variance, hence the factor 2 on both sides
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        if colorize_nlabels is not None:
+            assert isinstance(colorize_nlabels, int)
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+    def init_from_ckpt(self, path, ignore_keys=()):
+        """Load ``state_dict`` from the checkpoint at ``path`` (non-strict), skipping keys that start with any prefix in ``ignore_keys``."""
+        # NOTE(review): torch.load unpickles the file -- only load checkpoints from trusted sources.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        prefixes = tuple(ignore_keys or ())
+        for k in list(sd.keys()):
+            # one test against all prefixes, so a key matching several of them is deleted exactly once
+            if k.startswith(prefixes):
+                print("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def encode(self, x):
+        """Return the ``DiagonalGaussianDistribution`` posterior for ``x``."""
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+    def decode(self, z):
+        """Decode a latent ``z`` back through ``post_quant_conv`` and the decoder."""
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+    def forward(self, input, sample_posterior=True):
+        """Encode, draw ``z`` from the posterior (its mode when ``sample_posterior`` is False), decode; return ``(reconstruction, posterior)``."""
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior

taming/modules/autoencoder/lpips/vgg.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a78928a0af1e5f0fcb1f3b9e8f8c3a2a5a3de244d830ad5c1feddc79b8432868
+size 7289

util/crop.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import numpy as np
+from PIL import Image
+def center_crop_arr(pil_image, image_size):
+    """
+    Center-crop a PIL image to ``image_size`` x ``image_size`` (implementation from ADM).
+    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+    The image is halved with a BOX filter while its shorter side is at least
+    ``2 * image_size``, rescaled with BICUBIC by ``image_size / shorter side``,
+    and finally cropped around the center.
+    """
+    while min(*pil_image.size) >= 2 * image_size:
+        halved = tuple(side // 2 for side in pil_image.size)
+        pil_image = pil_image.resize(halved, resample=Image.BOX)
+    scale = image_size / min(*pil_image.size)
+    rescaled = tuple(round(side * scale) for side in pil_image.size)
+    pil_image = pil_image.resize(rescaled, resample=Image.BICUBIC)
+    arr = np.array(pil_image)
+    top = (arr.shape[0] - image_size) // 2
+    left = (arr.shape[1] - image_size) // 2
+    return Image.fromarray(arr[top: top + image_size, left: left + image_size])

util/download.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import os
+from tqdm import tqdm
+import requests
+def _download_file(url, download_path, description, total_mb, overwrite=False):
+    """Stream ``url`` to ``download_path`` unless the file already exists (or ``overwrite`` is set).
+    The data is written to ``download_path + ".part"`` and renamed only after the
+    transfer finished, so an interrupted run never leaves a truncated file that a
+    later call would mistake for a complete download. HTTP error statuses raise
+    ``requests.HTTPError`` instead of being saved to disk as if they were the model.
+    :param total_mb: expected size in MB, used only for the progress bar.
+    """
+    if os.path.exists(download_path) and not overwrite:
+        return
+    headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
+    os.makedirs(os.path.dirname(download_path), exist_ok=True)
+    tmp_path = download_path + ".part"
+    with requests.get(url, stream=True, headers=headers, timeout=60) as r:
+        r.raise_for_status()
+        print(f"Downloading {description}...")
+        with open(tmp_path, 'wb') as f:
+            for chunk in tqdm(r.iter_content(chunk_size=1024*1024), unit="MB", total=total_mb):
+                if chunk:
+                    f.write(chunk)
+    os.replace(tmp_path, download_path)
+def download_pretrained_vae(overwrite=False):
+    """Download the KL-16 VAE checkpoint to ``pretrained_models/vae/kl16.ckpt``."""
+    _download_file(
+        "https://www.dropbox.com/scl/fi/hhmuvaiacrarfg28qxhwz/kl16.ckpt?rlkey=l44xipsezc8atcffdp4q7mwmh&dl=0",
+        "pretrained_models/vae/kl16.ckpt",
+        "KL-16 VAE",
+        total_mb=254,
+        overwrite=overwrite,
+    )
+def download_pretrained_marb(overwrite=False):
+    """Download the MAR-B checkpoint to ``pretrained_models/mar/mar_base/checkpoint-last.pth``."""
+    _download_file(
+        "https://www.dropbox.com/scl/fi/f6dpuyjb7fudzxcyhvrhk/checkpoint-last.pth?rlkey=a6i4bo71vhfo4anp33n9ukujb&dl=0",
+        "pretrained_models/mar/mar_base/checkpoint-last.pth",
+        "MAR-B",
+        total_mb=1587,
+        overwrite=overwrite,
+    )
+def download_pretrained_marl(overwrite=False):
+    """Download the MAR-L checkpoint to ``pretrained_models/mar/mar_large/checkpoint-last.pth``."""
+    _download_file(
+        "https://www.dropbox.com/scl/fi/pxacc5b2mrt3ifw4cah6k/checkpoint-last.pth?rlkey=m48ovo6g7ivcbosrbdaz0ehqt&dl=0",
+        "pretrained_models/mar/mar_large/checkpoint-last.pth",
+        "MAR-L",
+        total_mb=3650,
+        overwrite=overwrite,
+    )
+def download_pretrained_marh(overwrite=False):
+    """Download the MAR-H checkpoint to ``pretrained_models/mar/mar_huge/checkpoint-last.pth``."""
+    _download_file(
+        "https://www.dropbox.com/scl/fi/1qmfx6fpy3k7j9vcjjs3s/checkpoint-last.pth?rlkey=4lae281yzxb406atp32vzc83o&dl=0",
+        "pretrained_models/mar/mar_huge/checkpoint-last.pth",
+        "MAR-H",
+        total_mb=7191,
+        overwrite=overwrite,
+    )
+# When run as a script, fetch the VAE and all three MAR checkpoints;
+# each call is skipped if its target file already exists.
+if __name__ == "__main__":
+    download_pretrained_vae()
+    download_pretrained_marb()
+    download_pretrained_marl()
+    download_pretrained_marh()

util/loader.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+import numpy as np
+import torch
+import torchvision.datasets as datasets
+class ImageFolderWithFilename(datasets.ImageFolder):
+    """``ImageFolder`` variant whose items also carry the file's ``<parent dir>/<file name>`` path."""
+    def __getitem__(self, index: int):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (sample, target, filename) where filename is the last two
+            components of the sample's path.
+        """
+        path, target = self.samples[index]
+        sample = self.loader(path)
+        if self.transform is not None:
+            sample = self.transform(sample)
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+        tail_parts = path.split(os.path.sep)[-2:]
+        return sample, target, os.path.join(*tail_parts)
+class CachedFolder(datasets.DatasetFolder):
+    """Dataset over ``.npz`` files that each hold the arrays ``moments`` and ``moments_flip``."""
+    def __init__(
+            self,
+            root: str,
+    ):
+        super().__init__(
+            root,
+            loader=None,
+            extensions=(".npz",),
+        )
+    def __getitem__(self, index: int):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (moments, target), where moments is picked at random (50/50)
+            between the stored ``moments`` and ``moments_flip`` arrays.
+        """
+        path, target = self.samples[index]
+        key = 'moments' if torch.rand(1) < 0.5 else 'moments_flip'  # randomly hflip
+        # np.load on an .npz returns a lazy archive that keeps the file open;
+        # read the one array we need and close it so handles do not accumulate.
+        with np.load(path) as data:
+            moments = data[key]
+        return moments, target

util/lr_sched.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import math
+def adjust_learning_rate(optimizer, epoch, args):
+    """Set the learning rate for ``epoch`` on every param group and return it.
+    During the first ``args.warmup_epochs`` the rate grows linearly from 0 to
+    ``args.lr``. Afterwards ``args.lr_schedule`` selects either a constant
+    ``args.lr`` or a half-cycle cosine decay towards ``args.min_lr`` that
+    bottoms out at ``args.epochs``. A param group that has an ``"lr_scale"``
+    entry receives ``lr * lr_scale``.
+    Raises:
+        NotImplementedError: past warmup with an unrecognised ``args.lr_schedule``.
+    """
+    warmup = args.warmup_epochs
+    if epoch < warmup:
+        lr = args.lr * epoch / warmup
+    elif args.lr_schedule == "constant":
+        lr = args.lr
+    elif args.lr_schedule == "cosine":
+        elapsed = epoch - warmup
+        span = args.epochs - warmup
+        cosine_factor = 1. + math.cos(math.pi * elapsed / span)
+        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * cosine_factor
+    else:
+        raise NotImplementedError
+    for group in optimizer.param_groups:
+        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
+    return lr

util/misc.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import builtins
+import datetime
+import os
+import time
+from collections import defaultdict, deque
+from pathlib import Path
+import torch
+import torch.distributed as dist
+# Parse the installed torch version so `inf` can be imported from the right
+# place: torch._six on torch < 1.8, torch itself otherwise.
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
+    from torch._six import inf
+else:
+    from torch import inf
+import copy
+class SmoothedValue(object):
+    """Running statistics for a scalar series.
+    Keeps the last ``window_size`` values for windowed statistics (median,
+    avg, max, value) plus a running total and count for the global average.
+    """
+    def __init__(self, window_size=20, fmt=None):
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
+    def update(self, value, n=1):
+        """Record ``value`` once in the window and ``n`` times in the totals."""
+        self.deque.append(value)
+        self.total += value * n
+        self.count += n
+    def synchronize_between_processes(self):
+        """
+        Sum count and total across processes (no-op outside distributed runs).
+        Warning: does not synchronize the deque!
+        """
+        if is_dist_avail_and_initialized():
+            packed = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+            dist.barrier()
+            dist.all_reduce(packed)
+            summed_count, summed_total = packed.tolist()
+            self.count = int(summed_count)
+            self.total = summed_total
+    @property
+    def median(self):
+        """Median of the windowed values."""
+        window = torch.tensor(list(self.deque))
+        return window.median().item()
+    @property
+    def avg(self):
+        """Mean of the windowed values, computed in float32."""
+        window = torch.tensor(list(self.deque), dtype=torch.float32)
+        return window.mean().item()
+    @property
+    def global_avg(self):
+        """Mean over everything ever recorded (total / count)."""
+        return self.total / self.count
+    @property
+    def max(self):
+        """Largest value currently in the window."""
+        return max(self.deque)
+    @property
+    def value(self):
+        """Most recently recorded value."""
+        return self.deque[-1]
+    def __str__(self):
+        stats = {
+            'median': self.median,
+            'avg': self.avg,
+            'global_avg': self.global_avg,
+            'max': self.max,
+            'value': self.value,
+        }
+        return self.fmt.format(**stats)
+class MetricLogger(object):
+    """Collection of named SmoothedValue meters with periodic progress printing.
+    Meters are created on first update and can be read as attributes
+    (``logger.loss`` returns the meter stored under ``"loss"``).
+    """
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+    def update(self, **kwargs):
+        """Feed one value per keyword into the meter of the same name.
+        ``None`` values are skipped; tensors are converted with ``.item()``.
+        """
+        for k, v in kwargs.items():
+            if v is None:
+                continue
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+    def __getattr__(self, attr):
+        # __getattr__ only runs after normal attribute lookup has failed, so
+        # real instance attributes never reach this point. Read `meters`
+        # through __dict__: going through `self.meters` would re-enter
+        # __getattr__ and recurse forever whenever `meters` is not set yet
+        # (e.g. while the object is being copied or unpickled).
+        meters = self.__dict__.get('meters')
+        if meters is not None and attr in meters:
+            return meters[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+    def __str__(self):
+        return self.delimiter.join(
+            "{}: {}".format(name, str(meter))
+            for name, meter in self.meters.items()
+        )
+    def synchronize_between_processes(self):
+        """Synchronize every meter across processes."""
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+    def add_meter(self, name, meter):
+        """Register ``meter`` under ``name``, replacing any existing one."""
+        self.meters[name] = meter
+    def log_every(self, iterable, print_freq, header=None):
+        """Yield the items of ``iterable`` while printing progress.
+        A status line (ETA, all meters, iteration and data-loading time, and
+        peak CUDA memory when CUDA is available) is printed every
+        ``print_freq`` items and on the last one; a total-time summary is
+        printed once the iterable is exhausted.
+        Args:
+            iterable: sized iterable (``len()`` is required).
+            print_freq (int): number of items between status lines.
+            header (str, optional): prefix for every printed line.
+        """
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        num_items = len(iterable)
+        use_cuda = torch.cuda.is_available()
+        space_fmt = ':' + str(len(str(num_items))) + 'd'
+        log_msg = [
+            header,
+            '[{0' + space_fmt + '}/{1}]',
+            'eta: {eta}',
+            '{meters}',
+            'time: {time}',
+            'data: {data}'
+        ]
+        if use_cuda:
+            log_msg.append('max mem: {memory:.0f}')
+        log_msg = self.delimiter.join(log_msg)
+        MB = 1024.0 * 1024.0
+        for i, obj in enumerate(iterable):
+            # Time spent waiting for the item vs. time for the full iteration
+            # (the consumer's work happens while suspended at the yield).
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == num_items - 1:
+                eta_seconds = iter_time.global_avg * (num_items - i)
+                fields = dict(
+                    eta=str(datetime.timedelta(seconds=int(eta_seconds))),
+                    meters=str(self),
+                    time=str(iter_time),
+                    data=str(data_time),
+                )
+                if use_cuda:
+                    fields['memory'] = torch.cuda.max_memory_allocated() / MB
+                print(log_msg.format(i, num_items, **fields))
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        # An empty iterable would otherwise raise ZeroDivisionError here.
+        seconds_per_item = total_time / num_items if num_items else 0.0
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, seconds_per_item))
+def setup_for_distributed(is_master):
+    """
+    Replace builtins.print with a wrapper that is silent on non-master
+    processes. Output is still produced when the caller passes force=True or
+    when the world size exceeds 8; every printed line gets a time-stamp prefix.
+    """
+    original_print = builtins.print
+    def print(*args, **kwargs):
+        forced = kwargs.pop('force', False) or (get_world_size() > 8)
+        if not (is_master or forced):
+            return
+        stamp = datetime.datetime.now().time()
+        original_print('[{}] '.format(stamp), end='')  # print with time stamp
+        original_print(*args, **kwargs)
+    builtins.print = print
+def is_dist_avail_and_initialized():
+    """Return True only if torch.distributed is available and initialized."""
+    return dist.is_available() and dist.is_initialized()
+def get_world_size():
+    """Return the distributed world size, or 1 when not running distributed."""
+    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
+def get_rank():
+    """Return this process's distributed rank, or 0 when not running distributed."""
+    return dist.get_rank() if is_dist_avail_and_initialized() else 0
+def is_main_process():
+    """Return True on the rank-0 process (always True outside distributed runs)."""
+    rank = get_rank()
+    return rank == 0
+def save_on_master(*args, **kwargs):
+    """Forward the arguments to torch.save, but only on the main process."""
+    if not is_main_process():
+        return
+    torch.save(*args, **kwargs)
+def init_distributed_mode(args):
+    """Populate the distributed fields of ``args`` and start the process group.
+    The launch environment is detected in this order:
+    1. ``args.dist_on_itp``: rank, world size and local rank come from the
+       ``OMPI_COMM_WORLD_*`` variables, ``args.dist_url`` is built from
+       ``MASTER_ADDR``/``MASTER_PORT``, and ``LOCAL_RANK``/``RANK``/
+       ``WORLD_SIZE`` are exported to the environment.
+    2. ``RANK`` and ``WORLD_SIZE`` present: read them plus ``LOCAL_RANK``.
+    3. ``SLURM_PROCID`` present: rank from it, GPU as rank modulo the local
+       device count (``args.world_size`` is used as already set on ``args``).
+    4. Otherwise: print a notice, set ``args.distributed = False`` and return.
+    In cases 1-3 ``args.dist_url`` must be usable by the time the process
+    group is created; the backend is always ``nccl``.
+    """
+    env = os.environ
+    if args.dist_on_itp:
+        args.rank = int(env['OMPI_COMM_WORLD_RANK'])
+        args.world_size = int(env['OMPI_COMM_WORLD_SIZE'])
+        args.gpu = int(env['OMPI_COMM_WORLD_LOCAL_RANK'])
+        args.dist_url = "tcp://%s:%s" % (env['MASTER_ADDR'], env['MASTER_PORT'])
+        env['LOCAL_RANK'] = str(args.gpu)
+        env['RANK'] = str(args.rank)
+        env['WORLD_SIZE'] = str(args.world_size)
+        # ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
+    elif 'RANK' in env and 'WORLD_SIZE' in env:
+        args.rank = int(env["RANK"])
+        args.world_size = int(env['WORLD_SIZE'])
+        args.gpu = int(env['LOCAL_RANK'])
+    elif 'SLURM_PROCID' in env:
+        args.rank = int(env['SLURM_PROCID'])
+        args.gpu = args.rank % torch.cuda.device_count()
+    else:
+        print('Not using distributed mode')
+        setup_for_distributed(is_master=True)  # hack
+        args.distributed = False
+        return
+    args.distributed = True
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}, gpu {}'.format(
+        args.rank, args.dist_url, args.gpu), flush=True)
+    dist.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+    )
+    # Wait until every rank has joined before silencing non-master printing.
+    dist.barrier()
+    setup_for_distributed(args.rank == 0)
+class NativeScalerWithGradNormCount:
+    """Wrapper around ``torch.cuda.amp.GradScaler`` that runs backward, optional
+    gradient clipping and the optimizer step, and reports the gradient norm.
+    """
+    state_dict_key = "amp_scaler"
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
+        """Backpropagate the scaled loss and optionally step the optimizer.
+        Args:
+            loss: scalar loss tensor.
+            optimizer: optimizer whose gradients are unscaled and stepped.
+            clip_grad: max gradient norm; ``None`` disables clipping.
+            parameters: parameters used for clipping / norm measurement.
+                Required when ``clip_grad`` is given.
+            create_graph: forwarded to ``backward``.
+            update_grad: when False only backward runs (no unscale, no step),
+                e.g. for gradient accumulation.
+        Returns:
+            The gradient norm (measured before clipping when clipping is on),
+            or ``None`` when ``update_grad`` is False or no ``parameters``
+            were supplied to measure.
+        """
+        self._scaler.scale(loss).backward(create_graph=create_graph)
+        if not update_grad:
+            return None
+        if clip_grad is not None:
+            assert parameters is not None
+            self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
+            norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+        else:
+            self._scaler.unscale_(optimizer)
+            # With the default parameters=None there is nothing to measure;
+            # previously this path crashed trying to iterate over None.
+            norm = get_grad_norm_(parameters) if parameters is not None else None
+        self._scaler.step(optimizer)
+        self._scaler.update()
+        return norm
+    def state_dict(self):
+        """Return the wrapped scaler's state dict."""
+        return self._scaler.state_dict()
+    def load_state_dict(self, state_dict):
+        """Restore the wrapped scaler from ``state_dict``."""
+        self._scaler.load_state_dict(state_dict)
+def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
+    """Return the total ``norm_type``-norm of the gradients of ``parameters``.
+    Args:
+        parameters: a single tensor or an iterable of tensors; entries whose
+            ``.grad`` is None are ignored.
+        norm_type: order of the norm; ``inf`` gives the largest absolute
+            gradient entry.
+    Returns:
+        A scalar tensor on the device of the first gradient, or a CPU
+        ``tensor(0.)`` when no parameter has a gradient.
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    grads = [p.grad.detach() for p in parameters if p.grad is not None]
+    norm_type = float(norm_type)
+    if not grads:
+        return torch.tensor(0.)
+    device = grads[0].device
+    if norm_type == inf:
+        return max(g.abs().max().to(device) for g in grads)
+    # Norm of the per-parameter norms equals the norm over all entries.
+    per_param_norms = torch.stack([torch.norm(g, norm_type).to(device) for g in grads])
+    return torch.norm(per_param_norms, norm_type)
+def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
+    """Split the trainable parameters of ``model`` into two optimizer groups.
+    A parameter goes into the zero-weight-decay group when it is 1-D, its name
+    ends with ``.bias``, its name is in ``skip_list``, or its name contains
+    ``diffloss``; every other trainable parameter gets ``weight_decay``.
+    Parameters with ``requires_grad=False`` are left out entirely.
+    Returns:
+        list: ``[no-decay group, decay group]`` as optimizer param-group dicts.
+    """
+    with_decay, without_decay = [], []
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            exempt = (
+                len(param.shape) == 1
+                or name.endswith(".bias")
+                or name in skip_list
+                or 'diffloss' in name
+            )
+            # no weight decay on bias, norm and diffloss
+            (without_decay if exempt else with_decay).append(param)
+    return [
+        {'params': without_decay, 'weight_decay': 0.},
+        {'params': with_decay, 'weight_decay': weight_decay}]
+def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler, ema_params=None, epoch_name=None):
+    """Write a training checkpoint to ``args.output_dir`` (main process only).
+    The file is named ``checkpoint-<epoch_name>.pth``, with ``epoch_name``
+    defaulting to ``str(epoch)``. It stores the model and optimizer state,
+    the epoch, the loss-scaler state, ``args``, and under ``model_ema`` either
+    None or a copy of the model state dict whose parameter entries are
+    replaced by ``ema_params``. ``ema_params`` is indexed in the order of
+    ``model_without_ddp.named_parameters()``.
+    """
+    tag = str(epoch) if epoch_name is None else epoch_name
+    checkpoint_path = Path(args.output_dir) / ('checkpoint-%s.pth' % tag)
+    # ema
+    ema_state_dict = None
+    if ema_params is not None:
+        ema_state_dict = copy.deepcopy(model_without_ddp.state_dict())
+        for idx, (param_name, _unused) in enumerate(model_without_ddp.named_parameters()):
+            assert param_name in ema_state_dict
+            ema_state_dict[param_name] = ema_params[idx]
+    payload = {
+        'model': model_without_ddp.state_dict(),
+        'model_ema': ema_state_dict,
+        'optimizer': optimizer.state_dict(),
+        'epoch': epoch,
+        'scaler': loss_scaler.state_dict(),
+        'args': args,
+    }
+    save_on_master(payload, checkpoint_path)
+def all_reduce_mean(x):
+    """Return the mean of ``x`` over all processes.
+    With a world size of 1 the value is returned untouched; otherwise it is
+    moved to a CUDA tensor, all-reduced, divided by the world size and
+    returned as a Python number.
+    """
+    world_size = get_world_size()
+    if world_size <= 1:
+        return x
+    reduced = torch.tensor(x).cuda()
+    dist.all_reduce(reduced)
+    reduced /= world_size
+    return reduced.item()