NoteDance committed
Commit e07f172 · verified · 1 Parent(s): 7cc8696

Upload DiT.py

Files changed (1)
  1. DiT.py +440 -0
DiT.py ADDED
@@ -0,0 +1,440 @@
+ import tensorflow as tf
+ from tensorflow.keras.layers import Conv2D, Dense, Dropout, LayerNormalization, Activation
+ from tensorflow.keras.initializers import RandomNormal
+ from tensorflow.keras import Model
+ import collections.abc
+ from itertools import repeat
+ from typing import Optional
+ import numpy as np
+ import math
+
+
+ def modulate(x, shift, scale):
+     return x * (1 + tf.expand_dims(scale, 1)) + tf.expand_dims(shift, 1)
+
+
+ #################################################################################
+ #               Embedding Layers for Timesteps and Class Labels                 #
+ #################################################################################
+
+ class TimestepEmbedder:
+     """
+     Embeds scalar timesteps into vector representations.
+     """
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         self.mlp = tf.keras.Sequential()
+         self.mlp.add(Dense(hidden_size, kernel_initializer=RandomNormal(stddev=0.02), use_bias=True))
+         self.mlp.add(Activation('silu'))
+         self.mlp.add(Dense(hidden_size, kernel_initializer=RandomNormal(stddev=0.02), use_bias=True))
+         self.frequency_embedding_size = frequency_embedding_size
+
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         """
+         Create sinusoidal timestep embeddings.
+         :param t: a 1-D Tensor of N indices, one per batch element.
+                   These may be fractional.
+         :param dim: the dimension of the output.
+         :param max_period: controls the minimum frequency of the embeddings.
+         :return: an (N, D) Tensor of positional embeddings.
+         """
+         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+         half = dim // 2
+         freqs = tf.math.exp(
+             -math.log(max_period) * tf.range(start=0, limit=half, dtype=tf.float32) / half
+         )
+         args = tf.cast(t[:, None], 'float32') * freqs[None]
+         embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], axis=-1)
+         if dim % 2:
+             embedding = tf.concat([embedding, tf.zeros_like(embedding[:, :1])], axis=-1)
+         return embedding
+
+     def __call__(self, t):
+         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+         t_emb = self.mlp(t_freq)
+         return t_emb
+
+
+ class LabelEmbedder:
+     """
+     Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+     """
+     def __init__(self, num_classes, hidden_size, dropout_prob):
+         use_cfg_embedding = dropout_prob > 0
+         self.embedding_table = tf.Variable(tf.random.normal((num_classes + use_cfg_embedding, hidden_size), stddev=0.02))
+         self.num_classes = num_classes
+         self.dropout_prob = dropout_prob
+
+     def token_drop(self, labels, force_drop_ids=None):
+         """
+         Drops labels to enable classifier-free guidance.
+         """
+         if force_drop_ids is None:
+             drop_ids = tf.random.uniform([labels.shape[0]]) < self.dropout_prob
+         else:
+             drop_ids = force_drop_ids == 1
+         labels = tf.where(drop_ids, self.num_classes, labels)
+         return labels
+
+     def __call__(self, labels, train, force_drop_ids=None):
+         use_dropout = self.dropout_prob > 0
+         if (train and use_dropout) or (force_drop_ids is not None):
+             labels = self.token_drop(labels, force_drop_ids)
+         embeddings = tf.gather(self.embedding_table, labels)
+         return embeddings
+
+
+ #################################################################################
+ #                                 Core DiT Model                                #
+ #################################################################################
+
+ class DiTBlock:
+     """
+     A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
+     """
+     def __init__(self, hidden_size, num_heads, mlp_ratio=4.0):
+         self.norm1 = LayerNormalization(epsilon=1e-6)
+         self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True)
+         self.norm2 = LayerNormalization(epsilon=1e-6)
+         mlp_hidden_dim = int(hidden_size * mlp_ratio)
+         self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, drop=0)
+         self.adaLN_modulation = tf.keras.Sequential()
+         self.adaLN_modulation.add(Activation('silu'))
+         self.adaLN_modulation.add(Dense(6 * hidden_size, kernel_initializer='zeros', use_bias=True))
+
+     def __call__(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = tf.split(self.adaLN_modulation(c), num_or_size_splits=6, axis=1)
+         x = x + tf.expand_dims(gate_msa, 1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
+         x = x + tf.expand_dims(gate_mlp, 1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+         return x
+
+
+ class FinalLayer:
+     """
+     The final layer of DiT.
+     """
+     def __init__(self, hidden_size, patch_size, out_channels):
+         self.norm_final = LayerNormalization(epsilon=1e-6)
+         self.linear = Dense(patch_size * patch_size * out_channels, kernel_initializer='zeros', use_bias=True)
+         self.adaLN_modulation = tf.keras.Sequential()
+         self.adaLN_modulation.add(Activation('silu'))
+         self.adaLN_modulation.add(Dense(2 * hidden_size, kernel_initializer='zeros', use_bias=True))
+
+     def __call__(self, x, c):
+         shift, scale = tf.split(self.adaLN_modulation(c), num_or_size_splits=2, axis=1)
+         x = modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         return x
+
+
+ class DiT(Model):
+     """
+     Diffusion model with a Transformer backbone.
+     """
+     def __init__(
+         self,
+         input_size=32,
+         patch_size=2,
+         in_channels=4,
+         hidden_size=1152,
+         depth=28,
+         num_heads=16,
+         mlp_ratio=4.0,
+         class_dropout_prob=0.1,
+         num_classes=1000,
+         learn_sigma=True,
+     ):
+         super(DiT, self).__init__()
+         self.learn_sigma = learn_sigma
+         self.in_channels = in_channels
+         self.out_channels = in_channels * 2 if learn_sigma else in_channels
+         self.patch_size = patch_size
+         self.num_heads = num_heads
+
+         self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
+         self.t_embedder = TimestepEmbedder(hidden_size)
+         self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
+         num_patches = self.x_embedder.num_patches
+         # Will use fixed sin-cos embedding:
+         self.pos_embed = tf.zeros((1, num_patches, hidden_size))
+
+         self.blocks = [
+             DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
+         ]
+         self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+         self.initialize_weights()
+
+     def initialize_weights(self):
+         # Initialize (and freeze) pos_embed by sin-cos embedding:
+         pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
+         self.pos_embed = tf.Variable(
+             tf.convert_to_tensor(pos_embed, dtype=tf.float32)[tf.newaxis, :], trainable=False)
+
+     def unpatchify(self, x):
+         """
+         x: (N, T, patch_size**2 * C)
+         imgs: (N, H, W, C)
+         """
+         c = self.out_channels
+         p = self.x_embedder.patch_size[0]
+         h = w = int(x.shape[1] ** 0.5)
+         assert h * w == x.shape[1]
+
+         x = tf.reshape(x, (x.shape[0], h, w, p, p, c))
+         # rearrange patches back into an image in channels-last (NHWC) layout
+         x = tf.einsum('nhwpqc->nhpwqc', x)
+         imgs = tf.reshape(x, (x.shape[0], h * p, w * p, c))
+         return imgs
+
+     def __call__(self, x, t, y, training=False):
+         """
+         Forward pass of DiT.
+         x: (N, H, W, C) tensor of spatial inputs (images or latent representations of images)
+         t: (N,) tensor of diffusion timesteps
+         y: (N,) tensor of class labels
+         training: enables label dropout for classifier-free guidance (passed explicitly,
+                   since tf.keras layers keep no persistent training attribute)
+         """
+         x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+         t = self.t_embedder(t)                   # (N, D)
+         y = self.y_embedder(y, training)         # (N, D)
+         c = t + y                                # (N, D)
+         for block in self.blocks:
+             x = block(x, c)                      # (N, T, D)
+         x = self.final_layer(x, c)               # (N, T, patch_size ** 2 * out_channels)
+         x = self.unpatchify(x)                   # (N, H, W, out_channels)
+         return x
+
+     def forward_with_cfg(self, x, t, y, cfg_scale):
+         """
+         Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.
+         """
+         # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+         half = x[: len(x) // 2]
+         combined = tf.concat([half, half], axis=0)
+         model_out = self(combined, t, y)
+         # For exact reproducibility reasons, we apply classifier-free guidance on only
+         # three channels by default. The standard approach to cfg applies it to all channels.
+         # This can be done by uncommenting the following line and commenting-out the line following that.
+         # eps, rest = model_out[..., :self.in_channels], model_out[..., self.in_channels:]
+         eps, rest = model_out[..., :3], model_out[..., 3:]
+         cond_eps, uncond_eps = tf.split(eps, 2, axis=0)
+         half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+         eps = tf.concat([half_eps, half_eps], axis=0)
+         return tf.concat([eps, rest], axis=-1)
+
+
+ #################################################################################
+ #                   Sine/Cosine Positional Embedding Functions                  #
+ #################################################################################
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+     """
+     grid_size: int of the grid height and width
+     return:
+     pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+     """
+     grid_h = np.arange(grid_size, dtype=np.float32)
+     grid_w = np.arange(grid_size, dtype=np.float32)
+     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+     grid = np.stack(grid, axis=0)
+
+     grid = grid.reshape([2, 1, grid_size, grid_size])
+     pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+     if cls_token and extra_tokens > 0:
+         pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+     return pos_embed
+
+
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+     assert embed_dim % 2 == 0
+
+     # use half of dimensions to encode grid_h
+     emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+     emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+
+     emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+     return emb
+
+
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+     """
+     embed_dim: output dimension for each position
+     pos: a list of positions to be encoded: size (M,)
+     out: (M, D)
+     """
+     assert embed_dim % 2 == 0
+     omega = np.arange(embed_dim // 2, dtype=np.float64)
+     omega /= embed_dim / 2.
+     omega = 1. / 10000**omega  # (D/2,)
+
+     pos = pos.reshape(-1)  # (M,)
+     out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+     emb_sin = np.sin(out)  # (M, D/2)
+     emb_cos = np.cos(out)  # (M, D/2)
+
+     emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+     return emb
+
+
+ #################################################################################
+ #                                   DiT Configs                                 #
+ #################################################################################
+
+ def DiT_XL_2():
+     return DiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16)
+
+ def DiT_XL_4():
+     return DiT(depth=28, hidden_size=1152, patch_size=4, num_heads=16)
+
+ def DiT_XL_8():
+     return DiT(depth=28, hidden_size=1152, patch_size=8, num_heads=16)
+
+ def DiT_L_2():
+     return DiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16)
+
+ def DiT_L_4():
+     return DiT(depth=24, hidden_size=1024, patch_size=4, num_heads=16)
+
+ def DiT_L_8():
+     return DiT(depth=24, hidden_size=1024, patch_size=8, num_heads=16)
+
+ def DiT_B_2():
+     return DiT(depth=12, hidden_size=768, patch_size=2, num_heads=12)
+
+ def DiT_B_4():
+     return DiT(depth=12, hidden_size=768, patch_size=4, num_heads=12)
+
+ def DiT_B_8():
+     return DiT(depth=12, hidden_size=768, patch_size=8, num_heads=12)
+
+ def DiT_S_2():
+     return DiT(depth=12, hidden_size=384, patch_size=2, num_heads=6)
+
+ def DiT_S_4():
+     return DiT(depth=12, hidden_size=384, patch_size=4, num_heads=6)
+
+ def DiT_S_8():
+     return DiT(depth=12, hidden_size=384, patch_size=8, num_heads=6)
+
+
+ DiT_models = {
+     'DiT-XL/2': DiT_XL_2, 'DiT-XL/4': DiT_XL_4, 'DiT-XL/8': DiT_XL_8,
+     'DiT-L/2':  DiT_L_2,  'DiT-L/4':  DiT_L_4,  'DiT-L/8':  DiT_L_8,
+     'DiT-B/2':  DiT_B_2,  'DiT-B/4':  DiT_B_4,  'DiT-B/8':  DiT_B_8,
+     'DiT-S/2':  DiT_S_2,  'DiT-S/4':  DiT_S_4,  'DiT-S/8':  DiT_S_8,
+ }
+
+ def _ntuple(n):
+     def parse(x):
+         if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+             return tuple(x)
+         return tuple(repeat(x, n))
+     return parse
+
+
+ to_2tuple = _ntuple(2)
+
+
+ class PatchEmbed:
+     """ 2D Image to Patch Embedding
+     """
+     def __init__(
+         self,
+         img_size: Optional[int] = 224,
+         patch_size: int = 16,
+         in_chans: int = 3,
+         embed_dim: int = 768,
+         flatten: bool = True,
+         bias: bool = True,
+     ):
+         self.patch_size = to_2tuple(patch_size)
+         if img_size is not None:
+             self.img_size = to_2tuple(img_size)
+             self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
+             self.num_patches = self.grid_size[0] * self.grid_size[1]
+         else:
+             self.img_size = None
+             self.grid_size = None
+             self.num_patches = None
+
+         # flatten spatial dims and transpose to channels last, kept for bwd compat
+         self.flatten = flatten
+
+         self.proj = Conv2D(embed_dim, kernel_size=patch_size, strides=patch_size, use_bias=bias)
+
+     def __call__(self, x):
+         x = self.proj(x)
+         B, H, W, C = x.shape
+         if self.flatten:
+             x = tf.reshape(x, [B, H * W, C])  # NHWC -> NLC
+         return x
+
+
+ class Mlp:
+     """ MLP as used in Vision Transformer, MLP-Mixer and related networks
+     """
+     def __init__(
+         self,
+         in_features,
+         hidden_features=None,
+         out_features=None,
+         act_layer=tf.nn.gelu,
+         norm_layer=None,
+         bias=True,
+         drop=0.,
+         use_conv=False,
+     ):
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         bias = to_2tuple(bias)
+         drop_probs = to_2tuple(drop)
+
+         self.fc1 = Dense(hidden_features, use_bias=bias[0])
+         self.act = act_layer
+         self.drop1 = Dropout(drop_probs[0])
+         self.fc2 = Dense(out_features, use_bias=bias[1])
+         self.drop2 = Dropout(drop_probs[1])
+
+     def __call__(self, x):
+         x = self.fc1(x)
+         x = self.act(x, approximate=True)  # tanh approximation of GELU
+         x = self.drop1(x)
+         x = self.fc2(x)
+         x = self.drop2(x)
+         return x
+
+
+ class Attention:
+     def __init__(
+         self,
+         dim: int,
+         num_heads: int = 8,
+         qkv_bias: bool = False,
+         attn_drop: float = 0.,
+         proj_drop: float = 0.,
+     ):
+         assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+         self.num_heads = num_heads
+         self.head_dim = dim // num_heads
+         self.scale = self.head_dim ** -0.5
+
+         self.qkv = Dense(dim * 3, use_bias=qkv_bias)
+         self.attn_drop = Dropout(attn_drop)
+         self.proj = Dense(dim)
+         self.proj_drop = Dropout(proj_drop)
+
+     def __call__(self, x):
+         B, N, C = x.shape
+         qkv = tf.transpose(tf.reshape(self.qkv(x), (B, N, 3, self.num_heads, self.head_dim)), (2, 0, 3, 1, 4))
+         q, k, v = tf.unstack(qkv)
+
+         q = q * self.scale
+         attn = tf.matmul(q, tf.transpose(k, (0, 1, 3, 2)))
+         attn = tf.nn.softmax(attn)
+         attn = self.attn_drop(attn)
+         x = tf.matmul(attn, v)
+
+         x = tf.reshape(tf.transpose(x, (0, 2, 1, 3)), (B, N, C))
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
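
For reference, a minimal smoke test of the uploaded module might look like the sketch below. It is not part of the commit: it assumes the file is importable as DiT (matching the filename above) and runs eagerly on dummy data; the batch size and the integer ranges for timesteps and labels are illustrative only.

import tensorflow as tf
from DiT import DiT_models   # assumes this commit's DiT.py is on the import path

# Smallest config: DiT-S/2 with the defaults input_size=32, in_channels=4, num_classes=1000.
model = DiT_models['DiT-S/2']()

# Dummy latents in channels-last (NHWC) layout, as expected by PatchEmbed/unpatchify.
x = tf.random.normal((2, 32, 32, 4))                      # (N, H, W, C)
t = tf.random.uniform((2,), maxval=1000, dtype=tf.int32)  # diffusion timesteps
y = tf.random.uniform((2,), maxval=1000, dtype=tf.int32)  # class labels

out = model(x, t, y)   # (N, H, W, out_channels); out_channels = 8 when learn_sigma=True
print(out.shape)       # expected: (2, 32, 32, 8)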