KublaiKhan1 committed on
Commit fef4fe8 · verified · 1 Parent(s): 5d3d16d

Upload folder using huggingface_hub

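The commit message indicates the files were pushed with the huggingface_hub client. A minimal sketch of how such a commit is typically produced, assuming a hypothetical repo id and local folder path (neither is named on this page):

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path="./registers",              # hypothetical local path
        path_in_repo="registers",
        repo_id="KublaiKhan1/example-repo",     # hypothetical repo id
        commit_message="Upload folder using huggingface_hub",
    )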
.gitattributes CHANGED
@@ -95,3 +95,4 @@ dt_nolog/810001.tmp filter=lfs diff=lfs merge=lfs -text
  1e-4_no_sampling/810001.tmp filter=lfs diff=lfs merge=lfs -text
  1e-6_no_sampling/810001.tmp filter=lfs diff=lfs merge=lfs -text
  2e-5_no_sampling/810001.tmp filter=lfs diff=lfs merge=lfs -text
+ registers/810001.tmp filter=lfs diff=lfs merge=lfs -text
registers/810001.tmp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec4eb6c89197aa6f922b93395cea6c24714d1c38b4da1c9244f5331726ec57ff
+ size 2097911357
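The three lines above are a Git LFS pointer rather than the data itself: the oid is the SHA-256 of the real blob, and the size field puts it at roughly 2.1 GB. A minimal sketch of resolving the pointer to the actual file with huggingface_hub, assuming a hypothetical repo id:

    from huggingface_hub import hf_hub_download

    local_path = hf_hub_download(
        repo_id="KublaiKhan1/example-repo",   # hypothetical repo id
        filename="registers/810001.tmp",      # LFS pointer resolved to the ~2.1 GB blob
    )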
registers/log.txt ADDED
The diff for this file is too large to render. See raw diff
 
registers/model.py ADDED
@@ -0,0 +1,529 @@
+ import math
+ from typing import Any, Callable, Optional, Tuple, Type, Sequence, Union
+ import flax.linen as nn
+ import jax
+ import jax.numpy as jnp
+ from einops import rearrange, repeat
+
+ from flax import nnx
+
+ Array = Any
+ PRNGKey = Any
+ Shape = Tuple[int, ...]
+ Dtype = Any
+
+ from math_utils import get_2d_sincos_pos_embed, modulate
+ from jax._src import core
+ from jax._src import dtypes
+ from jax._src.nn.initializers import _compute_fans
+
+ def xavier_uniform_pytorchlike():
+     def init(key, shape, dtype):
+         dtype = dtypes.canonicalize_dtype(dtype)
+         #named_shape = core.as_named_shape(shape)
+         if len(shape) == 2: # Dense, [in, out]
+             fan_in = shape[0]
+             fan_out = shape[1]
+         elif len(shape) == 4: # Conv, [k, k, in, out]. Assumes patch-embed style conv.
+             fan_in = shape[0] * shape[1] * shape[2]
+             fan_out = shape[3]
+         else:
+             raise ValueError(f"Invalid shape {shape}")
+
+         variance = 2 / (fan_in + fan_out)
+         scale = jnp.sqrt(3 * variance)
+         param = jax.random.uniform(key, shape, dtype, -1) * scale
+
+         return param
+     return init
+
+
+ class TrainConfig:
+     def __init__(self, dtype):
+         self.dtype = dtype
+     def kern_init(self, name='default', zero=False):
+         if zero or 'bias' in name:
+             return nn.initializers.constant(0)
+         return xavier_uniform_pytorchlike()
+     def default_config(self):
+         return {
+             'kernel_init': self.kern_init(),
+             'bias_init': self.kern_init('bias', zero=True),
+             'dtype': self.dtype,
+         }
+
+ class TimestepEmbedder(nn.Module):
+     """
+     Embeds scalar timesteps into vector representations.
+     """
+     hidden_size: int
+     tc: TrainConfig
+     frequency_embedding_size: int = 256
+
+     @nn.compact
+     def __call__(self, t):
+         x = self.timestep_embedding(t)
+         x = nn.Dense(self.hidden_size, kernel_init=nn.initializers.normal(0.02),
+                      bias_init=self.tc.kern_init('time_bias'), dtype=self.tc.dtype)(x)
+         x = nn.silu(x)
+         x = nn.Dense(self.hidden_size, kernel_init=nn.initializers.normal(0.02),
+                      bias_init=self.tc.kern_init('time_bias'))(x)
+         return x
+
+     # t is between [0, 1].
+     def timestep_embedding(self, t, max_period=10000):
+         """
+         Create sinusoidal timestep embeddings.
+         :param t: a 1-D Tensor of N indices, one per batch element.
+                   These may be fractional.
+         :param max_period: controls the minimum frequency of the embeddings.
+         :return: an (N, D) Tensor of positional embeddings, where D is
+                  self.frequency_embedding_size.
+         """
+         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+         t = jax.lax.convert_element_type(t, jnp.float32)
+         # t = t * max_period
+         dim = self.frequency_embedding_size
+         half = dim // 2
+         freqs = jnp.exp(-math.log(max_period) * jnp.arange(start=0, stop=half, dtype=jnp.float32) / half)
+         args = t[:, None] * freqs[None]
+         embedding = jnp.concatenate([jnp.cos(args), jnp.sin(args)], axis=-1)
+         embedding = embedding.astype(self.tc.dtype)
+         return embedding
+
+ class LabelEmbedder(nn.Module):
+     """
+     Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+     """
+     num_classes: int
+     hidden_size: int
+     tc: TrainConfig
+
+     @nn.compact
+     def __call__(self, labels):
+         embedding_table = nn.Embed(self.num_classes + 1, self.hidden_size,
+                                    embedding_init=nn.initializers.normal(0.02), dtype=self.tc.dtype)
+         embeddings = embedding_table(labels)
+         return embeddings
+
+ class PatchEmbed(nn.Module):
+     """ 2D Image to Patch Embedding """
+     patch_size: int
+     hidden_size: int
+     tc: TrainConfig
+     bias: bool = True
+
+     @nn.compact
+     def __call__(self, x):
+         B, H, W, C = x.shape
+         patch_tuple = (self.patch_size, self.patch_size)
+         num_patches = (H // self.patch_size)  # patches per image side
+         x = nn.Conv(self.hidden_size, patch_tuple, patch_tuple, use_bias=self.bias, padding="VALID",
+                     kernel_init=self.tc.kern_init('patch'), bias_init=self.tc.kern_init('patch_bias', zero=True),
+                     dtype=self.tc.dtype)(x) # (B, P, P, hidden_size)
+         x = rearrange(x, 'b h w c -> b (h w) c', h=num_patches, w=num_patches)
+         return x
+
+ class MlpBlock(nn.Module):
+     """Transformer MLP / feed-forward block."""
+     mlp_dim: int
+     tc: TrainConfig
+     out_dim: Optional[int] = None
+     dropout_rate: float = 0.0
+     train: bool = False
+
+     @nn.compact
+     def __call__(self, inputs):
+         """It's just an MLP, so the input shape is (batch, len, emb)."""
+         actual_out_dim = inputs.shape[-1] if self.out_dim is None else self.out_dim
+         x = nn.Dense(features=self.mlp_dim, **self.tc.default_config())(inputs)
+         x = nn.gelu(x)
+         x = nn.Dropout(rate=self.dropout_rate, deterministic=(not self.train))(x)
+         output = nn.Dense(features=actual_out_dim, **self.tc.default_config())(x)
+         output = nn.Dropout(rate=self.dropout_rate, deterministic=(not self.train))(output)
+         return output
+
+ def modulate(x, shift, scale):
+     # Note: this local definition overrides the modulate imported from math_utils,
+     # and (unlike an earlier version) includes the (1 + scale) term.
+     # scale = jnp.clip(scale, -1, 1)
+     #print("modulate input shapes", x.shape)
+     #print(shift.shape)
+     #print("scale", scale.shape)
+     scale = scale.reshape(x.shape[0], -1, x.shape[-1])
+     #print(scale.shape)
+     shift = shift.reshape(x.shape[0], -1, x.shape[-1])
+     # return x * (1 + scale[:, None]) + shift[:, None]
+     return x * (1 + scale) + shift
+
+ def rotate_half(x):
+     x = rearrange(x, '... (d r) -> ... d r', r=2)
+     x1, x2 = x[..., 0], x[..., 1]
+     x = jnp.stack((-x2, x1), axis=-1)
+     return rearrange(x, '... d r -> ... (d r)')
+
+ def broadcat(tensors, dim: int = -1):
+     num_tensors = len(tensors)
+     shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
+     assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
+     shape_len = list(shape_lens)[0]
+     dim = (dim + shape_len) if dim < 0 else dim
+
+     dims = list(zip(*map(lambda t: list(t.shape), tensors)))
+     expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
+     assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
+
+     max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
+     expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
+     expanded_dims.insert(dim, (dim, dims[dim]))
+
+     expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
+     tensors = [jnp.broadcast_to(t, shape) for t, shape in zip(tensors, expandable_shapes)]
+     return jnp.concatenate(tensors, axis=dim)
+
+ class VisionRotaryEmbeddingFast(nn.Module):
+
+     dim: int
+     pt_seq_len: int = 16
+     ft_seq_len: Optional[int] = None
+     custom_freqs: Optional[jnp.ndarray] = None
+     freqs_for: str = 'lang'
+     theta: float = 10000.0
+     max_freq: float = 10.0
+     num_freqs: int = 1
+
+     def setup(self):
+         if self.custom_freqs is not None:
+             freqs = self.custom_freqs
+         elif self.freqs_for == 'lang':
+             freqs = 1. / (self.theta ** (jnp.arange(0, self.dim, 2)[:(self.dim // 2)].astype(jnp.float32) / self.dim))
+         elif self.freqs_for == 'pixel':
+             freqs = jnp.linspace(1., self.max_freq / 2, self.dim // 2) * math.pi
+         elif self.freqs_for == 'constant':
+             freqs = jnp.ones(self.num_freqs, dtype=jnp.float32)
+         else:
+             raise ValueError(f'unknown modality {self.freqs_for}')
+
+         ft_seq_len = self.ft_seq_len if self.ft_seq_len is not None else self.pt_seq_len
+         t = jnp.arange(ft_seq_len) / ft_seq_len * self.pt_seq_len
+
+         freqs = jnp.einsum('..., f -> ... f', t, freqs)
+         freqs = repeat(freqs, '... n -> ... (n r)', r=2)
+
+         freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim=-1)
+
+         self.freqs_cos = jnp.cos(freqs).reshape(-1, freqs.shape[-1])
+         self.freqs_sin = jnp.sin(freqs).reshape(-1, freqs.shape[-1])
+
+     def __call__(self, t):
+         # print("t shape", t.shape)
+         # print(self.freqs_cos.shape)
+         freqs_cos_expanded = self.freqs_cos[None, :, None, :] # Shape: (1, 256, 1, 64)
+         freqs_sin_expanded = self.freqs_sin[None, :, None, :] # Shape: (1, 256, 1, 64)
+
+         # Trim the (hard-coded) 4 register tokens so RoPE is only applied to image tokens.
+         if True:  # registers
+             t = t[:, :-4, :, :]
+         return t * freqs_cos_expanded + rotate_half(t) * freqs_sin_expanded
+
+ ################################################################################
+ #                               Core DiT Model                                 #
+ ################################################################################
+
+ class DiTBlock(nn.Module):
+     """
+     A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
+     """
+     hidden_size: int
+     num_heads: int
+     tc: TrainConfig
+     mlp_ratio: float = 4.0
+     dropout: float = 0.0
+     train: bool = False
+     rope: VisionRotaryEmbeddingFast = None
+
+     # @functools.partial(jax.checkpoint, policy=jax.checkpoint_policies.nothing_saveable)
+     @nn.compact
+     def __call__(self, x, c):
+         # Calculate adaLN modulation parameters.
+         #print("Doing adaln")
+         c = nn.silu(c)
+         c = nn.Dense(6 * self.hidden_size, **self.tc.default_config())(c)
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = jnp.split(c, 6, axis=-1)
+
+         # Attention residual.
+         #x_norm = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
+         x_norm = nnx.RMSNorm(self.hidden_size, use_scale=False, dtype=self.tc.dtype, rngs=nnx.Rngs(0))(x)
+
+         #print("x norm shape", x_norm.shape)
+         x_modulated = modulate(x_norm, shift_msa, scale_msa)
+
+         # Note: modulate reshapes the conditioning, which can add an extra dim.
+         channels_per_head = self.hidden_size // self.num_heads
+         k = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
+         q = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
+         v = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
+
+         #print("x mod shape", x_modulated.shape)
+         # (there was an issue with the reshape below at some point)
+         #print("k shape", k.shape)  # with the decoupled variant this comes out one size bigger
+
+         k = jnp.reshape(k, (k.shape[0], k.shape[1], self.num_heads, channels_per_head))
+         q = jnp.reshape(q, (q.shape[0], q.shape[1], self.num_heads, channels_per_head))
+         v = jnp.reshape(v, (v.shape[0], v.shape[1], self.num_heads, channels_per_head))
+
+         # (VA-VAE does something else here, q/k normalization I think.)
+         if self.rope is not None:
+             #print("qshape", q.shape)  # (B, 260, num_heads, channels_per_head)
+             # RoPE is only applied to the image tokens; the 4 register tokens are
+             # split off and concatenated back afterwards.
+             q_registers = q[:, -4:, :, :]
+             k_registers = k[:, -4:, :, :]
+             q = self.rope(q)
+             k = self.rope(k)
+
+             q = jnp.concatenate((q, q_registers), axis=1)
+             k = jnp.concatenate((k, k_registers), axis=1)
+
+         q = q / q.shape[3] # (1/d) scaling.
+         w = jnp.einsum('bqhc,bkhc->bhqk', q, k) # [B, num_heads, HW, HW]
+         w = w.astype(jnp.float32)
+         w = nn.softmax(w, axis=-1)
+
+         y = jnp.einsum('bhqk,bkhc->bqhc', w, v) # [B, HW, num_heads, channels_per_head]
+         y = jnp.reshape(y, x.shape) # [B, HW, C] (C = heads * channels_per_head)
+         attn_x = nn.Dense(self.hidden_size, **self.tc.default_config())(y)
+         #x = x + (gate_msa[:, None] * attn_x)
+         x = x + gate_msa.reshape(x.shape[0], -1, x.shape[-1]) * attn_x
+
+         # MLP residual.
+         # x_norm2 = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
+         x_norm2 = nnx.RMSNorm(self.hidden_size, use_scale=False, dtype=self.tc.dtype, rngs=nnx.Rngs(0))(x)
+
+         #print("Modulate 2", x_norm2.shape)
+         x_modulated2 = modulate(x_norm2, shift_mlp, scale_mlp)
+         #print(x_modulated.shape)
+         #mlp_x = MlpBlock(mlp_dim=int(self.hidden_size * self.mlp_ratio), tc=self.tc,
+         #                 dropout_rate=self.dropout, train=self.train)(x_modulated2)
+
+         mlp_x = SwiGLUFFN(self.hidden_size, int(2 / 3 * self.hidden_size * self.mlp_ratio))(x_modulated2)
+
+         # x = x + (gate_mlp[:, None] * mlp_x)
+         x = x + gate_mlp.reshape(x.shape[0], -1, x.shape[-1]) * mlp_x
+         return x
+
+
+ class SwiGLUFFN(nn.Module):
+
+     # The reference implementation takes (in_features, hidden_features, out_features),
+     # but only in_features and hidden_features are passed in here, with out_features
+     # set equal to in_features.
+
+     in_features: int
+     hidden_features: int
+
+     @nn.compact
+     def __call__(self, x):
+
+         # With nn.compact the layers are created inline; only the output sizes are needed.
+         x = nn.Dense(2 * self.hidden_features, use_bias=True)(x)
+         x1, x2 = jnp.split(x, 2, axis=-1)
+         hidden = nn.silu(x1) * x2
+         x = nn.Dense(self.in_features, use_bias=True)(hidden)
+         return x
+
+
+ class FinalLayer(nn.Module):
+     """
+     The final layer of DiT.
+     """
+     patch_size: int
+     out_channels: int
+     hidden_size: int
+     tc: TrainConfig
+
+     @nn.compact
+     def __call__(self, x, c):
+         c = nn.silu(c)
+         c = nn.Dense(2 * self.hidden_size, kernel_init=self.tc.kern_init(zero=True),
+                      bias_init=self.tc.kern_init('bias', zero=True), dtype=self.tc.dtype)(c)
+         shift, scale = jnp.split(c, 2, axis=-1)
+         # x = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
+         x = nnx.RMSNorm(self.hidden_size, use_scale=False, dtype=self.tc.dtype, rngs=nnx.Rngs(0))(x)
+
+         x = modulate(x, shift, scale)
+         x = nn.Dense(self.patch_size * self.patch_size * self.out_channels,
+                      kernel_init=self.tc.kern_init('final', zero=True),
+                      bias_init=self.tc.kern_init('final_bias', zero=True), dtype=self.tc.dtype)(x)
+         return x
+
+ class DiT(nn.Module):
+     """
+     Diffusion model with a Transformer backbone.
+     """
+     patch_size: int
+     hidden_size: int
+     depth: int
+     num_heads: int
+     mlp_ratio: float
+     out_channels: int
+     class_dropout_prob: float
+     num_classes: int
+     ignore_dt: bool = False
+     dropout: float = 0.0
+     dtype: Dtype = jnp.bfloat16
+     rope: VisionRotaryEmbeddingFast = None
+
+     @nn.compact
+     def __call__(self, x, t, dt, y, train=False, return_activations=False):
+         # (x = (B, H, W, C) image, t = (B,) timesteps, y = (B,) class labels)
+         print("DiT: Input of shape", x.shape, "dtype", x.dtype)
+         activations = {}
+
+         batch_size = x.shape[0]
+         input_size = x.shape[1]
+         in_channels = x.shape[-1]
+         num_patches = (input_size // self.patch_size) ** 2
+         num_patches_side = input_size // self.patch_size
+
+         tc = TrainConfig(dtype=self.dtype)
+
+         if self.ignore_dt:
+             dt = jnp.zeros_like(t)
+
+         # pos_embed = self.param("pos_embed", get_2d_sincos_pos_embed, self.hidden_size, num_patches)
+         # pos_embed = jax.lax.stop_gradient(pos_embed)
+
+         # Registers get no positional embedding; pos_embed covers only the image patches.
+         pos_embed = get_2d_sincos_pos_embed(None, self.hidden_size, num_patches)
+
+         # Decoupled branch (unused in the active path below).
+         s = PatchEmbed(self.patch_size, self.hidden_size, tc=tc)(x) # (B, num_patches, hidden_size)
+
+         x = PatchEmbed(self.patch_size, self.hidden_size, tc=tc)(x) # (B, num_patches, hidden_size)
+         print("DiT: After patch embed, shape is", x.shape, "dtype", x.dtype)
+         activations['patch_embed'] = x
+
+         x = x + pos_embed
+
+         if True:  # registers get added here
+             # Note: num_patches no longer matches the token count once registers are appended.
+             registers = jnp.ones((x.shape[0], 4, x.shape[-1]))
+             x = jnp.concatenate((x, registers), axis=1)
+
+         #x = x + pos_embed
+         x = x.astype(self.dtype)
+
+         # More decoupled (unused below).
+         s = s + pos_embed
+         s = s.astype(self.dtype)
+
+         te = TimestepEmbedder(self.hidden_size, tc=tc)(t) # (B, hidden_size)
+         dte = TimestepEmbedder(self.hidden_size, tc=tc)(dt) # (B, hidden_size)
+         ye = LabelEmbedder(self.num_classes, self.hidden_size, tc=tc)(y) # (B, hidden_size)
+         c = te + ye + dte
+
+         activations['pos_embed'] = pos_embed
+         activations['time_embed'] = te
+         activations['dt_embed'] = dte
+         activations['label_embed'] = ye
+         activations['conditioning'] = c
+
+         print("DiT: Patch Embed of shape", x.shape, "dtype", x.dtype)
+         print("DiT: Conditioning of shape", c.shape, "dtype", c.dtype)
+
+         if True:  # use RoPE
+             half_head_dim = self.hidden_size // self.num_heads // 2
+             hw_seq_len = input_size // self.patch_size  # patches per image side (e.g. 16 or 32)
+             print("self.hidden_size", self.hidden_size)
+             print("self.num_heads", self.num_heads)
+             print("hw_seq_len", hw_seq_len)
+             print("x shape", x.shape)
+             # Example values from a debug run:
+             #   self.hidden_size 768
+             #   self.num_heads 12
+             #   hw_seq_len 128
+             #   x shape (1, 256, 768)
+             #   t shape (1, 256, 12, 64)
+             #   (16384, 64)
+             rope = VisionRotaryEmbeddingFast(dim=half_head_dim, pt_seq_len=hw_seq_len)
+
+         decoupled = False
+
+         # The original decoupled code was wrong, so the normal path below is used instead.
+         if False:  # old code
+             extra_depth = 0
+             if decoupled:
+                 for i in range(4):  # manually set to 4
+                     s = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train, rope)(s, c)
+                     # TODO: the shapes here were never fully verified.
+                 s = nn.silu(te.reshape(s.shape[0], -1, s.shape[-1]) + dte.reshape(s.shape[0], -1, s.shape[-1]) + s)  # add conditioning back, somewhat
+                 if True:
+                     c = s  # replace conditioning
+                 else:  # instead of replacing conditioning, leave c as is
+                     pass
+             else:  # otherwise run extra blocks instead
+                 extra = True
+                 extra_depth = 4
+
+             for i in range(self.depth + extra_depth):
+                 x = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train, rope)(x, c)
+                 activations[f'dit_block_{i}'] = x
+         if False:  # decoupled new/working
+             for i in range(4):  # manually set to 4
+                 s = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train, rope)(s, c)
+             s = nn.silu(te.reshape(s.shape[0], -1, s.shape[-1]) + dte.reshape(s.shape[0], -1, s.shape[-1]) + s)  # add conditioning back, somewhat
+             if True:
+                 c = s  # replace conditioning
+             for i in range(self.depth - 4):
+                 x = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train, rope)(x, c)
+                 activations[f'dit_block_{i}'] = x
+
+         else:  # normal
+             for i in range(self.depth):
+                 x = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train, rope)(x, c)
+                 activations[f'dit_block_{i}'] = x
+
+         x = FinalLayer(self.patch_size, self.out_channels, self.hidden_size, tc)(x, c) # (B, num_patches, p*p*c)
+         activations['final_layer'] = x
+         # print("DiT: FinalLayer of shape", x.shape, "dtype", x.dtype)
+         if True:  # more registers
+             # Drop the register tokens before unpatchifying.
+             registers = x[:, -4:, :]
+             x = x[:, :-4, :]
+
+         x = jnp.reshape(x, (batch_size, num_patches_side, num_patches_side,
+                             self.patch_size, self.patch_size, self.out_channels))
+         x = jnp.einsum('bhwpqc->bhpwqc', x)
+         x = rearrange(x, 'B H P W Q C -> B (H P) (W Q) C', H=int(num_patches_side), W=int(num_patches_side))
+         assert x.shape == (batch_size, input_size, input_size, self.out_channels)
+
+         t_discrete = jnp.floor(t * 256).astype(jnp.int32)
+         logvars = nn.Embed(256, 1, embedding_init=nn.initializers.constant(0))(t_discrete) * 100
+
+         if return_activations:
+             return x, logvars, activations
+         return x
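
A rough usage sketch for the DiT module above, assuming the file is importable as model, that the companion math_utils module (get_2d_sincos_pos_embed, modulate) is on the path, and using illustrative hyperparameters rather than the trained configuration recorded in registers/log.txt:

    import jax
    import jax.numpy as jnp
    from model import DiT

    # Illustrative hyperparameters only; not the checkpoint's actual configuration.
    model = DiT(patch_size=2, hidden_size=768, depth=12, num_heads=12, mlp_ratio=4.0,
                out_channels=4, class_dropout_prob=0.1, num_classes=1000)

    x = jnp.zeros((1, 32, 32, 4))            # (B, H, W, C) latent input
    t = jnp.full((1,), 0.5)                  # timesteps in [0, 1]
    dt = jnp.zeros((1,))
    y = jnp.zeros((1,), dtype=jnp.int32)     # class labels

    params = model.init(jax.random.PRNGKey(0), x, t, dt, y)
    out = model.apply(params, x, t, dt, y)   # (1, 32, 32, 4)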