Upload LinearAE/vqvae.py with huggingface_hub
LinearAE/vqvae.py
ADDED
@@ -0,0 +1,523 @@
from typing import Any
import functools

import flax.linen as nn
import jax
import jax.numpy as jnp
import ml_collections
from einops import rearrange

###########################
### Helper Modules
### https://github.com/google-research/maskgit/blob/main/maskgit/nets/layers.py
###########################

def get_norm_layer(norm_type):
  """Normalization layer."""
  if norm_type == 'BN':
    raise NotImplementedError
  elif norm_type == 'LN':
    norm_fn = functools.partial(nn.LayerNorm)
  elif norm_type == 'GN':
    norm_fn = functools.partial(nn.GroupNorm)
  else:
    raise NotImplementedError
  return norm_fn


def tensorflow_style_avg_pooling(x, window_shape, strides, padding: str):
  """TF-style average pooling: the denominator counts only window elements."""
  pool_sum = jax.lax.reduce_window(x, 0.0, jax.lax.add,
                                   (1,) + window_shape + (1,),
                                   (1,) + strides + (1,), padding)
  pool_denom = jax.lax.reduce_window(
      jnp.ones_like(x), 0.0, jax.lax.add, (1,) + window_shape + (1,),
      (1,) + strides + (1,), padding)
  return pool_sum / pool_denom

def upsample(x, factor=2):
  n, h, w, c = x.shape
  x = jax.image.resize(x, (n, h * factor, w * factor, c), method='nearest')
  return x

def dsample(x):
  return tensorflow_style_avg_pooling(x, (2, 2), strides=(2, 2), padding='same')

def squared_euclidean_distance(a: jnp.ndarray,
                               b: jnp.ndarray,
                               b2: jnp.ndarray = None) -> jnp.ndarray:
  """Computes the pairwise squared Euclidean distance.

  Args:
    a: float32: (n, d): An array of points.
    b: float32: (m, d): An array of points.
    b2: float32: (1, m): precomputed row-wise squared norms of b, i.e.
      jnp.sum(b.T**2, axis=0, keepdims=True). Recomputed when None.

  Returns:
    d: float32: (n, m): Where d[i, j] is the squared Euclidean distance between
    a[i] and b[j].
  """
  if b2 is None:
    b2 = jnp.sum(b.T**2, axis=0, keepdims=True)
  a2 = jnp.sum(a**2, axis=1, keepdims=True)
  ab = jnp.matmul(a, b.T)
  d = a2 - 2 * ab + b2
  return d
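
# Added usage note (not in the original upload): the function expands
# ||a_i - b_j||^2 = ||a_i||^2 - 2 a_i.b_j + ||b_j||^2, so e.g.
#   squared_euclidean_distance(jnp.ones((2, 4)), jnp.zeros((3, 4)))
# returns a (2, 3) array filled with 4.0.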

def entropy_loss_fn(affinity, loss_type="softmax", temperature=1.0):
  """Calculates the entropy loss. Affinity is the similarity/distance matrix."""
  flat_affinity = affinity.reshape(-1, affinity.shape[-1])
  flat_affinity /= temperature
  probs = jax.nn.softmax(flat_affinity, axis=-1)
  log_probs = jax.nn.log_softmax(flat_affinity + 1e-5, axis=-1)
  if loss_type == "softmax":
    target_probs = probs
  elif loss_type == "argmax":
    codes = jnp.argmax(flat_affinity, axis=-1)
    onehots = jax.nn.one_hot(
        codes, flat_affinity.shape[-1], dtype=flat_affinity.dtype)
    onehots = probs - jax.lax.stop_gradient(probs - onehots)
    target_probs = onehots
  else:
    raise ValueError("Entropy loss {} not supported".format(loss_type))
  avg_probs = jnp.mean(target_probs, axis=0)
  avg_entropy = -jnp.sum(avg_probs * jnp.log(avg_probs + 1e-5))
  sample_entropy = -jnp.mean(jnp.sum(target_probs * log_probs, axis=-1))
  loss = sample_entropy - avg_entropy
  return loss
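
# Added note: minimizing sample_entropy pushes each token toward a confident
# (near one-hot) code assignment, while maximizing avg_entropy (it enters the
# loss with a minus sign) spreads assignments across the whole codebook.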

def sg(x):
  return jax.lax.stop_gradient(x)


###########################
### Modules
###########################

class ResBlock(nn.Module):
  """Basic Residual Block."""
  filters: int
  norm_fn: Any
  activation_fn: Any

  @nn.compact
  def __call__(self, x):
    input_dim = x.shape[-1]
    residual = x
    x = self.norm_fn()(x)
    x = self.activation_fn(x)
    x = nn.Conv(self.filters, kernel_size=(3, 3), use_bias=False)(x)
    x = self.norm_fn()(x)
    x = self.activation_fn(x)
    x = nn.Conv(self.filters, kernel_size=(3, 3), use_bias=False)(x)

    if input_dim != self.filters:  # If input channels don't match the output, project the skip path.
      residual = nn.Conv(self.filters, kernel_size=(1, 1), use_bias=False)(residual)
    return x + residual
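
# Added usage sketch: a ResBlock keeps spatial dims and maps channels to
# `filters`, e.g. ResBlock(64, norm_fn=get_norm_layer('GN'),
# activation_fn=nn.swish) applied to a (1, 16, 16, 32) input yields
# (1, 16, 16, 64), with a 1x1-projected skip because 32 != 64.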

class Fourier(nn.Module):
  """Random Fourier features (appears unused; repaired into valid JAX here)."""
  features: int = 256

  @nn.compact
  def __call__(self, x):
    # Our input comes in as 3 channels... after we convert to 512, maybe
    # instead we convert to 256, and then do this?
    # x is channels-last; project with a random matrix, then take cos/sin.
    weight = self.param("weight", jax.nn.initializers.normal(stddev=1.0),
                        (self.features, x.shape[-1]))
    f = 2 * jnp.pi * x @ weight.T
    return jnp.concatenate([jnp.cos(f), jnp.sin(f)], axis=-1)

class Encoder(nn.Module):
  """Linear encoder: 8x8 patchify followed by a Dense projection."""

  config: ml_collections.ConfigDict

  # In this setup we don't care about the config fields.
  @nn.compact
  def __call__(self, x):
    print("init encoder")
    print("x shape", x.shape)
    x = rearrange(x, '... (h b1) (w b2) c -> ... h w (c b1 b2)', b1=8, b2=8)
    x = nn.Dense(4)(x)  # We just map to 4 channels for now.
    print(x.shape)
    return x
    # k = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
    # 1x1 conv, uplift from 3 to ~64 channels.
    # That gives us 256x256x64.
    # Then pixelshuffle to ...
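
# Added note: the rearrange is an 8x8 space-to-depth (patchify), so a
# 256x256x3 image becomes 32x32x192, and the Dense layer then projects each
# patch to a 4-dim latent: 32x32x4 overall.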

class OriginalEncoder(nn.Module):
  """From [H,W,D] image to [H',W',D'] embedding. Using Conv layers."""
  config: ml_collections.ConfigDict

  def setup(self):
    self.filters = self.config.filters  # `filters` is the base channel width.
    self.num_res_blocks = self.config.num_res_blocks
    self.channel_multipliers = self.config.channel_multipliers
    self.embedding_dim = self.config.embedding_dim
    self.norm_type = self.config.norm_type
    self.activation_fn = nn.swish

  @nn.compact
  def __call__(self, x):
    print("Initializing encoder.")
    norm_fn = get_norm_layer(norm_type=self.norm_type)
    block_args = dict(norm_fn=norm_fn, activation_fn=self.activation_fn)
    print("Incoming encoder shape", x.shape)
    x = nn.Conv(self.filters, kernel_size=(3, 3), use_bias=False)(x)
    print('Encoder layer', x.shape)
    num_blocks = len(self.channel_multipliers)

    # The way SD works: 2 ResNet blocks (no shape change), then a downsample,
    # repeated 3 times for an 8x downsample. Then an extra ResNet block, and
    # THEN a projection from 512 channels down to 8 / 4.

    for i in range(num_blocks):
      filters = self.filters * self.channel_multipliers[i]
      for _ in range(self.num_res_blocks):
        x = ResBlock(filters, **block_args)(x)
      if i < num_blocks - 1:  # Downsample after every block *except the last*.
        print("doing downsample")
        x = dsample(x)
        print('Encoder layer', x.shape)

    # After downsampling is done, run the final ResNet blocks (the analogue of
    # SD's mid-block).

    for _ in range(self.num_res_blocks):
      x = ResBlock(filters, **block_args)(x)
    print('Encoder layer final', x.shape)

    x = norm_fn()(x)
    x = self.activation_fn(x)
    last_dim = self.embedding_dim * 2 if self.config['quantizer_type'] == 'kl' else self.embedding_dim
    x = nn.Conv(last_dim, kernel_size=(1, 1))(x)
    print("Final embeddings are size", x.shape)
    return x
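
# Added note: with num_blocks = len(channel_multipliers), this encoder
# downsamples by 2^(num_blocks - 1); e.g. channel_multipliers = [1, 2, 2, 4]
# turns 256x256 inputs into 32x32 embeddings. For the 'kl' quantizer the
# output carries 2*embedding_dim channels (means and logvars).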

class Decoder(nn.Module):
  """From [H',W',D'] embedding to [H,W,D] embedding. Using Conv layers."""

  config: ml_collections.ConfigDict

  def setup(self):
    self.filters = self.config.filters
    self.num_res_blocks = self.config.num_res_blocks
    self.channel_multipliers = self.config.channel_multipliers
    self.norm_type = self.config.norm_type
    self.image_channels = self.config.image_channels
    self.activation_fn = nn.swish

  @nn.compact
  def __call__(self, x):
    norm_fn = get_norm_layer(norm_type=self.norm_type)
    block_args = dict(norm_fn=norm_fn, activation_fn=self.activation_fn,)
    num_blocks = len(self.channel_multipliers)
    filters = self.filters * self.channel_multipliers[-1]
    print("Decoder incoming shape", x.shape)

    # We don't need to do anything here because the conv brings it back to 512.

    x = nn.Conv(filters, kernel_size=(3, 3), use_bias=True)(x)
    print("Decoder input", x.shape)

    # This is the mid-block.
    for _ in range(self.num_res_blocks):
      x = ResBlock(filters, **block_args)(x)
      print('Mid Block Decoder layer', x.shape)

    # The first two sets of blocks are plain ResNets with no channel change;
    # we are already at 4x = 512 channels.

    for i in reversed(range(num_blocks)):
      filters = self.filters * self.channel_multipliers[i]
      for _ in range(self.num_res_blocks):  # symmetric with the encoder
        x = ResBlock(filters, **block_args)(x)
      if i > 0:
        x = upsample(x, 2)
        x = nn.Conv(filters, kernel_size=(3, 3))(x)
      print('Decoder layer', x.shape)
    x = norm_fn()(x)
    x = self.activation_fn(x)
    x = nn.Conv(self.image_channels, kernel_size=(3, 3))(x)
    return x
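
# Added note: the decoder mirrors the encoder, upsampling 2x at every level
# except the last (num_blocks - 1 doublings), then projects down to
# image_channels.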

class VectorQuantizer(nn.Module):
  """Basic vector quantizer."""
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    codebook_size = self.config.codebook_size
    emb_dim = x.shape[-1]
    codebook = self.param(
        "codebook",
        jax.nn.initializers.variance_scaling(scale=1.0, mode="fan_in", distribution="uniform"),
        (codebook_size, emb_dim))
    codebook = jnp.asarray(codebook)  # (codebook_size, emb_dim)
    distances = jnp.reshape(
        squared_euclidean_distance(jnp.reshape(x, (-1, emb_dim)), codebook),
        x.shape[:-1] + (codebook_size,))  # [x, codebook_size] similarity matrix.
    encoding_indices = jnp.argmin(distances, axis=-1)
    encoding_onehot = jax.nn.one_hot(encoding_indices, codebook_size)
    quantized = self.quantize(encoding_onehot)
    result_dict = dict()
    if self.train:
      e_latent_loss = jnp.mean((sg(quantized) - x)**2) * self.config.commitment_cost
      q_latent_loss = jnp.mean((quantized - sg(x))**2)
      entropy_loss = 0.0
      if self.config.entropy_loss_ratio != 0:
        entropy_loss = entropy_loss_fn(
            -distances,
            loss_type=self.config.entropy_loss_type,
            temperature=self.config.entropy_temperature
        ) * self.config.entropy_loss_ratio
      e_latent_loss = jnp.asarray(e_latent_loss, jnp.float32)
      q_latent_loss = jnp.asarray(q_latent_loss, jnp.float32)
      entropy_loss = jnp.asarray(entropy_loss, jnp.float32)
      loss = e_latent_loss + q_latent_loss + entropy_loss
      result_dict = dict(
          quantizer_loss=loss,
          e_latent_loss=e_latent_loss,
          q_latent_loss=q_latent_loss,
          entropy_loss=entropy_loss)
      quantized = x + jax.lax.stop_gradient(quantized - x)

    result_dict.update({
        "z_ids": encoding_indices,
    })
    return quantized, result_dict

  def quantize(self, encoding_onehot: jnp.ndarray) -> jnp.ndarray:
    codebook = jnp.asarray(self.variables["params"]["codebook"])
    return jnp.dot(encoding_onehot, codebook)

  def decode_ids(self, ids: jnp.ndarray) -> jnp.ndarray:
    codebook = self.variables["params"]["codebook"]
    return jnp.take(codebook, ids, axis=0)
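
# Added note: `quantized = x + stop_gradient(quantized - x)` is the
# straight-through estimator: the forward pass emits the nearest codeword,
# while the backward pass treats quantization as the identity so gradients
# reach the encoder.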

class KLQuantizer(nn.Module):
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    emb_dim = x.shape[-1] // 2  # Use half as means, half as logvars.
    means = x[..., :emb_dim]
    logvars = x[..., emb_dim:]
    if not self.train:
      result_dict = dict()
      result_dict["std"] = jnp.exp(0.5 * logvars)
      return means, result_dict
    else:
      noise = jax.random.normal(self.make_rng("noise"), means.shape)
      stds = jnp.exp(0.5 * logvars)
      z = means + stds * noise
      # kl_loss = -0.5 * jnp.mean(1 + logvars - means**2 - jnp.exp(logvars))

      # New KL: sum over the non-batch axes, then mean over the batch.
      kl_loss = -0.5 * jnp.sum(1 + logvars - jnp.square(means) - jnp.exp(logvars),
                               axis=tuple(range(1, means.ndim)))
      kl_loss = jnp.mean(kl_loss)

      result_dict = dict(quantizer_loss=kl_loss)
      result_dict["std"] = jnp.exp(0.5 * logvars)
      return z, result_dict
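
# Added note: this is the closed-form KL(N(mu, sigma^2) || N(0, 1)) =
# -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2), with the reparameterization
# z = mu + sigma * eps so gradients flow through the sampling step.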

class AEQuantizer(nn.Module):  # Plain autoencoder: identity "quantizer".
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    result_dict = dict()
    result_dict["std"] = 0.0
    return x, result_dict


def imq_kernel(X: jnp.ndarray, Y: jnp.ndarray, h_dim: int):
  batch_size = X.shape[0]

  norms_x = jnp.sum(X**2, axis=1, keepdims=True)  # batch_size x 1
  prods_x = jnp.dot(X, X.T)  # batch_size x batch_size
  dists_x = norms_x + norms_x.T - 2 * prods_x

  norms_y = jnp.sum(Y**2, axis=1, keepdims=True)  # batch_size x 1
  prods_y = jnp.dot(Y, Y.T)  # batch_size x batch_size
  dists_y = norms_y + norms_y.T - 2 * prods_y

  dot_prd = jnp.dot(X, Y.T)
  dists_c = norms_x + norms_y.T - 2 * dot_prd

  stats = 0
  for scale in [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]:
    C = 2 * h_dim * 1.0 * scale
    res1 = C / (C + dists_x)
    res1 += C / (C + dists_y)

    res1 = (1 - jnp.eye(batch_size)) * res1
    res1 = jnp.sum(res1) / (batch_size - 1)

    res2 = C / (C + dists_c)
    res2 = jnp.sum(res2) * 2.0 / batch_size
    stats += res1 - res2

  return stats
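
# Added note: imq_kernel computes an MMD-style statistic with a mixture of
# inverse-multiquadric kernels k(x, y) = C / (C + ||x - y||^2); the diagonal
# terms are masked out of the within-sample sums.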

class MMDQuantizer(nn.Module):  # WAE-style regularizer; still experimental.
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    if not self.train:
      result_dict = dict()
      return x, result_dict
    else:
      print("mmd quantizer")
      batch_size, height, width, latent_channels = x.shape
      z_flat = x.reshape(batch_size, -1)
      print(z_flat.shape)
      z_fake_flat = jax.random.normal(self.make_rng("noise"), z_flat.shape) * self.config["MMD_weight"]
      print(z_fake_flat.shape)
      mmd_loss = imq_kernel(z_flat, z_fake_flat, z_flat.shape[1])
      print(mmd_loss.shape)
      print(mmd_loss)
      result_dict = dict(quantizer_loss=mmd_loss)
      return x, result_dict
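
# Added note: the MMD loss pulls the aggregate posterior over latents toward
# a Gaussian prior; here the prior samples are scaled by config["MMD_weight"],
# which effectively sets the prior's standard deviation.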


class KLQuantizerTwo(nn.Module):
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    # emb_dim = x.shape[-1] // 2  # Use half as means, half as logvars.
    # means = x[..., :emb_dim]
    # logvars = x[..., emb_dim:]

    # We actually want mean and std over the batch axis?

    # We start as (b, hw, 8), go to (b, hw, 4), with mean and std over those.

    if not self.train:
      result_dict = dict()
      result_dict["std"] = 1.0
      return x, result_dict
    else:
      stds = jnp.std(x, axis=[1, 2, 3])

      noise = jax.random.normal(self.make_rng("noise"), x.shape)

      logvars = 2.0 * jnp.log(stds)  # log-variance = 2 * log(std); overridden below.
      logvars = logvars.reshape(-1, 1, 1, 1)
      if True:  # This is true for the special KL where we set sigma to 1 manually.
        logvars = 0.0

      if False:  # dino-ssl style covariance regularizer (disabled)
        x_2 = x.reshape(x.shape[0], -1, x.shape[-1])  # Linear with channel size
        x_2 = jnp.swapaxes(x_2, 0, 1)
        # Then get the covariance.
        cov = jnp.swapaxes(x_2, 1, 2) @ x_2 / x.shape[0]
        # Not sure about this, we also have regular cov.
        I_d = jnp.identity(x.shape[-1])
        R_eps = jnp.log(jnp.linalg.det(jnp.expand_dims(I_d, axis=0) + x.shape[-1] / (.0001 ** 2) * cov))

        # So something here *does* depend on the -1 shape, but I need to math it out.
        kl_loss = R_eps.mean()

      # This is the denoising version.
      kl_loss = -0.5 * jnp.sum(1 + logvars - jnp.square(x) - jnp.exp(logvars),
                               axis=tuple(range(1, x.ndim)))
      kl_loss = jnp.mean(kl_loss)

      result_dict = dict(quantizer_loss=kl_loss)
      result_dict["std"] = 1.0

      # For proper "KL two", we need to return noise + mean.
      return x + noise, result_dict
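
# Added note: with logvars forced to 0, the loss reduces to 0.5 * sum(x**2)
# per example and unit Gaussian noise is added to the latents, i.e. a
# fixed-variance VAE / denoising regularizer.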


class FSQuantizer(nn.Module):
  config: ml_collections.ConfigDict
  train: bool

  @nn.compact
  def __call__(self, x):
    assert self.config['fsq_levels'] % 2 == 1, "FSQ levels must be odd."
    z = jnp.tanh(x)  # [-1, 1]
    z = z * (self.config['fsq_levels'] - 1) / 2  # [-(L-1)/2, (L-1)/2]
    zhat = jnp.round(z)  # e.g. [-2, -1, 0, 1, 2] for L = 5
    quantized = z + jax.lax.stop_gradient(zhat - z)
    quantized = quantized / (self.config['fsq_levels'] // 2)  # [-1, 1], but quantized.
    result_dict = dict()

    # Diagnostics for codebook usage.
    zhat_scaled = zhat + self.config['fsq_levels'] // 2
    basis = jnp.concatenate((jnp.array([1]), jnp.cumprod(jnp.array([self.config['fsq_levels']] * (x.shape[-1] - 1))))).astype(jnp.uint32)
    idx = (zhat_scaled * basis).sum(axis=-1).astype(jnp.uint32)
    idx_flat = idx.reshape(-1)
    usage = jnp.bincount(idx_flat, length=self.config['fsq_levels']**x.shape[-1])

    result_dict.update({
        "z_ids": zhat,
        'usage': usage
    })
    return quantized, result_dict
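
# Added worked example: with fsq_levels = 5 and a 4-channel latent, tanh
# scales each channel to [-2, 2], rounding snaps it to {-2, -1, 0, 1, 2}, and
# the straight-through trick keeps gradients; the implicit codebook has
# 5**4 = 625 entries, indexed via the mixed-radix `basis`.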


class VQVAE(nn.Module):
  """VQVAE model."""
  config: ml_collections.ConfigDict
  train: bool

  def setup(self):
    """VQVAE setup."""
    if self.config['quantizer_type'] == 'vq':
      self.quantizer = VectorQuantizer(config=self.config, train=self.train)
    elif self.config['quantizer_type'] == 'kl':
      self.quantizer = KLQuantizer(config=self.config, train=self.train)
    elif self.config['quantizer_type'] == 'fsq':
      self.quantizer = FSQuantizer(config=self.config, train=self.train)
    elif self.config['quantizer_type'] == 'ae':
      self.quantizer = AEQuantizer(config=self.config, train=self.train)
    elif self.config["quantizer_type"] == "kl_two":
      self.quantizer = KLQuantizerTwo(config=self.config, train=self.train)
    else:
      raise NotImplementedError(self.config['quantizer_type'])
    self.encoder = Encoder(config=self.config)
    self.decoder = Decoder(config=self.config)

  def encode(self, image):
    encoded_feature = self.encoder(image)
    quantized, result_dict = self.quantizer(encoded_feature)
    print("After quant", quantized.shape)
    return quantized, result_dict

  def decode(self, z_vectors):
    print("z_vectors shape", z_vectors.shape)
    reconstructed = self.decoder(z_vectors)
    return reconstructed

  def decode_from_indices(self, z_ids):
    z_vectors = self.quantizer.decode_ids(z_ids)
    reconstructed_image = self.decode(z_vectors)
    return reconstructed_image

  def encode_to_indices(self, image):
    encoded_feature = self.encoder(image)
    _, result_dict = self.quantizer(encoded_feature)
    ids = result_dict["z_ids"]
    return ids

  def __call__(self, input_dict):
    # Freezing the encoder now: stop_gradient blocks gradients into encode().
    quantized, result_dict = jax.lax.stop_gradient(self.encode(input_dict))
    print("encode finished")
    result_dict["latents"] = quantized
    outputs = self.decoder(quantized)
    return outputs, result_dict
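
# Added sketch (not part of the original upload): a minimal shape smoke test
# under an assumed config; every field value below is an illustrative guess,
# not the training configuration used with this file.
if __name__ == "__main__":
  config = ml_collections.ConfigDict()
  config.filters = 32
  config.num_res_blocks = 1
  config.channel_multipliers = [1, 1, 2, 2]  # decoder upsamples 2^(4-1) = 8x
  config.norm_type = 'GN'
  config.image_channels = 3
  config.quantizer_type = 'ae'  # identity quantizer; needs no extra fields

  model = VQVAE(config=config, train=False)
  images = jnp.zeros((1, 32, 32, 3))  # H and W must be divisible by 8
  variables = model.init(jax.random.PRNGKey(0), images)
  recon, aux = model.apply(variables, images)
  print(recon.shape, sorted(aux.keys()))  # (1, 32, 32, 3) ['latents', 'std']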