Upload folder using huggingface_hub

GramSmall/train.py (+33 -12) CHANGED
@@ -161,19 +161,36 @@ class VQGANModel(flax.struct.PyTreeNode):
         assert reconstructed_images.shape == images.shape


-
-
-
-
-
+        def calculate_covariance_loss_single(image):
+            """Calculates the covariance loss for one image."""
+            # image.shape is (H, W, C)
+            C = image.shape[-1]
+
+            # Reshape the spatial dimensions into one dimension of "observations"
+            # New shape: (H*W, C)
+            reshaped_features = image.reshape(-1, C)
+
+            # Calculate the covariance matrix of the channels.
+            # We treat each channel as a variable and spatial locations as observations.
+            # The resulting shape will be (C, C).
+            cov_matrix = jnp.cov(reshaped_features, rowvar=False)
+
+            # The target is the identity matrix of size (C, C)
+            identity_matrix = jnp.eye(C)
+
+            # The loss is the sum of squared differences (Frobenius norm squared)
+            loss = jnp.sum(jnp.square(cov_matrix - identity_matrix))
+
+            return loss
+

-
-
-
-
-
-        gram_loss =
-
+        B, H, W, C = reconstructed_images.shape
+        reshaped_features = reconstructed_images.reshape(B, -1, C)
+        batched_loss_fn = jax.vmap(calculate_covariance_loss_single, in_axes=0)
+        per_image_losses = batched_loss_fn(reconstructed_images)
+
+        gram_loss = jnp.mean(per_image_losses) * 1
+        # Gram loss is very low - let's crank it up until it starts harming things?


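For reference, a self-contained sketch of the covariance penalty this hunk introduces, runnable on hypothetical random data (`jax`/`jnp` are assumed to be imported as in train.py; the batch shape is made up). One observation: the batch-level `reshaped_features = reconstructed_images.reshape(B, -1, C)` in the hunk is never used, since `jax.vmap` maps the per-image function over `reconstructed_images` directly.

```python
import jax
import jax.numpy as jnp

def covariance_loss_single(image):
    """Squared Frobenius distance between the channel covariance and identity."""
    C = image.shape[-1]
    features = image.reshape(-1, C)        # (H*W, C): spatial positions as observations
    cov = jnp.cov(features, rowvar=False)  # (C, C) covariance across channels
    return jnp.sum(jnp.square(cov - jnp.eye(C)))

# Hypothetical stand-in for reconstructed_images, shape (B, H, W, C).
images = jax.random.normal(jax.random.PRNGKey(0), (4, 8, 8, 3))
per_image_losses = jax.vmap(covariance_loss_single)(images)  # shape (B,)
gram_loss = jnp.mean(per_image_losses)
```

Despite the `gram_loss` name, this penalizes the channel covariance rather than a Gram matrix of raw features; the two coincide only for zero-mean features, up to the 1/(N-1) normalization inside `jnp.cov`.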
@@ -207,6 +224,7 @@ class VQGANModel(flax.struct.PyTreeNode):
             + (quantizer_loss * FLAGS.model['quantizer_loss_ratio']) \
             + (d_loss_for_vae * FLAGS.model['g_adversarial_loss_weight']) \
             + (perceptual_loss * FLAGS.model['perceptual_loss_weight']) \
+            + gram_loss
             #+ (smooth_loss * FLAGS.model['pl_weight'] )
         codebook_usage = result_dict['usage'] if 'usage' in result_dict else 0.0

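This hunk splices `gram_loss` into the generator's total loss, unweighted, while the neighboring terms are scaled by `FLAGS.model[...]` weights. For reference, a minimal sketch of the same combination pattern, with hypothetical weight and loss values standing in for the ones defined elsewhere in train.py:

```python
# Hypothetical numbers; only the combination pattern mirrors train.py.
weights = {
    'quantizer_loss_ratio': 1.0,
    'g_adversarial_loss_weight': 0.1,
    'perceptual_loss_weight': 0.1,
}
quantizer_loss, d_loss_for_vae, perceptual_loss, gram_loss = 0.5, 0.3, 0.2, 0.01

loss = (quantizer_loss * weights['quantizer_loss_ratio']) \
     + (d_loss_for_vae * weights['g_adversarial_loss_weight']) \
     + (perceptual_loss * weights['perceptual_loss_weight']) \
     + gram_loss
```

Giving the new term its own `FLAGS.model` weight instead of the hard-coded `* 1` factor would make its scale tunable from config, which the "crank it up" comment in the first hunk suggests will be needed.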
@@ -218,6 +236,7 @@ class VQGANModel(flax.struct.PyTreeNode):
             'perceptual_loss': perceptual_loss,
             'quantizer_loss': quantizer_loss,
             'codebook_usage': codebook_usage,
+            'cov loss': gram_loss
             #'pl_loss': smooth_loss,
         }

@@ -581,6 +600,8 @@ def main(_):

         model, update_info = model.update(batch_images)

+        print(update_info)
+
         if i % FLAGS.log_interval == 0:
             update_info = jax.tree.map(lambda x: x.mean(), update_info)
             train_metrics = {f'training/{k}': v for k, v in update_info.items()}
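The added `print(update_info)` sits before the `log_interval` gate, so it fires on every step and forces a host transfer of the metric arrays; presumably a temporary debugging aid for the new loss. For reference, a small sketch of the aggregation path it feeds, with a hypothetical `update_info` dict (note the new metric key `'cov loss'` contains a space, so it will surface as `training/cov loss`):

```python
import jax
import jax.numpy as jnp

# Hypothetical metrics standing in for update_info.
update_info = {'cov loss': jnp.array([0.2, 0.4]),
               'quantizer_loss': jnp.array([0.9, 1.1])}

# Mirrors the logging branch: average every leaf, then prefix keys for the logger.
update_info = jax.tree.map(lambda x: x.mean(), update_info)
train_metrics = {f'training/{k}': v for k, v in update_info.items()}
# {'training/cov loss': Array(0.3, ...), 'training/quantizer_loss': Array(1.0, ...)}
```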