Make changes

Files changed (5) hide show

dataset/celeba.py +6 -2
dataset/dataset.py +142 -0
models/blocks.py +252 -260
models/vqvae.py +101 -98
train_vqvae.py +4 -5

dataset/celeba.py CHANGED Viewed

@@ -8,11 +8,12 @@ from PIL import Image
 class ParquetImageDataset(Dataset):
-    def __init__(self, parquet_files, transform=None, im_size=256):
         self.data = pd.concat([pd.read_parquet(file)
                               for file in parquet_files], ignore_index=True)
         self.transform = transform
         self.im_size = im_size
     def __len__(self):
         return len(self.data)
@@ -27,7 +28,10 @@ class ParquetImageDataset(Dataset):
         ])(image)
         image.close()
         im_tensor = (2 * im_tensor) - 1  # type: ignore
-        return im_tensor, caption
 def create_dataloader(parquet_dir, batch_size=32, shuffle=True, num_workers=4):

 class ParquetImageDataset(Dataset):
+    def __init__(self, parquet_files, transform=None, im_size=256,condition_config=None):
         self.data = pd.concat([pd.read_parquet(file)
                               for file in parquet_files], ignore_index=True)
         self.transform = transform
         self.im_size = im_size
+        self.condition_types = [] if condition_config is None else condition_config['condition_types']
     def __len__(self):
         return len(self.data)
         ])(image)
         image.close()
         im_tensor = (2 * im_tensor) - 1  # type: ignore
+        if len(self.condition_types) == 0:
+            return im_tensor
+        else:
+            return im_tensor, caption
 def create_dataloader(parquet_dir, batch_size=32, shuffle=True, num_workers=4):

dataset/dataset.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import glob
+import os
+import random
+import torch
+import torchvision
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from torch.utils.data.dataset import Dataset
+class CelebDataset(Dataset):
+    r"""
+    Dataset over the images stored under ``<im_path>/CelebA-HQ-img``.
+    Images are resized, centre cropped to ``im_size`` and rescaled to the
+    -1 to 1 range when they are fetched.
+    When condition types are configured, every item additionally carries a
+    dict with a ``'text'`` caption and/or an ``'image'`` mask tensor.
+    This can be replaced by any other dataset, as long as all the images
+    are under one directory.
+    """
+    def __init__(self, split, im_path, im_size=256, im_channels=3, im_ext='jpg',
+                 use_latents=False, latent_path=None, condition_config=None):
+        r"""
+        Stores the configuration and indexes the image files under ``im_path``.
+        :param split: Stored on the instance; not read anywhere else in this class
+        :param im_path: Root directory that contains the ``CelebA-HQ-img`` folder
+        :param im_size: Size handed to the Resize and CenterCrop transforms
+        :param im_channels: Stored on the instance; not read anywhere else in this class
+        :param im_ext: Stored on the instance; load_images globs png/jpg/jpeg regardless of it
+        :param use_latents: Accepted but not read in this version of the class
+        :param latent_path: Accepted but not read in this version of the class
+        :param condition_config: Optional dict. ``'condition_types'`` is read from it and,
+        when ``'image'`` is one of the types, ``'image_condition_config'`` as well
+        """
+        self.split = split
+        self.im_size = im_size
+        self.im_channels = im_channels
+        self.im_ext = im_ext
+        self.im_path = im_path
+        # NOTE(review): the use_latents / latent_path arguments are never read.
+        # use_latents is forced to False and latent_maps stays None, so as
+        # written __getitem__ always takes the image branch.
+        self.latent_maps = None
+        self.use_latents = False
+        # No condition config means an unconditional dataset (empty list).
+        self.condition_types = [] if condition_config is None else condition_config['condition_types']
+        # Filled by load_images, and only when 'image' conditioning is enabled.
+        self.idx_to_cls_map = {}
+        self.cls_to_idx_map = {}
+        if 'image' in self.condition_types:
+            # Dimensions of the mask tensor built by get_mask.
+            self.mask_channels = condition_config['image_condition_config']['image_condition_input_channels']
+            self.mask_h = condition_config['image_condition_config']['image_condition_h']
+            self.mask_w = condition_config['image_condition_config']['image_condition_w']
+        self.images, self.texts, self.masks = self.load_images(im_path)
+    def load_images(self, im_path):
+        r"""
+        Collects the paths of all png/jpg/jpeg files under
+        ``<im_path>/CelebA-HQ-img`` together with their conditioning data.
+        Image files are not opened here; only their paths are stored.
+        :param im_path: Root directory of the dataset
+        :return: Tuple ``(ims, texts, masks)`` of image paths, per-image caption
+        lists and mask paths. ``texts`` / ``masks`` stay empty when the
+        matching condition type is not enabled
+        """
+        assert os.path.exists(
+            im_path), "images path {} does not exist".format(im_path)
+        ims = []
+        fnames = glob.glob(os.path.join(
+            im_path, 'CelebA-HQ-img/*.{}'.format('png')))
+        fnames += glob.glob(os.path.join(im_path,
+                            'CelebA-HQ-img/*.{}'.format('jpg')))
+        fnames += glob.glob(os.path.join(im_path,
+                            'CelebA-HQ-img/*.{}'.format('jpeg')))
+        texts = []
+        masks = []
+        if 'image' in self.condition_types:
+            # Mask class names. get_mask writes the class stored at position
+            # idx of this list into channel idx of the mask tensor.
+            label_list = ['skin', 'nose', 'eye_g', 'l_eye', 'r_eye', 'l_brow', 'r_brow', 'l_ear', 'r_ear', 'mouth',
+                          'u_lip', 'l_lip', 'hair', 'hat', 'ear_r', 'neck_l', 'neck', 'cloth']
+            self.idx_to_cls_map = {idx: label_list[idx]
+                                   for idx in range(len(label_list))}
+            self.cls_to_idx_map = {
+                label_list[idx]: idx for idx in range(len(label_list))}
+        for fname in tqdm(fnames):
+            ims.append(fname)
+            if 'text' in self.condition_types:
+                # Captions live in celeba-caption/<file name up to the first dot>.txt,
+                # one caption per line.
+                im_name = os.path.split(fname)[1].split('.')[0]
+                captions_im = []
+                with open(os.path.join(im_path, 'celeba-caption/{}.txt'.format(im_name))) as f:
+                    for line in f.readlines():
+                        captions_im.append(line.strip())
+                texts.append(captions_im)
+            if 'image' in self.condition_types:
+                # The name is parsed as an int before formatting, so any leading
+                # zeros are dropped in the mask file name; a non-numeric image
+                # name raises ValueError here.
+                im_name = int(os.path.split(fname)[1].split('.')[0])
+                masks.append(os.path.join(
+                    im_path, 'CelebAMask-HQ-mask', '{}.png'.format(im_name)))
+        # NOTE(review): these checks only compare list lengths. One entry is
+        # appended per image above, and mask paths are not tested for existence.
+        if 'text' in self.condition_types:
+            assert len(texts) == len(
+                ims), "Condition Type Text but could not find captions for all images"
+        if 'image' in self.condition_types:
+            assert len(masks) == len(
+                ims), "Condition Type Image but could not find masks for all images"
+        print('Found {} images'.format(len(ims)))
+        print('Found {} masks'.format(len(masks)))
+        print('Found {} captions'.format(len(texts)))
+        return ims, texts, masks
+    def get_mask(self, index):
+        r"""
+        Loads the mask image for the given index and converts it into a
+        tensor with one binary channel per class.
+        A pixel whose value is ``k + 1`` sets channel ``k`` to 1 at that
+        position; every other entry stays 0.
+        :param index: Position of the sample in ``self.masks``
+        :return: Float tensor of shape (mask_channels, mask_h, mask_w)
+        """
+        # NOTE(review): unlike __getitem__, the opened image is not explicitly
+        # closed here.
+        mask_im = Image.open(self.masks[index])
+        mask_im = np.array(mask_im)
+        # assumes the stored mask is a single-channel mask_h x mask_w image and
+        # that mask_channels covers every class index - TODO confirm
+        im_base = np.zeros((self.mask_h, self.mask_w, self.mask_channels))
+        for orig_idx in range(len(self.idx_to_cls_map)):
+            im_base[mask_im == (orig_idx+1), orig_idx] = 1
+        # (H, W, C) -> (C, H, W)
+        mask = torch.from_numpy(im_base).permute(2, 0, 1).float()
+        return mask
+    def __len__(self):
+        r"""
+        :return: Number of image paths collected by load_images
+        """
+        return len(self.images)
+    def __getitem__(self, index):
+        r"""
+        Builds the sample at the given index.
+        :param index: Position of the sample in ``self.images``
+        :return: The image tensor (or latent) on its own when no condition
+        types are configured, otherwise a tuple of it and the ``cond_inputs`` dict
+        """
+        ######## Set Conditioning Info ########
+        cond_inputs = {}
+        if 'text' in self.condition_types:
+            # One caption is drawn at random from this image's caption list.
+            cond_inputs['text'] = random.sample(self.texts[index], k=1)[0]
+        if 'image' in self.condition_types:
+            mask = self.get_mask(index)
+            cond_inputs['image'] = mask
+        #######################################
+        if self.use_latents:
+            # latent_maps is looked up by image path.
+            latent = self.latent_maps[self.images[index]]
+            if len(self.condition_types) == 0:
+                return latent
+            else:
+                return latent, cond_inputs
+        else:
+            im = Image.open(self.images[index])
+            im_tensor = torchvision.transforms.Compose([
+                torchvision.transforms.Resize(self.im_size),
+                torchvision.transforms.CenterCrop(self.im_size),
+                torchvision.transforms.ToTensor(),
+            ])(im)
+            im.close()
+            # Convert input to -1 to 1 range.
+            im_tensor = (2 * im_tensor) - 1
+            if len(self.condition_types) == 0:
+                return im_tensor
+            else:
+                return im_tensor, cond_inputs

models/blocks.py CHANGED Viewed

@@ -1,92 +1,99 @@
-from re import A
 import torch
 import torch.nn as nn
 def get_time_embedding(time_steps, temb_dim):
-    assert time_steps % 2 == 0, "time embedding dimension must be divisible by 2"
     factor = 10000 ** ((torch.arange(
         start=0, end=temb_dim // 2, dtype=torch.float32, device=time_steps.device) / (temb_dim // 2))
     )
     # pos / factor
-    # time_steps B -> B, 1 -> B, temb_dim
     t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
     t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)
     return t_emb
 class DownBlock(nn.Module):
     """
-    Down Block that down samples the image, flows like this:
-    1) Resnet block with time embedding
-    2) Self Attention block
-    3) Down Sample
-    """
-    def __init__(self, in_channels, out_channels, t_emd_dim, down_sample, num_heads, num_layers, attn, norm_channels, cross_attn=False,
-                 context_dim=None):
         super().__init__()
         self.down_sample = down_sample
-        self.cross_attn = cross_attn
         self.context_dim = context_dim
         self.cross_attn = cross_attn
-        self.t_emb_dim = t_emd_dim
-        self.num_layers = num_layers
-        self.attn = attn
-        self.resnet_conv_first = nn.ModuleList([
-            nn.Sequential(
-                nn.GroupNorm(norm_channels, in_channels if i ==
-                             0 else out_channels),
-                nn.SiLU(),
-                nn.Conv2d(in_channels=in_channels if i == 0 else out_channels,
-                          out_channels=out_channels, kernel_size=3, stride=1, padding=1)
-            ) for i in range(num_layers)
-        ])
         if self.t_emb_dim is not None:
-            self.time_embd_layers = nn.ModuleList([
                 nn.Sequential(
                     nn.SiLU(),
                     nn.Linear(self.t_emb_dim, out_channels)
                 )
                 for _ in range(num_layers)
             ])
-        self.resnet_conv_second = nn.ModuleList([
-            nn.Sequential(
-                nn.GroupNorm(norm_channels, out_channels),
-                nn.SiLU(),
-                nn.Conv2d(in_channels, out_channels,
-                          kernel_size=3, stride=1, padding=1),
-            )
-            for _ in range(num_layers)
-        ])
         if self.attn:
             self.attention_norms = nn.ModuleList(
                 [nn.GroupNorm(norm_channels, out_channels)
                  for _ in range(num_layers)]
             )
-            self.attention = nn.ModuleList(
-                [nn.MultiheadAttention(
-                    out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
             )
         if self.cross_attn:
-            assert context_dim is not None, "Context Dimension must be passed to cross attention"
-            self.cross_attn_norms = nn.ModuleList(
                 [nn.GroupNorm(norm_channels, out_channels)
                  for _ in range(num_layers)]
             )
-            self.cross_attention = nn.ModuleList(
-                [nn.MultiheadAttention(
-                    out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
             )
             self.context_proj = nn.ModuleList(
                 [nn.Linear(context_dim, out_channels)
                  for _ in range(num_layers)]
@@ -94,177 +101,173 @@ class DownBlock(nn.Module):
         self.residual_input_conv = nn.ModuleList(
             [
-                nn.Conv2d(in_channels=in_channels if i == 0 else out_channels,
-                          out_channels=out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
-        self.resnet_down_conv = nn.Conv2d(out_channels, out_channels,
                                           4, 2, 1) if self.down_sample else nn.Identity()
     def forward(self, x, t_emb=None, context=None):
         out = x
         for i in range(self.num_layers):
-            # Resnet Block
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
-                out = out + self.time_embd_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             out = out + self.residual_input_conv[i](resnet_input)
-            # Self Attention
             if self.attn:
                 batch_size, channels, h, w = out.shape
-                in_attn = out.reshape(batch_size, channels, h*w)
                 in_attn = self.attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
-                out_attn, _ = self.attention[i](in_attn, in_attn, in_attn)
-                out_attn = out.transpose(1, 2).reshape(
-                    batch_size, channels, h, w)
                 out = out + out_attn
-            # Cross Attention
             if self.cross_attn:
-                assert context is not None, "Context must be given for cross_attn"
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
                 in_attn = self.cross_attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
                 assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
                 context_proj = self.context_proj[i](context)
-                out_attn, _ = self.cross_attentions[i](
-                    in_attn, context_proj, context_proj)
-                out_attn = out_attn.transpose(1, 2).reshape(
-                    batch_size, channels, h, w)
                 out = out + out_attn
-        out = self.resnet_down_conv(out)
         return out
 class MidBlock(nn.Module):
     """
-    Mid Block that works with same dimensions, flows like this:
-    1) Resnet block with time embedding
-    2) Self Attention block
-    3) Resnet block with time embedding
-    """
-    def __init__(self, in_channels, out_channels, t_emb_dim, num_heads, num_layers, norm_dim, cross_attn=None, context_dim=None):
         super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
         self.t_emb_dim = t_emb_dim
-        self.cross_attn = cross_attn
         self.context_dim = context_dim
-        self.num_layers = num_layers
-        self.resnet_conv_one = nn.ModuleList([
-            nn.Sequential(
-                nn.GroupNorm(norm_dim, in_channels if i ==
-                             0 else out_channels),
-                nn.SiLU(),
-                nn.Conv2d(in_channels if i == 0 else out_channels,
-                          out_channels, 3, 1, 1)
-            )
-            for i in range(num_layers + 1)
-        ])
         if self.t_emb_dim is not None:
-            self.time_emb_layers = nn.ModuleList([
                 nn.Sequential(
                     nn.SiLU(),
                     nn.Linear(t_emb_dim, out_channels)
                 )
                 for _ in range(num_layers + 1)
             ])
-        self.resnet_conv_two = nn.ModuleList([
-            nn.Sequential(
-                nn.GroupNorm(norm_dim, out_channels),
-                nn.SiLU(),
-                nn.Conv2d(out_channels, out_channels, 3, 1, 1)
-            ) for _ in range(num_layers + 1)
-        ])
         self.attention_norms = nn.ModuleList(
-            [nn.GroupNorm(norm_dim, out_channels) for _ in range(num_layers)]
         )
-        self.attention_heads = nn.ModuleList(
             [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
              for _ in range(num_layers)]
         )
         if self.cross_attn:
-            assert context_dim is not None, "Context must be given for cross attn"
-            self.cross_attn_norms = nn.ModuleList(
-                [nn.GroupNorm(norm_dim, out_channels)
                  for _ in range(num_layers)]
             )
-            self.cross_attn = nn.ModuleList(
-                [nn.MultiheadAttention(
-                    out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
             )
-            self.context_proj = nn.ModuleList([
-                nn.Conv2d(in_channels if i == 0 else out_channels,
-                          out_channels, kernel_size=1)
                 for i in range(num_layers + 1)
-            ])
-        self.residual_input_conv = nn.ModuleList([
-            nn.Conv2d(in_channels if i == 0 else out_channels,
-                      out_channels, kernel_size=1)
-            for i in range(num_layers + 1)
-        ])
     def forward(self, x, t_emb=None, context=None):
         out = x
         resnet_input = out
-        out = self.resnet_conv_one[0](out)
         if self.t_emb_dim is not None:
-            out = out + self.time_emb_layers[0](t_emb)[:, :, None, None]
-        out = self.resnet_conv_two[0](out)
         out = out + self.residual_input_conv[0](resnet_input)
         for i in range(self.num_layers):
             batch_size, channels, h, w = out.shape
-            in_attn = out.reshape(batch_size, channels, h*w)
             in_attn = self.attention_norms[i](in_attn)
             in_attn = in_attn.transpose(1, 2)
-            out_attn, _ = self.attention_heads[i](in_attn, in_attn, in_attn)
-            out_attn = out_attn.reshape(batch_size, channels, h, w)
             out = out + out_attn
             if self.cross_attn:
-                assert context is not None, "Context needed when using cross attn"
                 batch_size, channels, h, w = out.shape
-                in_attn = out.reshape(batch_size, channels, h*w)
-                in_attn = self.cross_attn_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
                 assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
                 context_proj = self.context_proj[i](context)
-                out_attn, _ = self.cross_attn[i](
-                    in_attn, context_proj, context_proj)
-                out_attn = out_attn.transpose(1, 2).reshape(
-                    batch_size, channels, h, w)
                 out = out + out_attn
             resnet_input = out
-            out = self.resnet_conv_one[i+1](out)
             if self.t_emb_dim is not None:
-                out = out + self.time_emb_layers[i+1](t_emb)[:, :, None, None]
-            out = out + self.resnet_conv_two[i+1](out)
-            out = out + self.residual_input_conv[i+1](resnet_input)
         return out
-class UpBlockUnet(nn.Module):
     r"""
     Up conv block with attention.
     Sequence of following blocks
@@ -273,20 +276,18 @@ class UpBlockUnet(nn.Module):
     2. Resnet block with time embedding
     3. Attention Block
     """
-    def __init__(self, in_channels, out_channels, t_emb_dim, up_sample,
-                 num_heads, num_layers, norm_channels, cross_attn=False, context_dim=None):
         super().__init__()
         self.num_layers = num_layers
         self.up_sample = up_sample
         self.t_emb_dim = t_emb_dim
-        self.cross_attn = cross_attn
-        self.context_dim = context_dim
         self.resnet_conv_first = nn.ModuleList(
             [
                 nn.Sequential(
-                    nn.GroupNorm(norm_channels, in_channels if i ==
-                                 0 else out_channels),
                     nn.SiLU(),
                     nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
                               padding=1),
@@ -294,7 +295,7 @@ class UpBlockUnet(nn.Module):
                 for i in range(num_layers)
             ]
         )
         if self.t_emb_dim is not None:
             self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
@@ -303,104 +304,73 @@ class UpBlockUnet(nn.Module):
                 )
                 for _ in range(num_layers)
             ])
         self.resnet_conv_second = nn.ModuleList(
             [
                 nn.Sequential(
                     nn.GroupNorm(norm_channels, out_channels),
                     nn.SiLU(),
-                    nn.Conv2d(out_channels, out_channels,
-                              kernel_size=3, stride=1, padding=1),
                 )
                 for _ in range(num_layers)
             ]
         )
-        self.attention_norms = nn.ModuleList(
-            [
-                nn.GroupNorm(norm_channels, out_channels)
-                for _ in range(num_layers)
-            ]
-        )
-        self.attentions = nn.ModuleList(
-            [
-                nn.MultiheadAttention(
-                    out_channels, num_heads, batch_first=True)
-                for _ in range(num_layers)
-            ]
-        )
-        if self.cross_attn:
-            assert context_dim is not None, "Context Dimension must be passed for cross attention"
-            self.cross_attention_norms = nn.ModuleList(
-                [nn.GroupNorm(norm_channels, out_channels)
-                 for _ in range(num_layers)]
-            )
-            self.cross_attentions = nn.ModuleList(
-                [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
-                 for _ in range(num_layers)]
             )
-            self.context_proj = nn.ModuleList(
-                [nn.Linear(context_dim, out_channels)
-                 for _ in range(num_layers)]
             )
         self.residual_input_conv = nn.ModuleList(
             [
-                nn.Conv2d(in_channels if i == 0 else out_channels,
-                          out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
-        self.up_sample_conv = nn.ConvTranspose2d(in_channels // 2, in_channels // 2,
                                                  4, 2, 1) \
             if self.up_sample else nn.Identity()
-    def forward(self, x, out_down=None, t_emb=None, context=None):
         x = self.up_sample_conv(x)
         if out_down is not None:
             x = torch.cat([x, out_down], dim=1)
         out = x
         for i in range(self.num_layers):
-            # Resnet
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
                 out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             out = out + self.residual_input_conv[i](resnet_input)
             # Self Attention
-            batch_size, channels, h, w = out.shape
-            in_attn = out.reshape(batch_size, channels, h * w)
-            in_attn = self.attention_norms[i](in_attn)
-            in_attn = in_attn.transpose(1, 2)
-            out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
-            out_attn = out_attn.transpose(1, 2).reshape(
-                batch_size, channels, h, w)
-            out = out + out_attn
-            # Cross Attention
-            if self.cross_attn:
-                assert context is not None, "context cannot be None if cross attention layers are used"
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
-                in_attn = self.cross_attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
-                assert len(context.shape) == 3, \
-                    "Context shape does not match B,_,CONTEXT_DIM"
-                assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim, \
-                    "Context shape does not match B,_,CONTEXT_DIM"
-                context_proj = self.context_proj[i](context)
-                out_attn, _ = self.cross_attentions[i](
-                    in_attn, context_proj, context_proj)
-                out_attn = out_attn.transpose(1, 2).reshape(
-                    batch_size, channels, h, w)
                 out = out + out_attn
         return out
-class UpBlock(nn.Module):
     r"""
     Up conv block with attention.
     Sequence of following blocks
@@ -409,19 +379,19 @@ class UpBlock(nn.Module):
     2. Resnet block with time embedding
     3. Attention Block
     """
-    def __init__(self, in_channels, out_channels, t_emb_dim,
-                 up_sample, num_heads, num_layers, attn, norm_channels):
         super().__init__()
         self.num_layers = num_layers
         self.up_sample = up_sample
         self.t_emb_dim = t_emb_dim
-        self.attn = attn
         self.resnet_conv_first = nn.ModuleList(
             [
                 nn.Sequential(
-                    nn.GroupNorm(norm_channels, in_channels if i ==
-                                 0 else out_channels),
                     nn.SiLU(),
                     nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
                               padding=1),
@@ -429,7 +399,7 @@ class UpBlock(nn.Module):
                 for i in range(num_layers)
             ]
         )
         if self.t_emb_dim is not None:
             self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
@@ -438,71 +408,93 @@ class UpBlock(nn.Module):
                 )
                 for _ in range(num_layers)
             ])
         self.resnet_conv_second = nn.ModuleList(
             [
                 nn.Sequential(
                     nn.GroupNorm(norm_channels, out_channels),
                     nn.SiLU(),
-                    nn.Conv2d(out_channels, out_channels,
-                              kernel_size=3, stride=1, padding=1),
                 )
                 for _ in range(num_layers)
             ]
         )
-        if self.attn:
-            self.attention_norms = nn.ModuleList(
-                [
-                    nn.GroupNorm(norm_channels, out_channels)
-                    for _ in range(num_layers)
-                ]
             )
-            self.attentions = nn.ModuleList(
-                [
-                    nn.MultiheadAttention(
-                        out_channels, num_heads, batch_first=True)
-                    for _ in range(num_layers)
-                ]
             )
         self.residual_input_conv = nn.ModuleList(
             [
-                nn.Conv2d(in_channels if i == 0 else out_channels,
-                          out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
-        self.up_sample_conv = nn.ConvTranspose2d(in_channels, in_channels,
                                                  4, 2, 1) \
             if self.up_sample else nn.Identity()
-    def forward(self, x, out_down=None, t_emb=None):
-        # Upsample
         x = self.up_sample_conv(x)
-        # Concat with Downblock output
         if out_down is not None:
             x = torch.cat([x, out_down], dim=1)
         out = x
         for i in range(self.num_layers):
-            # Resnet Block
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
                 out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             out = out + self.residual_input_conv[i](resnet_input)
             # Self Attention
-            if self.attn:
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
-                in_attn = self.attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
-                out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
-                out_attn = out_attn.transpose(1, 2).reshape(
-                    batch_size, channels, h, w)
                 out = out + out_attn
         return out

 import torch
 import torch.nn as nn
 def get_time_embedding(time_steps, temb_dim):
+    r"""
+    Convert time steps tensor into an embedding using the
+    sinusoidal time embedding formula.
+    The first half of the embedding holds the sine terms and the second
+    half the cosine terms (they are concatenated, not interleaved).
+    :param time_steps: 1D tensor of length batch size
+    :param temb_dim: Dimension of the embedding; must be even
+    :return: BxD embedding representation of B time steps
+    """
+    assert temb_dim % 2 == 0, "time embedding dimension must be divisible by 2"
+    # factor = 10000^(2i/d_model), for i in [0, temb_dim // 2)
     factor = 10000 ** ((torch.arange(
         start=0, end=temb_dim // 2, dtype=torch.float32, device=time_steps.device) / (temb_dim // 2))
     )
     # pos / factor
+    # time_steps: (B,) -> (B, 1) -> (B, temb_dim // 2)
     t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
     t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)
     return t_emb
 class DownBlock(nn.Module):
+    r"""
+    Down conv block with attention.
+    Sequence of following blocks
+    1. Resnet block with time embedding
+    2. Attention block
+    3. Downsample
     """
+    def __init__(self, in_channels, out_channels, t_emb_dim,
+                 down_sample, num_heads, num_layers, attn, norm_channels, cross_attn=False, context_dim=None):
         super().__init__()
+        self.num_layers = num_layers
         self.down_sample = down_sample
+        self.attn = attn
         self.context_dim = context_dim
         self.cross_attn = cross_attn
+        self.t_emb_dim = t_emb_dim
+        self.resnet_conv_first = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
+                    nn.SiLU(),
+                    nn.Conv2d(in_channels if i == 0 else out_channels, out_channels,
+                              kernel_size=3, stride=1, padding=1),
+                )
+                for i in range(num_layers)
+            ]
+        )
         if self.t_emb_dim is not None:
+            self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
                     nn.SiLU(),
                     nn.Linear(self.t_emb_dim, out_channels)
                 )
                 for _ in range(num_layers)
             ])
+        self.resnet_conv_second = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(norm_channels, out_channels),
+                    nn.SiLU(),
+                    nn.Conv2d(out_channels, out_channels,
+                              kernel_size=3, stride=1, padding=1),
+                )
+                for _ in range(num_layers)
+            ]
+        )
         if self.attn:
             self.attention_norms = nn.ModuleList(
                 [nn.GroupNorm(norm_channels, out_channels)
                  for _ in range(num_layers)]
             )
+            self.attentions = nn.ModuleList(
+                [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                 for _ in range(num_layers)]
             )
         if self.cross_attn:
+            assert context_dim is not None, "Context Dimension must be passed for cross attention"
+            self.cross_attention_norms = nn.ModuleList(
                 [nn.GroupNorm(norm_channels, out_channels)
                  for _ in range(num_layers)]
             )
+            self.cross_attentions = nn.ModuleList(
+                [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                 for _ in range(num_layers)]
             )
             self.context_proj = nn.ModuleList(
                 [nn.Linear(context_dim, out_channels)
                  for _ in range(num_layers)]
         self.residual_input_conv = nn.ModuleList(
             [
+                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
+        self.down_sample_conv = nn.Conv2d(out_channels, out_channels,
                                           4, 2, 1) if self.down_sample else nn.Identity()
     def forward(self, x, t_emb=None, context=None):
         out = x
         for i in range(self.num_layers):
+            # Resnet block of Unet
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
+                out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             out = out + self.residual_input_conv[i](resnet_input)
             if self.attn:
+                # Attention block of Unet
                 batch_size, channels, h, w = out.shape
+                in_attn = out.reshape(batch_size, channels, h * w)
                 in_attn = self.attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
+                out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
             if self.cross_attn:
+                assert context is not None, "context cannot be None if cross attention layers are used"
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
                 in_attn = self.cross_attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
                 assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
                 context_proj = self.context_proj[i](context)
+                out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
+        # Downsample
+        out = self.down_sample_conv(out)
         return out
 class MidBlock(nn.Module):
+    r"""
+    Mid conv block with attention.
+    Sequence of following blocks
+    1. Resnet block with time embedding
+    2. Attention block
+    3. Resnet block with time embedding
     """
     # Constructor: builds `num_layers + 1` resnet stages and `num_layers`
     # self-attention stages (plus optional cross-attention stages), so that
     # forward() can run resnet -> (attention -> resnet) * num_layers.
     # `cross_attn` defaults to None, which is falsy: cross-attention modules
     # are only created when a truthy value is passed.
+    def __init__(self, in_channels, out_channels, t_emb_dim, num_heads, num_layers, norm_channels, cross_attn=None, context_dim=None):
         super().__init__()
+        self.num_layers = num_layers
         self.t_emb_dim = t_emb_dim
         self.context_dim = context_dim
+        self.cross_attn = cross_attn
         # First half of every resnet stage: GroupNorm -> SiLU -> 3x3 conv.
         # Only stage 0 consumes `in_channels`; later stages are out -> out.
+        self.resnet_conv_first = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
+                    nn.SiLU(),
+                    nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
+                              padding=1),
+                )
+                for i in range(num_layers + 1)
+            ]
+        )
         # Time-embedding projections (SiLU -> Linear to `out_channels`), one per
         # resnet stage. Not created at all when `t_emb_dim` is None.
         if self.t_emb_dim is not None:
+            self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
                     nn.SiLU(),
                     nn.Linear(t_emb_dim, out_channels)
                 )
                 for _ in range(num_layers + 1)
             ])
         # Second half of every resnet stage: GroupNorm -> SiLU -> 3x3 conv.
+        self.resnet_conv_second = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(norm_channels, out_channels),
+                    nn.SiLU(),
+                    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
+                )
+                for _ in range(num_layers + 1)
+            ]
+        )
         # Self-attention is unconditional in this block: one norm and one
         # batch_first MultiheadAttention per layer.
         self.attention_norms = nn.ModuleList(
+            [nn.GroupNorm(norm_channels, out_channels)
+             for _ in range(num_layers)]
         )
+        self.attentions = nn.ModuleList(
             [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
              for _ in range(num_layers)]
         )
         # Optional cross-attention: norm, attention and a Linear that maps the
         # context from `context_dim` to `out_channels`, one of each per layer.
         # NOTE(review): the check below is an `assert`, so it is skipped when
         # Python runs with -O.
         if self.cross_attn:
+            assert context_dim is not None, "Context Dimension must be passed for cross attention"
+            self.cross_attention_norms = nn.ModuleList(
+                [nn.GroupNorm(norm_channels, out_channels)
                  for _ in range(num_layers)]
             )
+            self.cross_attentions = nn.ModuleList(
+                [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                 for _ in range(num_layers)]
             )
+            self.context_proj = nn.ModuleList(
+                [nn.Linear(context_dim, out_channels)
+                 for _ in range(num_layers)]
+            )
         # 1x1 convs applied to each resnet stage's input so that it has
         # `out_channels` channels before being added back as the residual.
+        self.residual_input_conv = nn.ModuleList(
+            [
+                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
                 for i in range(num_layers + 1)
+            ]
+        )
     # forward: runs resnet stage 0, then for each layer self-attention,
     # optional cross-attention and resnet stage i + 1; returns the result.
     # `t_emb` is only read when `t_emb_dim` is not None; `context` is only
     # read when `cross_attn` is truthy.
     def forward(self, x, t_emb=None, context=None):
         out = x
+        # First resnet block
         resnet_input = out
+        out = self.resnet_conv_first[0](out)
         if self.t_emb_dim is not None:
             # [:, :, None, None] appends two singleton axes so the projected
             # embedding broadcasts over the two spatial axes of `out`.
+            out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
+        out = self.resnet_conv_second[0](out)
         out = out + self.residual_input_conv[0](resnet_input)
         for i in range(self.num_layers):
+            # Attention Block
             batch_size, channels, h, w = out.shape
             # Flatten the two spatial axes into one, normalise, then swap to
             # (batch, h*w, channels) because the attention is batch_first.
+            in_attn = out.reshape(batch_size, channels, h * w)
             in_attn = self.attention_norms[i](in_attn)
             in_attn = in_attn.transpose(1, 2)
+            out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
             # Undo the transpose/flatten and add as a residual.
+            out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
             out = out + out_attn
             if self.cross_attn:
+                assert context is not None, "context cannot be None if cross attention layers are used"
                 batch_size, channels, h, w = out.shape
+                in_attn = out.reshape(batch_size, channels, h * w)
+                in_attn = self.cross_attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
                 # Context must share the batch size of the block input and
                 # have `context_dim` as its last axis.
                 assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
                 # Queries come from the feature map; keys and values are the
                 # projected context.
                 context_proj = self.context_proj[i](context)
+                out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
+            # Resnet Block
             resnet_input = out
+            out = self.resnet_conv_first[i + 1](out)
             if self.t_emb_dim is not None:
+                out = out + self.t_emb_layers[i + 1](t_emb)[:, :, None, None]
+            out = self.resnet_conv_second[i + 1](out)
+            out = out + self.residual_input_conv[i + 1](resnet_input)
         return out
+class UpBlock(nn.Module):
     r"""
     Up conv block with attention.
     Sequence of following blocks
     2. Resnet block with time embedding
     3. Attention Block
     """
+    def __init__(self, in_channels, out_channels, t_emb_dim,
+                 up_sample, num_heads, num_layers, attn, norm_channels):
         super().__init__()
         self.num_layers = num_layers
         self.up_sample = up_sample
         self.t_emb_dim = t_emb_dim
+        self.attn = attn
         self.resnet_conv_first = nn.ModuleList(
             [
                 nn.Sequential(
+                    nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
                     nn.SiLU(),
                     nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
                               padding=1),
                 for i in range(num_layers)
             ]
         )
         if self.t_emb_dim is not None:
             self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
                 )
                 for _ in range(num_layers)
             ])
         self.resnet_conv_second = nn.ModuleList(
             [
                 nn.Sequential(
                     nn.GroupNorm(norm_channels, out_channels),
                     nn.SiLU(),
+                    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
                 )
                 for _ in range(num_layers)
             ]
         )
+        if self.attn:
+            self.attention_norms = nn.ModuleList(
+                [
+                    nn.GroupNorm(norm_channels, out_channels)
+                    for _ in range(num_layers)
+                ]
             )
+            self.attentions = nn.ModuleList(
+                [
+                    nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                    for _ in range(num_layers)
+                ]
             )
         self.residual_input_conv = nn.ModuleList(
             [
+                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
+        self.up_sample_conv = nn.ConvTranspose2d(in_channels, in_channels,
                                                  4, 2, 1) \
             if self.up_sample else nn.Identity()
+    def forward(self, x, out_down=None, t_emb=None):
         # Applies `up_sample_conv` (a ConvTranspose2d or nn.Identity, chosen in
         # __init__), optionally concatenates `out_down` on the channel axis,
         # then runs `num_layers` resnet stages, each followed by self-attention
         # when `self.attn` is truthy. Returns the final feature map.
         # `t_emb` is only read when `self.t_emb_dim` is not None.
+        # Upsample
         x = self.up_sample_conv(x)
+        # Concat with Downblock output
         if out_down is not None:
             x = torch.cat([x, out_down], dim=1)
         out = x
         for i in range(self.num_layers):
+            # Resnet Block
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
                 # Two singleton axes are appended so the projected embedding
                 # broadcasts over the spatial axes of `out`.
                 out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             # 1x1 conv on the stage input gives the residual `out_channels`.
             out = out + self.residual_input_conv[i](resnet_input)
             # Self Attention
+            if self.attn:
                 batch_size, channels, h, w = out.shape
                 # Flatten the spatial axes, normalise, then move channels last
                 # because the attention modules are built with batch_first=True.
                 in_attn = out.reshape(batch_size, channels, h * w)
+                in_attn = self.attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
+                out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
         return out
+class UpBlockUnet(nn.Module):
     r"""
     Up conv block with attention.
     Sequence of following blocks
     2. Resnet block with time embedding
     3. Attention Block
     """
+    def __init__(self, in_channels, out_channels, t_emb_dim, up_sample,
+                 num_heads, num_layers, norm_channels, cross_attn=False, context_dim=None):
         super().__init__()
         self.num_layers = num_layers
         self.up_sample = up_sample
         self.t_emb_dim = t_emb_dim
+        self.cross_attn = cross_attn
+        self.context_dim = context_dim
         self.resnet_conv_first = nn.ModuleList(
             [
                 nn.Sequential(
+                    nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
                     nn.SiLU(),
                     nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=3, stride=1,
                               padding=1),
                 for i in range(num_layers)
             ]
         )
         if self.t_emb_dim is not None:
             self.t_emb_layers = nn.ModuleList([
                 nn.Sequential(
                 )
                 for _ in range(num_layers)
             ])
         self.resnet_conv_second = nn.ModuleList(
             [
                 nn.Sequential(
                     nn.GroupNorm(norm_channels, out_channels),
                     nn.SiLU(),
+                    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
                 )
                 for _ in range(num_layers)
             ]
         )
+        self.attention_norms = nn.ModuleList(
+            [
+                nn.GroupNorm(norm_channels, out_channels)
+                for _ in range(num_layers)
+            ]
+        )
+        self.attentions = nn.ModuleList(
+            [
+                nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                for _ in range(num_layers)
+            ]
+        )
+        if self.cross_attn:
+            assert context_dim is not None, "Context Dimension must be passed for cross attention"
+            self.cross_attention_norms = nn.ModuleList(
+                [nn.GroupNorm(norm_channels, out_channels)
+                 for _ in range(num_layers)]
             )
+            self.cross_attentions = nn.ModuleList(
+                [nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                 for _ in range(num_layers)]
+            )
+            self.context_proj = nn.ModuleList(
+                [nn.Linear(context_dim, out_channels)
+                 for _ in range(num_layers)]
             )
         self.residual_input_conv = nn.ModuleList(
             [
+                nn.Conv2d(in_channels if i == 0 else out_channels, out_channels, kernel_size=1)
                 for i in range(num_layers)
             ]
         )
+        self.up_sample_conv = nn.ConvTranspose2d(in_channels // 2, in_channels // 2,
                                                  4, 2, 1) \
             if self.up_sample else nn.Identity()
+    def forward(self, x, out_down=None, t_emb=None, context=None):
         # Applies `up_sample_conv`, optionally concatenates `out_down` on the
         # channel axis, then for each of `num_layers` layers runs a resnet
         # stage, self-attention (always) and cross-attention (only when
         # `self.cross_attn` is truthy). Returns the final feature map.
         # `t_emb` is only read when `self.t_emb_dim` is not None.
         x = self.up_sample_conv(x)
         if out_down is not None:
             x = torch.cat([x, out_down], dim=1)
         out = x
         for i in range(self.num_layers):
+            # Resnet
             resnet_input = out
             out = self.resnet_conv_first[i](out)
             if self.t_emb_dim is not None:
                 # Two singleton axes are appended so the projected embedding
                 # broadcasts over the spatial axes of `out`.
                 out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_conv_second[i](out)
             out = out + self.residual_input_conv[i](resnet_input)
             # Self Attention
             # Flatten the spatial axes, normalise, move channels last
             # (attention is batch_first), attend, then restore the layout.
+            batch_size, channels, h, w = out.shape
+            in_attn = out.reshape(batch_size, channels, h * w)
+            in_attn = self.attention_norms[i](in_attn)
+            in_attn = in_attn.transpose(1, 2)
+            out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
+            out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
+            out = out + out_attn
+            # Cross Attention
+            if self.cross_attn:
                 # NOTE(review): the checks in this branch are `assert`s, so
                 # they are skipped when Python runs with -O.
+                assert context is not None, "context cannot be None if cross attention layers are used"
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
+                in_attn = self.cross_attention_norms[i](in_attn)
                 in_attn = in_attn.transpose(1, 2)
                 # Context must be 3-D, share the batch size of `x` (which by
                 # now is the upsampled/concatenated tensor) and end in
                 # `context_dim`.
+                assert len(context.shape) == 3, \
+                    "Context shape does not match B,_,CONTEXT_DIM"
+                assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim,\
+                    "Context shape does not match B,_,CONTEXT_DIM"
                 # Queries come from the feature map; keys and values are the
                 # context projected to `out_channels`.
+                context_proj = self.context_proj[i](context)
+                out_attn, _ = self.cross_attentions[i](in_attn, context_proj, context_proj)
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
         return out

models/vqvae.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from models.blocks import DownBlock, UpBlock, MidBlock
 class VQVAE(nn.Module):
@@ -10,122 +10,125 @@ class VQVAE(nn.Module):
         self.mid_channels = model_config['mid_channels']
         self.down_sample = model_config['down_sample']
         self.num_down_layers = model_config['num_down_layers']
-        self.num_up_layers = model_config['num_up_layers']
         self.num_mid_layers = model_config['num_mid_layers']
-        # To disable attn in encoder and decoder blocks
-        self.attns = model_config['attn']
         # Latent Dimension
-        self.z_channels = model_config["z_channels"]
-        self.codebook_size = model_config["codebook_size"]
-        self.norm_channels = model_config["norm_channels"]
-        self.num_heads = model_config["num_heads"]
         assert self.mid_channels[0] == self.down_channels[-1]
         assert self.mid_channels[-1] == self.down_channels[-1]
         assert len(self.down_sample) == len(self.down_channels) - 1
         assert len(self.attns) == len(self.down_channels) - 1
-        self.upsample = list(reversed(self.down_sample))
-        # Encoder
-        self.encoder_conv_one = nn.Conv2d(
-            im_channels, self.down_channels[0], kernel_size=3, padding=1, stride=1)
         self.encoder_layers = nn.ModuleList([])
         for i in range(len(self.down_channels) - 1):
-            self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i+1],
-                                                 t_emd_dim=None, down_sample=self.down_sample[i],
-                                                 num_heads=self.num_heads, num_layers=self.num_down_layers,
-                                                 attn=self.attns[i], norm_channels=self.norm_channels))
-        self.encode_mid_blocks = nn.ModuleList([])
-        for i in range(len(self.down_channels)-1):
-            self.encode_mid_blocks.append(MidBlock(self.down_channels[i], self.down_channels[i+1],
-                                          t_emb_dim=None, num_heads=self.num_heads, num_layers=self.num_mid_layers,
-                                          norm_dim=self.norm_channels))
-        self.encoder_norm_out = nn.GroupNorm(
-            self.norm_channels, self.down_channels[-1])
-        self.encoder_conv_out = nn.Conv2d(
-            self.down_channels[-1], self.z_channels, kernel_size=3, padding=1)
-        # Pre-Quantization Convolution (Before comparing to code blocks to get embedding matrix)
-        self.pre_quant_conv = nn.Conv2d(
-            self.z_channels, self.z_channels, kernel_size=1)
-        # Code book
         self.embedding = nn.Embedding(self.codebook_size, self.z_channels)
-        # Decoder
-        self.post_quant_conv = nn.Conv2d(
-            self.z_channels, self.z_channels, kernel_size=1)
-        self.decoder_conv_out = nn.Conv2d(
-            self.z_channels, self.mid_channels[-1], kernel_size=3, padding=1)
-        # Midblock + UpBlock
-        self.decode_mids = nn.ModuleList([])
         for i in reversed(range(1, len(self.mid_channels))):
-            self.decode_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i-1],
-                                             t_emb_dim=None, num_heads=self.num_heads,
-                                             num_layers=self.num_mid_layers,
-                                             norm_dim=self.norm_channels))
         self.decoder_layers = nn.ModuleList([])
         for i in reversed(range(1, len(self.down_channels))):
-            self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i-1],
-                                       t_emb_dim=None, up_sample=self.down_sample[i-1], num_heads=self.num_heads,
-                                       num_layers=self.num_up_layers,
-                                       attn=self.attns[i-1],
-                                       norm_channels=self.norm_channels))
-        self.decoder_norm_out = nn.GroupNorm(
-            self.norm_channels, self.down_channels[0])
-        self.decoder_conv_out = nn.Conv2d(
-            self.down_channels[0], im_channels, kernel_size=3, padding=1)
     def quantize(self, x):
         B, C, H, W = x.shape
-        # B,C,H,W -> B,H,W,C
         x = x.permute(0, 2, 3, 1)
-        # B,H,W,C -> B, H*W, C
         x = x.reshape(x.size(0), -1, x.size(-1))
-        # Find nearest neighbours/codebook vectors
-        # Distance between  B,H*W,C and B,K,C
-        dist = torch.cdist(
-            x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))
         min_encoding_indices = torch.argmin(dist, dim=-1)
-        # Replace encoder output with codebook vector
-        quant_out = torch.index_select(
-            self.embedding.weight, 0, min_encoding_indices.view(-1))
-        # x -> B*H*W,C
         x = x.reshape((-1, x.size(-1)))
-        commitment_loss = torch.mean((quant_out.detach() - x) ** 2)
         codebook_loss = torch.mean((quant_out - x.detach()) ** 2)
-        quantize_loss = {
-            "codebook_loss": codebook_loss,
-            "commitment_loss": commitment_loss
         }
         # Straight through estimation
-        quant_out = x - (quant_out - x).detach()
-        # quant_out -> B,C,H,W
         quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)
-        min_encoding_indices = min_encoding_indices.reshape(
-            (-1, quant_out.size(-2), quant_out.size(-1)))
-        return quant_out, quantize_loss, min_encoding_indices
     def encode(self, x):
-        out = self.encoder_conv_one(x)
-        for _, down in enumerate(self.encoder_layers):
             out = down(out)
-        for mid in self.encode_mid_blocks:
             out = mid(out)
         out = self.encoder_norm_out(out)
         out = nn.SiLU()(out)
@@ -133,21 +136,21 @@ class VQVAE(nn.Module):
         out = self.pre_quant_conv(out)
         out, quant_losses, _ = self.quantize(out)
         return out, quant_losses
     def decode(self, z):
         out = z
         out = self.post_quant_conv(out)
         out = self.decoder_conv_in(out)
-        for mid in self.decode_mids:
             out = mid(out)
-        for up in self.decoder_layers:
             out = up(out)
         out = self.decoder_norm_out(out)
-        out = nn.SiLU(out)
         out = self.decoder_conv_out(out)
         return out
     def forward(self, x):
         z, quant_losses = self.encode(x)
         out = self.decode(z)

 import torch
 import torch.nn as nn
+from models.blocks import DownBlock, MidBlock, UpBlock
 class VQVAE(nn.Module):
         self.mid_channels = model_config['mid_channels']
         self.down_sample = model_config['down_sample']
         self.num_down_layers = model_config['num_down_layers']
         self.num_mid_layers = model_config['num_mid_layers']
+        self.num_up_layers = model_config['num_up_layers']
+        # To disable attention in Downblock of Encoder and Upblock of Decoder
+        self.attns = model_config['attn_down']
         # Latent Dimension
+        self.z_channels = model_config['z_channels']
+        self.codebook_size = model_config['codebook_size']
+        self.norm_channels = model_config['norm_channels']
+        self.num_heads = model_config['num_heads']
+        # Assertion to validate the channel information
         assert self.mid_channels[0] == self.down_channels[-1]
         assert self.mid_channels[-1] == self.down_channels[-1]
         assert len(self.down_sample) == len(self.down_channels) - 1
         assert len(self.attns) == len(self.down_channels) - 1
+        # Wherever we use downsampling in encoder correspondingly use
+        # upsampling in decoder
+        self.up_sample = list(reversed(self.down_sample))
+        ##################### Encoder ######################
+        self.encoder_conv_in = nn.Conv2d(im_channels, self.down_channels[0], kernel_size=3, padding=(1, 1))
+        # Downblock + Midblock
         self.encoder_layers = nn.ModuleList([])
         for i in range(len(self.down_channels) - 1):
+            self.encoder_layers.append(DownBlock(self.down_channels[i], self.down_channels[i + 1],
+                                                 t_emb_dim=None, down_sample=self.down_sample[i],
+                                                 num_heads=self.num_heads,
+                                                 num_layers=self.num_down_layers,
+                                                 attn=self.attns[i],
+                                                 norm_channels=self.norm_channels))
+        self.encoder_mids = nn.ModuleList([])
+        for i in range(len(self.mid_channels) - 1):
+            self.encoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i + 1],
+                                              t_emb_dim=None,
+                                              num_heads=self.num_heads,
+                                              num_layers=self.num_mid_layers,
+                                              norm_channels=self.norm_channels))
+        self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])
+        self.encoder_conv_out = nn.Conv2d(self.down_channels[-1], self.z_channels, kernel_size=3, padding=1)
+        # Pre Quantization Convolution
+        self.pre_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
+        # Codebook
         self.embedding = nn.Embedding(self.codebook_size, self.z_channels)
+        ##################### Decoder ######################
+        # Post Quantization Convolution
+        self.post_quant_conv = nn.Conv2d(self.z_channels, self.z_channels, kernel_size=1)
+        self.decoder_conv_in = nn.Conv2d(self.z_channels, self.mid_channels[-1], kernel_size=3, padding=(1, 1))
+        # Midblock + Upblock
+        self.decoder_mids = nn.ModuleList([])
         for i in reversed(range(1, len(self.mid_channels))):
+            self.decoder_mids.append(MidBlock(self.mid_channels[i], self.mid_channels[i - 1],
+                                              t_emb_dim=None,
+                                              num_heads=self.num_heads,
+                                              num_layers=self.num_mid_layers,
+                                              norm_channels=self.norm_channels))
         self.decoder_layers = nn.ModuleList([])
         for i in reversed(range(1, len(self.down_channels))):
+            self.decoder_layers.append(UpBlock(self.down_channels[i], self.down_channels[i - 1],
+                                               t_emb_dim=None, up_sample=self.down_sample[i - 1],
+                                               num_heads=self.num_heads,
+                                               num_layers=self.num_up_layers,
+                                               attn=self.attns[i-1],
+                                               norm_channels=self.norm_channels))
+        self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])
+        self.decoder_conv_out = nn.Conv2d(self.down_channels[0], im_channels, kernel_size=3, padding=1)
     def quantize(self, x):
         # Replaces every spatial position of `x` with its nearest row of
         # `self.embedding.weight` (by torch.cdist distance) and returns a
         # 3-tuple: (quantized tensor in the input's B, C, H, W layout,
         # dict with 'codebook_loss' and 'commitment_loss',
         # selected codebook indices reshaped to (-1, H, W)).
         B, C, H, W = x.shape
+        # B, C, H, W -> B, H, W, C
         x = x.permute(0, 2, 3, 1)
+        # B, H, W, C -> B, H*W, C
         x = x.reshape(x.size(0), -1, x.size(-1))
+        # Find nearest embedding/codebook vector
+        # dist between (B, H*W, C) and (B, K, C) -> (B, H*W, K)
         # The codebook is repeated once per batch element to give cdist a
         # batched second operand.
+        dist = torch.cdist(x, self.embedding.weight[None, :].repeat((x.size(0), 1, 1)))
+        # (B, H*W)
         min_encoding_indices = torch.argmin(dist, dim=-1)
+        # Replace encoder output with nearest codebook
+        # quant_out -> B*H*W, C
+        quant_out = torch.index_select(self.embedding.weight, 0, min_encoding_indices.view(-1))
+        # x -> B*H*W, C
         x = x.reshape((-1, x.size(-1)))
         # Both losses are the same mean squared difference; they differ only
         # in which side is detached. The commitment term detaches the codebook
         # vectors, the codebook term detaches the encoder output.
         # NOTE(review): `commmitment_loss` is misspelled (three m's). The dict
         # key below is spelled correctly, so behaviour is unaffected.
+        commmitment_loss = torch.mean((quant_out.detach() - x) ** 2)
         codebook_loss = torch.mean((quant_out - x.detach()) ** 2)
+        quantize_losses = {
+            'codebook_loss': codebook_loss,
+            'commitment_loss': commmitment_loss
         }
         # Straight through estimation
         # The value equals the selected codebook vectors, but because the
         # difference is detached, gradients reach `x` as if through identity.
+        quant_out = x + (quant_out - x).detach()
+        # quant_out -> B, C, H, W
         quant_out = quant_out.reshape((B, H, W, C)).permute(0, 3, 1, 2)
+        min_encoding_indices = min_encoding_indices.reshape((-1, quant_out.size(-2), quant_out.size(-1)))
+        return quant_out, quantize_losses, min_encoding_indices
     def encode(self, x):
         # Runs the input through the encoder stack and quantizes the result.
         # Returns (quantized output, quantization loss dict); the codebook
         # indices returned by quantize() are discarded.
+        out = self.encoder_conv_in(x)
         # `idx` is unused; each down block is called with the tensor only.
+        for idx, down in enumerate(self.encoder_layers):
             out = down(out)
+        for mid in self.encoder_mids:
             out = mid(out)
         out = self.encoder_norm_out(out)
         out = nn.SiLU()(out)
         # NOTE(review): no call to `self.encoder_conv_out` is visible between
         # the activation and `pre_quant_conv` in this view; the diff may have
         # elided a line here - confirm against the full file.
         out = self.pre_quant_conv(out)
         out, quant_losses, _ = self.quantize(out)
         return out, quant_losses
     def decode(self, z):
         # Maps a (quantized) latent `z` back through the decoder stack:
         # post-quant conv -> input conv -> mid blocks -> up blocks ->
         # GroupNorm -> SiLU -> output conv. Returns the decoder output.
         out = z
         out = self.post_quant_conv(out)
         out = self.decoder_conv_in(out)
+        for mid in self.decoder_mids:
             out = mid(out)
         # `idx` is unused; each up block is called with the tensor only.
+        for idx, up in enumerate(self.decoder_layers):
             out = up(out)
         out = self.decoder_norm_out(out)
         # nn.SiLU is a module: it is instantiated first and then applied.
+        out = nn.SiLU()(out)
         out = self.decoder_conv_out(out)
         return out
     def forward(self, x):
         z, quant_losses = self.encode(x)
         out = self.decode(z)

train_vqvae.py CHANGED Viewed

@@ -24,7 +24,6 @@ def train(args):
         except yaml.YAMLError as e:
             print(e)
     autoencoder_config = config["autoencoder_params"]
     train_config = config["train_config"]
     dataset_config = config["dataset_config"]
@@ -84,11 +83,11 @@ def train(args):
             # Image saving
             if steps % img_save_steps == 0 or steps == 1:
-                sample_size = min(8, im.shape[0])
                 save_output = torch.clamp(
                     output[:sample_size], -1., 1.).detach().cpu()
                 save_output = ((save_output + 1) / 2)
-                save_input = ((im[:sample_size] + 1) / 2).detach().cpu()
                 grid = make_grid(
                     torch.cat([save_input, save_output], dim=0), nrow=sample_size)
@@ -97,8 +96,8 @@ def train(args):
                     os.mkdir(os.path.join(
                         train_config['task_name'], 'vqvae_autoencoder_samples'))
                 img.save(os.path.join(train_config['task_name'], 'vqvae_autoencoder_samples',
-                                      'current_autoencoder_sample_{}.png'.format(img_save_count)))
-                img_save_count += 1
                 img.close()
             # Optimizing generator

         except yaml.YAMLError as e:
             print(e)
     autoencoder_config = config["autoencoder_params"]
     train_config = config["train_config"]
     dataset_config = config["dataset_config"]
             # Image saving
             if steps % img_save_steps == 0 or steps == 1:
+                sample_size = min(8, im_tensor.shape[0])
                 save_output = torch.clamp(
                     output[:sample_size], -1., 1.).detach().cpu()
                 save_output = ((save_output + 1) / 2)
+                save_input = ((im_tensor[:sample_size] + 1) / 2).detach().cpu()
                 grid = make_grid(
                     torch.cat([save_input, save_output], dim=0), nrow=sample_size)
                     os.mkdir(os.path.join(
                         train_config['task_name'], 'vqvae_autoencoder_samples'))
                 img.save(os.path.join(train_config['task_name'], 'vqvae_autoencoder_samples',
+                                      'current_autoencoder_sample_{}.png'.format(img_saved)))
+                img_saved += 1
                 img.close()
             # Optimizing generator