Yash Nagraj committed on
Commit ·
c2857b5
1
Parent(s): 95b2cf2
Add MidBlocks and change cross_attn in down blocks
Browse files- models/blocks.py +72 -5
models/blocks.py
CHANGED
|
@@ -86,6 +86,11 @@ class DownBlock(nn.Module):
|
|
| 86 |
out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
|
| 87 |
)
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
self.residual_input_conv = nn.ModuleList(
|
| 90 |
[
|
| 91 |
nn.Conv2d(in_channels=in_channels if i == 0 else out_channels,
|
|
@@ -121,15 +126,77 @@ class DownBlock(nn.Module):
|
|
| 121 |
|
| 122 |
# Cross Attention
|
| 123 |
if self.cross_attn:
|
| 124 |
-
assert context not None, "Context must be given for cross_attn"
|
| 125 |
batch_size, channels, h, w = out.shape
|
| 126 |
-
in_attn = out.reshape(batch_size, channels, h*w)
|
| 127 |
-
in_attn = self.
|
| 128 |
in_attn = in_attn.transpose(1, 2)
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
batch_size, channels, h, w)
|
| 132 |
out = out + out_attn
|
| 133 |
|
| 134 |
out = self.resnet_down_conv(out)
|
| 135 |
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
|
| 87 |
)
|
| 88 |
|
| 89 |
+
self.context_proj = nn.ModuleList(
|
| 90 |
+
[nn.Linear(context_dim, out_channels)
|
| 91 |
+
for _ in range(num_layers)]
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
self.residual_input_conv = nn.ModuleList(
|
| 95 |
[
|
| 96 |
nn.Conv2d(in_channels=in_channels if i == 0 else out_channels,
|
|
|
|
| 126 |
|
| 127 |
# Cross Attention
|
| 128 |
if self.cross_attn:
|
| 129 |
+
assert context is not None, "Context must be given for cross_attn"
|
| 130 |
batch_size, channels, h, w = out.shape
|
| 131 |
+
in_attn = out.reshape(batch_size, channels, h * w)
|
| 132 |
+
in_attn = self.cross_attention_norms[i](in_attn)
|
| 133 |
in_attn = in_attn.transpose(1, 2)
|
| 134 |
+
assert context.shape[0] == x.shape[0] and context.shape[-1] == self.context_dim
|
| 135 |
+
context_proj = self.context_proj[i](context)
|
| 136 |
+
out_attn, _ = self.cross_attentions[i](
|
| 137 |
+
in_attn, context_proj, context_proj)
|
| 138 |
+
out_attn = out_attn.transpose(1, 2).reshape(
|
| 139 |
batch_size, channels, h, w)
|
| 140 |
out = out + out_attn
|
| 141 |
|
| 142 |
out = self.resnet_down_conv(out)
|
| 143 |
return out
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class MidBlock(nn.Module):
|
| 147 |
+
"""
|
| 148 |
+
Mid Block that works with same dimensions, flows like this:
|
| 149 |
+
1) Resnet block with time embedding
|
| 150 |
+
2) Self Attention block
|
| 151 |
+
3) Resnet block with time embedding
|
| 152 |
+
"""
|
| 153 |
+
|
| 154 |
+
def __init__(self, in_channels, out_channels, t_emb_dim, num_heads, num_layers, norm_dim, cross_attn=None, context_dim=None):
|
| 155 |
+
self.in_channels = in_channels
|
| 156 |
+
self.out_channels = out_channels
|
| 157 |
+
self.t_emb_dim = t_emb_dim
|
| 158 |
+
self.cross_attn = cross_attn
|
| 159 |
+
self.context_dim = context_dim
|
| 160 |
+
self.resnet_conv_one = nn.ModuleList([
|
| 161 |
+
nn.Sequential(
|
| 162 |
+
nn.GroupNorm(norm_dim, in_channels if i ==
|
| 163 |
+
0 else out_channels),
|
| 164 |
+
nn.SiLU(),
|
| 165 |
+
nn.Conv2d(in_channels if i == 0 else out_channels,
|
| 166 |
+
out_channels, 3, 1, 1)
|
| 167 |
+
)
|
| 168 |
+
for i in range(num_layers + 1)
|
| 169 |
+
])
|
| 170 |
+
|
| 171 |
+
if self.t_emb_dim is not None:
|
| 172 |
+
self.time_emb_layers = nn.ModuleList([
|
| 173 |
+
nn.Sequential(
|
| 174 |
+
nn.SiLU(),
|
| 175 |
+
nn.Linear(t_emb_dim, out_channels)
|
| 176 |
+
)
|
| 177 |
+
for _ in range(num_layers + 1)
|
| 178 |
+
])
|
| 179 |
+
|
| 180 |
+
self.resnet_conv_two = nn.ModuleList([
|
| 181 |
+
nn.Sequential(
|
| 182 |
+
nn.GroupNorm(norm_dim, out_channels),
|
| 183 |
+
nn.SiLU(),
|
| 184 |
+
nn.Conv2d(out_channels, out_channels, 3, 1, 1)
|
| 185 |
+
) for _ in range(num_layers + 1)
|
| 186 |
+
])
|
| 187 |
+
|
| 188 |
+
self.attention_norms = nn.ModuleList(
|
| 189 |
+
[nn.GroupNorm(norm_dim, out_channels) for _ in range(num_layers)]
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
self.attention_heads = nn.ModuleList(
|
| 193 |
+
[nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
|
| 194 |
+
for _ in range(num_layers)]
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
if self.cross_attn:
|
| 198 |
+
assert context_dim is not None, "Context must be given for cross attn"
|
| 199 |
+
self.cross_attn_norms = nn.ModuleList(
|
| 200 |
+
[nn.GroupNorm(norm_dim, out_channels)
|
| 201 |
+
for _ in range(num_layers)]
|
| 202 |
+
)
|