Commit e1d97a8 (parent: 426ee66): Add UpBlock

model_blocks/blocks.py CHANGED (+302 −5)
@@ -1,6 +1,10 @@
+import logging
+
 import torch
 import torch.nn as nn
 
+logger = logging.getLogger(__name__)
+
 
 def get_time_embedding(time_steps, temb_dim):
     r"""
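Note: the hunk above adds a module-level logger to blocks.py. A minimal sketch of how a caller could surface the new logger.debug output; logging.basicConfig is standard library, and the logger name "model_blocks.blocks" is assumed from the file path in this commit:

    import logging

    # getLogger(__name__) inside model_blocks/blocks.py resolves to
    # "model_blocks.blocks", so configuring that name enables the debug lines.
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("model_blocks.blocks").setLevel(logging.DEBUG)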
@@ -35,7 +39,7 @@ class DownBlock(nn.Module):
             1) Resnet Block :- [Norm-> Silu -> Conv] x num_layers
             2) Self Attention :- [Norm -> SA]
             3) Cross Attention :- [Norm -> CA]
-        b)
+        b) DownSample : downsamples the spatial dimension
     """
 
     def __init__(
@@ -170,15 +174,29 @@ class DownBlock(nn.Module):
         out = x
         for i in range(self.num_layers):
             # Input x to Resnet Block of the Encoder of the Unet
+            logger.debug(f"Input to Resnet Block in Down Block Layer {i} : {out.shape}")
             resnet_input = out
             out = self.resnet_one[i](out)
-
+            logger.debug(
+                f"Output of Resnet Sub Block 1 of Down Block Layer {i} : {out.shape}"
+            )
+            if self.t_emb_dim is not None:
+                logger.debug(
+                    f"Adding t_emb of shape {self.t_emb_dim} to output of shape: {out.shape} of Down Block Layer {i}"
+                )
             out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
             out = self.resnet_two[i](out)
+            logger.debug(
+                f"Output of Resnet Sub Block 2 of Down Block Layer: {i} with output_shape:{out.shape}"
+            )
             out = out + self.resnet_in[i](resnet_input)
+            logger.debug(
+                f"Residual connection of the input to out : {out.shape} in Down Block Layer {i}"
+            )
 
             if self.attn:
                 # Now Passing through the Self Attention blocks
+                logger.debug(f"Going into the attention Block in Down Block Layer {i}")
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
                 in_attn = self.attention_norms[i](in_attn)
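Note: the `[:, :, None, None]` indexing above is what lets the (batch, t_emb_dim) time embedding, after the per-layer SiLU -> Linear projection, be added to a (batch, channels, height, width) feature map. A small self-contained sketch with illustrative sizes (not taken from the repo):

    import torch

    t_emb = torch.randn(2, 128)             # (B, C): output of a t_emb_layers projection
    features = torch.randn(2, 128, 16, 16)  # (B, C, H, W): resnet activations
    out = features + t_emb[:, :, None, None]  # (B, C, 1, 1) broadcasts over H and W
    print(out.shape)  # torch.Size([2, 128, 16, 16])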
@@ -186,11 +204,17 @@ class DownBlock(nn.Module):
                 out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
                 out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
+                logger.debug(
+                    f"Out of the Self Attention Block with out : {out.shape} in Down Block Layer {i}"
+                )
 
             if self.cross_attn:
                 assert context is not None, (
                     "context cannot be None if cross attention layers are used"
                 )
+                logger.debug(
+                    f"Going into the Cross Attention Block in Down Block Layer {i}"
+                )
                 batch_size, channels, h, w = out.shape
                 in_attn = out.reshape(batch_size, channels, h * w)
                 in_attn = self.cross_attn_norms[i](in_attn)
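Note: around each nn.MultiheadAttention call, the code flattens (B, C, H, W) into a (B, H*W, C) token sequence (one token per spatial position) and reverses the reshape afterwards. A minimal round-trip sketch with made-up sizes:

    import torch

    out = torch.randn(2, 64, 8, 8)                     # (B, C, H, W)
    b, c, h, w = out.shape
    tokens = out.reshape(b, c, h * w).transpose(1, 2)  # (B, H*W, C)
    restored = tokens.transpose(1, 2).reshape(b, c, h, w)
    assert torch.equal(out, restored)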
@@ -199,19 +223,40 @@ class DownBlock(nn.Module):
                     context.shape[0] == x.shape[0]
                     and context.shape[-1] == self.context_dim
                 )
+                logger.debug(
+                    f"Calculating context projection for Cross Attn in Down Block Layer : {i}"
+                )
                 context_proj = self.context_proj[i](context)
                 out_attn, _ = self.cross_attentions[i](
                     in_attn, context_proj, context_proj
                 )
                 out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
                 out = out + out_attn
+                logger.debug(
+                    f"Out of the Cross Attention Block with out : {out.shape} in Down Block Layer {i}"
+                )
 
         # DownSample to x2 smaller dimension
         out = self.down_sample_conv(out)
+        logger.debug(f"Down Sampling out to : {out.shape} in Down Block Layer {i} ")
         return out
 
 
 class MidBlock(nn.Module):
+    r"""
+
+    MidBlock for Diffusion model:
+        Time embedding -> [Silu -> FC]
+            ↓
+        1) Resnet Block :- [Norm-> Silu -> Conv] x num_layers
+        2) Self Attention :- [Norm -> SA]
+        3) Cross Attention :- [Norm -> CA]
+        Time embedding -> [Silu -> FC]
+            ↓
+        4) Resnet Block :- [Norm-> Silu -> Conv] x num_layers
+
+    """
+
     def __init__(
         self,
         num_heads,
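Note: the asserts above pin the cross-attention context to a (batch, sequence_length, context_dim) tensor whose batch matches x. A hypothetical conditioning tensor that satisfies them; the sizes (77 tokens, 768 dims, as in a CLIP-style text encoder) are assumptions, not repo values:

    import torch

    batch_size, seq_len, context_dim = 2, 77, 768
    context = torch.randn(batch_size, seq_len, context_dim)
    assert context.shape[0] == batch_size and context.shape[-1] == context_dim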
@@ -253,7 +298,7 @@ class MidBlock(nn.Module):
                         padding=1,
                     ),
                 )
-                for i in range(self.num_layers)
+                for i in range(self.num_layers + 1)
             ]
         )
 
@@ -261,7 +306,7 @@ class MidBlock(nn.Module):
         self.t_emb_layers = nn.ModuleList(
             [
                 nn.Sequential(nn.SiLU(), nn.Linear(self.t_emb_dim, self.output_dim))
-                for _ in range(self.num_layers)
+                for _ in range(self.num_layers + 1)
             ]
         )
 
@@ -281,7 +326,7 @@ class MidBlock(nn.Module):
                         padding=1,
                     ),
                 )
-                for _ in range(self.num_layers)
+                for _ in range(self.num_layers + 1)
             ]
         )
 
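Note: the three num_layers -> num_layers + 1 changes above size MidBlock for the forward pass added at the end of this diff, which runs one resnet pass before the loop and one more per iteration, i.e. resnet -> [attention -> resnet] x num_layers. A quick count, assuming num_layers = 2:

    num_layers = 2
    resnet_passes = 1 + num_layers  # one before the loop, one after each attention step
    attention_passes = num_layers
    print(resnet_passes, attention_passes)  # 3 2

(The committed forward still indexes resnet_one[0] and t_emb_layers[0] inside the loop, so the extra modules appear constructed but not yet exercised; that may be intentional staging.)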
@@ -323,3 +368,255 @@ class MidBlock(nn.Module):
                 for _ in range(self.num_layers)
             ]
         )
+
+        self.resnet_in = nn.ModuleList(
+            [
+                nn.Conv2d(
+                    self.input_dim if i == 0 else self.output_dim,
+                    self.output_dim,
+                    kernel_size=1,
+                )
+                for i in range(self.num_layers + 1)
+            ]
+        )
+
+    def forward(self, x, t_emb=None, context=None):
+        out = x
+
+        # Input Resnet Block
+        logger.debug("Input to First Resnet Block in Mid Block")
+        resnet_input = out
+        out = self.resnet_one[0](out)
+        logger.debug(f"Output of Resnet Sub Block 1 of Mid Block Layer: {out.shape}")
+        if self.t_emb_dim is not None:
+            out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
+            logger.debug(
+                f"Adding t_emb of shape {self.t_emb_dim} to output of shape: {out.shape}"
+            )
+        out = self.resnet_two[0](out)
+        logger.debug(f"Output of Resnet Sub Block 2 with output_shape:{out.shape}")
+        out = out + self.resnet_in[0](resnet_input)
+        logger.debug(
+            f"Residual connection of the input to out : {out.shape} in Mid Block"
+        )
+
+        for i in range(self.num_layers):
+            logger.debug(f"Going into the attention Block in Mid Block Layer {i}")
+            batch_size, channels, h, w = out.shape
+            in_attn = out.reshape(batch_size, channels, h * w)
+            in_attn = self.attention_norms[i](in_attn)
+            in_attn = in_attn.transpose(1, 2)
+            out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
+            out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
+            out = out + out_attn
+            logger.debug(
+                f"Out of the Self Attention Block with out : {out.shape} in Mid Block Layer {i}"
+            )
+
+            if self.cross_attn:
+                assert context is not None, (
+                    "context cannot be None if cross attention layers are used"
+                )
+                logger.debug(
+                    f"Going into the Cross Attention Block in Mid Block Layer {i}"
+                )
+                batch_size, channels, h, w = out.shape
+                in_attn = out.reshape(batch_size, channels, h * w)
+                in_attn = self.cross_attn_norms[i](in_attn)
+                in_attn = in_attn.transpose(1, 2)
+                assert (
+                    context.shape[0] == x.shape[0]
+                    and context.shape[-1] == self.context_dim
+                )
+                logger.debug(
+                    f"Calculating context projection for Cross Attn in Mid Block Layer : {i}"
+                )
+                context_proj = self.context_proj[i](context)
+                out_attn, _ = self.cross_attentions[i](
+                    in_attn, context_proj, context_proj
+                )
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
+                out = out + out_attn
+                logger.debug(
+                    f"Out of the Cross Attention Block with out : {out.shape} in Mid Block Layer {i}"
+                )
+            logger.debug(
+                f"Last Resnet Block input : {out.shape} of Mid Block Layer {i}"
+            )
+            resnet_input = out
+            out = self.resnet_one[0](out)
+            logger.debug(
+                f"Output of Resnet Sub Block 1 of Mid Block Layer {i} of shape : {out.shape}"
+            )
+            if self.t_emb_dim is not None:
+                out = out + self.t_emb_layers[0](t_emb)[:, :, None, None]
+                logger.debug(
+                    f"Adding t_emb of shape {self.t_emb_dim} to output of shape: {out.shape} of Mid Block Layer {i}"
+                )
+            out = self.resnet_two[0](out)
+            logger.debug(
+                f"Output of Resnet Sub Block 2 with output_shape:{out.shape} of Mid Block Layer {i}"
+            )
+            out = out + self.resnet_in[0](resnet_input)
+            logger.debug(
+                f"Residual connection of the input to out : {out.shape} in Mid Block Layer {i}"
+            )
+
+        return out
+
+
+class UpBlockUnet(nn.Module):
+    r"""
+    Up conv block with attention.
+    Sequence of following blocks
+    1. Upsample
+    2. Concatenate Down block output
+    3. Resnet block with time embedding
+    4. Attention Block
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        t_emb_dim,
+        up_sample,
+        num_heads,
+        num_layers,
+        norm_channels,
+        cross_attn=False,
+        context_dim=None,
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        self.up_sample = up_sample
+        self.t_emb_dim = t_emb_dim
+        self.cross_attn = cross_attn
+        self.context_dim = context_dim
+        self.resnet_conv_first = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(
+                        norm_channels, in_channels if i == 0 else out_channels
+                    ),
+                    nn.SiLU(),
+                    nn.Conv2d(
+                        in_channels if i == 0 else out_channels,
+                        out_channels,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                    ),
+                )
+                for i in range(num_layers)
+            ]
+        )
+
+        if self.t_emb_dim is not None:
+            self.t_emb_layers = nn.ModuleList(
+                [
+                    nn.Sequential(nn.SiLU(), nn.Linear(t_emb_dim, out_channels))
+                    for _ in range(num_layers)
+                ]
+            )
+
+        self.resnet_conv_second = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.GroupNorm(norm_channels, out_channels),
+                    nn.SiLU(),
+                    nn.Conv2d(
+                        out_channels, out_channels, kernel_size=3, stride=1, padding=1
+                    ),
+                )
+                for _ in range(num_layers)
+            ]
+        )
+
+        self.attention_norms = nn.ModuleList(
+            [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
+        )
+
+        self.attentions = nn.ModuleList(
+            [
+                nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                for _ in range(num_layers)
+            ]
+        )
+
+        if self.cross_attn:
+            assert context_dim is not None, (
+                "Context Dimension must be passed for cross attention"
+            )
+            self.cross_attention_norms = nn.ModuleList(
+                [nn.GroupNorm(norm_channels, out_channels) for _ in range(num_layers)]
+            )
+            self.cross_attentions = nn.ModuleList(
+                [
+                    nn.MultiheadAttention(out_channels, num_heads, batch_first=True)
+                    for _ in range(num_layers)
+                ]
+            )
+            self.context_proj = nn.ModuleList(
+                [nn.Linear(context_dim, out_channels) for _ in range(num_layers)]
+            )
+        self.residual_input_conv = nn.ModuleList(
+            [
+                nn.Conv2d(
+                    in_channels if i == 0 else out_channels, out_channels, kernel_size=1
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.up_sample_conv = (
+            nn.ConvTranspose2d(in_channels // 2, in_channels // 2, 4, 2, 1)
+            if self.up_sample
+            else nn.Identity()
+        )
+
+    def forward(self, x, out_down=None, t_emb=None, context=None):
+        x = self.up_sample_conv(x)
+        if out_down is not None:
+            x = torch.cat([x, out_down], dim=1)
+
+        out = x
+        for i in range(self.num_layers):
+            # Resnet
+            resnet_input = out
+            out = self.resnet_conv_first[i](out)
+            if self.t_emb_dim is not None:
+                out = out + self.t_emb_layers[i](t_emb)[:, :, None, None]
+            out = self.resnet_conv_second[i](out)
+            out = out + self.residual_input_conv[i](resnet_input)
+            # Self Attention
+            batch_size, channels, h, w = out.shape
+            in_attn = out.reshape(batch_size, channels, h * w)
+            in_attn = self.attention_norms[i](in_attn)
+            in_attn = in_attn.transpose(1, 2)
+            out_attn, _ = self.attentions[i](in_attn, in_attn, in_attn)
+            out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
+            out = out + out_attn
+            # Cross Attention
+            if self.cross_attn:
+                assert context is not None, (
+                    "context cannot be None if cross attention layers are used"
+                )
+                batch_size, channels, h, w = out.shape
+                in_attn = out.reshape(batch_size, channels, h * w)
+                in_attn = self.cross_attention_norms[i](in_attn)
+                in_attn = in_attn.transpose(1, 2)
+                assert len(context.shape) == 3, (
+                    "Context shape does not match B,_,CONTEXT_DIM"
+                )
+                assert (
+                    context.shape[0] == x.shape[0]
+                    and context.shape[-1] == self.context_dim
+                ), "Context shape does not match B,_,CONTEXT_DIM"
+                context_proj = self.context_proj[i](context)
+                out_attn, _ = self.cross_attentions[i](
+                    in_attn, context_proj, context_proj
+                )
+                out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)
+                out = out + out_attn
+
+        return out
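Note: a hypothetical smoke test for the new UpBlockUnet. The channel sizes and the stand-in time embedding are assumptions based only on this diff (ConvTranspose2d takes in_channels // 2, and the skip concatenation restores in_channels), not documented API:

    import torch
    from model_blocks.blocks import UpBlockUnet  # module path taken from this commit

    up = UpBlockUnet(
        in_channels=256,   # 128 channels from below + 128 from the skip connection
        out_channels=128,
        t_emb_dim=128,
        up_sample=True,
        num_heads=4,
        num_layers=1,
        norm_channels=32,
    )
    x = torch.randn(2, 128, 8, 8)       # ConvTranspose2d(128, 128, 4, 2, 1) doubles H and W
    skip = torch.randn(2, 128, 16, 16)  # saved DownBlock output for the skip path
    t_emb = torch.randn(2, 128)         # stand-in for the sinusoidal time embedding
    out = up(x, out_down=skip, t_emb=t_emb)
    print(out.shape)  # torch.Size([2, 128, 16, 16])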