Yash Nagraj committed
Commit · cb6bd3a
Parent(s): 3cb348b

Add attention to Down Blocks

Files changed: models/blocks.py +77 -0
models/blocks.py ADDED
@@ -0,0 +1,77 @@
import torch
import torch.nn as nn


def get_time_embedding(time_steps, temb_dim):
    assert temb_dim % 2 == 0, "time embedding dimension must be divisible by 2"

    # Frequencies 10000^(i / (temb_dim // 2)) for i in [0, temb_dim // 2)
    factor = 10000 ** ((torch.arange(
        start=0, end=temb_dim // 2, dtype=torch.float32, device=time_steps.device) / (temb_dim // 2))
    )

    # pos / factor
    # time_steps: (B,) -> (B, temb_dim // 2) -> (B, temb_dim)
    t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
    t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)
    return t_emb


class DownBlock(nn.Module):
    """
    Down block that downsamples the feature map. The flow is:
    1) ResNet block with time embedding
    2) Self-attention block
    3) Downsample
    """

    def __init__(self, in_channels, out_channels, t_emb_dim, down_sample, num_heads, num_layers, attn, norm_channels, cross_attn=False,
                 context_dim=None):
        super().__init__()
        self.down_sample = down_sample
        self.cross_attn = cross_attn
        self.context_dim = context_dim
        self.t_emb_dim = t_emb_dim
        self.attn = attn
        # First conv of each ResNet layer: GroupNorm -> SiLU -> 3x3 conv
        self.resnet_conv_first = nn.ModuleList([
            nn.Sequential(
                nn.GroupNorm(norm_channels, in_channels if i == 0 else out_channels),
                nn.SiLU(),
                nn.Conv2d(in_channels=in_channels if i == 0 else out_channels,
                          out_channels=out_channels, kernel_size=3, stride=1, padding=1)
            ) for i in range(num_layers)
        ])
        if self.t_emb_dim is not None:
            # Projects the time embedding to out_channels for each ResNet layer
            self.time_embd_layers = nn.ModuleList([
                nn.Sequential(
                    nn.SiLU(),
                    nn.Linear(self.t_emb_dim, out_channels)
                )
                for _ in range(num_layers)
            ])

        # Second conv of each ResNet layer; operates on out_channels
        self.resnet_conv_second = nn.ModuleList([
            nn.Sequential(
                nn.GroupNorm(norm_channels, out_channels),
                nn.SiLU(),
                nn.Conv2d(out_channels, out_channels,
                          kernel_size=3, stride=1, padding=1),
            )
            for _ in range(num_layers)
        ])

        if self.attn:
            self.attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels)
                 for _ in range(num_layers)]
            )

            self.attention = nn.ModuleList(
                [nn.MultiheadAttention(
                    out_channels, num_heads=num_heads, batch_first=True) for _ in range(num_layers)]
            )

        if self.cross_attn:
            assert context_dim is not None, "Context dimension must be passed for cross attention"
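
For a quick sanity check of get_time_embedding as committed, a minimal sketch (not part of the commit; the batch size and embedding width are illustrative assumptions):

    import torch
    from models.blocks import get_time_embedding

    # Batch of 4 integer diffusion timesteps, illustrative values
    time_steps = torch.randint(0, 1000, (4,))

    # 128-dim sinusoidal embedding: first half sin, second half cos
    t_emb = get_time_embedding(time_steps, temb_dim=128)
    print(t_emb.shape)  # torch.Size([4, 128])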
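
Likewise, a sketch of constructing DownBlock under assumed hyperparameters; since this commit only covers __init__ (no forward pass yet), the sketch just instantiates the module and inspects its submodules:

    from models.blocks import DownBlock

    # Self-attention on, cross-attention off, 2 ResNet + attention layers;
    # out_channels must be divisible by norm_channels for GroupNorm
    block = DownBlock(in_channels=64, out_channels=128, t_emb_dim=128,
                      down_sample=True, num_heads=4, num_layers=2,
                      attn=True, norm_channels=32)

    print(len(block.resnet_conv_first))  # 2
    print(len(block.attention))          # 2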
|