perorina committed 847e2b9 (parent: 3ed91a5)

Create UNetStructureStr.txt
input_blocks
ModuleList(
(0): TimestepEmbedSequential(
(0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1-2): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(3): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(4): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(6): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(7): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(8): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(9): TimestepEmbedSequential(
(0): Downsample(
(op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)
)
(10-11): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Identity()
)
)
)
349
+
350
+
351
+ middle_block
352
+ TimestepEmbedSequential(
353
+ (0): ResBlock(
354
+ (in_layers): Sequential(
355
+ (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
356
+ (1): SiLU()
357
+ (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
358
+ )
359
+ (h_upd): Identity()
360
+ (x_upd): Identity()
361
+ (emb_layers): Sequential(
362
+ (0): SiLU()
363
+ (1): Linear(in_features=1280, out_features=1280, bias=True)
364
+ )
365
+ (out_layers): Sequential(
366
+ (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
367
+ (1): SiLU()
368
+ (2): Dropout(p=0, inplace=False)
369
+ (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
370
+ )
371
+ (skip_connection): Identity()
372
+ )
373
+ (1): SpatialTransformer(
374
+ (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
375
+ (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
376
+ (transformer_blocks): ModuleList(
377
+ (0): BasicTransformerBlock(
378
+ (attn1): CrossAttention(
379
+ (to_q): Linear(in_features=1280, out_features=1280, bias=False)
380
+ (to_k): Linear(in_features=1280, out_features=1280, bias=False)
381
+ (to_v): Linear(in_features=1280, out_features=1280, bias=False)
382
+ (to_out): Sequential(
383
+ (0): Linear(in_features=1280, out_features=1280, bias=True)
384
+ (1): Dropout(p=0.0, inplace=False)
385
+ )
386
+ )
387
+ (ff): FeedForward(
388
+ (net): Sequential(
389
+ (0): GEGLU(
390
+ (proj): Linear(in_features=1280, out_features=10240, bias=True)
391
+ )
392
+ (1): Dropout(p=0.0, inplace=False)
393
+ (2): Linear(in_features=5120, out_features=1280, bias=True)
394
+ )
395
+ )
396
+ (attn2): CrossAttention(
397
+ (to_q): Linear(in_features=1280, out_features=1280, bias=False)
398
+ (to_k): Linear(in_features=768, out_features=1280, bias=False)
399
+ (to_v): Linear(in_features=768, out_features=1280, bias=False)
400
+ (to_out): Sequential(
401
+ (0): Linear(in_features=1280, out_features=1280, bias=True)
402
+ (1): Dropout(p=0.0, inplace=False)
403
+ )
404
+ )
405
+ (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
406
+ (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
407
+ (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
408
+ )
409
+ )
410
+ (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
411
+ )
412
+ (2): ResBlock(
413
+ (in_layers): Sequential(
414
+ (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
415
+ (1): SiLU()
416
+ (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
417
+ )
418
+ (h_upd): Identity()
419
+ (x_upd): Identity()
420
+ (emb_layers): Sequential(
421
+ (0): SiLU()
422
+ (1): Linear(in_features=1280, out_features=1280, bias=True)
423
+ )
424
+ (out_layers): Sequential(
425
+ (0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
426
+ (1): SiLU()
427
+ (2): Dropout(p=0, inplace=False)
428
+ (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
429
+ )
430
+ (skip_connection): Identity()
431
+ )
432
+ )

output_blocks
ModuleList(
(0-1): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(2): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): Upsample(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(3-4): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 2560, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
)
(5): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=1280, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 1280, eps=1e-06, affine=True)
(proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=1280, out_features=1280, bias=False)
(to_v): Linear(in_features=1280, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=1280, out_features=10240, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=5120, out_features=1280, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=1280, out_features=1280, bias=False)
(to_k): Linear(in_features=768, out_features=1280, bias=False)
(to_v): Linear(in_features=768, out_features=1280, bias=False)
(to_out): Sequential(
(0): Linear(in_features=1280, out_features=1280, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))
)
(2): Upsample(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(6): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1920, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(7): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 1280, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
)
(8): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=640, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 640, eps=1e-06, affine=True)
(proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=640, out_features=640, bias=False)
(to_v): Linear(in_features=640, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=640, out_features=5120, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=2560, out_features=640, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=640, out_features=640, bias=False)
(to_k): Linear(in_features=768, out_features=640, bias=False)
(to_v): Linear(in_features=768, out_features=640, bias=False)
(to_out): Sequential(
(0): Linear(in_features=640, out_features=640, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))
)
(2): Upsample(
(conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(9): TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 960, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=320, bias=True)
)
(out_layers): Sequential(
(0): GroupNorm32(32, 320, eps=1e-05, affine=True)
(1): SiLU()
(2): Dropout(p=0, inplace=False)
(3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))
)
(1): SpatialTransformer(
(norm): GroupNorm(32, 320, eps=1e-06, affine=True)
(proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
(transformer_blocks): ModuleList(
(0): BasicTransformerBlock(
(attn1): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=320, out_features=320, bias=False)
(to_v): Linear(in_features=320, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(ff): FeedForward(
(net): Sequential(
(0): GEGLU(
(proj): Linear(in_features=320, out_features=2560, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=1280, out_features=320, bias=True)
)
)
(attn2): CrossAttention(
(to_q): Linear(in_features=320, out_features=320, bias=False)
(to_k): Linear(in_features=768, out_features=320, bias=False)
(to_v): Linear(in_features=768, out_features=320, bias=False)
(to_out): Sequential(
(0): Linear(in_features=320, out_features=320, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
)
(norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
)
)
(proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
)
)
(10-11): 2 x TimestepEmbedSequential(
(0): ResBlock(
(in_layers): Sequential(
(0): GroupNorm32(32, 640, eps=1e-05, affine=True)
(1): SiLU()
(2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(h_upd): Identity()
(x_upd): Identity()
(emb_layers): Sequential(
867
+ (0): SiLU()
868
+ (1): Linear(in_features=1280, out_features=320, bias=True)
869
+ )
870
+ (out_layers): Sequential(
871
+ (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
872
+ (1): SiLU()
873
+ (2): Dropout(p=0, inplace=False)
874
+ (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
875
+ )
876
+ (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))
877
+ )
878
+ (1): SpatialTransformer(
879
+ (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
880
+ (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
881
+ (transformer_blocks): ModuleList(
882
+ (0): BasicTransformerBlock(
883
+ (attn1): CrossAttention(
884
+ (to_q): Linear(in_features=320, out_features=320, bias=False)
885
+ (to_k): Linear(in_features=320, out_features=320, bias=False)
886
+ (to_v): Linear(in_features=320, out_features=320, bias=False)
887
+ (to_out): Sequential(
888
+ (0): Linear(in_features=320, out_features=320, bias=True)
889
+ (1): Dropout(p=0.0, inplace=False)
890
+ )
891
+ )
892
+ (ff): FeedForward(
893
+ (net): Sequential(
894
+ (0): GEGLU(
895
+ (proj): Linear(in_features=320, out_features=2560, bias=True)
896
+ )
897
+ (1): Dropout(p=0.0, inplace=False)
898
+ (2): Linear(in_features=1280, out_features=320, bias=True)
899
+ )
900
+ )
901
+ (attn2): CrossAttention(
902
+ (to_q): Linear(in_features=320, out_features=320, bias=False)
903
+ (to_k): Linear(in_features=768, out_features=320, bias=False)
904
+ (to_v): Linear(in_features=768, out_features=320, bias=False)
905
+ (to_out): Sequential(
906
+ (0): Linear(in_features=320, out_features=320, bias=True)
907
+ (1): Dropout(p=0.0, inplace=False)
908
+ )
909
+ )
910
+ (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
911
+ (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
912
+ (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
913
+ )
914
+ )
915
+ (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))
916
+ )
917
+ )
918
+ )
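
A quick sanity check on the feed-forward dimensions printed above. In each BasicTransformerBlock, the GEGLU layer projects to twice the hidden width and then chunks the result into a value half and a gate half, so only half of the projected features survive the gate; that is why `proj` outputs 5120 features in the 640-channel blocks while the following Linear consumes only 2560. The sketch below redoes this arithmetic in plain Python. The `mult=4` expansion factor is an assumption (the customary default), not something the dump states directly; attn2's `in_features=768` for `to_k`/`to_v` comes from the 768-wide text-encoder context.

```python
# GEGLU feed-forward dimension bookkeeping (assumption: mult=4 expansion).
# proj widens dim -> 2 * dim * mult, which is then split into a value half
# and a gate half of dim * mult each; the closing Linear maps dim * mult
# back down to dim.
def geglu_ff_dims(dim, mult=4):
    inner = dim * mult        # hidden width after the gate is applied
    proj_out = inner * 2      # value half + gate half before chunking
    return proj_out, inner

# 640-channel blocks: Linear(640 -> 5120), then Linear(2560 -> 640)
assert geglu_ff_dims(640) == (5120, 2560)
# 320-channel blocks: Linear(320 -> 2560), then Linear(1280 -> 320)
assert geglu_ff_dims(320) == (2560, 1280)
```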


out
Sequential(
  (0): GroupNorm32(32, 320, eps=1e-05, affine=True)
  (1): SiLU()
  (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
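
The `out` head above is plain shape bookkeeping: GroupNorm32 and SiLU leave the tensor shape unchanged, and the 3x3 convolution (stride 1, padding 1) preserves height and width while mapping the 320 feature channels down to the 4 latent channels. A minimal sketch of that arithmetic, with hypothetical helper names of our own:

```python
# Standard Conv2d output-size formula: out = (in + 2*pad - k) // stride + 1.
def conv2d_out(h, w, k=3, stride=1, pad=1):
    return ((h + 2 * pad - k) // stride + 1,
            (w + 2 * pad - k) // stride + 1)

# Shape through the final head: norm and activation are shape-preserving,
# the 3x3/stride-1/pad-1 conv keeps H and W and maps 320 -> 4 channels.
def out_head_shape(n, c, h, w):
    assert c == 320                # channels entering the head, per the dump
    oh, ow = conv2d_out(h, w)
    return (n, 4, oh, ow)          # 4-channel latent prediction

assert out_head_shape(1, 320, 64, 64) == (1, 4, 64, 64)
```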