| input_blocks | |
| ModuleList( | |
| (0): TimestepEmbedSequential( | |
| (0): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (1-2): 2 x TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=320, out_features=320, bias=False) | |
| (to_v): Linear(in_features=320, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=768, out_features=320, bias=False) | |
| (to_v): Linear(in_features=768, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (3): TimestepEmbedSequential( | |
| (0): Downsample( | |
| (op): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| (4): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=640, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=640, out_features=640, bias=False) | |
| (to_v): Linear(in_features=640, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=768, out_features=640, bias=False) | |
| (to_v): Linear(in_features=768, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (5): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=640, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=640, out_features=640, bias=False) | |
| (to_v): Linear(in_features=640, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=768, out_features=640, bias=False) | |
| (to_v): Linear(in_features=768, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (6): TimestepEmbedSequential( | |
| (0): Downsample( | |
| (op): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| (7): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (8): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (9): TimestepEmbedSequential( | |
| (0): Downsample( | |
| (op): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) | |
| ) | |
| ) | |
| (10-11): 2 x TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| ) | |
| ) | |
| middle_block | |
| TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Identity() | |
| ) | |
| ) | |
| output_blocks | |
| ModuleList( | |
| (0-1): 2 x TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (2): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): Upsample( | |
| (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| (3-4): 2 x TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 2560, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (5): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=1280, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 1280, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=1280, out_features=10240, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=5120, out_features=1280, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=1280, out_features=1280, bias=False) | |
| (to_k): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_v): Linear(in_features=768, out_features=1280, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=1280, out_features=1280, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): Upsample( | |
| (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| (6): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1920, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=640, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=640, out_features=640, bias=False) | |
| (to_v): Linear(in_features=640, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=768, out_features=640, bias=False) | |
| (to_v): Linear(in_features=768, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (7): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 1280, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=640, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=640, out_features=640, bias=False) | |
| (to_v): Linear(in_features=640, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=768, out_features=640, bias=False) | |
| (to_v): Linear(in_features=768, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (8): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 960, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=640, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 640, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=640, out_features=640, bias=False) | |
| (to_v): Linear(in_features=640, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=640, out_features=5120, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=2560, out_features=640, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=640, out_features=640, bias=False) | |
| (to_k): Linear(in_features=768, out_features=640, bias=False) | |
| (to_v): Linear(in_features=768, out_features=640, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=640, out_features=640, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (2): Upsample( | |
| (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| ) | |
| (9): TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 960, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=320, out_features=320, bias=False) | |
| (to_v): Linear(in_features=320, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=768, out_features=320, bias=False) | |
| (to_v): Linear(in_features=768, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| (10-11): 2 x TimestepEmbedSequential( | |
| (0): ResBlock( | |
| (in_layers): Sequential( | |
| (0): GroupNorm32(32, 640, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (h_upd): Identity() | |
| (x_upd): Identity() | |
| (emb_layers): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| (out_layers): Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Dropout(p=0, inplace=False) | |
| (3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) | |
| (skip_connection): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| (1): SpatialTransformer( | |
| (norm): GroupNorm(32, 320, eps=1e-06, affine=True) | |
| (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| (transformer_blocks): ModuleList( | |
| (0): BasicTransformerBlock( | |
| (attn1): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=320, out_features=320, bias=False) | |
| (to_v): Linear(in_features=320, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (ff): FeedForward( | |
| (net): Sequential( | |
| (0): GEGLU( | |
| (proj): Linear(in_features=320, out_features=2560, bias=True) | |
| ) | |
| (1): Dropout(p=0.0, inplace=False) | |
| (2): Linear(in_features=1280, out_features=320, bias=True) | |
| ) | |
| ) | |
| (attn2): CrossAttention( | |
| (to_q): Linear(in_features=320, out_features=320, bias=False) | |
| (to_k): Linear(in_features=768, out_features=320, bias=False) | |
| (to_v): Linear(in_features=768, out_features=320, bias=False) | |
| (to_out): Sequential( | |
| (0): Linear(in_features=320, out_features=320, bias=True) | |
| (1): Dropout(p=0.0, inplace=False) | |
| ) | |
| ) | |
| (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True) | |
| ) | |
| ) | |
| (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1)) | |
| ) | |
| ) | |
| ) | |
| out | |
| Sequential( | |
| (0): GroupNorm32(32, 320, eps=1e-05, affine=True) | |
| (1): SiLU() | |
| (2): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) | |
| ) |