EdgeDiffusion_r3 / pruned_unet.config.json
ChenHe727's picture
Upload pruned_unet.config.json with huggingface_hub
746fb1b verified
{
"model_config": {
"_use_default_values": [
"resnet_out_scale_factor",
"use_linear_projection",
"projection_class_embeddings_input_dim",
"resnet_time_scale_shift",
"time_embedding_dim",
"class_embeddings_concat",
"encoder_hid_dim_type",
"num_attention_heads",
"transformer_layers_per_block",
"addition_embed_type",
"encoder_hid_dim",
"conv_out_kernel",
"upcast_attention",
"only_cross_attention",
"dropout",
"timestep_post_act",
"dual_cross_attention",
"addition_embed_type_num_heads",
"time_cond_proj_dim",
"num_class_embeds",
"time_embedding_act_fn",
"attention_type",
"class_embed_type",
"resnet_skip_time_act",
"reverse_transformer_layers_per_block",
"mid_block_only_cross_attention",
"cross_attention_norm",
"mid_block_type",
"time_embedding_type",
"conv_in_kernel",
"addition_time_embed_dim"
],
"_class_name": "UNet2DConditionModel",
"_diffusers_version": "0.6.0",
"_name_or_path": "runwayml/stable-diffusion-v1-5",
"sample_size": 64,
"in_channels": 4,
"out_channels": 4,
"center_input_sample": false,
"flip_sin_to_cos": true,
"freq_shift": 0,
"down_block_types": [
"CrossAttnDownBlock2D",
"CrossAttnDownBlock2D",
"CrossAttnDownBlock2D",
"DownBlock2D"
],
"mid_block_type": "UNetMidBlock2DCrossAttn",
"up_block_types": [
"UpBlock2D",
"CrossAttnUpBlock2D",
"CrossAttnUpBlock2D",
"CrossAttnUpBlock2D"
],
"only_cross_attention": false,
"block_out_channels": [
320,
640,
1280,
1280
],
"layers_per_block": 2,
"downsample_padding": 1,
"mid_block_scale_factor": 1,
"dropout": 0.0,
"act_fn": "silu",
"norm_num_groups": 32,
"norm_eps": 1e-05,
"cross_attention_dim": 768,
"transformer_layers_per_block": 1,
"reverse_transformer_layers_per_block": null,
"encoder_hid_dim": null,
"encoder_hid_dim_type": null,
"attention_head_dim": 8,
"num_attention_heads": null,
"dual_cross_attention": false,
"use_linear_projection": false,
"class_embed_type": null,
"addition_embed_type": null,
"addition_time_embed_dim": null,
"num_class_embeds": null,
"upcast_attention": false,
"resnet_time_scale_shift": "default",
"resnet_skip_time_act": false,
"resnet_out_scale_factor": 1.0,
"time_embedding_type": "positional",
"time_embedding_dim": null,
"time_embedding_act_fn": null,
"timestep_post_act": null,
"time_cond_proj_dim": null,
"conv_in_kernel": 3,
"conv_out_kernel": 3,
"projection_class_embeddings_input_dim": null,
"attention_type": "default",
"class_embeddings_concat": false,
"mid_block_only_cross_attention": null,
"cross_attention_norm": null,
"addition_embed_type_num_heads": 64
},
"module_shapes": {
"conv_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 4,
"kernel_size": [
3,
3
]
},
"time_embedding.linear_1": {
"type": "linear",
"out_features": 1280,
"in_features": 320
},
"time_embedding.linear_2": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"down_blocks.0.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 200,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"down_blocks.0.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 200
},
"down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 2560,
"in_features": 320
},
"down_blocks.0.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"down_blocks.0.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"down_blocks.0.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 200,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"down_blocks.0.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 200
},
"down_blocks.0.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 2560,
"in_features": 320
},
"down_blocks.0.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"down_blocks.0.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"down_blocks.0.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.0.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"down_blocks.0.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.0.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.0.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"down_blocks.0.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.0.downsamplers.0.conv": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.1.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 400,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"down_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 400
},
"down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 5120,
"in_features": 640
},
"down_blocks.1.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 640,
"in_features": 2560
},
"down_blocks.1.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"down_blocks.1.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 400,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"down_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 400
},
"down_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 5120,
"in_features": 640
},
"down_blocks.1.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 640,
"in_features": 2560
},
"down_blocks.1.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"down_blocks.1.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"down_blocks.1.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 640,
"in_features": 1280
},
"down_blocks.1.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"down_blocks.1.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"down_blocks.1.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"down_blocks.1.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 640,
"in_features": 1280
},
"down_blocks.1.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"down_blocks.1.downsamplers.0.conv": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"down_blocks.2.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 800
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"down_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"down_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 10240,
"in_features": 1280
},
"down_blocks.2.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 5120
},
"down_blocks.2.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"down_blocks.2.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 800
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"down_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"down_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 10240,
"in_features": 1280
},
"down_blocks.2.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 5120
},
"down_blocks.2.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"down_blocks.2.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"down_blocks.2.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"down_blocks.2.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.2.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"down_blocks.2.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.2.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"down_blocks.2.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.2.downsamplers.0.conv": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.3.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 480,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.3.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"down_blocks.3.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 480,
"kernel_size": [
3,
3
]
},
"down_blocks.3.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 480,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"down_blocks.3.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"down_blocks.3.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 480,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 960,
"in_channels": 2560,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 960,
"in_features": 1280
},
"up_blocks.0.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 960,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
1,
1
]
},
"up_blocks.0.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 960,
"in_channels": 2560,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 960,
"in_features": 1280
},
"up_blocks.0.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 960,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.1.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
1,
1
]
},
"up_blocks.0.resnets.2.conv1": {
"type": "conv2d",
"out_channels": 960,
"in_channels": 2560,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.2.time_emb_proj": {
"type": "linear",
"out_features": 960,
"in_features": 1280
},
"up_blocks.0.resnets.2.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 960,
"kernel_size": [
3,
3
]
},
"up_blocks.0.resnets.2.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
1,
1
]
},
"up_blocks.0.upsamplers.0.conv": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.1.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 800
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"up_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 10240,
"in_features": 1280
},
"up_blocks.1.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 5120
},
"up_blocks.1.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 800
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"up_blocks.1.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 10240,
"in_features": 1280
},
"up_blocks.1.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 5120
},
"up_blocks.1.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.attentions.2.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 800,
"in_features": 1280
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 800
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"up_blocks.1.attentions.2.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"up_blocks.1.attentions.2.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 10240,
"in_features": 1280
},
"up_blocks.1.attentions.2.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 5120
},
"up_blocks.1.attentions.2.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.1.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"up_blocks.1.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
1,
1
]
},
"up_blocks.1.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"up_blocks.1.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.1.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 2560,
"kernel_size": [
1,
1
]
},
"up_blocks.1.resnets.2.conv1": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1920,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.2.time_emb_proj": {
"type": "linear",
"out_features": 1280,
"in_features": 1280
},
"up_blocks.1.resnets.2.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.1.resnets.2.conv_shortcut": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1920,
"kernel_size": [
1,
1
]
},
"up_blocks.1.upsamplers.0.conv": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.2.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 400,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 400
},
"up_blocks.2.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 5120,
"in_features": 640
},
"up_blocks.2.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 640,
"in_features": 2560
},
"up_blocks.2.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 400,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 400
},
"up_blocks.2.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 5120,
"in_features": 640
},
"up_blocks.2.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 640,
"in_features": 2560
},
"up_blocks.2.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.attentions.2.proj_in": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 400,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 400,
"in_features": 768
},
"up_blocks.2.attentions.2.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 640,
"in_features": 400
},
"up_blocks.2.attentions.2.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 5120,
"in_features": 640
},
"up_blocks.2.attentions.2.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 640,
"in_features": 2560
},
"up_blocks.2.attentions.2.proj_out": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.2.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 1920,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 640,
"in_features": 1280
},
"up_blocks.2.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 1920,
"kernel_size": [
1,
1
]
},
"up_blocks.2.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 640,
"in_features": 1280
},
"up_blocks.2.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.1.conv_shortcut": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"up_blocks.2.resnets.2.conv1": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 960,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.2.time_emb_proj": {
"type": "linear",
"out_features": 640,
"in_features": 1280
},
"up_blocks.2.resnets.2.conv2": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.2.resnets.2.conv_shortcut": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 960,
"kernel_size": [
1,
1
]
},
"up_blocks.2.upsamplers.0.conv": {
"type": "conv2d",
"out_channels": 640,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.3.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 200,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 200
},
"up_blocks.3.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 2560,
"in_features": 320
},
"up_blocks.3.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.attentions.1.proj_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 200,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 200
},
"up_blocks.3.attentions.1.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 2560,
"in_features": 320
},
"up_blocks.3.attentions.1.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.attentions.1.proj_out": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.attentions.2.proj_in": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 200,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 200,
"in_features": 768
},
"up_blocks.3.attentions.2.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 320,
"in_features": 200
},
"up_blocks.3.attentions.2.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 2560,
"in_features": 320
},
"up_blocks.3.attentions.2.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.attentions.2.proj_out": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
1,
1
]
},
"up_blocks.3.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 960,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.0.conv_shortcut": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 960,
"kernel_size": [
1,
1
]
},
"up_blocks.3.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.1.conv_shortcut": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"up_blocks.3.resnets.2.conv1": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 640,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.2.time_emb_proj": {
"type": "linear",
"out_features": 320,
"in_features": 1280
},
"up_blocks.3.resnets.2.conv2": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 320,
"kernel_size": [
3,
3
]
},
"up_blocks.3.resnets.2.conv_shortcut": {
"type": "conv2d",
"out_channels": 320,
"in_channels": 640,
"kernel_size": [
1,
1
]
},
"mid_block.attentions.0.proj_in": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"mid_block.attentions.0.transformer_blocks.0.attn1.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"mid_block.attentions.0.transformer_blocks.0.attn1.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"mid_block.attentions.0.transformer_blocks.0.attn1.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"mid_block.attentions.0.transformer_blocks.0.attn1.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"mid_block.attentions.0.transformer_blocks.0.attn2.to_q": {
"type": "linear",
"out_features": 480,
"in_features": 1280
},
"mid_block.attentions.0.transformer_blocks.0.attn2.to_k": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"mid_block.attentions.0.transformer_blocks.0.attn2.to_v": {
"type": "linear",
"out_features": 480,
"in_features": 768
},
"mid_block.attentions.0.transformer_blocks.0.attn2.to_out.0": {
"type": "linear",
"out_features": 1280,
"in_features": 480
},
"mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj": {
"type": "linear",
"out_features": 1024,
"in_features": 1280
},
"mid_block.attentions.0.transformer_blocks.0.ff.net.2": {
"type": "linear",
"out_features": 1280,
"in_features": 512
},
"mid_block.attentions.0.proj_out": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 1280,
"kernel_size": [
1,
1
]
},
"mid_block.resnets.0.conv1": {
"type": "conv2d",
"out_channels": 608,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"mid_block.resnets.0.time_emb_proj": {
"type": "linear",
"out_features": 608,
"in_features": 1280
},
"mid_block.resnets.0.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 608,
"kernel_size": [
3,
3
]
},
"mid_block.resnets.1.conv1": {
"type": "conv2d",
"out_channels": 608,
"in_channels": 1280,
"kernel_size": [
3,
3
]
},
"mid_block.resnets.1.time_emb_proj": {
"type": "linear",
"out_features": 608,
"in_features": 1280
},
"mid_block.resnets.1.conv2": {
"type": "conv2d",
"out_channels": 1280,
"in_channels": 608,
"kernel_size": [
3,
3
]
},
"conv_out": {
"type": "conv2d",
"out_channels": 4,
"in_channels": 320,
"kernel_size": [
3,
3
]
}
},
"torch_version": "2.9.1+cu130"
}