| _target_: ignore_this_field | |
| depth: 48 | |
| patch_size: 2 | |
| num_heads: 24 | |
| hidden_size_x: 3072 | |
| hidden_size_y: 1536 | |
| mlp_ratio_x: 4.0 | |
| mlp_ratio_y: 4.0 | |
| learn_sigma: false | |
| in_channels: 12 | |
| clip_feat_dim: 2048 | |
| qk_norm: true | |
| qkv_bias: false | |
| out_bias: true | |
| attn_drop: 0.0 | |
| patch_embed_bias: true | |
| posenc_preserve_area: true | |
| timestep_mlp_bias: true | |
| pooled_caption_mlp_bias: true | |
| attend_to_padding: false | |
| timestep_scale: 1000.0 | |
| use_t5: true | |
| t5_feat_dim: 4096 | |
| t5_token_length: 256 | |
| rope_theta: 10000.0 | |
| use_transformer_engine: true | |