{
  "sample_size": 128,
  "in_channels": 4,
  "out_channels": 4,
  "center_input_sample": false,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "down_block_types": [
    "DownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D"
  ],
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "up_block_types": [
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "UpBlock2D"
  ],
  "block_out_channels": [
    320,
    640,
    1280
  ],
  "layers_per_block": 2,
  "cross_attention_dim": 2048,
  "transformer_layers_per_block": 10,
  "attention_head_dim": 8,
  "num_attention_heads": 16,
  "use_linear_projection": true
}