{
  "sample_size": 128,
  "patch_size": 2,
  "in_channels": 16,
  "num_layers": 18,
  "attention_head_dim": 64,
  "num_attention_heads": 18,
  "joint_attention_dim": 4096,
  "caption_projection_dim": 1152,
  "pooled_projection_dim": 2048,
  "out_channels": 16,
  "pos_embed_max_size": 96,
  "_use_default_values": [
    "in_channels",
    "num_layers",
    "caption_projection_dim",
    "out_channels",
    "joint_attention_dim",
    "patch_size",
    "pos_embed_max_size",
    "num_attention_heads",
    "sample_size",
    "pooled_projection_dim",
    "attention_head_dim"
  ]
}