---
# Model / dataset configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML (a plain scalar cannot contain ": "). The block structure
# below was reconstructed by treating every value-less key as a section header
# that owns the scalar keys following it. Key names, values and their order are
# unchanged; confirm the nesting against the code that consumes this config.

# Encoder settings.
encoder:
  type: point_image_text
  # NOTE(review): assumed to be an encoder option because it directly follows
  # `type`; verify it is not meant to be a top-level key.
  x0_mode: random
  num_bins: 256
  input_channels: 3
  d_model: 768
  conv_layers:
    - 32
    - 64
    - 128
    - 256
  dino_image_size: 280
  dino_mask_inject: true
  dino_rot90inputs: true
  dino_use_giant_model: true
  dino_legacy_upsample: false
  use_pre_text_attn_blocks: true
  use_sam2_features: false

# Transformer settings.
fm_transformer:
  hidden_size: 1024
  num_heads: 16
  mlp_ratio: 2.0
  qkv_bias: true
  depth: 16
  depth_single_blocks: 32
  # NOTE(review): placed here because it follows the transformer keys and
  # precedes the next section header; confirm it is not a top-level key.
  time_sampler: flux

# VAE settings.
vae:
  embed_dim: 64

# Dataset settings.
dataset:
  load_image_mode: composite
  num_views: 2
  variable_num_views: false
  semi_dense_threshold_theta: 0.002
  semi_dense_threshold_phi: 0.01