| encoder: | |
| type: point_image_text | |
| x0_mode: random | |
| num_bins: 256 | |
| input_channels: 3 | |
| d_model: 768 | |
| conv_layers: | |
| - 32 | |
| - 64 | |
| - 128 | |
| - 256 | |
| dino_image_size: 280 | |
| dino_mask_inject: true | |
| dino_rot90inputs: true | |
| dino_use_giant_model: true | |
| dino_legacy_upsample: false | |
| use_pre_text_attn_blocks: true | |
| use_sam2_features: false | |
| fm_transformer: | |
| hidden_size: 1024 | |
| num_heads: 16 | |
| mlp_ratio: 2.0 | |
| qkv_bias: true | |
| depth: 16 | |
| depth_single_blocks: 32 | |
| time_sampler: flux | |
| vae: | |
| embed_dim: 64 | |
| dataset: | |
| load_image_mode: composite | |
| num_views: 2 | |
| variable_num_views: false | |
| semi_dense_threshold_theta: 0.002 | |
| semi_dense_threshold_phi: 0.01 | |