---
# Stage 2: Structure DiT (Rectified Flow)
#
# NOTE(review): this file had lost its indentation (every line was wrapped in
# table pipes), so the nesting below is reconstructed from the key names.
# Confirm against the config loader that `conditioning` belongs under `model`,
# that `scheduler` is a top-level section, and that `curriculum` belongs under
# `training`.

# Network architecture.
model:
  name: "SLAT-Interior-DiT"
  width: 1536
  depth: 30
  num_heads: 12  # 1536 / 12 = 128, so width divides evenly across heads
  # NOTE(review): 8192 is implausible as a multiplier of `width`; it looks
  # like an absolute MLP hidden size. Confirm how the model code reads this
  # key before changing either the name or the value.
  mlp_ratio: 8192
  # Conditioning encoders and their `*_dim` settings.
  conditioning:
    image_encoder: "dinov3_large"
    depth_encoder: "custom_cnn"
    depth_dim: 256
    layout_encoder: "transformer"
    layout_dim: 512
    semantic_dim: 256

optimizer:
  type: AdamW
  # Keep the decimal point and signed exponent: YAML 1.1 loaders only
  # recognise this spelling as a float.
  lr: 1.0e-4
  weight_decay: 0.01

scheduler:
  type: linear_warmup_cosine
  warmup_steps: 10000

training:
  batch_size: 8  # per GPU
  num_gpus: 32
  # Matches batch_size * num_gpus * gradient_accumulation (8 * 32 * 1);
  # keep in sync when changing any of the three.
  effective_batch_size: 256
  gradient_accumulation: 1
  max_steps: 400000
  mixed_precision: bf16
  save_every: 10000
  log_every: 100
  # Resolution stages, listed in order. Their step counts add up to max_steps
  # (100000 + 200000 + 100000 = 400000).
  # NOTE(review): each stage carries its own `lr`; how that interacts with
  # `optimizer.lr` and the scheduler is not visible here - confirm.
  curriculum:
    - resolution: 256
      steps: 100000
      lr: 1.0e-4
    - resolution: 512
      steps: 200000
      lr: 1.0e-4
    - resolution: 1024
      steps: 100000
      lr: 2.0e-5

data:
  dataset: "InteriorFusion-Train"
  num_workers: 8
  pin_memory: true

# Noise-level settings for the flow-matching objective.
# NOTE(review): presumably `p_mean` / `p_std` parameterise the distribution
# that noise levels are sampled from during training - verify in the trainer.
flow_matching:
  sigma_min: 0.001
  sigma_max: 80.0
  p_mean: -1.2
  p_std: 1.2

# Per-term loss weights.
loss:
  flow_matching:
    weight: 1.0
  depth_guidance:
    weight: 0.3