File size: 4,990 Bytes
43b591a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
{
    "models": {
        "denoiser": {
            "name": "ElasticSLatCondFlowModel",
            "args": {
                "resolution": 64,
                "in_channels": 8,
                "out_channels": 8,
                "model_channels": 1024,
                "cond_channels": 1024,
                "num_blocks": 24,
                "num_heads": 16,
                "mlp_ratio": 4,
                "patch_size": 2,
                "num_io_res_blocks": 2,
                "io_block_channels": [
                    128
                ],
                "pe_mode": "ape",
                "qk_rms_norm": true,
                "use_fp16": true,
                "use_point_embedder": true,
                "point_embedder_out_channels": 1024,
                "use_mask_embedder": true,
                "mask_embedder_out_channels": 1024,
                "use_pose_embedder": true,
                "pose_embedder_out_channels": 1024,
                "pose_representation": "9d_translation_scale"
            }
        }
    },
    "dataset": {
        "name": "WebDatasetMultiViewStructuredLatent",
        "s3_data_root": "s3://data_root",
        "args": {
            "shards": "ABO_wds/shard-{000000..000398},HSSD_wds/shard-{000000..000559},Objaverse_recgen_new_wds/shard-{000000..008374},Objaverse_recgen_new_wds/shard-{010000..018208},PartNeXt_wds_full/shard-{000000..001981}}.tar",
            "num_views": 2,
            "image_size": 518,
            "min_aesthetic_score": 4.5,
            "shuffle_buffer": 200,
            "length": 189119,
            "require_pose_data": true,
            "require_pointmap": true,
            "use_pose_normalization": true,
            "pose_variant": "median_quantile_5per",
            "mix_stereo_depth": true,
            "normalization": {
                "mean": [
                    -2.1687545776367188,
                    -0.004347046371549368,
                    -0.13352349400520325,
                    -0.08418072760105133,
                    -0.5271206498146057,
                    0.7238689064979553,
                    -1.1414450407028198,
                    1.2039363384246826
                ],
                "std": [
                    2.377650737762451,
                    2.386378288269043,
                    2.124418020248413,
                    2.1748552322387695,
                    2.663944721221924,
                    2.371192216873169,
                    2.6217446327209473,
                    2.684523105621338
                ]
            },
            "metadata_root": "/tmp/wds_metadata"
        }
    },
    "trainer": {
        "name": "MultiImageConditionedSparseFlowMatchingCFGTrainer",
        "args": {
            "max_steps": 100000,
            "batch_size_per_gpu": 8,
            "batch_split": 2,
            "finetune_ckpt": {
                "denoiser": "checkpoints/TRELLIS-image-large/ckpts/slat_flow_img_dit_L_64l8p2_fp16.pt"
            },
            "optimizer": {
                "name": "AdamW",
                "args": {
                    "lr": 0.0001,
                    "weight_decay": 0.0
                }
            },
            "ema_rate": [
                0.9999
            ],
            "fp16_mode": "inflat_all",
            "fp16_scale_growth": 0.001,
            "elastic": {
                "name": "LinearMemoryController",
                "args": {
                    "target_ratio": 0.85,
                    "max_mem_ratio_start": 0.6
                }
            },
            "grad_clip": {
                "name": "AdaptiveGradClipper",
                "args": {
                    "max_norm": 1.0,
                    "clip_percentile": 95
                }
            },
            "i_print": 200,
            "i_log": 200,
            "i_sample": 1000,
            "i_save": 5000,
            "i_ddpcheck": 5000,
            "p_uncond": 0.1,
            "p_single_view": 0.33,
            "single_view_drop_mode": "last",
            "t_schedule": {
                "name": "logitNormal",
                "args": {
                    "mean": 1.0,
                    "std": 1.0
                }
            },
            "sigma_min": 1e-05,
            "image_cond_model": "dinov2_vitl14_reg",
            "use_pose_conditioning": true,
            "pose_representation": "9d_translation_scale",
            "enable_snapshot": true
        }
    },
    "use_wandb": true,
    "wandb_name": "slat-multiview-2pose-s3",
    "wandb_config": {
        "experiment_type": "slat_multiview_2pose",
        "datasets": [
            "ABO_wds",
            "HSSD_wds",
            "Objaverse_recgen_new_wds",
            "PartNeXt_wds"
        ],
        "num_views": 2,
        "notes": "SLat multi-view training with per-view additive pose conditioning (9D rotation). Stereo depth mixing, flexible view dropout (33% single), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming."
    }
}