File size: 4,309 Bytes
43b591a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
{
    "models": {
        "denoiser": {
            "name": "SparseStructurePoseFlowModel",
            "args": {
                "resolution": 16,
                "in_channels": 8,
                "out_channels": 8,
                "model_channels": 1024,
                "cond_channels": 1024,
                "num_blocks": 24,
                "num_heads": 16,
                "mlp_ratio": 4,
                "patch_size": 1,
                "pe_mode": "ape",
                "qk_rms_norm": true,
                "use_fp16": true,
                "use_point_embedder": true,
                "point_embedder_out_channels": 1024,
                "use_mask_embedder": true,
                "mask_embedder_out_channels": 1024,
                "use_frame_token_embedder": true,
                "pose_representation": "6d_translation_scale",
                "num_pose_tokens": 2
            }
        }
    },
    "dataset": {
        "name": "WebDatasetMultiViewSparseStructureLatent",
        "s3_data_root": "s3://data_root",
        "args": {
            "shards": "{ABO_wds/shard-{000000..000346},HSSD_wds/shard-{000000..000519},Objaverse_recgen_new_wds/shard-{000000..015735},PhysX3DParts_wds/shard-{000000..000933},PartNeXt_wds/shard-{000000..001528},PartNetMobility_wds/shard-{000000..000212}}.tar",
            "num_views": 2,
            "min_aesthetic_score": 0.0,
            "image_size": 518,
            "shuffle_buffer": 200,
            "length": 198455,
            "require_pose_data": true,
            "require_pointmap": true,
            "use_pose_normalization": true,
            "pose_variant": "median_quantile_5per",
            "mix_stereo_depth": true,
            "blacklist_file": "blacklist.txt",
            "metadata_root": ""
        }
    },
    "trainer": {
        "name": "MultiImageConditionedFlowMatchingCFGTrainer",
        "args": {
            "max_steps": 100000,
            "batch_size_per_gpu": 8,
            "batch_split": 2,
            "finetune_ckpt": {
                "denoiser": "checkpoints/TRELLIS-image-large/ckpts/ss_flow_img_dit_L_16l8_fp16.pt"
            },
            "optimizer": {
                "name": "AdamW",
                "args": {
                    "lr": 0.0001,
                    "weight_decay": 0.0
                }
            },
            "ema_rate": [
                0.9999
            ],
            "fp16_mode": "inflat_all",
            "fp16_scale_growth": 0.001,
            "grad_clip": {
                "name": "AdaptiveGradClipper",
                "args": {
                    "max_norm": 1.0,
                    "clip_percentile": 95
                }
            },
            "i_print": 200,
            "i_log": 200,
            "i_sample": 1000,
            "i_save": 5000,
            "i_ddpcheck": 5000,
            "generate_alignment_overlays": false,
            "num_alignment_samples": 4,
            "p_uncond": 0.1,
            "p_single_view": 0.33,
            "single_view_drop_mode": "last",
            "t_schedule": {
                "name": "logitNormal",
                "args": {
                    "mean": 1.0,
                    "std": 1.0
                }
            },
            "sigma_min": 1e-05,
            "pose_alpha": 0.01,
            "pose_representation": "6d_translation_scale",
            "use_pose_normalization": true,
            "image_cond_model": "dinov2_vitl14_reg"
        }
    },
    "use_wandb": true,
    "wandb_name": "flexible-dropout-stereo-depth-frame-token-2pose-s3",
    "wandb_config": {
        "experiment_type": "multiview_flexible_dropout_stereo_depth_frame_token_2pose",
        "datasets": [
            "ABO_wds",
            "HSSD_wds",
            "Objaverse_recgen_new_wds",
            "PhysX3DParts_wds",
            "PartNeXt_wds",
            "PartNetMobility_wds"
        ],
        "num_views": 2,
        "num_pose_tokens": 2,
        "view_dropout": "66% both / 33% single (independent) + 10% uncond (CFG)",
        "notes": "Multi-view training with 2 pose tokens (one per view). Learned per-frame token-type embeddings. Stereo depth mixing (50/50), flexible view dropout (33% single, pose loss masked for dropped view), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming."
    }
}