{ "models": { "denoiser": { "name": "SparseStructurePoseFlowModel", "args": { "resolution": 16, "in_channels": 8, "out_channels": 8, "model_channels": 1024, "cond_channels": 1024, "num_blocks": 24, "num_heads": 16, "mlp_ratio": 4, "patch_size": 1, "pe_mode": "ape", "qk_rms_norm": true, "use_fp16": true, "use_point_embedder": true, "point_embedder_out_channels": 1024, "use_mask_embedder": true, "mask_embedder_out_channels": 1024, "use_frame_token_embedder": true, "pose_representation": "6d_translation_scale", "num_pose_tokens": 2 } } }, "dataset": { "name": "WebDatasetMultiViewSparseStructureLatent", "s3_data_root": "s3://data_root", "args": { "shards": "{ABO_wds/shard-{000000..000346},HSSD_wds/shard-{000000..000519},Objaverse_recgen_new_wds/shard-{000000..015735},PhysX3DParts_wds/shard-{000000..000933},PartNeXt_wds/shard-{000000..001528},PartNetMobility_wds/shard-{000000..000212}}.tar", "num_views": 2, "min_aesthetic_score": 0.0, "image_size": 518, "shuffle_buffer": 200, "length": 198455, "require_pose_data": true, "require_pointmap": true, "use_pose_normalization": true, "pose_variant": "median_quantile_5per", "mix_stereo_depth": true, "blacklist_file": "blacklist.txt", "metadata_root": "" } }, "trainer": { "name": "MultiImageConditionedFlowMatchingCFGTrainer", "args": { "max_steps": 100000, "batch_size_per_gpu": 8, "batch_split": 2, "finetune_ckpt": { "denoiser": "checkpoints/TRELLIS-image-large/ckpts/ss_flow_img_dit_L_16l8_fp16.pt" }, "optimizer": { "name": "AdamW", "args": { "lr": 0.0001, "weight_decay": 0.0 } }, "ema_rate": [ 0.9999 ], "fp16_mode": "inflat_all", "fp16_scale_growth": 0.001, "grad_clip": { "name": "AdaptiveGradClipper", "args": { "max_norm": 1.0, "clip_percentile": 95 } }, "i_print": 200, "i_log": 200, "i_sample": 1000, "i_save": 5000, "i_ddpcheck": 5000, "generate_alignment_overlays": false, "num_alignment_samples": 4, "p_uncond": 0.1, "p_single_view": 0.33, "single_view_drop_mode": "last", "t_schedule": { "name": "logitNormal", "args": { 
"mean": 1.0, "std": 1.0 } }, "sigma_min": 1e-05, "pose_alpha": 0.01, "pose_representation": "6d_translation_scale", "use_pose_normalization": true, "image_cond_model": "dinov2_vitl14_reg" } }, "use_wandb": true, "wandb_name": "flexible-dropout-stereo-depth-frame-token-2pose-s3", "wandb_config": { "experiment_type": "multiview_flexible_dropout_stereo_depth_frame_token_2pose", "datasets": [ "ABO_wds", "HSSD_wds", "Objaverse_recgen_new_wds", "PhysX3DParts_wds", "PartNeXt_wds", "PartNetMobility_wds" ], "num_views": 2, "num_pose_tokens": 2, "view_dropout": "67% both / 33% single (independent) + 10% uncond (CFG)", "notes": "Multi-view training with 2 pose tokens (one per view). Learned per-frame token-type embeddings. Stereo depth mixing (50/50), flexible view dropout (33% single, pose loss masked for dropped view), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming." } }