{
    "models": {
        "denoiser": {
            "name": "SparseStructurePoseFlowModel",
            "args": {
                "resolution": 16,
                "in_channels": 8,
                "out_channels": 8,
                "model_channels": 1024,
                "cond_channels": 1024,
                "num_blocks": 24,
                "num_heads": 16,
                "mlp_ratio": 4,
                "patch_size": 1,
                "pe_mode": "ape",
                "qk_rms_norm": true,
                "use_fp16": true,
                "use_point_embedder": true,
                "point_embedder_out_channels": 1024,
                "use_mask_embedder": true,
                "mask_embedder_out_channels": 1024,
                "use_frame_token_embedder": true,
                "pose_representation": "6d_translation_scale",
                "num_pose_tokens": 2
            }
        }
    },
    "dataset": {
        "name": "WebDatasetMultiViewSparseStructureLatent",
        "s3_data_root": "s3://data_root",
        "args": {
            "shards": "{ABO_wds/shard-{000000..000346},HSSD_wds/shard-{000000..000519},Objaverse_recgen_new_wds/shard-{000000..015735},PhysX3DParts_wds/shard-{000000..000933},PartNeXt_wds/shard-{000000..001528},PartNetMobility_wds/shard-{000000..000212}}.tar",
            "num_views": 2,
            "min_aesthetic_score": 0.0,
            "image_size": 518,
            "shuffle_buffer": 200,
            "length": 198455,
            "require_pose_data": true,
            "require_pointmap": true,
            "use_pose_normalization": true,
            "pose_variant": "median_quantile_5per",
            "mix_stereo_depth": true,
            "blacklist_file": "blacklist.txt",
            "metadata_root": ""
        }
    },
    "trainer": {
        "name": "MultiImageConditionedFlowMatchingCFGTrainer",
        "args": {
            "max_steps": 100000,
            "batch_size_per_gpu": 8,
            "batch_split": 2,
            "finetune_ckpt": {
                "denoiser": "checkpoints/TRELLIS-image-large/ckpts/ss_flow_img_dit_L_16l8_fp16.pt"
            },
            "optimizer": {
                "name": "AdamW",
                "args": {
                    "lr": 0.0001,
                    "weight_decay": 0.0
                }
            },
            "ema_rate": [
                0.9999
            ],
            "fp16_mode": "inflat_all",
            "fp16_scale_growth": 0.001,
            "grad_clip": {
                "name": "AdaptiveGradClipper",
                "args": {
                    "max_norm": 1.0,
                    "clip_percentile": 95
                }
            },
            "i_print": 200,
            "i_log": 200,
            "i_sample": 1000,
            "i_save": 5000,
            "i_ddpcheck": 5000,
            "generate_alignment_overlays": false,
            "num_alignment_samples": 4,
            "p_uncond": 0.1,
            "p_single_view": 0.33,
            "single_view_drop_mode": "last",
            "t_schedule": {
                "name": "logitNormal",
                "args": {
                    "mean": 1.0,
                    "std": 1.0
                }
            },
            "sigma_min": 1e-05,
            "pose_alpha": 0.01,
            "pose_representation": "6d_translation_scale",
            "use_pose_normalization": true,
            "image_cond_model": "dinov2_vitl14_reg"
        }
    },
    "use_wandb": true,
    "wandb_name": "flexible-dropout-stereo-depth-frame-token-2pose-s3",
    "wandb_config": {
        "experiment_type": "multiview_flexible_dropout_stereo_depth_frame_token_2pose",
        "datasets": [
            "ABO_wds",
            "HSSD_wds",
            "Objaverse_recgen_new_wds",
            "PhysX3DParts_wds",
            "PartNeXt_wds",
            "PartNetMobility_wds"
        ],
        "num_views": 2,
        "num_pose_tokens": 2,
        "view_dropout": "67% both / 33% single (independent) + 10% uncond (CFG)",
        "notes": "Multi-view training with 2 pose tokens (one per view). Learned per-frame token-type embeddings. Stereo depth mixing (50/50), flexible view dropout (33% single, pose loss masked for dropped view), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming."
    }
}