RecGen / stereo_config.json
zakharos's picture
Upload folder using huggingface_hub
43b591a verified
{
"models": {
"denoiser": {
"name": "SparseStructurePoseFlowModel",
"args": {
"resolution": 16,
"in_channels": 8,
"out_channels": 8,
"model_channels": 1024,
"cond_channels": 1024,
"num_blocks": 24,
"num_heads": 16,
"mlp_ratio": 4,
"patch_size": 1,
"pe_mode": "ape",
"qk_rms_norm": true,
"use_fp16": true,
"use_point_embedder": true,
"point_embedder_out_channels": 1024,
"use_mask_embedder": true,
"mask_embedder_out_channels": 1024,
"use_frame_token_embedder": true,
"pose_representation": "6d_translation_scale",
"num_pose_tokens": 2
}
}
},
"dataset": {
"name": "WebDatasetMultiViewSparseStructureLatent",
"s3_data_root": "s3://data_root",
"args": {
"shards": "{ABO_wds/shard-{000000..000346},HSSD_wds/shard-{000000..000519},Objaverse_recgen_new_wds/shard-{000000..015735},PhysX3DParts_wds/shard-{000000..000933},PartNeXt_wds/shard-{000000..001528},PartNetMobility_wds/shard-{000000..000212}}.tar",
"num_views": 2,
"min_aesthetic_score": 0.0,
"image_size": 518,
"shuffle_buffer": 200,
"length": 198455,
"require_pose_data": true,
"require_pointmap": true,
"use_pose_normalization": true,
"pose_variant": "median_quantile_5per",
"mix_stereo_depth": true,
"blacklist_file": "blacklist.txt",
"metadata_root": ""
}
},
"trainer": {
"name": "MultiImageConditionedFlowMatchingCFGTrainer",
"args": {
"max_steps": 100000,
"batch_size_per_gpu": 8,
"batch_split": 2,
"finetune_ckpt": {
"denoiser": "checkpoints/TRELLIS-image-large/ckpts/ss_flow_img_dit_L_16l8_fp16.pt"
},
"optimizer": {
"name": "AdamW",
"args": {
"lr": 0.0001,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_print": 200,
"i_log": 200,
"i_sample": 1000,
"i_save": 5000,
"i_ddpcheck": 5000,
"generate_alignment_overlays": false,
"num_alignment_samples": 4,
"p_uncond": 0.1,
"p_single_view": 0.33,
"single_view_drop_mode": "last",
"t_schedule": {
"name": "logitNormal",
"args": {
"mean": 1.0,
"std": 1.0
}
},
"sigma_min": 1e-05,
"pose_alpha": 0.01,
"pose_representation": "6d_translation_scale",
"use_pose_normalization": true,
"image_cond_model": "dinov2_vitl14_reg"
}
},
"use_wandb": true,
"wandb_name": "flexible-dropout-stereo-depth-frame-token-2pose-s3",
"wandb_config": {
"experiment_type": "multiview_flexible_dropout_stereo_depth_frame_token_2pose",
"datasets": [
"ABO_wds",
"HSSD_wds",
"Objaverse_recgen_new_wds",
"PhysX3DParts_wds",
"PartNeXt_wds",
"PartNetMobility_wds"
],
"num_views": 2,
"num_pose_tokens": 2,
"view_dropout": "66% both / 33% single (independent) + 10% uncond (CFG)",
"notes": "Multi-view training with 2 pose tokens (one per view). Learned per-frame token-type embeddings. Stereo depth mixing (50/50), flexible view dropout (33% single, pose loss masked for dropped view), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming."
}
}