RecGen / slat_config.json
zakharos's picture
Upload folder using huggingface_hub
43b591a verified
{
"models": {
"denoiser": {
"name": "ElasticSLatCondFlowModel",
"args": {
"resolution": 64,
"in_channels": 8,
"out_channels": 8,
"model_channels": 1024,
"cond_channels": 1024,
"num_blocks": 24,
"num_heads": 16,
"mlp_ratio": 4,
"patch_size": 2,
"num_io_res_blocks": 2,
"io_block_channels": [
128
],
"pe_mode": "ape",
"qk_rms_norm": true,
"use_fp16": true,
"use_point_embedder": true,
"point_embedder_out_channels": 1024,
"use_mask_embedder": true,
"mask_embedder_out_channels": 1024,
"use_pose_embedder": true,
"pose_embedder_out_channels": 1024,
"pose_representation": "9d_translation_scale"
}
}
},
"dataset": {
"name": "WebDatasetMultiViewStructuredLatent",
"s3_data_root": "s3://data_root",
"args": {
"shards": "{ABO_wds/shard-{000000..000398},HSSD_wds/shard-{000000..000559},Objaverse_recgen_new_wds/shard-{000000..008374},Objaverse_recgen_new_wds/shard-{010000..018208},PartNeXt_wds_full/shard-{000000..001981}}.tar",
"num_views": 2,
"image_size": 518,
"min_aesthetic_score": 4.5,
"shuffle_buffer": 200,
"length": 189119,
"require_pose_data": true,
"require_pointmap": true,
"use_pose_normalization": true,
"pose_variant": "median_quantile_5per",
"mix_stereo_depth": true,
"normalization": {
"mean": [
-2.1687545776367188,
-0.004347046371549368,
-0.13352349400520325,
-0.08418072760105133,
-0.5271206498146057,
0.7238689064979553,
-1.1414450407028198,
1.2039363384246826
],
"std": [
2.377650737762451,
2.386378288269043,
2.124418020248413,
2.1748552322387695,
2.663944721221924,
2.371192216873169,
2.6217446327209473,
2.684523105621338
]
},
"metadata_root": "/tmp/wds_metadata"
}
},
"trainer": {
"name": "MultiImageConditionedSparseFlowMatchingCFGTrainer",
"args": {
"max_steps": 100000,
"batch_size_per_gpu": 8,
"batch_split": 2,
"finetune_ckpt": {
"denoiser": "checkpoints/TRELLIS-image-large/ckpts/slat_flow_img_dit_L_64l8p2_fp16.pt"
},
"optimizer": {
"name": "AdamW",
"args": {
"lr": 0.0001,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"elastic": {
"name": "LinearMemoryController",
"args": {
"target_ratio": 0.85,
"max_mem_ratio_start": 0.6
}
},
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_print": 200,
"i_log": 200,
"i_sample": 1000,
"i_save": 5000,
"i_ddpcheck": 5000,
"p_uncond": 0.1,
"p_single_view": 0.33,
"single_view_drop_mode": "last",
"t_schedule": {
"name": "logitNormal",
"args": {
"mean": 1.0,
"std": 1.0
}
},
"sigma_min": 1e-05,
"image_cond_model": "dinov2_vitl14_reg",
"use_pose_conditioning": true,
"pose_representation": "9d_translation_scale",
"enable_snapshot": true
}
},
"use_wandb": true,
"wandb_name": "slat-multiview-2pose-s3",
"wandb_config": {
"experiment_type": "slat_multiview_2pose",
"datasets": [
"ABO_wds",
"HSSD_wds",
"Objaverse_recgen_new_wds",
"PartNeXt_wds_full"
],
"num_views": 2,
"notes": "SLat multi-view training with per-view additive pose conditioning (9D rotation). Stereo depth mixing, flexible view dropout (33% single), shared CFG dropout (10% uncond). Robust pose normalization (median_quantile_5per). S3 streaming."
}
}