{
    "models": {
        "property_encoder": {
            "name": "ElasticPropertyEncoder",
            "args": {
                "resolution": 64,
                "in_channels": 3072,
                "in_channels_phy": 14,
                "model_channels": 768,
                "latent_channels": 8,
                "num_blocks": 4,
                "num_heads": 12,
                "mlp_ratio": 4,
                "attn_mode": "swin",
                "window_size": 8,
                "use_fp16": true
            }
        },
        "property_decoder": {
            "name": "ElasticPropertyDecoder",
            "args": {
                "resolution": 64,
                "model_channels": 2048,
                "latent_channels": 8,
                "num_blocks": 4,
                "num_heads": 16,
                "mlp_ratio": 4,
                "attn_mode": "swin",
                "window_size": 8,
                "use_fp16": true,
                "representation_config": {
                    "use_color": true
                }
            }
        },
        "property_output": {
            "name": "PropertyOutput",
            "args": {
                "model_channels": 32,
                "output_channels_lang": 3072,
                "output_channels_phy": 14,
                "use_fp16": true
            }
        },
        "decoder": {
            "name": "ElasticSLatMeshDecodernew",
            "args": {
                "resolution": 64,
                "model_channels": 768,
                "phy_channels": 2048,
                "latent_channels": 8,
                "num_blocks": 12,
                "num_heads": 12,
                "mlp_ratio": 4,
                "attn_mode": "swin",
                "window_size": 8,
                "use_fp16": true,
                "representation_config": {
                    "use_color": true
                }
            }
        }
    },
    "dataset": {
        "name": "Slat2RenderGeomesh",
        "args": {
            "image_size": 384,
            "latent_model": "dinov2_vitl14_reg_slat_enc_swin8_B_64l8_fp16",
            "min_aesthetic_score": 4.5,
            "max_num_voxels": 28000
        }
    },
    "trainer": {
        "name": "SLatVaeMeshTrainer",
        "args": {
            "onlyphy_property": true,
            "max_steps": 1000000,
            "batch_size_per_gpu": 4,
            "batch_split": 4,
            "optimizer": {
                "name": "AdamW",
                "args": {
                    "lr": 0.0001,
                    "weight_decay": 0.0
                }
            },
            "ema_rate": [
                0.9999
            ],
            "fp16_mode": "inflat_all",
            "fp16_scale_growth": 0.001,
            "elastic": {
                "name": "LinearMemoryController",
                "args": {
                    "target_ratio": 0.6,
                    "max_mem_ratio_start": 0.5
                }
            },
            "grad_clip": {
                "name": "AdaptiveGradClipper",
                "args": {
                    "max_norm": 1.0,
                    "clip_percentile": 95
                }
            },
            "i_log": 10,
            "i_sample": 5000,
            "i_save": 10000,
            "lambda_ssim": 0.2,
            "lambda_lpips": 0.2,
            "lambda_tsdf": 0.01,
            "lambda_depth": 10.0,
            "lambda_color": 0.1,
            "lambda_kl": 1e-06,
            "depth_loss_type": "smooth_l1"
        }
    }
}