{
"seed": 42,
"output_dir": "work_dirs/baseline",
"model": {
"img_size": 256,
"input_range": "minus_one_one",
"num_classes": 1000,
"encoder_type": "rectok",
"encoder_model_size": "base",
"encoder_patch_size": 16,
"token_channels": 128,
"mask_ratio": 0.4,
"mask_ratio_min": -0.1,
"mask_ratio_type": "random",
"use_qknorm_encoder": false,
"latent_hw": 16,
"decoder_model": "JiTCoT-B/16",
"decoder_patch_size": 16,
"bottleneck_dim_latent": 128,
"dh_depth": 2,
"dh_hidden_size": 1024,
"attn_dropout": 0.0,
"proj_dropout": 0.0,
"enable_ema": true,
"ema_decay1": 0.9999,
"ema_decay2": 0.9998,
"label_drop_prob": 0.1,
"P_mean": -0.4,
"P_std": 0.8,
"latent_mean": -1.2,
"latent_std": 1.0,
"latent_weight": 1.0,
"choose_latent_p": 0.4,
"perceptual_weight": 1.0,
"perceptual_net": "lpips-convnext_s-1.0-0.1",
"sample_mode": "latent_first_cascaded_noised",
"latent_max_t": 1.0,
"latent_pixel_offset": 0.0,
"latent_pixel_shift": 1.0,
"t_eps": 0.05,
"t_eps_inference": 0.05,
"noise_scale": 1.0,
"sampling_method": "heun",
"num_sampling_steps": 50,
"cfg": 1.0,
"cfg_latent": 1.0,
"interval_min": 0.0,
"interval_max": 1.0,
"interval_min_latent": 0.0,
"interval_max_latent": 1.0,
"gen_shift_pixel": 1.0,
"gen_shift_latent": 1.0,
"guidance_method": "cfg"
},
"data": {
"train_dir": "data/imagenet/train",
"val_dir": "data/imagenet/val",
"num_workers": 8,
"pin_memory": true,
"persistent_workers": true
},
"train": {
"epochs": 200,
"global_batch_size": 1024,
"eval_global_batch_size": 1024,
"grad_accum_steps": 1,
"grad_clip": 3.0,
"amp_dtype": "bf16",
"log_interval": 50
},
"visualization": {
"initial_visualization": true,
"vis_interval": 100,
"visualize_reconstruction": true,
"visualize_generation": true
},
"eval": {
"initial_eval": {
"reconstruction": false,
"generation": false
},
"gfid_interval": 10,
"rfid_interval": 10,
"gfid_stats_path": "fid_stats/jit_in256_stats.npz",
"rfid_stats_path": "fid_stats/val_fid_statistics_file_256.npz",
"inception_weights": "fid_stats/weights-inception-2015-12-05-6726825d.pth",
"gfid_backend": "online",
"gfid_num_classes": 1000,
"gfid_num_images": 50000,
"rfid_num_images": 50000,
"batch_size": 64,
"num_workers": 8,
"gfid_metric_verbose": false,
"gfid_keep_images": false,
"gfid_cfg_scale": null,
"gfid_cfg_scale_latent": null,
"gfid_cfg_interval": null,
"gfid_cfg_interval_latent": null,
"gfid_steps": null,
"eval_ema": "1"
},
"optim": {
"name": "adamw",
"lr": 0.0001,
"lr_schedule": "constant",
"weight_decay": 0.0,
"betas": [
0.9,
0.95
],
"min_lr": 1e-06,
"warmup_epochs": 5
},
"checkpoint": {
"resume": "",
"auto_resume": true,
"save_interval": 1,
"keep_last": 3
},
"logging": {
"enable_wandb": false,
"entity": "",
"project": "diffusion-decoder",
"run_name": "diffusion_decoder_imagenet256"
}
}