File size: 3,194 Bytes
5d8c7a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
{
  "seed": 42,
  "output_dir": "work_dirs/baseline",
  "model": {
    "img_size": 256,
    "input_range": "minus_one_one",
    "num_classes": 1000,
    "encoder_type": "rectok",
    "encoder_model_size": "base",
    "encoder_patch_size": 16,
    "token_channels": 128,
    "mask_ratio": 0.4,
    "mask_ratio_min": -0.1,
    "mask_ratio_type": "random",
    "use_qknorm_encoder": false,
    "latent_hw": 16,
    "decoder_model": "JiTCoT-B/16",
    "decoder_patch_size": 16,
    "bottleneck_dim_latent": 128,
    "dh_depth": 2,
    "dh_hidden_size": 1024,
    "attn_dropout": 0.0,
    "proj_dropout": 0.0,
    "enable_ema": true,
    "ema_decay1": 0.9999,
    "ema_decay2": 0.9998,
    "label_drop_prob": 0.1,
    "P_mean": -0.4,
    "P_std": 0.8,
    "latent_mean": -1.2,
    "latent_std": 1.0,
    "latent_weight": 1.0,
    "choose_latent_p": 0.4,
    "perceptual_weight": 1.0,
    "perceptual_net": "lpips-convnext_s-1.0-0.1",
    "sample_mode": "latent_first_cascaded_noised",
    "latent_max_t": 1.0,
    "latent_pixel_offset": 0.0,
    "latent_pixel_shift": 1.0,
    "t_eps": 0.05,
    "t_eps_inference": 0.05,
    "noise_scale": 1.0,
    "sampling_method": "heun",
    "num_sampling_steps": 50,
    "cfg": 1.0,
    "cfg_latent": 1.0,
    "interval_min": 0.0,
    "interval_max": 1.0,
    "interval_min_latent": 0.0,
    "interval_max_latent": 1.0,
    "gen_shift_pixel": 1.0,
    "gen_shift_latent": 1.0,
    "guidance_method": "cfg"
  },
  "data": {
    "train_dir": "data/imagenet/train",
    "val_dir": "data/imagenet/val",
    "num_workers": 8,
    "pin_memory": true,
    "persistent_workers": true
  },
  "train": {
    "epochs": 200,
    "global_batch_size": 1024,
    "eval_global_batch_size": 1024,
    "grad_accum_steps": 1,
    "grad_clip": 3.0,
    "amp_dtype": "bf16",
    "log_interval": 50
  },
  "visualization": {
    "initial_visualization": true,
    "vis_interval": 100,
    "visualize_reconstruction": true,
    "visualize_generation": true
  },
  "eval": {
    "initial_eval": {
      "reconstruction": false,
      "generation": false
    },
    "gfid_interval": 10,
    "rfid_interval": 10,
    "gfid_stats_path": "fid_stats/jit_in256_stats.npz",
    "rfid_stats_path": "fid_stats/val_fid_statistics_file_256.npz",
    "inception_weights": "fid_stats/weights-inception-2015-12-05-6726825d.pth",
    "gfid_backend": "online",
    "gfid_num_classes": 1000,
    "gfid_num_images": 50000,
    "rfid_num_images": 50000,
    "batch_size": 64,
    "num_workers": 8,
    "gfid_metric_verbose": false,
    "gfid_keep_images": false,
    "gfid_cfg_scale": null,
    "gfid_cfg_scale_latent": null,
    "gfid_cfg_interval": null,
    "gfid_cfg_interval_latent": null,
    "gfid_steps": null,
    "eval_ema": "1"
  },
  "optim": {
    "name": "adamw",
    "lr": 0.0001,
    "lr_schedule": "constant",
    "weight_decay": 0.0,
    "betas": [
      0.9,
      0.95
    ],
    "min_lr": 1e-06,
    "warmup_epochs": 5
  },
  "checkpoint": {
    "resume": "",
    "auto_resume": true,
    "save_interval": 1,
    "keep_last": 3
  },
  "logging": {
    "enable_wandb": false,
    "entity": "",
    "project": "diffusion-decoder",
    "run_name": "diffusion_decoder_imagenet256"
  }
}