huaweilin commited on
Commit
8edc572
·
verified ·
1 Parent(s): 04d477a

Add files using upload-large-folder tool

Browse files
anole/vqgan.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ede986bf6b171db3081ce171ad88e4ac970793cea14c180b3e5ac5105f4cb43
3
+ size 281270377
anole/vqgan.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: taming.models.vqgan.VQModel
4
+ params:
5
+ embed_dim: 256
6
+ n_embed: 8192
7
+ ddconfig:
8
+ double_z: false
9
+ z_channels: 256
10
+ resolution: 512
11
+ in_channels: 3
12
+ out_ch: 3
13
+ ch: 128
14
+ ch_mult:
15
+ - 1
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions: []
22
+ dropout: 0.0
23
+ lossconfig:
24
+ target: taming.modules.losses.vqperceptual_vit_vqgan.VQLPIPSWithDiscriminator
25
+ params:
26
+ disc_start: 100001
27
+ perceptual_weight: 1.0
28
+ adversarial_weight: 0.5
29
+ disc_params:
30
+ size: 512
31
+ ckpt_path: manifold://fair_onellm_checkpoints/tree/v2/tokenizer/vqgan_wm_0209.ckpt
32
+ data:
33
+ target: main.DataModuleFromConfig
34
+ params:
35
+ batch_size: 4
36
+ num_workers: 10
37
+ image_size: 512
38
+ filter_image_size: 512
39
+ dataset: coco
40
+ aesthetics_th: 0
41
+ clipsim_th: 0
42
+ --distributed-world-size: null
43
+ '32': null
44
+ --distributed-port: null
45
+ '17338': null
46
+ --save-dir: null
47
+ /checkpoint/shellysheynin/shutterstock/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
48
+ log_every-500:
49
+ ngpu32: null
50
+ --tensorboard-logdir: null
51
+ /checkpoint/shellysheynin/tensorboard_logs/2023-03-30/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
52
+ log_every-500:
53
+ ngpu32: null
54
+ '14561': null
55
+ /checkpoint/shellysheynin/tensorboard_logs/2023-04-02/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
56
+ log_every-500:
57
+ ngpu32: null
bsqvit/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b11e147819d7633e1b569ccb367c5df43f0ba683c2bbe640f6dfaad0d235a34
3
+ size 3144485248
bsqvit/config.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ target: transcoder.models.bsqvit.VITBSQModel
3
+ params:
4
+ embed_dim: 36
5
+ embed_group_size: 1
6
+ l2_norm: True
7
+ persample_entropy_compute: 'analytical'
8
+ post_q_l2_norm: True
9
+ logit_laplace: False
10
+ beta: 0.
11
+ vitconfig:
12
+ image_size: 256
13
+ patch_size: 8
14
+ width: 768
15
+ layers: 12
16
+ heads: 12
17
+ mlp_ratio: 4
18
+ drop_rate: 0.
19
+ # grad_checkpointing: True
20
+ loss:
21
+ target: transcoder.losses.vqperceptual.VQLPIPSWithDiscriminator
22
+ params:
23
+ disc_type: 'stylegan'
24
+ disc_input_size: 256
25
+ disc_loss: 'vanilla'
26
+ disc_reg_freq: 16
27
+ disc_conditional: False
28
+ disc_in_channels: 3
29
+ disc_start: 0
30
+ disc_weight: 0.1
31
+ codebook_weight: 0.1
32
+ codebook_rampup_multiplier: 3.0
33
+ codebook_rampup_steps: 2_000
34
+ perceptual_weight: 0.1
35
+ use_adaptive_disc_weight: False
36
+
37
+ data:
38
+ image_size: 256
39
+ batch_size: 32
40
+ num_workers: 8
41
+ train:
42
+ target: torchvision.datasets.ImageFolder
43
+ params:
44
+ root: '/storage/Datasets/ILSVRC2012/train'
45
+ val:
46
+ target: torchvision.datasets.ImageFolder
47
+ params:
48
+ root: '/storage/Datasets/ILSVRC2012/val'
49
+ # root: '/storage/Datasets/ILSVRC2012/coco/' # for coco2017val
50
+ zero_mean: True
51
+
52
+ optimizer:
53
+ disable_amp: False
54
+ use_bf16: True
55
+ base_lr: 4e-7
56
+ max_iter: 1_000_000
57
+ lr_scheduler_config:
58
+ target: transcoder.optim.schedulers.LambdaWarmUpCosineScheduler
59
+ params:
60
+ warm_up_steps: 5_000
61
+ max_decay_steps: 1_000_000
62
+ lr_start: 0.1
63
+ lr_max: 1.0
64
+ lr_min: 0.5
65
+ target: torch.optim.AdamW
66
+ params:
67
+ weight_decay: 1e-4
68
+ betas: [0.9, 0.99]
69
+ eps: 1e-8
70
+ evaluation:
71
+ interpolation: 'lanczos'
72
+ fid:
73
+ dims: 2048
74
+ num_samples: 50_000
75
+ # num_samples: 5_000 # for coco2017val
76
+ groundtruth_npz: 'imagenet_val_256x256_lanczos.npz'
77
+ # groundtruth_npz: 'coco_val_256x256_lanczos.npz' # for coco2017val
78
+ wandb:
79
+ project: transcoder
80
+ run: imagenet_128x128_bsqvit_b18g18_stylegan_f8_fp16
chameleon/vqgan.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ede986bf6b171db3081ce171ad88e4ac970793cea14c180b3e5ac5105f4cb43
3
+ size 281270377
chameleon/vqgan.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: taming.models.vqgan.VQModel
4
+ params:
5
+ embed_dim: 256
6
+ n_embed: 8192
7
+ ddconfig:
8
+ double_z: false
9
+ z_channels: 256
10
+ resolution: 512
11
+ in_channels: 3
12
+ out_ch: 3
13
+ ch: 128
14
+ ch_mult:
15
+ - 1
16
+ - 1
17
+ - 2
18
+ - 2
19
+ - 4
20
+ num_res_blocks: 2
21
+ attn_resolutions: []
22
+ dropout: 0.0
23
+ lossconfig:
24
+ target: taming.modules.losses.vqperceptual_vit_vqgan.VQLPIPSWithDiscriminator
25
+ params:
26
+ disc_start: 100001
27
+ perceptual_weight: 1.0
28
+ adversarial_weight: 0.5
29
+ disc_params:
30
+ size: 512
31
+ ckpt_path: manifold://fair_onellm_checkpoints/tree/v2/tokenizer/vqgan_wm_0209.ckpt
32
+ data:
33
+ target: main.DataModuleFromConfig
34
+ params:
35
+ batch_size: 4
36
+ num_workers: 10
37
+ image_size: 512
38
+ filter_image_size: 512
39
+ dataset: coco
40
+ aesthetics_th: 0
41
+ clipsim_th: 0
42
+ --distributed-world-size: null
43
+ '32': null
44
+ --distributed-port: null
45
+ '17338': null
46
+ --save-dir: null
47
+ /checkpoint/shellysheynin/shutterstock/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
48
+ log_every-500:
49
+ ngpu32: null
50
+ --tensorboard-logdir: null
51
+ /checkpoint/shellysheynin/tensorboard_logs/2023-03-30/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
52
+ log_every-500:
53
+ ngpu32: null
54
+ '14561': null
55
+ /checkpoint/shellysheynin/tensorboard_logs/2023-04-02/512x512_1024tokens_4node_shutterstock_laion_no_attn_styleGAN:
56
+ log_every-500:
57
+ ngpu32: null
flowmo_hi/base.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_seed: 0
2
+
3
+ data:
4
+ batch_size: 16
5
+ eval_batch_size: 5
6
+ image_size: 256
7
+ num_workers: 8
8
+
9
+ imagenet_train_index: "imagenet_train_index_overall.json"
10
+ imagenet_train_tar: "ILSVRC2012_img_train.tar"
11
+ imagenet_val_index: "imagenet_val_index_overall.json"
12
+ imagenet_val_tar: "ILSVRC2012_img_val.tar"
13
+
14
+ model:
15
+ enable_mup: True
16
+ patch_size: 4
17
+ mup_width: 6
18
+ enc_mup_width: 4
19
+
20
+ fix_initial_norms: True
21
+ fix_norm_mode: l2
22
+
23
+ quantization_type: lfq
24
+ code_length: 256
25
+ context_dim: 18
26
+ codebook_size_for_entropy: 9
27
+ entropy_loss_weight: 0.0025
28
+ commit_loss_weight: 0.000625
29
+
30
+ enc_depth: 8
31
+ dec_depth: 16
32
+ enable_cfg: True
33
+
34
+ ema_decay: 0.9999
35
+
36
+ posttrain_sample: False
37
+ posttrain_sample_enable_cfg: False
38
+ posttrain_sample_k: 8
39
+
40
+ opt:
41
+ # Floats need to be specified with decimals to get loaded as such
42
+ lr: 1.0e-4
43
+ n_grad_acc: 2
44
+ schedule: fat_lognormal
45
+ freeze_encoder: False
46
+ freeze_encoder_after: 200000
47
+ weight_decay: 0.0
48
+ log_norms: True
49
+ lpips_weight: 0.1
50
+ lpips_mode: "vgg"
51
+
52
+ beta1: .9
53
+ beta2: .95
54
+
55
+ trainer:
56
+ enable_bfloat16: True
57
+ log_every: 100
58
+ checkpoint_every: 5000
59
+ max_steps: 10000000000000
60
+ keep_every: 200000
61
+ gs_checkpoint_bucket: ""
62
+
63
+ eval:
64
+ reconstruction: True
65
+ state_dict_key: model_ema_state_dict
66
+ eval_dir: ""
67
+ eval_baseline: ""
68
+ continuous: True
69
+ force_ckpt_path: null
70
+ subsample_rate: 1
71
+
72
+ sampling:
73
+ sample_steps: 25
74
+ schedule: "pow_0.25"
75
+ cfg: 1.5
76
+ mode: 'rf'
77
+ cfg_interval: "(.17,1.02)"
flowmo_hi/flowmo_hi.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa8278ec492028aa54915b061c27300550731bc1d4ddb543dea2ec54d432a137
3
+ size 7564557830
flowmo_lo/base.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global_seed: 0
2
+
3
+ data:
4
+ batch_size: 16
5
+ eval_batch_size: 5
6
+ image_size: 256
7
+ num_workers: 8
8
+
9
+ imagenet_train_index: "imagenet_train_index_overall.json"
10
+ imagenet_train_tar: "ILSVRC2012_img_train.tar"
11
+ imagenet_val_index: "imagenet_val_index_overall.json"
12
+ imagenet_val_tar: "ILSVRC2012_img_val.tar"
13
+
14
+ model:
15
+ enable_mup: True
16
+ patch_size: 4
17
+ mup_width: 6
18
+ enc_mup_width: 4
19
+
20
+ fix_initial_norms: True
21
+ fix_norm_mode: l2
22
+
23
+ quantization_type: lfq
24
+ code_length: 256
25
+ context_dim: 18
26
+ codebook_size_for_entropy: 9
27
+ entropy_loss_weight: 0.0025
28
+ commit_loss_weight: 0.000625
29
+
30
+ enc_depth: 8
31
+ dec_depth: 16
32
+ enable_cfg: True
33
+
34
+ ema_decay: 0.9999
35
+
36
+ posttrain_sample: False
37
+ posttrain_sample_enable_cfg: False
38
+ posttrain_sample_k: 8
39
+
40
+ opt:
41
+ # Floats need to be specified with decimals to get loaded as such
42
+ lr: 1.0e-4
43
+ n_grad_acc: 2
44
+ schedule: fat_lognormal
45
+ freeze_encoder: False
46
+ freeze_encoder_after: 200000
47
+ weight_decay: 0.0
48
+ log_norms: True
49
+ lpips_weight: 0.1
50
+ lpips_mode: "vgg"
51
+
52
+ beta1: .9
53
+ beta2: .95
54
+
55
+ trainer:
56
+ enable_bfloat16: True
57
+ log_every: 100
58
+ checkpoint_every: 5000
59
+ max_steps: 10000000000000
60
+ keep_every: 200000
61
+ gs_checkpoint_bucket: ""
62
+
63
+ eval:
64
+ reconstruction: True
65
+ state_dict_key: model_ema_state_dict
66
+ eval_dir: ""
67
+ eval_baseline: ""
68
+ continuous: True
69
+ force_ckpt_path: null
70
+ subsample_rate: 1
71
+
72
+ sampling:
73
+ sample_steps: 25
74
+ schedule: "pow_0.25"
75
+ cfg: 1.5
76
+ mode: 'rf'
77
+ cfg_interval: "(.17,1.02)"
flowmo_lo/flowmo_lo.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12ee996b780057ef986260e6ed5a87d9036c08c29bcee3f479e5d0c8208636fc
3
+ size 7563389958
infinity-d32/infinity_vae_d32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a669bca347d46dc1b10ca601ab67d4f231ad9bad07339e80b22f807ee36ad3e4
3
+ size 1557325341
infinity-d64/infinity_vae_d64.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab1d29c91d52aece43bae1e9c3f11ae993427a315db0be7c26780c5d1e2dbe28
3
+ size 1560864925
llamagen-ds16-t2i/vq_ds16_t2i.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e21fc1318e2e9ee641a07bdad0e20675e9ec35e6e3eb911d58b5d7a2cd8d4cb
3
+ size 287920306
llamagen-ds16/vq_ds16_c2i.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109aa8afb2cf3761eec23cdc8644154cb498f5ab7eef2a35264d25e5e0499f7d
3
+ size 287920306
llamagen-ds8/vq_ds8_c2i.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70d2966ec012f378793214ff0755087e7c5fe6f035b469995143325dca4a4e3
3
+ size 280809450
maskbit-16bit/maskbit_tokenizer_16bit.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e3ff7c3bf15eda4e1e25586ff8f657be9291d5e2bf3b755505394e11765b06
3
+ size 222258958
maskbit-16bit/maskbit_tokenizer_16bit.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment:
2
+ project: "MaskBit"
3
+ name: "maskbit_tokenizer_16bit"
4
+ max_train_examples: 1_281_167 # total number of imagenet examples
5
+ save_every: 20_000
6
+ eval_every: 20_000
7
+ generate_every: 2000
8
+ log_every: 50
9
+ log_grad_norm_every: 100_000
10
+ logger: "tensorboard"
11
+ resume: True
12
+ init_checkpoint: ""
13
+ # vqgan_checkpoint: "MODEL_PATH/maskbit_tokenizer_16bit.bin" # Only for evaluating a trained model
14
+
15
+ model:
16
+ vq_model:
17
+ model_class: "vqgan+"
18
+ quantizer_type: "lookup-free"
19
+ codebook_size: 4096
20
+ token_size: 16
21
+ commitment_cost: 0.25
22
+ entropy_loss_weight: 0.02
23
+ entropy_loss_temperature: 0.01
24
+ entropy_gamma: 1.0
25
+ num_channels: 3 # rgb
26
+ hidden_channels: 128
27
+ channel_mult: [1,1,2,2,4]
28
+ num_resolutions: 5
29
+ num_res_blocks: 2
30
+ sample_with_conv: True
31
+
32
+ discriminator:
33
+ name: "VQGAN+Discriminator"
34
+ num_channels: 3
35
+ num_stages: 4
36
+ hidden_channels: 128
37
+ blur_resample: True
38
+ blur_kernel_size: 4
39
+
40
+
41
+ losses:
42
+ quantizer_weight: 1.0
43
+ perceptual_loss: "resnet50"
44
+ perceptual_weight: 0.1
45
+ perceptual_loss_on_logits: True
46
+ reconstruction_loss: "l2"
47
+ reconstruction_weight: 4.0
48
+ discriminator_start: 20_000
49
+ discriminator_loss: "hinge"
50
+ discriminator_factor: 1.0
51
+ discriminator_weight: 0.02
52
+ discriminator_gradient_penalty: "none"
53
+ discriminator_penalty_cost: 10.0
54
+ lecam_regularization_weight: 0.001
55
+ entropy_annealing_steps: 2000
56
+ entropy_annealing_factor: 2.0
57
+
58
+ dataset:
59
+ params:
60
+ train_shards_path_or_url: "DATA_PATH/imagenet_shards/train/imagenet-train-{0000..0252}.tar"
61
+ eval_shards_path_or_url: "DATA_PATH/imagenet_shards/val/imagenet-val-{0000..0009}.tar"
62
+ shuffle_buffer_size: 1000
63
+ num_workers_per_gpu: 8
64
+ pin_memory: True
65
+ persistent_workers: True
66
+ preprocessing:
67
+ resolution: 256
68
+ use_aspect_ratio_aug: True
69
+ use_random_crop: True
70
+ min_scale: 0.8
71
+ interpolation: "bilinear"
72
+
73
+
74
+ optimizer:
75
+ name: adamw
76
+ params: # default adamw params
77
+ learning_rate: 1e-4
78
+ discriminator_learning_rate: 1e-4
79
+ scale_lr: False # scale learning rate by total batch size
80
+ beta1: 0.9
81
+ beta2: 0.999
82
+ weight_decay: 1e-4
83
+ epsilon: 1e-8
84
+
85
+
86
+ lr_scheduler:
87
+ scheduler: "cosine_with_minimum"
88
+ params:
89
+ learning_rate: ${optimizer.params.learning_rate}
90
+ warmup_steps: 5_000
91
+
92
+
93
+ training:
94
+ gradient_accumulation_steps: 1
95
+ per_gpu_batch_size: 16
96
+ mixed_precision: "no" # "bf16"
97
+ enable_tf32: True
98
+ use_ema: True
99
+ seed: 42
100
+ max_train_steps: 1_350_000
101
+ overfit_batch: False
102
+ overfit_batch_num: 1
103
+ num_generated_images: 2 # Must be smaller than or equal to per_gpu_batch_size
104
+ max_grad_norm: 1.0
maskbit-18bit/maskbit_tokenizer_18bit.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c672c2508e14c4d7cd8621c1cb8939610478632725bb26521cf3c4bb6fc3f78
3
+ size 236979982
maskbit-18bit/maskbit_tokenizer_18bit.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment:
2
+ project: "MaskBit"
3
+ name: "maskbit_tokenizer_18bit"
4
+ max_train_examples: 1_281_167 # total number of imagenet examples
5
+ save_every: 20_000
6
+ eval_every: 20_000
7
+ generate_every: 2000
8
+ log_every: 50
9
+ log_grad_norm_every: 100_000
10
+ logger: "tensorboard"
11
+ resume: True
12
+ init_checkpoint: ""
13
+ # vqgan_checkpoint: "MODEL_PATH/maskbit_tokenizer_18bit.bin" # Only for evaluating a trained model
14
+
15
+ model:
16
+ vq_model:
17
+ model_class: "vqgan+"
18
+ quantizer_type: "lookup-free"
19
+ codebook_size: 4096
20
+ token_size: 18
21
+ commitment_cost: 0.25
22
+ entropy_loss_weight: 0.02
23
+ entropy_loss_temperature: 0.01
24
+ entropy_gamma: 1.0
25
+ num_channels: 3 # rgb
26
+ hidden_channels: 128
27
+ channel_mult: [1,1,2,2,4]
28
+ num_resolutions: 5
29
+ num_res_blocks: 2
30
+ sample_with_conv: True
31
+
32
+ discriminator:
33
+ name: "VQGAN+Discriminator"
34
+ num_channels: 3
35
+ num_stages: 4
36
+ hidden_channels: 128
37
+ blur_resample: True
38
+ blur_kernel_size: 4
39
+
40
+
41
+ losses:
42
+ quantizer_weight: 1.0
43
+ perceptual_loss: "resnet50"
44
+ perceptual_weight: 0.1
45
+ perceptual_loss_on_logits: True
46
+ reconstruction_loss: "l2"
47
+ reconstruction_weight: 4.0
48
+ discriminator_start: 20_000
49
+ discriminator_loss: "hinge"
50
+ discriminator_factor: 1.0
51
+ discriminator_weight: 0.02
52
+ discriminator_gradient_penalty: "none"
53
+ discriminator_penalty_cost: 10.0
54
+ lecam_regularization_weight: 0.001
55
+ entropy_annealing_steps: 2000
56
+ entropy_annealing_factor: 2.0
57
+
58
+ dataset:
59
+ params:
60
+ train_shards_path_or_url: "DATA_PATH/imagenet_shards/train/imagenet-train-{0000..0252}.tar"
61
+ eval_shards_path_or_url: "DATA_PATH/imagenet_shards/val/imagenet-val-{0000..0009}.tar"
62
+ shuffle_buffer_size: 1000
63
+ num_workers_per_gpu: 8
64
+ pin_memory: True
65
+ persistent_workers: True
66
+ preprocessing:
67
+ resolution: 256
68
+ use_aspect_ratio_aug: True
69
+ use_random_crop: True
70
+ min_scale: 0.8
71
+ interpolation: "bilinear"
72
+
73
+
74
+ optimizer:
75
+ name: adamw
76
+ params: # default adamw params
77
+ learning_rate: 1e-4
78
+ discriminator_learning_rate: 1e-4
79
+ scale_lr: False # scale learning rate by total batch size
80
+ beta1: 0.9
81
+ beta2: 0.999
82
+ weight_decay: 1e-4
83
+ epsilon: 1e-8
84
+
85
+
86
+ lr_scheduler:
87
+ scheduler: "cosine_with_minimum"
88
+ params:
89
+ learning_rate: ${optimizer.params.learning_rate}
90
+ warmup_steps: 5_000
91
+
92
+
93
+ training:
94
+ gradient_accumulation_steps: 1
95
+ per_gpu_batch_size: 16
96
+ mixed_precision: "no" # "bf16"
97
+ enable_tf32: True
98
+ use_ema: True
99
+ seed: 42
100
+ max_train_steps: 1_350_000
101
+ overfit_batch: False
102
+ overfit_batch_num: 1
103
+ num_generated_images: 2 # Must be smaller than or equal to per_gpu_batch_size
104
+ max_grad_norm: 1.0
open_magvit2/imagenet_256_L.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e27c4a5ab21db0f38cba484e717584e8a8ea7d9fda9a729d058bc8d68d0922
3
+ size 921240234
open_magvit2/imagenet_lfqgan_256_L.yaml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seed_everything: true
2
+ trainer:
3
+ accelerator: gpu
4
+ strategy: ddp_find_unused_parameters_true
5
+ devices: 8
6
+ num_nodes: 4
7
+ precision: 16-mixed
8
+ max_epochs: 270
9
+ check_val_every_n_epoch: 1
10
+ num_sanity_val_steps: -1
11
+ log_every_n_steps: 100
12
+ callbacks:
13
+ - class_path: lightning.pytorch.callbacks.ModelCheckpoint
14
+ init_args:
15
+ dirpath: "../../checkpoints/vqgan/test"
16
+ save_top_k: -1 # save all checkpoints
17
+ - class_path: lightning.pytorch.callbacks.LearningRateMonitor
18
+ init_args:
19
+ logging_interval: step
20
+ logger:
21
+ class_path: lightning.pytorch.loggers.TensorBoardLogger
22
+ init_args:
23
+ save_dir: "../../results/vqgan/"
24
+ version: "test"
25
+ name:
26
+
27
+ model:
28
+ class_path: src.Open_MAGVIT2.models.lfqgan.VQModel
29
+ init_args:
30
+ ddconfig:
31
+ double_z: False
32
+ z_channels: 18
33
+ resolution: 128
34
+ in_channels: 3
35
+ out_ch: 3
36
+ ch: 128
37
+ ch_mult: [1,1,2,2,4] # num_down = len(ch_mult)-1
38
+ num_res_blocks: 4
39
+
40
+ lossconfig:
41
+ target: src.Open_MAGVIT2.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
42
+ params:
43
+ disc_conditional: False
44
+ disc_in_channels: 3
45
+ disc_start: 0 # from 0 epoch
46
+ disc_weight: 0.8
47
+ gen_loss_weight: 0.1
48
+ lecam_loss_weight: 0.05
49
+ codebook_weight: 0.1
50
+ commit_weight: 0.25
51
+ codebook_enlarge_ratio: 0
52
+ codebook_enlarge_steps: 2000
53
+
54
+ n_embed: 262144
55
+ embed_dim: 18
56
+ learning_rate: 1e-4
57
+ sample_minimization_weight: 1.0
58
+ batch_maximization_weight: 1.0
59
+ scheduler_type: "None"
60
+ use_ema: True
61
+ resume_lr:
62
+ lr_drop_epoch: [200, 250]
63
+
64
+ data:
65
+ class_path: main.DataModuleFromConfig
66
+ init_args:
67
+ batch_size: 8
68
+ num_workers: 16
69
+ train:
70
+ target: src.Open_MAGVIT2.data.imagenet.ImageNetTrain
71
+ params:
72
+ config:
73
+ size: 256
74
+ subset:
75
+ validation:
76
+ target: src.Open_MAGVIT2.data.imagenet.ImageNetValidation
77
+ params:
78
+ config:
79
+ size: 256
80
+ subset:
81
+ test:
82
+ target: src.Open_MAGVIT2.data.imagenet.ImageNetValidation
83
+ params:
84
+ config:
85
+ size: 256
86
+ subset:
87
+
88
+ ckpt_path: null # to resume
var/vae_ch160v4096z32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c3ec27ae28a3f87055e83211ea8cc8558bd1985d7b51742d074fb4c2fcf186c
3
+ size 436075834