Spaces:
Sleeping
Sleeping
Commit
·
3b4f4ee
1
Parent(s):
0216954
update yamls
Browse files
- dac-vae/base.yml +5 -6
- dac-vae/config.yml +5 -6
- flowae/configs/datasets/dae.yaml +6 -6
- flowae/configs/datasets/imagenet_ae.yaml +25 -6
- flowae/configs/datasets/imagenet_zdm.yaml +23 -6
- flowae/configs/experiments/dito-B-audio.yaml +10 -11
- flowae/configs/experiments/dito-B-f8c4-noise-sync.yaml +9 -9
- flowae/configs/experiments/dito-B-f8c4.yaml +8 -8
- flowae/configs/experiments/dito-L-f8c4.yaml +8 -8
- flowae/configs/experiments/dito-XL-f8c4-noise-sync.yaml +9 -9
- flowae/configs/experiments/dito-XL-f8c4.yaml +8 -8
- flowae/configs/experiments/eval50k_zdm-XL_dito-XL-f8c4-noise-sync.yaml +9 -9
- flowae/configs/experiments/eval50k_zdm-XL_dito-XL-f8c4.yaml +9 -9
- flowae/configs/experiments/zdm-XL_dito-XL-f8c4-noise-sync.yaml +9 -9
- flowae/configs/experiments/zdm-XL_dito-XL-f8c4.yaml +10 -10
- flowae/configs/models/zdm-XL_imagenet.yaml +4 -4
- flowae/configs/trainers/dito.yaml +3 -3
- flowae/configs/trainers/glpto.yaml +4 -4
- flowae/configs/trainers/zdm.yaml +2 -2
- flowae/load/wandb.yaml +3 -3
dac-vae/base.yml
CHANGED
|
@@ -19,12 +19,11 @@ discriminator:
|
|
| 19 |
periods: [2, 3, 5, 7, 11]
|
| 20 |
fft_sizes: [2048, 1024, 512]
|
| 21 |
bands:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
max_norm: 1000
|
| 30 |
max_norm_d: 10
|
|
|
|
| 19 |
periods: [2, 3, 5, 7, 11]
|
| 20 |
fft_sizes: [2048, 1024, 512]
|
| 21 |
bands:
|
| 22 |
+
- [0.0, 0.1]
|
| 23 |
+
- [0.1, 0.25]
|
| 24 |
+
- [0.25, 0.5]
|
| 25 |
+
- [0.5, 0.75]
|
| 26 |
+
- [0.75, 1.0]
|
|
|
|
| 27 |
|
| 28 |
max_norm: 1000
|
| 29 |
max_norm_d: 10
|
dac-vae/config.yml
CHANGED
|
@@ -19,12 +19,11 @@ discriminator:
|
|
| 19 |
periods: [2, 3, 5, 7, 11]
|
| 20 |
fft_sizes: [2048, 1024, 512]
|
| 21 |
bands:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
max_norm: 1000
|
| 30 |
max_norm_d: 10
|
|
|
|
| 19 |
periods: [2, 3, 5, 7, 11]
|
| 20 |
fft_sizes: [2048, 1024, 512]
|
| 21 |
bands:
|
| 22 |
+
- [0.0, 0.1]
|
| 23 |
+
- [0.1, 0.25]
|
| 24 |
+
- [0.25, 0.5]
|
| 25 |
+
- [0.5, 0.75]
|
| 26 |
+
- [0.75, 1.0]
|
|
|
|
| 27 |
|
| 28 |
max_norm: 1000
|
| 29 |
max_norm_d: 10
|
flowae/configs/datasets/dae.yaml
CHANGED
|
@@ -6,7 +6,7 @@ datasets:
|
|
| 6 |
dataset:
|
| 7 |
name: class_folder_audio
|
| 8 |
args:
|
| 9 |
-
root_path:
|
| 10 |
sample_rate: 24000
|
| 11 |
duration: 0.38
|
| 12 |
shuffle: true
|
|
@@ -19,14 +19,14 @@ datasets:
|
|
| 19 |
batch_size: 52
|
| 20 |
num_workers: 8
|
| 21 |
drop_last: true
|
| 22 |
-
|
| 23 |
val:
|
| 24 |
name: wrapper_audio_cae
|
| 25 |
args:
|
| 26 |
dataset:
|
| 27 |
name: class_folder_audio
|
| 28 |
args:
|
| 29 |
-
root_path:
|
| 30 |
sample_rate: 24000
|
| 31 |
duration: 5.0
|
| 32 |
shuffle: false
|
|
@@ -39,14 +39,14 @@ datasets:
|
|
| 39 |
batch_size: 4
|
| 40 |
num_workers: 8
|
| 41 |
drop_last: false
|
| 42 |
-
|
| 43 |
eval_ae:
|
| 44 |
name: wrapper_audio_cae
|
| 45 |
args:
|
| 46 |
dataset:
|
| 47 |
name: class_folder_audio
|
| 48 |
args:
|
| 49 |
-
root_path:
|
| 50 |
sample_rate: 24000
|
| 51 |
duration: 5.0
|
| 52 |
shuffle: false
|
|
@@ -67,4 +67,4 @@ eval_ae_max_samples: 100
|
|
| 67 |
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
|
| 68 |
|
| 69 |
# Enable autoencoder evaluation
|
| 70 |
-
evaluate_ae: true
|
|
|
|
| 6 |
dataset:
|
| 7 |
name: class_folder_audio
|
| 8 |
args:
|
| 9 |
+
root_path: '/home/masuser/minimax-audio/dataset/Emilia/EN'
|
| 10 |
sample_rate: 24000
|
| 11 |
duration: 0.38
|
| 12 |
shuffle: true
|
|
|
|
| 19 |
batch_size: 52
|
| 20 |
num_workers: 8
|
| 21 |
drop_last: true
|
| 22 |
+
|
| 23 |
val:
|
| 24 |
name: wrapper_audio_cae
|
| 25 |
args:
|
| 26 |
dataset:
|
| 27 |
name: class_folder_audio
|
| 28 |
args:
|
| 29 |
+
root_path: '/home/masuser/minimax-audio/dataset/libritts'
|
| 30 |
sample_rate: 24000
|
| 31 |
duration: 5.0
|
| 32 |
shuffle: false
|
|
|
|
| 39 |
batch_size: 4
|
| 40 |
num_workers: 8
|
| 41 |
drop_last: false
|
| 42 |
+
|
| 43 |
eval_ae:
|
| 44 |
name: wrapper_audio_cae
|
| 45 |
args:
|
| 46 |
dataset:
|
| 47 |
name: class_folder_audio
|
| 48 |
args:
|
| 49 |
+
root_path: '/home/masuser/minimax-audio/dataset/libritts'
|
| 50 |
sample_rate: 24000
|
| 51 |
duration: 5.0
|
| 52 |
shuffle: false
|
|
|
|
| 67 |
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
|
| 68 |
|
| 69 |
# Enable autoencoder evaluation
|
| 70 |
+
evaluate_ae: true
|
flowae/configs/datasets/imagenet_ae.yaml
CHANGED
|
@@ -4,7 +4,14 @@ datasets:
|
|
| 4 |
args:
|
| 5 |
dataset:
|
| 6 |
name: class_folder
|
| 7 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
resize_inp: 256
|
| 9 |
gt_glores_lb: 256
|
| 10 |
gt_glores_ub: 256
|
|
@@ -12,13 +19,19 @@ datasets:
|
|
| 12 |
loader:
|
| 13 |
batch_size: 14
|
| 14 |
num_workers: 24
|
| 15 |
-
|
| 16 |
val:
|
| 17 |
name: wrapper_cae
|
| 18 |
args:
|
| 19 |
dataset:
|
| 20 |
name: class_folder
|
| 21 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
resize_inp: 256
|
| 23 |
gt_glores_lb: 256
|
| 24 |
gt_glores_ub: 256
|
|
@@ -26,13 +39,19 @@ datasets:
|
|
| 26 |
loader:
|
| 27 |
batch_size: 14
|
| 28 |
num_workers: 24
|
| 29 |
-
|
| 30 |
eval_ae:
|
| 31 |
name: wrapper_cae
|
| 32 |
args:
|
| 33 |
dataset:
|
| 34 |
name: class_folder
|
| 35 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
resize_inp: 256
|
| 37 |
gt_glores_lb: 256
|
| 38 |
gt_glores_ub: 256
|
|
@@ -44,4 +63,4 @@ datasets:
|
|
| 44 |
|
| 45 |
visualize_ae_dir: /mnt/nvme/dito
|
| 46 |
visualize_ae_random_n_samples: 32
|
| 47 |
-
eval_ae_max_samples: 5000
|
|
|
|
| 4 |
args:
|
| 5 |
dataset:
|
| 6 |
name: class_folder
|
| 7 |
+
args:
|
| 8 |
+
{
|
| 9 |
+
root_path: /home/masuser/minimax-audio/mnist_png/training,
|
| 10 |
+
resize: 256,
|
| 11 |
+
rand_crop: 256,
|
| 12 |
+
rand_flip: true,
|
| 13 |
+
image_only: true,
|
| 14 |
+
}
|
| 15 |
resize_inp: 256
|
| 16 |
gt_glores_lb: 256
|
| 17 |
gt_glores_ub: 256
|
|
|
|
| 19 |
loader:
|
| 20 |
batch_size: 14
|
| 21 |
num_workers: 24
|
| 22 |
+
|
| 23 |
val:
|
| 24 |
name: wrapper_cae
|
| 25 |
args:
|
| 26 |
dataset:
|
| 27 |
name: class_folder
|
| 28 |
+
args:
|
| 29 |
+
{
|
| 30 |
+
root_path: /home/masuser/minimax-audio/mnist_png/testing,
|
| 31 |
+
resize: 256,
|
| 32 |
+
square_crop: true,
|
| 33 |
+
image_only: true,
|
| 34 |
+
}
|
| 35 |
resize_inp: 256
|
| 36 |
gt_glores_lb: 256
|
| 37 |
gt_glores_ub: 256
|
|
|
|
| 39 |
loader:
|
| 40 |
batch_size: 14
|
| 41 |
num_workers: 24
|
| 42 |
+
|
| 43 |
eval_ae:
|
| 44 |
name: wrapper_cae
|
| 45 |
args:
|
| 46 |
dataset:
|
| 47 |
name: class_folder
|
| 48 |
+
args:
|
| 49 |
+
{
|
| 50 |
+
root_path: /home/masuser/minimax-audio/mnist_png/testing,
|
| 51 |
+
resize: 256,
|
| 52 |
+
square_crop: true,
|
| 53 |
+
image_only: true,
|
| 54 |
+
}
|
| 55 |
resize_inp: 256
|
| 56 |
gt_glores_lb: 256
|
| 57 |
gt_glores_ub: 256
|
|
|
|
| 63 |
|
| 64 |
visualize_ae_dir: /mnt/nvme/dito
|
| 65 |
visualize_ae_random_n_samples: 32
|
| 66 |
+
eval_ae_max_samples: 5000
|
flowae/configs/datasets/imagenet_zdm.yaml
CHANGED
|
@@ -4,7 +4,14 @@ datasets:
|
|
| 4 |
args:
|
| 5 |
dataset:
|
| 6 |
name: class_folder
|
| 7 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
resize_inp: 256
|
| 9 |
gt_glores_lb: 256
|
| 10 |
gt_glores_ub: 256
|
|
@@ -12,13 +19,18 @@ datasets:
|
|
| 12 |
loader:
|
| 13 |
batch_size: 64
|
| 14 |
num_workers: 24
|
| 15 |
-
|
| 16 |
val:
|
| 17 |
name: wrapper_cae
|
| 18 |
args:
|
| 19 |
dataset:
|
| 20 |
name: class_folder
|
| 21 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
resize_inp: 256
|
| 23 |
gt_glores_lb: 256
|
| 24 |
gt_glores_ub: 256
|
|
@@ -26,13 +38,18 @@ datasets:
|
|
| 26 |
loader:
|
| 27 |
batch_size: 64
|
| 28 |
num_workers: 24
|
| 29 |
-
|
| 30 |
eval_zdm:
|
| 31 |
name: wrapper_cae
|
| 32 |
args:
|
| 33 |
dataset:
|
| 34 |
name: class_folder
|
| 35 |
-
args:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
resize_inp: 256
|
| 37 |
gt_glores_lb: 256
|
| 38 |
gt_glores_ub: 256
|
|
@@ -50,4 +67,4 @@ visualize_zdm_random_n_samples: 12
|
|
| 50 |
visualize_zdm_batch_size: 6
|
| 51 |
visualize_zdm_guidance_list: [4]
|
| 52 |
visualize_zdm_denoising_file: null
|
| 53 |
-
eval_zdm_max_samples: 5000
|
|
|
|
| 4 |
args:
|
| 5 |
dataset:
|
| 6 |
name: class_folder
|
| 7 |
+
args:
|
| 8 |
+
{
|
| 9 |
+
root_path: /home/masuser/minimax-audio/mnist_png/training,
|
| 10 |
+
resize: 256,
|
| 11 |
+
square_crop: true,
|
| 12 |
+
rand_flip: true,
|
| 13 |
+
drop_label_p: 0.1,
|
| 14 |
+
}
|
| 15 |
resize_inp: 256
|
| 16 |
gt_glores_lb: 256
|
| 17 |
gt_glores_ub: 256
|
|
|
|
| 19 |
loader:
|
| 20 |
batch_size: 64
|
| 21 |
num_workers: 24
|
| 22 |
+
|
| 23 |
val:
|
| 24 |
name: wrapper_cae
|
| 25 |
args:
|
| 26 |
dataset:
|
| 27 |
name: class_folder
|
| 28 |
+
args:
|
| 29 |
+
{
|
| 30 |
+
root_path: /home/masuser/minimax-audio/mnist_png/testing,
|
| 31 |
+
resize: 256,
|
| 32 |
+
square_crop: true,
|
| 33 |
+
}
|
| 34 |
resize_inp: 256
|
| 35 |
gt_glores_lb: 256
|
| 36 |
gt_glores_ub: 256
|
|
|
|
| 38 |
loader:
|
| 39 |
batch_size: 64
|
| 40 |
num_workers: 24
|
| 41 |
+
|
| 42 |
eval_zdm:
|
| 43 |
name: wrapper_cae
|
| 44 |
args:
|
| 45 |
dataset:
|
| 46 |
name: class_folder
|
| 47 |
+
args:
|
| 48 |
+
{
|
| 49 |
+
root_path: /home/masuser/minimax-audio/mnist_png/testing,
|
| 50 |
+
resize: 256,
|
| 51 |
+
square_crop: true,
|
| 52 |
+
}
|
| 53 |
resize_inp: 256
|
| 54 |
gt_glores_lb: 256
|
| 55 |
gt_glores_ub: 256
|
|
|
|
| 67 |
visualize_zdm_batch_size: 6
|
| 68 |
visualize_zdm_guidance_list: [4]
|
| 69 |
visualize_zdm_denoising_file: null
|
| 70 |
+
eval_zdm_max_samples: 5000
|
flowae/configs/experiments/dito-B-audio.yaml
CHANGED
|
@@ -8,21 +8,21 @@ model:
|
|
| 8 |
# Encoder
|
| 9 |
encoder:
|
| 10 |
name: dac_encoder
|
| 11 |
-
args: {config_name: snake}
|
| 12 |
-
|
| 13 |
# Latent configuration - now fully convolutional
|
| 14 |
-
z_channels: 64
|
| 15 |
|
| 16 |
zaug_p: 0.1
|
| 17 |
zaug_decoding_loss_type: suffix
|
| 18 |
zaug_zdm_diffusion:
|
| 19 |
name: fm
|
| 20 |
-
args: {timescale: 1000.0}
|
| 21 |
-
|
| 22 |
# Decoder (identity for DiTo)
|
| 23 |
decoder:
|
| 24 |
name: identity
|
| 25 |
-
|
| 26 |
# Renderer - Fully convolutional for dynamic duration
|
| 27 |
renderer:
|
| 28 |
name: fixres_renderer_wrapper
|
|
@@ -37,12 +37,11 @@ model:
|
|
| 37 |
c2: 512
|
| 38 |
pe_dim: 320
|
| 39 |
t_dim: 1280
|
| 40 |
-
|
| 41 |
# Diffusion configuration
|
| 42 |
render_diffusion:
|
| 43 |
name: fm
|
| 44 |
-
args: {timescale: 1000.0}
|
| 45 |
-
|
| 46 |
-
render_sampler: {name: fm_euler_sampler}
|
| 47 |
-
render_n_steps: 50
|
| 48 |
|
|
|
|
|
|
|
|
|
| 8 |
# Encoder
|
| 9 |
encoder:
|
| 10 |
name: dac_encoder
|
| 11 |
+
args: { config_name: snake }
|
| 12 |
+
|
| 13 |
# Latent configuration - now fully convolutional
|
| 14 |
+
z_channels: 64 # Number of latent channels
|
| 15 |
|
| 16 |
zaug_p: 0.1
|
| 17 |
zaug_decoding_loss_type: suffix
|
| 18 |
zaug_zdm_diffusion:
|
| 19 |
name: fm
|
| 20 |
+
args: { timescale: 1000.0 }
|
| 21 |
+
|
| 22 |
# Decoder (identity for DiTo)
|
| 23 |
decoder:
|
| 24 |
name: identity
|
| 25 |
+
|
| 26 |
# Renderer - Fully convolutional for dynamic duration
|
| 27 |
renderer:
|
| 28 |
name: fixres_renderer_wrapper
|
|
|
|
| 37 |
c2: 512
|
| 38 |
pe_dim: 320
|
| 39 |
t_dim: 1280
|
| 40 |
+
|
| 41 |
# Diffusion configuration
|
| 42 |
render_diffusion:
|
| 43 |
name: fm
|
| 44 |
+
args: { timescale: 1000.0 }
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
render_sampler: { name: fm_euler_sampler }
|
| 47 |
+
render_n_steps: 50
|
flowae/configs/experiments/dito-B-f8c4-noise-sync.yaml
CHANGED
|
@@ -7,8 +7,8 @@ model:
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
-
args: {config_name: f8c4}
|
| 11 |
-
|
| 12 |
z_shape: [64, 1, 1]
|
| 13 |
z_layernorm: true
|
| 14 |
|
|
@@ -16,10 +16,10 @@ model:
|
|
| 16 |
zaug_decoding_loss_type: suffix
|
| 17 |
zaug_zdm_diffusion:
|
| 18 |
name: fm
|
| 19 |
-
args: {timescale: 1000.0}
|
| 20 |
-
|
| 21 |
-
decoder: {name: identity}
|
| 22 |
-
|
| 23 |
renderer:
|
| 24 |
name: fixres_renderer_wrapper
|
| 25 |
args:
|
|
@@ -33,11 +33,11 @@ model:
|
|
| 33 |
c2: 512
|
| 34 |
pe_dim: 320
|
| 35 |
t_dim: 1280
|
| 36 |
-
|
| 37 |
render_diffusion:
|
| 38 |
name: fm
|
| 39 |
-
args: {timescale: 1000.0}
|
| 40 |
-
render_sampler: {name: fm_euler_sampler}
|
| 41 |
render_n_steps: 50
|
| 42 |
|
| 43 |
loss_config: {}
|
|
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
+
args: { config_name: f8c4 }
|
| 11 |
+
|
| 12 |
z_shape: [64, 1, 1]
|
| 13 |
z_layernorm: true
|
| 14 |
|
|
|
|
| 16 |
zaug_decoding_loss_type: suffix
|
| 17 |
zaug_zdm_diffusion:
|
| 18 |
name: fm
|
| 19 |
+
args: { timescale: 1000.0 }
|
| 20 |
+
|
| 21 |
+
decoder: { name: identity }
|
| 22 |
+
|
| 23 |
renderer:
|
| 24 |
name: fixres_renderer_wrapper
|
| 25 |
args:
|
|
|
|
| 33 |
c2: 512
|
| 34 |
pe_dim: 320
|
| 35 |
t_dim: 1280
|
| 36 |
+
|
| 37 |
render_diffusion:
|
| 38 |
name: fm
|
| 39 |
+
args: { timescale: 1000.0 }
|
| 40 |
+
render_sampler: { name: fm_euler_sampler }
|
| 41 |
render_n_steps: 50
|
| 42 |
|
| 43 |
loss_config: {}
|
flowae/configs/experiments/dito-B-f8c4.yaml
CHANGED
|
@@ -7,13 +7,13 @@ model:
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
-
args: {config_name: f8c4}
|
| 11 |
-
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
-
|
| 15 |
-
decoder: {name: identity}
|
| 16 |
-
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
@@ -27,11 +27,11 @@ model:
|
|
| 27 |
c2: 512
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
-
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
-
args: {timescale: 1000.0}
|
| 34 |
-
render_sampler: {name: fm_euler_sampler}
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
|
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
+
args: { config_name: f8c4 }
|
| 11 |
+
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
+
|
| 15 |
+
decoder: { name: identity }
|
| 16 |
+
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
|
|
| 27 |
c2: 512
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
+
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
+
args: { timescale: 1000.0 }
|
| 34 |
+
render_sampler: { name: fm_euler_sampler }
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
flowae/configs/experiments/dito-L-f8c4.yaml
CHANGED
|
@@ -7,13 +7,13 @@ model:
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
-
args: {config_name: f8c4}
|
| 11 |
-
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
-
|
| 15 |
-
decoder: {name: identity}
|
| 16 |
-
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
@@ -27,11 +27,11 @@ model:
|
|
| 27 |
c2: 768
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
-
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
-
args: {timescale: 1000.0}
|
| 34 |
-
render_sampler: {name: fm_euler_sampler}
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
|
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
+
args: { config_name: f8c4 }
|
| 11 |
+
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
+
|
| 15 |
+
decoder: { name: identity }
|
| 16 |
+
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
|
|
| 27 |
c2: 768
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
+
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
+
args: { timescale: 1000.0 }
|
| 34 |
+
render_sampler: { name: fm_euler_sampler }
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
flowae/configs/experiments/dito-XL-f8c4-noise-sync.yaml
CHANGED
|
@@ -7,8 +7,8 @@ model:
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
-
args: {config_name: f8c4}
|
| 11 |
-
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
|
|
@@ -16,10 +16,10 @@ model:
|
|
| 16 |
zaug_decoding_loss_type: suffix
|
| 17 |
zaug_zdm_diffusion:
|
| 18 |
name: fm
|
| 19 |
-
args: {timescale: 1000.0}
|
| 20 |
-
|
| 21 |
-
decoder: {name: identity}
|
| 22 |
-
|
| 23 |
renderer:
|
| 24 |
name: fixres_renderer_wrapper
|
| 25 |
args:
|
|
@@ -33,11 +33,11 @@ model:
|
|
| 33 |
c2: 1024
|
| 34 |
pe_dim: 320
|
| 35 |
t_dim: 1280
|
| 36 |
-
|
| 37 |
render_diffusion:
|
| 38 |
name: fm
|
| 39 |
-
args: {timescale: 1000.0}
|
| 40 |
-
render_sampler: {name: fm_euler_sampler}
|
| 41 |
render_n_steps: 50
|
| 42 |
|
| 43 |
loss_config: {}
|
|
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
+
args: { config_name: f8c4 }
|
| 11 |
+
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
|
|
|
|
| 16 |
zaug_decoding_loss_type: suffix
|
| 17 |
zaug_zdm_diffusion:
|
| 18 |
name: fm
|
| 19 |
+
args: { timescale: 1000.0 }
|
| 20 |
+
|
| 21 |
+
decoder: { name: identity }
|
| 22 |
+
|
| 23 |
renderer:
|
| 24 |
name: fixres_renderer_wrapper
|
| 25 |
args:
|
|
|
|
| 33 |
c2: 1024
|
| 34 |
pe_dim: 320
|
| 35 |
t_dim: 1280
|
| 36 |
+
|
| 37 |
render_diffusion:
|
| 38 |
name: fm
|
| 39 |
+
args: { timescale: 1000.0 }
|
| 40 |
+
render_sampler: { name: fm_euler_sampler }
|
| 41 |
render_n_steps: 50
|
| 42 |
|
| 43 |
loss_config: {}
|
flowae/configs/experiments/dito-XL-f8c4.yaml
CHANGED
|
@@ -7,13 +7,13 @@ model:
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
-
args: {config_name: f8c4}
|
| 11 |
-
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
-
|
| 15 |
-
decoder: {name: identity}
|
| 16 |
-
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
@@ -27,11 +27,11 @@ model:
|
|
| 27 |
c2: 1024
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
-
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
-
args: {timescale: 1000.0}
|
| 34 |
-
render_sampler: {name: fm_euler_sampler}
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
|
|
|
| 7 |
args:
|
| 8 |
encoder:
|
| 9 |
name: vqgan_encoder
|
| 10 |
+
args: { config_name: f8c4 }
|
| 11 |
+
|
| 12 |
z_shape: [4, 32, 32]
|
| 13 |
z_layernorm: true
|
| 14 |
+
|
| 15 |
+
decoder: { name: identity }
|
| 16 |
+
|
| 17 |
renderer:
|
| 18 |
name: fixres_renderer_wrapper
|
| 19 |
args:
|
|
|
|
| 27 |
c2: 1024
|
| 28 |
pe_dim: 320
|
| 29 |
t_dim: 1280
|
| 30 |
+
|
| 31 |
render_diffusion:
|
| 32 |
name: fm
|
| 33 |
+
args: { timescale: 1000.0 }
|
| 34 |
+
render_sampler: { name: fm_euler_sampler }
|
| 35 |
render_n_steps: 50
|
| 36 |
|
| 37 |
loss_config: {}
|
flowae/configs/experiments/eval50k_zdm-XL_dito-XL-f8c4-noise-sync.yaml
CHANGED
|
@@ -11,16 +11,16 @@ model:
|
|
| 11 |
args:
|
| 12 |
zdm_force_guidance: 2.0
|
| 13 |
renderer_ema_rate: 1
|
| 14 |
-
|
| 15 |
encoder:
|
| 16 |
name: vqgan_encoder
|
| 17 |
-
args: {config_name: f8c4}
|
| 18 |
-
|
| 19 |
z_shape: [4, 32, 32]
|
| 20 |
z_layernorm: true
|
| 21 |
-
|
| 22 |
-
decoder: {name: identity}
|
| 23 |
-
|
| 24 |
renderer:
|
| 25 |
name: fixres_renderer_wrapper
|
| 26 |
args:
|
|
@@ -34,11 +34,11 @@ model:
|
|
| 34 |
c2: 1024
|
| 35 |
pe_dim: 320
|
| 36 |
t_dim: 1280
|
| 37 |
-
|
| 38 |
render_diffusion:
|
| 39 |
name: fm
|
| 40 |
-
args: {timescale: 1000.0}
|
| 41 |
-
render_sampler: {name: fm_euler_sampler}
|
| 42 |
render_n_steps: 50
|
| 43 |
|
| 44 |
loss_config: {}
|
|
|
|
| 11 |
args:
|
| 12 |
zdm_force_guidance: 2.0
|
| 13 |
renderer_ema_rate: 1
|
| 14 |
+
|
| 15 |
encoder:
|
| 16 |
name: vqgan_encoder
|
| 17 |
+
args: { config_name: f8c4 }
|
| 18 |
+
|
| 19 |
z_shape: [4, 32, 32]
|
| 20 |
z_layernorm: true
|
| 21 |
+
|
| 22 |
+
decoder: { name: identity }
|
| 23 |
+
|
| 24 |
renderer:
|
| 25 |
name: fixres_renderer_wrapper
|
| 26 |
args:
|
|
|
|
| 34 |
c2: 1024
|
| 35 |
pe_dim: 320
|
| 36 |
t_dim: 1280
|
| 37 |
+
|
| 38 |
render_diffusion:
|
| 39 |
name: fm
|
| 40 |
+
args: { timescale: 1000.0 }
|
| 41 |
+
render_sampler: { name: fm_euler_sampler }
|
| 42 |
render_n_steps: 50
|
| 43 |
|
| 44 |
loss_config: {}
|
flowae/configs/experiments/eval50k_zdm-XL_dito-XL-f8c4.yaml
CHANGED
|
@@ -11,16 +11,16 @@ model:
|
|
| 11 |
args:
|
| 12 |
zdm_force_guidance: 2.0
|
| 13 |
renderer_ema_rate: 1
|
| 14 |
-
|
| 15 |
encoder:
|
| 16 |
name: vqgan_encoder
|
| 17 |
-
args: {config_name: f8c4}
|
| 18 |
-
|
| 19 |
z_shape: [4, 32, 32]
|
| 20 |
z_layernorm: true
|
| 21 |
-
|
| 22 |
-
decoder: {name: identity}
|
| 23 |
-
|
| 24 |
renderer:
|
| 25 |
name: fixres_renderer_wrapper
|
| 26 |
args:
|
|
@@ -34,11 +34,11 @@ model:
|
|
| 34 |
c2: 1024
|
| 35 |
pe_dim: 320
|
| 36 |
t_dim: 1280
|
| 37 |
-
|
| 38 |
render_diffusion:
|
| 39 |
name: fm
|
| 40 |
-
args: {timescale: 1000.0}
|
| 41 |
-
render_sampler: {name: fm_euler_sampler}
|
| 42 |
render_n_steps: 50
|
| 43 |
|
| 44 |
loss_config: {}
|
|
|
|
| 11 |
args:
|
| 12 |
zdm_force_guidance: 2.0
|
| 13 |
renderer_ema_rate: 1
|
| 14 |
+
|
| 15 |
encoder:
|
| 16 |
name: vqgan_encoder
|
| 17 |
+
args: { config_name: f8c4 }
|
| 18 |
+
|
| 19 |
z_shape: [4, 32, 32]
|
| 20 |
z_layernorm: true
|
| 21 |
+
|
| 22 |
+
decoder: { name: identity }
|
| 23 |
+
|
| 24 |
renderer:
|
| 25 |
name: fixres_renderer_wrapper
|
| 26 |
args:
|
|
|
|
| 34 |
c2: 1024
|
| 35 |
pe_dim: 320
|
| 36 |
t_dim: 1280
|
| 37 |
+
|
| 38 |
render_diffusion:
|
| 39 |
name: fm
|
| 40 |
+
args: { timescale: 1000.0 }
|
| 41 |
+
render_sampler: { name: fm_euler_sampler }
|
| 42 |
render_n_steps: 50
|
| 43 |
|
| 44 |
loss_config: {}
|
flowae/configs/experiments/zdm-XL_dito-XL-f8c4-noise-sync.yaml
CHANGED
|
@@ -8,16 +8,16 @@ model:
|
|
| 8 |
name: dito
|
| 9 |
args:
|
| 10 |
renderer_ema_rate: 1
|
| 11 |
-
|
| 12 |
encoder:
|
| 13 |
name: vqgan_encoder
|
| 14 |
-
args: {config_name: f8c4}
|
| 15 |
-
|
| 16 |
z_shape: [4, 32, 32]
|
| 17 |
z_layernorm: true
|
| 18 |
-
|
| 19 |
-
decoder: {name: identity}
|
| 20 |
-
|
| 21 |
renderer:
|
| 22 |
name: fixres_renderer_wrapper
|
| 23 |
args:
|
|
@@ -31,11 +31,11 @@ model:
|
|
| 31 |
c2: 1024
|
| 32 |
pe_dim: 320
|
| 33 |
t_dim: 1280
|
| 34 |
-
|
| 35 |
render_diffusion:
|
| 36 |
name: fm
|
| 37 |
-
args: {timescale: 1000.0}
|
| 38 |
-
render_sampler: {name: fm_euler_sampler}
|
| 39 |
render_n_steps: 50
|
| 40 |
|
| 41 |
loss_config: {}
|
|
|
|
| 8 |
name: dito
|
| 9 |
args:
|
| 10 |
renderer_ema_rate: 1
|
| 11 |
+
|
| 12 |
encoder:
|
| 13 |
name: vqgan_encoder
|
| 14 |
+
args: { config_name: f8c4 }
|
| 15 |
+
|
| 16 |
z_shape: [4, 32, 32]
|
| 17 |
z_layernorm: true
|
| 18 |
+
|
| 19 |
+
decoder: { name: identity }
|
| 20 |
+
|
| 21 |
renderer:
|
| 22 |
name: fixres_renderer_wrapper
|
| 23 |
args:
|
|
|
|
| 31 |
c2: 1024
|
| 32 |
pe_dim: 320
|
| 33 |
t_dim: 1280
|
| 34 |
+
|
| 35 |
render_diffusion:
|
| 36 |
name: fm
|
| 37 |
+
args: { timescale: 1000.0 }
|
| 38 |
+
render_sampler: { name: fm_euler_sampler }
|
| 39 |
render_n_steps: 50
|
| 40 |
|
| 41 |
loss_config: {}
|
flowae/configs/experiments/zdm-XL_dito-XL-f8c4.yaml
CHANGED
|
@@ -4,20 +4,20 @@ __base__:
|
|
| 4 |
- configs/trainers/zdm.yaml
|
| 5 |
|
| 6 |
model:
|
| 7 |
-
load_ckpt:
|
| 8 |
name: dito
|
| 9 |
args:
|
| 10 |
renderer_ema_rate: 1
|
| 11 |
-
|
| 12 |
encoder:
|
| 13 |
name: vqgan_encoder
|
| 14 |
-
args: {config_name: f8c4}
|
| 15 |
-
|
| 16 |
z_shape: [4, 32, 32]
|
| 17 |
z_layernorm: true
|
| 18 |
-
|
| 19 |
-
decoder: {name: identity}
|
| 20 |
-
|
| 21 |
renderer:
|
| 22 |
name: fixres_renderer_wrapper
|
| 23 |
args:
|
|
@@ -31,11 +31,11 @@ model:
|
|
| 31 |
c2: 1024
|
| 32 |
pe_dim: 320
|
| 33 |
t_dim: 1280
|
| 34 |
-
|
| 35 |
render_diffusion:
|
| 36 |
name: fm
|
| 37 |
-
args: {timescale: 1000.0}
|
| 38 |
-
render_sampler: {name: fm_euler_sampler}
|
| 39 |
render_n_steps: 50
|
| 40 |
|
| 41 |
loss_config: {}
|
|
|
|
| 4 |
- configs/trainers/zdm.yaml
|
| 5 |
|
| 6 |
model:
|
| 7 |
+
load_ckpt:
|
| 8 |
name: dito
|
| 9 |
args:
|
| 10 |
renderer_ema_rate: 1
|
| 11 |
+
|
| 12 |
encoder:
|
| 13 |
name: vqgan_encoder
|
| 14 |
+
args: { config_name: f8c4 }
|
| 15 |
+
|
| 16 |
z_shape: [4, 32, 32]
|
| 17 |
z_layernorm: true
|
| 18 |
+
|
| 19 |
+
decoder: { name: identity }
|
| 20 |
+
|
| 21 |
renderer:
|
| 22 |
name: fixres_renderer_wrapper
|
| 23 |
args:
|
|
|
|
| 31 |
c2: 1024
|
| 32 |
pe_dim: 320
|
| 33 |
t_dim: 1280
|
| 34 |
+
|
| 35 |
render_diffusion:
|
| 36 |
name: fm
|
| 37 |
+
args: { timescale: 1000.0 }
|
| 38 |
+
render_sampler: { name: fm_euler_sampler }
|
| 39 |
render_n_steps: 50
|
| 40 |
|
| 41 |
loss_config: {}
|
flowae/configs/models/zdm-XL_imagenet.yaml
CHANGED
|
@@ -2,11 +2,11 @@ model:
|
|
| 2 |
args:
|
| 3 |
zdm_net:
|
| 4 |
name: dit_xl_2
|
| 5 |
-
args: {n_classes: 1001}
|
| 6 |
zdm_diffusion:
|
| 7 |
name: fm
|
| 8 |
-
args: {timescale: 1000.0}
|
| 9 |
-
zdm_sampler: {name: fm_euler_sampler}
|
| 10 |
zdm_n_steps: 200
|
| 11 |
zdm_train_normalize: false
|
| 12 |
-
zdm_class_cond: 1000
|
|
|
|
| 2 |
args:
|
| 3 |
zdm_net:
|
| 4 |
name: dit_xl_2
|
| 5 |
+
args: { n_classes: 1001 }
|
| 6 |
zdm_diffusion:
|
| 7 |
name: fm
|
| 8 |
+
args: { timescale: 1000.0 }
|
| 9 |
+
zdm_sampler: { name: fm_euler_sampler }
|
| 10 |
zdm_n_steps: 200
|
| 11 |
zdm_train_normalize: false
|
| 12 |
+
zdm_class_cond: 1000
|
flowae/configs/trainers/dito.yaml
CHANGED
|
@@ -11,9 +11,9 @@ vis_iter: 50000
|
|
| 11 |
optimizers:
|
| 12 |
encoder:
|
| 13 |
name: adamw
|
| 14 |
-
args: {lr: 1.e-4}
|
| 15 |
renderer:
|
| 16 |
name: adamw
|
| 17 |
-
args: {lr: 1.e-4}
|
| 18 |
|
| 19 |
-
evaluate_ae: true
|
|
|
|
| 11 |
optimizers:
|
| 12 |
encoder:
|
| 13 |
name: adamw
|
| 14 |
+
args: { lr: 1.e-4 }
|
| 15 |
renderer:
|
| 16 |
name: adamw
|
| 17 |
+
args: { lr: 1.e-4 }
|
| 18 |
|
| 19 |
+
evaluate_ae: true
|
flowae/configs/trainers/glpto.yaml
CHANGED
|
@@ -11,14 +11,14 @@ vis_iter: 50000
|
|
| 11 |
optimizers:
|
| 12 |
encoder:
|
| 13 |
name: adam
|
| 14 |
-
args: {lr: 1.e-4, betas: [0.5, 0.9]}
|
| 15 |
renderer:
|
| 16 |
name: adam
|
| 17 |
-
args: {lr: 1.e-4, betas: [0.5, 0.9]}
|
| 18 |
disc:
|
| 19 |
name: adam
|
| 20 |
-
args: {lr: 1.e-4, betas: [0.5, 0.9]}
|
| 21 |
gan_start_after_iters: 50000
|
| 22 |
find_unused_parameters: true
|
| 23 |
|
| 24 |
-
evaluate_ae: true
|
|
|
|
| 11 |
optimizers:
|
| 12 |
encoder:
|
| 13 |
name: adam
|
| 14 |
+
args: { lr: 1.e-4, betas: [0.5, 0.9] }
|
| 15 |
renderer:
|
| 16 |
name: adam
|
| 17 |
+
args: { lr: 1.e-4, betas: [0.5, 0.9] }
|
| 18 |
disc:
|
| 19 |
name: adam
|
| 20 |
+
args: { lr: 1.e-4, betas: [0.5, 0.9] }
|
| 21 |
gan_start_after_iters: 50000
|
| 22 |
find_unused_parameters: true
|
| 23 |
|
| 24 |
+
evaluate_ae: true
|
flowae/configs/trainers/zdm.yaml
CHANGED
|
@@ -14,7 +14,7 @@ ckpt_select_metric:
|
|
| 14 |
optimizers:
|
| 15 |
zdm:
|
| 16 |
name: adamw
|
| 17 |
-
args: {lr: 1.e-4, weight_decay: 0.0}
|
| 18 |
find_unused_parameters: true
|
| 19 |
|
| 20 |
-
evaluate_zdm: true
|
|
|
|
| 14 |
optimizers:
|
| 15 |
zdm:
|
| 16 |
name: adamw
|
| 17 |
+
args: { lr: 1.e-4, weight_decay: 0.0 }
|
| 18 |
find_unused_parameters: true
|
| 19 |
|
| 20 |
+
evaluate_zdm: true
|
flowae/load/wandb.yaml
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
entity:
|
| 2 |
-
api_key:
|
| 3 |
-
project:
|
|
|
|
| 1 |
+
entity:
|
| 2 |
+
api_key:
|
| 3 |
+
project:
|