training configs
Browse files
- lifm/FMiT-B2-16f8/config.yaml +132 -0
- lifm/FMiT-B2-4f4/config.yaml +132 -0
- lifm/FMiT-B4-4f4/config.yaml +132 -0
- lifm/FMiT-L2-16f8/config.yaml +132 -0
- lifm/FMiT-L2-4f4/config.yaml +132 -0
- lifm/FMiT-L4-4f4/config.yaml +132 -0
- lifm/FMiT-S2-16f8/config.yaml +132 -0
- lifm/FMiT-S2-4f4/config.yaml +132 -0
- lifm/FMiT-S4-4f4/config.yaml +132 -0
- lifm/UNet-B-16f8/config.yaml +152 -0
- lifm/UNet-B-4f4/config.yaml +152 -0
- lifm/UNet-L-16f8/config.yaml +152 -0
- lifm/UNet-L-4f4/config.yaml +152 -0
- lifm/UNet-S-16f8/config.yaml +152 -0
- lifm/UNet-S-4f4/config.yaml +152 -0
- lvfm/FMvT-S2-16f8/config.yaml +145 -0
- lvfm/FMvT-S2-4f4/config.yaml +145 -0
- lvfm/FMvT-S4-4f4/config.yaml +145 -0
- lvfm/STUNet-S-16f8/config.yaml +152 -0
- lvfm/STUNet-S-4f4/config.yaml +152 -0
- reid/dynamic-4f4/config.yaml +64 -0
- reid/lvh-4f4/config.yaml +64 -0
- reid/ped_a4c-4f4/config.yaml +64 -0
- reid/ped_psax-4f4/config.yaml +64 -0
- vae/avae-16f8/config.yaml +54 -0
- vae/avae-4f4/config.yaml +54 -0
- vae/avae-4f8/config.yaml +54 -0
lifm/FMiT-B2-16f8/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 12
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 17
|
| 16 |
+
out_channels: 16
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: ${globals.latent_res}
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-16f8
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-16f8/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-16f8/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-16f8/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-16f8/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-B2-16f8
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-B2-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 12
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: 28
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-B2-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-B4-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 12
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: 28
|
| 22 |
+
patch_size: 4
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-B4-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-L2-16f8/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 16
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 17
|
| 16 |
+
out_channels: 16
|
| 17 |
+
num_layers: 24
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: ${globals.latent_res}
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-16f8
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-16f8/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-16f8/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-16f8/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-16f8/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-L2-16f8
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-L2-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 16
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 24
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: 14
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-L2-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-L4-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 16
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 24
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: 14
|
| 22 |
+
patch_size: 4
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-L4-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-S2-16f8/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 6
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 17
|
| 16 |
+
out_channels: 16
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: ${globals.latent_res}
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-16f8
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-16f8/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-16f8/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-16f8/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-16f8/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-S2-16f8
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-S2-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 6
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: ${globals.latent_res}
|
| 22 |
+
patch_size: 2
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-S2-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/FMiT-S4-4f4/config.yaml
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegDiTTransformer2DModel
|
| 12 |
+
args:
|
| 13 |
+
num_attention_heads: 6
|
| 14 |
+
attention_head_dim: 64
|
| 15 |
+
in_channels: 5
|
| 16 |
+
out_channels: 4
|
| 17 |
+
num_layers: 12
|
| 18 |
+
dropout: 0.0
|
| 19 |
+
norm_num_groups: 32
|
| 20 |
+
attention_bias: true
|
| 21 |
+
sample_size: ${globals.latent_res}
|
| 22 |
+
patch_size: 4
|
| 23 |
+
activation_fn: gelu-approximate
|
| 24 |
+
num_embeds_ada_norm: 1000
|
| 25 |
+
upcast_attention: false
|
| 26 |
+
norm_type: ada_norm_zero
|
| 27 |
+
norm_elementwise_affine: false
|
| 28 |
+
norm_eps: 1.0e-05
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 5.0e-05
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 40 |
+
args:
|
| 41 |
+
warmup_steps: 5000
|
| 42 |
+
ref_steps: ${max_train_steps}
|
| 43 |
+
eta_min: 1.0e-06
|
| 44 |
+
decay_rate: 2
|
| 45 |
+
vae:
|
| 46 |
+
target: diffusers.AutoencoderKL
|
| 47 |
+
pretrained: vae/avae-4f4
|
| 48 |
+
datasets:
|
| 49 |
+
- name: LatentSeg
|
| 50 |
+
active: true
|
| 51 |
+
params:
|
| 52 |
+
root: avae-4f4/dynamic
|
| 53 |
+
outputs: ${globals.outputs}
|
| 54 |
+
target_fps: ${globals.target_fps}
|
| 55 |
+
view_label: A4C
|
| 56 |
+
target_nframes: ${globals.target_nframes}
|
| 57 |
+
latent_channels: ${globals.latent_channels}
|
| 58 |
+
segmentation_root: segmentations/dynamic
|
| 59 |
+
target_resolution: ${globals.latent_res}
|
| 60 |
+
- name: LatentSeg
|
| 61 |
+
active: true
|
| 62 |
+
params:
|
| 63 |
+
root: avae-4f4/ped_a4c
|
| 64 |
+
outputs: ${globals.outputs}
|
| 65 |
+
target_fps: ${globals.target_fps}
|
| 66 |
+
view_label: A4C
|
| 67 |
+
target_nframes: ${globals.target_nframes}
|
| 68 |
+
latent_channels: ${globals.latent_channels}
|
| 69 |
+
segmentation_root: segmentations/ped_a4c
|
| 70 |
+
target_resolution: ${globals.latent_res}
|
| 71 |
+
- name: LatentSeg
|
| 72 |
+
active: true
|
| 73 |
+
params:
|
| 74 |
+
root: avae-4f4/ped_psax
|
| 75 |
+
outputs: ${globals.outputs}
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
view_label: PSAX
|
| 78 |
+
target_nframes: ${globals.target_nframes}
|
| 79 |
+
latent_channels: ${globals.latent_channels}
|
| 80 |
+
segmentation_root: segmentations/ped_psax
|
| 81 |
+
target_resolution: ${globals.latent_res}
|
| 82 |
+
- name: LatentSeg
|
| 83 |
+
active: true
|
| 84 |
+
params:
|
| 85 |
+
root: avae-4f4/lvh
|
| 86 |
+
outputs: ${globals.outputs}
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
view_label: PLAX
|
| 89 |
+
target_nframes: ${globals.target_nframes}
|
| 90 |
+
latent_channels: ${globals.latent_channels}
|
| 91 |
+
segmentation_root: no_seg
|
| 92 |
+
target_resolution: ${globals.latent_res}
|
| 93 |
+
dataloader:
|
| 94 |
+
target: torch.utils.data.DataLoader
|
| 95 |
+
args:
|
| 96 |
+
shuffle: true
|
| 97 |
+
batch_size: 128
|
| 98 |
+
num_workers: 16
|
| 99 |
+
pin_memory: true
|
| 100 |
+
drop_last: true
|
| 101 |
+
persistent_workers: true
|
| 102 |
+
max_train_steps: 1000000
|
| 103 |
+
gradient_accumulation_steps: 1
|
| 104 |
+
mixed_precision: fp16
|
| 105 |
+
use_ema: true
|
| 106 |
+
noise_offset: 0.1
|
| 107 |
+
max_grad_norm: 0.1
|
| 108 |
+
max_grad_value: -1
|
| 109 |
+
pad_latents: false
|
| 110 |
+
sample_latents: true
|
| 111 |
+
output_dir: experiments/${wandb_args.name}
|
| 112 |
+
logging_dir: logs
|
| 113 |
+
report_to: wandb
|
| 114 |
+
wandb_args:
|
| 115 |
+
project: EchoFlow
|
| 116 |
+
name: FMiT-S4-4f4
|
| 117 |
+
group: FMiT
|
| 118 |
+
checkpointing_steps: 10000
|
| 119 |
+
checkpoints_to_keep:
|
| 120 |
+
- 50000
|
| 121 |
+
- 100000
|
| 122 |
+
- 200000
|
| 123 |
+
- 500000
|
| 124 |
+
- 1000000
|
| 125 |
+
resume_from_checkpoint: latest
|
| 126 |
+
validation:
|
| 127 |
+
samples: 4
|
| 128 |
+
steps: 5000
|
| 129 |
+
method: euler
|
| 130 |
+
timesteps: 25
|
| 131 |
+
seed: 42
|
| 132 |
+
num_train_epochs: 45455
|
lifm/UNet-B-16f8/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
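sample_size: 28  # NOTE(review): globals.latent_res is 14 in this 16f8 config, yet sample_size keeps the 4f4 value 28 — confirm this is intended, or use ${globals.latent_res}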
|
| 14 |
+
in_channels: 17
|
| 15 |
+
out_channels: 16
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 160
|
| 32 |
+
- 320
|
| 33 |
+
- 480
|
| 34 |
+
- 640
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-16f8
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-16f8/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-16f8/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-16f8/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-16f8/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-B-16f8
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lifm/UNet-B-4f4/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
sample_size: 28
|
| 14 |
+
in_channels: 5
|
| 15 |
+
out_channels: 4
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 160
|
| 32 |
+
- 320
|
| 33 |
+
- 480
|
| 34 |
+
- 640
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-4f4
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-4f4/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-4f4/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-4f4/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-4f4/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-B-4f4
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lifm/UNet-L-16f8/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
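sample_size: 28  # NOTE(review): globals.latent_res is 14 in this 16f8 config, yet sample_size keeps the 4f4 value 28 — confirm this is intended, or use ${globals.latent_res}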
|
| 14 |
+
in_channels: 17
|
| 15 |
+
out_channels: 16
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 320
|
| 32 |
+
- 640
|
| 33 |
+
- 960
|
| 34 |
+
- 1280
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-16f8
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-16f8/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-16f8/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-16f8/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-16f8/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-L-16f8
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lifm/UNet-L-4f4/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
sample_size: 28
|
| 14 |
+
in_channels: 5
|
| 15 |
+
out_channels: 4
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 320
|
| 32 |
+
- 640
|
| 33 |
+
- 960
|
| 34 |
+
- 1280
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-4f4
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-4f4/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-4f4/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-4f4/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-4f4/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-L-4f4
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lifm/UNet-S-16f8/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 14
|
| 9 |
+
latent_channels: 16
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
sample_size: 28
|
| 14 |
+
in_channels: 17
|
| 15 |
+
out_channels: 16
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 96
|
| 32 |
+
- 192
|
| 33 |
+
- 288
|
| 34 |
+
- 384
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-16f8
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-16f8/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-16f8/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-16f8/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-16f8/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-S-16f8
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lifm/UNet-S-4f4/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: original
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- image
|
| 6 |
+
- view
|
| 7 |
+
resolution: 112
|
| 8 |
+
latent_res: 28
|
| 9 |
+
latent_channels: 4
|
| 10 |
+
denoiser:
|
| 11 |
+
target: echosyn.common.models.SegUnet2DModel
|
| 12 |
+
args:
|
| 13 |
+
sample_size: 28
|
| 14 |
+
in_channels: 5
|
| 15 |
+
out_channels: 4
|
| 16 |
+
center_input_sample: false
|
| 17 |
+
time_embedding_type: positional
|
| 18 |
+
freq_shift: 0
|
| 19 |
+
flip_sin_to_cos: true
|
| 20 |
+
down_block_types:
|
| 21 |
+
- AttnDownBlock2D
|
| 22 |
+
- AttnDownBlock2D
|
| 23 |
+
- AttnDownBlock2D
|
| 24 |
+
- DownBlock2D
|
| 25 |
+
up_block_types:
|
| 26 |
+
- UpBlock2D
|
| 27 |
+
- AttnUpBlock2D
|
| 28 |
+
- AttnUpBlock2D
|
| 29 |
+
- AttnUpBlock2D
|
| 30 |
+
block_out_channels:
|
| 31 |
+
- 96
|
| 32 |
+
- 192
|
| 33 |
+
- 288
|
| 34 |
+
- 384
|
| 35 |
+
layers_per_block: 2
|
| 36 |
+
mid_block_scale_factor: 1
|
| 37 |
+
downsample_padding: 1
|
| 38 |
+
downsample_type: resnet
|
| 39 |
+
upsample_type: resnet
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
act_fn: silu
|
| 42 |
+
attention_head_dim: 8
|
| 43 |
+
norm_num_groups: 32
|
| 44 |
+
attn_norm_num_groups: null
|
| 45 |
+
norm_eps: 1.0e-05
|
| 46 |
+
resnet_time_scale_shift: default
|
| 47 |
+
class_embed_type: timestep
|
| 48 |
+
num_class_embeds: null
|
| 49 |
+
optimizer:
|
| 50 |
+
target: torch.optim.AdamW
|
| 51 |
+
args:
|
| 52 |
+
lr: 5.0e-05
|
| 53 |
+
betas:
|
| 54 |
+
- 0.9
|
| 55 |
+
- 0.999
|
| 56 |
+
weight_decay: 0.01
|
| 57 |
+
eps: 1.0e-08
|
| 58 |
+
scheduler:
|
| 59 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 60 |
+
args:
|
| 61 |
+
warmup_steps: 5000
|
| 62 |
+
ref_steps: ${max_train_steps}
|
| 63 |
+
eta_min: 1.0e-06
|
| 64 |
+
decay_rate: 2
|
| 65 |
+
vae:
|
| 66 |
+
target: diffusers.AutoencoderKL
|
| 67 |
+
pretrained: vae/avae-4f4
|
| 68 |
+
datasets:
|
| 69 |
+
- name: LatentSeg
|
| 70 |
+
active: true
|
| 71 |
+
params:
|
| 72 |
+
root: avae-4f4/dynamic
|
| 73 |
+
outputs: ${globals.outputs}
|
| 74 |
+
target_fps: ${globals.target_fps}
|
| 75 |
+
view_label: A4C
|
| 76 |
+
target_nframes: ${globals.target_nframes}
|
| 77 |
+
latent_channels: ${globals.latent_channels}
|
| 78 |
+
segmentation_root: segmentations/dynamic
|
| 79 |
+
target_resolution: ${globals.latent_res}
|
| 80 |
+
- name: LatentSeg
|
| 81 |
+
active: true
|
| 82 |
+
params:
|
| 83 |
+
root: avae-4f4/ped_a4c
|
| 84 |
+
outputs: ${globals.outputs}
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
view_label: A4C
|
| 87 |
+
target_nframes: ${globals.target_nframes}
|
| 88 |
+
latent_channels: ${globals.latent_channels}
|
| 89 |
+
segmentation_root: segmentations/ped_a4c
|
| 90 |
+
target_resolution: ${globals.latent_res}
|
| 91 |
+
- name: LatentSeg
|
| 92 |
+
active: true
|
| 93 |
+
params:
|
| 94 |
+
root: avae-4f4/ped_psax
|
| 95 |
+
outputs: ${globals.outputs}
|
| 96 |
+
target_fps: ${globals.target_fps}
|
| 97 |
+
view_label: PSAX
|
| 98 |
+
target_nframes: ${globals.target_nframes}
|
| 99 |
+
latent_channels: ${globals.latent_channels}
|
| 100 |
+
segmentation_root: segmentations/ped_psax
|
| 101 |
+
target_resolution: ${globals.latent_res}
|
| 102 |
+
- name: LatentSeg
|
| 103 |
+
active: true
|
| 104 |
+
params:
|
| 105 |
+
root: avae-4f4/lvh
|
| 106 |
+
outputs: ${globals.outputs}
|
| 107 |
+
target_fps: ${globals.target_fps}
|
| 108 |
+
view_label: PLAX
|
| 109 |
+
target_nframes: ${globals.target_nframes}
|
| 110 |
+
latent_channels: ${globals.latent_channels}
|
| 111 |
+
segmentation_root: no_seg
|
| 112 |
+
target_resolution: ${globals.latent_res}
|
| 113 |
+
dataloader:
|
| 114 |
+
target: torch.utils.data.DataLoader
|
| 115 |
+
args:
|
| 116 |
+
shuffle: true
|
| 117 |
+
batch_size: 128
|
| 118 |
+
num_workers: 16
|
| 119 |
+
pin_memory: true
|
| 120 |
+
drop_last: true
|
| 121 |
+
persistent_workers: true
|
| 122 |
+
max_train_steps: 1000000
|
| 123 |
+
gradient_accumulation_steps: 1
|
| 124 |
+
mixed_precision: bf16
|
| 125 |
+
use_ema: true
|
| 126 |
+
noise_offset: 0.1
|
| 127 |
+
max_grad_norm: 1.0
|
| 128 |
+
max_grad_value: -1
|
| 129 |
+
pad_latents: false
|
| 130 |
+
sample_latents: true
|
| 131 |
+
output_dir: experiments/${wandb_args.name}
|
| 132 |
+
logging_dir: logs
|
| 133 |
+
report_to: wandb
|
| 134 |
+
wandb_args:
|
| 135 |
+
project: EchoFlow
|
| 136 |
+
name: UNet-S-4f4
|
| 137 |
+
group: UNet
|
| 138 |
+
checkpointing_steps: 10000
|
| 139 |
+
checkpoints_to_keep:
|
| 140 |
+
- 50000
|
| 141 |
+
- 100000
|
| 142 |
+
- 200000
|
| 143 |
+
- 500000
|
| 144 |
+
- 1000000
|
| 145 |
+
resume_from_checkpoint: latest
|
| 146 |
+
validation:
|
| 147 |
+
samples: 4
|
| 148 |
+
steps: 5000
|
| 149 |
+
method: euler
|
| 150 |
+
timesteps: 25
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 45455
|
lvfm/FMvT-S2-16f8/config.yaml
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: 32
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- video
|
| 6 |
+
- lvef
|
| 7 |
+
- image
|
| 8 |
+
resolution: 112
|
| 9 |
+
latent_res: 14
|
| 10 |
+
latent_channels: 16
|
| 11 |
+
denoiser:
|
| 12 |
+
target: echosyn.common.models.DiffuserSTDiT
|
| 13 |
+
args:
|
| 14 |
+
input_size:
|
| 15 |
+
- ${globals.target_nframes}
|
| 16 |
+
- ${globals.latent_res}
|
| 17 |
+
- ${globals.latent_res}
|
| 18 |
+
in_channels: 32
|
| 19 |
+
out_channels: ${globals.latent_channels}
|
| 20 |
+
patch_size:
|
| 21 |
+
- 1
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
hidden_size: 384
|
| 25 |
+
depth: 12
|
| 26 |
+
num_heads: 6
|
| 27 |
+
mlp_ratio: 4.0
|
| 28 |
+
class_dropout_prob: 0.0
|
| 29 |
+
drop_path: 0.0
|
| 30 |
+
no_temporal_pos_emb: false
|
| 31 |
+
caption_channels: 1
|
| 32 |
+
model_max_length: 1
|
| 33 |
+
space_scale: 1.0
|
| 34 |
+
time_scale: 1.0
|
| 35 |
+
enable_flashattn: false
|
| 36 |
+
optimizer:
|
| 37 |
+
target: torch.optim.AdamW
|
| 38 |
+
args:
|
| 39 |
+
lr: 0.0001
|
| 40 |
+
betas:
|
| 41 |
+
- 0.9
|
| 42 |
+
- 0.999
|
| 43 |
+
weight_decay: 0.01
|
| 44 |
+
eps: 1.0e-08
|
| 45 |
+
scheduler:
|
| 46 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 47 |
+
args:
|
| 48 |
+
warmup_steps: 2000
|
| 49 |
+
ref_steps: ${max_train_steps}
|
| 50 |
+
eta_min: 1.0e-06
|
| 51 |
+
decay_rate: 2.0
|
| 52 |
+
vae:
|
| 53 |
+
target: diffusers.AutoencoderKL
|
| 54 |
+
pretrained: vae/avae-16f8
|
| 55 |
+
datasets:
|
| 56 |
+
- name: Latent
|
| 57 |
+
active: true
|
| 58 |
+
params:
|
| 59 |
+
root: avae-16f8/dynamic
|
| 60 |
+
target_fps: ${globals.target_fps}
|
| 61 |
+
target_nframes: ${globals.target_nframes}
|
| 62 |
+
target_resolution: ${globals.latent_res}
|
| 63 |
+
outputs: ${globals.outputs}
|
| 64 |
+
latent_channels: ${globals.latent_channels}
|
| 65 |
+
- name: Latent
|
| 66 |
+
active: true
|
| 67 |
+
params:
|
| 68 |
+
root: avae-16f8/ped_a4c
|
| 69 |
+
target_fps: ${globals.target_fps}
|
| 70 |
+
target_nframes: ${globals.target_nframes}
|
| 71 |
+
target_resolution: ${globals.latent_res}
|
| 72 |
+
outputs: ${globals.outputs}
|
| 73 |
+
latent_channels: ${globals.latent_channels}
|
| 74 |
+
- name: Latent
|
| 75 |
+
active: true
|
| 76 |
+
params:
|
| 77 |
+
root: avae-16f8/ped_psax
|
| 78 |
+
target_fps: ${globals.target_fps}
|
| 79 |
+
target_nframes: ${globals.target_nframes}
|
| 80 |
+
target_resolution: ${globals.latent_res}
|
| 81 |
+
outputs: ${globals.outputs}
|
| 82 |
+
latent_channels: ${globals.latent_channels}
|
| 83 |
+
- name: Latent
|
| 84 |
+
active: true
|
| 85 |
+
params:
|
| 86 |
+
root: avae-16f8/lvh
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
target_nframes: ${globals.target_nframes}
|
| 89 |
+
target_resolution: ${globals.latent_res}
|
| 90 |
+
outputs: ${globals.outputs}
|
| 91 |
+
latent_channels: ${globals.latent_channels}
|
| 92 |
+
dataloader:
|
| 93 |
+
target: torch.utils.data.DataLoader
|
| 94 |
+
args:
|
| 95 |
+
shuffle: true
|
| 96 |
+
batch_size: 64
|
| 97 |
+
num_workers: 64
|
| 98 |
+
pin_memory: true
|
| 99 |
+
drop_last: true
|
| 100 |
+
persistent_workers: true
|
| 101 |
+
max_train_steps: 1000000
|
| 102 |
+
gradient_accumulation_steps: 1
|
| 103 |
+
mixed_precision: bf16
|
| 104 |
+
use_ema: true
|
| 105 |
+
max_grad_norm: 1.0
|
| 106 |
+
max_grad_value: -1
|
| 107 |
+
sample_latents: true
|
| 108 |
+
noise_offset: 0.05
|
| 109 |
+
noise_cond_image: 0.05
|
| 110 |
+
no_conditionning: false
|
| 111 |
+
p_drop_conditionning: 0.1
|
| 112 |
+
output_dir: experiments/${wandb_args.name}
|
| 113 |
+
logging_dir: logs
|
| 114 |
+
report_to: wandb
|
| 115 |
+
wandb_args:
|
| 116 |
+
project: EchoFlow
|
| 117 |
+
name: FMvT-S2-16f8
|
| 118 |
+
group: FMvT
|
| 119 |
+
checkpointing_steps: 10000
|
| 120 |
+
checkpoints_to_keep:
|
| 121 |
+
- 50000
|
| 122 |
+
- 100000
|
| 123 |
+
- 200000
|
| 124 |
+
- 300000
|
| 125 |
+
- 500000
|
| 126 |
+
- 1000000
|
| 127 |
+
resume_from_checkpoint: latest
|
| 128 |
+
validation:
|
| 129 |
+
samples: 4
|
| 130 |
+
steps: 5000
|
| 131 |
+
timesteps: 25
|
| 132 |
+
frames: ${globals.target_nframes}
|
| 133 |
+
fps: ${globals.target_fps}
|
| 134 |
+
lvefs:
|
| 135 |
+
- -1.0
|
| 136 |
+
- 0.3
|
| 137 |
+
- 0.6
|
| 138 |
+
- 0.9
|
| 139 |
+
cond_image_mask:
|
| 140 |
+
- 0
|
| 141 |
+
- 1
|
| 142 |
+
- 1
|
| 143 |
+
- 1
|
| 144 |
+
seed: 42
|
| 145 |
+
num_train_epochs: 28572
|
lvfm/FMvT-S2-4f4/config.yaml
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: 32
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- video
|
| 6 |
+
- lvef
|
| 7 |
+
- image
|
| 8 |
+
resolution: 112
|
| 9 |
+
latent_res: 28
|
| 10 |
+
latent_channels: 4
|
| 11 |
+
denoiser:
|
| 12 |
+
target: echosyn.common.models.DiffuserSTDiT
|
| 13 |
+
args:
|
| 14 |
+
input_size:
|
| 15 |
+
- ${globals.target_nframes}
|
| 16 |
+
- ${globals.latent_res}
|
| 17 |
+
- ${globals.latent_res}
|
| 18 |
+
in_channels: 8
|
| 19 |
+
out_channels: ${globals.latent_channels}
|
| 20 |
+
patch_size:
|
| 21 |
+
- 1
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
hidden_size: 384
|
| 25 |
+
depth: 12
|
| 26 |
+
num_heads: 6
|
| 27 |
+
mlp_ratio: 4.0
|
| 28 |
+
class_dropout_prob: 0.0
|
| 29 |
+
drop_path: 0.0
|
| 30 |
+
no_temporal_pos_emb: false
|
| 31 |
+
caption_channels: 1
|
| 32 |
+
model_max_length: 1
|
| 33 |
+
space_scale: 1.0
|
| 34 |
+
time_scale: 1.0
|
| 35 |
+
enable_flashattn: false
|
| 36 |
+
optimizer:
|
| 37 |
+
target: torch.optim.AdamW
|
| 38 |
+
args:
|
| 39 |
+
lr: 0.0001
|
| 40 |
+
betas:
|
| 41 |
+
- 0.9
|
| 42 |
+
- 0.999
|
| 43 |
+
weight_decay: 0.01
|
| 44 |
+
eps: 1.0e-08
|
| 45 |
+
scheduler:
|
| 46 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 47 |
+
args:
|
| 48 |
+
warmup_steps: 2000
|
| 49 |
+
ref_steps: ${max_train_steps}
|
| 50 |
+
eta_min: 1.0e-06
|
| 51 |
+
decay_rate: 2.0
|
| 52 |
+
vae:
|
| 53 |
+
target: diffusers.AutoencoderKL
|
| 54 |
+
pretrained: vae/avae-4f4
|
| 55 |
+
datasets:
|
| 56 |
+
- name: Latent
|
| 57 |
+
active: true
|
| 58 |
+
params:
|
| 59 |
+
root: avae-4f4/dynamic
|
| 60 |
+
target_fps: ${globals.target_fps}
|
| 61 |
+
target_nframes: ${globals.target_nframes}
|
| 62 |
+
target_resolution: ${globals.latent_res}
|
| 63 |
+
outputs: ${globals.outputs}
|
| 64 |
+
latent_channels: ${globals.latent_channels}
|
| 65 |
+
- name: Latent
|
| 66 |
+
active: true
|
| 67 |
+
params:
|
| 68 |
+
root: avae-4f4/ped_a4c
|
| 69 |
+
target_fps: ${globals.target_fps}
|
| 70 |
+
target_nframes: ${globals.target_nframes}
|
| 71 |
+
target_resolution: ${globals.latent_res}
|
| 72 |
+
outputs: ${globals.outputs}
|
| 73 |
+
latent_channels: ${globals.latent_channels}
|
| 74 |
+
- name: Latent
|
| 75 |
+
active: true
|
| 76 |
+
params:
|
| 77 |
+
root: avae-4f4/ped_psax
|
| 78 |
+
target_fps: ${globals.target_fps}
|
| 79 |
+
target_nframes: ${globals.target_nframes}
|
| 80 |
+
target_resolution: ${globals.latent_res}
|
| 81 |
+
outputs: ${globals.outputs}
|
| 82 |
+
latent_channels: ${globals.latent_channels}
|
| 83 |
+
- name: Latent
|
| 84 |
+
active: true
|
| 85 |
+
params:
|
| 86 |
+
root: avae-4f4/lvh
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
target_nframes: ${globals.target_nframes}
|
| 89 |
+
target_resolution: ${globals.latent_res}
|
| 90 |
+
outputs: ${globals.outputs}
|
| 91 |
+
latent_channels: ${globals.latent_channels}
|
| 92 |
+
dataloader:
|
| 93 |
+
target: torch.utils.data.DataLoader
|
| 94 |
+
args:
|
| 95 |
+
shuffle: true
|
| 96 |
+
batch_size: 16
|
| 97 |
+
num_workers: 16
|
| 98 |
+
pin_memory: true
|
| 99 |
+
drop_last: true
|
| 100 |
+
persistent_workers: true
|
| 101 |
+
max_train_steps: 1000000
|
| 102 |
+
gradient_accumulation_steps: 1
|
| 103 |
+
mixed_precision: bf16
|
| 104 |
+
use_ema: true
|
| 105 |
+
max_grad_norm: 1.0
|
| 106 |
+
max_grad_value: -1
|
| 107 |
+
sample_latents: true
|
| 108 |
+
noise_offset: 0.05
|
| 109 |
+
noise_cond_image: 0.05
|
| 110 |
+
no_conditionning: false
|
| 111 |
+
p_drop_conditionning: 0.3
|
| 112 |
+
output_dir: experiments/${wandb_args.name}
|
| 113 |
+
logging_dir: logs
|
| 114 |
+
report_to: wandb
|
| 115 |
+
wandb_args:
|
| 116 |
+
project: EchoFlow
|
| 117 |
+
name: FMvT-S2-4f4
|
| 118 |
+
group: FMvT
|
| 119 |
+
checkpointing_steps: 10000
|
| 120 |
+
checkpoints_to_keep:
|
| 121 |
+
- 50000
|
| 122 |
+
- 100000
|
| 123 |
+
- 200000
|
| 124 |
+
- 300000
|
| 125 |
+
- 500000
|
| 126 |
+
- 1000000
|
| 127 |
+
resume_from_checkpoint: latest
|
| 128 |
+
validation:
|
| 129 |
+
samples: 4
|
| 130 |
+
steps: 5000
|
| 131 |
+
timesteps: 25
|
| 132 |
+
frames: ${globals.target_nframes}
|
| 133 |
+
fps: ${globals.target_fps}
|
| 134 |
+
lvefs:
|
| 135 |
+
- -1.0
|
| 136 |
+
- 0.3
|
| 137 |
+
- 0.6
|
| 138 |
+
- 0.9
|
| 139 |
+
cond_image_mask:
|
| 140 |
+
- 0
|
| 141 |
+
- 1
|
| 142 |
+
- 1
|
| 143 |
+
- 1
|
| 144 |
+
seed: 42
|
| 145 |
+
num_train_epochs: 28572
|
lvfm/FMvT-S4-4f4/config.yaml
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: 32
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- video
|
| 6 |
+
- lvef
|
| 7 |
+
- image
|
| 8 |
+
resolution: 112
|
| 9 |
+
latent_res: 28
|
| 10 |
+
latent_channels: 4
|
| 11 |
+
denoiser:
|
| 12 |
+
target: echosyn.common.models.DiffuserSTDiT
|
| 13 |
+
args:
|
| 14 |
+
input_size:
|
| 15 |
+
- ${globals.target_nframes}
|
| 16 |
+
- ${globals.latent_res}
|
| 17 |
+
- ${globals.latent_res}
|
| 18 |
+
in_channels: 8
|
| 19 |
+
out_channels: ${globals.latent_channels}
|
| 20 |
+
patch_size:
|
| 21 |
+
- 1
|
| 22 |
+
- 4
|
| 23 |
+
- 4
|
| 24 |
+
hidden_size: 384
|
| 25 |
+
depth: 12
|
| 26 |
+
num_heads: 6
|
| 27 |
+
mlp_ratio: 4.0
|
| 28 |
+
class_dropout_prob: 0.0
|
| 29 |
+
drop_path: 0.0
|
| 30 |
+
no_temporal_pos_emb: false
|
| 31 |
+
caption_channels: 1
|
| 32 |
+
model_max_length: 1
|
| 33 |
+
space_scale: 1.0
|
| 34 |
+
time_scale: 1.0
|
| 35 |
+
enable_flashattn: false
|
| 36 |
+
optimizer:
|
| 37 |
+
target: torch.optim.AdamW
|
| 38 |
+
args:
|
| 39 |
+
lr: 0.0001
|
| 40 |
+
betas:
|
| 41 |
+
- 0.9
|
| 42 |
+
- 0.999
|
| 43 |
+
weight_decay: 0.01
|
| 44 |
+
eps: 1.0e-08
|
| 45 |
+
scheduler:
|
| 46 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 47 |
+
args:
|
| 48 |
+
warmup_steps: 2000
|
| 49 |
+
ref_steps: ${max_train_steps}
|
| 50 |
+
eta_min: 1.0e-06
|
| 51 |
+
decay_rate: 2.0
|
| 52 |
+
vae:
|
| 53 |
+
target: diffusers.AutoencoderKL
|
| 54 |
+
pretrained: vae/avae-4f4
|
| 55 |
+
datasets:
|
| 56 |
+
- name: Latent
|
| 57 |
+
active: true
|
| 58 |
+
params:
|
| 59 |
+
root: avae-4f4/dynamic
|
| 60 |
+
target_fps: ${globals.target_fps}
|
| 61 |
+
target_nframes: ${globals.target_nframes}
|
| 62 |
+
target_resolution: ${globals.latent_res}
|
| 63 |
+
outputs: ${globals.outputs}
|
| 64 |
+
latent_channels: ${globals.latent_channels}
|
| 65 |
+
- name: Latent
|
| 66 |
+
active: true
|
| 67 |
+
params:
|
| 68 |
+
root: avae-4f4/ped_a4c
|
| 69 |
+
target_fps: ${globals.target_fps}
|
| 70 |
+
target_nframes: ${globals.target_nframes}
|
| 71 |
+
target_resolution: ${globals.latent_res}
|
| 72 |
+
outputs: ${globals.outputs}
|
| 73 |
+
latent_channels: ${globals.latent_channels}
|
| 74 |
+
- name: Latent
|
| 75 |
+
active: true
|
| 76 |
+
params:
|
| 77 |
+
root: avae-4f4/ped_psax
|
| 78 |
+
target_fps: ${globals.target_fps}
|
| 79 |
+
target_nframes: ${globals.target_nframes}
|
| 80 |
+
target_resolution: ${globals.latent_res}
|
| 81 |
+
outputs: ${globals.outputs}
|
| 82 |
+
latent_channels: ${globals.latent_channels}
|
| 83 |
+
- name: Latent
|
| 84 |
+
active: true
|
| 85 |
+
params:
|
| 86 |
+
root: avae-4f4/lvh
|
| 87 |
+
target_fps: ${globals.target_fps}
|
| 88 |
+
target_nframes: ${globals.target_nframes}
|
| 89 |
+
target_resolution: ${globals.latent_res}
|
| 90 |
+
outputs: ${globals.outputs}
|
| 91 |
+
latent_channels: ${globals.latent_channels}
|
| 92 |
+
dataloader:
|
| 93 |
+
target: torch.utils.data.DataLoader
|
| 94 |
+
args:
|
| 95 |
+
shuffle: true
|
| 96 |
+
batch_size: 64
|
| 97 |
+
num_workers: 64
|
| 98 |
+
pin_memory: true
|
| 99 |
+
drop_last: true
|
| 100 |
+
persistent_workers: true
|
| 101 |
+
max_train_steps: 1000000
|
| 102 |
+
gradient_accumulation_steps: 1
|
| 103 |
+
mixed_precision: bf16
|
| 104 |
+
use_ema: true
|
| 105 |
+
max_grad_norm: 1.0
|
| 106 |
+
max_grad_value: -1
|
| 107 |
+
sample_latents: true
|
| 108 |
+
noise_offset: 0.05
|
| 109 |
+
noise_cond_image: 0.05
|
| 110 |
+
no_conditionning: false
|
| 111 |
+
p_drop_conditionning: 0.3
|
| 112 |
+
output_dir: experiments/${wandb_args.name}
|
| 113 |
+
logging_dir: logs
|
| 114 |
+
report_to: wandb
|
| 115 |
+
wandb_args:
|
| 116 |
+
project: EchoFlow
|
| 117 |
+
name: FMvT-S4-4f4
|
| 118 |
+
group: FMvT
|
| 119 |
+
checkpointing_steps: 10000
|
| 120 |
+
checkpoints_to_keep:
|
| 121 |
+
- 50000
|
| 122 |
+
- 100000
|
| 123 |
+
- 200000
|
| 124 |
+
- 300000
|
| 125 |
+
- 500000
|
| 126 |
+
- 1000000
|
| 127 |
+
resume_from_checkpoint: latest
|
| 128 |
+
validation:
|
| 129 |
+
samples: 4
|
| 130 |
+
steps: 5000
|
| 131 |
+
timesteps: 25
|
| 132 |
+
frames: ${globals.target_nframes}
|
| 133 |
+
fps: ${globals.target_fps}
|
| 134 |
+
lvefs:
|
| 135 |
+
- -1.0
|
| 136 |
+
- 0.3
|
| 137 |
+
- 0.6
|
| 138 |
+
- 0.9
|
| 139 |
+
cond_image_mask:
|
| 140 |
+
- 0
|
| 141 |
+
- 1
|
| 142 |
+
- 1
|
| 143 |
+
- 1
|
| 144 |
+
seed: 42
|
| 145 |
+
num_train_epochs: 28572
|
lvfm/STUNet-S-16f8/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: 32
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- video
|
| 6 |
+
- lvef
|
| 7 |
+
- image
|
| 8 |
+
resolution: 112
|
| 9 |
+
latent_res: 14
|
| 10 |
+
latent_channels: 16
|
| 11 |
+
denoiser:
|
| 12 |
+
target: echosyn.common.models.UNetSTIC
|
| 13 |
+
args:
|
| 14 |
+
in_channels: 32
|
| 15 |
+
out_channels: ${globals.latent_channels}
|
| 16 |
+
sample_size: ${globals.latent_res}
|
| 17 |
+
addition_time_embed_dim: 1
|
| 18 |
+
block_out_channels:
|
| 19 |
+
- 64
|
| 20 |
+
- 128
|
| 21 |
+
- 192
|
| 22 |
+
- 256
|
| 23 |
+
cross_attention_dim: 1
|
| 24 |
+
down_block_types:
|
| 25 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 26 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 27 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 28 |
+
- DownBlockSpatioTemporal
|
| 29 |
+
layers_per_block: 2
|
| 30 |
+
num_attention_heads:
|
| 31 |
+
- 8
|
| 32 |
+
- 16
|
| 33 |
+
- 16
|
| 34 |
+
- 32
|
| 35 |
+
num_frames: 64
|
| 36 |
+
projection_class_embeddings_input_dim: 1
|
| 37 |
+
transformer_layers_per_block: 1
|
| 38 |
+
up_block_types:
|
| 39 |
+
- UpBlockSpatioTemporal
|
| 40 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 41 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 42 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 43 |
+
optimizer:
|
| 44 |
+
target: torch.optim.AdamW
|
| 45 |
+
args:
|
| 46 |
+
lr: 0.0001
|
| 47 |
+
betas:
|
| 48 |
+
- 0.9
|
| 49 |
+
- 0.999
|
| 50 |
+
weight_decay: 0.01
|
| 51 |
+
eps: 1.0e-08
|
| 52 |
+
scheduler:
|
| 53 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 54 |
+
args:
|
| 55 |
+
warmup_steps: 2000
|
| 56 |
+
ref_steps: ${max_train_steps}
|
| 57 |
+
eta_min: 1.0e-06
|
| 58 |
+
decay_rate: 2.0
|
| 59 |
+
vae:
|
| 60 |
+
target: diffusers.AutoencoderKL
|
| 61 |
+
pretrained: vae/avae-16f8
|
| 62 |
+
datasets:
|
| 63 |
+
- name: Latent
|
| 64 |
+
active: true
|
| 65 |
+
params:
|
| 66 |
+
root: avae-16f8/dynamic
|
| 67 |
+
target_fps: ${globals.target_fps}
|
| 68 |
+
target_nframes: ${globals.target_nframes}
|
| 69 |
+
target_resolution: ${globals.latent_res}
|
| 70 |
+
outputs: ${globals.outputs}
|
| 71 |
+
latent_channels: ${globals.latent_channels}
|
| 72 |
+
- name: Latent
|
| 73 |
+
active: true
|
| 74 |
+
params:
|
| 75 |
+
root: avae-16f8/ped_a4c
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
target_nframes: ${globals.target_nframes}
|
| 78 |
+
target_resolution: ${globals.latent_res}
|
| 79 |
+
outputs: ${globals.outputs}
|
| 80 |
+
latent_channels: ${globals.latent_channels}
|
| 81 |
+
- name: Latent
|
| 82 |
+
active: true
|
| 83 |
+
params:
|
| 84 |
+
root: avae-16f8/ped_psax
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
target_nframes: ${globals.target_nframes}
|
| 87 |
+
target_resolution: ${globals.latent_res}
|
| 88 |
+
outputs: ${globals.outputs}
|
| 89 |
+
latent_channels: ${globals.latent_channels}
|
| 90 |
+
- name: Latent
|
| 91 |
+
active: true
|
| 92 |
+
params:
|
| 93 |
+
root: avae-16f8/lvh
|
| 94 |
+
target_fps: ${globals.target_fps}
|
| 95 |
+
target_nframes: ${globals.target_nframes}
|
| 96 |
+
target_resolution: ${globals.latent_res}
|
| 97 |
+
outputs: ${globals.outputs}
|
| 98 |
+
latent_channels: ${globals.latent_channels}
|
| 99 |
+
dataloader:
|
| 100 |
+
target: torch.utils.data.DataLoader
|
| 101 |
+
args:
|
| 102 |
+
shuffle: true
|
| 103 |
+
batch_size: 32
|
| 104 |
+
num_workers: 32
|
| 105 |
+
pin_memory: true
|
| 106 |
+
drop_last: true
|
| 107 |
+
persistent_workers: true
|
| 108 |
+
max_train_steps: 1000000
|
| 109 |
+
gradient_accumulation_steps: 1
|
| 110 |
+
mixed_precision: bf16
|
| 111 |
+
use_ema: true
|
| 112 |
+
max_grad_norm: 1.0
|
| 113 |
+
max_grad_value: -1
|
| 114 |
+
sample_latents: true
|
| 115 |
+
noise_offset: 0.05
|
| 116 |
+
noise_cond_image: 0.05
|
| 117 |
+
no_conditionning: false
|
| 118 |
+
p_drop_conditionning: 0.3
|
| 119 |
+
output_dir: experiments/${wandb_args.name}
|
| 120 |
+
logging_dir: logs
|
| 121 |
+
report_to: wandb
|
| 122 |
+
wandb_args:
|
| 123 |
+
project: EchoFlow
|
| 124 |
+
name: STUNet-S-16f8
|
| 125 |
+
group: STUNet
|
| 126 |
+
checkpointing_steps: 10000
|
| 127 |
+
checkpoints_to_keep:
|
| 128 |
+
- 50000
|
| 129 |
+
- 100000
|
| 130 |
+
- 200000
|
| 131 |
+
- 300000
|
| 132 |
+
- 500000
|
| 133 |
+
- 1000000
|
| 134 |
+
resume_from_checkpoint: latest
|
| 135 |
+
validation:
|
| 136 |
+
samples: 4
|
| 137 |
+
steps: 5000
|
| 138 |
+
timesteps: 25
|
| 139 |
+
frames: ${globals.target_nframes}
|
| 140 |
+
fps: ${globals.target_fps}
|
| 141 |
+
lvefs:
|
| 142 |
+
- -1.0
|
| 143 |
+
- 0.3
|
| 144 |
+
- 0.6
|
| 145 |
+
- 0.9
|
| 146 |
+
cond_image_mask:
|
| 147 |
+
- 0
|
| 148 |
+
- 1
|
| 149 |
+
- 1
|
| 150 |
+
- 1
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 28572
|
lvfm/STUNet-S-4f4/config.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
target_fps: 32
|
| 3 |
+
target_nframes: 64
|
| 4 |
+
outputs:
|
| 5 |
+
- video
|
| 6 |
+
- lvef
|
| 7 |
+
- image
|
| 8 |
+
resolution: 112
|
| 9 |
+
latent_res: 28
|
| 10 |
+
latent_channels: 4
|
| 11 |
+
denoiser:
|
| 12 |
+
target: echosyn.common.models.UNetSTIC
|
| 13 |
+
args:
|
| 14 |
+
in_channels: 8
|
| 15 |
+
out_channels: ${globals.latent_channels}
|
| 16 |
+
sample_size: ${globals.latent_res}
|
| 17 |
+
addition_time_embed_dim: 1
|
| 18 |
+
block_out_channels:
|
| 19 |
+
- 64
|
| 20 |
+
- 128
|
| 21 |
+
- 192
|
| 22 |
+
- 256
|
| 23 |
+
cross_attention_dim: 1
|
| 24 |
+
down_block_types:
|
| 25 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 26 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 27 |
+
- CrossAttnDownBlockSpatioTemporal
|
| 28 |
+
- DownBlockSpatioTemporal
|
| 29 |
+
layers_per_block: 2
|
| 30 |
+
num_attention_heads:
|
| 31 |
+
- 8
|
| 32 |
+
- 16
|
| 33 |
+
- 16
|
| 34 |
+
- 32
|
| 35 |
+
num_frames: 64
|
| 36 |
+
projection_class_embeddings_input_dim: 1
|
| 37 |
+
transformer_layers_per_block: 1
|
| 38 |
+
up_block_types:
|
| 39 |
+
- UpBlockSpatioTemporal
|
| 40 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 41 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 42 |
+
- CrossAttnUpBlockSpatioTemporal
|
| 43 |
+
optimizer:
|
| 44 |
+
target: torch.optim.AdamW
|
| 45 |
+
args:
|
| 46 |
+
lr: 0.0001
|
| 47 |
+
betas:
|
| 48 |
+
- 0.9
|
| 49 |
+
- 0.999
|
| 50 |
+
weight_decay: 0.01
|
| 51 |
+
eps: 1.0e-08
|
| 52 |
+
scheduler:
|
| 53 |
+
target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
|
| 54 |
+
args:
|
| 55 |
+
warmup_steps: 2000
|
| 56 |
+
ref_steps: ${max_train_steps}
|
| 57 |
+
eta_min: 1.0e-06
|
| 58 |
+
decay_rate: 2.0
|
| 59 |
+
vae:
|
| 60 |
+
target: diffusers.AutoencoderKL
|
| 61 |
+
pretrained: vae/avae-4f4
|
| 62 |
+
datasets:
|
| 63 |
+
- name: Latent
|
| 64 |
+
active: true
|
| 65 |
+
params:
|
| 66 |
+
root: avae-4f4/dynamic
|
| 67 |
+
target_fps: ${globals.target_fps}
|
| 68 |
+
target_nframes: ${globals.target_nframes}
|
| 69 |
+
target_resolution: ${globals.latent_res}
|
| 70 |
+
outputs: ${globals.outputs}
|
| 71 |
+
latent_channels: ${globals.latent_channels}
|
| 72 |
+
- name: Latent
|
| 73 |
+
active: true
|
| 74 |
+
params:
|
| 75 |
+
root: avae-4f4/ped_a4c
|
| 76 |
+
target_fps: ${globals.target_fps}
|
| 77 |
+
target_nframes: ${globals.target_nframes}
|
| 78 |
+
target_resolution: ${globals.latent_res}
|
| 79 |
+
outputs: ${globals.outputs}
|
| 80 |
+
latent_channels: ${globals.latent_channels}
|
| 81 |
+
- name: Latent
|
| 82 |
+
active: true
|
| 83 |
+
params:
|
| 84 |
+
root: avae-4f4/ped_psax
|
| 85 |
+
target_fps: ${globals.target_fps}
|
| 86 |
+
target_nframes: ${globals.target_nframes}
|
| 87 |
+
target_resolution: ${globals.latent_res}
|
| 88 |
+
outputs: ${globals.outputs}
|
| 89 |
+
latent_channels: ${globals.latent_channels}
|
| 90 |
+
- name: Latent
|
| 91 |
+
active: true
|
| 92 |
+
params:
|
| 93 |
+
root: avae-4f4/lvh
|
| 94 |
+
target_fps: ${globals.target_fps}
|
| 95 |
+
target_nframes: ${globals.target_nframes}
|
| 96 |
+
target_resolution: ${globals.latent_res}
|
| 97 |
+
outputs: ${globals.outputs}
|
| 98 |
+
latent_channels: ${globals.latent_channels}
|
| 99 |
+
dataloader:
|
| 100 |
+
target: torch.utils.data.DataLoader
|
| 101 |
+
args:
|
| 102 |
+
shuffle: true
|
| 103 |
+
batch_size: 8
|
| 104 |
+
num_workers: 8
|
| 105 |
+
pin_memory: true
|
| 106 |
+
drop_last: true
|
| 107 |
+
persistent_workers: true
|
| 108 |
+
max_train_steps: 1000000
|
| 109 |
+
gradient_accumulation_steps: 1
|
| 110 |
+
mixed_precision: bf16
|
| 111 |
+
use_ema: true
|
| 112 |
+
max_grad_norm: 1.0
|
| 113 |
+
max_grad_value: -1
|
| 114 |
+
sample_latents: true
|
| 115 |
+
noise_offset: 0.05
|
| 116 |
+
noise_cond_image: 0.05
|
| 117 |
+
no_conditionning: false
|
| 118 |
+
p_drop_conditionning: 0.3
|
| 119 |
+
output_dir: experiments/${wandb_args.name}
|
| 120 |
+
logging_dir: logs
|
| 121 |
+
report_to: wandb
|
| 122 |
+
wandb_args:
|
| 123 |
+
project: EchoFlow
|
| 124 |
+
name: STUNet-S-4f4
|
| 125 |
+
group: STUNet
|
| 126 |
+
checkpointing_steps: 10000
|
| 127 |
+
checkpoints_to_keep:
|
| 128 |
+
- 50000
|
| 129 |
+
- 100000
|
| 130 |
+
- 200000
|
| 131 |
+
- 300000
|
| 132 |
+
- 500000
|
| 133 |
+
- 1000000
|
| 134 |
+
resume_from_checkpoint: latest
|
| 135 |
+
validation:
|
| 136 |
+
samples: 4
|
| 137 |
+
steps: 5000
|
| 138 |
+
timesteps: 25
|
| 139 |
+
frames: ${globals.target_nframes}
|
| 140 |
+
fps: ${globals.target_fps}
|
| 141 |
+
lvefs:
|
| 142 |
+
- -1.0
|
| 143 |
+
- 0.3
|
| 144 |
+
- 0.6
|
| 145 |
+
- 0.9
|
| 146 |
+
cond_image_mask:
|
| 147 |
+
- 0
|
| 148 |
+
- 1
|
| 149 |
+
- 1
|
| 150 |
+
- 1
|
| 151 |
+
seed: 42
|
| 152 |
+
num_train_epochs: 28572
|
reid/dynamic-4f4/config.yaml
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
latent_channels: 4
|
| 3 |
+
dataset:
|
| 4 |
+
target: echosyn.common.datasets.ContrastivePair
|
| 5 |
+
args:
|
| 6 |
+
root: avae-4f4/dynamic
|
| 7 |
+
folder: Latents
|
| 8 |
+
extension: pt
|
| 9 |
+
dataloader:
|
| 10 |
+
target: torch.utils.data.DataLoader
|
| 11 |
+
args:
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 16
|
| 15 |
+
pin_memory: true
|
| 16 |
+
drop_last: true
|
| 17 |
+
persistent_workers: true
|
| 18 |
+
backbone:
|
| 19 |
+
target: echosyn.reindentification.model.ResNet18
|
| 20 |
+
args:
|
| 21 |
+
weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
|
| 22 |
+
progress: false
|
| 23 |
+
model:
|
| 24 |
+
target: echosyn.reindentification.model.ContrastiveModel
|
| 25 |
+
args:
|
| 26 |
+
in_channels: 4
|
| 27 |
+
out_channels: 256
|
| 28 |
+
kl_loss_weight: 0.0
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 0.0001
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: torch.optim.lr_scheduler.ConstantLR
|
| 40 |
+
args:
|
| 41 |
+
factor: 1.0
|
| 42 |
+
vae:
|
| 43 |
+
target: diffusers.AutoencoderKL
|
| 44 |
+
pretrained: vae/avae-4f4
|
| 45 |
+
max_train_steps: 60000
|
| 46 |
+
gradient_accumulation_steps: 1
|
| 47 |
+
mixed_precision: bf16
|
| 48 |
+
max_grad_norm: 10.0
|
| 49 |
+
sample_latents: true
|
| 50 |
+
validation_steps: 10000
|
| 51 |
+
validation_samples: 99999
|
| 52 |
+
output_dir: experiments/${wandb_args.group}/${wandb_args.name}
|
| 53 |
+
logging_dir: logs
|
| 54 |
+
report_to: wandb
|
| 55 |
+
wandb_args:
|
| 56 |
+
project: EchoFlow
|
| 57 |
+
name: dynamic_4f4
|
| 58 |
+
group: reindentification
|
| 59 |
+
checkpointing_steps: 10000
|
| 60 |
+
checkpoints_total_limit: 3
|
| 61 |
+
resume_from_checkpoint: null
|
| 62 |
+
seed: 42
|
| 63 |
+
no_wandb: false
|
| 64 |
+
num_train_epochs: 258
|
reid/lvh-4f4/config.yaml
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
latent_channels: 4
|
| 3 |
+
dataset:
|
| 4 |
+
target: echosyn.common.datasets.ContrastivePair
|
| 5 |
+
args:
|
| 6 |
+
root: avae-4f4/lvh
|
| 7 |
+
folder: Latents
|
| 8 |
+
extension: pt
|
| 9 |
+
dataloader:
|
| 10 |
+
target: torch.utils.data.DataLoader
|
| 11 |
+
args:
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 16
|
| 15 |
+
pin_memory: true
|
| 16 |
+
drop_last: true
|
| 17 |
+
persistent_workers: true
|
| 18 |
+
backbone:
|
| 19 |
+
target: echosyn.reindentification.model.ResNet18
|
| 20 |
+
args:
|
| 21 |
+
weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
|
| 22 |
+
progress: false
|
| 23 |
+
model:
|
| 24 |
+
target: echosyn.reindentification.model.ContrastiveModel
|
| 25 |
+
args:
|
| 26 |
+
in_channels: 4
|
| 27 |
+
out_channels: 256
|
| 28 |
+
kl_loss_weight: 0.0
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 0.0001
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: torch.optim.lr_scheduler.ConstantLR
|
| 40 |
+
args:
|
| 41 |
+
factor: 1.0
|
| 42 |
+
vae:
|
| 43 |
+
target: diffusers.AutoencoderKL
|
| 44 |
+
pretrained: vae/avae-4f4
|
| 45 |
+
max_train_steps: 60000
|
| 46 |
+
gradient_accumulation_steps: 1
|
| 47 |
+
mixed_precision: bf16
|
| 48 |
+
max_grad_norm: 10.0
|
| 49 |
+
sample_latents: true
|
| 50 |
+
validation_steps: 10000
|
| 51 |
+
validation_samples: 99999
|
| 52 |
+
output_dir: experiments/${wandb_args.group}/${wandb_args.name}
|
| 53 |
+
logging_dir: logs
|
| 54 |
+
report_to: wandb
|
| 55 |
+
wandb_args:
|
| 56 |
+
project: EchoFlow
|
| 57 |
+
name: lvh_4f4
|
| 58 |
+
group: reindentification
|
| 59 |
+
checkpointing_steps: 10000
|
| 60 |
+
checkpoints_total_limit: 3
|
| 61 |
+
resume_from_checkpoint: null
|
| 62 |
+
seed: 42
|
| 63 |
+
no_wandb: false
|
| 64 |
+
num_train_epochs: 203
|
reid/ped_a4c-4f4/config.yaml
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
latent_channels: 4
|
| 3 |
+
dataset:
|
| 4 |
+
target: echosyn.common.datasets.ContrastivePair
|
| 5 |
+
args:
|
| 6 |
+
root: avae-4f4/ped_a4c
|
| 7 |
+
folder: Latents
|
| 8 |
+
extension: pt
|
| 9 |
+
dataloader:
|
| 10 |
+
target: torch.utils.data.DataLoader
|
| 11 |
+
args:
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 16
|
| 15 |
+
pin_memory: true
|
| 16 |
+
drop_last: true
|
| 17 |
+
persistent_workers: true
|
| 18 |
+
backbone:
|
| 19 |
+
target: echosyn.reindentification.model.ResNet18
|
| 20 |
+
args:
|
| 21 |
+
weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
|
| 22 |
+
progress: false
|
| 23 |
+
model:
|
| 24 |
+
target: echosyn.reindentification.model.ContrastiveModel
|
| 25 |
+
args:
|
| 26 |
+
in_channels: 4
|
| 27 |
+
out_channels: 256
|
| 28 |
+
kl_loss_weight: 0.0
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 0.0001
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: torch.optim.lr_scheduler.ConstantLR
|
| 40 |
+
args:
|
| 41 |
+
factor: 1.0
|
| 42 |
+
vae:
|
| 43 |
+
target: diffusers.AutoencoderKL
|
| 44 |
+
pretrained: vae/avae-4f4
|
| 45 |
+
max_train_steps: 60000
|
| 46 |
+
gradient_accumulation_steps: 1
|
| 47 |
+
mixed_precision: bf16
|
| 48 |
+
max_grad_norm: 10.0
|
| 49 |
+
sample_latents: true
|
| 50 |
+
validation_steps: 10000
|
| 51 |
+
validation_samples: 99999
|
| 52 |
+
output_dir: experiments/${wandb_args.group}/${wandb_args.name}
|
| 53 |
+
logging_dir: logs
|
| 54 |
+
report_to: wandb
|
| 55 |
+
wandb_args:
|
| 56 |
+
project: EchoFlow
|
| 57 |
+
name: ped_a4c_4f4
|
| 58 |
+
group: reindentification
|
| 59 |
+
checkpointing_steps: 10000
|
| 60 |
+
checkpoints_total_limit: 3
|
| 61 |
+
resume_from_checkpoint: null
|
| 62 |
+
seed: 42
|
| 63 |
+
no_wandb: false
|
| 64 |
+
num_train_epochs: 750
|
reid/ped_psax-4f4/config.yaml
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
globals:
|
| 2 |
+
latent_channels: 4
|
| 3 |
+
dataset:
|
| 4 |
+
target: echosyn.common.datasets.ContrastivePair
|
| 5 |
+
args:
|
| 6 |
+
root: avae-4f4/ped_psax
|
| 7 |
+
folder: Latents
|
| 8 |
+
extension: pt
|
| 9 |
+
dataloader:
|
| 10 |
+
target: torch.utils.data.DataLoader
|
| 11 |
+
args:
|
| 12 |
+
shuffle: true
|
| 13 |
+
batch_size: 32
|
| 14 |
+
num_workers: 16
|
| 15 |
+
pin_memory: true
|
| 16 |
+
drop_last: true
|
| 17 |
+
persistent_workers: true
|
| 18 |
+
backbone:
|
| 19 |
+
target: echosyn.reindentification.model.ResNet18
|
| 20 |
+
args:
|
| 21 |
+
weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
|
| 22 |
+
progress: false
|
| 23 |
+
model:
|
| 24 |
+
target: echosyn.reindentification.model.ContrastiveModel
|
| 25 |
+
args:
|
| 26 |
+
in_channels: 4
|
| 27 |
+
out_channels: 256
|
| 28 |
+
kl_loss_weight: 0.0
|
| 29 |
+
optimizer:
|
| 30 |
+
target: torch.optim.AdamW
|
| 31 |
+
args:
|
| 32 |
+
lr: 0.0001
|
| 33 |
+
betas:
|
| 34 |
+
- 0.9
|
| 35 |
+
- 0.999
|
| 36 |
+
weight_decay: 0.01
|
| 37 |
+
eps: 1.0e-08
|
| 38 |
+
scheduler:
|
| 39 |
+
target: torch.optim.lr_scheduler.ConstantLR
|
| 40 |
+
args:
|
| 41 |
+
factor: 1.0
|
| 42 |
+
vae:
|
| 43 |
+
target: diffusers.AutoencoderKL
|
| 44 |
+
pretrained: vae/avae-4f4
|
| 45 |
+
max_train_steps: 60000
|
| 46 |
+
gradient_accumulation_steps: 1
|
| 47 |
+
mixed_precision: bf16
|
| 48 |
+
max_grad_norm: 10.0
|
| 49 |
+
sample_latents: true
|
| 50 |
+
validation_steps: 10000
|
| 51 |
+
validation_samples: 99999
|
| 52 |
+
output_dir: experiments/${wandb_args.group}/${wandb_args.name}
|
| 53 |
+
logging_dir: logs
|
| 54 |
+
report_to: wandb
|
| 55 |
+
wandb_args:
|
| 56 |
+
project: EchoFlow
|
| 57 |
+
name: ped_psax_4f4
|
| 58 |
+
group: reindentification
|
| 59 |
+
checkpointing_steps: 10000
|
| 60 |
+
checkpoints_total_limit: 3
|
| 61 |
+
resume_from_checkpoint: null
|
| 62 |
+
seed: 42
|
| 63 |
+
no_wandb: false
|
| 64 |
+
num_train_epochs: 541
|
vae/avae-16f8/config.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
base_learning_rate: 2e-6 # ~5e-4 after scaling
|
| 3 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
| 4 |
+
params:
|
| 5 |
+
monitor: "val/rec_loss"
|
| 6 |
+
embed_dim: 16
|
| 7 |
+
lossconfig:
|
| 8 |
+
target: ldm.modules.losses.LPIPSWithDiscriminator
|
| 9 |
+
params:
|
| 10 |
+
disc_start: 50001
|
| 11 |
+
kl_weight: 0.000001
|
| 12 |
+
disc_weight: 0.5
|
| 13 |
+
|
| 14 |
+
ddconfig:
|
| 15 |
+
double_z: True
|
| 16 |
+
z_channels: 16
|
| 17 |
+
resolution: 112
|
| 18 |
+
in_channels: 3
|
| 19 |
+
out_ch: 3
|
| 20 |
+
ch: 128
|
| 21 |
+
ch_mult: [ 1,2,2,4 ] # num_down = len(ch_mult)-1
|
| 22 |
+
num_res_blocks: 2
|
| 23 |
+
attn_resolutions: [ ]
|
| 24 |
+
dropout: 0.0
|
| 25 |
+
|
| 26 |
+
data:
|
| 27 |
+
target: main.DataModuleFromConfig
|
| 28 |
+
params:
|
| 29 |
+
batch_size: 32
|
| 30 |
+
num_workers: 16
|
| 31 |
+
train:
|
| 32 |
+
target: taming.data.custom.CustomTrain
|
| 33 |
+
params:
|
| 34 |
+
training_images_list_file: ${oc.env:TMPDIR}/train.txt
|
| 35 |
+
size: 112
|
| 36 |
+
validation:
|
| 37 |
+
target: taming.data.custom.CustomTest
|
| 38 |
+
params:
|
| 39 |
+
test_images_list_file: ${oc.env:TMPDIR}/val.txt
|
| 40 |
+
size: 112
|
| 41 |
+
|
| 42 |
+
lightning:
|
| 43 |
+
callbacks:
|
| 44 |
+
image_logger:
|
| 45 |
+
target: main.ImageLogger
|
| 46 |
+
params:
|
| 47 |
+
batch_frequency: 1000
|
| 48 |
+
max_images: 8
|
| 49 |
+
increase_log_steps: True
|
| 50 |
+
|
| 51 |
+
trainer:
|
| 52 |
+
benchmark: True
|
| 53 |
+
accumulate_grad_batches: 2
|
| 54 |
+
max_epochs: 1000
|
vae/avae-4f4/config.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
base_learning_rate: 2e-6 # ~5e-4 after scaling
|
| 3 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
| 4 |
+
params:
|
| 5 |
+
monitor: "val/rec_loss"
|
| 6 |
+
embed_dim: 4
|
| 7 |
+
lossconfig:
|
| 8 |
+
target: ldm.modules.losses.LPIPSWithDiscriminator
|
| 9 |
+
params:
|
| 10 |
+
disc_start: 50001
|
| 11 |
+
kl_weight: 0.000001
|
| 12 |
+
disc_weight: 0.5
|
| 13 |
+
|
| 14 |
+
ddconfig:
|
| 15 |
+
double_z: True
|
| 16 |
+
z_channels: 4
|
| 17 |
+
resolution: 112
|
| 18 |
+
in_channels: 3
|
| 19 |
+
out_ch: 3
|
| 20 |
+
ch: 128
|
| 21 |
+
ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
|
| 22 |
+
num_res_blocks: 2
|
| 23 |
+
attn_resolutions: [ ]
|
| 24 |
+
dropout: 0.0
|
| 25 |
+
|
| 26 |
+
data:
|
| 27 |
+
target: main.DataModuleFromConfig
|
| 28 |
+
params:
|
| 29 |
+
batch_size: 32
|
| 30 |
+
num_workers: 16
|
| 31 |
+
train:
|
| 32 |
+
target: taming.data.custom.CustomTrain
|
| 33 |
+
params:
|
| 34 |
+
training_images_list_file: ${oc.env:TMPDIR}/train.txt
|
| 35 |
+
size: 112
|
| 36 |
+
validation:
|
| 37 |
+
target: taming.data.custom.CustomTest
|
| 38 |
+
params:
|
| 39 |
+
test_images_list_file: ${oc.env:TMPDIR}/val.txt
|
| 40 |
+
size: 112
|
| 41 |
+
|
| 42 |
+
lightning:
|
| 43 |
+
callbacks:
|
| 44 |
+
image_logger:
|
| 45 |
+
target: main.ImageLogger
|
| 46 |
+
params:
|
| 47 |
+
batch_frequency: 1000
|
| 48 |
+
max_images: 8
|
| 49 |
+
increase_log_steps: True
|
| 50 |
+
|
| 51 |
+
trainer:
|
| 52 |
+
benchmark: True
|
| 53 |
+
accumulate_grad_batches: 2
|
| 54 |
+
max_epochs: 1000
|
vae/avae-4f8/config.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
base_learning_rate: 2e-6 # ~5e-4 after scaling
|
| 3 |
+
target: ldm.models.autoencoder.AutoencoderKL
|
| 4 |
+
params:
|
| 5 |
+
monitor: "val/rec_loss"
|
| 6 |
+
embed_dim: 4
|
| 7 |
+
lossconfig:
|
| 8 |
+
target: ldm.modules.losses.LPIPSWithDiscriminator
|
| 9 |
+
params:
|
| 10 |
+
disc_start: 50001
|
| 11 |
+
kl_weight: 0.000001
|
| 12 |
+
disc_weight: 0.5
|
| 13 |
+
|
| 14 |
+
ddconfig:
|
| 15 |
+
double_z: True
|
| 16 |
+
z_channels: 4
|
| 17 |
+
resolution: 112
|
| 18 |
+
in_channels: 3
|
| 19 |
+
out_ch: 3
|
| 20 |
+
ch: 128
|
| 21 |
+
ch_mult: [ 1,2,2,4 ] # num_down = len(ch_mult)-1
|
| 22 |
+
num_res_blocks: 2
|
| 23 |
+
attn_resolutions: [ ]
|
| 24 |
+
dropout: 0.0
|
| 25 |
+
|
| 26 |
+
data:
|
| 27 |
+
target: main.DataModuleFromConfig
|
| 28 |
+
params:
|
| 29 |
+
batch_size: 32
|
| 30 |
+
num_workers: 16
|
| 31 |
+
train:
|
| 32 |
+
target: taming.data.custom.CustomTrain
|
| 33 |
+
params:
|
| 34 |
+
training_images_list_file: ${oc.env:TMPDIR}/train.txt
|
| 35 |
+
size: 112
|
| 36 |
+
validation:
|
| 37 |
+
target: taming.data.custom.CustomTest
|
| 38 |
+
params:
|
| 39 |
+
test_images_list_file: ${oc.env:TMPDIR}/val.txt
|
| 40 |
+
size: 112
|
| 41 |
+
|
| 42 |
+
lightning:
|
| 43 |
+
callbacks:
|
| 44 |
+
image_logger:
|
| 45 |
+
target: main.ImageLogger
|
| 46 |
+
params:
|
| 47 |
+
batch_frequency: 1000
|
| 48 |
+
max_images: 8
|
| 49 |
+
increase_log_steps: True
|
| 50 |
+
|
| 51 |
+
trainer:
|
| 52 |
+
benchmark: True
|
| 53 |
+
accumulate_grad_batches: 2
|
| 54 |
+
max_epochs: 1000
|