HReynaud committed on
Commit
514f603
·
1 Parent(s): 8f6cdca

training configs

Browse files
lifm/FMiT-B2-16f8/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 12
14
+ attention_head_dim: 64
15
+ in_channels: 17
16
+ out_channels: 16
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-16f8
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-16f8/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-16f8/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-16f8/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-16f8/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-B2-16f8
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-B2-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 12
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: 28
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-B2-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-B4-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 12
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: 28
22
+ patch_size: 4
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-B4-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-L2-16f8/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 16
14
+ attention_head_dim: 64
15
+ in_channels: 17
16
+ out_channels: 16
17
+ num_layers: 24
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-16f8
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-16f8/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-16f8/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-16f8/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-16f8/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-L2-16f8
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-L2-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 16
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 24
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-L2-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-L4-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 16
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 24
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 4
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-L4-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-S2-16f8/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 6
14
+ attention_head_dim: 64
15
+ in_channels: 17
16
+ out_channels: 16
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-16f8
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-16f8/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-16f8/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-16f8/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-16f8/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-S2-16f8
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-S2-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 6
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 2
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-S2-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/FMiT-S4-4f4/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegDiTTransformer2DModel
12
+ args:
13
+ num_attention_heads: 6
14
+ attention_head_dim: 64
15
+ in_channels: 5
16
+ out_channels: 4
17
+ num_layers: 12
18
+ dropout: 0.0
19
+ norm_num_groups: 32
20
+ attention_bias: true
21
+ sample_size: ${globals.latent_res}
22
+ patch_size: 4
23
+ activation_fn: gelu-approximate
24
+ num_embeds_ada_norm: 1000
25
+ upcast_attention: false
26
+ norm_type: ada_norm_zero
27
+ norm_elementwise_affine: false
28
+ norm_eps: 1.0e-05
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 5.0e-05
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
40
+ args:
41
+ warmup_steps: 5000
42
+ ref_steps: ${max_train_steps}
43
+ eta_min: 1.0e-06
44
+ decay_rate: 2
45
+ vae:
46
+ target: diffusers.AutoencoderKL
47
+ pretrained: vae/avae-4f4
48
+ datasets:
49
+ - name: LatentSeg
50
+ active: true
51
+ params:
52
+ root: avae-4f4/dynamic
53
+ outputs: ${globals.outputs}
54
+ target_fps: ${globals.target_fps}
55
+ view_label: A4C
56
+ target_nframes: ${globals.target_nframes}
57
+ latent_channels: ${globals.latent_channels}
58
+ segmentation_root: segmentations/dynamic
59
+ target_resolution: ${globals.latent_res}
60
+ - name: LatentSeg
61
+ active: true
62
+ params:
63
+ root: avae-4f4/ped_a4c
64
+ outputs: ${globals.outputs}
65
+ target_fps: ${globals.target_fps}
66
+ view_label: A4C
67
+ target_nframes: ${globals.target_nframes}
68
+ latent_channels: ${globals.latent_channels}
69
+ segmentation_root: segmentations/ped_a4c
70
+ target_resolution: ${globals.latent_res}
71
+ - name: LatentSeg
72
+ active: true
73
+ params:
74
+ root: avae-4f4/ped_psax
75
+ outputs: ${globals.outputs}
76
+ target_fps: ${globals.target_fps}
77
+ view_label: PSAX
78
+ target_nframes: ${globals.target_nframes}
79
+ latent_channels: ${globals.latent_channels}
80
+ segmentation_root: segmentations/ped_psax
81
+ target_resolution: ${globals.latent_res}
82
+ - name: LatentSeg
83
+ active: true
84
+ params:
85
+ root: avae-4f4/lvh
86
+ outputs: ${globals.outputs}
87
+ target_fps: ${globals.target_fps}
88
+ view_label: PLAX
89
+ target_nframes: ${globals.target_nframes}
90
+ latent_channels: ${globals.latent_channels}
91
+ segmentation_root: no_seg
92
+ target_resolution: ${globals.latent_res}
93
+ dataloader:
94
+ target: torch.utils.data.DataLoader
95
+ args:
96
+ shuffle: true
97
+ batch_size: 128
98
+ num_workers: 16
99
+ pin_memory: true
100
+ drop_last: true
101
+ persistent_workers: true
102
+ max_train_steps: 1000000
103
+ gradient_accumulation_steps: 1
104
+ mixed_precision: fp16
105
+ use_ema: true
106
+ noise_offset: 0.1
107
+ max_grad_norm: 0.1
108
+ max_grad_value: -1
109
+ pad_latents: false
110
+ sample_latents: true
111
+ output_dir: experiments/${wandb_args.name}
112
+ logging_dir: logs
113
+ report_to: wandb
114
+ wandb_args:
115
+ project: EchoFlow
116
+ name: FMiT-S4-4f4
117
+ group: FMiT
118
+ checkpointing_steps: 10000
119
+ checkpoints_to_keep:
120
+ - 50000
121
+ - 100000
122
+ - 200000
123
+ - 500000
124
+ - 1000000
125
+ resume_from_checkpoint: latest
126
+ validation:
127
+ samples: 4
128
+ steps: 5000
129
+ method: euler
130
+ timesteps: 25
131
+ seed: 42
132
+ num_train_epochs: 45455
lifm/UNet-B-16f8/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 17
15
+ out_channels: 16
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 160
32
+ - 320
33
+ - 480
34
+ - 640
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-16f8
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-16f8/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-16f8/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-16f8/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-16f8/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-B-16f8
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lifm/UNet-B-4f4/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 5
15
+ out_channels: 4
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 160
32
+ - 320
33
+ - 480
34
+ - 640
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-4f4
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-4f4/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-4f4/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-4f4/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-4f4/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-B-4f4
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lifm/UNet-L-16f8/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 17
15
+ out_channels: 16
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 320
32
+ - 640
33
+ - 960
34
+ - 1280
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-16f8
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-16f8/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-16f8/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-16f8/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-16f8/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-L-16f8
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lifm/UNet-L-4f4/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 5
15
+ out_channels: 4
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 320
32
+ - 640
33
+ - 960
34
+ - 1280
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-4f4
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-4f4/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-4f4/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-4f4/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-4f4/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-L-4f4
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lifm/UNet-S-16f8/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 14
9
+ latent_channels: 16
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 17
15
+ out_channels: 16
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 96
32
+ - 192
33
+ - 288
34
+ - 384
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-16f8
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-16f8/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-16f8/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-16f8/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-16f8/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-S-16f8
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lifm/UNet-S-4f4/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: original
3
+ target_nframes: 64
4
+ outputs:
5
+ - image
6
+ - view
7
+ resolution: 112
8
+ latent_res: 28
9
+ latent_channels: 4
10
+ denoiser:
11
+ target: echosyn.common.models.SegUnet2DModel
12
+ args:
13
+ sample_size: 28
14
+ in_channels: 5
15
+ out_channels: 4
16
+ center_input_sample: false
17
+ time_embedding_type: positional
18
+ freq_shift: 0
19
+ flip_sin_to_cos: true
20
+ down_block_types:
21
+ - AttnDownBlock2D
22
+ - AttnDownBlock2D
23
+ - AttnDownBlock2D
24
+ - DownBlock2D
25
+ up_block_types:
26
+ - UpBlock2D
27
+ - AttnUpBlock2D
28
+ - AttnUpBlock2D
29
+ - AttnUpBlock2D
30
+ block_out_channels:
31
+ - 96
32
+ - 192
33
+ - 288
34
+ - 384
35
+ layers_per_block: 2
36
+ mid_block_scale_factor: 1
37
+ downsample_padding: 1
38
+ downsample_type: resnet
39
+ upsample_type: resnet
40
+ dropout: 0.0
41
+ act_fn: silu
42
+ attention_head_dim: 8
43
+ norm_num_groups: 32
44
+ attn_norm_num_groups: null
45
+ norm_eps: 1.0e-05
46
+ resnet_time_scale_shift: default
47
+ class_embed_type: timestep
48
+ num_class_embeds: null
49
+ optimizer:
50
+ target: torch.optim.AdamW
51
+ args:
52
+ lr: 5.0e-05
53
+ betas:
54
+ - 0.9
55
+ - 0.999
56
+ weight_decay: 0.01
57
+ eps: 1.0e-08
58
+ scheduler:
59
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
60
+ args:
61
+ warmup_steps: 5000
62
+ ref_steps: ${max_train_steps}
63
+ eta_min: 1.0e-06
64
+ decay_rate: 2
65
+ vae:
66
+ target: diffusers.AutoencoderKL
67
+ pretrained: vae/avae-4f4
68
+ datasets:
69
+ - name: LatentSeg
70
+ active: true
71
+ params:
72
+ root: avae-4f4/dynamic
73
+ outputs: ${globals.outputs}
74
+ target_fps: ${globals.target_fps}
75
+ view_label: A4C
76
+ target_nframes: ${globals.target_nframes}
77
+ latent_channels: ${globals.latent_channels}
78
+ segmentation_root: segmentations/dynamic
79
+ target_resolution: ${globals.latent_res}
80
+ - name: LatentSeg
81
+ active: true
82
+ params:
83
+ root: avae-4f4/ped_a4c
84
+ outputs: ${globals.outputs}
85
+ target_fps: ${globals.target_fps}
86
+ view_label: A4C
87
+ target_nframes: ${globals.target_nframes}
88
+ latent_channels: ${globals.latent_channels}
89
+ segmentation_root: segmentations/ped_a4c
90
+ target_resolution: ${globals.latent_res}
91
+ - name: LatentSeg
92
+ active: true
93
+ params:
94
+ root: avae-4f4/ped_psax
95
+ outputs: ${globals.outputs}
96
+ target_fps: ${globals.target_fps}
97
+ view_label: PSAX
98
+ target_nframes: ${globals.target_nframes}
99
+ latent_channels: ${globals.latent_channels}
100
+ segmentation_root: segmentations/ped_psax
101
+ target_resolution: ${globals.latent_res}
102
+ - name: LatentSeg
103
+ active: true
104
+ params:
105
+ root: avae-4f4/lvh
106
+ outputs: ${globals.outputs}
107
+ target_fps: ${globals.target_fps}
108
+ view_label: PLAX
109
+ target_nframes: ${globals.target_nframes}
110
+ latent_channels: ${globals.latent_channels}
111
+ segmentation_root: no_seg
112
+ target_resolution: ${globals.latent_res}
113
+ dataloader:
114
+ target: torch.utils.data.DataLoader
115
+ args:
116
+ shuffle: true
117
+ batch_size: 128
118
+ num_workers: 16
119
+ pin_memory: true
120
+ drop_last: true
121
+ persistent_workers: true
122
+ max_train_steps: 1000000
123
+ gradient_accumulation_steps: 1
124
+ mixed_precision: bf16
125
+ use_ema: true
126
+ noise_offset: 0.1
127
+ max_grad_norm: 1.0
128
+ max_grad_value: -1
129
+ pad_latents: false
130
+ sample_latents: true
131
+ output_dir: experiments/${wandb_args.name}
132
+ logging_dir: logs
133
+ report_to: wandb
134
+ wandb_args:
135
+ project: EchoFlow
136
+ name: UNet-S-4f4
137
+ group: UNet
138
+ checkpointing_steps: 10000
139
+ checkpoints_to_keep:
140
+ - 50000
141
+ - 100000
142
+ - 200000
143
+ - 500000
144
+ - 1000000
145
+ resume_from_checkpoint: latest
146
+ validation:
147
+ samples: 4
148
+ steps: 5000
149
+ method: euler
150
+ timesteps: 25
151
+ seed: 42
152
+ num_train_epochs: 45455
lvfm/FMvT-S2-16f8/config.yaml ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+ outputs:
5
+ - video
6
+ - lvef
7
+ - image
8
+ resolution: 112
9
+ latent_res: 14
10
+ latent_channels: 16
11
+ denoiser:
12
+ target: echosyn.common.models.DiffuserSTDiT
13
+ args:
14
+ input_size:
15
+ - ${globals.target_nframes}
16
+ - ${globals.latent_res}
17
+ - ${globals.latent_res}
18
+ in_channels: 32
19
+ out_channels: ${globals.latent_channels}
20
+ patch_size:
21
+ - 1
22
+ - 2
23
+ - 2
24
+ hidden_size: 384
25
+ depth: 12
26
+ num_heads: 6
27
+ mlp_ratio: 4.0
28
+ class_dropout_prob: 0.0
29
+ drop_path: 0.0
30
+ no_temporal_pos_emb: false
31
+ caption_channels: 1
32
+ model_max_length: 1
33
+ space_scale: 1.0
34
+ time_scale: 1.0
35
+ enable_flashattn: false
36
+ optimizer:
37
+ target: torch.optim.AdamW
38
+ args:
39
+ lr: 0.0001
40
+ betas:
41
+ - 0.9
42
+ - 0.999
43
+ weight_decay: 0.01
44
+ eps: 1.0e-08
45
+ scheduler:
46
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
47
+ args:
48
+ warmup_steps: 2000
49
+ ref_steps: ${max_train_steps}
50
+ eta_min: 1.0e-06
51
+ decay_rate: 2.0
52
+ vae:
53
+ target: diffusers.AutoencoderKL
54
+ pretrained: vae/avae-16f8
55
+ datasets:
56
+ - name: Latent
57
+ active: true
58
+ params:
59
+ root: avae-16f8/dynamic
60
+ target_fps: ${globals.target_fps}
61
+ target_nframes: ${globals.target_nframes}
62
+ target_resolution: ${globals.latent_res}
63
+ outputs: ${globals.outputs}
64
+ latent_channels: ${globals.latent_channels}
65
+ - name: Latent
66
+ active: true
67
+ params:
68
+ root: avae-16f8/ped_a4c
69
+ target_fps: ${globals.target_fps}
70
+ target_nframes: ${globals.target_nframes}
71
+ target_resolution: ${globals.latent_res}
72
+ outputs: ${globals.outputs}
73
+ latent_channels: ${globals.latent_channels}
74
+ - name: Latent
75
+ active: true
76
+ params:
77
+ root: avae-16f8/ped_psax
78
+ target_fps: ${globals.target_fps}
79
+ target_nframes: ${globals.target_nframes}
80
+ target_resolution: ${globals.latent_res}
81
+ outputs: ${globals.outputs}
82
+ latent_channels: ${globals.latent_channels}
83
+ - name: Latent
84
+ active: true
85
+ params:
86
+ root: avae-16f8/lvh
87
+ target_fps: ${globals.target_fps}
88
+ target_nframes: ${globals.target_nframes}
89
+ target_resolution: ${globals.latent_res}
90
+ outputs: ${globals.outputs}
91
+ latent_channels: ${globals.latent_channels}
92
+ dataloader:
93
+ target: torch.utils.data.DataLoader
94
+ args:
95
+ shuffle: true
96
+ batch_size: 64
97
+ num_workers: 64
98
+ pin_memory: true
99
+ drop_last: true
100
+ persistent_workers: true
101
+ max_train_steps: 1000000
102
+ gradient_accumulation_steps: 1
103
+ mixed_precision: bf16
104
+ use_ema: true
105
+ max_grad_norm: 1.0
106
+ max_grad_value: -1
107
+ sample_latents: true
108
+ noise_offset: 0.05
109
+ noise_cond_image: 0.05
110
+ no_conditionning: false
111
+ p_drop_conditionning: 0.1
112
+ output_dir: experiments/${wandb_args.name}
113
+ logging_dir: logs
114
+ report_to: wandb
115
+ wandb_args:
116
+ project: EchoFlow
117
+ name: FMvT-S2-16f8
118
+ group: FMvT
119
+ checkpointing_steps: 10000
120
+ checkpoints_to_keep:
121
+ - 50000
122
+ - 100000
123
+ - 200000
124
+ - 300000
125
+ - 500000
126
+ - 1000000
127
+ resume_from_checkpoint: latest
128
+ validation:
129
+ samples: 4
130
+ steps: 5000
131
+ timesteps: 25
132
+ frames: ${globals.target_nframes}
133
+ fps: ${globals.target_fps}
134
+ lvefs:
135
+ - -1.0
136
+ - 0.3
137
+ - 0.6
138
+ - 0.9
139
+ cond_image_mask:
140
+ - 0
141
+ - 1
142
+ - 1
143
+ - 1
144
+ seed: 42
145
+ num_train_epochs: 28572
lvfm/FMvT-S2-4f4/config.yaml ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+ outputs:
5
+ - video
6
+ - lvef
7
+ - image
8
+ resolution: 112
9
+ latent_res: 28
10
+ latent_channels: 4
11
+ denoiser:
12
+ target: echosyn.common.models.DiffuserSTDiT
13
+ args:
14
+ input_size:
15
+ - ${globals.target_nframes}
16
+ - ${globals.latent_res}
17
+ - ${globals.latent_res}
18
+ in_channels: 8
19
+ out_channels: ${globals.latent_channels}
20
+ patch_size:
21
+ - 1
22
+ - 2
23
+ - 2
24
+ hidden_size: 384
25
+ depth: 12
26
+ num_heads: 6
27
+ mlp_ratio: 4.0
28
+ class_dropout_prob: 0.0
29
+ drop_path: 0.0
30
+ no_temporal_pos_emb: false
31
+ caption_channels: 1
32
+ model_max_length: 1
33
+ space_scale: 1.0
34
+ time_scale: 1.0
35
+ enable_flashattn: false
36
+ optimizer:
37
+ target: torch.optim.AdamW
38
+ args:
39
+ lr: 0.0001
40
+ betas:
41
+ - 0.9
42
+ - 0.999
43
+ weight_decay: 0.01
44
+ eps: 1.0e-08
45
+ scheduler:
46
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
47
+ args:
48
+ warmup_steps: 2000
49
+ ref_steps: ${max_train_steps}
50
+ eta_min: 1.0e-06
51
+ decay_rate: 2.0
52
+ vae:
53
+ target: diffusers.AutoencoderKL
54
+ pretrained: vae/avae-4f4
55
+ datasets:
56
+ - name: Latent
57
+ active: true
58
+ params:
59
+ root: avae-4f4/dynamic
60
+ target_fps: ${globals.target_fps}
61
+ target_nframes: ${globals.target_nframes}
62
+ target_resolution: ${globals.latent_res}
63
+ outputs: ${globals.outputs}
64
+ latent_channels: ${globals.latent_channels}
65
+ - name: Latent
66
+ active: true
67
+ params:
68
+ root: avae-4f4/ped_a4c
69
+ target_fps: ${globals.target_fps}
70
+ target_nframes: ${globals.target_nframes}
71
+ target_resolution: ${globals.latent_res}
72
+ outputs: ${globals.outputs}
73
+ latent_channels: ${globals.latent_channels}
74
+ - name: Latent
75
+ active: true
76
+ params:
77
+ root: avae-4f4/ped_psax
78
+ target_fps: ${globals.target_fps}
79
+ target_nframes: ${globals.target_nframes}
80
+ target_resolution: ${globals.latent_res}
81
+ outputs: ${globals.outputs}
82
+ latent_channels: ${globals.latent_channels}
83
+ - name: Latent
84
+ active: true
85
+ params:
86
+ root: avae-4f4/lvh
87
+ target_fps: ${globals.target_fps}
88
+ target_nframes: ${globals.target_nframes}
89
+ target_resolution: ${globals.latent_res}
90
+ outputs: ${globals.outputs}
91
+ latent_channels: ${globals.latent_channels}
92
+ dataloader:
93
+ target: torch.utils.data.DataLoader
94
+ args:
95
+ shuffle: true
96
+ batch_size: 16
97
+ num_workers: 16
98
+ pin_memory: true
99
+ drop_last: true
100
+ persistent_workers: true
101
+ max_train_steps: 1000000
102
+ gradient_accumulation_steps: 1
103
+ mixed_precision: bf16
104
+ use_ema: true
105
+ max_grad_norm: 1.0
106
+ max_grad_value: -1
107
+ sample_latents: true
108
+ noise_offset: 0.05
109
+ noise_cond_image: 0.05
110
+ no_conditionning: false
111
+ p_drop_conditionning: 0.3
112
+ output_dir: experiments/${wandb_args.name}
113
+ logging_dir: logs
114
+ report_to: wandb
115
+ wandb_args:
116
+ project: EchoFlow
117
+ name: FMvT-S2-4f4
118
+ group: FMvT
119
+ checkpointing_steps: 10000
120
+ checkpoints_to_keep:
121
+ - 50000
122
+ - 100000
123
+ - 200000
124
+ - 300000
125
+ - 500000
126
+ - 1000000
127
+ resume_from_checkpoint: latest
128
+ validation:
129
+ samples: 4
130
+ steps: 5000
131
+ timesteps: 25
132
+ frames: ${globals.target_nframes}
133
+ fps: ${globals.target_fps}
134
+ lvefs:
135
+ - -1.0
136
+ - 0.3
137
+ - 0.6
138
+ - 0.9
139
+ cond_image_mask:
140
+ - 0
141
+ - 1
142
+ - 1
143
+ - 1
144
+ seed: 42
145
+ num_train_epochs: 28572
lvfm/FMvT-S4-4f4/config.yaml ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+ outputs:
5
+ - video
6
+ - lvef
7
+ - image
8
+ resolution: 112
9
+ latent_res: 28
10
+ latent_channels: 4
11
+ denoiser:
12
+ target: echosyn.common.models.DiffuserSTDiT
13
+ args:
14
+ input_size:
15
+ - ${globals.target_nframes}
16
+ - ${globals.latent_res}
17
+ - ${globals.latent_res}
18
+ in_channels: 8
19
+ out_channels: ${globals.latent_channels}
20
+ patch_size:
21
+ - 1
22
+ - 4
23
+ - 4
24
+ hidden_size: 384
25
+ depth: 12
26
+ num_heads: 6
27
+ mlp_ratio: 4.0
28
+ class_dropout_prob: 0.0
29
+ drop_path: 0.0
30
+ no_temporal_pos_emb: false
31
+ caption_channels: 1
32
+ model_max_length: 1
33
+ space_scale: 1.0
34
+ time_scale: 1.0
35
+ enable_flashattn: false
36
+ optimizer:
37
+ target: torch.optim.AdamW
38
+ args:
39
+ lr: 0.0001
40
+ betas:
41
+ - 0.9
42
+ - 0.999
43
+ weight_decay: 0.01
44
+ eps: 1.0e-08
45
+ scheduler:
46
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
47
+ args:
48
+ warmup_steps: 2000
49
+ ref_steps: ${max_train_steps}
50
+ eta_min: 1.0e-06
51
+ decay_rate: 2.0
52
+ vae:
53
+ target: diffusers.AutoencoderKL
54
+ pretrained: vae/avae-4f4
55
+ datasets:
56
+ - name: Latent
57
+ active: true
58
+ params:
59
+ root: avae-4f4/dynamic
60
+ target_fps: ${globals.target_fps}
61
+ target_nframes: ${globals.target_nframes}
62
+ target_resolution: ${globals.latent_res}
63
+ outputs: ${globals.outputs}
64
+ latent_channels: ${globals.latent_channels}
65
+ - name: Latent
66
+ active: true
67
+ params:
68
+ root: avae-4f4/ped_a4c
69
+ target_fps: ${globals.target_fps}
70
+ target_nframes: ${globals.target_nframes}
71
+ target_resolution: ${globals.latent_res}
72
+ outputs: ${globals.outputs}
73
+ latent_channels: ${globals.latent_channels}
74
+ - name: Latent
75
+ active: true
76
+ params:
77
+ root: avae-4f4/ped_psax
78
+ target_fps: ${globals.target_fps}
79
+ target_nframes: ${globals.target_nframes}
80
+ target_resolution: ${globals.latent_res}
81
+ outputs: ${globals.outputs}
82
+ latent_channels: ${globals.latent_channels}
83
+ - name: Latent
84
+ active: true
85
+ params:
86
+ root: avae-4f4/lvh
87
+ target_fps: ${globals.target_fps}
88
+ target_nframes: ${globals.target_nframes}
89
+ target_resolution: ${globals.latent_res}
90
+ outputs: ${globals.outputs}
91
+ latent_channels: ${globals.latent_channels}
92
+ dataloader:
93
+ target: torch.utils.data.DataLoader
94
+ args:
95
+ shuffle: true
96
+ batch_size: 64
97
+ num_workers: 64
98
+ pin_memory: true
99
+ drop_last: true
100
+ persistent_workers: true
101
+ max_train_steps: 1000000
102
+ gradient_accumulation_steps: 1
103
+ mixed_precision: bf16
104
+ use_ema: true
105
+ max_grad_norm: 1.0
106
+ max_grad_value: -1
107
+ sample_latents: true
108
+ noise_offset: 0.05
109
+ noise_cond_image: 0.05
110
+ no_conditionning: false
111
+ p_drop_conditionning: 0.3
112
+ output_dir: experiments/${wandb_args.name}
113
+ logging_dir: logs
114
+ report_to: wandb
115
+ wandb_args:
116
+ project: EchoFlow
117
+ name: FMvT-S4-4f4
118
+ group: FMvT
119
+ checkpointing_steps: 10000
120
+ checkpoints_to_keep:
121
+ - 50000
122
+ - 100000
123
+ - 200000
124
+ - 300000
125
+ - 500000
126
+ - 1000000
127
+ resume_from_checkpoint: latest
128
+ validation:
129
+ samples: 4
130
+ steps: 5000
131
+ timesteps: 25
132
+ frames: ${globals.target_nframes}
133
+ fps: ${globals.target_fps}
134
+ lvefs:
135
+ - -1.0
136
+ - 0.3
137
+ - 0.6
138
+ - 0.9
139
+ cond_image_mask:
140
+ - 0
141
+ - 1
142
+ - 1
143
+ - 1
144
+ seed: 42
145
+ num_train_epochs: 28572
lvfm/STUNet-S-16f8/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+ outputs:
5
+ - video
6
+ - lvef
7
+ - image
8
+ resolution: 112
9
+ latent_res: 14
10
+ latent_channels: 16
11
+ denoiser:
12
+ target: echosyn.common.models.UNetSTIC
13
+ args:
14
+ in_channels: 32
15
+ out_channels: ${globals.latent_channels}
16
+ sample_size: ${globals.latent_res}
17
+ addition_time_embed_dim: 1
18
+ block_out_channels:
19
+ - 64
20
+ - 128
21
+ - 192
22
+ - 256
23
+ cross_attention_dim: 1
24
+ down_block_types:
25
+ - CrossAttnDownBlockSpatioTemporal
26
+ - CrossAttnDownBlockSpatioTemporal
27
+ - CrossAttnDownBlockSpatioTemporal
28
+ - DownBlockSpatioTemporal
29
+ layers_per_block: 2
30
+ num_attention_heads:
31
+ - 8
32
+ - 16
33
+ - 16
34
+ - 32
35
+ num_frames: 64
36
+ projection_class_embeddings_input_dim: 1
37
+ transformer_layers_per_block: 1
38
+ up_block_types:
39
+ - UpBlockSpatioTemporal
40
+ - CrossAttnUpBlockSpatioTemporal
41
+ - CrossAttnUpBlockSpatioTemporal
42
+ - CrossAttnUpBlockSpatioTemporal
43
+ optimizer:
44
+ target: torch.optim.AdamW
45
+ args:
46
+ lr: 0.0001
47
+ betas:
48
+ - 0.9
49
+ - 0.999
50
+ weight_decay: 0.01
51
+ eps: 1.0e-08
52
+ scheduler:
53
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
54
+ args:
55
+ warmup_steps: 2000
56
+ ref_steps: ${max_train_steps}
57
+ eta_min: 1.0e-06
58
+ decay_rate: 2.0
59
+ vae:
60
+ target: diffusers.AutoencoderKL
61
+ pretrained: vae/avae-16f8
62
+ datasets:
63
+ - name: Latent
64
+ active: true
65
+ params:
66
+ root: avae-16f8/dynamic
67
+ target_fps: ${globals.target_fps}
68
+ target_nframes: ${globals.target_nframes}
69
+ target_resolution: ${globals.latent_res}
70
+ outputs: ${globals.outputs}
71
+ latent_channels: ${globals.latent_channels}
72
+ - name: Latent
73
+ active: true
74
+ params:
75
+ root: avae-16f8/ped_a4c
76
+ target_fps: ${globals.target_fps}
77
+ target_nframes: ${globals.target_nframes}
78
+ target_resolution: ${globals.latent_res}
79
+ outputs: ${globals.outputs}
80
+ latent_channels: ${globals.latent_channels}
81
+ - name: Latent
82
+ active: true
83
+ params:
84
+ root: avae-16f8/ped_psax
85
+ target_fps: ${globals.target_fps}
86
+ target_nframes: ${globals.target_nframes}
87
+ target_resolution: ${globals.latent_res}
88
+ outputs: ${globals.outputs}
89
+ latent_channels: ${globals.latent_channels}
90
+ - name: Latent
91
+ active: true
92
+ params:
93
+ root: avae-16f8/lvh
94
+ target_fps: ${globals.target_fps}
95
+ target_nframes: ${globals.target_nframes}
96
+ target_resolution: ${globals.latent_res}
97
+ outputs: ${globals.outputs}
98
+ latent_channels: ${globals.latent_channels}
99
+ dataloader:
100
+ target: torch.utils.data.DataLoader
101
+ args:
102
+ shuffle: true
103
+ batch_size: 32
104
+ num_workers: 32
105
+ pin_memory: true
106
+ drop_last: true
107
+ persistent_workers: true
108
+ max_train_steps: 1000000
109
+ gradient_accumulation_steps: 1
110
+ mixed_precision: bf16
111
+ use_ema: true
112
+ max_grad_norm: 1.0
113
+ max_grad_value: -1
114
+ sample_latents: true
115
+ noise_offset: 0.05
116
+ noise_cond_image: 0.05
117
+ no_conditionning: false
118
+ p_drop_conditionning: 0.3
119
+ output_dir: experiments/${wandb_args.name}
120
+ logging_dir: logs
121
+ report_to: wandb
122
+ wandb_args:
123
+ project: EchoFlow
124
+ name: STUNet-S-16f8
125
+ group: STUNet
126
+ checkpointing_steps: 10000
127
+ checkpoints_to_keep:
128
+ - 50000
129
+ - 100000
130
+ - 200000
131
+ - 300000
132
+ - 500000
133
+ - 1000000
134
+ resume_from_checkpoint: latest
135
+ validation:
136
+ samples: 4
137
+ steps: 5000
138
+ timesteps: 25
139
+ frames: ${globals.target_nframes}
140
+ fps: ${globals.target_fps}
141
+ lvefs:
142
+ - -1.0
143
+ - 0.3
144
+ - 0.6
145
+ - 0.9
146
+ cond_image_mask:
147
+ - 0
148
+ - 1
149
+ - 1
150
+ - 1
151
+ seed: 42
152
+ num_train_epochs: 28572
lvfm/STUNet-S-4f4/config.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ target_fps: 32
3
+ target_nframes: 64
4
+ outputs:
5
+ - video
6
+ - lvef
7
+ - image
8
+ resolution: 112
9
+ latent_res: 28
10
+ latent_channels: 4
11
+ denoiser:
12
+ target: echosyn.common.models.UNetSTIC
13
+ args:
14
+ in_channels: 8
15
+ out_channels: ${globals.latent_channels}
16
+ sample_size: ${globals.latent_res}
17
+ addition_time_embed_dim: 1
18
+ block_out_channels:
19
+ - 64
20
+ - 128
21
+ - 192
22
+ - 256
23
+ cross_attention_dim: 1
24
+ down_block_types:
25
+ - CrossAttnDownBlockSpatioTemporal
26
+ - CrossAttnDownBlockSpatioTemporal
27
+ - CrossAttnDownBlockSpatioTemporal
28
+ - DownBlockSpatioTemporal
29
+ layers_per_block: 2
30
+ num_attention_heads:
31
+ - 8
32
+ - 16
33
+ - 16
34
+ - 32
35
+ num_frames: 64
36
+ projection_class_embeddings_input_dim: 1
37
+ transformer_layers_per_block: 1
38
+ up_block_types:
39
+ - UpBlockSpatioTemporal
40
+ - CrossAttnUpBlockSpatioTemporal
41
+ - CrossAttnUpBlockSpatioTemporal
42
+ - CrossAttnUpBlockSpatioTemporal
43
+ optimizer:
44
+ target: torch.optim.AdamW
45
+ args:
46
+ lr: 0.0001
47
+ betas:
48
+ - 0.9
49
+ - 0.999
50
+ weight_decay: 0.01
51
+ eps: 1.0e-08
52
+ scheduler:
53
+ target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
54
+ args:
55
+ warmup_steps: 2000
56
+ ref_steps: ${max_train_steps}
57
+ eta_min: 1.0e-06
58
+ decay_rate: 2.0
59
+ vae:
60
+ target: diffusers.AutoencoderKL
61
+ pretrained: vae/avae-4f4
62
+ datasets:
63
+ - name: Latent
64
+ active: true
65
+ params:
66
+ root: avae-4f4/dynamic
67
+ target_fps: ${globals.target_fps}
68
+ target_nframes: ${globals.target_nframes}
69
+ target_resolution: ${globals.latent_res}
70
+ outputs: ${globals.outputs}
71
+ latent_channels: ${globals.latent_channels}
72
+ - name: Latent
73
+ active: true
74
+ params:
75
+ root: avae-4f4/ped_a4c
76
+ target_fps: ${globals.target_fps}
77
+ target_nframes: ${globals.target_nframes}
78
+ target_resolution: ${globals.latent_res}
79
+ outputs: ${globals.outputs}
80
+ latent_channels: ${globals.latent_channels}
81
+ - name: Latent
82
+ active: true
83
+ params:
84
+ root: avae-4f4/ped_psax
85
+ target_fps: ${globals.target_fps}
86
+ target_nframes: ${globals.target_nframes}
87
+ target_resolution: ${globals.latent_res}
88
+ outputs: ${globals.outputs}
89
+ latent_channels: ${globals.latent_channels}
90
+ - name: Latent
91
+ active: true
92
+ params:
93
+ root: avae-4f4/lvh
94
+ target_fps: ${globals.target_fps}
95
+ target_nframes: ${globals.target_nframes}
96
+ target_resolution: ${globals.latent_res}
97
+ outputs: ${globals.outputs}
98
+ latent_channels: ${globals.latent_channels}
99
+ dataloader:
100
+ target: torch.utils.data.DataLoader
101
+ args:
102
+ shuffle: true
103
+ batch_size: 8
104
+ num_workers: 8
105
+ pin_memory: true
106
+ drop_last: true
107
+ persistent_workers: true
108
+ max_train_steps: 1000000
109
+ gradient_accumulation_steps: 1
110
+ mixed_precision: bf16
111
+ use_ema: true
112
+ max_grad_norm: 1.0
113
+ max_grad_value: -1
114
+ sample_latents: true
115
+ noise_offset: 0.05
116
+ noise_cond_image: 0.05
117
+ no_conditionning: false
118
+ p_drop_conditionning: 0.3
119
+ output_dir: experiments/${wandb_args.name}
120
+ logging_dir: logs
121
+ report_to: wandb
122
+ wandb_args:
123
+ project: EchoFlow
124
+ name: STUNet-S-4f4
125
+ group: STUNet
126
+ checkpointing_steps: 10000
127
+ checkpoints_to_keep:
128
+ - 50000
129
+ - 100000
130
+ - 200000
131
+ - 300000
132
+ - 500000
133
+ - 1000000
134
+ resume_from_checkpoint: latest
135
+ validation:
136
+ samples: 4
137
+ steps: 5000
138
+ timesteps: 25
139
+ frames: ${globals.target_nframes}
140
+ fps: ${globals.target_fps}
141
+ lvefs:
142
+ - -1.0
143
+ - 0.3
144
+ - 0.6
145
+ - 0.9
146
+ cond_image_mask:
147
+ - 0
148
+ - 1
149
+ - 1
150
+ - 1
151
+ seed: 42
152
+ num_train_epochs: 28572
reid/dynamic-4f4/config.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ latent_channels: 4
3
+ dataset:
4
+ target: echosyn.common.datasets.ContrastivePair
5
+ args:
6
+ root: avae-4f4/dynamic
7
+ folder: Latents
8
+ extension: pt
9
+ dataloader:
10
+ target: torch.utils.data.DataLoader
11
+ args:
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 16
15
+ pin_memory: true
16
+ drop_last: true
17
+ persistent_workers: true
18
+ backbone:
19
+ target: echosyn.reindentification.model.ResNet18
20
+ args:
21
+ weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
22
+ progress: false
23
+ model:
24
+ target: echosyn.reindentification.model.ContrastiveModel
25
+ args:
26
+ in_channels: 4
27
+ out_channels: 256
28
+ kl_loss_weight: 0.0
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 0.0001
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: torch.optim.lr_scheduler.ConstantLR
40
+ args:
41
+ factor: 1.0
42
+ vae:
43
+ target: diffusers.AutoencoderKL
44
+ pretrained: vae/avae-4f4
45
+ max_train_steps: 60000
46
+ gradient_accumulation_steps: 1
47
+ mixed_precision: bf16
48
+ max_grad_norm: 10.0
49
+ sample_latents: true
50
+ validation_steps: 10000
51
+ validation_samples: 99999
52
+ output_dir: experiments/${wandb_args.group}/${wandb_args.name}
53
+ logging_dir: logs
54
+ report_to: wandb
55
+ wandb_args:
56
+ project: EchoFlow
57
+ name: dynamic_4f4
58
+ group: reindentification
59
+ checkpointing_steps: 10000
60
+ checkpoints_total_limit: 3
61
+ resume_from_checkpoint: null
62
+ seed: 42
63
+ no_wandb: false
64
+ num_train_epochs: 258
reid/lvh-4f4/config.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ latent_channels: 4
3
+ dataset:
4
+ target: echosyn.common.datasets.ContrastivePair
5
+ args:
6
+ root: avae-4f4/lvh
7
+ folder: Latents
8
+ extension: pt
9
+ dataloader:
10
+ target: torch.utils.data.DataLoader
11
+ args:
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 16
15
+ pin_memory: true
16
+ drop_last: true
17
+ persistent_workers: true
18
+ backbone:
19
+ target: echosyn.reindentification.model.ResNet18
20
+ args:
21
+ weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
22
+ progress: false
23
+ model:
24
+ target: echosyn.reindentification.model.ContrastiveModel
25
+ args:
26
+ in_channels: 4
27
+ out_channels: 256
28
+ kl_loss_weight: 0.0
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 0.0001
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: torch.optim.lr_scheduler.ConstantLR
40
+ args:
41
+ factor: 1.0
42
+ vae:
43
+ target: diffusers.AutoencoderKL
44
+ pretrained: vae/avae-4f4
45
+ max_train_steps: 60000
46
+ gradient_accumulation_steps: 1
47
+ mixed_precision: bf16
48
+ max_grad_norm: 10.0
49
+ sample_latents: true
50
+ validation_steps: 10000
51
+ validation_samples: 99999
52
+ output_dir: experiments/${wandb_args.group}/${wandb_args.name}
53
+ logging_dir: logs
54
+ report_to: wandb
55
+ wandb_args:
56
+ project: EchoFlow
57
+ name: lvh_4f4
58
+ group: reindentification
59
+ checkpointing_steps: 10000
60
+ checkpoints_total_limit: 3
61
+ resume_from_checkpoint: null
62
+ seed: 42
63
+ no_wandb: false
64
+ num_train_epochs: 203
reid/ped_a4c-4f4/config.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ latent_channels: 4
3
+ dataset:
4
+ target: echosyn.common.datasets.ContrastivePair
5
+ args:
6
+ root: avae-4f4/ped_a4c
7
+ folder: Latents
8
+ extension: pt
9
+ dataloader:
10
+ target: torch.utils.data.DataLoader
11
+ args:
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 16
15
+ pin_memory: true
16
+ drop_last: true
17
+ persistent_workers: true
18
+ backbone:
19
+ target: echosyn.reindentification.model.ResNet18
20
+ args:
21
+ weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
22
+ progress: false
23
+ model:
24
+ target: echosyn.reindentification.model.ContrastiveModel
25
+ args:
26
+ in_channels: 4
27
+ out_channels: 256
28
+ kl_loss_weight: 0.0
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 0.0001
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: torch.optim.lr_scheduler.ConstantLR
40
+ args:
41
+ factor: 1.0
42
+ vae:
43
+ target: diffusers.AutoencoderKL
44
+ pretrained: vae/avae-4f4
45
+ max_train_steps: 60000
46
+ gradient_accumulation_steps: 1
47
+ mixed_precision: bf16
48
+ max_grad_norm: 10.0
49
+ sample_latents: true
50
+ validation_steps: 10000
51
+ validation_samples: 99999
52
+ output_dir: experiments/${wandb_args.group}/${wandb_args.name}
53
+ logging_dir: logs
54
+ report_to: wandb
55
+ wandb_args:
56
+ project: EchoFlow
57
+ name: ped_a4c_4f4
58
+ group: reindentification
59
+ checkpointing_steps: 10000
60
+ checkpoints_total_limit: 3
61
+ resume_from_checkpoint: null
62
+ seed: 42
63
+ no_wandb: false
64
+ num_train_epochs: 750
reid/ped_psax-4f4/config.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ globals:
2
+ latent_channels: 4
3
+ dataset:
4
+ target: echosyn.common.datasets.ContrastivePair
5
+ args:
6
+ root: avae-4f4/ped_psax
7
+ folder: Latents
8
+ extension: pt
9
+ dataloader:
10
+ target: torch.utils.data.DataLoader
11
+ args:
12
+ shuffle: true
13
+ batch_size: 32
14
+ num_workers: 16
15
+ pin_memory: true
16
+ drop_last: true
17
+ persistent_workers: true
18
+ backbone:
19
+ target: echosyn.reindentification.model.ResNet18
20
+ args:
21
+ weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
22
+ progress: false
23
+ model:
24
+ target: echosyn.reindentification.model.ContrastiveModel
25
+ args:
26
+ in_channels: 4
27
+ out_channels: 256
28
+ kl_loss_weight: 0.0
29
+ optimizer:
30
+ target: torch.optim.AdamW
31
+ args:
32
+ lr: 0.0001
33
+ betas:
34
+ - 0.9
35
+ - 0.999
36
+ weight_decay: 0.01
37
+ eps: 1.0e-08
38
+ scheduler:
39
+ target: torch.optim.lr_scheduler.ConstantLR
40
+ args:
41
+ factor: 1.0
42
+ vae:
43
+ target: diffusers.AutoencoderKL
44
+ pretrained: vae/avae-4f4
45
+ max_train_steps: 60000
46
+ gradient_accumulation_steps: 1
47
+ mixed_precision: bf16
48
+ max_grad_norm: 10.0
49
+ sample_latents: true
50
+ validation_steps: 10000
51
+ validation_samples: 99999
52
+ output_dir: experiments/${wandb_args.group}/${wandb_args.name}
53
+ logging_dir: logs
54
+ report_to: wandb
55
+ wandb_args:
56
+ project: EchoFlow
57
+ name: ped_psax_4f4
58
+ group: reindentification
59
+ checkpointing_steps: 10000
60
+ checkpoints_total_limit: 3
61
+ resume_from_checkpoint: null
62
+ seed: 42
63
+ no_wandb: false
64
+ num_train_epochs: 541
vae/avae-16f8/config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 2e-6 # ~5e-4 after scaling
3
+ target: ldm.models.autoencoder.AutoencoderKL
4
+ params:
5
+ monitor: "val/rec_loss"
6
+ embed_dim: 16
7
+ lossconfig:
8
+ target: ldm.modules.losses.LPIPSWithDiscriminator
9
+ params:
10
+ disc_start: 50001
11
+ kl_weight: 0.000001
12
+ disc_weight: 0.5
13
+
14
+ ddconfig:
15
+ double_z: True
16
+ z_channels: 16
17
+ resolution: 112
18
+ in_channels: 3
19
+ out_ch: 3
20
+ ch: 128
21
+ ch_mult: [ 1,2,2,4 ] # num_down = len(ch_mult)-1
22
+ num_res_blocks: 2
23
+ attn_resolutions: [ ]
24
+ dropout: 0.0
25
+
26
+ data:
27
+ target: main.DataModuleFromConfig
28
+ params:
29
+ batch_size: 32
30
+ num_workers: 16
31
+ train:
32
+ target: taming.data.custom.CustomTrain
33
+ params:
34
+ training_images_list_file: ${oc.env:TMPDIR}/train.txt
35
+ size: 112
36
+ validation:
37
+ target: taming.data.custom.CustomTest
38
+ params:
39
+ test_images_list_file: ${oc.env:TMPDIR}/val.txt
40
+ size: 112
41
+
42
+ lightning:
43
+ callbacks:
44
+ image_logger:
45
+ target: main.ImageLogger
46
+ params:
47
+ batch_frequency: 1000
48
+ max_images: 8
49
+ increase_log_steps: True
50
+
51
+ trainer:
52
+ benchmark: True
53
+ accumulate_grad_batches: 2
54
+ max_epochs: 1000
vae/avae-4f4/config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 2e-6 # ~5e-4 after scaling
3
+ target: ldm.models.autoencoder.AutoencoderKL
4
+ params:
5
+ monitor: "val/rec_loss"
6
+ embed_dim: 4
7
+ lossconfig:
8
+ target: ldm.modules.losses.LPIPSWithDiscriminator
9
+ params:
10
+ disc_start: 50001
11
+ kl_weight: 0.000001
12
+ disc_weight: 0.5
13
+
14
+ ddconfig:
15
+ double_z: True
16
+ z_channels: 4
17
+ resolution: 112
18
+ in_channels: 3
19
+ out_ch: 3
20
+ ch: 128
21
+ ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
22
+ num_res_blocks: 2
23
+ attn_resolutions: [ ]
24
+ dropout: 0.0
25
+
26
+ data:
27
+ target: main.DataModuleFromConfig
28
+ params:
29
+ batch_size: 32
30
+ num_workers: 16
31
+ train:
32
+ target: taming.data.custom.CustomTrain
33
+ params:
34
+ training_images_list_file: ${oc.env:TMPDIR}/train.txt
35
+ size: 112
36
+ validation:
37
+ target: taming.data.custom.CustomTest
38
+ params:
39
+ test_images_list_file: ${oc.env:TMPDIR}/val.txt
40
+ size: 112
41
+
42
+ lightning:
43
+ callbacks:
44
+ image_logger:
45
+ target: main.ImageLogger
46
+ params:
47
+ batch_frequency: 1000
48
+ max_images: 8
49
+ increase_log_steps: True
50
+
51
+ trainer:
52
+ benchmark: True
53
+ accumulate_grad_batches: 2
54
+ max_epochs: 1000
vae/avae-4f8/config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 2e-6 # ~5e-4 after scaling
3
+ target: ldm.models.autoencoder.AutoencoderKL
4
+ params:
5
+ monitor: "val/rec_loss"
6
+ embed_dim: 4
7
+ lossconfig:
8
+ target: ldm.modules.losses.LPIPSWithDiscriminator
9
+ params:
10
+ disc_start: 50001
11
+ kl_weight: 0.000001
12
+ disc_weight: 0.5
13
+
14
+ ddconfig:
15
+ double_z: True
16
+ z_channels: 4
17
+ resolution: 112
18
+ in_channels: 3
19
+ out_ch: 3
20
+ ch: 128
21
+ ch_mult: [ 1,2,2,4 ] # num_down = len(ch_mult)-1
22
+ num_res_blocks: 2
23
+ attn_resolutions: [ ]
24
+ dropout: 0.0
25
+
26
+ data:
27
+ target: main.DataModuleFromConfig
28
+ params:
29
+ batch_size: 32
30
+ num_workers: 16
31
+ train:
32
+ target: taming.data.custom.CustomTrain
33
+ params:
34
+ training_images_list_file: ${oc.env:TMPDIR}/train.txt
35
+ size: 112
36
+ validation:
37
+ target: taming.data.custom.CustomTest
38
+ params:
39
+ test_images_list_file: ${oc.env:TMPDIR}/val.txt
40
+ size: 112
41
+
42
+ lightning:
43
+ callbacks:
44
+ image_logger:
45
+ target: main.ImageLogger
46
+ params:
47
+ batch_frequency: 1000
48
+ max_images: 8
49
+ increase_log_steps: True
50
+
51
+ trainer:
52
+ benchmark: True
53
+ accumulate_grad_batches: 2
54
+ max_epochs: 1000