HReynaud committed on Mar 24, 2025

Commit

514f603

1 Parent(s): 8f6cdca

training configs

Browse files

Files changed (27) hide show

lifm/FMiT-B2-16f8/config.yaml +132 -0
lifm/FMiT-B2-4f4/config.yaml +132 -0
lifm/FMiT-B4-4f4/config.yaml +132 -0
lifm/FMiT-L2-16f8/config.yaml +132 -0
lifm/FMiT-L2-4f4/config.yaml +132 -0
lifm/FMiT-L4-4f4/config.yaml +132 -0
lifm/FMiT-S2-16f8/config.yaml +132 -0
lifm/FMiT-S2-4f4/config.yaml +132 -0
lifm/FMiT-S4-4f4/config.yaml +132 -0
lifm/UNet-B-16f8/config.yaml +152 -0
lifm/UNet-B-4f4/config.yaml +152 -0
lifm/UNet-L-16f8/config.yaml +152 -0
lifm/UNet-L-4f4/config.yaml +152 -0
lifm/UNet-S-16f8/config.yaml +152 -0
lifm/UNet-S-4f4/config.yaml +152 -0
lvfm/FMvT-S2-16f8/config.yaml +145 -0
lvfm/FMvT-S2-4f4/config.yaml +145 -0
lvfm/FMvT-S4-4f4/config.yaml +145 -0
lvfm/STUNet-S-16f8/config.yaml +152 -0
lvfm/STUNet-S-4f4/config.yaml +152 -0
reid/dynamic-4f4/config.yaml +64 -0
reid/lvh-4f4/config.yaml +64 -0
reid/ped_a4c-4f4/config.yaml +64 -0
reid/ped_psax-4f4/config.yaml +64 -0
vae/avae-16f8/config.yaml +54 -0
vae/avae-4f4/config.yaml +54 -0
vae/avae-4f8/config.yaml +54 -0

lifm/FMiT-B2-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+denoiser:
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 12
+    attention_head_dim: 64
+    in_channels: 17
+    out_channels: 16
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-B2-16f8
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-B2-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # network spec: `target` class path plus its `args` (assumed constructor kwargs — confirm)
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 12
+    attention_head_dim: 64
+    in_channels: 5  # out_channels + 1; extra channel presumably the segmentation input — confirm
+    out_channels: 4  # equals globals.latent_channels
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}  # resolves to 28, same source as the datasets' target_resolution
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-B2-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-B4-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # network spec: `target` class path plus its `args` (assumed constructor kwargs — confirm)
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 12
+    attention_head_dim: 64
+    in_channels: 5  # out_channels + 1; extra channel presumably the segmentation input — confirm
+    out_channels: 4  # equals globals.latent_channels
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}  # resolves to 28, same source as the datasets' target_resolution
+    patch_size: 4
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-B4-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-L2-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+denoiser:
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 16
+    attention_head_dim: 64
+    in_channels: 17
+    out_channels: 16
+    num_layers: 24
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-L2-16f8
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-L2-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # network spec: `target` class path plus its `args` (assumed constructor kwargs — confirm)
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 16
+    attention_head_dim: 64
+    in_channels: 5  # out_channels + 1; extra channel presumably the segmentation input — confirm
+    out_channels: 4  # equals globals.latent_channels
+    num_layers: 24
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: 14  # NOTE(review): globals.latent_res is 28 in this config — confirm 14 is intended
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-L2-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-L4-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # network spec: `target` class path plus its `args` (assumed constructor kwargs — confirm)
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 16
+    attention_head_dim: 64
+    in_channels: 5  # out_channels + 1; extra channel presumably the segmentation input — confirm
+    out_channels: 4  # equals globals.latent_channels
+    num_layers: 24
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: 14  # NOTE(review): globals.latent_res is 28 in this config — confirm 14 is intended
+    patch_size: 4
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-L4-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-S2-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+denoiser:
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 6
+    attention_head_dim: 64
+    in_channels: 17
+    out_channels: 16
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-S2-16f8
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-S2-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 6
+    attention_head_dim: 64
+    in_channels: 5
+    out_channels: 4
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}
+    patch_size: 2
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-S2-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/FMiT-S4-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,132 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:
+  target: echosyn.common.models.SegDiTTransformer2DModel
+  args:
+    num_attention_heads: 6
+    attention_head_dim: 64
+    in_channels: 5
+    out_channels: 4
+    num_layers: 12
+    dropout: 0.0
+    norm_num_groups: 32
+    attention_bias: true
+    sample_size: ${globals.latent_res}
+    patch_size: 4
+    activation_fn: gelu-approximate
+    num_embeds_ada_norm: 1000
+    upcast_attention: false
+    norm_type: ada_norm_zero
+    norm_elementwise_affine: false
+    norm_eps: 1.0e-05
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: fp16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 0.1
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMiT-S4-4f4
+  group: FMiT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-B-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+denoiser:  # network spec: `target` class path plus its `args` (assumed constructor kwargs — confirm)
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    sample_size: 28  # NOTE(review): globals.latent_res is 14 in this config — confirm 28 is intended
+    in_channels: 17  # out_channels + 1; extra channel presumably the segmentation input — confirm
+    out_channels: 16  # equals globals.latent_channels
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:  # four entries, one per block_out_channels value
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:  # four entries, listed in the reverse order of down_block_types
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    block_out_channels:
+    - 160
+    - 320
+    - 480
+    - 640
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-B-16f8
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-B-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+# Image denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    # Derived from globals instead of a literal so it cannot drift from the
+    # latent grid; resolves to 28 in this file, the same value as before.
+    sample_size: ${globals.latent_res}
+    # out_channels + 1; the extra input channel is presumably the
+    # segmentation mask - confirm against SegUnet2DModel.
+    in_channels: 5
+    out_channels: 4
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    # One entry per down/up block; every width is divisible by norm_num_groups.
+    block_out_channels:
+    - 160
+    - 320
+    - 480
+    - 640
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-B-4f4
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-L-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+# Image denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    # Follow the latent grid declared in globals (latent_res is 14 in this
+    # file). The value was hard-coded to 28, which is the latent_res of the
+    # 4f4 configs and does not match this 16f8 setup.
+    sample_size: ${globals.latent_res}
+    # out_channels + 1; the extra input channel is presumably the
+    # segmentation mask - confirm against SegUnet2DModel.
+    in_channels: 17
+    out_channels: 16
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    # One entry per down/up block; every width is divisible by norm_num_groups.
+    block_out_channels:
+    - 320
+    - 640
+    - 960
+    - 1280
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-L-16f8
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-L-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+# Image denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    # Derived from globals instead of a literal so it cannot drift from the
+    # latent grid; resolves to 28 in this file, the same value as before.
+    sample_size: ${globals.latent_res}
+    # out_channels + 1; the extra input channel is presumably the
+    # segmentation mask - confirm against SegUnet2DModel.
+    in_channels: 5
+    out_channels: 4
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    # One entry per down/up block; every width is divisible by norm_num_groups.
+    block_out_channels:
+    - 320
+    - 640
+    - 960
+    - 1280
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-L-4f4
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-S-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+# Image denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    # Follow the latent grid declared in globals (latent_res is 14 in this
+    # file). The value was hard-coded to 28, which is the latent_res of the
+    # 4f4 configs and does not match this 16f8 setup.
+    sample_size: ${globals.latent_res}
+    # out_channels + 1; the extra input channel is presumably the
+    # segmentation mask - confirm against SegUnet2DModel.
+    in_channels: 17
+    out_channels: 16
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    # One entry per down/up block; every width is divisible by norm_num_groups.
+    block_out_channels:
+    - 96
+    - 192
+    - 288
+    - 384
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-16f8/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-S-16f8
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lifm/UNet-S-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: original
+  target_nframes: 64
+  outputs:
+  - image
+  - view
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+# Image denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.SegUnet2DModel
+  args:
+    # Derived from globals instead of a literal so it cannot drift from the
+    # latent grid; resolves to 28 in this file, the same value as before.
+    sample_size: ${globals.latent_res}
+    # out_channels + 1; the extra input channel is presumably the
+    # segmentation mask - confirm against SegUnet2DModel.
+    in_channels: 5
+    out_channels: 4
+    center_input_sample: false
+    time_embedding_type: positional
+    freq_shift: 0
+    flip_sin_to_cos: true
+    down_block_types:
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - AttnDownBlock2D
+    - DownBlock2D
+    up_block_types:
+    - UpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    - AttnUpBlock2D
+    # One entry per down/up block; every width is divisible by norm_num_groups.
+    block_out_channels:
+    - 96
+    - 192
+    - 288
+    - 384
+    layers_per_block: 2
+    mid_block_scale_factor: 1
+    downsample_padding: 1
+    downsample_type: resnet
+    upsample_type: resnet
+    dropout: 0.0
+    act_fn: silu
+    attention_head_dim: 8
+    norm_num_groups: 32
+    attn_norm_num_groups: null
+    norm_eps: 1.0e-05
+    resnet_time_scale_shift: default
+    class_embed_type: timestep
+    num_class_embeds: null
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 5.0e-05
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 5000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/dynamic
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: A4C
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_a4c
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PSAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: segmentations/ped_psax
+    target_resolution: ${globals.latent_res}
+- name: LatentSeg
+  active: true
+  params:
+    root: avae-4f4/lvh
+    outputs: ${globals.outputs}
+    target_fps: ${globals.target_fps}
+    view_label: PLAX
+    target_nframes: ${globals.target_nframes}
+    latent_channels: ${globals.latent_channels}
+    segmentation_root: no_seg
+    target_resolution: ${globals.latent_res}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 128
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+noise_offset: 0.1
+max_grad_norm: 1.0
+max_grad_value: -1
+pad_latents: false
+sample_latents: true
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: UNet-S-4f4
+  group: UNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  method: euler
+  timesteps: 25
+seed: 42
+num_train_epochs: 45455

lvfm/FMvT-S2-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,145 @@

+globals:
+  target_fps: 32
+  target_nframes: 64
+  outputs:
+  - video
+  - lvef
+  - image
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+denoiser:  # video denoiser: model class path (target) and its constructor kwargs (args)
+  target: echosyn.common.models.DiffuserSTDiT
+  args:
+    input_size:  # [frames, latent_res, latent_res], all taken from globals
+    - ${globals.target_nframes}
+    - ${globals.latent_res}
+    - ${globals.latent_res}
+    in_channels: 32  # 2 x globals.latent_channels (16); presumably noisy latent + conditioning-image latent - TODO confirm
+    out_channels: ${globals.latent_channels}
+    patch_size:  # one entry per input_size axis; presumably (time, height, width) - TODO confirm
+    - 1
+    - 2
+    - 2
+    hidden_size: 384  # hidden_size / num_heads = 64 per head
+    depth: 12
+    num_heads: 6
+    mlp_ratio: 4.0
+    class_dropout_prob: 0.0
+    drop_path: 0.0
+    no_temporal_pos_emb: false
+    caption_channels: 1  # NOTE(review): size-1 conditioning, presumably the scalar LVEF - confirm
+    model_max_length: 1  # NOTE(review): presumably a single conditioning token - confirm
+    space_scale: 1.0
+    time_scale: 1.0
+    enable_flashattn: false
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 2000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/lvh
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 64
+    num_workers: 64
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+max_grad_norm: 1.0
+max_grad_value: -1
+sample_latents: true
+noise_offset: 0.05
+noise_cond_image: 0.05
+no_conditionning: false
+p_drop_conditionning: 0.1
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMvT-S2-16f8
+  group: FMvT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 300000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  timesteps: 25
+  frames: ${globals.target_nframes}
+  fps: ${globals.target_fps}
+  lvefs:
+  - -1.0
+  - 0.3
+  - 0.6
+  - 0.9
+  cond_image_mask:
+  - 0
+  - 1
+  - 1
+  - 1
+seed: 42
+num_train_epochs: 28572

lvfm/FMvT-S2-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,145 @@

+globals:
+  target_fps: 32
+  target_nframes: 64
+  outputs:
+  - video
+  - lvef
+  - image
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # video denoiser: model class path (target) and its constructor kwargs (args)
+  target: echosyn.common.models.DiffuserSTDiT
+  args:
+    input_size:  # [frames, latent_res, latent_res], all taken from globals
+    - ${globals.target_nframes}
+    - ${globals.latent_res}
+    - ${globals.latent_res}
+    in_channels: 8  # 2 x globals.latent_channels (4); presumably noisy latent + conditioning-image latent - TODO confirm
+    out_channels: ${globals.latent_channels}
+    patch_size:  # one entry per input_size axis; presumably (time, height, width) - TODO confirm
+    - 1
+    - 2
+    - 2
+    hidden_size: 384  # hidden_size / num_heads = 64 per head
+    depth: 12
+    num_heads: 6
+    mlp_ratio: 4.0
+    class_dropout_prob: 0.0
+    drop_path: 0.0
+    no_temporal_pos_emb: false
+    caption_channels: 1  # NOTE(review): size-1 conditioning, presumably the scalar LVEF - confirm
+    model_max_length: 1  # NOTE(review): presumably a single conditioning token - confirm
+    space_scale: 1.0
+    time_scale: 1.0
+    enable_flashattn: false
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 2000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/lvh
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 16
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+max_grad_norm: 1.0
+max_grad_value: -1
+sample_latents: true
+noise_offset: 0.05
+noise_cond_image: 0.05
+no_conditionning: false
+p_drop_conditionning: 0.3
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMvT-S2-4f4
+  group: FMvT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 300000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  timesteps: 25
+  frames: ${globals.target_nframes}
+  fps: ${globals.target_fps}
+  lvefs:
+  - -1.0
+  - 0.3
+  - 0.6
+  - 0.9
+  cond_image_mask:
+  - 0
+  - 1
+  - 1
+  - 1
+seed: 42
+num_train_epochs: 28572

lvfm/FMvT-S4-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,145 @@

+globals:
+  target_fps: 32
+  target_nframes: 64
+  outputs:
+  - video
+  - lvef
+  - image
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:  # video denoiser: model class path (target) and its constructor kwargs (args)
+  target: echosyn.common.models.DiffuserSTDiT
+  args:
+    input_size:  # [frames, latent_res, latent_res], all taken from globals
+    - ${globals.target_nframes}
+    - ${globals.latent_res}
+    - ${globals.latent_res}
+    in_channels: 8  # 2 x globals.latent_channels (4); presumably noisy latent + conditioning-image latent - TODO confirm
+    out_channels: ${globals.latent_channels}
+    patch_size:  # one entry per input_size axis; presumably (time, height, width) - TODO confirm
+    - 1
+    - 4
+    - 4
+    hidden_size: 384  # hidden_size / num_heads = 64 per head
+    depth: 12
+    num_heads: 6
+    mlp_ratio: 4.0
+    class_dropout_prob: 0.0
+    drop_path: 0.0
+    no_temporal_pos_emb: false
+    caption_channels: 1  # NOTE(review): size-1 conditioning, presumably the scalar LVEF - confirm
+    model_max_length: 1  # NOTE(review): presumably a single conditioning token - confirm
+    space_scale: 1.0
+    time_scale: 1.0
+    enable_flashattn: false
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 2000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/lvh
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 64
+    num_workers: 64
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+max_grad_norm: 1.0
+max_grad_value: -1
+sample_latents: true
+noise_offset: 0.05
+noise_cond_image: 0.05
+no_conditionning: false
+p_drop_conditionning: 0.3
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: FMvT-S4-4f4
+  group: FMvT
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 300000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  timesteps: 25
+  frames: ${globals.target_nframes}
+  fps: ${globals.target_fps}
+  lvefs:
+  - -1.0
+  - 0.3
+  - 0.6
+  - 0.9
+  cond_image_mask:
+  - 0
+  - 1
+  - 1
+  - 1
+seed: 42
+num_train_epochs: 28572

lvfm/STUNet-S-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: 32
+  target_nframes: 64
+  outputs:
+  - video
+  - lvef
+  - image
+  resolution: 112
+  latent_res: 14
+  latent_channels: 16
+# Video denoiser: `target` is the model class path and `args` its constructor
+# keyword arguments.
+denoiser:
+  target: echosyn.common.models.UNetSTIC
+  args:
+    # 2 x globals.latent_channels (16); presumably the noisy latent plus the
+    # conditioning-image latent - confirm against UNetSTIC.
+    in_channels: 32
+    out_channels: ${globals.latent_channels}
+    sample_size: ${globals.latent_res}
+    addition_time_embed_dim: 1
+    block_out_channels:
+    - 64
+    - 128
+    - 192
+    - 256
+    cross_attention_dim: 1
+    down_block_types:
+    - CrossAttnDownBlockSpatioTemporal
+    - CrossAttnDownBlockSpatioTemporal
+    - CrossAttnDownBlockSpatioTemporal
+    - DownBlockSpatioTemporal
+    layers_per_block: 2
+    # One entry per block in block_out_channels.
+    num_attention_heads:
+    - 8
+    - 16
+    - 16
+    - 32
+    # Derived from globals instead of a literal so the model and the datasets
+    # cannot disagree on clip length; resolves to 64, the same value as before.
+    num_frames: ${globals.target_nframes}
+    projection_class_embeddings_input_dim: 1
+    transformer_layers_per_block: 1
+    up_block_types:
+    - UpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 2000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-16f8
+datasets:
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/dynamic
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/ped_a4c
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/ped_psax
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-16f8/lvh
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 32
+    num_workers: 32
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+max_grad_norm: 1.0
+max_grad_value: -1
+sample_latents: true
+noise_offset: 0.05
+noise_cond_image: 0.05
+no_conditionning: false
+p_drop_conditionning: 0.3
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: STUNet-S-16f8
+  group: STUNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 300000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  timesteps: 25
+  frames: ${globals.target_nframes}
+  fps: ${globals.target_fps}
+  lvefs:
+  - -1.0
+  - 0.3
+  - 0.6
+  - 0.9
+  cond_image_mask:
+  - 0
+  - 1
+  - 1
+  - 1
+seed: 42
+num_train_epochs: 28572

lvfm/STUNet-S-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,152 @@

+globals:
+  target_fps: 32
+  target_nframes: 64
+  outputs:
+  - video
+  - lvef
+  - image
+  resolution: 112
+  latent_res: 28
+  latent_channels: 4
+denoiser:
+  target: echosyn.common.models.UNetSTIC
+  args:
+    in_channels: 8
+    out_channels: ${globals.latent_channels}
+    sample_size: ${globals.latent_res}
+    addition_time_embed_dim: 1
+    block_out_channels:
+    - 64
+    - 128
+    - 192
+    - 256
+    cross_attention_dim: 1
+    down_block_types:
+    - CrossAttnDownBlockSpatioTemporal
+    - CrossAttnDownBlockSpatioTemporal
+    - CrossAttnDownBlockSpatioTemporal
+    - DownBlockSpatioTemporal
+    layers_per_block: 2
+    num_attention_heads:
+    - 8
+    - 16
+    - 16
+    - 32
+    num_frames: 64
+    projection_class_embeddings_input_dim: 1
+    transformer_layers_per_block: 1
+    up_block_types:
+    - UpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+    - CrossAttnUpBlockSpatioTemporal
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: echosyn.common.schedulers.StepBasedLearningRateScheduleWithWarmup
+  args:
+    warmup_steps: 2000
+    ref_steps: ${max_train_steps}
+    eta_min: 1.0e-06
+    decay_rate: 2.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+datasets:
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/dynamic
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_a4c
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/ped_psax
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+- name: Latent
+  active: true
+  params:
+    root: avae-4f4/lvh
+    target_fps: ${globals.target_fps}
+    target_nframes: ${globals.target_nframes}
+    target_resolution: ${globals.latent_res}
+    outputs: ${globals.outputs}
+    latent_channels: ${globals.latent_channels}
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 8
+    num_workers: 8
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+max_train_steps: 1000000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+use_ema: true
+max_grad_norm: 1.0
+max_grad_value: -1
+sample_latents: true
+noise_offset: 0.05
+noise_cond_image: 0.05
+no_conditionning: false
+p_drop_conditionning: 0.3
+output_dir: experiments/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: STUNet-S-4f4
+  group: STUNet
+checkpointing_steps: 10000
+checkpoints_to_keep:
+- 50000
+- 100000
+- 200000
+- 300000
+- 500000
+- 1000000
+resume_from_checkpoint: latest
+validation:
+  samples: 4
+  steps: 5000
+  timesteps: 25
+  frames: ${globals.target_nframes}
+  fps: ${globals.target_fps}
+  lvefs:
+  - -1.0
+  - 0.3
+  - 0.6
+  - 0.9
+  cond_image_mask:
+  - 0
+  - 1
+  - 1
+  - 1
+seed: 42
+num_train_epochs: 28572

reid/dynamic-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,64 @@

+globals:
+  latent_channels: 4
+dataset:
+  target: echosyn.common.datasets.ContrastivePair
+  args:
+    root: avae-4f4/dynamic
+    folder: Latents
+    extension: pt
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 32
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+backbone:
+  target: echosyn.reindentification.model.ResNet18
+  args:
+    weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
+    progress: false
+model:
+  target: echosyn.reindentification.model.ContrastiveModel
+  args:
+    in_channels: 4
+    out_channels: 256
+    kl_loss_weight: 0.0
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: torch.optim.lr_scheduler.ConstantLR
+  args:
+    factor: 1.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+max_train_steps: 60000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+max_grad_norm: 10.0
+sample_latents: true
+validation_steps: 10000
+validation_samples: 99999
+output_dir: experiments/${wandb_args.group}/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: dynamic_4f4
+  group: reindentification
+checkpointing_steps: 10000
+checkpoints_total_limit: 3
+resume_from_checkpoint: null
+seed: 42
+no_wandb: false
+num_train_epochs: 258

reid/lvh-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,64 @@

+globals:
+  latent_channels: 4
+dataset:
+  target: echosyn.common.datasets.ContrastivePair
+  args:
+    root: avae-4f4/lvh
+    folder: Latents
+    extension: pt
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 32
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+backbone:
+  target: echosyn.reindentification.model.ResNet18
+  args:
+    weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
+    progress: false
+model:
+  target: echosyn.reindentification.model.ContrastiveModel
+  args:
+    in_channels: 4
+    out_channels: 256
+    kl_loss_weight: 0.0
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: torch.optim.lr_scheduler.ConstantLR
+  args:
+    factor: 1.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+max_train_steps: 60000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+max_grad_norm: 10.0
+sample_latents: true
+validation_steps: 10000
+validation_samples: 99999
+output_dir: experiments/${wandb_args.group}/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: lvh_4f4
+  group: reindentification
+checkpointing_steps: 10000
+checkpoints_total_limit: 3
+resume_from_checkpoint: null
+seed: 42
+no_wandb: false
+num_train_epochs: 203

reid/ped_a4c-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,64 @@

+globals:
+  latent_channels: 4
+dataset:
+  target: echosyn.common.datasets.ContrastivePair
+  args:
+    root: avae-4f4/ped_a4c
+    folder: Latents
+    extension: pt
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 32
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+backbone:
+  target: echosyn.reindentification.model.ResNet18
+  args:
+    weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
+    progress: false
+model:
+  target: echosyn.reindentification.model.ContrastiveModel
+  args:
+    in_channels: 4
+    out_channels: 256
+    kl_loss_weight: 0.0
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: torch.optim.lr_scheduler.ConstantLR
+  args:
+    factor: 1.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+max_train_steps: 60000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+max_grad_norm: 10.0
+sample_latents: true
+validation_steps: 10000
+validation_samples: 99999
+output_dir: experiments/${wandb_args.group}/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: ped_a4c_4f4
+  group: reindentification
+checkpointing_steps: 10000
+checkpoints_total_limit: 3
+resume_from_checkpoint: null
+seed: 42
+no_wandb: false
+num_train_epochs: 750

reid/ped_psax-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,64 @@

+globals:
+  latent_channels: 4
+dataset:
+  target: echosyn.common.datasets.ContrastivePair
+  args:
+    root: avae-4f4/ped_psax
+    folder: Latents
+    extension: pt
+dataloader:
+  target: torch.utils.data.DataLoader
+  args:
+    shuffle: true
+    batch_size: 32
+    num_workers: 16
+    pin_memory: true
+    drop_last: true
+    persistent_workers: true
+backbone:
+  target: echosyn.reindentification.model.ResNet18
+  args:
+    weights: torchvision.models.ResNet18_Weights.IMAGENET1K_V1
+    progress: false
+model:
+  target: echosyn.reindentification.model.ContrastiveModel
+  args:
+    in_channels: 4
+    out_channels: 256
+    kl_loss_weight: 0.0
+optimizer:
+  target: torch.optim.AdamW
+  args:
+    lr: 0.0001
+    betas:
+    - 0.9
+    - 0.999
+    weight_decay: 0.01
+    eps: 1.0e-08
+scheduler:
+  target: torch.optim.lr_scheduler.ConstantLR
+  args:
+    factor: 1.0
+vae:
+  target: diffusers.AutoencoderKL
+  pretrained: vae/avae-4f4
+max_train_steps: 60000
+gradient_accumulation_steps: 1
+mixed_precision: bf16
+max_grad_norm: 10.0
+sample_latents: true
+validation_steps: 10000
+validation_samples: 99999
+output_dir: experiments/${wandb_args.group}/${wandb_args.name}
+logging_dir: logs
+report_to: wandb
+wandb_args:
+  project: EchoFlow
+  name: ped_psax_4f4
+  group: reindentification
+checkpointing_steps: 10000
+checkpoints_total_limit: 3
+resume_from_checkpoint: null
+seed: 42
+no_wandb: false
+num_train_epochs: 541

vae/avae-16f8/config.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+model:
+  base_learning_rate: 2e-6 # ~5e-4 after scaling
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: "val/rec_loss"
+    embed_dim: 16
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+    ddconfig:
+      double_z: True
+      z_channels: 16
+      resolution: 112
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [ 1,2,2,4 ]  # num_down = len(ch_mult)-1
+      num_res_blocks: 2
+      attn_resolutions: [ ]
+      dropout: 0.0
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 32
+    num_workers: 16
+    train:
+      target: taming.data.custom.CustomTrain
+      params:
+        training_images_list_file: ${oc.env:TMPDIR}/train.txt
+        size: 112
+    validation:
+      target: taming.data.custom.CustomTest
+      params:
+        test_images_list_file: ${oc.env:TMPDIR}/val.txt
+        size: 112
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+  trainer:
+    benchmark: True
+    accumulate_grad_batches: 2
+    max_epochs: 1000

vae/avae-4f4/config.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+model:
+  base_learning_rate: 2e-6 # ~5e-4 after scaling
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: "val/rec_loss"
+    embed_dim: 4
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+    ddconfig:
+      double_z: True
+      z_channels: 4
+      resolution: 112
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [ 1,2,4 ]  # num_down = len(ch_mult)-1
+      num_res_blocks: 2
+      attn_resolutions: [ ]
+      dropout: 0.0
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 32
+    num_workers: 16
+    train:
+      target: taming.data.custom.CustomTrain
+      params:
+        training_images_list_file: ${oc.env:TMPDIR}/train.txt
+        size: 112
+    validation:
+      target: taming.data.custom.CustomTest
+      params:
+        test_images_list_file: ${oc.env:TMPDIR}/val.txt
+        size: 112
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+  trainer:
+    benchmark: True
+    accumulate_grad_batches: 2
+    max_epochs: 1000

vae/avae-4f8/config.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+model:
+  base_learning_rate: 2e-6 # ~5e-4 after scaling
+  target: ldm.models.autoencoder.AutoencoderKL
+  params:
+    monitor: "val/rec_loss"
+    embed_dim: 4
+    lossconfig:
+      target: ldm.modules.losses.LPIPSWithDiscriminator
+      params:
+        disc_start: 50001
+        kl_weight: 0.000001
+        disc_weight: 0.5
+    ddconfig:
+      double_z: True
+      z_channels: 4
+      resolution: 112
+      in_channels: 3
+      out_ch: 3
+      ch: 128
+      ch_mult: [ 1,2,2,4 ]  # num_down = len(ch_mult)-1
+      num_res_blocks: 2
+      attn_resolutions: [ ]
+      dropout: 0.0
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 32
+    num_workers: 16
+    train:
+      target: taming.data.custom.CustomTrain
+      params:
+        training_images_list_file: ${oc.env:TMPDIR}/train.txt
+        size: 112
+    validation:
+      target: taming.data.custom.CustomTest
+      params:
+        test_images_list_file: ${oc.env:TMPDIR}/val.txt
+        size: 112
+lightning:
+  callbacks:
+    image_logger:
+      target: main.ImageLogger
+      params:
+        batch_frequency: 1000
+        max_images: 8
+        increase_log_steps: True
+  trainer:
+    benchmark: True
+    accumulate_grad_batches: 2
+    max_epochs: 1000