diff --git "a/logs/exp_dc_f32c32_FM.log" "b/logs/exp_dc_f32c32_FM.log" new file mode 100644--- /dev/null +++ "b/logs/exp_dc_f32c32_FM.log" @@ -0,0 +1,1631 @@ +nohup: ignoring input +The following values were not passed to `accelerate launch` and had defaults used instead: + `--num_processes` was set to a value of `8` + More than one GPU was found, enabling multi-GPU training. + If this was unintended please pass in `--num_processes=1`. + `--num_machines` was set to a value of `1` + `--mixed_precision` was set to a value of `'no'` + `--dynamo_backend` was set to a value of `'no'` +To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.) + _C._set_float32_matmul_precision(precision) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=AlexNet_Weights.IMAGENET1K_V1`. You can also use `weights=AlexNet_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Using cached /workspace/DC_SSDAE/runs/cache +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Using cached /workspace/DC_SSDAE/runs/cache +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/alex.pth +Using cached /workspace/DC_SSDAE/runs/cache +Using cache found in /workspace/DC_SSDAE/runs/cache/facebookresearch_dino_main +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing! + warnings.warn( +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +File already exists: /workspace/DC_SSDAE/runs/cache/weights-inception-2015-12-05-6726825d.pth +[2025-10-25 04:11:21,158][main][INFO] - Will write tensorboard logs inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/tensorboard_logs +[2025-10-25 04:11:21,179][main][INFO] - Runtime at /workspace/DC_SSDAE +[2025-10-25 04:11:21,180][main][INFO] - Running inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM +[2025-10-25 04:11:21,181][main][INFO] - Running args: ['main.py', 'run_name=train_enc_dc_f32c32_FM', 'dataset.im_size=128', 'dataset.aug_scale=2', 'training.epochs=20', 'dc_ssdae.encoder_train=true'] +[2025-10-25 04:11:21,182][main][INFO] - Command: 'main.py' 'run_name=train_enc_dc_f32c32_FM' 'dataset.im_size=128' 'dataset.aug_scale=2' 'training.epochs=20' 'dc_ssdae.encoder_train=true' +[2025-10-25 04:11:21,182][main][INFO] - Accelerator with 8 processes, running on cuda:0 +[2025-10-25 04:11:21,186][main][INFO] - Hydra configuration: +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 + + + +[2025-10-25 04:11:35,084][main][INFO] - Loaded ImageNet dataset: {'train': Dataset ImageNet + Number of datapoints: 1279867 + Root location: ../../../imagenet_data + Split: train + StandardTransform +Transform: Compose( + RandomResize(min_size=128, max_size=256, interpolation=InterpolationMode.LANCZOS, antialias=True) + RandomCrop(size=(128, 128), pad_if_needed=False, fill=0, padding_mode=constant) + RandomHorizontalFlip(p=0.5) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + ), 'test': Dataset ImageNet + Number of datapoints: 49950 + Root location: ../../../imagenet_data + Split: validation + StandardTransform +Transform: Compose( + Resize(size=[128], interpolation=InterpolationMode.BILINEAR, antialias=True) + CenterCrop(size=(128, 128)) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + )} +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_mean. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.running_var. Will default to 'ingore'. +[WARNING] Model buffers behavior should be defined using the '_ema' parameter. No _ema key for the buffer decoder.batch_norm_z.num_batches_tracked. Will default to 'ingore'. +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off] +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead. + warnings.warn( +/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights. + warnings.warn(msg) +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +Loading model from: /workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/lpips/weights/v0.1/vgg.pth +[2025-10-25 04:11:47,948][main][INFO] - ae parameters count: +[2025-10-25 04:11:47,953][main][INFO] - Total: #230.9M (trainable: #230.9M) +[2025-10-25 04:11:47,954][main][INFO] - - encoder: #217.4M (trainable: #217.4M) +[2025-10-25 04:11:47,955][main][INFO] - - project_in: #1.8K (trainable: #1.8K) +[2025-10-25 04:11:47,956][main][INFO] - - stages: #216.9M (trainable: #216.9M) +[2025-10-25 04:11:47,956][main][INFO] - - project_out: #576.1K (trainable: #576.1K) +[2025-10-25 04:11:47,958][main][INFO] - - decoder: #13.5M (trainable: #13.5M) +[2025-10-25 04:11:47,958][main][INFO] - - conv_in_img: #896 (trainable: #896) +[2025-10-25 04:11:47,959][main][INFO] - - conv_in_z: #9.0K (trainable: #9.0K) +[2025-10-25 04:11:47,959][main][INFO] - - conv_in: #36.1K (trainable: #36.1K) +[2025-10-25 04:11:47,959][main][INFO] - - batch_norm_z: #64 (trainable: #64) +[2025-10-25 04:11:47,960][main][INFO] - - time_proj: #0 (trainable: #0) +[2025-10-25 04:11:47,960][main][INFO] - - time_embedding: #80.5K (trainable: #80.5K) +[2025-10-25 04:11:47,960][main][INFO] - - ada_ctx_proj: #54.1K (trainable: #54.1K) +[2025-10-25 04:11:47,961][main][INFO] - - down_blocks: #3.0M (trainable: #3.0M) +[2025-10-25 04:11:47,962][main][INFO] - - mid_block: #3.4M (trainable: #3.4M) +[2025-10-25 04:11:47,963][main][INFO] - - up_blocks: #6.9M (trainable: #6.9M) +[2025-10-25 04:11:47,963][main][INFO] - - conv_norm_out: #128 (trainable: #128) +[2025-10-25 04:11:47,964][main][INFO] - - conv_out_act: #0 (trainable: #0) +[2025-10-25 04:11:47,964][main][INFO] - - conv_out: #1.7K (trainable: #1.7K) +[2025-10-25 04:11:47,969][main][INFO] - ae: EMAWrapper( + (model): DistributedDataParallel( + (module): DC_SSDAE( + (encoder): DCEncoder( + (project_in): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (stages): ModuleList( + (0): OpSequential( + (op_list): ModuleList() + ) + (1): OpSequential( + (op_list): ModuleList( + (0-4): 5 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (5): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (2): OpSequential( + (op_list): ModuleList( + (0-9): 10 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (10): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (3): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (4): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (5): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + ) + ) + ) + (project_out): OpSequential( + (op_list): ModuleList( + (0): ConvLayer( + (conv): Conv2d(1024, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + ) + (decoder): UViTDecoder( + (conv_in_img): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in_z): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (batch_norm_z): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (time_proj): Timesteps() + (time_embedding): TimestepEmbedding( + (linear_1): Linear(in_features=64, out_features=256, bias=True) + (act): SiLU() + (linear_2): Linear(in_features=256, out_features=256, bias=True) + ) + (ada_ctx_proj): Sequential( + (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): SiLU() + (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (down_blocks): ModuleList( + (0): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (1): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(64, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (2): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(96, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (3): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + ) + ) + (mid_block): UViTMiddleTransformer( + (proj_in): Linear(in_features=160, out_features=160, bias=True) + (transformer_blocks): ModuleList( + (0-7): 8 x TransformerBlock( + (norm1): AdaLayerNorm( + (silu): SiLU() + (linear): Linear(in_features=64, out_features=320, bias=True) + (norm): LayerNorm((160,), eps=1e-05, elementwise_affine=False) + ) + (attn1): Attention( + (to_q): Linear(in_features=160, out_features=160, bias=False) + (to_k): Linear(in_features=160, out_features=160, bias=False) + (to_v): Linear(in_features=160, out_features=160, bias=False) + (out_proj): Linear(in_features=160, out_features=160, bias=True) + (out_drop): Dropout(p=0.0, inplace=False) + ) + (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True) + (ff): FeedForward( + (proj_in_act): GEGLU( + (proj): Linear(in_features=160, out_features=1280, bias=True) + ) + (drop): Dropout(p=0.0, inplace=False) + (proj_out): Linear(in_features=640, out_features=160, bias=True) + ) + (relative_position_bias): RelativePositionBias() + ) + ) + (proj_out): Linear(in_features=160, out_features=160, bias=True) + (norm): GroupNorm(32, 160, eps=1e-06, affine=True) + ) + (up_blocks): ModuleList( + (0): UpBlock2D( + (resnets): ModuleList( + (0-2): 3 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (1): UpBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (2): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (3): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + (1-2): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + ) + (conv_norm_out): GroupNorm(32, 64, eps=1e-05, affine=True) + (conv_out_act): SiLU() + (conv_out): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (ema): EMA(ema_model=DC_SSDAE, decay=0.999, start_iter=50000) +) +[2025-10-25 04:11:47,970][main][INFO] - aux_losses parameters count: +[2025-10-25 04:11:47,971][main][INFO] - Total: #96.7M (trainable: #145.9K) +[2025-10-25 04:11:47,972][main][INFO] - - repa_loss: #82.7M (trainable: #145.9K) +[2025-10-25 04:11:47,972][main][INFO] - - lpips_loss: #14.0M (trainable: #0) +[2025-10-25 04:11:47,973][main][INFO] - aux_losses: DistributedDataParallel( + (module): SSDDLosses( + (repa_loss): REPALoss( + (features_extractor): Frozen(DinoEncoder/Dinov2Model) + (repa_mlp): Sequential( + (0): Linear(in_features=160, out_features=160, bias=True) + (1): SiLU() + (2): Linear(in_features=160, out_features=768, bias=True) + ) + (repa_loss): CosineSimilarity() + ) + (lpips_loss): Frozen(LPIPS) + ) +) +[2025-10-25 04:11:47,978][main][INFO] - Optimizer for autoencoder: RAdamScheduleFree ( +Parameter Group 0 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.001 + weight_lr_power: 2.0 + weight_sum: 0.0 + +Parameter Group 1 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.0 + weight_lr_power: 2.0 + weight_sum: 0.0 +) +[2025-10-25 04:11:47,983][main][INFO] - No training state found to resume from None +[2025-10-25 04:11:47,984][main][INFO] - ====================== RUNNING TASK train +[2025-10-25 04:11:47,984][main][INFO] - Starting training +[2025-10-25 04:11:47,984][main][INFO] - Batch size of 192 (24 per GPU, 1 acumulation step(s) 8 process(es)) +[2025-10-25 04:11:47,993][main][INFO] - --- + + +[2025-10-25 04:11:47,993][main][INFO] - [T_total=00:00:26 | T_train=00:00:00] Start epoch 0 +[T_total=00:00:36 | T_train=00:00:09 | T_epoch=00:00:09] Epoch 0, batch 1 / 6666 (step 0) loss=7.98227e+07 (avg=7.982e+07) [[all losses: diffusion=1.35239 ; kl=7.98227e+13 ; lpips=0.750356 ; repa=0.995288]] +[T_total=00:03:20 | T_train=00:02:53 | T_epoch=00:02:53] Epoch 0, batch 101 / 6666 (step 100) loss=4.42995e+06 (avg=4.43e+06) [[all losses: diffusion=1.1898 ; kl=4.42995e+12 ; lpips=0.700778 ; repa=0.953524 ; sum_loss=4.42995e+06]] +[T_total=00:06:04 | T_train=00:05:37 | T_epoch=00:05:37] Epoch 0, batch 201 / 6666 (step 200) loss=2.226e+06 (avg=2.226e+06) [[all losses: diffusion=0.747228 ; kl=2.22599e+12 ; lpips=0.637647 ; repa=0.927318 ; sum_loss=2.226e+06]] +[T_total=00:08:48 | T_train=00:08:21 | T_epoch=00:08:21] Epoch 0, batch 301 / 6666 (step 300) loss=1.48646e+06 (avg=1.486e+06) [[all losses: diffusion=0.553895 ; kl=1.48646e+12 ; lpips=0.587502 ; repa=0.901102 ; sum_loss=1.48646e+06]] +[T_total=00:11:32 | T_train=00:11:05 | T_epoch=00:11:05] Epoch 0, batch 401 / 6666 (step 400) loss=1.11577e+06 (avg=1.116e+06) [[all losses: diffusion=0.451203 ; kl=1.11577e+12 ; lpips=0.55372 ; repa=0.877731 ; sum_loss=1.11577e+06]] +[T_total=00:14:16 | T_train=00:13:49 | T_epoch=00:13:49] Epoch 0, batch 501 / 6666 (step 500) loss=893065 (avg=8.931e+05) [[all losses: diffusion=0.387243 ; kl=8.93064e+11 ; lpips=0.530042 ; repa=0.858555 ; sum_loss=893065]] +[T_total=00:17:00 | T_train=00:16:33 | T_epoch=00:16:33] Epoch 0, batch 601 / 6666 (step 600) loss=744468 (avg=7.445e+05) [[all losses: diffusion=0.343565 ; kl=7.44468e+11 ; lpips=0.512361 ; repa=0.84252 ; sum_loss=744468]] +[T_total=00:19:44 | T_train=00:19:17 | T_epoch=00:19:17] Epoch 0, batch 701 / 6666 (step 700) loss=638267 (avg=6.383e+05) [[all losses: diffusion=0.311698 ; kl=6.38267e+11 ; lpips=0.498459 ; repa=0.828846 ; sum_loss=638267]] +[T_total=00:22:28 | T_train=00:22:01 | T_epoch=00:22:01] Epoch 0, batch 801 / 6666 (step 800) loss=558584 (avg=5.586e+05) [[all losses: diffusion=0.287489 ; kl=5.58583e+11 ; lpips=0.48754 ; repa=0.817161 ; sum_loss=558584]] +[T_total=00:25:12 | T_train=00:24:45 | T_epoch=00:24:45] Epoch 0, batch 901 / 6666 (step 900) loss=496588 (avg=4.966e+05) [[all losses: diffusion=0.268364 ; kl=4.96587e+11 ; lpips=0.478222 ; repa=0.807135 ; sum_loss=496588]] +[T_total=00:27:56 | T_train=00:27:29 | T_epoch=00:27:29] Epoch 0, batch 1001 / 6666 (step 1000) loss=446979 (avg=4.47e+05) [[all losses: diffusion=0.252811 ; kl=4.46978e+11 ; lpips=0.470516 ; repa=0.798425 ; sum_loss=446979]] +[T_total=00:30:40 | T_train=00:30:13 | T_epoch=00:30:13] Epoch 0, batch 1101 / 6666 (step 1100) loss=406381 (avg=4.064e+05) [[all losses: diffusion=0.240129 ; kl=4.06381e+11 ; lpips=0.463378 ; repa=0.790714 ; sum_loss=406381]] +[T_total=00:33:24 | T_train=00:32:57 | T_epoch=00:32:57] Epoch 0, batch 1201 / 6666 (step 1200) loss=372627 (avg=3.726e+05) [[all losses: diffusion=0.229782 ; kl=3.72626e+11 ; lpips=0.45847 ; repa=0.783973 ; sum_loss=372627]] +[T_total=00:36:08 | T_train=00:35:41 | T_epoch=00:35:41] Epoch 0, batch 1301 / 6666 (step 1300) loss=343985 (avg=3.44e+05) [[all losses: diffusion=0.220689 ; kl=3.43984e+11 ; lpips=0.453284 ; repa=0.777787 ; sum_loss=343985]] +[T_total=00:38:52 | T_train=00:38:25 | T_epoch=00:38:25] Epoch 0, batch 1401 / 6666 (step 1400) loss=319432 (avg=3.194e+05) [[all losses: diffusion=0.212837 ; kl=3.19432e+11 ; lpips=0.448663 ; repa=0.772056 ; sum_loss=319432]] +[T_total=00:41:36 | T_train=00:41:09 | T_epoch=00:41:09] Epoch 0, batch 1501 / 6666 (step 1500) loss=298161 (avg=2.982e+05) [[all losses: diffusion=0.206076 ; kl=2.9816e+11 ; lpips=0.444743 ; repa=0.766849 ; sum_loss=298161]] +[T_total=00:44:19 | T_train=00:43:53 | T_epoch=00:43:53] Epoch 0, batch 1601 / 6666 (step 1600) loss=279538 (avg=2.795e+05) [[all losses: diffusion=0.200098 ; kl=2.79537e+11 ; lpips=0.44049 ; repa=0.761926 ; sum_loss=279538]] +[T_total=00:47:03 | T_train=00:46:37 | T_epoch=00:46:37] Epoch 0, batch 1701 / 6666 (step 1700) loss=263104 (avg=2.631e+05) [[all losses: diffusion=0.194673 ; kl=2.63103e+11 ; lpips=0.437218 ; repa=0.757459 ; sum_loss=263104]] +[T_total=00:49:47 | T_train=00:49:20 | T_epoch=00:49:20] Epoch 0, batch 1801 / 6666 (step 1800) loss=248495 (avg=2.485e+05) [[all losses: diffusion=0.189843 ; kl=2.48495e+11 ; lpips=0.433805 ; repa=0.753188 ; sum_loss=248495]] +[T_total=00:52:31 | T_train=00:52:04 | T_epoch=00:52:04] Epoch 0, batch 1901 / 6666 (step 1900) loss=235423 (avg=2.354e+05) [[all losses: diffusion=0.185441 ; kl=2.35423e+11 ; lpips=0.430557 ; repa=0.74918 ; sum_loss=235423]] +[T_total=00:55:15 | T_train=00:54:48 | T_epoch=00:54:48] Epoch 0, batch 2001 / 6666 (step 2000) loss=223658 (avg=2.237e+05) [[all losses: diffusion=0.18155 ; kl=2.23658e+11 ; lpips=0.42789 ; repa=0.74553 ; sum_loss=223658]] +[T_total=00:57:59 | T_train=00:57:32 | T_epoch=00:57:32] Epoch 0, batch 2101 / 6666 (step 2100) loss=213013 (avg=2.13e+05) [[all losses: diffusion=0.177931 ; kl=2.13013e+11 ; lpips=0.425059 ; repa=0.742016 ; sum_loss=213013]] +[T_total=01:00:43 | T_train=01:00:16 | T_epoch=01:00:16] Epoch 0, batch 2201 / 6666 (step 2200) loss=203335 (avg=2.033e+05) [[all losses: diffusion=0.174669 ; kl=2.03335e+11 ; lpips=0.422229 ; repa=0.738699 ; sum_loss=203335]] +[T_total=01:03:27 | T_train=01:03:01 | T_epoch=01:03:01] Epoch 0, batch 2301 / 6666 (step 2300) loss=194498 (avg=1.945e+05) [[all losses: diffusion=0.171763 ; kl=1.94498e+11 ; lpips=0.420135 ; repa=0.735667 ; sum_loss=194498]] +[T_total=01:06:11 | T_train=01:05:44 | T_epoch=01:05:44] Epoch 0, batch 2401 / 6666 (step 2400) loss=186398 (avg=1.864e+05) [[all losses: diffusion=0.16893 ; kl=1.86397e+11 ; lpips=0.417684 ; repa=0.732678 ; sum_loss=186398]] +[T_total=01:08:55 | T_train=01:08:28 | T_epoch=01:08:28] Epoch 0, batch 2501 / 6666 (step 2500) loss=178945 (avg=1.789e+05) [[all losses: diffusion=0.166345 ; kl=1.78944e+11 ; lpips=0.415514 ; repa=0.729864 ; sum_loss=178945]] +[T_total=01:11:39 | T_train=01:11:12 | T_epoch=01:11:12] Epoch 0, batch 2601 / 6666 (step 2600) loss=172065 (avg=1.721e+05) [[all losses: diffusion=0.16394 ; kl=1.72064e+11 ; lpips=0.41318 ; repa=0.727205 ; sum_loss=172065]] +[T_total=01:14:22 | T_train=01:13:55 | T_epoch=01:13:55] Epoch 0, batch 2701 / 6666 (step 2700) loss=165695 (avg=1.657e+05) [[all losses: diffusion=0.161669 ; kl=1.65694e+11 ; lpips=0.411074 ; repa=0.724636 ; sum_loss=165695]] +[T_total=01:17:05 | T_train=01:16:39 | T_epoch=01:16:39] Epoch 0, batch 2801 / 6666 (step 2800) loss=159779 (avg=1.598e+05) [[all losses: diffusion=0.159516 ; kl=1.59779e+11 ; lpips=0.409232 ; repa=0.722217 ; sum_loss=159779]] +[T_total=01:19:49 | T_train=01:19:23 | T_epoch=01:19:23] Epoch 0, batch 2901 / 6666 (step 2900) loss=154271 (avg=1.543e+05) [[all losses: diffusion=0.157551 ; kl=1.54271e+11 ; lpips=0.407319 ; repa=0.719872 ; sum_loss=154271]] +[T_total=01:22:33 | T_train=01:22:06 | T_epoch=01:22:06] Epoch 0, batch 3001 / 6666 (step 3000) loss=149147 (avg=1.491e+05) [[all losses: diffusion=0.155757 ; kl=1.49147e+11 ; lpips=0.405743 ; repa=0.717694 ; sum_loss=149147]] +[T_total=01:25:17 | T_train=01:24:50 | T_epoch=01:24:50] Epoch 0, batch 3101 / 6666 (step 3100) loss=144338 (avg=1.443e+05) [[all losses: diffusion=0.154043 ; kl=1.44337e+11 ; lpips=0.403968 ; repa=0.715546 ; sum_loss=144338]] +[T_total=01:28:01 | T_train=01:27:34 | T_epoch=01:27:34] Epoch 0, batch 3201 / 6666 (step 3200) loss=139828 (avg=1.398e+05) [[all losses: diffusion=0.15237 ; kl=1.39828e+11 ; lpips=0.402247 ; repa=0.713448 ; sum_loss=139828]] +[T_total=01:30:44 | T_train=01:30:18 | T_epoch=01:30:18] Epoch 0, batch 3301 / 6666 (step 3300) loss=135592 (avg=1.356e+05) [[all losses: diffusion=0.150841 ; kl=1.35592e+11 ; lpips=0.400374 ; repa=0.711401 ; sum_loss=135592]] +[T_total=01:33:28 | T_train=01:33:01 | T_epoch=01:33:01] Epoch 0, batch 3401 / 6666 (step 3400) loss=131606 (avg=1.316e+05) [[all losses: diffusion=0.149364 ; kl=1.31605e+11 ; lpips=0.398906 ; repa=0.709518 ; sum_loss=131606]] +[T_total=01:36:12 | T_train=01:35:45 | T_epoch=01:35:45] Epoch 0, batch 3501 / 6666 (step 3500) loss=127848 (avg=1.278e+05) [[all losses: diffusion=0.147982 ; kl=1.27847e+11 ; lpips=0.397529 ; repa=0.707721 ; sum_loss=127848]] +[T_total=01:38:56 | T_train=01:38:29 | T_epoch=01:38:29] Epoch 0, batch 3601 / 6666 (step 3600) loss=124298 (avg=1.243e+05) [[all losses: diffusion=0.146635 ; kl=1.24297e+11 ; lpips=0.395941 ; repa=0.705905 ; sum_loss=124298]] +[T_total=01:41:39 | T_train=01:41:13 | T_epoch=01:41:13] Epoch 0, batch 3701 / 6666 (step 3700) loss=120939 (avg=1.209e+05) [[all losses: diffusion=0.14539 ; kl=1.20939e+11 ; lpips=0.394409 ; repa=0.704188 ; sum_loss=120939]] +[T_total=01:44:23 | T_train=01:43:56 | T_epoch=01:43:56] Epoch 0, batch 3801 / 6666 (step 3800) loss=117757 (avg=1.178e+05) [[all losses: diffusion=0.144199 ; kl=1.17757e+11 ; lpips=0.39279 ; repa=0.702464 ; sum_loss=117757]] +[T_total=01:47:07 | T_train=01:46:40 | T_epoch=01:46:40] Epoch 0, batch 3901 / 6666 (step 3900) loss=114739 (avg=1.147e+05) [[all losses: diffusion=0.143076 ; kl=1.14738e+11 ; lpips=0.391549 ; repa=0.700919 ; sum_loss=114739]] +[T_total=01:49:51 | T_train=01:49:24 | T_epoch=01:49:24] Epoch 0, batch 4001 / 6666 (step 4000) loss=111871 (avg=1.119e+05) [[all losses: diffusion=0.142003 ; kl=1.11871e+11 ; lpips=0.390032 ; repa=0.699311 ; sum_loss=111871]] +[T_total=01:52:35 | T_train=01:52:08 | T_epoch=01:52:08] Epoch 0, batch 4101 / 6666 (step 4100) loss=109143 (avg=1.091e+05) [[all losses: diffusion=0.140951 ; kl=1.09143e+11 ; lpips=0.38867 ; repa=0.69779 ; sum_loss=109143]] +[T_total=01:55:19 | T_train=01:54:52 | T_epoch=01:54:52] Epoch 0, batch 4201 / 6666 (step 4200) loss=106545 (avg=1.065e+05) [[all losses: diffusion=0.139927 ; kl=1.06545e+11 ; lpips=0.387279 ; repa=0.696287 ; sum_loss=106545]] +[T_total=01:58:02 | T_train=01:57:36 | T_epoch=01:57:36] Epoch 0, batch 4301 / 6666 (step 4300) loss=104068 (avg=1.041e+05) [[all losses: diffusion=0.138941 ; kl=1.04067e+11 ; lpips=0.385905 ; repa=0.694821 ; sum_loss=104068]] +[T_total=02:00:46 | T_train=02:00:19 | T_epoch=02:00:19] Epoch 0, batch 4401 / 6666 (step 4400) loss=101703 (avg=1.017e+05) [[all losses: diffusion=0.138088 ; kl=1.01703e+11 ; lpips=0.384818 ; repa=0.693478 ; sum_loss=101703]] +[T_total=02:03:30 | T_train=02:03:03 | T_epoch=02:03:03] Epoch 0, batch 4501 / 6666 (step 4500) loss=99443.8 (avg=9.944e+04) [[all losses: diffusion=0.137204 ; kl=9.94433e+10 ; lpips=0.383534 ; repa=0.692084 ; sum_loss=99443.8]] +[T_total=02:06:13 | T_train=02:05:46 | T_epoch=02:05:46] Epoch 0, batch 4601 / 6666 (step 4600) loss=97285.3 (avg=9.729e+04) [[all losses: diffusion=0.136382 ; kl=9.72848e+10 ; lpips=0.382465 ; repa=0.690806 ; sum_loss=97285.3]] +[T_total=02:08:57 | T_train=02:08:30 | T_epoch=02:08:30] Epoch 0, batch 4701 / 6666 (step 4700) loss=95215.9 (avg=9.522e+04) [[all losses: diffusion=0.135576 ; kl=9.52154e+10 ; lpips=0.381187 ; repa=0.689488 ; sum_loss=95215.9]] +[T_total=02:11:40 | T_train=02:11:14 | T_epoch=02:11:14] Epoch 0, batch 4801 / 6666 (step 4800) loss=93232.6 (avg=9.323e+04) [[all losses: diffusion=0.134785 ; kl=9.32321e+10 ; lpips=0.380088 ; repa=0.688212 ; sum_loss=93232.6]] +[T_total=02:14:24 | T_train=02:13:57 | T_epoch=02:13:57] Epoch 0, batch 4901 / 6666 (step 4900) loss=91330.3 (avg=9.133e+04) [[all losses: diffusion=0.134018 ; kl=9.13298e+10 ; lpips=0.378915 ; repa=0.686957 ; sum_loss=91330.3]] +[T_total=02:17:08 | T_train=02:16:41 | T_epoch=02:16:41] Epoch 0, batch 5001 / 6666 (step 5000) loss=89504.1 (avg=8.95e+04) [[all losses: diffusion=0.133269 ; kl=8.95036e+10 ; lpips=0.377776 ; repa=0.685715 ; sum_loss=89504.1]] +[T_total=02:19:52 | T_train=02:19:25 | T_epoch=02:19:25] Epoch 0, batch 5101 / 6666 (step 5100) loss=87749.5 (avg=8.775e+04) [[all losses: diffusion=0.132582 ; kl=8.7749e+10 ; lpips=0.37657 ; repa=0.684502 ; sum_loss=87749.5]] +[T_total=02:22:35 | T_train=02:22:09 | T_epoch=02:22:09] Epoch 0, batch 5201 / 6666 (step 5200) loss=86062.4 (avg=8.606e+04) [[all losses: diffusion=0.131981 ; kl=8.6062e+10 ; lpips=0.375763 ; repa=0.683465 ; sum_loss=86062.4]] +[T_total=02:25:19 | T_train=02:24:52 | T_epoch=02:24:52] Epoch 0, batch 5301 / 6666 (step 5300) loss=84438.9 (avg=8.444e+04) [[all losses: diffusion=0.131319 ; kl=8.44385e+10 ; lpips=0.374613 ; repa=0.682315 ; sum_loss=84438.9]] +[T_total=02:28:03 | T_train=02:27:36 | T_epoch=02:27:36] Epoch 0, batch 5401 / 6666 (step 5400) loss=82875.6 (avg=8.288e+04) [[all losses: diffusion=0.13068 ; kl=8.28751e+10 ; lpips=0.373568 ; repa=0.681193 ; sum_loss=82875.6]] +[T_total=02:30:47 | T_train=02:30:20 | T_epoch=02:30:20] Epoch 0, batch 5501 / 6666 (step 5500) loss=81369 (avg=8.137e+04) [[all losses: diffusion=0.130048 ; kl=8.13685e+10 ; lpips=0.372563 ; repa=0.680121 ; sum_loss=81369]] +[T_total=02:33:30 | T_train=02:33:03 | T_epoch=02:33:03] Epoch 0, batch 5601 / 6666 (step 5600) loss=79918.6 (avg=7.992e+04) [[all losses: diffusion=0.129543 ; kl=7.99181e+10 ; lpips=0.371987 ; repa=0.679231 ; sum_loss=79918.6]] +[T_total=02:36:14 | T_train=02:35:47 | T_epoch=02:35:47] Epoch 0, batch 5701 / 6666 (step 5700) loss=78516.8 (avg=7.852e+04) [[all losses: diffusion=0.128967 ; kl=7.85163e+10 ; lpips=0.370938 ; repa=0.678176 ; sum_loss=78516.8]] +[T_total=02:38:58 | T_train=02:38:31 | T_epoch=02:38:31] Epoch 0, batch 5801 / 6666 (step 5800) loss=77163.3 (avg=7.716e+04) [[all losses: diffusion=0.128416 ; kl=7.71628e+10 ; lpips=0.369938 ; repa=0.677144 ; sum_loss=77163.3]] +[T_total=02:41:41 | T_train=02:41:15 | T_epoch=02:41:15] Epoch 0, batch 5901 / 6666 (step 5900) loss=75855.7 (avg=7.586e+04) [[all losses: diffusion=0.127848 ; kl=7.58552e+10 ; lpips=0.369027 ; repa=0.676136 ; sum_loss=75855.7]] +[T_total=02:44:25 | T_train=02:43:58 | T_epoch=02:43:58] Epoch 0, batch 6001 / 6666 (step 6000) loss=74591.6 (avg=7.459e+04) [[all losses: diffusion=0.127307 ; kl=7.45912e+10 ; lpips=0.368068 ; repa=0.675133 ; sum_loss=74591.6]] +[T_total=02:47:09 | T_train=02:46:42 | T_epoch=02:46:42] Epoch 0, batch 6101 / 6666 (step 6100) loss=73369 (avg=7.337e+04) [[all losses: diffusion=0.126812 ; kl=7.33685e+10 ; lpips=0.36707 ; repa=0.674153 ; sum_loss=73369]] +[T_total=02:49:53 | T_train=02:49:26 | T_epoch=02:49:26] Epoch 0, batch 6201 / 6666 (step 6200) loss=72187.3 (avg=7.219e+04) [[all losses: diffusion=0.126414 ; kl=7.21869e+10 ; lpips=0.3666 ; repa=0.673374 ; sum_loss=72187.3]] +[T_total=02:52:36 | T_train=02:52:09 | T_epoch=02:52:09] Epoch 0, batch 6301 / 6666 (step 6300) loss=71041.7 (avg=7.104e+04) [[all losses: diffusion=0.12591 ; kl=7.10412e+10 ; lpips=0.365679 ; repa=0.672437 ; sum_loss=71041.7]] +[T_total=02:55:20 | T_train=02:54:53 | T_epoch=02:54:53] Epoch 0, batch 6401 / 6666 (step 6400) loss=69931.8 (avg=6.993e+04) [[all losses: diffusion=0.125412 ; kl=6.99314e+10 ; lpips=0.36484 ; repa=0.67151 ; sum_loss=69931.8]] +[T_total=02:58:04 | T_train=02:57:37 | T_epoch=02:57:37] Epoch 0, batch 6501 / 6666 (step 6500) loss=68856.1 (avg=6.886e+04) [[all losses: diffusion=0.12495 ; kl=6.88557e+10 ; lpips=0.36393 ; repa=0.670594 ; sum_loss=68856.1]] +[T_total=03:00:48 | T_train=03:00:21 | T_epoch=03:00:21] Epoch 0, batch 6601 / 6666 (step 6600) loss=67813 (avg=6.781e+04) [[all losses: diffusion=0.124489 ; kl=6.78126e+10 ; lpips=0.363029 ; repa=0.669699 ; sum_loss=67813]] +[2025-10-25 07:13:55,982][main][INFO] - [T_total=03:02:34 | T_train=03:02:07 | T_epoch=03:02:07] End of epoch 0 (6666 steps) train loss 67151.8 +[2025-10-25 07:13:55,984][main][INFO] - [Epoch 0] All losses: [[diffusion=0.124198 ; kl=6.71513e+10 ; lpips=0.362462 ; repa=0.669115]] + Reconstructing from test set: 0%| | 0/261 [00:00 + sys.exit(main()) + ^^^^^^ + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main + args.func(args) + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/accelerate/commands/launch.py", line 1226, in launch_command + multi_gpu_launcher(args) + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher + distrib_run.run(args) + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/run.py", line 927, in run + elastic_launch( + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 156, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 284, in launch_agent + result = agent.run() + ^^^^^^^^^^^ + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 717, in run + result = self._invoke_run(role) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 881, in _invoke_run + time.sleep(monitor_interval) + File "/workspace/miniconda3/envs/DC_SSDAE/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 85, in _terminate_process_handler + raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval) +torch.distributed.elastic.multiprocessing.api.SignalException: Process 58142 got signal: 15