maxmo2009 committed 16 days ago

Commit

2af0e94

verified ·

1 Parent(s): 9f30236

Sync from local: code + epoch-110 checkpoint, clean README

Replace existing repo with current local OmniMorph: full source tree (training/inference/registration scripts), Diffusion/OMorpher modules, dataloader mappings (16 datasets), and Models/all_om_net/000110_all_om_net.pth (final checkpoint, 3.0G). README rewritten to remove internal links/credentials. BERT external model and intermediate checkpoints not bundled.

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
.gitignore +1 -0
Config/config_om.yaml +15 -20
Config/config_reg_brain.yaml +36 -0
Config/config_reg_hip.yaml +48 -0
Dataloader/dataLoader.py +172 -69
Dataloader/dataloader_utils.py +3 -3
Dataloader/deal_with_json.py +150 -0
Dataloader/embding_gen.py +10 -2
Dataloader/nifty_mappings/AbdomenAtlas_mappings.json +2 -2
Dataloader/nifty_mappings/AbdomenCT1k_mappings.json +2 -2
Dataloader/nifty_mappings/Brats2019_mappings.json +2 -2
Dataloader/nifty_mappings/Brats2020_mappings.json +2 -2
Dataloader/nifty_mappings/Brats2021_mappings.json +2 -2
Dataloader/nifty_mappings/CIA_mappings.json +2 -2
Dataloader/nifty_mappings/Kaggle_osic_mappings.json +0 -0
Dataloader/nifty_mappings/MSD_mappings.json +2 -2
Dataloader/nifty_mappings/MnMs_mappings.json +0 -0
Dataloader/nifty_mappings/OAI_ZIB_KL_mappings.json +3 -0
Dataloader/nifty_mappings/OAI_ZIB_WOMAC_mappings.json +3 -0
Dataloader/nifty_mappings/OASIS_1_mappings.json +2 -2
Dataloader/nifty_mappings/OASIS_2_mappings.json +2 -2
Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json +2 -2
Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json +2 -2
Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json +2 -2
Diffusion/diffuser-reg.py +541 -0
Diffusion/diffuser.py +45 -20
Diffusion/diffuser_opt.py +357 -0
Diffusion/losses.py +44 -7
Diffusion/losses_opt.py +141 -0
Diffusion/networks.py +328 -17
Diffusion/networks0.py +1195 -0
Diffusion/networks_opt.py +239 -0
Diffusion/safe_conv_transpose.py +401 -0
Models/all_om_net/000110_all_om_net.pth +3 -0
OM_reg.py +10 -18
OM_reg_flexres.py +382 -0
OM_train_2modes-reg.py +517 -0
OM_train_2modes.py +60 -69
OM_train_3modes-XPU.py +957 -0
OM_train_3modes.py +697 -198
OM_train_3modes_cudaonly.py +512 -0
OM_train_3modes_opt.py +513 -0
OM_train_3modes_original.py +585 -0
OMorpher/__init__.py +3 -0
OMorpher/omorpher.py +1058 -0
README.md +129 -80
Scripts/OM_aug_om.py +239 -0
Scripts/OM_reg_flexres_om.py +315 -0
Scripts/OM_reg_pair_ext.py +676 -0

.gitattributes CHANGED Viewed

@@ -46,3 +46,5 @@ Dataloader/nifty_mappings/OASIS_2_mappings.json filter=lfs diff=lfs merge=lfs -t
 Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json filter=lfs diff=lfs merge=lfs -text
 Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json filter=lfs diff=lfs merge=lfs -text
 Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json filter=lfs diff=lfs merge=lfs -text

 Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json filter=lfs diff=lfs merge=lfs -text
 Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json filter=lfs diff=lfs merge=lfs -text
 Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json filter=lfs diff=lfs merge=lfs -text
+Dataloader/nifty_mappings/OAI_ZIB_KL_mappings.json filter=lfs diff=lfs merge=lfs -text
+Dataloader/nifty_mappings/OAI_ZIB_WOMAC_mappings.json filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -15,6 +15,7 @@ External/
 # Logs
 Log/
 swanlog/
 train_log.txt
 aug_log.txt

 # Logs
 Log/
+Logs/
 swanlog/
 train_log.txt
 aug_log.txt

Config/config_om.yaml CHANGED Viewed

@@ -1,13 +1,13 @@
 data_name: all
-# net_name: recresacnet
-net_name: recmutattnnet
-# net_name: recmutattnnet1
 # net_name: defrecmutattnnet
 ndims: 3
 img_size: 128
-batchsize: 2
 ddf_pad_mode: border
-device: cuda
 img_pad_mode: zeros
 num_input_chn: 1
 padding_mode: border
@@ -19,23 +19,21 @@ v_scale: 5.0e-05
 epoch: 10000
 epoch_per_save: 1
 lr: 0.00001
-noise_scale: 0.1
 # =========================
 # AUGMENTATION SETTING
 patients_list: []
 # model_id_str: '000000'
 # model_id_str: '000180' # before registration training
-# model_id_str: '000353'  # good augmentation results on msd
-model_id_str: '000354'  #
 # model_id_str: '000157'
 # model_id_str: '000171'
-start_noise_step: 48      # starting from which noise step to add noise
 noise_step: 1
-aug_coe: 64                  # how many times each sample will be augmented
-# start_noise_step: 56      # starting from which noise step to add noise
-# noise_step: 4
-# aug_coe: 4                  # how many times each sample will be augmented
-condition_type: 'uncon'       # 'None', 'none', 'adding','independ', 'downsample', 'slice', 'project', 'uncon'
 # aug_img_savepath: Data/Aug_data/totseg/img/
 # aug_msk_savepath: Data/Aug_data/totseg/msk/
 # aug_ddf_savepath: Data/Aug_data/totseg/ddf/
@@ -45,9 +43,6 @@ condition_type: 'uncon'       # 'None', 'none', 'adding','independ', 'downsample
 reg_img_savepath: Data/Reg_data/om/img/
 reg_msk_savepath: Data/Reg_data/om/msk/
 reg_ddf_savepath: Data/Reg_data/om/ddf/
-# aug_img_savepath: Data/Aug_data/msd/img/
-# aug_msk_savepath: Data/Aug_data/msd/msk/
-# aug_ddf_savepath: Data/Aug_data/msd/ddf/
-aug_img_savepath: Data/Aug_data/mnms/img/
-aug_msk_savepath: Data/Aug_data/mnms/msk/
-aug_ddf_savepath: Data/Aug_data/mnms/ddf/

 data_name: all
+net_name: om_net
+# net_name: recmutattnnet
+# net_name: recmulmodmutattnnet
 # net_name: defrecmutattnnet
 ndims: 3
 img_size: 128
+batchsize: 3
 ddf_pad_mode: border
+device: xpu
 img_pad_mode: zeros
 num_input_chn: 1
 padding_mode: border
 epoch: 10000
 epoch_per_save: 1
 lr: 0.00001
+noise_scale: 0.05
 # =========================
 # AUGMENTATION SETTING
 patients_list: []
 # model_id_str: '000000'
 # model_id_str: '000180' # before registration training
+# model_id_str: '000356'
 # model_id_str: '000157'
 # model_id_str: '000171'
+model_id_str: '000009'
+start_noise_step: 75
 noise_step: 1
+# aug_coe: 32                  # how many times each sample will be augmented
+aug_coe: 1                  # how many times each sample will be augmented
+condition_type: 'slice'       # 'None', 'none', 'adding','independ', 'downsample', 'slice', 'project', 'uncon'
 # aug_img_savepath: Data/Aug_data/totseg/img/
 # aug_msk_savepath: Data/Aug_data/totseg/msk/
 # aug_ddf_savepath: Data/Aug_data/totseg/ddf/
 reg_img_savepath: Data/Reg_data/om/img/
 reg_msk_savepath: Data/Reg_data/om/msk/
 reg_ddf_savepath: Data/Reg_data/om/ddf/
+aug_img_savepath: Data/Aug_data/msd/img/
+aug_msk_savepath: Data/Aug_data/msd/msk/
+aug_ddf_savepath: Data/Aug_data/msd/ddf/

Config/config_reg_brain.yaml ADDED Viewed

	@@ -0,0 +1,36 @@

+data_name: all
+# net_name: recresacnet
+# net_name: recmutattnnet
+net_name: recmulmodmutattnnet
+# net_name: defrecmutattnnet
+ndims: 3
+img_size: 128
+batchsize: 3
+ddf_pad_mode: border
+device: xpu
+img_pad_mode: zeros
+num_input_chn: 1
+padding_mode: border
+resample_mode: bilinear
+timesteps: 80
+v_scale: 5.0e-05
+# =========================
+# TRAINING SETTING
+epoch: 10000
+epoch_per_save: 1
+lr: 0.00001
+noise_scale: 0.1
+# =========================
+# AUGMENTATION SETTING
+patients_list: []
+model_id_str: '000009'
+start_noise_step: 75
+noise_step: 1
+aug_coe: 1
+condition_type: 'none'
+reg_img_savepath: Data/Reg_data/unpair_brain/img/
+reg_msk_savepath: Data/Reg_data/unpair_brain/msk/
+reg_ddf_savepath: Data/Reg_data/unpair_brain/ddf/
+aug_img_savepath: Data/Aug_data/unpair_brain/img/
+aug_msk_savepath: Data/Aug_data/unpair_brain/msk/
+aug_ddf_savepath: Data/Aug_data/unpair_brain/ddf/

Config/config_reg_hip.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+data_name: all
+# net_name: recresacnet
+# net_name: recmutattnnet
+net_name: recmulmodmutattnnet
+# net_name: defrecmutattnnet
+ndims: 3
+img_size: 128
+batchsize: 3
+ddf_pad_mode: border
+device: xpu
+img_pad_mode: zeros
+num_input_chn: 1
+padding_mode: border
+resample_mode: bilinear
+timesteps: 80
+v_scale: 5.0e-05
+# =========================
+# TRAINING SETTING
+epoch: 10000
+epoch_per_save: 1
+lr: 0.00001
+noise_scale: 0.1
+# =========================
+# AUGMENTATION SETTING
+patients_list: []
+# model_id_str: '000000'
+# model_id_str: '000180' # before registration training
+# model_id_str: '000356'
+# model_id_str: '000157'
+# model_id_str: '000171'
+model_id_str: '000009'
+start_noise_step: 75
+noise_step: 1
+# aug_coe: 32                  # how many times each sample will be augmented
+aug_coe: 1                  # how many times each sample will be augmented
+condition_type: 'none'       # 'None', 'none', 'adding','independ', 'downsample', 'slice', 'project', 'uncon'
+# aug_img_savepath: Data/Aug_data/totseg/img/
+# aug_msk_savepath: Data/Aug_data/totseg/msk/
+# aug_ddf_savepath: Data/Aug_data/totseg/ddf/
+# aug_img_savepath: Data/Aug_data/om/img/
+# aug_msk_savepath: Data/Aug_data/om/msk/
+# aug_ddf_savepath: Data/Aug_data/om/ddf/
+reg_img_savepath: Data/Reg_data/pair_hip/img/
+reg_msk_savepath: Data/Reg_data/pair_hip/msk/
+reg_ddf_savepath: Data/Reg_data/pair_hip/ddf/
+aug_img_savepath: Data/Aug_data/pair_hip/img/
+aug_msk_savepath: Data/Aug_data/pair_hip/msk/
+aug_ddf_savepath: Data/Aug_data/pair_hip/ddf/

Dataloader/dataLoader.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import torch
 from torch.utils.data import Dataset, DataLoader
 import json
@@ -5,8 +8,8 @@ import SimpleITK as sitk
 import numpy as np
 from skimage.transform import rescale, resize, downscale_local_mean
 # from torchvision.transforms import v2
-import sys
-sys.path.append('./')
 from Dataloader.dataloader_utils import *
 import random
@@ -18,22 +21,42 @@ import random
 # }
 mapping_files = {
-    'MSD': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/MSD_mappings.json',
-    'TotalSegmentor': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json',
-    'Kaggle_osic': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/Kaggle_osic_mappings.json',
-    'CancerImageArchive': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/CIA_mappings.json',
-    'MnMs': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/MnMs_mappings.json',
-    # 'Brats2019': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2019_mappings.json',
-    'Brats2020': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2020_mappings.json',
-    'Brats2021': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2021_mappings.json',
-    'OASIS_1': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/OASIS_1_mappings.json',
-    'OASIS_2': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/OASIS_2_mappings.json',
-    'PSMA-FDG-PET-CT-LESION':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json',
-    'PSMA-CT':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json',
-    'AbdomenAtlas':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenAtlas_mappings.json',
-    'AbdomenCT1k':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenCT1k_mappings.json',
 }
 CLAMP_RANGE = [-400, 400]  # default clamp range for the images
@@ -74,50 +97,9 @@ def sample_random_uniform_multi_order(high=1., low=0., order_num=2, type='high')
             sample_value = np.random.uniform(low, high=sample_value)
     return sample_value
-class DummyOMDataset_indiv(Dataset):
-    """Dummy dataset that generates random 3D volumes and embeddings for XPU testing."""
-    def __init__(self, out_sz=128, num_samples=100, embd_dim=1024, transform=None):
-        self.out_sz = out_sz
-        self.num_samples = num_samples
-        self.embd_dim = embd_dim
-        self.transform = transform
-    def __len__(self):
-        return self.num_samples
-    def __getitem__(self, idx):
-        volume = np.random.rand(1, self.out_sz, self.out_sz, self.out_sz).astype(np.float64)
-        embd = np.random.randn(self.embd_dim).astype(np.float32)
-        if self.transform is not None:
-            volume = self.transform(volume)
-        return volume, embd
-class DummyOMDataset_pair(Dataset):
-    """Dummy dataset that generates random paired 3D volumes and embeddings for XPU testing."""
-    def __init__(self, out_sz=128, num_samples=100, embd_dim=1024, transform=None):
-        self.out_sz = out_sz
-        self.num_samples = num_samples
-        self.embd_dim = embd_dim
-        self.transform = transform
-    def __len__(self):
-        return self.num_samples
-    def __getitem__(self, idx):
-        volume_A = np.random.rand(1, self.out_sz, self.out_sz, self.out_sz).astype(np.float64)
-        volume_B = np.random.rand(1, self.out_sz, self.out_sz, self.out_sz).astype(np.float64)
-        embd_A = np.random.randn(self.embd_dim).astype(np.float32)
-        embd_B = np.random.randn(self.embd_dim).astype(np.float32)
-        if self.transform is not None:
-            volume_A = self.transform(volume_A)
-            volume_B = self.transform(volume_B)
-        return [volume_A, volume_B, embd_A, embd_B]
 class OminiDataset(object):
     """Base class for OmniMorph datasets."""
-    def init(self, out_sz, transform, clamp_range, min_crop_ratio, ROIs, modality,reverse_axis_order ,min_dim,mapping_files):
         # self.mappings = mapping_files
         self.ALLdata = self.combine_data(mappings = mapping_files)
@@ -155,10 +137,27 @@ class OminiDataset(object):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def get_3D_volume(self, volume, select_channel = None):
@@ -301,10 +300,27 @@ class OminiDataset_v1(Dataset):
     def combine_data(self):
         ALLdata = {}
         for j in self.mappings.keys():
             with open(self.mappings[j], 'r') as f:
                 mappings = json.load(f)
-                ALLdata.update(mappings)
         return ALLdata
     def __len__(self):
@@ -442,10 +458,27 @@ class OMDataset_indiv(Dataset):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def __len__(self):
@@ -496,7 +529,7 @@ class OMDataset_indiv(Dataset):
         return [volume, embd]
 class OminiDataset_paired(Dataset):
-    def __init__(self, out_sz = 128, transform=None, clamp_range = CLAMP_RANGE, min_crop_ratio = 0.9, ROIs = None, modality = None, reverse_axis_order = False):
         # self.mappings = mapping_files
         self.ALLdata = self.combine_data(mappings=mapping_files)
         self.out_sz = out_sz
@@ -525,10 +558,27 @@ class OminiDataset_paired(Dataset):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def normalize(self, volume, eps=1e-7):
@@ -747,10 +797,27 @@ class OMDataset_pair(Dataset):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def normalize(self, volume, eps=1e-7):
@@ -911,8 +978,8 @@ class OMDataset_pair(Dataset):
         paired_key = random.choice(paired_keys)
-        print(f"Key: {key}, Paired Key: {paired_key}")
-        print(f"ROI: {self.ALLdata_filtered[key]['ROI']}, {self.ALLdata_filtered[paired_key]['ROI']}; Modality: {self.ALLdata_filtered[key]['Modality']}, {self.ALLdata_filtered[paired_key]['Modality']}")
         volume_B = sitk.ReadImage(paired_key)
@@ -1004,10 +1071,27 @@ class OminiDataset_paired_inf(object):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def __len__(self):
@@ -1244,10 +1328,27 @@ class OminiDataset_inference_w_all(object):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
-                ALLdata.update(mappings_tmp)
         return ALLdata
     def normalize(self, volume, eps=1e-7):
@@ -1414,6 +1515,7 @@ class OminiDataset_inference_w_all(object):
                         # print(f"Label with channels, pad_width_lab: {pad_width_lab}")
                     else:
                         pad_width_lab = pad_width
                     label = self.apply_pad_crop(label, pad_width_lab, crop_slices)
                     # print(f"After pad and crop, label shape: {label.shape}, key: {key}, label key: {lk}")
                     label_dict[lk] = resize(label,[self.out_sz]*self.ndims, anti_aliasing = False, preserve_range = True, order=0)
@@ -1442,6 +1544,7 @@ class OminiDataset_inference_w_all(object):
         return return_dict
 class OminiDataset_bertembd(OminiDataset):
     def __init__(self,
                  out_sz = 128,
@@ -1453,7 +1556,7 @@ class OminiDataset_bertembd(OminiDataset):
                  reverse_axis_order = False,
                  min_dim = 3,
                  mapping_files = mapping_files):
-        super().init(out_sz = out_sz,
                      transform = transform,
                      clamp_range = clamp_range,
                      min_crop_ratio = min_crop_ratio,

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 import torch
 from torch.utils.data import Dataset, DataLoader
 import json
 import numpy as np
 from skimage.transform import rescale, resize, downscale_local_mean
 # from torchvision.transforms import v2
+# sys.path.append('./')
+sys.path.append(ROOT_DIR)
 from Dataloader.dataloader_utils import *
 import random
 # }
+# mapping_files = {
+#     'MSD': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/MSD_mappings.json',
+#     'TotalSegmentor': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json',
+#     'Kaggle_osic': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/Kaggle_osic_mappings.json',
+#     'CancerImageArchive': '/home/data/Github/OmniMorph/Dataloader/nifty_mappings/CIA_mappings.json',
+#     'MnMs': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/MnMs_mappings.json',
+#     # 'Brats2019': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2019_mappings.json',
+#     'Brats2020': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2020_mappings.json',
+#     'Brats2021': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/Brats2021_mappings.json',
+#     'OASIS_1': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/OASIS_1_mappings.json',
+#     'OASIS_2': '/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/OASIS_2_mappings.json',
+#     'PSMA-FDG-PET-CT-LESION':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json',
+#     'PSMA-CT':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json',
+#     'AbdomenAtlas':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenAtlas_mappings.json',
+#     'AbdomenCT1k':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenCT1k_mappings.json',
+# }
 mapping_files = {
+    'MSD': 'nifty_mappings/MSD_mappings.json',
+    'TotalSegmentor': 'nifty_mappings/TotalSegmentorCT_MRI_mappings.json',
+    'Kaggle_osic': 'nifty_mappings/Kaggle_osic_mappings.json',
+    'CancerImageArchive': 'nifty_mappings/CIA_mappings.json',
+    'MnMs': 'nifty_mappings/MnMs_mappings.json',
+    # 'Brats2019': 'nifty_mappings/Brats2019_mappings.json', # should be commented out after testing
+    'Brats2020': 'nifty_mappings/Brats2020_mappings.json',
+    'Brats2021': 'nifty_mappings/Brats2021_mappings.json',
+    'OASIS_1': 'nifty_mappings/OASIS_1_mappings.json',
+    'OASIS_2': 'nifty_mappings/OASIS_2_mappings.json',
+    'PSMA-FDG-PET-CT-LESION':'nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json',
+    'PSMA-CT':'nifty_mappings/PSMA-CT-Longitud_mappings.json',
+    'AbdomenAtlas':'nifty_mappings/AbdomenAtlas_mappings.json',
+    'AbdomenCT1k':'nifty_mappings/AbdomenCT1k_mappings.json',
+    'OAI_ZIB': 'nifty_mappings/OAI_ZIB_KL_mappings.json',
+    # 'OAI_ZIB': 'nifty_mappings/OAI_ZIB_WOMAC_mappings.json',  # alternative: WOMAC scores instead of KL-grade
 }
+for k,v in mapping_files.items():
+    mapping_files[k] = os.path.join(ROOT_DIR, v)
 CLAMP_RANGE = [-400, 400]  # default clamp range for the images
             sample_value = np.random.uniform(low, high=sample_value)
     return sample_value
 class OminiDataset(object):
     """Base class for OmniMorph datasets."""
+    def __init__(self, out_sz, transform, clamp_range, min_crop_ratio, ROIs, modality,reverse_axis_order ,min_dim,mapping_files):
         # self.mappings = mapping_files
         self.ALLdata = self.combine_data(mappings = mapping_files)
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def get_3D_volume(self, volume, select_channel = None):
     def combine_data(self):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in self.mappings.keys():
             with open(self.mappings[j], 'r') as f:
                 mappings = json.load(f)
+                skipped = 0
+                for k, v in mappings.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings) - skipped
+                total_entries += len(mappings)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def __len__(self):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def __len__(self):
         return [volume, embd]
 class OminiDataset_paired(Dataset):
+    def __init__(self, out_sz = 128, transform=None, clamp_range = CLAMP_RANGE, min_crop_ratio = 0.85, ROIs = None, modality = None, reverse_axis_order = False):
         # self.mappings = mapping_files
         self.ALLdata = self.combine_data(mappings=mapping_files)
         self.out_sz = out_sz
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def normalize(self, volume, eps=1e-7):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def normalize(self, volume, eps=1e-7):
         paired_key = random.choice(paired_keys)
+        # print(f"Key: {key}, Paired Key: {paired_key}")
+        # print(f"ROI: {self.ALLdata_filtered[key]['ROI']}, {self.ALLdata_filtered[paired_key]['ROI']}; Modality: {self.ALLdata_filtered[key]['Modality']}, {self.ALLdata_filtered[paired_key]['Modality']}")
         volume_B = sitk.ReadImage(paired_key)
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def __len__(self):
     def combine_data(self, mappings = mapping_files):
         ALLdata = {}
+        total_entries = 0
+        total_skipped = 0
         for j in mappings.keys():
             with open(mappings[j], 'r') as f:
                 mappings_tmp = json.load(f)
+                skipped = 0
+                for k, v in mappings_tmp.items():
+                    if not os.path.exists(k) or os.path.getsize(k) == 0:
+                        skipped += 1
+                        continue
+                    ALLdata[k] = v
+                accessible = len(mappings_tmp) - skipped
+                total_entries += len(mappings_tmp)
+                total_skipped += skipped
+                if skipped > 0:
+                    print(f"  WARNING: {j}: {accessible}/{len(mappings_tmp)} accessible ({skipped} missing/empty)")
+        if total_skipped > 0:
+            print(f"  DATA LOADING WARNING: {len(ALLdata)}/{total_entries} total files accessible ({total_skipped} missing)")
+        if len(ALLdata) < 1000:
+            print(f"  *** CRITICAL WARNING: Only {len(ALLdata)} files loaded! Expected ~15000+. "
+                  f"Check that data paths in nifty_mappings/ JSON files are accessible from this node. ***")
         return ALLdata
     def normalize(self, volume, eps=1e-7):
                         # print(f"Label with channels, pad_width_lab: {pad_width_lab}")
                     else:
                         pad_width_lab = pad_width
                     label = self.apply_pad_crop(label, pad_width_lab, crop_slices)
                     # print(f"After pad and crop, label shape: {label.shape}, key: {key}, label key: {lk}")
                     label_dict[lk] = resize(label,[self.out_sz]*self.ndims, anti_aliasing = False, preserve_range = True, order=0)
         return return_dict
 class OminiDataset_bertembd(OminiDataset):
     def __init__(self,
                  out_sz = 128,
                  reverse_axis_order = False,
                  min_dim = 3,
                  mapping_files = mapping_files):
+        super().__init__(out_sz = out_sz,
                      transform = transform,
                      clamp_range = clamp_range,
                      min_crop_ratio = min_crop_ratio,

Dataloader/dataloader_utils.py CHANGED Viewed

@@ -48,9 +48,9 @@ def get_sizeRange_dict(roi=''):
       'abdomen': [240, 1024],
       'pelvis': [220, 1024],
       'thorax': [220, 1024],
-      'arm': [140, 1024],
-      'hand': [140, 1024],
-      'leg': [160, 1024],
       'skeleton': [130, 1024],
     }
     if roi in sizeRange_dict:

       'abdomen': [240, 1024],
       'pelvis': [220, 1024],
       'thorax': [220, 1024],
+      'arm': [100, 1024],
+      'hand': [100, 1024],
+      'leg': [100, 1024],
       'skeleton': [130, 1024],
     }
     if roi in sizeRange_dict:

Dataloader/deal_with_json.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import json
+# CORRECT_DATA_PATH = os.path.join(ROOT_DIR, '../..')
+# CORRECT_DATA_PATH = os.path.join('/hy-tmp')
+CORRECT_DATA_PATH = '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/Omini3D'
+def traverse_and_print(data, path=()):
+    for key, value in data.items():
+        current_path = path + (key,)
+        if isinstance(key, str) and 'DATASETS' in key:
+            print(f"KEY (str): {key}")
+        if isinstance(value, str) and 'DATASETS' in value:
+            print(f"  VALUE (str): {value}")
+        elif isinstance(value, dict):
+            traverse_and_print(value, current_path)
+def traverse_and_check(data, path=()):
+    failed_files = []
+    for key, value in data.items():
+        current_path = path + (key,)
+        if isinstance(key, str) and 'DATASETS_processed' in key:
+            if os.path.isfile(key):
+                print(f'\rCheck pass: {key}', end='')
+            else:
+                print(f'\rCheck fail ! : {key}')
+                failed_files.append(key)
+        if isinstance(value, str) and 'DATASETS_processed' in value:
+            if os.path.isfile(value):
+                print(f'\rCheck pass: {value}', end='')
+            else:
+                print(f'\rCheck fail ! : {value}')
+                failed_files.append(value)
+        elif isinstance(value, dict):
+            traverse_and_check(value, current_path)
+    if failed_files != []:
+        print(f'\nCheck finished. Failed files: {failed_files}')
+        return False
+    else:
+        print('\nAll files check passed!')
+        return True
+def traverse_and_revise(data, path=()):
+    what_need_change = [
+        '/home/jachin/data/Github/data/data_gen_def',
+        '/home/data/Github/data/data_gen_def',
+    ]
+    for key, value in list(data.items()):
+        current_path = path + (key,)
+        new_key = key
+        if isinstance(key, str) and 'data_gen_def' in key:
+            for wnc in what_need_change:
+                if wnc in key:
+                    new_key = key.replace(wnc, CORRECT_DATA_PATH)
+            # change keys
+            data[new_key] = data.pop(key)
+            value = data[new_key]
+            current_path = path + (new_key,)
+        if isinstance(value, str) and 'data_gen_def' in value:
+            for wnc in what_need_change:
+                if wnc in value:
+                    data[new_key] = value.replace(wnc, CORRECT_DATA_PATH)
+        elif isinstance(value, dict):
+            traverse_and_revise(value, current_path)
+    return data
+def traverse_and_rename_label(data, old_label, new_label, task_keys=("segmentation", "registration")):
+    """Rename a label key inside Label_path -> segmentation/registration for every entry.
+    Example: rename "brain" -> "brain_tumour" to fix the BraTS mislabel.
+    """
+    count = 0
+    for key, value in data.items():
+        if not isinstance(value, dict):
+            continue
+        label_path = value.get("Label_path")
+        if isinstance(label_path, dict):
+            for tk in task_keys:
+                task_dict = label_path.get(tk)
+                if isinstance(task_dict, dict) and old_label in task_dict:
+                    task_dict[new_label] = task_dict.pop(old_label)
+                    count += 1
+        else:
+            # recurse into nested dicts
+            count += traverse_and_rename_label(value, old_label, new_label, task_keys)
+    return count
+mapping_files = {
+    'MSD': 'nifty_mappings/MSD_mappings.json',
+    'TotalSegmentor': 'nifty_mappings/TotalSegmentorCT_MRI_mappings.json',
+    'Kaggle_osic': 'nifty_mappings/Kaggle_osic_mappings.json',
+    'CancerImageArchive': 'nifty_mappings/CIA_mappings.json',
+    'MnMs': 'nifty_mappings/MnMs_mappings.json',
+    'Brats2019': 'nifty_mappings/Brats2019_mappings.json',
+    'Brats2020': 'nifty_mappings/Brats2020_mappings.json',
+    'Brats2021': 'nifty_mappings/Brats2021_mappings.json',
+    'OASIS_1': 'nifty_mappings/OASIS_1_mappings.json',
+    'OASIS_2': 'nifty_mappings/OASIS_2_mappings.json',
+    'PSMA-FDG-PET-CT-LESION':'nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json',
+    'PSMA-CT':'nifty_mappings/PSMA-CT-Longitud_mappings.json',
+    'AbdomenAtlas':'nifty_mappings/AbdomenAtlas_mappings.json',
+    'AbdomenCT1k':'nifty_mappings/AbdomenCT1k_mappings.json',
+}
+for k,v in mapping_files.items():
+    mapping_files[k] = os.path.join(ROOT_DIR, v)
+if __name__ == "__main__":
+    # --- Fix BraTS / MSD mislabel: "brain" -> "brain_tumour" ---
+    rename_datasets = ['Brats2019', 'Brats2020', 'Brats2021', 'MSD']
+    for ds_name in rename_datasets:
+        if ds_name not in mapping_files:
+            continue
+        v = mapping_files[ds_name]
+        with open(v, 'r') as f:
+            mappings_tmp = json.load(f)
+        n = traverse_and_rename_label(mappings_tmp, 'brain', 'brain_tumour')
+        if n > 0:
+            with open(v, 'w') as f:
+                json.dump(mappings_tmp, f, indent=4)
+            print(f'[{ds_name}] Renamed "brain" -> "brain_tumour" in {n} entries, saved to {v}')
+        else:
+            print(f'[{ds_name}] No "brain" labels found (already renamed?)')
+    # --- Path revision (uncomment to run) ---
+    # for k,v in mapping_files.items():
+    #     with open(v, 'r') as f:
+    #         mappings_tmp = json.load(f)
+    #         new_mappings_tmp = traverse_and_revise(mappings_tmp)
+    #         # traverse_and_print(new_mappings_tmp)
+    #         # all_good = traverse_and_check(new_mappings_tmp)
+    #     # save in-place
+    #     with open(v, 'w') as f:
+    #         json.dump(new_mappings_tmp, f, indent=4)
+    #     print(f'Saved revised mapping to {v}')

Dataloader/embding_gen.py CHANGED Viewed

@@ -23,7 +23,9 @@ mapping_files = {
     # 'Brats2020': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2020/nifti_mappings.json',
     # 'Brats2021': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2021/nifti_mappings.json',
     # 'OASIS_1': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL/nifti_mappings.json',
-    'OASIS_2': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW_V2/nifti_mappings.json',
     # 'PSMA-FDG-PET-CT-LESION':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/nifti_mappings.json',
     # 'PSMA-CT':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/nifti_mappings.json',
     # 'AbdomenAtlas':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/nifti_mappings.json',
@@ -45,6 +47,8 @@ save_paths = {
     'PSMA-CT':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json',
     'AbdomenAtlas':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenAtlas_mappings.json',
     'AbdomenCT1k':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenCT1k_mappings.json',
 }
 query = {
     'MSD': ['description'],
@@ -61,6 +65,8 @@ query = {
     'PSMA-CT':[],
     'AbdomenAtlas':[],
     'AbdomenCT1k':[],
 }
 add_text = {
     'MSD': {},
@@ -77,11 +83,13 @@ add_text = {
     'PSMA-FDG-PET-CT-LESION':{'description': 'malignant melanoma, lymphoma, lung cancer, or healthy'},
     'AbdomenAtlas':{},
     'AbdomenCT1k':{},
 }
 # bert intialization
-model_name = '/home/jachin/data/Github/OmniMorph/External/Models/bert_large_uncased'
 reduce_method = 'mean'
 max_words_num = 32  # max number of words in the caption > 2
 # max_words_num = 64  # max number of words in the caption > 2

     # 'Brats2020': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2020/nifti_mappings.json',
     # 'Brats2021': '/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/BRATS/BRATS2021/nifti_mappings.json',
     # 'OASIS_1': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_1/CS_SECTIONAL/nifti_mappings.json',
+    # 'OASIS_2': '/home/data/Github/data/data_gen_def/DATASETS_processed/OASIS/OASIS_2/RAW_V2/nifti_mappings.json',
+    'OAI_ZIB_KL': '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/Omini3D/DATASETS_processed/OAI_ZIB/nifti_mappings.json',
+    'OAI_ZIB_WOMAC': '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Data/Omini3D/DATASETS_processed/OAI_ZIB/nifti_mappings.json',
     # 'PSMA-FDG-PET-CT-LESION':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/PSMA-FDG-PET-CT-LESION/V2/nifti_mappings.json',
     # 'PSMA-CT':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/PSMA/Longitudinal-CT/nifti_mappings.json',
     # 'AbdomenAtlas':'/home/jachin/data/Github/data/data_gen_def/DATASETS_processed/AbdomenAtlas_v2/nifti_mappings.json',
     'PSMA-CT':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json',
     'AbdomenAtlas':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenAtlas_mappings.json',
     'AbdomenCT1k':'/home/jachin/data/Github/OmniMorph/Dataloader/nifty_mappings/AbdomenCT1k_mappings.json',
+    'OAI_ZIB_KL': '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Code/OmniMorph/Dataloader/nifty_mappings/OAI_ZIB_KL_mappings.json',
+    'OAI_ZIB_WOMAC': '/home/dn-zhen2/rds/rds-airr-p51-TWhPgQVLKbA/Code/OmniMorph/Dataloader/nifty_mappings/OAI_ZIB_WOMAC_mappings.json',
 }
 query = {
     'MSD': ['description'],
     'PSMA-CT':[],
     'AbdomenAtlas':[],
     'AbdomenCT1k':[],
+    'OAI_ZIB_KL': ['Age', 'Gender', 'KL_Grade', 'BMI'],
+    'OAI_ZIB_WOMAC': ['Age', 'Gender', 'WOMAC_Pain', 'WOMAC_ADL', 'WOMAC_Stiffness', 'BMI'],
 }
 add_text = {
     'MSD': {},
     'PSMA-FDG-PET-CT-LESION':{'description': 'malignant melanoma, lymphoma, lung cancer, or healthy'},
     'AbdomenAtlas':{},
     'AbdomenCT1k':{},
+    'OAI_ZIB_KL': {'description': 'right knee osteoarthritis'},
+    'OAI_ZIB_WOMAC': {'description': 'right knee osteoarthritis'},
 }
 # bert intialization
+model_name = '/rds/project/rds-TWhPgQVLKbA/Code/OmniMorph/External/Models/bert_large_uncased'
 reduce_method = 'mean'
 max_words_num = 32  # max number of words in the caption > 2
 # max_words_num = 64  # max number of words in the caption > 2

Dataloader/nifty_mappings/AbdomenAtlas_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:303c3fb7388e7b3b01cb6f494c3ac3f542da98487039e5b2415786ac4af58ba0
-size 179457573

 version https://git-lfs.github.com/spec/v1
+oid sha256:6000e9ba6b4fac278a1288826696ab7d5f77c97929d7e001dfb8938d7d5aa0a8
+size 182087319

Dataloader/nifty_mappings/AbdomenCT1k_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0abaaa1013fdafe3fae6d5544746a66d8b20892ceb3cf9141a125113984e8350
-size 37315918

 version https://git-lfs.github.com/spec/v1
+oid sha256:a36ccd80e859aefd7334fb99ebca10601bb39be9e6432a1f59b4e98e9c4069a8
+size 30687976

Dataloader/nifty_mappings/Brats2019_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c5b80fc861484d36d8d6e0f97c404e2c321ee965cc1556a868205f5937d24fe
-size 12126490

 version https://git-lfs.github.com/spec/v1
+oid sha256:8f128806b4673b7e1219990f0e2c5732abd1080fd4de271195fa74538c32ab70
+size 12178080

Dataloader/nifty_mappings/Brats2020_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:de345c6a66a4f33552aacbb961cd034ac488500ff5d48810579055f0543162dc
-size 17743015

 version https://git-lfs.github.com/spec/v1
+oid sha256:90659bf584857b9e543163431e3730c6e6ce229b3386dc8ab13e7411a6b00c78
+size 17815563

Dataloader/nifty_mappings/Brats2021_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4990a7031d6ac91e1c33e6db046dddf234f67dd8edecd07691675945b9d00af5
-size 44722001

 version https://git-lfs.github.com/spec/v1
+oid sha256:c758b9cfb8190f3b77eef03ea93a43f95e2d9e89dae4b08f6ae4dabc65024b97
+size 44888384

Dataloader/nifty_mappings/CIA_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:98cbd21d3d5b7f5fb84091705fbbfcd0f8f26cb26ff4b34ffcf546cf1cedb48a
-size 32744567

 version https://git-lfs.github.com/spec/v1
+oid sha256:1aef79728ee6d2ab15ab7225a52d5e437cd10d33cfdcbb6f4d9c2aee1687d5f3
+size 32803157

Dataloader/nifty_mappings/Kaggle_osic_mappings.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

Dataloader/nifty_mappings/MSD_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a1ab13c61cd6829f088ee92bff4ce12a0f0e19fc9367682291fbd9717b149e83
-size 92620864

 version https://git-lfs.github.com/spec/v1
+oid sha256:8b777fb0d1ab09b22dcb3048b25cf60a31ccc30749888f1f02d7dc4b43715ad6
+size 92732794

Dataloader/nifty_mappings/MnMs_mappings.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

Dataloader/nifty_mappings/OAI_ZIB_KL_mappings.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ab4159932276f0ccd52efe44986ed184b504162f568cec68fc76fa0769efad
+size 18096063

Dataloader/nifty_mappings/OAI_ZIB_WOMAC_mappings.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4dad37ced9f1dbe3819dd6ac0d51b6585c25e641b4d07352d706aaf3ac17c19a
+size 18119154

Dataloader/nifty_mappings/OASIS_1_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8784bff1bb5c9ba08fccc8ca9776f3f26c9b2993c1c446ef17d5ba1dd2bda490
-size 15609846

 version https://git-lfs.github.com/spec/v1
+oid sha256:a39ccde5fe81bd7b2b5fa1cc64feb7094ff83851bfd40a5287e01d817e45db59
+size 15646470

Dataloader/nifty_mappings/OASIS_2_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f88910a0846e056b0d4caacd6e6ebfebde52b537828756e217d9a6c6343177c
-size 13396017

 version https://git-lfs.github.com/spec/v1
+oid sha256:7665f7769ef262f1758af1cf42e1610f211c53d35a625a457c5a50bca3841757
+size 13440390

Dataloader/nifty_mappings/PSMA-CT-Longitud_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c3c8729df59b6e9771fa791c5fe1cd7636e83a3c17109613984cdce0d92eefdc
-size 11700732

 version https://git-lfs.github.com/spec/v1
+oid sha256:ebd252fec7062df77452b0bdeab47013314aba638cf0b0de295bc62748d2cfec
+size 11728536

Dataloader/nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:922363b739e1f14243731ea283ee730bc55724a27360d2f28f32b01b23ede5d9
-size 48425273

 version https://git-lfs.github.com/spec/v1
+oid sha256:cab3cbb5a5a651e1c3446079a3c18b944ed1893893ccd25451c110f13eebe4cc
+size 48538337

Dataloader/nifty_mappings/TotalSegmentorCT_MRI_mappings.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c36ba45053fea97244c259af0151ddb02e8281fce8c8f439cc88733bd71d668f
-size 67962146

 version https://git-lfs.github.com/spec/v1
+oid sha256:a922ecc5c136bcc3427f81e970d1cdd02e3b6c61bedc198e99b6fec8c380b4c3
+size 69966911

Diffusion/diffuser-reg.py ADDED Viewed

	@@ -0,0 +1,541 @@

+from torch import nn
+import torch
+import numpy as np
+from torch.nn.utils.stateless import functional_call
+import Diffusion.utils_diff as utils
+from Diffusion.networks import *
+# from networks import *
+import random
+EPS = 1e-8
+class DeformDDPM(nn.Module):
+    def __init__(
+        self,
+        network,
+        n_steps=50,
+        beta_schedule_fn = None,
+        device='cpu',
+        image_chw=(1, 28, 28),
+        batch_size = 1,
+        img_pad_mode = "zeros",
+        ddf_pad_mode="border",
+        padding_mode="border",
+        v_scale = 0.008/256,
+        resample_mode=None,
+        inf_mode = False,
+        ):
+        """Store the configuration and build the spatial transformers (STN).
+
+        Args:
+            network: Model invoked by ``recover``; stored as-is (not moved to
+                ``device`` here).
+            n_steps: Stored as ``self.n_steps``.
+            beta_schedule_fn: Not used anywhere in this constructor.
+            device: Device passed to every STN built here.
+            image_chw: Image shape tuple; its length minus one is taken as the
+                number of spatial dims and ``image_chw[1]`` as the side length
+                (presumably square/cubic images -- TODO confirm).
+            batch_size: Batch size used when random fields are generated.
+            img_pad_mode: Padding mode of the image and mask STNs.
+            ddf_pad_mode: Padding mode of the control-grid STN.
+            padding_mode: Padding mode of the full-resolution DDF STN.
+            v_scale: Scale of the random velocity fields.
+            resample_mode: Resample mode of the image STN.
+            inf_mode: Stored as ``self.inf_mode``; read by ``get_slice_mask``.
+        """
+        super(DeformDDPM, self).__init__()
+        self.rec_num=2
+        # Spatial dimensionality: every entry of image_chw except the first.
+        self.ndims=len(image_chw)-1
+        self.n_steps = n_steps
+        self.v_scale = v_scale
+        self.device = device
+        # 0 means masked-out voxels are zeroed rather than filled with noise
+        # (see proc_cond_img).
+        self.msk_noise_scale = torch.tensor(0)
+        # self.msk_noise_scale = torch.tensor(1)
+        # print('================')
+        # print("device:",device)
+        # if device == 'cpu':
+        #     print("num_device: 1")
+        # else:
+        #     print("num_device:", torch.cuda.device_count())
+        # print('================')
+        self.num_device = torch.cuda.device_count()
+        self.batch_size = batch_size #//self.num_device
+        self.img_pad_mode = img_pad_mode
+        self.ddf_pad_mode = ddf_pad_mode
+        self.padding_mode = padding_mode
+        self.resample_mode = resample_mode
+        self.image_chw = image_chw
+        self.network = network#.to(self.device)
+        # STN operating at full image resolution.
+        self.ddf_stn_full = STN(
+                                    img_sz = self.image_chw[1],
+                                    ndims = self.ndims,
+                                    padding_mode = self.padding_mode,
+                                    device = self.device,
+                                )
+        # Builds ddf_stn_rec, img_stn and msk_stn and sets ctl_sz / img_sz.
+        self._DDF_Encoder_init()
+        self.copy_opt = nn.Identity()
+        self.inf_mode = inf_mode
+        return
+    def get_stn(self):
+        return self.img_stn, self.ddf_stn_full
+    def _DDF_Encoder_init(self, ctl_ratio=4, ctl_sz=None, resample_mode=None):
+        """Create the control-grid, image and mask spatial transformers.
+
+        Args:
+            ctl_ratio: Image side length divided by this gives the control-grid
+                side length when ``ctl_sz`` is not given.
+            ctl_sz: Explicit control-grid side length; overrides ``ctl_ratio``.
+            resample_mode: NOTE(review): not read in this body --
+                ``self.resample_mode`` is used instead; confirm that is intended.
+        """
+        if ctl_sz is None:
+            ctl_sz = self.image_chw[1] // ctl_ratio
+        self.ctl_sz=ctl_sz
+        self.img_sz=self.image_chw[1]
+        # STN at control-grid resolution, used when composing random fields.
+        self.ddf_stn_rec=STN(img_sz=ctl_sz,ndims=self.ndims,device=self.device,padding_mode=self.ddf_pad_mode)
+        # STN used to warp images at full resolution.
+        self.img_stn=STN(img_sz=self.img_sz,ndims=self.ndims,device=self.device,padding_mode=self.img_pad_mode,resample_mode=self.resample_mode)
+        # Same geometry as img_stn but with nearest resampling.
+        self.msk_stn=STN(img_sz=self.img_sz,ndims=self.ndims,device=self.device,padding_mode=self.img_pad_mode,resample_mode='nearest')
+    def _get_ddf_scale(self,t,divide_num=1,max_ddf_num=200):   # 128
+        rec_num = 1
+        mul_num_ddf = torch.floor_divide(2*torch.pow(t,1.3), 3*divide_num).int()
+        mul_num_dvf = torch.floor_divide(torch.pow(t,0.6), divide_num).int()
+        # print("time_step:",t,"mul_num_ddf:",mul_num_ddf,"mul_num_dvf:",mul_num_dvf)
+        # mul_num_ddf = self._sample_random_uniform_multi_order(high=mul_num_ddf)
+        # mul_num_dvf = self._sample_random_uniform_multi_order(high=mul_num_dvf)
+        mul_num_ddf = torch.clamp(mul_num_ddf, min=1, max=max_ddf_num)
+        mul_num_dvf = torch.clamp(mul_num_dvf, min=0, max=max_ddf_num)
+        # print("time_step:",t,"mul_num_ddf:",mul_num_ddf,"mul_num_dvf:",mul_num_dvf)
+        return rec_num,mul_num_ddf,mul_num_dvf
+    # def _sample_random_uniform_multi_order(self, high=None, low=0, order_num=3):
+    #     # high: tensor of shape (...), low: int or tensor broadcastable to high
+    #     sample_num = torch.full_like(high, low) if not isinstance(low, torch.Tensor) else low.clone()
+    #     for _ in range(order_num):
+    #         # For each element, sample in [sample_num, high]
+    #         # torch.randint requires scalar low/high, so we use elementwise sampling
+    #         rand_shape = high.shape
+    #         # Clamp sample_num to be <= high
+    #         sample_num = torch.minimum(sample_num, high)
+    #         # Generate random numbers for each element
+    #         rand = torch.empty(rand_shape, dtype=high.dtype, device=high.device)
+    #         for idx in np.ndindex(rand_shape):
+    #             l = sample_num[idx].item()
+    #             h = high[idx].item()
+    #             if l >= h:
+    #                 rand[idx] = l
+    #             else:
+    #                 rand[idx] = torch.randint(l, h + 1, (1,), device=high.device)
+    #         sample_num = rand.to(high.dtype)
+    #     return sample_num
+    def _get_random_ddf(self,img,t):
+        rec_num, mul_num_ddf, mul_num_dvf = self._get_ddf_scale(t=t)
+        ddf_forward,dvf_forward = self._random_ddf_generate(rec_num=rec_num, mul_num=[mul_num_ddf,mul_num_dvf])
+        warped_img = self.img_stn(img,ddf_forward)
+        return warped_img, dvf_forward,ddf_forward
+    def _multiscale_dvf_generate(self,v_scale,ctl_szs=[4,8,16,32,64], rand_v_scale=True):
+        """Sum Gaussian random fields drawn at several grid sizes.
+
+        Args:
+            v_scale: Amplitude of the Gaussian samples (upper bound of the
+                sampled amplitude when ``rand_v_scale`` is True).
+            ctl_szs: Side lengths of the coarse grids to draw noise on. The
+                default list is only read, never modified.
+            rand_v_scale: When True the amplitude of each scale is drawn with
+                ``_sample_random_uniform_multi_order`` instead of being fixed.
+
+        Returns:
+            Tensor of shape [batch_size, ndims] + [ctl_sz] * ndims, created
+            without an explicit device.
+        """
+        dvf=0
+        if self.img_sz is None:
+            self.img_sz=max(ctl_szs)
+        if 1 in ctl_szs:
+            # Extra component from utils.random_ddf with no Gaussian part
+            # (range_gauss=0); presumably a small random rotation -- TODO confirm.
+            dvf_rot = utils.random_ddf(batch_size=self.batch_size, ndims=self.ndims, img_sz=[self.ctl_sz]*self.ndims, range_gauss=0, rot_range=np.pi/90)
+            dvf = dvf + dvf_rot
+        for ctl_sz in ctl_szs:
+            _v_scale = self._sample_random_uniform_multi_order(high=v_scale, low=1e-8, order_num=2) if rand_v_scale else v_scale
+            # temp>>
+            # Halve the amplitude of the coarsest grids (side length <= 2).
+            if ctl_sz <= 2:
+                _v_scale = _v_scale/2
+            # temp<<
+            dvf_comp = torch.randn([self.batch_size, self.ndims] + [ctl_sz]*self.ndims) * _v_scale
+            # Rescale by the resolution ratio, then upsample to the control-grid size.
+            dvf_comp = F.interpolate(dvf_comp * self.ctl_sz / ctl_sz, [self.ctl_sz]*self.ndims, align_corners=False, mode='bilinear' if self.ndims == 2 else 'trilinear')
+            dvf=dvf+dvf_comp
+        return dvf
+    def _sample_random_uniform_multi_order(self, high=None, low=0., order_num=3):
+        sample_value = low
+        for _ in range(order_num):
+            sample_value = np.random.uniform(low=sample_value, high=high)
+        return sample_value
+    def _random_ddf_generate(self,rec_num=3,mul_num=[torch.tensor([5]),torch.tensor([5])],ddf0=None,keep_inverse=False,noise_ratio=0.08,select_num=4, flip_ratio=0.5):
+        crop_rate=2
+        for _ in range(self.ndims+1):
+            mul_num=[torch.unsqueeze(n,-1) for n in mul_num]
+        # v_scale = v_scale *crop_rate
+        ctl_ddf_sz=[self.batch_size, self.ndims] + [self.ctl_sz] * self.ndims
+        if ddf0 is not None:
+            ddf=ddf0
+        else:
+            ddf = torch.zeros(ctl_ddf_sz) * 0
+        dddf = torch.zeros(ctl_ddf_sz) * 0
+        scale_num = min(8,int(math.log2(self.ctl_sz)))   # allow affine
+        # scale_num = min(5,int(math.log2(self.ctl_sz))-1)   # semi-allow affine
+        # scale_num = min(5,int(math.log2(self.ctl_sz))-2)   # avoid coupling between deformation and affine
+        ctl_szs_all = [self.ctl_sz // (2 ** i) for i in range(scale_num)]
+        for i in range(rec_num):
+            # Randomly select 5 elements from ctl_szs (if there are at least 5)
+            if len(ctl_szs_all) > select_num:
+                ctl_szs = random.sample(ctl_szs_all, select_num)
+            dvf = self._multiscale_dvf_generate(self.v_scale, ctl_szs=ctl_szs).to(self.device)
+            # if True:
+            if noise_ratio==0:
+                dvf0=dvf
+            else:
+                dvf0=dvf+self.ddf_stn_rec(self._multiscale_dvf_generate(self.v_scale*noise_ratio,ctl_szs=ctl_szs, rand_v_scale=False).to(self.device),dvf)
+            # print([num.shape for num in mul_num])
+            for j in range(torch.max(mul_num[0]).item()):
+                flag = [(n>j).int().to(self.device) for n in mul_num]
+                ddf = dvf0*flag[0] + self.ddf_stn_rec(ddf, dvf0*flag[0])
+                dddf = dvf*flag[1] + self.ddf_stn_rec(dddf, dvf*flag[1])
+        ddf = F.interpolate(ddf * self.img_sz/self.ctl_sz, self.img_sz*crop_rate, mode='bilinear' if self.ndims == 2 else 'trilinear')
+        # ddf = ddf[...,img_sz//2:img_sz*3//2,img_sz//2:img_sz*3//2]
+        if self.ndims==2:
+            ddf = ddf[..., self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2]
+        else:
+            ddf = ddf[..., self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2]
+        # if rec_num==1:
+        if True:
+            dddf = F.interpolate(dddf * self.img_sz/self.ctl_sz, self.img_sz*crop_rate, mode='bilinear' if self.ndims == 2 else 'trilinear')
+            # dddf = dddf[...,img_sz//2:img_sz*3//2,img_sz//2:img_sz*3//2]
+            if self.ndims == 2:
+                dddf = dddf[..., self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2]
+            else:
+                dddf = dddf[..., self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2, self.img_sz // 2:self.img_sz * 3 // 2]
+            return ddf,dddf
+        else:
+            return ddf
+    def create_noise_map(self, img, noise_type='gaussian', noise_scale=0.1):
+        if noise_type == 'gaussian':
+            noise_map = torch.randn_like(img) * noise_scale
+        elif noise_type == 'uniform':
+            noise_map = torch.rand_like(img)*noise_scale*2-noise_scale # 0-1
+        elif noise_type == 'binary':
+            noise_map = torch.bernoulli(torch.rand_like(img))
+        else:
+            noise_map = torch.zeros_like(img)
+        noise_map = noise_map.to(img.device)
+        return noise_map
+    def add_noise(self, img, noise_map=None, noise_ratio_range=[0.,1.]):
+        noise_ratio = np.random.uniform(noise_ratio_range[0], noise_ratio_range[1])
+        return img * (1-noise_ratio) + noise_map * noise_ratio, noise_ratio
+    def apply_noise(self, img, noise_map=None, apply_mask=None):
+        return img * apply_mask + noise_map * (1-apply_mask)
+    def downsample(self, img, down_ratio_range=[1./32,1]):
+        down_ratio = list(np.random.uniform(down_ratio_range[0], down_ratio_range[1],[self.ndims]))
+        # print(down_ratio)
+        down_img = F.interpolate(img, scale_factor=down_ratio, mode='bilinear' if self.ndims == 2 else 'trilinear')
+        # print(down_img)
+        # return F.interpolate(down_img, size=[self.image_chw[1]]*self.ndims, mode='bilinear' if self.ndims == 2 else 'trilinear', align_corners=False), np.prod(down_ratio)
+        return F.interpolate(down_img, size=[self.image_chw[1]]*self.ndims, mode='bilinear' if self.ndims == 2 else 'trilinear', align_corners=False), np.sqrt(np.prod(down_ratio)) # jzheng: cond weight based on entropy
+    def get_slice_mask(self, img, slice_num_range=[0,32]):
+        slice_num_range[1] = min(slice_num_range[1], self.image_chw[1])
+        mask = torch.zeros_like(img)
+        sample_ratio = 0
+        for i in range(self.ndims):
+            if self.inf_mode:
+                slice_num = 1  # use max slice num for inference for better performance
+                slice_idx = [self.image_chw[1]//2]  # use middle slice for inference for better performance
+            else:
+                slice_num = random.randint(slice_num_range[0], slice_num_range[1])
+                slice_idx = random.sample(range(self.image_chw[1]), slice_num)
+            transpose_list = [0, 1, 1 + self.ndims] + list(range(2, 1 + self.ndims))
+            for idx in slice_idx:
+                mask[..., idx] = 1
+            mask = mask.permute(*transpose_list)
+            # sample_ratio += slice_num / self.image_chw[1] / self.ndims
+            sample_ratio += np.sqrt(slice_num / self.image_chw[1]) / self.ndims     # jzheng: cond weight based on entropy
+        # print(mask)
+        # print("sample_ratio:", sample_ratio)
+        return mask, sample_ratio
+    def project(self, img):
+        proj_img = torch.zeros_like(img)
+        rand_bourn = np.random.randint(0, 2, size=[self.ndims])
+        proj_dim_num = np.sum(rand_bourn)
+        for i,pflag in zip(range(2, 2 + self.ndims), rand_bourn):
+            if pflag:
+                proj_img += torch.mean(img, dim=i, keepdim=True)
+                # print("projecting dim:", i)
+        return proj_img/(proj_dim_num+EPS), proj_dim_num
+    def proc_cond_img(self, img, proc_type=None,noise_scale=0.1):
+        # Remove torch.no_grad() since most operations are not differentiable anyway
+        proc_img = img.clone().detach()
+        if proc_type is None:
+            # Heavily bias towards 'uncon' for efficiency
+            proc_type = random.choices(
+                # ['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon'],
+                # weights=[1, 1, 1, 1, 1, 1, 3], k=1
+                ['adding', 'independ', 'downsample', 'slice','slice1', 'none', 'uncon'],
+                weights=[1, 1, 1, 1, 1, 3], k=1
+            )[0]
+        mask = torch.tensor(1, device=img.device)
+        cond_ratio = torch.tensor(1., device=img.device)
+        self.msk_noise_scale = torch.tensor(0, device=img.device)
+        noise_type = random.choice(['gaussian', 'uniform', 'none'])
+        # Precompute noise_map only if needed
+        noise_map = None
+        if proc_type not in ['none', None, '']:
+            if proc_type == 'uncon':
+                noise_map = self.create_noise_map(img, noise_type=noise_type,noise_scale=noise_scale)
+                proc_img = noise_map
+                mask = torch.tensor(0, device=img.device)
+                cond_ratio = torch.tensor(0, device=img.device)
+                return proc_img, mask, cond_ratio
+            if proc_type in ['adding', 'independ', 'slice','slice1']:
+                # self.msk_noise_scale = 0
+                noise_map = self.create_noise_map(img, noise_type=noise_type,noise_scale=noise_scale)
+            if proc_type == 'adding':
+                proc_img, noise_ratio = self.add_noise(proc_img, noise_map=noise_map, noise_ratio_range=[0., 1.])
+                cond_ratio = torch.tensor(1 - noise_ratio, device=img.device)
+            elif proc_type == 'independ':
+                mask = self.create_noise_map(img, noise_type='binary')
+                if self.msk_noise_scale == 0:
+                    proc_img = img * mask
+                else:
+                    proc_img = self.apply_noise(proc_img, noise_map=noise_map*self.msk_noise_scale, apply_mask=mask)
+                with torch.no_grad():
+                    cond_ratio = mask.float().mean()
+            elif proc_type == 'downsample':
+                # proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./32, 1])
+                proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./64, 1])
+                cond_ratio = torch.tensor(down_ratio, device=img.device)
+            elif proc_type == 'slice' or proc_type == 'slice1':
+                if proc_type == 'slice1':
+                    slice_num_max = 1
+                else:
+                    slice_num_max = random.randint(1, 64)
+                    slice_num_max = random.randint(1, slice_num_max)
+                mask, sample_ratio = self.get_slice_mask(img, slice_num_range=[0, slice_num_max])
+                if self.msk_noise_scale == 0:
+                    proc_img = img * mask
+                else:
+                    proc_img = self.apply_noise(proc_img, noise_map=noise_map*self.msk_noise_scale, apply_mask=mask)
+                cond_ratio = torch.tensor(sample_ratio, device=img.device)
+            elif proc_type == 'project':
+                proc_img, proj_num = self.project(proc_img)
+                cond_ratio = torch.tensor(proj_num / (128 * self.ndims), device=img.device)
+                # cond_ratio = torch.tensor(proj_num / (32 * self.ndims), device=img.device)  # jzheng: cond weight based on entropy
+        return proc_img, mask, cond_ratio
+    def diffuse(self, x_0, t):
+        """Apply the forward (deforming) process to ``x_0`` at time step ``t``.
+
+        ``t`` is wrapped with ``torch.tensor`` and passed, together with the
+        image, to ``self._get_random_ddf``. That method's return value is handed
+        back unchanged (``_single_step`` unpacks it into three values).
+        """
+        step = torch.tensor(t)
+        return self._get_random_ddf(img=x_0, t=step)
+    def recover(self, x, y, t,rec_num=2, text=None):
+        """Run the recovery network on ``x`` conditioned on ``y`` at step(s) ``t``.
+
+        Args:
+            x: input passed to the network as ``x``; its device is the target
+                device for ``t``.
+            y: conditioning input passed to the network as ``y``.
+            t: a single time step or a list of time steps. Each one is wrapped
+                with ``torch.tensor`` and moved to ``x.device``.
+            rec_num: forwarded to the network; ``None`` selects ``self.rec_num``.
+            text: forwarded to the network unchanged.
+
+        Returns:
+            Whatever ``self.network`` returns.
+        """
+        if isinstance(t, list):
+            t = [torch.tensor(t0).to(x.device) for t0 in t]
+        else:
+            # Tensor.to() is not in-place: the result must be rebound, otherwise
+            # t silently stays on its original device while x lives elsewhere.
+            t = torch.tensor(t).to(x.device)
+        if rec_num is None:
+            rec_num = self.rec_num
+        return self.network(x=x, y=y, t=t, rec_num=rec_num, text=text)
+    def recover_frozen_params_but_grad_input(self, x, y, t,rec_num=2, text=None):
+        """Call the network with detached parameters, without ``torch.no_grad``.
+
+        Every network parameter is replaced by its ``detach()``-ed tensor for
+        this call, so no gradient reaches the weights. No ``no_grad`` context is
+        entered, so autograd still records the path through the inputs.
+
+        ``t`` (a value or a list of values) is turned into tensor(s) on
+        ``x.device``; ``rec_num=None`` selects ``self.rec_num``.
+        """
+        dev = x.device
+        if isinstance(t, list):
+            t = [torch.tensor(step, device=dev) for step in t]
+        else:
+            t = torch.tensor(t, device=dev)
+        rec_num = self.rec_num if rec_num is None else rec_num
+        # Detached parameters first, then the buffers exactly as they are.
+        # Both go into one dict because older PyTorch versions do not support
+        # passing parameters and buffers as separate dicts.
+        state = {name: p.detach() for name, p in self.network.named_parameters()}
+        state.update(self.network.named_buffers())
+        call_kwargs = dict(x=x, y=y, t=t, rec_num=rec_num, text=text)
+        return functional_call(self.network, state, (), kwargs=call_kwargs)
+    def _single_step(self, x0, t, rec_num=2, proc_type=None,mask=None, cond_imgs=None, text=None):
+        """Deform ``x0`` at step ``t`` and run one recovery pass on the result.
+
+        When ``cond_imgs`` is not supplied it is derived from ``x0`` through
+        ``proc_cond_img`` (the mask and ratio it also returns are not used
+        here). The deformed image is multiplied by ``mask`` (1 when omitted)
+        before it is given to ``recover``.
+
+        Returns:
+            ``(recover(...) output, dvf_I)`` where ``dvf_I`` is the second value
+            returned by ``diffuse``.
+        """
+        mask = 1 if mask is None else mask
+        if cond_imgs is None:
+            cond_imgs, _, _ = self.proc_cond_img(x0, proc_type=proc_type)
+        deformed, dvf_I, _ = self.diffuse(x0, t)
+        # DefRec_MutAttnNet receives its time step wrapped in a one-element list.
+        step = [t] if isinstance(self.network, DefRec_MutAttnNet) else t
+        prediction = self.recover(x=deformed * mask, y=cond_imgs, t=step, rec_num=rec_num, text=text)
+        return prediction, dvf_I
+    def forward(self, img_org, cond_imgs=None, proc_type=None, T=None, **kwargs):
+        """Dispatch to the single-step or the multi-step recovery path.
+
+        With ``T`` left as ``None`` the call goes to ``_single_step``; any other
+        ``T`` goes to ``diff_recover``. Extra keyword arguments are forwarded
+        unchanged to whichever method is chosen.
+        """
+        if T is None:
+            return self._single_step(x0=img_org, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
+        return self.diff_recover(img_org=img_org, T=T, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
+    def diff_recover(self,
+                     img_org,
+                     msk_org=None,
+                     T=[None,None],
+                     ddf_rand=None,
+                     v_scale = None,
+                     t_save=None,
+                     cond_imgs=None,
+                     proc_type=None,
+                     text=None,
+                     ):
+        """Deform ``img_org`` and then iteratively recover it.
+
+        Args:
+            img_org: source image; every warp in this method is applied to a
+                detached clone of it.
+            msk_org: optional mask, warped with ``self.msk_stn`` using the same
+                displacement fields as the image.
+            T: two-element sequence. ``T[0]`` is the time step of the initial
+                random deformation (``None`` or ``0`` means no deformation).
+                ``T[-1]`` is either an int (steps ``T[-1]-1 .. 0`` are run) or a
+                sliceable sequence of steps used as given. The default list is
+                only read, never mutated.
+            ddf_rand: optional precomputed displacement field; when given it is
+                used for the initial deformation instead of a random one.
+            v_scale: when given (and ``ddf_rand`` is None) it overwrites
+                ``self.v_scale`` and ``_DDF_Encoder_init`` is re-run; the change
+                persists on the instance after the call.
+            t_save: optional container of step values whose recovered
+                image/mask are collected (non-``DefRec_MutAttnNet`` path only).
+            cond_imgs: conditioning image; defaults to a detached clone of
+                ``img_org``. It is always passed through ``proc_cond_img``.
+            proc_type: forwarded to ``proc_cond_img``.
+            text: forwarded to ``recover``.
+
+        Returns:
+            ``[ddf_comp, ddf_rand], [img_rec, img_diff, img_save],
+            [msk_rec, msk_diff, msk_save]``. The mask entries are ``None`` (and
+            ``msk_save`` is empty) when ``msk_org`` is None.
+        """
+        if cond_imgs is None:
+            cond_imgs = img_org.clone().detach()
+        # if proc_type is not None:
+        # mask_tgt and cond_ratio are returned by proc_cond_img but not used below.
+        cond_imgs,mask_tgt,cond_ratio=self.proc_cond_img(cond_imgs, proc_type=proc_type)
+        if ddf_rand is None:
+            if v_scale is not None:
+                self.v_scale=v_scale
+                self._DDF_Encoder_init()
+            if T[0] is None or T[0] == 0:
+                img_diff = img_org.clone().detach()
+                # NOTE(review): zeros_like gives ddf_rand the image's shape; confirm
+                # that matches the displacement-field shape ddf_stn_full expects.
+                ddf_rand = torch.zeros_like(img_diff)
+            else:
+                img_diff, _, ddf_rand = self._get_random_ddf(img= img_org, t=torch.tensor(np.array([T[0]])).to(self.device))
+        else:
+            img_diff = self.img_stn(img_org.clone().detach(), ddf_rand)
+        # ddf_comp accumulates the composed displacement; img_rec is the current estimate.
+        ddf_comp = ddf_rand.clone().detach()
+        img_rec = img_diff.clone().detach()
+        if msk_org is not None:
+            msk_diff = self.msk_stn(msk_org.clone().detach(), ddf_rand)
+        else:
+            msk_diff = None
+        msk_rec = msk_diff.clone().detach() if msk_org is not None else None
+        img_save=[]
+        msk_save=[]
+        if isinstance(self.network,DefRec_MutAttnNet):
+            # Denoising image via list of t: one network call receives all steps
+            # T[1]-1 .. 0 at once; img_save/msk_save stay empty on this path.
+            t_list = list(range(T[1]-1, -1, -1))
+            pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t_list,rec_num=None, text=text)
+            ddf_comp = self.ddf_stn_full(ddf_comp, pre_dvf_I) + pre_dvf_I
+            img_rec = self.img_stn(img_org.clone().detach(), ddf_comp)
+            if msk_org is not None:
+                msk_rec = self.msk_stn(msk_org.clone().detach(), ddf_comp)
+        else:
+            # Denoising image step by step
+            if isinstance(T[-1], int):
+                # Integer T[-1]: no step is trainable, so every step runs under no_grad.
+                time_steps = range(T[-1] - 1, -1, -1)
+                trainable_iterations =[]
+            else:
+                time_steps = T[-1]
+                # # Randomly select k iterations to make their parameters trainable
+                # win_len = 2  # Number of iterations to make trainable
+                # if len(time_steps) <= win_len:
+                #     win_start = 0
+                # else:
+                #     win_start = random.randint(len(time_steps)//2, len(time_steps) - win_len)
+                # win_end = win_start + win_len - 1
+                k=2
+                # trainable_iterations = time_steps[win_start: win_start + win_len]
+                # trainable_iterations = random.sample(time_steps, k)
+                # The last k entries of time_steps (taken in reverse order) keep gradients.
+                trainable_iterations = time_steps[-1:-k-1:-1]
+                # print(time_steps)
+                # print("trainable_iterations:", trainable_iterations)
+            for i in time_steps:
+                t = torch.tensor(np.array([i])).to(self.device)
+                # Membership is tested by step value, not by position in time_steps.
+                if i in trainable_iterations:
+                    # Make parameters trainable for this iteration
+                    pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+                else:
+                    # Freeze parameters for this iteration using torch.no_grad()
+                    with torch.no_grad():
+                        pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+            # for idx, i in enumerate(time_steps):
+            #     t = torch.tensor(np.array([i])).to(self.device)
+            #     if idx < win_start:
+            #         # just no_grad
+            #         with torch.no_grad():
+            #             pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+            #     elif win_start <= idx <= win_end:
+            #         # normal update
+            #         pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+            #     else:
+            #         # freeze params but keep grad for input
+            #         pre_dvf_I = self.recover_frozen_params_but_grad_input(
+            #             x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text
+            #         )
+                # Compose the predicted field onto the accumulated one.
+                ddf_comp = self.ddf_stn_full(ddf_comp, pre_dvf_I) + pre_dvf_I
+                # Apply to image (always warps the original, not the previous estimate)
+                img_rec = self.img_stn(img_org.clone().detach(), ddf_comp)
+                if msk_org is not None:
+                    msk_rec = self.msk_stn(msk_org.clone().detach(), ddf_comp)
+                if t_save is not None:
+                    if i in t_save:
+                        img_save.append(img_rec)
+                        if msk_org is not None:
+                            msk_save.append(msk_rec)
+            # for i in time_steps:
+            #     t = torch.tensor(np.array([i])).to(self.device)
+            #     pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t,rec_num=None)
+            #     ddf_comp = self.ddf_stn_full(ddf_comp, pre_dvf_I) + pre_dvf_I
+            #     # apply to image
+            #     img_rec = self.img_stn(img_org.clone().detach(), ddf_comp)
+            #     if msk_org is not None:
+            #         msk_rec = self.img_stn(msk_org.clone().detach(), ddf_comp)
+            #     if t_save is not None:
+            #         if i in t_save:
+            #             img_save.append(img_rec)
+            #             if msk_org is not None:
+            #                 msk_save.append(msk_rec)
+        # print(torch.max(torch.abs(ddf_comp)))
+        # print(torch.max(torch.abs(ddf_rand)))
+        return [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],[msk_rec,msk_diff,msk_save]
+if __name__ == "__main__":
+    # Smoke test: build a tiny 2-D model on CPU and print an image next to the
+    # conditioning image that proc_cond_img derives from it.
+    H, W = 8, 8
+    deformddpm = DeformDDPM(network=get_net(name="recmutattnnet")(n_steps=80, ndims=2, num_input_chn=1),image_chw=(1, H, W),device='cpu')
+    # img = torch.zeros([1, 1, H, W])
+    img = torch.randn([1, 1, H, W])
+    # t and rec_num are only used by the commented-out forward() call below.
+    t = 1
+    rec_num = 2
+    # proc_type = 'adding'
+    # proc_type = 'independ'
+    # proc_type = 'downsample'
+    proc_type = 'slice'
+    # proc_type = 'project'
+    # proc_type = 'none'
+    print(img)
+    # proc_cond_img returns (proc_img, mask, cond_ratio); unpacking only two
+    # names raised "ValueError: too many values to unpack".
+    cond_imgs, mask_tgt, cond_ratio = deformddpm.proc_cond_img(img, proc_type=proc_type)
+    print(cond_imgs)
+    # img_rec, dvf_I = deformddpm.forward(img, t, rec_num=rec_num, proc_type=proc_type)
+    # print(img_rec.shape, dvf_I.shape)
+    # proc_type = 'adding'
+    # ddf_comp, ddf_rand = deformddpm.diff_recover(img, T=[1,1], proc_type=proc_type)

Diffusion/diffuser.py CHANGED Viewed

@@ -27,6 +27,7 @@ class DeformDDPM(nn.Module):
         padding_mode="border",
         v_scale = 0.008/256,
         resample_mode=None,
         ):
         super(DeformDDPM, self).__init__()
         self.rec_num=2
@@ -35,6 +36,7 @@ class DeformDDPM(nn.Module):
         self.v_scale = v_scale
         self.device = device
         self.msk_noise_scale = torch.tensor(0)
         # print('================')
         # print("device:",device)
@@ -61,6 +63,7 @@ class DeformDDPM(nn.Module):
                                 )
         self._DDF_Encoder_init()
         self.copy_opt = nn.Identity()
         return
     def get_stn(self):
@@ -78,7 +81,8 @@ class DeformDDPM(nn.Module):
     def _get_ddf_scale(self,t,divide_num=1,max_ddf_num=200):   # 128
         rec_num = 1
         mul_num_ddf = torch.floor_divide(2*torch.pow(t,1.3), 3*divide_num).int()
-        mul_num_dvf = torch.floor_divide(torch.pow(t,0.6), divide_num).int()
         # print("time_step:",t,"mul_num_ddf:",mul_num_ddf,"mul_num_dvf:",mul_num_dvf)
         # mul_num_ddf = self._sample_random_uniform_multi_order(high=mul_num_ddf)
         # mul_num_dvf = self._sample_random_uniform_multi_order(high=mul_num_dvf)
@@ -110,7 +114,7 @@ class DeformDDPM(nn.Module):
     def _get_random_ddf(self,img,t):
         rec_num, mul_num_ddf, mul_num_dvf = self._get_ddf_scale(t=t)
-        ddf_forward,dvf_forward = self._random_ddf_generate(rec_num=rec_num, mul_num=[mul_num_ddf,mul_num_dvf])
         warped_img = self.img_stn(img,ddf_forward)
         return warped_img, dvf_forward,ddf_forward
@@ -122,8 +126,10 @@ class DeformDDPM(nn.Module):
             dvf_rot = utils.random_ddf(batch_size=self.batch_size, ndims=self.ndims, img_sz=[self.ctl_sz]*self.ndims, range_gauss=0, rot_range=np.pi/90)
             dvf = dvf + dvf_rot
         for ctl_sz in ctl_szs:
-            _v_scale = self._sample_random_uniform_multi_order(high=v_scale, low=1e-8, order_num=2) if rand_v_scale else v_scale
             # temp>>
             if ctl_sz <= 2:
                 _v_scale = _v_scale/2
             # temp<<
@@ -138,7 +144,7 @@ class DeformDDPM(nn.Module):
             sample_value = np.random.uniform(low=sample_value, high=high)
         return sample_value
-    def _random_ddf_generate(self,rec_num=3,mul_num=[torch.tensor([5]),torch.tensor([5])],ddf0=None,keep_inverse=False,noise_ratio=0.08,select_num=4, flip_ratio=0.5):
         crop_rate=2
         for _ in range(self.ndims+1):
             mul_num=[torch.unsqueeze(n,-1) for n in mul_num]
@@ -188,11 +194,11 @@ class DeformDDPM(nn.Module):
         else:
             return ddf
-    def create_noise_map(self, img, noise_type='gaussian', noise_ratio=0.2):
         if noise_type == 'gaussian':
-            noise_map = torch.randn_like(img) * noise_ratio
         elif noise_type == 'uniform':
-            noise_map = torch.rand_like(img) # 0-1
         elif noise_type == 'binary':
             noise_map = torch.bernoulli(torch.rand_like(img))
         else:
@@ -220,8 +226,18 @@ class DeformDDPM(nn.Module):
         mask = torch.zeros_like(img)
         sample_ratio = 0
         for i in range(self.ndims):
-            slice_num = random.randint(slice_num_range[0], slice_num_range[1])
-            slice_idx = random.sample(range(self.image_chw[1]), slice_num)
             transpose_list = [0, 1, 1 + self.ndims] + list(range(2, 1 + self.ndims))
             for idx in slice_idx:
                 mask[..., idx] = 1
@@ -243,7 +259,7 @@ class DeformDDPM(nn.Module):
                 # print("projecting dim:", i)
         return proj_img/(proj_dim_num+EPS), proj_dim_num
-    def proc_cond_img(self, img, proc_type=None):
         # Remove torch.no_grad() since most operations are not differentiable anyway
         proc_img = img.clone().detach()
         if proc_type is None:
@@ -251,7 +267,7 @@ class DeformDDPM(nn.Module):
             proc_type = random.choices(
                 # ['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon'],
                 # weights=[1, 1, 1, 1, 1, 1, 3], k=1
-                ['adding', 'independ', 'downsample', 'slice', 'none', 'uncon'],
                 weights=[1, 1, 1, 1, 1, 3], k=1
             )[0]
         mask = torch.tensor(1, device=img.device)
@@ -262,14 +278,14 @@ class DeformDDPM(nn.Module):
         noise_map = None
         if proc_type not in ['none', None, '']:
             if proc_type == 'uncon':
-                noise_map = self.create_noise_map(img, noise_type=noise_type)
                 proc_img = noise_map
                 mask = torch.tensor(0, device=img.device)
                 cond_ratio = torch.tensor(0, device=img.device)
                 return proc_img, mask, cond_ratio
-            if proc_type in ['adding', 'independ', 'slice']:
                 # self.msk_noise_scale = 0
-                noise_map = self.create_noise_map(img, noise_type=noise_type)
             if proc_type == 'adding':
                 proc_img, noise_ratio = self.add_noise(proc_img, noise_map=noise_map, noise_ratio_range=[0., 1.])
                 cond_ratio = torch.tensor(1 - noise_ratio, device=img.device)
@@ -285,9 +301,12 @@ class DeformDDPM(nn.Module):
                 # proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./32, 1])
                 proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./64, 1])
                 cond_ratio = torch.tensor(down_ratio, device=img.device)
-            elif proc_type == 'slice':
-                slice_num_max = random.randint(1, 64)
-                slice_num_max = random.randint(1, slice_num_max)
                 mask, sample_ratio = self.get_slice_mask(img, slice_num_range=[0, slice_num_max])
                 if self.msk_noise_scale == 0:
                     proc_img = img * mask
@@ -373,8 +392,14 @@ class DeformDDPM(nn.Module):
             t = [t] * 1
         return self.recover(x=noisy_imgs*mask, y=cond_imgs, t=t, rec_num=rec_num, text=text), dvf_I
-    def forward(self, img_org, cond_imgs=None, proc_type=None, T=None, **kwargs):
-        if T is not None:
             return self.diff_recover(img_org=img_org, T=T, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
         else:
             return self._single_step(x0=img_org, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
@@ -446,7 +471,7 @@ class DeformDDPM(nn.Module):
                 #     win_start = random.randint(len(time_steps)//2, len(time_steps) - win_len)
                 # win_end = win_start + win_len - 1
-                k=2
                 # trainable_iterations = time_steps[win_start: win_start + win_len]
                 # trainable_iterations = random.sample(time_steps, k)
                 trainable_iterations = time_steps[-1:-k-1:-1]

         padding_mode="border",
         v_scale = 0.008/256,
         resample_mode=None,
+        inf_mode = False,
         ):
         super(DeformDDPM, self).__init__()
         self.rec_num=2
         self.v_scale = v_scale
         self.device = device
         self.msk_noise_scale = torch.tensor(0)
+        # self.msk_noise_scale = torch.tensor(1)
         # print('================')
         # print("device:",device)
                                 )
         self._DDF_Encoder_init()
         self.copy_opt = nn.Identity()
+        self.inf_mode = inf_mode
         return
     def get_stn(self):
     def _get_ddf_scale(self,t,divide_num=1,max_ddf_num=200):   # 128
         rec_num = 1
         mul_num_ddf = torch.floor_divide(2*torch.pow(t,1.3), 3*divide_num).int()
+        # mul_num_dvf = torch.floor_divide(torch.pow(t,0.6), divide_num).int()
+        mul_num_dvf = torch.floor_divide(torch.pow(t,0.75), divide_num).int()    # raise the power number to increase the dvf ratio, which can help the training of ddf_stn_rec and make the model more robust to large deformation
         # print("time_step:",t,"mul_num_ddf:",mul_num_ddf,"mul_num_dvf:",mul_num_dvf)
         # mul_num_ddf = self._sample_random_uniform_multi_order(high=mul_num_ddf)
         # mul_num_dvf = self._sample_random_uniform_multi_order(high=mul_num_dvf)
     def _get_random_ddf(self,img,t):
+        # Draw a random deformation for time step t and warp img with it.
+        # Returns (warped_img, dvf_forward, ddf_forward).
         rec_num, mul_num_ddf, mul_num_dvf = self._get_ddf_scale(t=t)
+        # select_num is drawn per call from [1, 2, 3, 3, 4, 4], so 3 and 4 are
+        # twice as likely as 1 or 2.
+        ddf_forward,dvf_forward = self._random_ddf_generate(rec_num=rec_num, mul_num=[mul_num_ddf,mul_num_dvf],select_num=random.choice([1, 2, 3, 3, 4, 4]))
         warped_img = self.img_stn(img,ddf_forward)
         return warped_img, dvf_forward,ddf_forward
             dvf_rot = utils.random_ddf(batch_size=self.batch_size, ndims=self.ndims, img_sz=[self.ctl_sz]*self.ndims, range_gauss=0, rot_range=np.pi/90)
             dvf = dvf + dvf_rot
         for ctl_sz in ctl_szs:
+            _v_scale = self._sample_random_uniform_multi_order(high=v_scale, low=0., order_num=random.choice([1, 2])) if rand_v_scale else v_scale
             # temp>>
+            if ctl_sz <= 4:
+                _v_scale = _v_scale/2
             if ctl_sz <= 2:
                 _v_scale = _v_scale/2
             # temp<<
             sample_value = np.random.uniform(low=sample_value, high=high)
         return sample_value
+    def _random_ddf_generate(self,rec_num=3,mul_num=[torch.tensor([5]),torch.tensor([5])],ddf0=None,keep_inverse=False,noise_ratio=0.08,select_num=3, flip_ratio=0.5):
         crop_rate=2
         for _ in range(self.ndims+1):
             mul_num=[torch.unsqueeze(n,-1) for n in mul_num]
         else:
             return ddf
+    def create_noise_map(self, img, noise_type='gaussian', noise_scale=0.1):
         if noise_type == 'gaussian':
+            noise_map = torch.randn_like(img) * noise_scale
         elif noise_type == 'uniform':
+            noise_map = torch.rand_like(img)*noise_scale*2-noise_scale # 0-1
         elif noise_type == 'binary':
             noise_map = torch.bernoulli(torch.rand_like(img))
         else:
         mask = torch.zeros_like(img)
         sample_ratio = 0
         for i in range(self.ndims):
+            if self.inf_mode:
+                if i== 0:
+                    slice_num = 1  # use max slice num for inference for better performance
+                    slice_idx = [self.image_chw[1]//2]  # use middle slice for inference for better performance
+                else:
+                    slice_num = 0
+                    slice_idx = []
+                # slice_num = 1  # use max slice num for inference for better performance
+                # slice_idx = [self.image_chw[1]//2]  # use middle slice for inference for better performance
+            else:
+                slice_num = random.randint(slice_num_range[0], slice_num_range[1])
+                slice_idx = random.sample(range(self.image_chw[1]), slice_num)
             transpose_list = [0, 1, 1 + self.ndims] + list(range(2, 1 + self.ndims))
             for idx in slice_idx:
                 mask[..., idx] = 1
                 # print("projecting dim:", i)
         return proj_img/(proj_dim_num+EPS), proj_dim_num
+    def proc_cond_img(self, img, proc_type=None,noise_scale=0.1):
         # Remove torch.no_grad() since most operations are not differentiable anyway
         proc_img = img.clone().detach()
         if proc_type is None:
             proc_type = random.choices(
                 # ['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon'],
                 # weights=[1, 1, 1, 1, 1, 1, 3], k=1
+                ['adding', 'independ', 'downsample', 'slice','slice1', 'none', 'uncon'],
+                # One weight per entry above: random.choices raises ValueError when
+                # the number of weights differs from the population size, which the
+                # previous 6-element list did once 'slice1' was added.
+                # NOTE(review): 'slice1' is given weight 1 like the other
+                # conditioning modes - confirm the intended sampling ratio.
+                weights=[1, 1, 1, 1, 1, 1, 3], k=1
             )[0]
         mask = torch.tensor(1, device=img.device)
         noise_map = None
         if proc_type not in ['none', None, '']:
             if proc_type == 'uncon':
+                noise_map = self.create_noise_map(img, noise_type=noise_type,noise_scale=noise_scale)
                 proc_img = noise_map
                 mask = torch.tensor(0, device=img.device)
                 cond_ratio = torch.tensor(0, device=img.device)
                 return proc_img, mask, cond_ratio
+            if proc_type in ['adding', 'independ', 'slice','slice1']:
                 # self.msk_noise_scale = 0
+                noise_map = self.create_noise_map(img, noise_type=noise_type,noise_scale=noise_scale)
             if proc_type == 'adding':
                 proc_img, noise_ratio = self.add_noise(proc_img, noise_map=noise_map, noise_ratio_range=[0., 1.])
                 cond_ratio = torch.tensor(1 - noise_ratio, device=img.device)
                 # proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./32, 1])
                 proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1./64, 1])
                 cond_ratio = torch.tensor(down_ratio, device=img.device)
+            elif proc_type == 'slice' or proc_type == 'slice1':
+                if proc_type == 'slice1':
+                    slice_num_max = 1
+                else:
+                    slice_num_max = random.randint(1, 64)
+                    slice_num_max = random.randint(1, slice_num_max)
                 mask, sample_ratio = self.get_slice_mask(img, slice_num_range=[0, slice_num_max])
                 if self.msk_noise_scale == 0:
                     proc_img = img * mask
             t = [t] * 1
         return self.recover(x=noisy_imgs*mask, y=cond_imgs, t=t, rec_num=rec_num, text=text), dvf_I
+    def forward(self, img_org, cond_imgs=None, proc_type=None, T=None, output_embedding=False, **kwargs):
+        # Three modes, checked in this order:
+        #   output_embedding=True -> one direct network call; returns network.img_embd
+        #   T is not None         -> diff_recover (multi-step recovery)
+        #   otherwise             -> _single_step
+        # In the embedding mode proc_type is not used and only 'text' is read
+        # from kwargs; T is passed to the network as its time step.
+        if output_embedding:
+            # Direct network forward for contrastive embedding (no diffusion).
+            # Returns img_embd so DDP's prepare_for_backward traces the correct subgraph
+            # (encoder + mid + attn + img2txt only, no decoder).
+            # The network's own return value is discarded; the embedding is read
+            # from the attribute afterwards.
+            self.network(x=img_org, y=cond_imgs, t=T, text=kwargs.get('text'), rec_num=1)
+            return self.network.img_embd
+        elif T is not None:
             return self.diff_recover(img_org=img_org, T=T, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
         else:
             return self._single_step(x0=img_org, proc_type=proc_type, cond_imgs=cond_imgs, **kwargs)
                 #     win_start = random.randint(len(time_steps)//2, len(time_steps) - win_len)
                 # win_end = win_start + win_len - 1
+                k = 1 if len(time_steps) > 16 else 2
                 # trainable_iterations = time_steps[win_start: win_start + win_len]
                 # trainable_iterations = random.sample(time_steps, k)
                 trainable_iterations = time_steps[-1:-k-1:-1]

Diffusion/diffuser_opt.py ADDED Viewed

	@@ -0,0 +1,357 @@

+"""
+diffuser_opt.py — Optimized DeformDDPM subclass.
+Inherits from Diffusion.diffuser.DeformDDPM and overrides only the methods
+that benefit from optimization.
+Key optimizations:
+  1. diff_recover(): hoist img_org/msk_org .clone().detach() outside the loop,
+     pre-compute timestep tensors, use torch.no_grad() for frozen steps
+  2. _random_ddf_generate(): scaling-and-squaring for O(log n) composition
+     instead of O(n), crop-first upsampling (4x faster), on-device tensors.
+  3. proc_cond_img(): skip clone for 'uncon' path (most common, ~3/8 weight)
+  4. _DDF_Encoder_init(): use OptSTN (register_buffer, no per-call .to(device))
+  5. recover(): fix t tensor bug (was staying on CPU), avoid redundant torch.tensor()
+  6. _multiscale_dvf_generate(): generate random tensors on device to avoid
+     CPU→GPU transfer of 3D volumes.
+"""
+from torch import nn
+import torch
+import numpy as np
+import torch.nn.functional as F
+import random
+import math
+import Diffusion.utils_diff as utils
+from Diffusion.diffuser import DeformDDPM as _BaseDeformDDPM
+from Diffusion.networks import *
+from Diffusion.networks_opt import OptSTN
+EPS = 1e-8
+class DeformDDPM(_BaseDeformDDPM):
+    """Drop-in replacement for DeformDDPM with speed optimizations."""
+    # ------------------------------------------------------------------
+    # Optimization 4: use OptSTN (register_buffer, no per-call .to())
+    # ------------------------------------------------------------------
+    def _DDF_Encoder_init(self, ctl_ratio=4, ctl_sz=None, resample_mode=None):
+        """Create the three spatial transformers as ``OptSTN`` instances.
+
+        ``ctl_sz`` defaults to ``image_chw[1] // ctl_ratio``. The
+        ``resample_mode`` argument is accepted for signature compatibility but
+        not read; the image transformer uses ``self.resample_mode`` and the mask
+        transformer always uses ``'nearest'``.
+        """
+        side = self.image_chw[1]
+        self.ctl_sz = side // ctl_ratio if ctl_sz is None else ctl_sz
+        self.img_sz = side
+        # Arguments shared by all three transformers.
+        shared = dict(ndims=self.ndims, device=self.device)
+        # Control-point resolution: used when composing displacement fields.
+        self.ddf_stn_rec = OptSTN(img_sz=self.ctl_sz, padding_mode=self.ddf_pad_mode, **shared)
+        # Full resolution: one transformer for images, one for masks.
+        self.img_stn = OptSTN(img_sz=self.img_sz, padding_mode=self.img_pad_mode,
+                              resample_mode=self.resample_mode, **shared)
+        self.msk_stn = OptSTN(img_sz=self.img_sz, padding_mode=self.img_pad_mode,
+                              resample_mode='nearest', **shared)
+    def __init__(self, network, n_steps=50, beta_schedule_fn=None, device='cpu',
+                 image_chw=(1, 28, 28), batch_size=1, img_pad_mode="zeros",
+                 ddf_pad_mode="border", padding_mode="border",
+                 v_scale=0.008/256, resample_mode=None, inf_mode=False):
+        """Build the base model, then swap ``ddf_stn_full`` for an ``OptSTN``.
+
+        All arguments are forwarded by keyword to the parent constructor.
+        """
+        base_kwargs = dict(
+            network=network,
+            n_steps=n_steps,
+            beta_schedule_fn=beta_schedule_fn,
+            device=device,
+            image_chw=image_chw,
+            batch_size=batch_size,
+            img_pad_mode=img_pad_mode,
+            ddf_pad_mode=ddf_pad_mode,
+            padding_mode=padding_mode,
+            v_scale=v_scale,
+            resample_mode=resample_mode,
+            inf_mode=inf_mode,
+        )
+        # The parent constructor creates the transformer instances.
+        super().__init__(**base_kwargs)
+        # Replace the full-resolution field transformer with the optimized variant.
+        self.ddf_stn_full = OptSTN(
+            img_sz=self.image_chw[1],
+            ndims=self.ndims,
+            padding_mode=self.padding_mode,
+            device=self.device,
+        )
+    # ------------------------------------------------------------------
+    # Optimization 5: fix recover() t tensor bug + avoid redundant copies
+    # ------------------------------------------------------------------
+    def recover(self, x, y, t, rec_num=2, text=None):
+        """Run the network on ``x``/``y`` at step(s) ``t`` placed on ``x.device``.
+
+        ``t`` may be a tensor, a plain value, or a list mixing both. Tensors
+        already on ``x.device`` are passed through as-is (no copy); anything
+        else is converted or moved. ``rec_num=None`` selects ``self.rec_num``.
+        """
+        target = x.device
+
+        def _place(step):
+            # Build a tensor only for plain values; move only when needed.
+            if not isinstance(step, torch.Tensor):
+                step = torch.tensor(step, device=target)
+            return step if step.device == target else step.to(target)
+
+        t = [_place(t0) for t0 in t] if isinstance(t, list) else _place(t)
+        chosen_rec_num = self.rec_num if rec_num is None else rec_num
+        return self.network(x=x, y=y, t=t, rec_num=chosen_rec_num, text=text)
+    # ------------------------------------------------------------------
+    # Optimization 2: scaling-and-squaring + crop-first upsample
+    # ------------------------------------------------------------------
+    def _compose_n_times(self, dvf, n):
+        """Return the ``n``-fold self-composition of ``dvf``.
+
+        Works through the binary digits of ``n`` from least to most
+        significant: a running "power" field is composed with itself once per
+        digit, and folded into the result whenever the digit is 1. The number
+        of ``ddf_stn_rec`` calls therefore grows with log2(n) rather than n.
+
+        ``n <= 0`` yields an all-zero field shaped like ``dvf``.
+
+        The original author notes that results differ slightly (about 1e-2 to
+        1e-1) from step-by-step composition because the interpolations happen
+        in a different order, and considers that acceptable for stochastic
+        augmentation.
+        """
+        if n <= 0:
+            return torch.zeros_like(dvf)
+        power = dvf  # dvf composed with itself 2**i times, starting at i = 0
+        composed = None
+        remaining = n
+        while True:
+            if remaining % 2 == 1:
+                if composed is None:
+                    composed = power.clone()
+                else:
+                    # Resample `power` through `composed`, then add `composed`.
+                    composed = composed + self.ddf_stn_rec(power, composed)
+            remaining //= 2
+            if remaining == 0:
+                break
+            # Double the power: compose the field with itself.
+            power = power + self.ddf_stn_rec(power, power)
+        return composed
+    def _crop_upsample(self, field):
+        """Upsample a control-point field to ``img_sz`` by crop, resize, crop.
+
+        The central half of the control grid plus a 2-voxel margin is cut out,
+        rescaled by ``img_sz / ctl_sz``, interpolated (bilinear for 2-D,
+        trilinear otherwise) and centre-cropped to ``img_sz`` per axis. With
+        ``ctl_sz=32`` and ``img_sz=128`` that is a 20-wide crop upsampled to 160.
+
+        The original author describes this as equivalent to upsampling the
+        whole grid to ``2 * img_sz`` and cropping afterwards, at roughly a
+        quarter of the interpolation cost.
+
+        NOTE(review): for ``ctl_sz < 8`` the lower crop bound becomes negative
+        and is then interpreted as an index from the end - confirm small grids
+        are never used here.
+        """
+        oversample = 2
+        margin = 2  # control voxels kept around the crop for interpolation
+        scale = self.img_sz * oversample // self.ctl_sz
+        start = self.ctl_sz // 4 - margin
+        stop = self.ctl_sz * 3 // 4 + margin
+        up_sz = (stop - start) * scale
+        offset = (up_sz - self.img_sz) // 2
+        if self.ndims == 2:
+            n_spatial, mode = 2, 'bilinear'
+        else:
+            n_spatial, mode = 3, 'trilinear'
+        window = (Ellipsis,) + (slice(start, stop),) * n_spatial
+        keep = (Ellipsis,) + (slice(offset, offset + self.img_sz),) * n_spatial
+        coarse = field[window] * self.img_sz / self.ctl_sz
+        return F.interpolate(coarse, up_sz, mode=mode)[keep]
+    def _random_ddf_generate(self, rec_num=3, mul_num=[torch.tensor([5]), torch.tensor([5])],
+                             ddf0=None, keep_inverse=False, noise_ratio=0.08, select_num=3, flip_ratio=0.5):
+        for _ in range(self.ndims + 1):
+            mul_num = [torch.unsqueeze(n, -1) for n in mul_num]
+        ctl_ddf_sz = [self.batch_size, self.ndims] + [self.ctl_sz] * self.ndims
+        if ddf0 is not None:
+            ddf = ddf0
+        else:
+            ddf = torch.zeros(ctl_ddf_sz, device=self.device)
+        dddf = torch.zeros(ctl_ddf_sz, device=self.device)
+        scale_num = min(8, int(math.log2(self.ctl_sz)))
+        ctl_szs_all = [self.ctl_sz // (2 ** i) for i in range(scale_num)]
+        for i in range(rec_num):
+            if len(ctl_szs_all) > select_num:
+                ctl_szs = random.sample(ctl_szs_all, select_num)
+            else:
+                ctl_szs = ctl_szs_all
+            dvf = self._multiscale_dvf_generate(self.v_scale, ctl_szs=ctl_szs)
+            if noise_ratio == 0:
+                dvf0 = dvf
+            else:
+                dvf0 = dvf + self.ddf_stn_rec(
+                    self._multiscale_dvf_generate(self.v_scale * noise_ratio, ctl_szs=ctl_szs, rand_v_scale=False),
+                    dvf)
+            mul_num_ddf_val = int(torch.max(mul_num[0]).item())
+            mul_num_dvf_val = int(torch.max(mul_num[1]).item())
+            # OPT: scaling-and-squaring — O(log n) STN calls instead of O(n)
+            # For t=40: 10 calls instead of 80. For t=79: 9 calls instead of 195.
+            ddf = self._compose_n_times(dvf0, mul_num_ddf_val)
+            dddf = self._compose_n_times(dvf, mul_num_dvf_val)
+        # OPT: crop-first upsample — 4x fewer voxels to interpolate (bit-identical)
+        ddf = self._crop_upsample(ddf)
+        dddf = self._crop_upsample(dddf)
+        return ddf, dddf
+    # ------------------------------------------------------------------
+    # Optimization 6: generate DVF on device to avoid CPU→GPU transfer
+    # ------------------------------------------------------------------
+    def _multiscale_dvf_generate(self, v_scale, ctl_szs=[4, 8, 16, 32, 64], rand_v_scale=True):
+        dvf = 0
+        if self.img_sz is None:
+            self.img_sz = max(ctl_szs)
+        if 1 in ctl_szs:
+            dvf_rot = utils.random_ddf(
+                batch_size=self.batch_size, ndims=self.ndims,
+                img_sz=[self.ctl_sz] * self.ndims, range_gauss=0, rot_range=np.pi / 90)
+            dvf = dvf + dvf_rot
+        for ctl_sz in ctl_szs:
+            _v_scale = self._sample_random_uniform_multi_order(
+                high=v_scale, low=0., order_num=random.choice([1, 1, 2])) if rand_v_scale else v_scale
+            if ctl_sz <= 2:
+                _v_scale = _v_scale / 2
+            # OPT: generate random tensor directly on device
+            dvf_comp = torch.randn([self.batch_size, self.ndims] + [ctl_sz] * self.ndims,
+                                   device=self.device) * _v_scale
+            dvf_comp = F.interpolate(dvf_comp * self.ctl_sz / ctl_sz, [self.ctl_sz] * self.ndims,
+                                     align_corners=False,
+                                     mode='bilinear' if self.ndims == 2 else 'trilinear')
+            dvf = dvf + dvf_comp
+        return dvf
+    # ------------------------------------------------------------------
+    # Optimization 3: skip clone for 'uncon' (most common conditioning type)
+    # ------------------------------------------------------------------
+    def proc_cond_img(self, img, proc_type=None, noise_scale=0.1):
+        if proc_type is None:
+            proc_type = random.choices(
+                ['adding', 'independ', 'downsample', 'slice', 'slice1', 'none', 'uncon'],
+                weights=[1, 1, 1, 1, 1, 3], k=1
+            )[0]
+        mask = torch.tensor(1, device=img.device)
+        cond_ratio = torch.tensor(1., device=img.device)
+        self.msk_noise_scale = torch.tensor(0, device=img.device)
+        noise_type = random.choice(['gaussian', 'uniform', 'none'])
+        if proc_type not in ['none', None, '']:
+            # OPT: handle 'uncon' before cloning — no need to clone img
+            if proc_type == 'uncon':
+                noise_map = self.create_noise_map(img, noise_type=noise_type, noise_scale=noise_scale)
+                proc_img = noise_map
+                mask = torch.tensor(0, device=img.device)
+                cond_ratio = torch.tensor(0, device=img.device)
+                return proc_img, mask, cond_ratio
+            # Only clone when we actually need the image data
+            proc_img = img.clone().detach()
+            noise_map = None
+            if proc_type in ['adding', 'independ', 'slice', 'slice1']:
+                noise_map = self.create_noise_map(img, noise_type=noise_type, noise_scale=noise_scale)
+            if proc_type == 'adding':
+                proc_img, noise_ratio = self.add_noise(proc_img, noise_map=noise_map, noise_ratio_range=[0., 1.])
+                cond_ratio = torch.tensor(1 - noise_ratio, device=img.device)
+            elif proc_type == 'independ':
+                mask = self.create_noise_map(img, noise_type='binary')
+                if self.msk_noise_scale == 0:
+                    proc_img = img * mask
+                else:
+                    proc_img = self.apply_noise(proc_img, noise_map=noise_map * self.msk_noise_scale, apply_mask=mask)
+                with torch.no_grad():
+                    cond_ratio = mask.float().mean()
+            elif proc_type == 'downsample':
+                proc_img, down_ratio = self.downsample(proc_img, down_ratio_range=[1. / 64, 1])
+                cond_ratio = torch.tensor(down_ratio, device=img.device)
+            elif proc_type == 'slice' or proc_type == 'slice1':
+                if proc_type == 'slice1':
+                    slice_num_max = 1
+                else:
+                    slice_num_max = random.randint(1, 64)
+                    slice_num_max = random.randint(1, slice_num_max)
+                mask, sample_ratio = self.get_slice_mask(img, slice_num_range=[0, slice_num_max])
+                if self.msk_noise_scale == 0:
+                    proc_img = img * mask
+                else:
+                    proc_img = self.apply_noise(proc_img, noise_map=noise_map * self.msk_noise_scale, apply_mask=mask)
+                cond_ratio = torch.tensor(sample_ratio, device=img.device)
+            elif proc_type == 'project':
+                proc_img, proj_num = self.project(proc_img)
+                cond_ratio = torch.tensor(proj_num / (128 * self.ndims), device=img.device)
+            return proc_img, mask, cond_ratio
+        else:
+            # 'none' type — still need clone
+            proc_img = img.clone().detach()
+            return proc_img, mask, cond_ratio
+    # ------------------------------------------------------------------
+    # Optimization 1: hoist clone, pre-compute timestep tensors,
+    #                  use inference_mode for frozen iterations
+    # ------------------------------------------------------------------
+    def diff_recover(self, img_org, msk_org=None, T=[None, None], ddf_rand=None,
+                     v_scale=None, t_save=None, cond_imgs=None, proc_type=None, text=None):
+        if cond_imgs is None:
+            cond_imgs = img_org.clone().detach()
+        cond_imgs, mask_tgt, cond_ratio = self.proc_cond_img(cond_imgs, proc_type=proc_type)
+        if ddf_rand is None:
+            if v_scale is not None:
+                self.v_scale = v_scale
+                self._DDF_Encoder_init()
+            if T[0] is None or T[0] == 0:
+                img_diff = img_org.clone().detach()
+                ddf_rand = torch.zeros_like(img_diff)
+            else:
+                img_diff, _, ddf_rand = self._get_random_ddf(
+                    img=img_org, t=torch.tensor(np.array([T[0]])).to(self.device))
+        else:
+            img_diff = self.img_stn(img_org.clone().detach(), ddf_rand)
+        ddf_comp = ddf_rand.clone().detach()
+        img_rec = img_diff.clone().detach()
+        if msk_org is not None:
+            msk_diff = self.msk_stn(msk_org.clone().detach(), ddf_rand)
+        else:
+            msk_diff = None
+        msk_rec = msk_diff.clone().detach() if msk_org is not None else None
+        img_save = []
+        msk_save = []
+        # OPT: hoist clone().detach() outside the loop — grid_sample is read-only
+        img_org_ref = img_org.clone().detach()
+        msk_org_ref = msk_org.clone().detach() if msk_org is not None else None
+        if isinstance(self.network, DefRec_MutAttnNet):
+            t_list = list(range(T[1] - 1, -1, -1))
+            pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t_list, rec_num=None, text=text)
+            ddf_comp = self.ddf_stn_full(ddf_comp, pre_dvf_I) + pre_dvf_I
+            img_rec = self.img_stn(img_org_ref, ddf_comp)
+            if msk_org is not None:
+                msk_rec = self.msk_stn(msk_org_ref, ddf_comp)
+        else:
+            if isinstance(T[-1], int):
+                time_steps = range(T[-1] - 1, -1, -1)
+                trainable_iterations = []
+            else:
+                time_steps = T[-1]
+                k = 2
+                trainable_iterations = time_steps[-1:-k - 1:-1]
+            # OPT: pre-compute trainable index threshold — avoid unhashable list issue
+            t_save_set = set(t_save) if t_save is not None else None
+            num_time_steps = len(time_steps) if not isinstance(time_steps, range) else len(time_steps)
+            trainable_start_idx = num_time_steps - len(trainable_iterations)
+            for step_idx, i in enumerate(time_steps):
+                # OPT: create tensor directly on device, no numpy intermediate
+                t = torch.tensor([i], device=self.device)
+                if step_idx >= trainable_start_idx:
+                    pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+                else:
+                    # OPT: no_grad for frozen iterations (inference_mode not safe here
+                    # because ddf_comp is composed across frozen+trainable iterations)
+                    with torch.no_grad():
+                        pre_dvf_I = self.recover(x=img_rec, y=cond_imgs, t=t, rec_num=None, text=text)
+                ddf_comp = self.ddf_stn_full(ddf_comp, pre_dvf_I) + pre_dvf_I
+                # OPT: use pre-cloned reference instead of cloning each iteration
+                img_rec = self.img_stn(img_org_ref, ddf_comp)
+                if msk_org is not None:
+                    msk_rec = self.msk_stn(msk_org_ref, ddf_comp)
+                if t_save_set is not None:
+                    if i in t_save_set:
+                        img_save.append(img_rec)
+                        if msk_org is not None:
+                            msk_save.append(msk_rec)
+        return [ddf_comp, ddf_rand], [img_rec, img_diff, img_save], [msk_rec, msk_diff, msk_save]

Diffusion/losses.py CHANGED Viewed

@@ -21,7 +21,7 @@ class LMSE(torch.nn.Module):
     Labeled Mean Square Error (LMSE)
     """
-    def __init__(self, eps=1e-7, relate_eps=5e-1, win=None, smooth=False):
         super(LMSE, self).__init__()
         self.eps = eps
         self.relate_eps = relate_eps
@@ -72,7 +72,7 @@ class LNCC(torch.nn.Module):
     Local (over window) normalized cross-correlation (LNCC)
     """
-    def __init__(self, win=None, num_ch=1, eps=1e-6, central=True, smooth=True):
         super(LNCC, self).__init__()
         self.scale = 2e0
         self.win = win
@@ -84,11 +84,11 @@ class LNCC(torch.nn.Module):
         # Set window size
         if self.win is None:
-            self.win = [9] * self.ndims
         self.padding = [(w-1) // 2 for w in self.win]
         if smooth:
-            self.kernels = self._build_kernel(std=0.45)
         self.sum_filt = self._build_kernel(std=0.0)
     def _build_kernel(self, std=0.0):
@@ -153,7 +153,7 @@ class LNCC(torch.nn.Module):
             J_var = J2_sum
         # cc = (cross * cross) / (I_var * J_var + self.eps)
-        cc = (cross * cross) / (I_var + self.eps) / (J_var + self.eps)
         if label is not None:
             label = label.float()
             cc = torch.sum(cc * label, dim=(2, 3, 4)) / (torch.sum(label, dim=(2, 3, 4)) + self.eps)
@@ -164,6 +164,43 @@ class LNCC(torch.nn.Module):
         return -self.lncc(I*self.scale, J*self.scale, label=label)
 class NCC(torch.nn.Module):
     # def __init__(self, eps_scale=10e-7,img_sz=256):
@@ -236,7 +273,7 @@ class Grad(torch.nn.Module):
     N-D gradient loss
     """
-    def __init__(self, penalty=['l1'],ndims=2, eps=1e-8, outrange_weight=1e4,outrange_thresh=0.5, detj_weight=2, apear_scale=4, dist=1, sign=1,waive_thresh=10**-5):
         super(Grad, self).__init__()
         self.penalty = penalty
         self.eps = eps
@@ -521,7 +558,7 @@ if __name__ == "__main__":
     img3d_t = torch.empty(1,1,size,size,size).uniform_(0,1)#*-0.000001
     # img3d_t = img3d.clone().detach()
     # img3d_t = torch.zeros_like(img3d)
-    translation = 2
     start = 0
     end = 32
     # img3d_t[:,:,translation:,translation:,translation:] = img3d[:,:,:size-translation,:size-translation,:size-translation]

     Labeled Mean Square Error (LMSE)
     """
+    def __init__(self, eps=1e-7, relate_eps=1e-1, win=None, smooth=False):
         super(LMSE, self).__init__()
         self.eps = eps
         self.relate_eps = relate_eps
     Local (over window) normalized cross-correlation (LNCC)
     """
+    def __init__(self, win=None, num_ch=1, eps=1e-3, central=True, smooth=True):
         super(LNCC, self).__init__()
         self.scale = 2e0
         self.win = win
         # Set window size
         if self.win is None:
+            self.win = [11] * self.ndims
         self.padding = [(w-1) // 2 for w in self.win]
         if smooth:
+            self.kernels = self._build_kernel(std=0.5)
         self.sum_filt = self._build_kernel(std=0.0)
     def _build_kernel(self, std=0.0):
             J_var = J2_sum
         # cc = (cross * cross) / (I_var * J_var + self.eps)
+        cc = (cross * cross) / (I_var + self.eps) / (J_var + self.eps)   # eps must be large enough to avoid numerical unstability
         if label is not None:
             label = label.float()
             cc = torch.sum(cc * label, dim=(2, 3, 4)) / (torch.sum(label, dim=(2, 3, 4)) + self.eps)
         return -self.lncc(I*self.scale, J*self.scale, label=label)
+class MSLNCC(LNCC):
+    """
+    Multi-Scale Local Normalized Cross-Correlation (MSLNCC)
+    Computes LNCC at multiple scales and combines with weighted sum.
+    Images are downsampled via average pooling, labels via max pooling.
+    """
+    def __init__(self, win=None, num_ch=1, eps=1e-3, central=True, smooth=False,
+                 scale_ratios=[1, 0.5, 0.25], scale_weights=[0.25, 0.5, 0.75]):
+        super(MSLNCC, self).__init__(win=win, num_ch=num_ch, eps=eps,
+                                     central=central, smooth=smooth)
+        if win is None:
+            win = [9] * self.ndims
+        self.scale_ratios = scale_ratios
+        self.scale_weights = scale_weights
+    def _downsample(self, I, J, label, ratio):
+        """Downsample images via average pooling, labels via max pooling."""
+        if ratio >= 1.0:
+            return I, J, label
+        factor = int(1.0 / ratio)
+        I_down = F.avg_pool3d(I, kernel_size=factor, stride=factor)
+        J_down = F.avg_pool3d(J, kernel_size=factor, stride=factor)
+        label_down = None
+        if label is not None:
+            label_down = F.max_pool3d(label.float(), kernel_size=factor, stride=factor)
+        return I_down, J_down, label_down
+    def forward(self, I, J, label=None):
+        total_loss = 0.0
+        total_weight = 0.0
+        for ratio, weight in zip(self.scale_ratios, self.scale_weights):
+            I_s, J_s, label_s = self._downsample(I, J, label, ratio)
+            total_loss += weight * self.lncc(I_s * self.scale, J_s * self.scale, label=label_s)
+            total_weight += weight
+        return -total_loss / total_weight
 class NCC(torch.nn.Module):
     # def __init__(self, eps_scale=10e-7,img_sz=256):
     N-D gradient loss
     """
+    def __init__(self, penalty=['l1'],ndims=3, eps=1e-8, outrange_weight=1e4,outrange_thresh=0.5, detj_weight=1e4, apear_scale=8, dist=1, sign=1,waive_thresh=10**-4):
         super(Grad, self).__init__()
         self.penalty = penalty
         self.eps = eps
     img3d_t = torch.empty(1,1,size,size,size).uniform_(0,1)#*-0.000001
     # img3d_t = img3d.clone().detach()
     # img3d_t = torch.zeros_like(img3d)
+    translation = 16
     start = 0
     end = 32
     # img3d_t[:,:,translation:,translation:,translation:] = img3d[:,:,:size-translation,:size-translation,:size-translation]

Diffusion/losses_opt.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""
+losses_opt.py — Optimized loss functions.
+Inherits from Diffusion.losses and overrides LNCC and MSLNCC to use
+register_buffer for convolution kernels (auto device transfer, no
+per-call .to(device) overhead).
+All other loss classes (LMSE, NCC, MRSE, RMSE, Grad) are re-exported
+unchanged.
+"""
+import numpy as np
+import torch
+import torch.nn.functional as F
+# Re-export unchanged classes
+from Diffusion.losses import (
+    LMSE,
+    NCC,
+    MRSE,
+    RMSE,
+    Grad,
+    avg_std_skew_kurt,
+    grad_std,
+    avg_std,
+    EPS,
+    eps_scale,
+)
+class LNCC(torch.nn.Module):
+    """
+    Local (over window) normalized cross-correlation (LNCC).
+    Optimized: kernels stored as registered buffers for automatic device transfer.
+    """
+    def __init__(self, win=None, num_ch=1, eps=1e-3, central=True, smooth=True):
+        super(LNCC, self).__init__()
+        self.scale = 2e0
+        self.win = win
+        self.eps = eps
+        self.central = central
+        self.ndims = 3
+        self.strides = [1] * (self.ndims + 2)
+        self.smooth = smooth
+        if self.win is None:
+            self.win = [11] * self.ndims
+        self.padding = [(w - 1) // 2 for w in self.win]
+        if smooth:
+            self.tail = None  # will be set in _build_kernel
+            kernels = self._build_kernel(std=0.5)
+            self.register_buffer('kernels', kernels)  # OPT: auto device transfer
+        self.register_buffer('sum_filt', self._build_kernel(std=0.0))  # OPT: auto device transfer
+    def _build_kernel(self, std=0.0):
+        if std == 0.0:
+            return torch.ones([1, 1, *self.win]) / np.prod(self.win)
+        else:
+            self.tail = int(np.ceil(std)) * 2
+            k = torch.exp(-0.5 * (torch.arange(-self.tail, self.tail + 1, dtype=torch.float32) ** 2) / std ** 2)
+            kernel = k / torch.sum(k)
+            kernel = kernel.view(-1, 1, 1) * kernel.view(1, -1, 1) * kernel.view(1, 1, -1)
+            return kernel.unsqueeze(0).unsqueeze(0)
+    def lncc(self, I, J, label=None):
+        # OPT: no .to(I.device) needed — buffers auto-transfer with module.to()
+        if self.smooth:
+            I = torch.nn.functional.conv3d(I, self.kernels, stride=1, padding=self.tail)
+            J = torch.nn.functional.conv3d(J, self.kernels, stride=1, padding=self.tail)
+        I2 = I * I
+        J2 = J * J
+        IJ = I * J
+        if self.central:
+            I_sum = torch.nn.functional.conv3d(I, self.sum_filt, stride=1, padding=self.padding)
+            J_sum = torch.nn.functional.conv3d(J, self.sum_filt, stride=1, padding=self.padding)
+            I2_sum = torch.nn.functional.conv3d(I2, self.sum_filt, stride=1, padding=self.padding)
+            J2_sum = torch.nn.functional.conv3d(J2, self.sum_filt, stride=1, padding=self.padding)
+            IJ_sum = torch.nn.functional.conv3d(IJ, self.sum_filt, stride=1, padding=self.padding)
+            cross = IJ_sum - (I_sum * J_sum)
+            I_var = I2_sum - (I_sum * I_sum)
+            J_var = J2_sum - (J_sum * J_sum)
+        else:
+            I2_sum = torch.nn.functional.conv3d(I2, self.sum_filt, stride=1, padding=self.padding)
+            J2_sum = torch.nn.functional.conv3d(J2, self.sum_filt, stride=1, padding=self.padding)
+            IJ_sum = torch.nn.functional.conv3d(IJ, self.sum_filt, stride=1, padding=self.padding)
+            cross = IJ_sum
+            I_var = I2_sum
+            J_var = J2_sum
+        cc = (cross * cross) / (I_var + self.eps) / (J_var + self.eps)
+        if label is not None:
+            label = label.float()
+            cc = torch.sum(cc * label, dim=(2, 3, 4)) / (torch.sum(label, dim=(2, 3, 4)) + self.eps)
+        return torch.mean(cc)
+    def forward(self, I, J, label=None):
+        return -self.lncc(I * self.scale, J * self.scale, label=label)
+class MSLNCC(LNCC):
+    """
+    Multi-Scale Local Normalized Cross-Correlation (MSLNCC).
+    Optimized: inherits buffer-based kernels from LNCC.
+    """
+    def __init__(self, win=None, num_ch=1, eps=1e-3, central=True, smooth=False,
+                 scale_ratios=[1, 0.5, 0.25], scale_weights=[0.75, 0.5, 0.25]):
+        super(MSLNCC, self).__init__(win=win, num_ch=num_ch, eps=eps,
+                                     central=central, smooth=smooth)
+        if win is None:
+            win = [9] * self.ndims
+        self.scale_ratios = scale_ratios
+        self.scale_weights = scale_weights
+    def _downsample(self, I, J, label, ratio):
+        if ratio >= 1.0:
+            return I, J, label
+        factor = int(1.0 / ratio)
+        I_down = F.avg_pool3d(I, kernel_size=factor, stride=factor)
+        J_down = F.avg_pool3d(J, kernel_size=factor, stride=factor)
+        label_down = None
+        if label is not None:
+            label_down = F.max_pool3d(label.float(), kernel_size=factor, stride=factor)
+        return I_down, J_down, label_down
+    def forward(self, I, J, label=None):
+        total_loss = 0.0
+        total_weight = 0.0
+        for ratio, weight in zip(self.scale_ratios, self.scale_weights):
+            I_s, J_s, label_s = self._downsample(I, J, label, ratio)
+            total_loss += weight * self.lncc(I_s * self.scale, J_s * self.scale, label=label_s)
+            total_weight += weight
+        return -total_loss / total_weight

Diffusion/networks.py CHANGED Viewed

@@ -1,8 +1,28 @@
 from torch import nn
 import torch
 import torch.nn.functional as F
 import numpy as np
 import math
 def get_net(name="recresnet"):
     name = name.lower()
@@ -16,8 +36,10 @@ def get_net(name="recresnet"):
         net = RecMutAttnNet1
     elif name == "defrecmutattnnet":
         net = DefRec_MutAttnNet
-    elif name == "recmutattnnet_contrastive":
-        net = RecMutAttnNet_contrastive
     else:
         net = None
     return net
@@ -440,6 +462,7 @@ class DefRec_MutAttnNet(nn.Module):
             nn.Linear(dim_out, dim_out)
         )
 class RecMutAttnNet1(nn.Module):
     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
         super(RecMutAttnNet1, self).__init__()
@@ -749,6 +772,8 @@ class RecMutAttnNet(nn.Module):
             else:
                 ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
             img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
         return ddf
@@ -759,9 +784,9 @@ class RecMutAttnNet(nn.Module):
             nn.Linear(dim_out, dim_out)
         )
-class RecMutAttnNet_contrastive(nn.Module):
     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
-        super(RecMutAttnNet_contrastive, self).__init__()
         # self.feat_channels = [num_input_chn, 8, 16, 32, 32, 64]
         self.feat_channels = [num_input_chn, 16, 32, 64, 128, 256]
@@ -785,16 +810,21 @@ class RecMutAttnNet_contrastive(nn.Module):
         self.block_down = nn.ModuleList()
         self.block_up = nn.ModuleList()
         if self.conditional_input:
             self.block_down_cond = nn.ModuleList()
             self.fuse_conv0 = nn.ModuleList()
             self.fuse_conv1 = nn.ModuleList()
-            self.attn_layer = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
             Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
             self.global_maxpool = Global_Maxpool(1)
             self.img2txt = self.Conv(self.feat_channels[-1], self.text_feat_chn, 1, 1, 0)
             self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
             self.txt2img = self.Conv(self.text_feat_chn, self.feat_channels[-1], 1, 1, 0)
-            self.text = torch.zeros(1, self.text_feat_chn, *([1]*self.dimension))
         self.img_res = [res]*self.dimension
         self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in self.img_res]), 0),
                                       [1, self.dimension]+list(self.img_res))
@@ -811,6 +841,11 @@ class RecMutAttnNet_contrastive(nn.Module):
                 AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
             ))
             if self.conditional_input:
                 self.block_down_cond.append(nn.Sequential(
                     AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
                     AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
@@ -829,12 +864,14 @@ class RecMutAttnNet_contrastive(nn.Module):
             ))
         # Bottleneck
         self.tmid = self._make_te(time_emb_dim, self.feat_channels[-1])
         self.b_mid = nn.Sequential(
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims)
         )
         self.conv_out = self.Conv(self.feat_channels[1], ndims, 3, 1, 1)
@@ -860,6 +897,7 @@ class RecMutAttnNet_contrastive(nn.Module):
         self.max_sz = [img_sz[0]] * self.dimension
         ts_emb_shape=[n,-1]+[1]*self.dimension
         self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
         if list(img_sz) != self.img_res:
             # print ("Reinitialize the ref_grid to match the model's input image size.")
@@ -870,6 +908,13 @@ class RecMutAttnNet_contrastive(nn.Module):
         img = x
         t = self.time_embed(t)
         for rec_id in range(rec_num):
             if self.conditional_input:
@@ -879,7 +924,7 @@ class RecMutAttnNet_contrastive(nn.Module):
             for i in range(self.hier_num):
                 out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
                 if self.conditional_input:
-                    tgt = self.block_down_cond[i](tgt)
                     out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
                     tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
                 enc_list.append(out)
@@ -893,19 +938,24 @@ class RecMutAttnNet_contrastive(nn.Module):
                 # out += self.attn_layer(out, tgt, tgt)[0]
                 out_shape = out.shape
                 tgt_shape = tgt.shape
-                # out = out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
-                tgt = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
-                out_attn, _ = self.attn_layer(out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1), tgt, tgt)
                 out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)  # (H*W, N, C) -> (N, C, H, W)
                 out = out + out_attn
             if self.conditional_input:
-                if text is None:
-                    text = self.text
-                    text = text.to(self.device)
-                text = text.view(-1, self.text_feat_chn, *([1]*self.dimension))
-                img_embd = self.global_maxpool(self.img2txt(out)).view(n, -1)  # [B, 1024]
-                out_txt = self.img2txt(out) + text
                 out_txt = self.txt_proc(out_txt)
                 out_txt = self.txt2img(out_txt)
                 out = out + out_txt
@@ -922,8 +972,264 @@ class RecMutAttnNet_contrastive(nn.Module):
             else:
                 ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
             img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
-        return ddf, img_embd
     def _make_te(self, dim_in, dim_out):
         return nn.Sequential(
@@ -931,6 +1237,8 @@ class RecMutAttnNet_contrastive(nn.Module):
             nn.ReLU(),
             nn.Linear(dim_out, dim_out)
         )
 # class RecMutAttnNet(nn.Module):
 #     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True):
 #         super(RecMutAttnNet, self).__init__()
@@ -1085,6 +1393,8 @@ def composite(ddfs,stn=None):
         comp_ddf = ddfs[i] + stn(comp_ddf,ddfs[i])
     return comp_ddf
 class STN(nn.Module):
     def __init__(self,ndims=2,img_sz=None,max_sz=None,device=None,padding_mode="border",resample_mode=None):
         super(STN, self).__init__()
@@ -1148,6 +1458,7 @@ class STN(nn.Module):
         resampled_x = self.resample(x, ddf=ddf, img_sz=self.img_sz, padding_mode=self.padding_mode)
         return resampled_x
 if __name__ == '__main__':
     ndims = 3
     res = 128

 from torch import nn
 import torch
 import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as grad_checkpoint
 import numpy as np
 import math
+from Diffusion.safe_conv_transpose import SafeConvTranspose3d
class UpsampleConv(nn.Module):
    """Interpolate-then-convolve upsampler with a ConvTranspose-like signature.

    The input is resized with ``F.interpolate`` by ``stride`` and then passed
    through a 3-wide, stride-1, padding-1 convolution.

    Original author's notes: intended as a drop-in replacement for
    ConvTranspose3d/2d to avoid a memory leak observed in the ConvTranspose3d
    backward pass on Intel XPU (~0.33 GiB/step, attributed to oneDNN), and to
    avoid checkerboard artifacts common with transposed convolutions.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Accepted for signature compatibility only; not used
            (the convolution kernel is always 3).
        stride: Upsampling factor handed to ``F.interpolate``.
        padding: Accepted for signature compatibility only; not used
            (the convolution padding is always 1).
        ndims: Number of spatial dimensions (1, 2 or 3).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, ndims=3):
        super().__init__()
        self.scale_factor = stride
        # F.interpolate needs a mode matching the input rank: 'linear' for
        # 1 spatial dim, 'bilinear' for 2 and 'trilinear' for 3.
        self.mode = {1: 'linear', 3: 'trilinear'}.get(ndims, 'bilinear')
        Conv = getattr(nn, f'Conv{ndims}d')
        self.conv = Conv(in_channels, out_channels, 3, 1, 1)

    def forward(self, x):
        """Upsample ``x`` by ``self.scale_factor`` and apply the convolution."""
        x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode, align_corners=False)
        return self.conv(x)
 def get_net(name="recresnet"):
     name = name.lower()
         net = RecMutAttnNet1
     elif name == "defrecmutattnnet":
         net = DefRec_MutAttnNet
+    elif name == "recmulmodmutattnnet":
+        net = RecMulModMutAttnNet
+    elif name == "om_net":
+        net = OM_net
     else:
         net = None
     return net
             nn.Linear(dim_out, dim_out)
         )
 class RecMutAttnNet1(nn.Module):
     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
         super(RecMutAttnNet1, self).__init__()
             else:
                 ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
             img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        # print(torch.max(torch.abs(ddf)))
         return ddf
             nn.Linear(dim_out, dim_out)
         )
+class RecMulModMutAttnNet(nn.Module):
     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
+        super(RecMulModMutAttnNet, self).__init__()
         # self.feat_channels = [num_input_chn, 8, 16, 32, 32, 64]
         self.feat_channels = [num_input_chn, 16, 32, 64, 128, 256]
         self.block_down = nn.ModuleList()
         self.block_up = nn.ModuleList()
         if self.conditional_input:
+            # self.gate_img = nn.ModuleList()
+            self.txt_layers = nn.ModuleList()
             self.block_down_cond = nn.ModuleList()
             self.fuse_conv0 = nn.ModuleList()
             self.fuse_conv1 = nn.ModuleList()
+            self.attn_layer0 = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
+            self.attn_layer1 = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
             Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
             self.global_maxpool = Global_Maxpool(1)
             self.img2txt = self.Conv(self.feat_channels[-1], self.text_feat_chn, 1, 1, 0)
             self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
             self.txt2img = self.Conv(self.text_feat_chn, self.feat_channels[-1], 1, 1, 0)
+            # self.text = torch.zeros(1, self.text_feat_chn, *([1]*self.dimension))
+            self.text = torch.zeros(1, self.text_feat_chn)
         self.img_res = [res]*self.dimension
         self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in self.img_res]), 0),
                                       [1, self.dimension]+list(self.img_res))
                 AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
             ))
             if self.conditional_input:
+                # self.gate_img.append(nn.Sequential(
+                #     nn.ConvNd(self.dimension, self.feat_channels[i], self.feat_channels[i], kernel_size=1, stride=1, padding=0),
+                #     nn.Sigmoid()
+                # ))
+                self.txt_layers.append((self._make_te(self.text_feat_chn, self.feat_channels[i])))
                 self.block_down_cond.append(nn.Sequential(
                     AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
                     AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
             ))
         # Bottleneck
+        self.txt_layers.append((self._make_te(self.text_feat_chn, self.text_feat_chn)))
         self.tmid = self._make_te(time_emb_dim, self.feat_channels[-1])
         self.b_mid = nn.Sequential(
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims)
         )
+        self.fuse = self.Conv(2*self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], 1, 1, 0)
         self.conv_out = self.Conv(self.feat_channels[1], ndims, 3, 1, 1)
         self.max_sz = [img_sz[0]] * self.dimension
         ts_emb_shape=[n,-1]+[1]*self.dimension
         self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
         if list(img_sz) != self.img_res:
             # print ("Reinitialize the ref_grid to match the model's input image size.")
         img = x
         t = self.time_embed(t)
+        if text is None:
+            text = self.text
+            # print(text.shape)
+            text = text.to(self.device)
+            txt_shape = [1,-1]+[1]*self.dimension
+        else:
+            txt_shape = [n,-1]+[1]*self.dimension
         for rec_id in range(rec_num):
             if self.conditional_input:
             for i in range(self.hier_num):
                 out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
                 if self.conditional_input:
+                    tgt = self.block_down_cond[i](tgt) + self.txt_layers[i](text).reshape(txt_shape)
                     out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
                     tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
                 enc_list.append(out)
                 # out += self.attn_layer(out, tgt, tgt)[0]
                 out_shape = out.shape
                 tgt_shape = tgt.shape
+                out_flat = out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
+                tgt_flat = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
+                out_attn, _ = self.attn_layer0(out_flat, tgt_flat, tgt_flat)
+                tgt_attn, _ = self.attn_layer1(tgt_flat, out_flat, out_flat)
                 out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)  # (H*W, N, C) -> (N, C, H, W)
+                tgt_attn = tgt_attn.permute(1, 2, 0).contiguous().view(tgt_shape)  # (H*W, N, C) -> (N, C, H, W)
                 out = out + out_attn
+                tgt = tgt + tgt_attn
+                out = self.fuse(torch.cat([out, tgt], dim=1))
             if self.conditional_input:
+                # text = text.view(-1, self.text_feat_chn, *([1]*self.dimension))
+                # out_txt = self.img2txt(out) + text.reshape(txt_shape)
+                img_txt_feat = self.img2txt(out)
+                self.img_embd = self.global_maxpool(img_txt_feat).view(n, -1)  # [B, 1024]
+                out_txt = self.txt_layers[-1](text).reshape(txt_shape) + img_txt_feat
                 out_txt = self.txt_proc(out_txt)
                 out_txt = self.txt2img(out_txt)
                 out = out + out_txt
             else:
                 ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
             img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        # print(torch.max(torch.abs(ddf)))
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        return nn.Sequential(
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out)
+        )
class OM_net(nn.Module):
    """
    Extended RecMulModMutAttnNet with gated attention mechanisms:
    1. Text Gate: a sigmoid weight ``w_txt`` computed from the text embedding
       alone. It scales the text embedding added at every encoder level and,
       at the bottleneck, interpolates between the image features and the
       text-processed features (``(1 - w_txt) * out + w_txt * out_txt``).
    2. Target Gate (each encoder level): a per-voxel 2-channel softmax whose
       first channel weights the condition features against the image
       features in the input of ``fuse_conv1``.
    Supports gradient checkpointing via the `use_checkpoint` attribute to
    reduce peak activation memory (trades compute for memory).

    The network predicts a dense displacement field (DDF) recursively:
    each of the ``rec_num`` passes refines the field using the input image
    warped by the current estimate.
    """

    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0,
                 conditional_input=True, text_feat_chn=1024, num_heads=4,
                 use_conv_transpose=False):
        """Build encoder, bottleneck and decoder stacks.

        Args:
            n_steps: Number of rows of the frozen sinusoidal time embedding.
            time_emb_dim: Width of the time embedding.
            ndims: Number of spatial dimensions.
            num_input_chn: Channels of the input image.
            res: Nominal image resolution used for the initial reference grid.
            conditional_input: Build the condition/text branches when True.
            text_feat_chn: Width of the text embedding.
            num_heads: Heads of the two bottleneck attention layers.
            use_conv_transpose: Stored on the instance; the up layers built
                here are always ``SafeConvTranspose3d`` regardless of it.
        """
        super(OM_net, self).__init__()
        self.use_checkpoint = False  # Set True to enable gradient checkpointing
        self.use_conv_transpose = use_conv_transpose
        self.feat_channels = [num_input_chn, 12, 32, 64, 128, 512]
        self.conditional_input = conditional_input
        self.num_heads = num_heads
        self.text_feat_chn = text_feat_chn
        self.dimension = ndims
        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
        # Sinusoidal embedding (frozen)
        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
        self.time_embed.requires_grad_(False)
        self.hier_num = len(self.feat_channels) - 1
        self.down_layers = nn.ModuleList()
        self.up_layers = nn.ModuleList()
        self.ted_layers = nn.ModuleList()
        self.teu_layers = nn.ModuleList()
        self.block_down = nn.ModuleList()
        self.block_up = nn.ModuleList()
        if self.conditional_input:
            self.txt_layers = nn.ModuleList()
            self.block_down_cond = nn.ModuleList()
            self.fuse_conv0 = nn.ModuleList()
            self.fuse_conv1 = nn.ModuleList()
            self.tgt_gate = nn.ModuleList()  # Target gate per encoder level
            self.attn_layer0 = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
            self.attn_layer1 = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
            Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
            self.global_maxpool = Global_Maxpool(1)
            self.img2txt = self.Conv(self.feat_channels[-1], self.text_feat_chn, 1, 1, 0)
            self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
            self.txt2img = self.Conv(self.text_feat_chn, self.feat_channels[-1], 1, 1, 0)
            # All-zero embedding used by forward() when no text is supplied.
            self.text = torch.zeros(1, self.text_feat_chn)
            # Text Gate: text-only MLP -> sigmoid weight (computed before rec loop)
            self.text_gate = nn.Sequential(
                nn.Linear(self.text_feat_chn, self.text_feat_chn // 4),
                nn.ReLU(),
                nn.Linear(self.text_feat_chn // 4, 1),
                nn.Sigmoid()
            )
        self.img_res = [res] * self.dimension
        self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in self.img_res]), 0),
                                      [1, self.dimension] + list(self.img_res))
        for i in range(1, self.hier_num + 1):
            j = -i
            c_prev = self.feat_channels[i - 1]
            c_cur = self.feat_channels[i]
            c_up = self.feat_channels[j]
            enc_res = res // (2 ** (i - 1))
            dec_res = res // (2 ** (self.hier_num - i - 1))
            self.down_layers.append(self.Conv(c_cur, c_cur, 4, 2, 1))
            self.up_layers.append(SafeConvTranspose3d(c_up, c_up, 4, 2, 1))
            self.ted_layers.append(self._make_te(time_emb_dim, c_prev))
            self.teu_layers.append(self._make_te(time_emb_dim, 2 * c_up))
            self.block_down.append(nn.Sequential(
                AtrousBlock([c_prev] + [enc_res] * ndims, c_prev, c_cur, ndims=ndims),
                AtrousBlock([c_cur] + [enc_res] * ndims, c_cur, c_cur, ndims=ndims),
                AtrousBlock([c_cur] + [enc_res] * ndims, c_cur, c_cur, ndims=ndims)
            ))
            if self.conditional_input:
                self.txt_layers.append((self._make_te(self.text_feat_chn, c_cur)))
                self.block_down_cond.append(nn.Sequential(
                    AtrousBlock([c_prev] + [enc_res] * ndims, c_prev, c_cur, ndims=ndims),
                    AtrousBlock([c_cur] + [enc_res] * ndims, c_cur, c_cur, ndims=ndims),
                    AtrousBlock([c_cur] + [enc_res] * ndims, c_cur, c_cur, ndims=ndims)
                ))
                self.fuse_conv0.append(self.Conv(2 * c_cur, c_cur, 1, 1, 0))
                self.fuse_conv1.append(self.Conv(2 * c_cur, c_cur, 1, 1, 0))
                # Target Gate: residual AtrousBlock -> 2-channel logits (condition vs noise)
                self.tgt_gate.append(nn.Sequential(
                    AtrousBlock([c_cur] + [enc_res] * ndims,
                                c_cur, c_cur, ndims=ndims, atrous_rates=[1, 3]),
                    self.Conv(c_cur, 2, 1, 1, 0)
                ))
            # The last decoder stage keeps its channel count; earlier stages
            # reduce to the channel count of the next (shallower) level.
            k = j if i == self.hier_num else j - 1
            c_out = self.feat_channels[k]
            self.block_up.append(nn.Sequential(
                AtrousBlock([2 * c_up] + [dec_res] * ndims, 2 * c_up, c_up, ndims=ndims, normalize=False),
                AtrousBlock([c_up] + [dec_res] * ndims, c_up, c_up, ndims=ndims, normalize=False),
                AtrousBlock([c_up] + [dec_res] * ndims, c_up, c_out, ndims=ndims, normalize=False)
            ))
        # Bottleneck
        if self.conditional_input:
            # txt_layers only exists on the conditional path; appending to it
            # unconditionally made conditional_input=False fail to construct.
            self.txt_layers.append((self._make_te(self.text_feat_chn, self.text_feat_chn)))
        self.tmid = self._make_te(time_emb_dim, self.feat_channels[-1])
        mid_chn = self.feat_channels[self.hier_num]
        mid_res = res // (2 ** self.hier_num)
        self.b_mid = nn.Sequential(
            AtrousBlock([mid_chn] + [mid_res] * ndims, mid_chn, mid_chn, ndims=ndims),
            AtrousBlock([mid_chn] + [mid_res] * ndims, mid_chn, mid_chn, ndims=ndims),
            AtrousBlock([mid_chn] + [mid_res] * ndims, mid_chn, mid_chn, ndims=ndims)
        )
        self.fuse = self.Conv(2 * mid_chn, mid_chn, 1, 1, 0)
        self.conv_out = self.Conv(self.feat_channels[1], ndims, 3, 1, 1)
        # Initialize target gates toward pass-through (condition confidence high)
        if self.conditional_input:
            self._init_tgt_gates()

    def _init_tgt_gates(self):
        """Bias the target gates so the condition channel starts high.

        The final 2-channel conv of every gate gets biases ``[1, -1]``. When
        the conv's weighted input is near zero this yields a softmax of about
        0.88 for the condition channel and 0.12 for the noise channel
        (``sigmoid(2)``), milder than ``[2, -2]`` so that both halves of the
        ``fuse_conv1`` input carry signal early in training.
        """
        for gate_seq in self.tgt_gate:
            final_conv = gate_seq[-1]  # the Conv that outputs 2 channels
            with torch.no_grad():
                final_conv.bias.data[0] = 1.0   # condition channel
                final_conv.bias.data[1] = -1.0  # noise channel

    def _encoder_level(self, i, out, tgt, t, ts_emb_shape, text, txt_shape, w_txt):
        """Run encoder level ``i`` (kept separate so it can be checkpointed).

        Returns the updated ``(out, tgt)`` pair; ``tgt`` is passed through
        untouched when the condition branch is disabled or ``tgt`` is None.
        """
        out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
        if self.conditional_input and tgt is not None:
            tgt = self.block_down_cond[i](tgt) + w_txt * self.txt_layers[i](text).reshape(txt_shape)
            gate_logits = self.tgt_gate[i](tgt)
            # Channel 0 of the softmax is the per-voxel condition confidence.
            cond_confidence = F.softmax(gate_logits, dim=1)[:, 0:1]
            tgt = self.fuse_conv1[i](torch.cat([cond_confidence * tgt, (1 - cond_confidence) * out], axis=1))
            out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
        return out, tgt

    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
        """Clamp each displacement channel to its size-dependent range.

        Channel ``c`` is scaled by ``max_sz[c]``, clamped to
        ``[minus - sz + plus, sz - minus + plus]`` and scaled back.
        """
        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
                          zip(sample_coords, max_sz)], 1)

    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
        """Warp ``vol`` with displacement field ``ddf`` via ``F.grid_sample``.

        ``ddf`` is scaled by ``self.max_sz``, added to the reference grid,
        moved to channels-last, divided by ``img_sz`` and shifted by -1, and
        its last axis is flipped before sampling.
        """
        ref = self.ref_grid if ref is None else ref
        img_sz = self.max_sz if img_sz is None else img_sz
        resample_mode = 'bilinear'
        scale = torch.Tensor(
            np.reshape(np.array(self.max_sz), [1, self.dimension] + [1] * self.dimension)).to(self.device)
        channels_last = [0] + list(range(2, 2 + self.dimension)) + [1]
        grid = torch.flip((ddf * scale + ref).permute(channels_last) / img_sz - 1, dims=[-1])
        return F.grid_sample(vol, grid, mode=resample_mode, padding_mode=padding_mode,
                             align_corners=True)

    def forward(self, x=None, y=None, t=None, text=None, rec_num=2, ndims=2):
        """Predict the displacement field for ``x``.

        Args:
            x: Input image batch; spatial sizes are read from ``x.size()[2:]``.
            y: Condition image, used only when ``conditional_input`` is True.
            t: Time-step indices looked up in the frozen time embedding.
            text: Text embedding; the stored all-zero embedding is used when
                None (conditional mode only).
            rec_num: Number of recursive refinement passes.
            ndims: Unused; kept for interface compatibility.

        Returns:
            The composed displacement field after ``rec_num`` passes. In
            conditional mode the pooled bottleneck feature of the last pass
            is also stored on ``self.img_embd``.
        """
        self.device = x.device
        img_sz = x.size()[2:]
        n = x.size()[0]
        self.max_sz = [img_sz[0]] * self.dimension
        ts_emb_shape = [n, -1] + [1] * self.dimension
        self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1] * (self.dimension + 1) + [self.dimension])
        if list(img_sz) != self.img_res:
            # Rebuild the reference grid to match the actual input size.
            self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in img_sz]), 0),
                                          [1, self.dimension] + list(img_sz))
        self.ref_grid = self.ref_grid.to(self.device)
        img = x
        t = self.time_embed(t)
        if self.conditional_input:
            if text is None:
                text = self.text
                text = text.to(self.device)
                txt_shape = [1, -1] + [1] * self.dimension
            else:
                txt_shape = [n, -1] + [1] * self.dimension
            # Text Gate: compute w_txt from text embedding alone before rec loop
            txt_vec = text.view(text.size(0), -1)  # [1, C_txt] or [n, C_txt]
            if txt_vec.size(0) == 1 and n > 1:
                txt_vec = txt_vec.expand(n, -1)
            w_txt = self.text_gate(txt_vec)  # [B, 1]
            w_txt = w_txt.view([w_txt.size(0), 1] + [1] * self.dimension)
        else:
            # The text branch (self.text, self.text_gate) is only built in
            # conditional mode; none of these values is read otherwise.
            text = txt_shape = w_txt = None
        for rec_id in range(rec_num):
            if self.conditional_input:
                tgt = y
            enc_list = []
            out = img
            for i in range(self.hier_num):
                level_tgt = tgt if self.conditional_input else None
                # Gradient checkpointing on the first three encoder levels
                # (the largest feature maps) to reduce peak activation memory.
                if self.use_checkpoint and self.training and i < 3:
                    out, tgt = grad_checkpoint(
                        self._encoder_level, i, out, level_tgt,
                        t, ts_emb_shape, text, txt_shape, w_txt,
                        use_reentrant=False,
                    )
                else:
                    out, tgt = self._encoder_level(
                        i, out, level_tgt,
                        t, ts_emb_shape, text, txt_shape, w_txt,
                    )
                enc_list.append(out)
                out = self.down_layers[i](out)
                if self.conditional_input:
                    tgt = self.down_layers[i](tgt)
            out = self.b_mid(out + self.tmid(t).reshape(ts_emb_shape))
            if self.conditional_input:
                # Mutual cross-attention between image and condition features.
                out_shape = out.shape
                tgt_shape = tgt.shape
                out_flat = out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1)  # (N, C, ...) -> (L, N, C)
                tgt_flat = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)
                out_attn, _ = self.attn_layer0(out_flat, tgt_flat, tgt_flat)
                tgt_attn, _ = self.attn_layer1(tgt_flat, out_flat, out_flat)
                out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)
                tgt_attn = tgt_attn.permute(1, 2, 0).contiguous().view(tgt_shape)
                out = out + out_attn
                tgt = tgt + tgt_attn
                out = self.fuse(torch.cat([out, tgt], dim=1))
            if self.conditional_input:
                img_txt_feat = self.img2txt(out)
                self.img_embd = self.global_maxpool(img_txt_feat).view(n, -1)  # [B, text_feat_chn]
                out_txt = self.txt_layers[-1](text).reshape(txt_shape) - img_txt_feat
                out_txt = self.txt_proc(out_txt)
                out_txt = self.txt2img(out_txt)
                # Text Gate: w_txt precomputed from text embedding alone
                out = (1 - w_txt) * out + w_txt * out_txt
            for i in range(self.hier_num):
                out = torch.cat((self.up_layers[i](out), enc_list[-i - 1]), dim=1)
                out = self.block_up[i](out + self.teu_layers[i](t).reshape(ts_emb_shape))
            out = self.conv_out(out) / 128
            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
            if rec_id == 0:
                ddf = ddf_one
            else:
                # Compose the new increment with the field accumulated so far.
                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
        return ddf
     def _make_te(self, dim_in, dim_out):
         return nn.Sequential(
             nn.ReLU(),
             nn.Linear(dim_out, dim_out)
         )
 # class RecMutAttnNet(nn.Module):
 #     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True):
 #         super(RecMutAttnNet, self).__init__()
         comp_ddf = ddfs[i] + stn(comp_ddf,ddfs[i])
     return comp_ddf
 class STN(nn.Module):
     def __init__(self,ndims=2,img_sz=None,max_sz=None,device=None,padding_mode="border",resample_mode=None):
         super(STN, self).__init__()
         resampled_x = self.resample(x, ddf=ddf, img_sz=self.img_sz, padding_mode=self.padding_mode)
         return resampled_x
 if __name__ == '__main__':
     ndims = 3
     res = 128

Diffusion/networks0.py ADDED Viewed

	@@ -0,0 +1,1195 @@

+from torch import nn
+import torch
+import torch.nn.functional as F
+import numpy as np
+import math
def get_net(name="recresnet"):
    """Look up a network class by its case-insensitive name.

    Returns the matching class, or None when the name is not recognised
    (which includes the default value ``"recresnet"``).
    """
    # Each entry is resolved lazily so that only the selected class name is
    # looked up, exactly as in a chain of if/elif branches.
    registry = {
        "recresacnet": lambda: RecResACNet,
        "recmutattnnet": lambda: RecMutAttnNet,
        "recmutattnnet0": lambda: RecMutAttnNet0,
        "recmutattnnet1": lambda: RecMutAttnNet1,
        "defrecmutattnnet": lambda: DefRec_MutAttnNet,
        "recmulmodmutattnnet": lambda: RecMulModMutAttnNet,
    }
    resolver = registry.get(name.lower())
    if resolver is None:
        return None
    return resolver()
def sinusoidal_embedding(n, d):
    """Build an ``(n, d)`` table of sinusoidal position embeddings.

    Row ``p`` holds ``sin(p * w)`` in the even columns and ``cos(p * w)`` in
    the odd columns, where the frequencies ``w`` are
    ``1 / 10000 ** (2 * j / d)`` taken at the even indices ``j``.

    Args:
        n: Number of positions (rows).
        d: Embedding width (columns); may be even or odd.

    Returns:
        A float tensor of shape ``(n, d)``.
    """
    embedding = torch.zeros(n, d)
    wk = torch.tensor([1 / 10_000 ** (2 * j / d) for j in range(d)])
    wk = wk.reshape((1, d))
    t = torch.arange(n).reshape((n, 1))
    angles = t * wk[:, ::2]
    embedding[:, ::2] = torch.sin(angles)
    # For odd d there is one fewer odd column than even column, so trim the
    # angle table to the number of cos columns.
    embedding[:, 1::2] = torch.cos(angles[:, :d // 2])
    return embedding
class AtrousBlock(nn.Module):
    """Residual block of activation + (dilated) convolution pairs.

    An optional projection conv maps ``in_c`` to ``out_c`` when they differ,
    followed by an optional affine instance norm. The result is then passed
    through one ``activation -> conv`` pair per entry of ``atrous_rates`` and
    added back before a final activation.

    Args:
        shape: Not used by the block (a commented-out LayerNorm once took
            it); kept so existing call sites continue to work.
        in_c: Input channels.
        out_c: Output channels.
        kernel_size: Kernel size of the projection and dilated convolutions.
        stride: Not used by the block; every convolution has stride 1.
        atrous_rates: Dilation rate per convolution; a rate of 0 (or less)
            selects a plain 1-wide convolution instead.
        ndims: Number of spatial dimensions (selects ConvNd / InstanceNormNd).
        activation: Activation module; defaults to ``nn.LeakyReLU(1e-6)``.
        normalize: Apply instance normalisation when True.
    """

    def __init__(self, shape, in_c, out_c, kernel_size=3, stride=1, atrous_rates=(1, 3), ndims=2, activation=None, normalize=True):
        super(AtrousBlock, self).__init__()
        if normalize:
            norm = getattr(nn, 'InstanceNorm%dd' % ndims)     # jzheng 15/03/2024
            self.ln = norm(out_c, affine=True)
        else:
            self.ln = nn.Identity()
        Conv = getattr(nn, 'Conv%dd' % ndims)
        if in_c != out_c:
            # Size-preserving projection to the block's working channel count.
            self.conv0 = Conv(in_c, out_c, kernel_size, 1, (kernel_size - 1) // 2 * 1)
        else:
            self.conv0 = None
        # Padding grows with the dilation so the spatial size is preserved
        # for odd kernel sizes.
        self.convs = nn.ModuleList([
            Conv(out_c, out_c, kernel_size, 1, (kernel_size - 1) // 2 * ar, dilation=ar)
            if ar > 0 else Conv(out_c, out_c, 1, 1, 0)
            for ar in atrous_rates
        ])
        self.activation = nn.LeakyReLU(1e-6) if activation is None else activation
        self.normalize = normalize

    def forward(self, x):
        """Apply projection, normalisation and the residual conv stack."""
        if self.conv0 is not None:
            x = self.conv0(x)
        if self.normalize:
            x = self.ln(x)     # jzheng 15/03/2024
        out = x
        for conv in self.convs:
            out = self.activation(out)
            out = conv(out)
        return self.activation(out + x)
+# ==============================================
+# Unconditional Network
+# ==============================================
+class RecResACNet(nn.Module):
+    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0):
+        super(RecResACNet, self).__init__()
+        self.dimension = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+        # Sinusoidal embedding
+        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+        self.time_embed.requires_grad_(False)
+        # First half
+        self.te1 = self._make_te(time_emb_dim, 1)
+        self.b1 = nn.Sequential(
+            AtrousBlock([num_input_chn] + [res] * ndims, num_input_chn, 10, ndims=ndims),
+            AtrousBlock([10] + [res] * ndims, 10, 10, ndims=ndims),
+            AtrousBlock([10] + [res] * ndims, 10, 10, ndims=ndims),
+        )
+        self.down1 = self.Conv(10, 10, 4, 2, 1)
+        self.te2 = self._make_te(time_emb_dim, 10)
+        self.b2 = nn.Sequential(
+            AtrousBlock([10] + [res // 2] * ndims, 10, 20, ndims=ndims),
+            AtrousBlock([20] + [res // 2] * ndims, 20, 20, ndims=ndims),
+            AtrousBlock([20] + [res // 2] * ndims, 20, 20, ndims=ndims)
+        )
+        self.down2 = self.Conv(20, 20, 4, 2, 1)
+        self.te3 = self._make_te(time_emb_dim, 20)
+        self.b3 = nn.Sequential(
+            AtrousBlock([20] + [res // 4] * ndims, 20, 40, ndims=ndims),
+            AtrousBlock([40] + [res // 4] * ndims, 40, 40, ndims=ndims),
+            AtrousBlock([40] + [res // 4] * ndims, 40, 40, ndims=ndims)
+        )
+        self.down3 = self.Conv(40, 40, 4, 2, 1)
+        # Bottleneck
+        self.te_mid = self._make_te(time_emb_dim, 40)
+        self.b_mid = nn.Sequential(
+            AtrousBlock([40] + [res // 8] * ndims, 40, 20, ndims=ndims),
+            AtrousBlock([20] + [res // 8] * ndims, 20, 20, ndims=ndims),
+            AtrousBlock([20] + [res // 8] * ndims, 20, 40, ndims=ndims)
+        )
+        # Second half
+        self.up1 = self.ConvT(40, 40, 4, 2, 1)
+        self.te4 = self._make_te(time_emb_dim, 80)
+        self.b4 = nn.Sequential(
+            AtrousBlock([80] + [res // 4] * ndims, 80, 40, ndims=ndims, normalize=False),
+            AtrousBlock([40] + [res // 4] * ndims, 40, 20, ndims=ndims, normalize=False),
+            AtrousBlock([20] + [res // 4] * ndims, 20, 20, ndims=ndims, normalize=False)
+        )
+        self.up2 = self.ConvT(20, 20, 4, 2, 1)
+        self.te5 = self._make_te(time_emb_dim, 40)
+        self.b5 = nn.Sequential(
+            AtrousBlock([40] + [res // 2] * ndims, 40, 20, ndims=ndims, normalize=False),
+            AtrousBlock([20] + [res // 2] * ndims, 20, 10, ndims=ndims, normalize=False),
+            AtrousBlock([10] + [res // 2] * ndims, 10, 10, ndims=ndims, normalize=False)
+        )
+        self.up3 = self.ConvT(10, 10, 4, 2, 1)
+        self.te_out = self._make_te(time_emb_dim, 20)
+        self.b_out = nn.Sequential(
+            AtrousBlock([20] + [res // 1] * ndims, 20, 10, ndims=ndims, normalize=False),
+            AtrousBlock([10] + [res // 1] * ndims, 10, 10, ndims=ndims, normalize=False),
+            AtrousBlock([10] + [res // 1] * ndims, 10, 10, ndims=ndims, normalize=False)
+        )
+        self.conv_out = self.Conv(10, ndims, 3, 1, 1)
+    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+                          zip(sample_coords, max_sz)], 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        ref = self.ref_grid if ref is None else ref
+        img_sz = self.max_sz if img_sz is None else img_sz
+        # resample_mode = 'bicubic'
+        resample_mode = 'bilinear' # if self.dimension==2 else 'trilinear'
+        # padding_mode = "border"
+        if True:
+            # return F.grid_sample(vol, torch.flip(torch.transpose(ddf * torch.Tensor(np.reshape(np.array(self.max_sz), [1, 1, 1, self.dimension])).cuda() + ref,[0, 2, 3, 1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,align_corners=True)
+            return F.grid_sample(vol, torch.flip((ddf * torch.Tensor(
+                np.reshape(np.array(self.max_sz), [1, self.dimension]+[1]*self.dimension)).to(self.device) + ref).permute(
+                [0]+list(range(2,2+self.dimension))+[1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,
+                                 align_corners=True)
+    def forward(self, x=None, t=None, y=None, rec_num=2, ndims=2):
+        """Predict a displacement field by running the U-Net ``rec_num`` times.
+
+        Each pass feeds the currently warped image through the network, clamps
+        the output with ``boundary_limit`` and, after the first pass, composes
+        it with the accumulated field through ``resample``.
+
+        Args:
+            x: Input batch.  ``x.size()[0]`` is used as the batch size and
+                ``x.size()[2:]`` as the spatial extents; its device is stored
+                on ``self.device``.
+            t: Indices looked up in ``self.time_embed``.
+            y: Not used by this method.
+            rec_num: Number of recursive passes.  With ``rec_num < 1`` the loop
+                never runs and ``ddf`` is unbound at the ``return``.
+            ndims: Not used by this method; ``self.dimension`` is used instead.
+
+        Returns:
+            The accumulated displacement field ``ddf``.
+        """
+        # Cache the device so ``resample`` can place its scale tensor on it.
+        self.device = x.device
+        img_sz = x.size()[2:]
+        n = x.size()[0]
+        # The first spatial extent is reused for every axis.
+        self.max_sz = [img_sz[0]] * self.dimension
+        # Shape used to broadcast a per-sample time embedding over space.
+        ts_emb_shape=[n,-1]+[1]*self.dimension
+        # Half-extents (size - 1) / 2, laid out channel-last for ``resample``.
+        self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
+        # Index grid of shape [1, dimension, *img_sz], rebuilt on every call.
+        self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in img_sz]), 0),
+                                      [1, self.dimension]+list(img_sz)).to(self.device)
+        img = x
+        t = self.time_embed(t)
+        for rec_id in range(rec_num):
+            # Encoder: the time embedding is added before every block.
+            out1 = self.b1(img + self.te1(t).reshape(ts_emb_shape))
+            out2 = self.b2(self.down1(out1) + self.te2(t).reshape(ts_emb_shape))
+            out3 = self.b3(self.down2(out2) + self.te3(t).reshape(ts_emb_shape))
+            # NOTE(review): the bottleneck MULTIPLIES by the time embedding while
+            # every other stage adds it -- confirm this is intentional.
+            out_mid = self.b_mid(self.down3(out3) * self.te_mid(t).reshape(ts_emb_shape))
+            # Decoder: upsample, concatenate the matching encoder output, add time.
+            out4 = torch.cat((out3, self.up1(out_mid)), dim=1)
+            out4 = self.b4(out4 + self.te4(t).reshape(ts_emb_shape))
+            out5 = torch.cat((out2, self.up2(out4)), dim=1)
+            out5 = self.b5(out5 + self.te5(t).reshape(ts_emb_shape))
+            out = torch.cat((out1, self.up3(out5)), dim=1)
+            out = self.b_out(out + self.te_out(t).reshape(ts_emb_shape))
+            out = self.conv_out(out)
+            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = ddf_one
+            else:
+                # Compose: resample the accumulated field by the new increment, then add it.
+                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+            # Warp the ORIGINAL input with the total field for the next pass.
+            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        """Build the two-layer MLP that projects a time embedding to ``dim_out``."""
+        layers = [
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out),
+        ]
+        return nn.Sequential(*layers)
+# ==============================================
+# Conditional Network
+# ==============================================
+class cross_attn(nn.Module):
+    """Single-head dot-product cross-attention built from caller-supplied projections.
+
+    ``q`` is applied to the query input ``x``; ``k`` and ``v`` are applied to
+    the context input ``y``.  The scores ``q @ k^T`` are soft-maxed over the
+    last axis and used to mix the values.
+    """
+    def __init__(self, q, k, v, ndims=2):
+        # nn.Module.__init__ must run before any sub-module or parameter is
+        # assigned; without it the assignments below raise AttributeError and
+        # the class cannot be instantiated.
+        super(cross_attn, self).__init__()
+        self.q = q
+        self.k = k
+        self.v = v
+        self.ndims = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.ndims)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.ndims)
+        self.softmax = nn.Softmax(dim=-1)
+        # Kept for interface parity; ``forward`` does not read it.
+        self.gamma = nn.Parameter(torch.zeros(1))
+    def forward(self, x, y):
+        """Return ``softmax(q(x) @ k(y)^T) @ v(y)``."""
+        q = self.q(x)
+        k = self.k(y)
+        v = self.v(y)
+        attn = self.softmax(torch.matmul(q, k.transpose(-2, -1)))
+        out = torch.matmul(attn, v)
+        return out
+class DefRec_MutAttnNet(nn.Module):
+    """Time-conditioned U-Net that recursively predicts a displacement field.
+
+    The source image ``x`` is encoded together with features of a conditioning
+    image ``y`` (fused per level by 1x1 convolutions and by multi-head attention
+    at the bottleneck) and an optional text feature.  One pass is run per entry
+    of the expanded time-step list; every pass refines the accumulated field.
+    """
+    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
+        super(DefRec_MutAttnNet, self).__init__()
+        self.feat_channels = [num_input_chn, 16, 32, 128, 256, 512]
+        self.conditional_input = conditional_input
+        self.num_heads = num_heads
+        self.text_feat_chn = text_feat_chn
+        self.dimension = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+        self.copy = nn.Identity()
+        # Frozen sinusoidal time-step embedding table.
+        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+        self.time_embed.requires_grad_(False)
+        fc = self.feat_channels
+        self.hier_num = len(fc) - 1
+        self.down_layers = nn.ModuleList()
+        self.up_layers = nn.ModuleList()
+        self.ted_layers = nn.ModuleList()
+        self.teu_layers = nn.ModuleList()
+        self.block_down = nn.ModuleList()
+        self.block_up = nn.ModuleList()
+        if self.conditional_input:
+            self.block_down_cond = nn.ModuleList()
+            self.fuse_conv0 = nn.ModuleList()
+            self.attn_layer = nn.MultiheadAttention(fc[-1], self.num_heads)
+            Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
+            self.global_maxpool = Global_Maxpool(1)
+            self.img2txt = self.Conv(fc[-1], self.text_feat_chn, 1, 1, 0)
+            self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
+            self.txt2img = self.Conv(self.text_feat_chn, fc[-1], 1, 1, 0)
+            # Default (all-zero) text feature used when ``forward`` gets text=None.
+            self.text = torch.zeros(1, self.text_feat_chn, *([1]*self.dimension))
+        self.img_res = [res]*self.dimension
+        # Plain attribute (not a buffer); ``forward`` moves it to the input device.
+        self.ref_grid = self._make_ref_grid(self.img_res)
+        for i in range(1, self.hier_num + 1):
+            j = -i
+            down_res = res // (2 ** (i-1))
+            # NOTE(review): for i == hier_num the exponent is -1, so this value
+            # is a float (res // 0.5).  Kept as in the original; the blocks that
+            # receive it are built with normalize=False -- confirm it is unused.
+            up_res = res // (2 ** (self.hier_num-i-1))
+            # The last decoder stage keeps fc[1] channels; earlier ones step down a level.
+            k = j if i == self.hier_num else j - 1
+            self.down_layers.append(self.Conv(fc[i], fc[i], 4, 2, 1))
+            self.up_layers.append(self.ConvT(fc[j], fc[j], 4, 2, 1))
+            self.ted_layers.append(self._make_te(time_emb_dim, fc[i-1]))
+            self.teu_layers.append(self._make_te(time_emb_dim, 2*fc[j]))
+            self.block_down.append(self._atrous_stack(down_res, [fc[i-1], fc[i], fc[i], fc[i]]))
+            if self.conditional_input:
+                self.block_down_cond.append(self._atrous_stack(down_res, [fc[i-1], fc[i], fc[i], fc[i]]))
+                self.fuse_conv0.append(self.Conv(2*fc[i], fc[i], 1, 1, 0))
+            self.block_up.append(self._atrous_stack(up_res, [2*fc[j], fc[j], fc[j], fc[k]], normalize=False))
+        # Bottleneck
+        self.tmid = self._make_te(time_emb_dim, fc[-1])
+        mid_c = fc[self.hier_num]
+        self.b_mid = self._atrous_stack(res // (2**self.hier_num), [mid_c, mid_c, mid_c, mid_c])
+        self.conv_out = self.Conv(fc[1], ndims, 3, 1, 1)
+    def _atrous_stack(self, spatial, widths, **block_kwargs):
+        """Chain AtrousBlocks mapping widths[0] -> widths[1] -> ... at one resolution."""
+        return nn.Sequential(*[
+            AtrousBlock([c_in] + [spatial] * self.dimension, c_in, c_out, ndims=self.dimension, **block_kwargs)
+            for c_in, c_out in zip(widths[:-1], widths[1:])
+        ])
+    def _make_ref_grid(self, sizes):
+        """Return the index grid of shape [1, dimension, *sizes]."""
+        axes = [torch.arange(end=imsz) for imsz in sizes]
+        return torch.reshape(torch.stack(torch.meshgrid(axes), 0), [1, self.dimension]+list(sizes))
+    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+        """Scale each channel by its extent, clamp it, and scale back."""
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+                          zip(sample_coords, max_sz)], 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        """Warp ``vol`` by ``ddf`` (scaled by ``self.max_sz``) with ``F.grid_sample``."""
+        ref = self.ref_grid if ref is None else ref
+        img_sz = self.max_sz if img_sz is None else img_sz
+        resample_mode = 'bilinear'
+        return F.grid_sample(vol, torch.flip((ddf * torch.Tensor(
+            np.reshape(np.array(self.max_sz), [1, self.dimension]+[1]*self.dimension)).to(self.device) + ref).permute(
+            [0]+list(range(2,2+self.dimension))+[1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,
+                             align_corners=True)
+    def forward(self, x=None, y=None, t=None, text=None, rec_num=2, ndims=2):
+        """Run the recursive registration passes and return the total field.
+
+        Args:
+            x: Source batch; ``x.size()[2:]`` gives the spatial extents.
+            y: Conditioning batch, encoded once before the recursion.  Only
+                read when ``conditional_input`` is true.
+            t: Iterable of time-step index tensors.  The sequence is repeated
+                ``rec_num`` times, so ``rec_num * len(t)`` passes are run.
+            text: Optional text feature added at the bottleneck; falls back to
+                the all-zero ``self.text`` when ``None``.
+            rec_num: Number of repetitions of ``t``.
+            ndims: Not used by this method; ``self.dimension`` is used instead.
+
+        Returns:
+            The accumulated displacement field.
+
+        Raises:
+            ValueError: If the expanded time-step list is empty (previously an
+                unbound-variable error at the ``return``).
+        """
+        self.device = x.device
+        img_sz = x.size()[2:]
+        n = x.size()[0]
+        self.max_sz = [img_sz[0]] * self.dimension
+        ts_emb_shape = [n, -1] + [1] * self.dimension
+        self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
+        # Compare against the CACHED grid, not the constructor resolution: the
+        # old check against self.img_res reused a stale grid when a call at a
+        # different size was followed by one at the constructor size.
+        if list(self.ref_grid.shape[2:]) != list(img_sz):
+            self.ref_grid = self._make_ref_grid(img_sz)
+        self.ref_grid = self.ref_grid.to(self.device)
+        # Encode the conditioning input once.  Guarded so the model also runs
+        # with conditional_input=False (``tgt`` used to be read unconditionally).
+        tgt_down_list = []
+        tgt_mid = None
+        if self.conditional_input:
+            tgt = y
+            for i in range(self.hier_num):
+                tgt = self.block_down_cond[i](tgt)
+                tgt_down_list.append(self.copy(tgt))
+                tgt = self.down_layers[i](tgt)
+            tgt_mid = self.copy(tgt)
+            tgt_shape = tgt_mid.shape
+            # (N, C, *spatial) -> (L, N, C), the sequence-first layout for attention.
+            tgt_mid = tgt_mid.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)
+            if text is None:
+                text = self.text.to(self.device)
+        steps = [t0.to(self.device) for t0 in t]
+        steps = [t0 for _ in range(rec_num) for t0 in steps]
+        if not steps:
+            raise ValueError("forward() needs at least one time step and rec_num >= 1")
+        img = x
+        ddf = None
+        for rec_id, step in enumerate(steps):
+            t_emb = self.time_embed(step)
+            enc_list = []
+            out = img
+            for i in range(self.hier_num):
+                out = self.block_down[i](out + self.ted_layers[i](t_emb).reshape(ts_emb_shape))
+                if self.conditional_input:
+                    out = self.fuse_conv0[i](torch.cat([out, tgt_down_list[i]], axis=1))
+                enc_list.append(out)
+                out = self.down_layers[i](out)
+            out = self.b_mid(out + self.tmid(t_emb).reshape(ts_emb_shape))
+            if self.conditional_input:
+                # Cross-attention: bottleneck features query the conditioning features.
+                out_shape = out.shape
+                out_attn, _ = self.attn_layer(out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1), tgt_mid, tgt_mid)
+                out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)
+                out = out + out_attn
+                # Text branch: project to text width, add the text feature, project back.
+                out_txt = self.img2txt(out) + text
+                out_txt = self.txt_proc(out_txt)
+                out_txt = self.txt2img(out_txt)
+                out = out + out_txt
+            for i in range(self.hier_num):
+                out = torch.cat((self.up_layers[i](out), enc_list[-i-1]), dim=1)
+                out = self.block_up[i](out + self.teu_layers[i](t_emb).reshape(ts_emb_shape))
+            out = self.conv_out(out)/128
+            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = ddf_one
+            else:
+                # Compose the accumulated field with the new increment.
+                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+            # The next pass sees the ORIGINAL input warped by the total field.
+            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        """Build the two-layer MLP that projects a time embedding to ``dim_out``."""
+        return nn.Sequential(
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out)
+        )
+class RecMutAttnNet1(nn.Module):
+    """Recursive, time-conditioned U-Net with mutual image/condition fusion.
+
+    At every encoder level the image features and the conditioning features
+    are fused into each other (``fuse_conv0`` / ``fuse_conv1``); the bottleneck
+    adds multi-head attention over the conditioning features.  This variant
+    has no text branch and rebuilds its reference grid on every call.
+    """
+    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
+        super(RecMutAttnNet1, self).__init__()
+        widths = [num_input_chn, 16, 32, 64, 128, 256]
+        self.feat_channels = widths
+        self.conditional_input = conditional_input
+        self.num_heads = num_heads
+        self.text_feat_chn = text_feat_chn
+        self.dimension = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+        # Frozen sinusoidal time-step embedding table.
+        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+        self.time_embed.requires_grad_(False)
+        depth = len(widths) - 1
+        self.hier_num = depth
+        self.down_layers = nn.ModuleList()
+        self.up_layers = nn.ModuleList()
+        self.ted_layers = nn.ModuleList()
+        self.teu_layers = nn.ModuleList()
+        self.block_down = nn.ModuleList()
+        if self.conditional_input:
+            self.block_down_cond = nn.ModuleList()
+            self.fuse_conv0 = nn.ModuleList()
+            self.fuse_conv1 = nn.ModuleList()
+            self.attn_layer = nn.MultiheadAttention(widths[-1], self.num_heads)
+        self.block_up = nn.ModuleList()
+        for level in range(1, depth + 1):
+            up_idx = -level
+            enc_c = widths[level]
+            dec_c = widths[up_idx]
+            enc_res = res // (2 ** (level-1))
+            dec_res = res // (2 ** (depth-level-1))
+            # The final decoder stage keeps its width; the others step down one level.
+            dec_out = widths[up_idx] if level == depth else widths[up_idx - 1]
+            self.down_layers.append(self.Conv(enc_c, enc_c, 4, 2, 1))
+            self.up_layers.append(self.ConvT(dec_c, dec_c, 4, 2, 1))
+            self.ted_layers.append(self._make_te(time_emb_dim, widths[level-1]))
+            self.teu_layers.append(self._make_te(time_emb_dim, 2*dec_c))
+            self.block_down.append(self._atrous_stack(enc_res, [widths[level-1], enc_c, enc_c, enc_c]))
+            if self.conditional_input:
+                self.block_down_cond.append(self._atrous_stack(enc_res, [widths[level-1], enc_c, enc_c, enc_c]))
+                self.fuse_conv0.append(self.Conv(2*enc_c, enc_c, 1, 1, 0))
+                self.fuse_conv1.append(self.Conv(2*enc_c, enc_c, 1, 1, 0))
+            self.block_up.append(self._atrous_stack(dec_res, [2*dec_c, dec_c, dec_c, dec_out], normalize=False))
+        # Bottleneck
+        self.tmid = self._make_te(time_emb_dim, widths[-1])
+        mid_c = widths[depth]
+        self.b_mid = self._atrous_stack(res // (2**depth), [mid_c, mid_c, mid_c, mid_c])
+        self.conv_out = self.Conv(widths[1], ndims, 3, 1, 1)
+    def _atrous_stack(self, spatial, widths, **block_kwargs):
+        """Chain AtrousBlocks mapping widths[0] -> widths[1] -> ... at one resolution."""
+        return nn.Sequential(*[
+            AtrousBlock([c_in] + [spatial] * self.dimension, c_in, c_out, ndims=self.dimension, **block_kwargs)
+            for c_in, c_out in zip(widths[:-1], widths[1:])
+        ])
+    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+        """Scale each channel by its extent, clamp it, and scale back."""
+        limited = []
+        for chan, extent in zip(torch.split(sample_coords0, split_size_or_sections=1, dim=1), max_sz):
+            lower = minus - 1 * extent + plus
+            upper = 1 * extent - minus + plus
+            limited.append(torch.clamp(chan * extent, min=lower, max=upper) / extent)
+        return torch.cat(limited, 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        """Warp ``vol`` by ``ddf`` (scaled by ``self.max_sz``) with ``F.grid_sample``."""
+        if ref is None:
+            ref = self.ref_grid
+        if img_sz is None:
+            img_sz = self.max_sz
+        scale_shape = [1, self.dimension] + [1] * self.dimension
+        scale = torch.Tensor(np.reshape(np.array(self.max_sz), scale_shape)).to(self.device)
+        # (N, dim, *spatial) -> (N, *spatial, dim), the layout grid_sample expects.
+        coords = (ddf * scale + ref).permute([0] + list(range(2, 2 + self.dimension)) + [1])
+        grid = torch.flip(coords / img_sz - 1, dims=[-1])
+        return F.grid_sample(vol, grid, mode='bilinear', padding_mode=padding_mode,
+                             align_corners=True)
+    def forward(self, x=None, y=None, t=None, rec_num=2, ndims=2):
+        """Run ``rec_num`` registration passes and return the accumulated field.
+
+        ``x`` is the source batch, ``y`` the conditioning batch (re-encoded on
+        every pass) and ``t`` the indices looked up in ``self.time_embed``.
+        ``ndims`` is not used; ``self.dimension`` is.
+        """
+        dev = x.device
+        self.device = dev
+        spatial = x.size()[2:]
+        batch = x.size()[0]
+        self.max_sz = [spatial[0]] * self.dimension
+        emb_shape = [batch, -1] + [1] * self.dimension
+        half_extent = torch.tensor([(s - 1) / 2 for s in spatial], device=dev)
+        self.img_sz = torch.reshape(half_extent, [1] * (self.dimension + 1) + [self.dimension])
+        mesh = torch.meshgrid([torch.arange(end=s) for s in spatial])
+        self.ref_grid = torch.reshape(torch.stack(mesh, 0), [1, self.dimension] + list(spatial)).to(dev)
+        warped = x
+        t_emb = self.time_embed(t)
+        for rec_id in range(rec_num):
+            if self.conditional_input:
+                cond = y
+            skips = []
+            feat = warped
+            for lvl in range(self.hier_num):
+                feat = self.block_down[lvl](feat + self.ted_layers[lvl](t_emb).reshape(emb_shape))
+                if self.conditional_input:
+                    # Mutual fusion: condition -> image, then image -> condition.
+                    cond = self.block_down_cond[lvl](cond)
+                    feat = self.fuse_conv0[lvl](torch.cat([feat, cond], axis=1))
+                    cond = self.fuse_conv1[lvl](torch.cat([cond, feat], axis=1))
+                skips.append(feat)
+                feat = self.down_layers[lvl](feat)
+                if self.conditional_input:
+                    cond = self.down_layers[lvl](cond)
+            feat = self.b_mid(feat + self.tmid(t_emb).reshape(emb_shape))
+            if self.conditional_input:
+                # Flatten space into a sequence: (N, C, *spatial) -> (L, N, C).
+                feat_shape = feat.shape
+                cond_shape = cond.shape
+                memory = cond.view(cond_shape[0], cond_shape[1], -1).permute(2, 0, 1)
+                query = feat.view(feat_shape[0], feat_shape[1], -1).permute(2, 0, 1)
+                attended, _ = self.attn_layer(query, memory, memory)
+                feat = feat + attended.permute(1, 2, 0).contiguous().view(feat_shape)
+            for lvl, skip in enumerate(reversed(skips)):
+                feat = torch.cat((self.up_layers[lvl](feat), skip), dim=1)
+                feat = self.block_up[lvl](feat + self.teu_layers[lvl](t_emb).reshape(emb_shape))
+            step_ddf = self.boundary_limit(self.conv_out(feat)/128, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = step_ddf
+            else:
+                # Compose the accumulated field with the new increment.
+                ddf = step_ddf + self.resample(ddf, ddf=step_ddf, img_sz=self.img_sz, padding_mode="border")
+            warped = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        """Build the two-layer MLP that projects a time embedding to ``dim_out``."""
+        layers = [
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out),
+        ]
+        return nn.Sequential(*layers)
+class RecMutAttnNet(nn.Module):
+    """Recursive, time-conditioned U-Net with mutual fusion and a text branch.
+
+    Image and conditioning features are fused into each other at every encoder
+    level (``fuse_conv0`` / ``fuse_conv1``); the bottleneck adds multi-head
+    attention over the conditioning features followed by a text-feature
+    residual.  Each of the ``rec_num`` passes refines the accumulated field.
+    """
+    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
+        super(RecMutAttnNet, self).__init__()
+        self.feat_channels = [num_input_chn, 16, 32, 64, 128, 256]
+        self.conditional_input = conditional_input
+        self.num_heads = num_heads
+        self.text_feat_chn = text_feat_chn
+        self.dimension = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+        # Frozen sinusoidal time-step embedding table.
+        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+        self.time_embed.requires_grad_(False)
+        fc = self.feat_channels
+        self.hier_num = len(fc) - 1
+        self.down_layers = nn.ModuleList()
+        self.up_layers = nn.ModuleList()
+        self.ted_layers = nn.ModuleList()
+        self.teu_layers = nn.ModuleList()
+        self.block_down = nn.ModuleList()
+        self.block_up = nn.ModuleList()
+        if self.conditional_input:
+            self.block_down_cond = nn.ModuleList()
+            self.fuse_conv0 = nn.ModuleList()
+            self.fuse_conv1 = nn.ModuleList()
+            self.attn_layer = nn.MultiheadAttention(fc[-1], self.num_heads)
+            Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
+            self.global_maxpool = Global_Maxpool(1)
+            self.img2txt = self.Conv(fc[-1], self.text_feat_chn, 1, 1, 0)
+            self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
+            self.txt2img = self.Conv(self.text_feat_chn, fc[-1], 1, 1, 0)
+            # Default (all-zero) text feature used when ``forward`` gets text=None.
+            self.text = torch.zeros(1, self.text_feat_chn, *([1]*self.dimension))
+        self.img_res = [res]*self.dimension
+        # Plain attribute (not a buffer); ``forward`` moves it to the input device.
+        self.ref_grid = self._make_ref_grid(self.img_res)
+        for i in range(1, self.hier_num + 1):
+            j = -i
+            down_res = res // (2 ** (i-1))
+            # NOTE(review): for i == hier_num the exponent is -1, so this value
+            # is a float (res // 0.5).  Kept as in the original; the blocks that
+            # receive it are built with normalize=False -- confirm it is unused.
+            up_res = res // (2 ** (self.hier_num-i-1))
+            # The last decoder stage keeps fc[1] channels; earlier ones step down a level.
+            k = j if i == self.hier_num else j - 1
+            self.down_layers.append(self.Conv(fc[i], fc[i], 4, 2, 1))
+            self.up_layers.append(self.ConvT(fc[j], fc[j], 4, 2, 1))
+            self.ted_layers.append(self._make_te(time_emb_dim, fc[i-1]))
+            self.teu_layers.append(self._make_te(time_emb_dim, 2*fc[j]))
+            self.block_down.append(self._atrous_stack(down_res, [fc[i-1], fc[i], fc[i], fc[i]]))
+            if self.conditional_input:
+                self.block_down_cond.append(self._atrous_stack(down_res, [fc[i-1], fc[i], fc[i], fc[i]]))
+                self.fuse_conv0.append(self.Conv(2*fc[i], fc[i], 1, 1, 0))
+                self.fuse_conv1.append(self.Conv(2*fc[i], fc[i], 1, 1, 0))
+            self.block_up.append(self._atrous_stack(up_res, [2*fc[j], fc[j], fc[j], fc[k]], normalize=False))
+        # Bottleneck
+        self.tmid = self._make_te(time_emb_dim, fc[-1])
+        mid_c = fc[self.hier_num]
+        self.b_mid = self._atrous_stack(res // (2**self.hier_num), [mid_c, mid_c, mid_c, mid_c])
+        self.conv_out = self.Conv(fc[1], ndims, 3, 1, 1)
+    def _atrous_stack(self, spatial, widths, **block_kwargs):
+        """Chain AtrousBlocks mapping widths[0] -> widths[1] -> ... at one resolution."""
+        return nn.Sequential(*[
+            AtrousBlock([c_in] + [spatial] * self.dimension, c_in, c_out, ndims=self.dimension, **block_kwargs)
+            for c_in, c_out in zip(widths[:-1], widths[1:])
+        ])
+    def _make_ref_grid(self, sizes):
+        """Return the index grid of shape [1, dimension, *sizes]."""
+        axes = [torch.arange(end=imsz) for imsz in sizes]
+        return torch.reshape(torch.stack(torch.meshgrid(axes), 0), [1, self.dimension]+list(sizes))
+    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+        """Scale each channel by its extent, clamp it, and scale back."""
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+                          zip(sample_coords, max_sz)], 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        """Warp ``vol`` by ``ddf`` (scaled by ``self.max_sz``) with ``F.grid_sample``."""
+        ref = self.ref_grid if ref is None else ref
+        img_sz = self.max_sz if img_sz is None else img_sz
+        resample_mode = 'bilinear'
+        return F.grid_sample(vol, torch.flip((ddf * torch.Tensor(
+            np.reshape(np.array(self.max_sz), [1, self.dimension]+[1]*self.dimension)).to(self.device) + ref).permute(
+            [0]+list(range(2,2+self.dimension))+[1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,
+                             align_corners=True)
+    def forward(self, x=None, y=None, t=None, text=None, rec_num=2, ndims=2):
+        """Run ``rec_num`` registration passes and return the accumulated field.
+
+        Args:
+            x: Source batch; ``x.size()[2:]`` gives the spatial extents.
+            y: Conditioning batch, re-encoded on every pass.  Only read when
+                ``conditional_input`` is true.
+            t: Indices looked up in ``self.time_embed``.
+            text: Optional text feature, viewed as
+                ``(-1, text_feat_chn, 1, ..., 1)``; falls back to the all-zero
+                ``self.text`` when ``None``.
+            rec_num: Number of recursive passes; must be at least 1.
+            ndims: Not used by this method; ``self.dimension`` is used instead.
+
+        Returns:
+            The accumulated displacement field.
+
+        Raises:
+            ValueError: If ``rec_num < 1`` (previously an unbound-variable
+                error at the ``return``).
+        """
+        if rec_num < 1:
+            raise ValueError("rec_num must be >= 1")
+        self.device = x.device
+        img_sz = x.size()[2:]
+        n = x.size()[0]
+        self.max_sz = [img_sz[0]] * self.dimension
+        ts_emb_shape = [n, -1] + [1] * self.dimension
+        self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
+        # Compare against the CACHED grid, not the constructor resolution: the
+        # old check against self.img_res reused a stale grid when a call at a
+        # different size was followed by one at the constructor size.
+        if list(self.ref_grid.shape[2:]) != list(img_sz):
+            self.ref_grid = self._make_ref_grid(img_sz)
+        self.ref_grid = self.ref_grid.to(self.device)
+        img = x
+        t = self.time_embed(t)
+        if self.conditional_input:
+            # Resolve the text feature once; it is the same for every pass.
+            if text is None:
+                text = self.text.to(self.device)
+            text = text.view(-1, self.text_feat_chn, *([1]*self.dimension))
+        for rec_id in range(rec_num):
+            if self.conditional_input:
+                tgt = y
+            enc_list = []
+            out = img
+            for i in range(self.hier_num):
+                out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
+                if self.conditional_input:
+                    # Mutual fusion: condition -> image, then image -> condition.
+                    tgt = self.block_down_cond[i](tgt)
+                    out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
+                    tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
+                enc_list.append(out)
+                out = self.down_layers[i](out)
+                if self.conditional_input:
+                    tgt = self.down_layers[i](tgt)
+            out = self.b_mid(out + self.tmid(t).reshape(ts_emb_shape))
+            if self.conditional_input:
+                # Cross-attention over flattened space: (N, C, *spatial) -> (L, N, C).
+                out_shape = out.shape
+                tgt_shape = tgt.shape
+                tgt = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)
+                out_attn, _ = self.attn_layer(out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1), tgt, tgt)
+                out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)
+                out = out + out_attn
+                # Text branch: project to text width, add the text feature, project back.
+                out_txt = self.img2txt(out) + text
+                out_txt = self.txt_proc(out_txt)
+                out_txt = self.txt2img(out_txt)
+                out = out + out_txt
+            for i in range(self.hier_num):
+                out = torch.cat((self.up_layers[i](out), enc_list[-i-1]), dim=1)
+                out = self.block_up[i](out + self.teu_layers[i](t).reshape(ts_emb_shape))
+            out = self.conv_out(out)/128
+            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = ddf_one
+            else:
+                # Compose the accumulated field with the new increment.
+                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+            # The next pass sees the ORIGINAL input warped by the total field.
+            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        """Build the two-layer MLP that projects a time embedding to ``dim_out``."""
+        return nn.Sequential(
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out)
+        )
+class RecMulModMutAttnNet(nn.Module):
+    def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True,text_feat_chn=1024, num_heads=4):
+        super(RecMulModMutAttnNet, self).__init__()
+        # self.feat_channels = [num_input_chn, 8, 16, 32, 32, 64]
+        self.feat_channels = [num_input_chn, 16, 32, 64, 128, 256]
+        self.conditional_input = conditional_input
+        self.num_heads = num_heads
+        self.text_feat_chn = text_feat_chn
+        self.dimension = ndims
+        self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+        self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+        # Sinusoidal embedding
+        self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+        self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+        self.time_embed.requires_grad_(False)
+        self.hier_num = len(self.feat_channels) - 1
+        self.down_layers = nn.ModuleList()
+        self.up_layers = nn.ModuleList()
+        self.ted_layers = nn.ModuleList()
+        self.teu_layers = nn.ModuleList()
+        self.block_down = nn.ModuleList()
+        self.block_up = nn.ModuleList()
+        if self.conditional_input:
+            # self.gate_img = nn.ModuleList()
+            self.txt_layers = nn.ModuleList()
+            self.block_down_cond = nn.ModuleList()
+            self.fuse_conv0 = nn.ModuleList()
+            self.fuse_conv1 = nn.ModuleList()
+            self.attn_layer = nn.MultiheadAttention(self.feat_channels[-1], self.num_heads)
+            Global_Maxpool = getattr(nn, 'AdaptiveMaxPool%dd' % self.dimension)
+            self.global_maxpool = Global_Maxpool(1)
+            self.img2txt = self.Conv(self.feat_channels[-1], self.text_feat_chn, 1, 1, 0)
+            self.txt_proc = AtrousBlock([self.text_feat_chn] + [1] * ndims, self.text_feat_chn, self.text_feat_chn, ndims=ndims, normalize=False, atrous_rates=[0, 0])
+            self.txt2img = self.Conv(self.text_feat_chn, self.feat_channels[-1], 1, 1, 0)
+            # self.text = torch.zeros(1, self.text_feat_chn, *([1]*self.dimension))
+            self.text = torch.zeros(1, self.text_feat_chn)
+        self.img_res = [res]*self.dimension
+        self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in self.img_res]), 0),
+                                      [1, self.dimension]+list(self.img_res))
+        for i in range(1, self.hier_num + 1):
+            j=-i
+            self.down_layers.append(self.Conv(self.feat_channels[i], self.feat_channels[i], 4, 2, 1))
+            self.up_layers.append(self.ConvT(self.feat_channels[j], self.feat_channels[j], 4, 2, 1))
+            self.ted_layers.append(self._make_te(time_emb_dim, self.feat_channels[i-1]))
+            self.teu_layers.append(self._make_te(time_emb_dim, 2*self.feat_channels[j]))
+            self.block_down.append(nn.Sequential(
+                AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
+                AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
+                AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
+            ))
+            if self.conditional_input:
+                # self.gate_img.append(nn.Sequential(
+                #     nn.ConvNd(self.dimension, self.feat_channels[i], self.feat_channels[i], kernel_size=1, stride=1, padding=0),
+                #     nn.Sigmoid()
+                # ))
+                self.txt_layers.append((self._make_te(self.text_feat_chn, self.feat_channels[i])))
+                self.block_down_cond.append(nn.Sequential(
+                    AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
+                    AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
+                    AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
+                ))
+                self.fuse_conv0.append(self.Conv(2*self.feat_channels[i], self.feat_channels[i], 1, 1, 0))
+                self.fuse_conv1.append(self.Conv(2*self.feat_channels[i], self.feat_channels[i], 1, 1, 0))
+            if i==self.hier_num:
+                k=j
+            else:
+                k=j-1
+            self.block_up.append(nn.Sequential(
+                AtrousBlock([2*self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, 2*self.feat_channels[j], self.feat_channels[j], ndims=ndims, normalize=False),
+                AtrousBlock([self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, self.feat_channels[j], self.feat_channels[j], ndims=ndims, normalize=False),
+                AtrousBlock([self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, self.feat_channels[j], self.feat_channels[k], ndims=ndims, normalize=False)
+            ))
+        # Bottleneck
+        self.txt_layers.append((self._make_te(self.text_feat_chn, self.text_feat_chn)))
+        self.tmid = self._make_te(time_emb_dim, self.feat_channels[-1])
+        self.b_mid = nn.Sequential(
+            AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
+            AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
+            AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims)
+        )
+        self.conv_out = self.Conv(self.feat_channels[1], ndims, 3, 1, 1)
+    def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+                          zip(sample_coords, max_sz)], 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        ref = self.ref_grid if ref is None else ref
+        img_sz = self.max_sz if img_sz is None else img_sz
+        resample_mode = 'bilinear' # if self.dimension==2 else 'trilinear'
+        return F.grid_sample(vol, torch.flip((ddf * torch.Tensor(
+            np.reshape(np.array(self.max_sz), [1, self.dimension]+[1]*self.dimension)).to(self.device) + ref).permute(
+            [0]+list(range(2,2+self.dimension))+[1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,
+                             align_corners=True)
+    def forward(self, x=None, y=None, t=None, text=None, rec_num=2, ndims=2):
+        self.device = x.device
+        img_sz = x.size()[2:]
+        n = x.size()[0]
+        self.max_sz = [img_sz[0]] * self.dimension
+        ts_emb_shape=[n,-1]+[1]*self.dimension
+        self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
+        if list(img_sz) != self.img_res:
+            # print ("Reinitialize the ref_grid to match the model's input image size.")
+            # print(img_sz, self.img_res)
+            self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in img_sz]), 0),
+                                        [1, self.dimension]+list(img_sz))
+        self.ref_grid = self.ref_grid.to(self.device)
+        img = x
+        t = self.time_embed(t)
+        if text is None:
+            text = self.text
+            # print(text.shape)
+            text = text.to(self.device)
+            txt_shape = [1,-1]+[1]*self.dimension
+        else:
+            txt_shape = [n,-1]+[1]*self.dimension
+        for rec_id in range(rec_num):
+            if self.conditional_input:
+                tgt = y
+            enc_list = []
+            out = img
+            for i in range(self.hier_num):
+                out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
+                if self.conditional_input:
+                    tgt = self.block_down_cond[i](tgt) + self.txt_layers[i](text).reshape(txt_shape)
+                    out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
+                    tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
+                enc_list.append(out)
+                out = self.down_layers[i](out)
+                if self.conditional_input:
+                    tgt = self.down_layers[i](tgt)
+            out = self.b_mid(out + self.tmid(t).reshape(ts_emb_shape))
+            if self.conditional_input:
+                # out += self.attn_layer(out, tgt, tgt)[0]
+                out_shape = out.shape
+                tgt_shape = tgt.shape
+                # out = out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
+                tgt = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)  # (N, C, H, W) -> (H*W, N, C)
+                out_attn, _ = self.attn_layer(out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1), tgt, tgt)
+                out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)  # (H*W, N, C) -> (N, C, H, W)
+                out = out + out_attn
+            if self.conditional_input:
+                # text = text.view(-1, self.text_feat_chn, *([1]*self.dimension))
+                # out_txt = self.img2txt(out) + text.reshape(txt_shape)
+                img_txt_feat = self.img2txt(out)
+                self.img_embd = self.global_maxpool(img_txt_feat).view(n, -1)  # [B, 1024]
+                out_txt = self.txt_layers[-1](text).reshape(txt_shape) + img_txt_feat
+                out_txt = self.txt_proc(out_txt)
+                out_txt = self.txt2img(out_txt)
+                out = out + out_txt
+            for i in range(self.hier_num):
+                out = torch.cat((self.up_layers[i](out),enc_list[-i-1]), dim=1)
+                out = self.block_up[i](out + self.teu_layers[i](t).reshape(ts_emb_shape))
+            out = self.conv_out(out)/128
+            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = ddf_one
+            else:
+                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        # print(torch.max(torch.abs(ddf)))
+        return ddf
+    def _make_te(self, dim_in, dim_out):
+        return nn.Sequential(
+            nn.Linear(dim_in, dim_out),
+            nn.ReLU(),
+            nn.Linear(dim_out, dim_out)
+        )
+# class RecMutAttnNet(nn.Module):
+#     def __init__(self, n_steps=1000, time_emb_dim=100, ndims=2, num_input_chn=1, res=0, conditional_input=True):
+#         super(RecMutAttnNet, self).__init__()
+#         self.feat_channels = [num_input_chn, 8, 16, 32, 32, 64]
+#         self.conditional_input = conditional_input
+#         self.dimension = ndims
+#         self.Conv = getattr(nn, 'Conv%dd' % self.dimension)
+#         self.ConvT = getattr(nn, 'ConvTranspose%dd' % self.dimension)
+#         # Sinusoidal embedding
+#         self.time_embed = nn.Embedding(n_steps, time_emb_dim)
+#         self.time_embed.weight.data = sinusoidal_embedding(n_steps, time_emb_dim)
+#         self.time_embed.requires_grad_(False)
+#         self.hier_num = len(self.feat_channels) - 1
+#         self.down_layers = nn.ModuleList()
+#         self.up_layers = nn.ModuleList()
+#         self.ted_layers = nn.ModuleList()
+#         self.teu_layers = nn.ModuleList()
+#         self.block_down = nn.ModuleList()
+#         if self.conditional_input:
+#             self.block_down_cond = nn.ModuleList()
+#             self.fuse_conv0 = nn.ModuleList()
+#             self.fuse_conv1 = nn.ModuleList()
+#         self.block_up = nn.ModuleList()
+#         for i in range(1, self.hier_num + 1):
+#             j=-i
+#             self.down_layers.append(self.Conv(self.feat_channels[i], self.feat_channels[i], 4, 2, 1))
+#             self.up_layers.append(self.ConvT(self.feat_channels[j], self.feat_channels[j], 4, 2, 1))
+#             self.ted_layers.append(self._make_te(time_emb_dim, self.feat_channels[i-1]))
+#             self.teu_layers.append(self._make_te(time_emb_dim, 2*self.feat_channels[j]))
+#             self.block_down.append(nn.Sequential(
+#                 AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
+#                 AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
+#                 AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
+#             ))
+#             if self.conditional_input:
+#                 self.block_down_cond.append(nn.Sequential(
+#                     AtrousBlock([self.feat_channels[i-1]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i-1], self.feat_channels[i], ndims=ndims),
+#                     AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims),
+#                     AtrousBlock([self.feat_channels[i]] + [res // (2 ** (i-1))] * ndims, self.feat_channels[i], self.feat_channels[i], ndims=ndims)
+#                 ))
+#                 self.fuse_conv0.append(self.Conv(2*self.feat_channels[i], self.feat_channels[i], 1, 1, 0))
+#                 self.fuse_conv1.append(self.Conv(2*self.feat_channels[i], self.feat_channels[i], 1, 1, 0))
+#             if i==self.hier_num:
+#                 k=j
+#             else:
+#                 k=j-1
+#             self.block_up.append(nn.Sequential(
+#                 AtrousBlock([2*self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, 2*self.feat_channels[j], self.feat_channels[j], ndims=ndims, normalize=False),
+#                 AtrousBlock([self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, self.feat_channels[j], self.feat_channels[j], ndims=ndims, normalize=False),
+#                 AtrousBlock([self.feat_channels[j]] + [res // (2 ** (self.hier_num-i-1))] * ndims, self.feat_channels[j], self.feat_channels[k], ndims=ndims, normalize=False)
+#             ))
+#         # Bottleneck
+#         self.tmid = self._make_te(time_emb_dim, self.feat_channels[-1])
+#         self.b_mid = nn.Sequential(
+#             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
+#             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims),
+#             AtrousBlock([self.feat_channels[self.hier_num]] + [res // (2**self.hier_num)] * ndims, self.feat_channels[self.hier_num], self.feat_channels[self.hier_num], ndims=ndims)
+#         )
+#         self.conv_out = self.Conv(self.feat_channels[1], ndims, 3, 1, 1)
+#     def boundary_limit(self, sample_coords0, max_sz, plus=0., minus=1.):
+#         sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+#         return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+#                           zip(sample_coords, max_sz)], 1)
+#     def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+#         ref = self.ref_grid if ref is None else ref
+#         img_sz = self.max_sz if img_sz is None else img_sz
+#         resample_mode = 'bilinear' # if self.dimension==2 else 'trilinear'
+#         return F.grid_sample(vol, torch.flip((ddf * torch.Tensor(
+#             np.reshape(np.array(self.max_sz), [1, self.dimension]+[1]*self.dimension)).to(self.device) + ref).permute(
+#             [0]+list(range(2,2+self.dimension))+[1]) / img_sz - 1, dims=[-1]), mode=resample_mode, padding_mode=padding_mode,
+#                              align_corners=True)
+#     def forward(self, x=None, y=None, t=None, rec_num=2, ndims=2):
+#         self.device = x.device
+#         img_sz = x.size()[2:]
+#         n = x.size()[0]
+#         self.max_sz = [img_sz[0]] * self.dimension
+#         ts_emb_shape=[n,-1]+[1]*self.dimension
+#         self.img_sz = torch.reshape(torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=self.device), [1]*(self.dimension+1)+[self.dimension])
+#         self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=imsz) for imsz in img_sz]), 0),
+#                                       [1, self.dimension]+list(img_sz)).to(self.device)
+#         img = x
+#         t = self.time_embed(t)
+#         for rec_id in range(rec_num):
+#             if self.conditional_input:
+#                 tgt = y
+#             enc_list = []
+#             out = img
+#             for i in range(self.hier_num):
+#                 out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
+#                 if self.conditional_input:
+#                     tgt = self.block_down_cond[i](tgt)
+#                     out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
+#                     tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
+#                 enc_list.append(out)
+#                 out = self.down_layers[i](out)
+#                 if self.conditional_input:
+#                     tgt = self.down_layers[i](tgt)
+#             out = self.b_mid(out + self.tmid(t).reshape(ts_emb_shape))
+#             if self.conditional_input:
+#                 out = out + tgt
+#             for i in range(self.hier_num):
+#                 out = torch.cat((self.up_layers[i](out),enc_list[-i-1]), dim=1)
+#                 out = self.block_up[i](out + self.teu_layers[i](t).reshape(ts_emb_shape))
+#             out = self.conv_out(out)/128
+#             ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+#             if rec_id == 0:
+#                 ddf = ddf_one
+#             else:
+#                 ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+#             img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+#         return ddf
+#     def _make_te(self, dim_in, dim_out):
+#         return nn.Sequential(
+#             nn.Linear(dim_in, dim_out),
+#             nn.ReLU(),
+#             nn.Linear(dim_out, dim_out)
+#         )
+# ==============================================
+# Layers
+# ==============================================
+def ddf_multiplier(dvf,mul_num=10,stn=None):
+    ddf=dvf
+    for i in range(mul_num):
+        ddf = dvf + stn(ddf, dvf)
+    return ddf
+def composite(ddfs,stn=None):
+    if stn is None:
+        stn = STN(device=ddfs[0].device,padding_mode="border")
+    comp_ddf=ddfs[0]
+    for i in range(1,len(ddfs)):
+        comp_ddf = ddfs[i] + stn(comp_ddf,ddfs[i])
+    return comp_ddf
+class STN(nn.Module):
+    def __init__(self,ndims=2,img_sz=None,max_sz=None,device=None,padding_mode="border",resample_mode=None):
+        super(STN, self).__init__()
+        self.ndims=ndims
+        self.img_sz=[img_sz]*ndims
+        # self.img_sz=img_sz
+        self.device = device
+        self.padding_mode = padding_mode
+        # max_sz=[128]*self.ndims
+        max_sz=[img_sz]*self.ndims
+        # max_sz=img_sz
+        # max_sz=img_sz if max_sz is None else ([128,128] if img_sz is None else img_sz)
+        # self.max_sz=torch.Tensor(np.reshape(np.array(max_sz), [1, self.ndims, 1, 1])).to(self.device)
+        self.max_sz=torch.Tensor(np.reshape(np.array(max_sz), [1, self.ndims]+[1]*self.ndims)).to(self.device)
+        self.resample_mode=resample_mode
+        if self.img_sz is not None:
+            self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=s) for s in self.img_sz]), 0),
+                                        [1, self.ndims] + self.img_sz).to(self.device)
+        return
+    def max_limit(self, sample_coords0, plus=0., minus=1.):
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        # return tf.stack([tf.maximum(tf.minimum(x, sz - minus + plus), 0 + plus) for x, sz in zip(sample_coords, input_size0)],-1)
+        return torch.cat([torch.clamp(x * sz, min=minus - 1 * sz + plus, max=1 * sz - minus + plus) / sz for x, sz in
+                        zip(sample_coords, self.max_sz)], 1)
+    def boundary_limit(self, sample_coords0, plus=0., minus=1.):
+        sample_coords = torch.split(sample_coords0, split_size_or_sections=1, dim=1)
+        # return tf.stack([tf.maximum(tf.minimum(x, sz - minus + plus), 0 + plus) for x, sz in zip(sample_coords, input_size0)],-1)
+        return torch.cat([(torch.clamp(x * sz+ref, min=minus - 1 * sz + plus, max=1 * sz - minus + plus)-ref) / sz for x, sz,ref in
+                        zip(sample_coords, self.max_sz, self.ref_grid)], 1)
+    def resample(self, vol, ddf, ref=None, img_sz=None,padding_mode = "zeros"):
+        # print(vol.device, ddf.device)
+        # print(self.device)
+        # print('===================')
+        device = ddf.device
+        ref = self.ref_grid if ref is None else ref
+        if img_sz is None:
+            img_sz = self.max_sz
+        else:
+            img_sz = torch.reshape(torch.tensor([(s - 1) / 2. for s in img_sz], device=device), [1]+[1]*self.ndims+[self.ndims])
+        # resample_mode = 'bicubic'
+        if self.resample_mode is None:
+            resample_mode = 'bilinear' # if self.ndims==2 else 'trilinear'
+        else:
+            resample_mode=self.resample_mode
+        # padding_mode = "border"
+        # print(ddf.shape, ref.shape)
+        return F.grid_sample(vol.to(device), torch.flip((ddf * self.max_sz.to(device) + ref.to(device)).permute(
+            [0] + list(range(2, 2 + self.ndims)) + [1]) / img_sz - 1, dims=[-1]), mode=resample_mode,
+                            padding_mode=padding_mode,
+                            align_corners=True)
+    def forward(self,x,ddf):
+        self.device = x.device if self.device is None else self.device
+        if self.img_sz is None:
+            self.img_sz = list(x.size()[2:]).to(self.device)
+            self.ref_grid = torch.reshape(torch.stack(torch.meshgrid([torch.arange(end=s) for s in self.img_sz]), 0),[1, self.ndims]+self.img_sz).to(self.device)
+        resampled_x = self.resample(x, ddf=ddf, img_sz=self.img_sz, padding_mode=self.padding_mode)
+        return resampled_x
+if __name__ == '__main__':
+    ndims = 3
+    res = 128
+    x = torch.rand([1, 1] + [res]*ndims)
+    t = torch.randint(0, 1000, (1,))
+    text = torch.rand([1, 1024] + [1]*ndims)
+    model = RecMutAttnNet(n_steps=1000, time_emb_dim=100, ndims=ndims, num_input_chn=1, res=res, conditional_input=True)
+    y = model(x, x, t, text=text)
+    print("Ouput shape", y.shape)
+    # Total parameters
+    total_params = sum(p.numel() for p in model.parameters())
+    # Trainable parameters only
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"Total parameters: {total_params}")
+    print(f"Trainable parameters: {trainable_params}")

Diffusion/networks_opt.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""
+networks_opt.py — Optimized network components.
+Subclasses RecMulModMutAttnNet and STN to eliminate per-call overhead:
+  1. OptSTN: register_buffer for ref_grid/max_sz — no .to(device) per call
+  2. OptRecMulModMutAttnNet: cached max_sz/img_sz tensors, ref_grid device —
+     eliminates ~80 NumPy→GPU transfers and ~32 tensor recreations per registration step
+All optimizations are mathematically equivalent to the originals.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from Diffusion.networks import RecMulModMutAttnNet, STN
+# ======================================================================
+# Optimized STN
+# ======================================================================
+class OptSTN(STN):
+    """STN with register_buffer for automatic device transfer.
+    Eliminates per-call .to(device) overhead in resample() and forward().
+    Buffers auto-transfer when module.to(device) is called.
+    """
+    def __init__(self, ndims=2, img_sz=None, max_sz=None, device=None,
+                 padding_mode="border", resample_mode=None):
+        # Skip parent __init__ to avoid creating plain tensor attributes
+        nn.Module.__init__(self)
+        self.ndims = ndims
+        self.img_sz = [img_sz] * ndims
+        self.device = device
+        self.padding_mode = padding_mode
+        self.resample_mode = resample_mode
+        # OPT: register_buffer — auto device transfer, no per-call .to()
+        max_sz_val = [img_sz] * ndims
+        max_sz_tensor = torch.Tensor(
+            np.reshape(np.array(max_sz_val), [1, self.ndims] + [1] * self.ndims)
+        )
+        self.register_buffer('max_sz', max_sz_tensor)
+        if self.img_sz is not None:
+            ref_grid = torch.reshape(
+                torch.stack(torch.meshgrid(
+                    [torch.arange(end=s) for s in self.img_sz]
+                ), 0),
+                [1, self.ndims] + self.img_sz
+            )
+            self.register_buffer('ref_grid', ref_grid)
+            # OPT: pre-compute the img_sz tensor used when forward() calls resample()
+            img_sz_for_resample = torch.reshape(
+                torch.tensor([(s - 1) / 2. for s in self.img_sz]),
+                [1] + [1] * self.ndims + [self.ndims]
+            )
+            self.register_buffer('_img_sz_for_resample', img_sz_for_resample)
+        # OPT: pre-compute constant permutation order
+        self._perm = [0] + list(range(2, 2 + self.ndims)) + [1]
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        # OPT: no .to(device) — buffers auto-transfer with module.to()
+        ref = self.ref_grid if ref is None else ref
+        if img_sz is None:
+            img_sz_t = self.max_sz
+        else:
+            # Use pre-computed tensor for the common case (called from forward)
+            img_sz_t = self._img_sz_for_resample
+        resample_mode = 'bilinear' if self.resample_mode is None else self.resample_mode
+        grid = torch.flip(
+            (ddf * self.max_sz + ref).permute(self._perm) / img_sz_t - 1,
+            dims=[-1]
+        )
+        return F.grid_sample(vol, grid, mode=resample_mode,
+                             padding_mode=padding_mode, align_corners=True)
+    def forward(self, x, ddf):
+        # OPT: no device check or ref_grid regeneration — buffers handle it
+        return self.resample(x, ddf=ddf, img_sz=self.img_sz,
+                             padding_mode=self.padding_mode)
+# ======================================================================
+# Optimized RecMulModMutAttnNet
+# ======================================================================
+class OptRecMulModMutAttnNet(RecMulModMutAttnNet):
+    """RecMulModMutAttnNet with cached tensors for resample/forward.
+    Eliminates per-call overhead:
+      - resample(): cached max_sz tensor (was: NumPy→Torch→GPU every call)
+      - forward(): cached img_sz tensor and ref_grid device placement
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Cache slots — populated on first forward
+        self._cached_input_key = None
+        self._cached_max_sz_tensor = None
+        self._cached_img_sz_tensor = None
+        # OPT: pre-compute constant permutation order
+        self._perm = [0] + list(range(2, 2 + self.dimension)) + [1]
+    def _ensure_cache(self, img_sz, device):
+        """Populate cached tensors if input size or device changed."""
+        key = (tuple(img_sz), device)
+        if key == self._cached_input_key:
+            return
+        self._cached_input_key = key
+        max_sz_list = [img_sz[0]] * self.dimension
+        self.max_sz = max_sz_list
+        # OPT: create max_sz tensor ONCE, reuse across all resample() calls
+        self._cached_max_sz_tensor = torch.Tensor(
+            np.reshape(np.array(max_sz_list), [1, self.dimension] + [1] * self.dimension)
+        ).to(device)
+        # OPT: create img_sz tensor ONCE per size change
+        self._cached_img_sz_tensor = torch.reshape(
+            torch.tensor([(imsz - 1) / 2 for imsz in img_sz], device=device),
+            [1] * (self.dimension + 1) + [self.dimension]
+        )
+        # OPT: ref_grid — only regenerate if size changed, only .to() if needed
+        if list(img_sz) != self.img_res:
+            self.ref_grid = torch.reshape(
+                torch.stack(torch.meshgrid(
+                    [torch.arange(end=imsz) for imsz in img_sz]
+                ), 0),
+                [1, self.dimension] + list(img_sz)
+            ).to(device)
+        elif self.ref_grid.device != torch.device(device):
+            self.ref_grid = self.ref_grid.to(device)
+    def resample(self, vol, ddf, ref=None, img_sz=None, padding_mode="zeros"):
+        # OPT: use cached max_sz tensor instead of NumPy→Torch→GPU every call
+        ref = self.ref_grid if ref is None else ref
+        img_sz = self._cached_img_sz_tensor if img_sz is not None else self._cached_max_sz_tensor
+        grid = torch.flip(
+            (ddf * self._cached_max_sz_tensor + ref).permute(self._perm) / img_sz - 1,
+            dims=[-1]
+        )
+        return F.grid_sample(vol, grid, mode='bilinear',
+                             padding_mode=padding_mode, align_corners=True)
+    def forward(self, x=None, y=None, t=None, text=None, rec_num=2, ndims=2):
+        """Run ``rec_num`` refinement passes and return the accumulated field ``ddf``.
+        Each pass runs the down blocks, the middle block and the up blocks on
+        the current image ``img`` (initially ``x``), turns the network output
+        into an incremental field ``ddf_one``, combines it with the field from
+        the previous passes, and re-warps the original ``x`` with the result.
+        Args:
+            x: input tensor; batch size is ``x.size()[0]`` and the spatial size
+                is ``x.size()[2:]``. Its device becomes ``self.device``.
+            y: conditioning tensor, only read when ``self.conditional_input``
+                is truthy.
+            t: time-step input, passed through ``self.time_embed``.
+            text: text embedding; when ``None`` the module-level ``self.text``
+                is used and reshaped with a leading dimension of 1.
+            rec_num: number of recurrent passes. NOTE(review): with
+                ``rec_num=0`` the loop never assigns ``ddf`` and the final
+                ``return`` would fail; confirm callers always pass >= 1.
+            ndims: not referenced anywhere in this method body.
+        Returns:
+            The field ``ddf`` accumulated over all passes.
+        Side effects:
+            Sets ``self.device`` and ``self.img_sz``; when conditional input is
+            enabled also sets ``self.img_embd`` on every pass.
+        """
+        self.device = x.device
+        img_sz = x.size()[2:]
+        n = x.size()[0]
+        # Broadcast shape for time embeddings: (batch, channels, 1, ..., 1).
+        ts_emb_shape = [n, -1] + [1] * self.dimension
+        # OPT: cache tensors — only recreate if input size/device changes
+        self._ensure_cache(img_sz, self.device)
+        self.img_sz = self._cached_img_sz_tensor
+        img = x
+        t = self.time_embed(t)
+        if text is None:
+            # Fall back to the embedding stored on the module; it is reshaped
+            # with a leading dimension of 1 rather than the batch size.
+            text = self.text
+            text = text.to(self.device)
+            txt_shape = [1, -1] + [1] * self.dimension
+        else:
+            txt_shape = [n, -1] + [1] * self.dimension
+        for rec_id in range(rec_num):
+            if self.conditional_input:
+                # The conditioning branch restarts from the raw ``y`` on every pass.
+                tgt = y
+            enc_list = []
+            out = img
+            # ---- Down path: one block + one down layer per hierarchy level ----
+            for i in range(self.hier_num):
+                out = self.block_down[i](out + self.ted_layers[i](t).reshape(ts_emb_shape))
+                if self.conditional_input:
+                    tgt = self.block_down_cond[i](tgt) + self.txt_layers[i](text).reshape(txt_shape)
+                    # Mutual fusion: ``out`` is updated from (out, tgt) first, then
+                    # ``tgt`` is updated from (tgt, already-updated out).
+                    out = self.fuse_conv0[i](torch.cat([out, tgt], axis=1))
+                    tgt = self.fuse_conv1[i](torch.cat([tgt, out], axis=1))
+                # Keep the pre-downsampling features for the skip connections.
+                enc_list.append(out)
+                out = self.down_layers[i](out)
+                if self.conditional_input:
+                    # The same down layer is applied to both branches.
+                    tgt = self.down_layers[i](tgt)
+            # ---- Middle block ----
+            out = self.b_mid(out + self.tmid(t).reshape(ts_emb_shape))
+            if self.conditional_input:
+                out_shape = out.shape
+                tgt_shape = tgt.shape
+                # Flatten the spatial dims and move them to the front:
+                # (B, C, *spatial) -> (prod(spatial), B, C).
+                out_flat = out.view(out_shape[0], out_shape[1], -1).permute(2, 0, 1)
+                tgt_flat = tgt.view(tgt_shape[0], tgt_shape[1], -1).permute(2, 0, 1)
+                # Cross attention in both directions (query from one branch,
+                # key/value from the other); the second return value is discarded.
+                out_attn, _ = self.attn_layer0(out_flat, tgt_flat, tgt_flat)
+                tgt_attn, _ = self.attn_layer1(tgt_flat, out_flat, out_flat)
+                # Undo the flattening so the results can be added residually.
+                out_attn = out_attn.permute(1, 2, 0).contiguous().view(out_shape)
+                tgt_attn = tgt_attn.permute(1, 2, 0).contiguous().view(tgt_shape)
+                out = out + out_attn
+                tgt = tgt + tgt_attn
+                out = self.fuse(torch.cat([out, tgt], dim=1))
+            if self.conditional_input:
+                img_txt_feat = self.img2txt(out)
+                # Side effect: the pooled feature is stored on the module,
+                # reshaped to (batch, -1).
+                self.img_embd = self.global_maxpool(img_txt_feat).view(n, -1)
+                out_txt = self.txt_layers[-1](text).reshape(txt_shape) + img_txt_feat
+                out_txt = self.txt_proc(out_txt)
+                out_txt = self.txt2img(out_txt)
+                out = out + out_txt
+            # ---- Up path: upsample, concatenate the matching skip, run the block ----
+            for i in range(self.hier_num):
+                out = torch.cat((self.up_layers[i](out), enc_list[-i - 1]), dim=1)
+                out = self.block_up[i](out + self.teu_layers[i](t).reshape(ts_emb_shape))
+            # The raw network output is scaled down by a fixed factor of 128.
+            out = self.conv_out(out) / 128
+            ddf_one = self.boundary_limit(out, max_sz=1 * self.max_sz)
+            if rec_id == 0:
+                ddf = ddf_one
+            else:
+                # Combine with the previous field: the old field is resampled by
+                # the new increment (border padding) and the increment is added.
+                ddf = ddf_one + self.resample(ddf, ddf=ddf_one, img_sz=self.img_sz, padding_mode="border")
+            # Always warp the ORIGINAL input with the accumulated field, so
+            # interpolation is applied once per pass rather than compounded.
+            img = self.resample(x, ddf=ddf, img_sz=self.img_sz)
+        return ddf
+# ======================================================================
+# Factory function
+# ======================================================================
+def get_net_opt(name):
+    """Return optimized network class if available, else fall back to original."""
+    if name == "recmulmodmutattnnet":
+        return OptRecMulModMutAttnNet
+    # Fall back to original for other network types
+    from Diffusion.networks import get_net
+    return get_net(name)

Diffusion/safe_conv_transpose.py ADDED Viewed

	@@ -0,0 +1,401 @@

+"""
+SafeConvTranspose3d: Drop-in replacement for nn.ConvTranspose3d that avoids
+the XPU memory leak in the ConvTranspose3d backward pass (oneDNN autograd bug).
+Mathematical Background
+=======================
+ConvTranspose3d (a.k.a. "transposed convolution" or "fractionally-strided
+convolution") with parameters:
+    in_channels=C_in, out_channels=C_out, kernel_size=K, stride=S, padding=P
+is the gradient (adjoint) of Conv3d with the same parameters. For an input x
+of shape [B, C_in, D, H, W], the output has shape:
+    [B, C_out, S*(D-1) + K - 2*P, S*(H-1) + K - 2*P, S*(W-1) + K - 2*P]
+For our specific case (K=4, S=2, P=1):
+    output_size = 2*(D-1) + 4 - 2 = 2*D  (likewise for H, W)
+The operation is mathematically equivalent to:
+    1. Stride insertion: insert (S-1) zeros between each input element
+    2. Padding: pad with (K - P - 1) zeros on each side
+    3. Regular Conv3d with spatially-flipped, channel-transposed weight
+Specifically:
+Step 1 - Stride insertion:
+    Input [B, C_in, D, H, W] -> [B, C_in, S*(D-1)+1, S*(H-1)+1, S*(W-1)+1]
+    For S=2: [B, C_in, 2*D-1, 2*H-1, 2*W-1]
+    Original values placed at positions 0, S, 2S, ... ; zeros elsewhere.
+Step 2 - Padding:
+    Pad each spatial dimension with (K - P - 1) zeros on each side.
+    For K=4, P=1: pad = 2 on each side.
+    Shape becomes: [B, C_in, 2*D+3, 2*H+3, 2*W+3]
+Step 3 - Conv3d with transformed weight:
+    ConvTranspose3d weight shape: [C_in, C_out, K, K, K]
+    Equivalent Conv3d weight: weight.flip(2,3,4).transpose(0,1)
+    -> shape [C_out, C_in, K, K, K]
+    Conv3d(stride=1, padding=0) on the padded input gives:
+    [B, C_out, (2*D+3 - K + 1), ...] = [B, C_out, 2*D, 2*H, 2*W]  (correct!)
+Why this is safe on XPU:
+    The forward uses F.pad (ZERO leak) and F.conv3d (negligible leak).
+    The backward is computed automatically by PyTorch's autograd through these
+    same safe ops — no ConvTranspose3d backward kernel is ever invoked.
+    Specifically:
+    - F.conv3d backward -> uses Conv3d backward (safe, 0.004 GiB/step)
+    - F.pad backward -> tensor slicing (trivially safe)
+    - Stride insertion backward -> gather at stride positions (trivially safe)
+    - weight.flip().transpose() backward -> indexing (trivially safe)
+Forward precision:
+    Not bit-for-bit identical to nn.ConvTranspose3d due to different summation
+    order (stride-insert + pad + conv3d vs native transposed conv), but the
+    difference is negligible: max absolute diff < 5e-7 in float32, no elements
+    exceeding 1e-6. This is only a few float32 ULPs (machine epsilon is about
+    1.19e-7) for activations of magnitude O(1).
+Backward precision:
+    Gradients match nn.ConvTranspose3d within 1e-5 (input) and 1e-4 (weight)
+    for float32. Verified across all channel configurations used in the
+    codebase (16-256 channels).
+Implementation choices:
+    We also provide SafeConvTranspose3d_v2 which uses a custom autograd function
+    to call F.conv_transpose3d in the forward (bit-for-bit identical) but
+    replaces the backward with safe Conv3d-based gradient computation.
+    RECOMMENDATION: Use SafeConvTranspose3d (V1, decomposed forward) because:
+    - Simpler implementation with no custom autograd
+    - Fully transparent to PyTorch's autograd
+    - Compatible with gradient checkpointing, torch.compile, etc.
+    - The ~5e-7 forward precision loss is negligible for training
+    - V2's custom autograd requires careful maintenance and is fragile
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+# =============================================================================
+# Approach 1 (RECOMMENDED): Decomposed forward pass
+# =============================================================================
+class SafeConvTranspose3d(nn.Module):
+    """Drop-in replacement for nn.ConvTranspose3d that decomposes the operation
+    into stride insertion + padding + regular Conv3d.
+    All operations in forward (and thus all backward ops via autograd) are
+    safe on XPU: no ConvTranspose3d backward kernel is invoked.
+    Supports: kernel_size, stride, padding (scalar or tuple), bias, groups=1.
+    Does NOT support: output_padding, dilation != 1, groups != 1.
+    The weight tensor has the SAME shape as nn.ConvTranspose3d:
+        [in_channels, out_channels, *kernel_size]
+    so checkpoints can be loaded directly with load_state_dict().
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros'):
+        super().__init__()
+        if groups != 1:
+            raise NotImplementedError("SafeConvTranspose3d only supports groups=1")
+        if output_padding != 0:
+            raise NotImplementedError("SafeConvTranspose3d does not support output_padding")
+        # Normalize to tuples
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size, kernel_size)
+        if isinstance(stride, int):
+            stride = (stride, stride, stride)
+        if isinstance(padding, int):
+            padding = (padding, padding, padding)
+        if isinstance(dilation, int):
+            dilation = (dilation, dilation, dilation)
+        if dilation != (1, 1, 1):
+            raise NotImplementedError("SafeConvTranspose3d does not support dilation != 1")
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.groups = groups
+        # Weight shape matches ConvTranspose3d: [in_channels, out_channels, *kernel_size]
+        self.weight = nn.Parameter(
+            torch.empty(in_channels, out_channels, *kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_channels))
+        else:
+            self.register_parameter('bias', None)
+        # Initialize weights same as nn.ConvTranspose3d
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+            if fan_in != 0:
+                bound = 1 / fan_in**0.5
+                nn.init.uniform_(self.bias, -bound, bound)
+    def forward(self, x):
+        B, C_in, D, H, W = x.shape
+        sd, sh, sw = self.stride
+        kd, kh, kw = self.kernel_size
+        pd, ph, pw = self.padding
+        # Step 1: Stride insertion — place input values at stride positions,
+        # zeros elsewhere. This is the "fractionally-strided" part.
+        if sd > 1 or sh > 1 or sw > 1:
+            D_ins = sd * (D - 1) + 1
+            H_ins = sh * (H - 1) + 1
+            W_ins = sw * (W - 1) + 1
+            x_inserted = x.new_zeros(B, C_in, D_ins, H_ins, W_ins)
+            x_inserted[:, :, ::sd, ::sh, ::sw] = x
+        else:
+            x_inserted = x
+        # Step 2: Pad with (kernel_size - padding - 1) zeros on each side.
+        # This converts ConvTranspose3d's "padding" (which removes output elements)
+        # into the equivalent zero-padding for a regular convolution.
+        pad_d = kd - pd - 1
+        pad_h = kh - ph - 1
+        pad_w = kw - pw - 1
+        # F.pad argument order: (W_left, W_right, H_left, H_right, D_left, D_right)
+        x_padded = F.pad(x_inserted, (pad_w, pad_w, pad_h, pad_h, pad_d, pad_d))
+        # Step 3: Transform weight from ConvTranspose3d layout to Conv3d layout.
+        # ConvTranspose3d weight: [C_in, C_out, kD, kH, kW]
+        # Equivalent Conv3d weight: [C_out, C_in, kD, kH, kW] with spatial dims flipped
+        w_conv = self.weight.flip(2, 3, 4).transpose(0, 1)
+        # Step 4: Standard Conv3d (stride=1, padding=0)
+        return F.conv3d(x_padded, w_conv, self.bias, stride=1, padding=0)
+    def extra_repr(self):
+        return (f'{self.in_channels}, {self.out_channels}, '
+                f'kernel_size={self.kernel_size}, stride={self.stride}, '
+                f'padding={self.padding}, bias={self.bias is not None}')
+# =============================================================================
+# Approach 2: Custom autograd — real forward, safe backward
+# =============================================================================
+class _SafeConvTranspose3dFunc(Function):
+    """Custom autograd function that uses F.conv_transpose3d in forward
+    (bit-for-bit identical) but computes gradients using Conv3d-based ops
+    in backward (avoiding the leaky oneDNN ConvTranspose3d backward kernel).
+    Gradient derivation:
+        For y = conv_transpose3d(x, w, stride=S, padding=P):
+        grad_x = conv3d(grad_y, w, stride=S, padding=P)
+            Confirmed bit-for-bit identical to PyTorch's own backward.
+        grad_w = conv3d(pad(stride_insert(x)).T, grad_y.T).flip(spatial)
+            where stride_insert inserts (S-1) zeros between elements,
+            pad adds (K-P-1) zeros on each side, and .T swaps batch/channel.
+            The spatial flip accounts for the flip in the forward decomposition.
+        grad_bias = grad_y.sum(dim=(0, 2, 3, 4))
+    """
+    @staticmethod
+    def forward(ctx, input, weight, bias, stride, padding, output_padding, groups, dilation):
+        # Use the real conv_transpose3d for bit-for-bit identical forward
+        output = F.conv_transpose3d(
+            input, weight, bias,
+            stride=stride, padding=padding,
+            output_padding=output_padding, groups=groups, dilation=dilation
+        )
+        ctx.save_for_backward(input, weight, bias)
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.output_padding = output_padding
+        ctx.groups = groups
+        ctx.dilation = dilation
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight, bias = ctx.saved_tensors
+        stride = ctx.stride
+        padding = ctx.padding
+        groups = ctx.groups
+        dilation = ctx.dilation
+        grad_input = grad_weight = grad_bias = None
+        if ctx.needs_input_grad[0]:
+            # grad_input of ConvTranspose3d = Conv3d(grad_output, weight)
+            # This is exact: ConvTranspose3d IS the adjoint of Conv3d.
+            grad_input = F.conv3d(
+                grad_output, weight,
+                bias=None, stride=stride, padding=padding,
+                dilation=dilation, groups=groups
+            )
+        if ctx.needs_input_grad[1]:
+            # grad_weight via the decomposed view.
+            # Forward decomposition: y = conv3d(x_padded, w.flip(spatial).T(0,1))
+            # The backward of this conv3d w.r.t. its weight can be expressed as:
+            # grad_w_conv = conv3d(x_padded.T(0,1), grad_y.T(0,1))
+            # where the batch-channel transpose turns the sum over batch
+            # into a channel dimension convolution.
+            #
+            # Then: grad_w = grad_w_conv.flip(spatial)
+            # because w_conv = w.flip(spatial).T(0,1), and the chain rule
+            # through the spatial flip gives an extra flip on the gradient.
+            B, C_in = input.shape[:2]
+            spatial = input.shape[2:]
+            # Stride-insert the input
+            if any(s > 1 for s in stride):
+                new_spatial = tuple(s * (d - 1) + 1 for s, d in zip(stride, spatial))
+                input_inserted = input.new_zeros(B, C_in, *new_spatial)
+                slices = (slice(None), slice(None)) + tuple(
+                    slice(None, None, s) for s in stride
+                )
+                input_inserted[slices] = input
+            else:
+                input_inserted = input
+            # Pad: (K - P - 1) on each side per spatial dim
+            kernel_size = weight.shape[2:]
+            pad_sizes = []
+            for k, p in zip(reversed(kernel_size), reversed(padding)):
+                pad_val = k - p - 1
+                pad_sizes.extend([pad_val, pad_val])
+            x_padded = F.pad(input_inserted, pad_sizes)
+            # Compute grad_w_conv via conv3d with batch-channel transposition
+            x_padded_t = x_padded.transpose(0, 1)    # [C_in, B, ...]
+            grad_output_t = grad_output.transpose(0, 1)  # [C_out, B, ...]
+            # conv3d([C_in, B, D_pad...], [C_out, B, D_out...]) -> [C_in, C_out, K...]
+            grad_w_conv = F.conv3d(x_padded_t, grad_output_t)
+            # Undo the spatial flip from the forward decomposition
+            grad_weight = grad_w_conv.flip(2, 3, 4)
+        if bias is not None and ctx.needs_input_grad[2]:
+            grad_bias = grad_output.sum(dim=(0,) + tuple(range(2, grad_output.ndim)))
+        return grad_input, grad_weight, grad_bias, None, None, None, None, None
+class SafeConvTranspose3d_v2(nn.Module):
+    """Drop-in replacement for nn.ConvTranspose3d using custom autograd.
+    Forward pass: Uses the real F.conv_transpose3d (bit-for-bit identical output).
+    Backward pass: Computes gradients using F.conv3d (avoids leaky oneDNN kernel).
+    Weight shape is identical to nn.ConvTranspose3d: [in_channels, out_channels, *kernel_size]
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros'):
+        super().__init__()
+        if groups != 1:
+            raise NotImplementedError("SafeConvTranspose3d_v2 only supports groups=1")
+        if output_padding != 0:
+            raise NotImplementedError("SafeConvTranspose3d_v2 does not support output_padding")
+        # Normalize to tuples
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size, kernel_size)
+        if isinstance(stride, int):
+            stride = (stride, stride, stride)
+        if isinstance(padding, int):
+            padding = (padding, padding, padding)
+        if isinstance(dilation, int):
+            dilation = (dilation, dilation, dilation)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.output_padding = (0, 0, 0) if isinstance(output_padding, int) else output_padding
+        self.groups = groups
+        self.dilation = dilation
+        # Weight shape matches ConvTranspose3d: [in_channels, out_channels, *kernel_size]
+        self.weight = nn.Parameter(
+            torch.empty(in_channels, out_channels, *kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_channels))
+        else:
+            self.register_parameter('bias', None)
+        # Initialize weights same as nn.ConvTranspose3d
+        nn.init.kaiming_uniform_(self.weight, a=5**0.5)
+        if self.bias is not None:
+            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+            if fan_in != 0:
+                bound = 1 / fan_in**0.5
+                nn.init.uniform_(self.bias, -bound, bound)
+    def forward(self, x):
+        return _SafeConvTranspose3dFunc.apply(
+            x, self.weight, self.bias,
+            self.stride, self.padding, self.output_padding,
+            self.groups, self.dilation
+        )
+    def extra_repr(self):
+        return (f'{self.in_channels}, {self.out_channels}, '
+                f'kernel_size={self.kernel_size}, stride={self.stride}, '
+                f'padding={self.padding}, bias={self.bias is not None}')
+# =============================================================================
+# Utility: in-place replacement of ConvTranspose3d in existing models
+# =============================================================================
+def replace_conv_transpose3d(module, target_cls=SafeConvTranspose3d):
+    """Recursively replace all nn.ConvTranspose3d in a module with the given
+    replacement class, copying weights and biases.
+    Usage:
+        model = MyModel()
+        replace_conv_transpose3d(model)  # in-place modification
+    Args:
+        module: The nn.Module to modify in-place.
+        target_cls: Replacement class (default: SafeConvTranspose3d).
+    """
+    for name, child in module.named_children():
+        if isinstance(child, nn.ConvTranspose3d):
+            ct = child
+            assert ct.groups == 1, f"groups={ct.groups} not supported"
+            assert ct.output_padding == (0,) * len(ct.output_padding), \
+                f"output_padding={ct.output_padding} not supported"
+            replacement = target_cls(
+                ct.in_channels, ct.out_channels, ct.kernel_size,
+                stride=ct.stride, padding=ct.padding,
+                bias=ct.bias is not None
+            )
+            # Copy weights — same tensor shape, no conversion needed
+            replacement.weight.data.copy_(ct.weight.data)
+            if ct.bias is not None:
+                replacement.bias.data.copy_(ct.bias.data)
+            setattr(module, name, replacement)
+        else:
+            replace_conv_transpose3d(child, target_cls)

Models/all_om_net/000110_all_om_net.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9c2c90820aba95bfd89d870820574461963450ca50617ee44fb5af2b17385b3
+size 3017380171

OM_reg.py CHANGED Viewed

@@ -72,7 +72,8 @@ min_crop_ratio = 0.9
 # label_keys = ['heart']
 label_keys = ['brain']
 # label_keys = ['pancreas']
-database = ['MSD']
 dataset = OminiDataset_inference_w_all(transform=None,min_crop_ratio=min_crop_ratio,label_key = label_keys, database=database)
 Infer_Loader = DataLoader(
@@ -112,6 +113,7 @@ Deformddpm = DeformDDPM(
     padding_mode = hyp_parameters["padding_mode"],
     v_scale = hyp_parameters["v_scale"],
     resample_mode = hyp_parameters["resample_mode"],
 )
 Deformddpm.to(hyp_parameters["device"])
@@ -125,7 +127,7 @@ ddf_stn.to(hyp_parameters["device"])
 print("Loading model from:", model_save_path)
 # Deformddpm.load_state_dict(torch.load(model_save_path))
-checkpoint = torch.load(model_save_path)
 Deformddpm.load_state_dict(checkpoint['model_state_dict'])
 Deformddpm.eval()
@@ -162,12 +164,8 @@ for e, d in tqdm(enumerate(Infer_Loader)):
   # print(pid, image_original.shape, mask_original.max())
-  if hyp_parameters["ndims"] == 2:
-    nifti_img = nib.Nifti1Image(image_original[0,0,:,:], np.eye(4))
-    nifti_mask = nib.Nifti1Image(mask_original[0,:,:,:], np.eye(4))
-  elif hyp_parameters["ndims"] == 3:
-    nifti_img = nib.Nifti1Image(image_original[0,0,:,:,:], np.eye(4))
-    nifti_mask = nib.Nifti1Image(mask_original[0,0,:,:,:], np.eye(4))
   # Saving original (undeformed image)
   # CMR: format: Patient0001_Slice0001_ORG_NA.nii.gz
@@ -198,16 +196,10 @@ for e, d in tqdm(enumerate(Infer_Loader)):
       noisy_imgs_np = img_diff.cpu().detach().numpy()
       noisy_msks_np = msk_diff.cpu().detach().numpy()
-      if hyp_parameters["ndims"] == 2:
-        nifti_img_aug = nib.Nifti1Image(denoise_imgs[0,0,:,:], np.eye(4))
-        nifti_mask_aug = nib.Nifti1Image(denoise_msks[0,:,:,:], np.eye(4))
-        nifti_img = nib.Nifti1Image(noisy_imgs_np[0,0,:,:], np.eye(4))
-        nifti_mask = nib.Nifti1Image(noisy_msks_np[0, :, :, :], np.eye(4))
-      elif hyp_parameters["ndims"] == 3:
-        nifti_img_aug = nib.Nifti1Image(denoise_imgs[0,0,:,:,:], np.eye(4))
-        nifti_mask_aug = nib.Nifti1Image(denoise_msks[0,0,:,:,:], np.eye(4))
-        nifti_img = nib.Nifti1Image(noisy_imgs_np[0,0,:,:,:], np.eye(4))
-        nifti_mask = nib.Nifti1Image(noisy_msks_np[0, 0, :, :], np.eye(4))
       nib.save(nifti_img_aug, os.path.join(hyp_parameters['reg_img_savepath'],utils.get_barcode([pid,e,im,noise_step])+'.nii.gz'))
       nib.save(nifti_mask_aug, os.path.join(hyp_parameters['reg_msk_savepath'],utils.get_barcode([pid,e,im,noise_step])+'_GT.nii.gz'))

 # label_keys = ['heart']
 label_keys = ['brain']
 # label_keys = ['pancreas']
+# database = ['MSD']
+database = ['Brats2019']
 dataset = OminiDataset_inference_w_all(transform=None,min_crop_ratio=min_crop_ratio,label_key = label_keys, database=database)
 Infer_Loader = DataLoader(
     padding_mode = hyp_parameters["padding_mode"],
     v_scale = hyp_parameters["v_scale"],
     resample_mode = hyp_parameters["resample_mode"],
+    inf_mode = True,   # set to True for inference, which will use fixed slice num and slice idx for better evaluation
 )
 Deformddpm.to(hyp_parameters["device"])
 print("Loading model from:", model_save_path)
 # Deformddpm.load_state_dict(torch.load(model_save_path))
+checkpoint = torch.load(model_save_path, map_location='cpu')
 Deformddpm.load_state_dict(checkpoint['model_state_dict'])
 Deformddpm.eval()
   # print(pid, image_original.shape, mask_original.max())
+  nifti_img = utils.converet_to_nibabel(image_original, ndims=hyp_parameters["ndims"])
+  nifti_mask = utils.converet_to_nibabel(mask_original, ndims=hyp_parameters["ndims"])
   # Saving original (undeformed image)
   # CMR: format: Patient0001_Slice0001_ORG_NA.nii.gz
       noisy_imgs_np = img_diff.cpu().detach().numpy()
       noisy_msks_np = msk_diff.cpu().detach().numpy()
+      nifti_img_aug = utils.converet_to_nibabel(denoise_imgs, ndims=hyp_parameters["ndims"])
+      nifti_mask_aug = utils.converet_to_nibabel(denoise_msks, ndims=hyp_parameters["ndims"])
+      nifti_img = utils.converet_to_nibabel(noisy_imgs_np, ndims=hyp_parameters["ndims"])
+      nifti_mask = utils.converet_to_nibabel(noisy_msks_np, ndims=hyp_parameters["ndims"])
       nib.save(nifti_img_aug, os.path.join(hyp_parameters['reg_img_savepath'],utils.get_barcode([pid,e,im,noise_step])+'.nii.gz'))
       nib.save(nifti_mask_aug, os.path.join(hyp_parameters['reg_msk_savepath'],utils.get_barcode([pid,e,im,noise_step])+'_GT.nii.gz'))

OM_reg_flexres.py ADDED Viewed

	@@ -0,0 +1,382 @@

+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam
+from torchvision.utils import make_grid
+from Diffusion.diffuser import DeformDDPM
+from Diffusion.networks import get_net, STN
+from torchvision.transforms import Lambda
+import random
+import os
+import utils
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from torchvision.utils import save_image
+from einops import rearrange, reduce, repeat
+import numpy as np
+import nibabel as nib
+from tqdm import tqdm
+import yaml
+import argparse
+import torch.nn.functional as F
+import SimpleITK as sitk
+from skimage.transform import resize
+# NOTE(review): 10e-8 equals 1e-7, not 1e-8; EPS is not referenced in the
+# setup section below — confirm the intended value where it is used.
+EPS = 10e-8
+# ---- Command line: a single optional argument, the YAML config path ----
+parser = argparse.ArgumentParser()
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        default="Config/config_om.yaml",
+        required=False,
+    )
+args = parser.parse_args()
+#=======================================================================================================================
+# Load the YAML file into a dictionary
+with open(args.config, 'r') as file:
+    hyp_parameters = yaml.safe_load(file)
+    print(hyp_parameters)
+# Create the augmentation output directories named in the config if missing.
+if not os.path.exists(hyp_parameters["aug_img_savepath"]):
+    os.makedirs(hyp_parameters["aug_img_savepath"])
+if not os.path.exists(hyp_parameters["aug_msk_savepath"]):
+    os.makedirs(hyp_parameters["aug_msk_savepath"])
+if not os.path.exists(hyp_parameters["aug_ddf_savepath"]):
+    os.makedirs(hyp_parameters["aug_ddf_savepath"])
+print(hyp_parameters["aug_img_savepath"])
+# Override the configured batch size: this script always uses a batch of 1.
+hyp_parameters['batchsize'] = 1
+model_img_sz = hyp_parameters['img_size']  # e.g. 128
+# =======================================================================================================================
+# Dataset is used only for its filtering logic (to get the right set of keys + metadata).
+# We bypass the DataLoader and load volumes directly to ensure deterministic center-padding
+# that is identical between the 128^3 model input and the full-res volume.
+# The label key and database are hard-coded here rather than read from the config.
+label_keys = ['brain']
+database = ['Brats2019']
+dataset = OminiDataset_inference_w_all(
+    transform=None, min_crop_ratio=1.0, label_key=label_keys, database=database)
+# =======================================================================================================================
+# Checkpoint path: Models/<data_name>_<net_name>/<model_id_str>_<data_name>_<net_name>.pth
+epoch=f'{hyp_parameters["model_id_str"]}_{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}'
+model_save_path = f'Models/{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}/'
+model_save_path = os.path.join(model_save_path, str(epoch)+'.pth')
+# Build the network class selected by the config and wrap it in DeformDDPM.
+Net = get_net(hyp_parameters["net_name"])
+Deformddpm = DeformDDPM(
+    network=Net(n_steps = hyp_parameters["timesteps"],
+                ndims = hyp_parameters["ndims"],
+                num_input_chn = hyp_parameters["num_input_chn"],
+                res = model_img_sz
+                ),
+    n_steps = hyp_parameters["timesteps"],
+    image_chw = [hyp_parameters["num_input_chn"]] + [model_img_sz]*hyp_parameters["ndims"],
+    device = hyp_parameters["device"],
+    batch_size = hyp_parameters["batchsize"],
+    img_pad_mode = hyp_parameters["img_pad_mode"],
+    ddf_pad_mode = hyp_parameters["ddf_pad_mode"],
+    padding_mode = hyp_parameters["padding_mode"],
+    v_scale = hyp_parameters["v_scale"],
+    resample_mode = hyp_parameters["resample_mode"],
+    inf_mode = True,
+)
+Deformddpm.to(hyp_parameters["device"])
+# Spatial transformer built at the model resolution, on the same device.
+ddf_stn = STN(
+    img_sz = model_img_sz,
+    ndims = hyp_parameters["ndims"],
+    padding_mode = hyp_parameters['padding_mode'],
+    device = hyp_parameters["device"],
+)
+ddf_stn.to(hyp_parameters["device"])
+print("Loading model from:", model_save_path)
+# The checkpoint is deserialised onto the CPU; load_state_dict then copies the
+# values into the model, which was already moved to the configured device.
+checkpoint = torch.load(model_save_path, map_location='cpu')
+Deformddpm.load_state_dict(checkpoint['model_state_dict'])
+Deformddpm.eval()
+# Full-res output directories (append _fullres to the standard paths)
+# rstrip('/') drops any trailing slash first, so "out/" becomes "out_fullres/".
+reg_img_savepath_fullres = hyp_parameters['reg_img_savepath'].rstrip('/') + '_fullres/'
+reg_msk_savepath_fullres = hyp_parameters['reg_msk_savepath'].rstrip('/') + '_fullres/'
+reg_ddf_savepath_fullres = hyp_parameters['reg_ddf_savepath'].rstrip('/') + '_fullres/'
+# Create both the standard and the full-res registration output directories.
+os.makedirs(hyp_parameters['reg_img_savepath'], exist_ok=True)
+os.makedirs(hyp_parameters['reg_msk_savepath'], exist_ok=True)
+os.makedirs(hyp_parameters['reg_ddf_savepath'], exist_ok=True)
+os.makedirs(reg_img_savepath_fullres, exist_ok=True)
+os.makedirs(reg_msk_savepath_fullres, exist_ok=True)
+os.makedirs(reg_ddf_savepath_fullres, exist_ok=True)
+# ========== Helper functions ==========
+def center_pad_to_cube(volume):
+    """Pad volume to a cube using the max dimension, with symmetric (center) padding."""
+    max_dim = max(volume.shape[:3])
+    pad_width = []
+    for s in volume.shape[:3]:
+        total_pad = max_dim - s
+        pad_before = total_pad // 2
+        pad_after = total_pad - pad_before
+        pad_width.append((pad_before, pad_after))
+    # Handle extra dims (e.g., multi-channel labels)
+    for _ in range(volume.ndim - 3):
+        pad_width.append((0, 0))
+    return np.pad(volume, pad_width, mode='constant', constant_values=0)
+def load_fullres_volume(key, ds):
+    """Load original-resolution volume: axis reorder, clamp, normalize, center-pad to cube."""
+    volume = sitk.ReadImage(key)
+    volume = sitk.GetArrayFromImage(volume)
+    volume = reverse_axis_order(volume)
+    if volume.ndim == 4:
+        channel_ids = ds.get_channel_ids(key)
+        channel_id = channel_ids[0] if len(channel_ids) > 0 else 0
+        volume = volume[:, :, :, channel_id]
+    # CT clamping
+    if ds.clamp_range is not None:
+        modality = ds.ALLdata_filtered[key].get("Modality", None)
+        if modality == "CT":
+            volume = np.clip(volume, ds.clamp_range[0], ds.clamp_range[1])
+    volume = ds.normalize(volume)
+    volume = center_pad_to_cube(volume)
+    return volume  # shape: [D, D, D] (cubic)
+def load_fullres_label(key, ds, label_key):
+    """Load original-resolution label: axis reorder, center-pad to cube (no resize)."""
+    label_path_dict = ds.ALLdata_filtered[key].get('Label_path', {})
+    task_labels = label_path_dict.get('segmentation', {})
+    if label_key not in task_labels:
+        return None
+    label = sitk.ReadImage(task_labels[label_key])
+    label = sitk.GetArrayFromImage(label)
+    label = reverse_axis_order(label)
+    if label.ndim > 3:
+        channel_ids = ds.get_channel_ids(key)
+        if len(channel_ids) != 0:
+            label = label[..., channel_ids]
+    label = center_pad_to_cube(label)
+    return label
+def apply_ddf(volume_tensor, ddf, padding_mode='border', resample_mode='bilinear'):
+    """Apply DDF to volume tensor at any resolution.
+    The DDF stores fractional displacements (value * max_sz = voxel displacement).
+    When the DDF is spatially upscaled via trilinear interpolation from model resolution
+    to full resolution, the fractional values remain correct — we use the new spatial
+    size as max_sz, which correctly scales the voxel displacement proportionally.
+    """
+    device = ddf.device
+    ndims = 3
+    img_sz = list(volume_tensor.shape[2:])
+    max_sz = torch.reshape(
+        torch.tensor(img_sz, dtype=torch.float32, device=device),
+        [1, ndims] + [1] * ndims)
+    ref_grid = torch.reshape(
+        torch.stack(torch.meshgrid(
+            [torch.arange(s, device=device) for s in img_sz], indexing='ij'), 0),
+        [1, ndims] + img_sz)
+    img_shape = torch.reshape(
+        torch.tensor([(s - 1) / 2. for s in img_sz], dtype=torch.float32, device=device),
+        [1] + [1] * ndims + [ndims])
+    grid = torch.flip(
+        (ddf * max_sz + ref_grid).permute(
+            [0] + list(range(2, 2 + ndims)) + [1]) / img_shape - 1,
+        dims=[-1])
+    return F.grid_sample(volume_tensor, grid.float(), mode=resample_mode,
+                         padding_mode=padding_mode, align_corners=True)
+# ========== Main inference loop ==========
+# For every key in dataset.ALLdata_filtered: load the full-resolution volume and
+# labels, run Deformddpm.diff_recover at model resolution, then upscale the
+# returned DDFs and re-apply them to the full-resolution data.  Results are
+# written as NIfTI files at both resolutions.
+keys = list(dataset.ALLdata_filtered.keys())
+print("total num of images:", len(keys))
+for e, key in enumerate(tqdm(keys)):
+    # The enumeration index doubles as patient id and scan id in the file names.
+    pid = e
+    print(f'Processing patient {pid}, image {e}, key: {key}')
+    # --- Load full-resolution volume (center-padded to cube) ---
+    fullres_vol = load_fullres_volume(key, dataset)
+    orig_sz = list(fullres_vol.shape)  # e.g. [240, 240, 240]
+    print(f"  Full-res padded shape: {orig_sz}")
+    # --- Resize to model resolution for inference ---
+    vol_model = resize(fullres_vol, [model_img_sz] * 3,
+                       anti_aliasing=True, preserve_range=True)
+    img = torch.tensor(vol_model[None, None, :, :, :],
+                       dtype=torch.float32, device=hyp_parameters["device"])
+    # --- Load full-res labels and resize to model resolution ---
+    # Only label keys that actually have a file end up in this dict.
+    fullres_labels = {}
+    for lk in label_keys:
+        lab = load_fullres_label(key, dataset, lk)
+        if lab is not None:
+            fullres_labels[lk] = lab
+    # Build mask at model resolution (128^3)
+    # Both lists get one entry per label key, in label_keys order, so the
+    # concatenated channel axis lines up between the two resolutions.
+    label_arrays_model = []
+    label_arrays_fullres = []
+    for lk in label_keys:
+        if lk in fullres_labels:
+            lab = fullres_labels[lk]
+            # order=0 (nearest neighbour) without anti-aliasing keeps label values discrete.
+            lab_model = resize(lab, [model_img_sz] * 3,
+                               anti_aliasing=False, preserve_range=True, order=0)
+            # Bring both arrays to channel-first layout: add a channel axis to
+            # 3-D arrays, move the trailing axis to the front for 4-D ones.
+            if lab_model.ndim == 3:
+                lab_model = lab_model[None, :, :, :]
+            elif lab_model.ndim > 3:
+                lab_model = np.transpose(lab_model, (3, 0, 1, 2))
+            label_arrays_model.append(lab_model)
+            if lab.ndim == 3:
+                lab = lab[None, :, :, :]
+            elif lab.ndim > 3:
+                lab = np.transpose(lab, (3, 0, 1, 2))
+            label_arrays_fullres.append(lab)
+        else:
+            # Missing label: single-channel placeholder filled with -1.
+            label_arrays_model.append(np.full([1] + [model_img_sz] * 3, -1))
+            label_arrays_fullres.append(np.full([1] + orig_sz, -1))
+    if len(label_arrays_model) > 0:
+        mask_model_np = np.concatenate(label_arrays_model, axis=0)
+        mask = torch.tensor(mask_model_np[None], dtype=torch.float32,
+                            device=hyp_parameters["device"])
+        fullres_msk_np = np.concatenate(label_arrays_fullres, axis=0)
+        fullres_msk_tensor = torch.tensor(fullres_msk_np[None], dtype=torch.float32,
+                                          device=hyp_parameters["device"])
+    else:
+        # label_keys is empty: no mask is passed to the model or saved.
+        mask = None
+        fullres_msk_np = None
+        fullres_msk_tensor = None
+    # Build full-res image tensor
+    fullres_img_tensor = torch.tensor(fullres_vol[None, None, :, :, :],
+                                      dtype=torch.float32,
+                                      device=hyp_parameters["device"])
+    # --- Save target conditioning image (first subject) ---
+    # target_img is assigned only on the first iteration and reused as
+    # cond_imgs for every later subject.
+    if e <= 0:
+        target_img = img.clone().detach()
+    # --- Save original images at 128^3 ---
+    image_original = img.cpu().numpy()
+    nib.save(utils.converet_to_nibabel(image_original, ndims=hyp_parameters["ndims"]),
+             os.path.join(hyp_parameters['reg_img_savepath'],
+                          utils.get_barcode([pid, e]) + '.nii.gz'))
+    if mask is not None:
+        mask_original = mask.cpu().numpy()
+        nib.save(utils.converet_to_nibabel(mask_original, ndims=hyp_parameters["ndims"]),
+                 os.path.join(hyp_parameters['reg_msk_savepath'],
+                              utils.get_barcode([pid, e]) + '_GT.nii.gz'))
+    # --- Save original at full-res ---
+    # fullres_vol is [D,D,D], wrap as [1,1,D,D,D] for converet_to_nibabel
+    nib.save(utils.converet_to_nibabel(fullres_vol[None, None], ndims=hyp_parameters["ndims"]),
+             os.path.join(reg_img_savepath_fullres,
+                          utils.get_barcode([pid, e]) + '.nii.gz'))
+    if fullres_msk_np is not None:
+        # fullres_msk_np is [C,D,D,D], wrap as [1,C,D,D,D]
+        nib.save(utils.converet_to_nibabel(fullres_msk_np[None], ndims=hyp_parameters["ndims"]),
+                 os.path.join(reg_msk_savepath_fullres,
+                              utils.get_barcode([pid, e]) + '_GT.nii.gz'))
+    # --- Diffusion recovery at model resolution ---
+    noise_step = hyp_parameters["start_noise_step"]
+    with torch.no_grad():
+        # NOTE(review): the loop runs exactly once although the progress message
+        # prints hyp_parameters["aug_coe"] as the denominator - confirm intent.
+        for im in range(1):
+            print(f'  Generating -> Subject-{pid}, Scan-{e} ({im}/{hyp_parameters["aug_coe"]})', end='\r')
+            [ddf_comp, ddf_rand], [img_rec, img_diff, img_save], [msk_rec, msk_diff, msk_save] = \
+                Deformddpm.diff_recover(
+                    img_org=img,
+                    cond_imgs=target_img.clone().detach(),
+                    msk_org=mask,
+                    T=[None, hyp_parameters["timesteps"]],
+                    v_scale=hyp_parameters["v_scale"],
+                    t_save=None,
+                    proc_type=hyp_parameters["condition_type"])
+            # --- Save 128^3 results (same as OM_reg.py) ---
+            denoise_imgs = img_rec.cpu().numpy()
+            noisy_imgs_np = img_diff.cpu().numpy()
+            nib.save(utils.converet_to_nibabel(denoise_imgs, ndims=hyp_parameters["ndims"]),
+                     os.path.join(hyp_parameters['reg_img_savepath'],
+                                  utils.get_barcode([pid, e, im, noise_step]) + '.nii.gz'))
+            nib.save(utils.converet_to_nibabel(noisy_imgs_np, ndims=hyp_parameters["ndims"]),
+                     os.path.join(hyp_parameters['reg_img_savepath'],
+                                  utils.get_barcode([pid, e, im, noise_step],
+                                                    header=['Patient', 'Slice', 'NoiseImg', 'NoiseStep']) + '.nii.gz'))
+            if msk_rec is not None:
+                denoise_msks = msk_rec.cpu().numpy()
+                noisy_msks_np = msk_diff.cpu().numpy()
+                nib.save(utils.converet_to_nibabel(denoise_msks, ndims=hyp_parameters["ndims"]),
+                         os.path.join(hyp_parameters['reg_msk_savepath'],
+                                      utils.get_barcode([pid, e, im, noise_step]) + '_GT.nii.gz'))
+                nib.save(utils.converet_to_nibabel(noisy_msks_np, ndims=hyp_parameters["ndims"]),
+                         os.path.join(hyp_parameters['reg_msk_savepath'],
+                                      utils.get_barcode([pid, e, im, noise_step],
+                                                        header=['Patient', 'Slice', 'NoiseImg', 'NoiseStep']) + '_GT.nii.gz'))
+            # --- Upscale DDFs to original resolution ---
+            # NOTE(review): interpolation uses align_corners=False while apply_ddf
+            # samples with align_corners=True - confirm the mismatch is intended.
+            ddf_fullres = F.interpolate(ddf_comp, size=orig_sz,
+                                        mode='trilinear', align_corners=False)
+            ddf_rand_fullres = F.interpolate(ddf_rand, size=orig_sz,
+                                             mode='trilinear', align_corners=False)
+            # --- Apply DDFs at original resolution ---
+            img_rec_fullres = apply_ddf(fullres_img_tensor, ddf_fullres,
+                                        padding_mode='border')
+            img_noisy_fullres = apply_ddf(fullres_img_tensor, ddf_rand_fullres,
+                                          padding_mode='border')
+            if fullres_msk_tensor is not None:
+                # Masks are warped with nearest-neighbour sampling and zero padding.
+                msk_rec_fullres = apply_ddf(fullres_msk_tensor, ddf_fullres,
+                                            padding_mode='zeros', resample_mode='nearest')
+                msk_noisy_fullres = apply_ddf(fullres_msk_tensor, ddf_rand_fullres,
+                                              padding_mode='zeros', resample_mode='nearest')
+            # --- Save full-res results ---
+            # NOTE(review): tensors are handed to converet_to_nibabel directly here,
+            # whereas the 128^3 results above are converted with .cpu().numpy()
+            # first - confirm the helper accepts (possibly CUDA) tensors.
+            nib.save(utils.converet_to_nibabel(img_rec_fullres, ndims=hyp_parameters["ndims"]),
+                     os.path.join(reg_img_savepath_fullres,
+                                  utils.get_barcode([pid, e, im, noise_step]) + '.nii.gz'))
+            nib.save(utils.converet_to_nibabel(img_noisy_fullres, ndims=hyp_parameters["ndims"]),
+                     os.path.join(reg_img_savepath_fullres,
+                                  utils.get_barcode([pid, e, im, noise_step],
+                                                    header=['Patient', 'Slice', 'NoiseImg', 'NoiseStep']) + '.nii.gz'))
+            if fullres_msk_tensor is not None:
+                nib.save(utils.converet_to_nibabel(msk_rec_fullres, ndims=hyp_parameters["ndims"]),
+                         os.path.join(reg_msk_savepath_fullres,
+                                      utils.get_barcode([pid, e, im, noise_step]) + '_GT.nii.gz'))
+                nib.save(utils.converet_to_nibabel(msk_noisy_fullres, ndims=hyp_parameters["ndims"]),
+                         os.path.join(reg_msk_savepath_fullres,
+                                      utils.get_barcode([pid, e, im, noise_step],
+                                                        header=['Patient', 'Slice', 'NoiseImg', 'NoiseStep']) + '_GT.nii.gz'))
+            # Save full-res DDF (converet_to_nibabel handles multi-channel → channel-last)
+            nib.save(utils.converet_to_nibabel(ddf_fullres, ndims=hyp_parameters["ndims"]),
+                     os.path.join(reg_ddf_savepath_fullres,
+                                  utils.get_barcode([pid, e, im, noise_step]) + '.nii.gz'))
+            # noise_step only feeds the file-name barcode; it is not passed to diff_recover.
+            if (im - hyp_parameters["start_noise_step"]) % 2 == 0:
+                noise_step = noise_step + hyp_parameters["noise_step"]
+    # Hard stop after the subject with index 6, i.e. at most seven subjects are processed.
+    if e > 5:
+        break

OM_train_2modes-reg.py ADDED Viewed

	@@ -0,0 +1,517 @@

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import gc
+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam, SGD
+from Diffusion.diffuser import DeformDDPM
+from Diffusion.networks import get_net, STN
+from torchvision.transforms import Lambda
+import Diffusion.losses as losses
+import random
+import glob
+import numpy as np
+import utils
+from tqdm import tqdm
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from Dataloader.dataloader_utils import thresh_img
+import yaml
+import argparse
+####################
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+# from torch.distributed import init_process_group
+###############
+def ddp_setup(rank, world_size):
+    """
+    Args:
+        rank: Unique identifier of each process
+        world_size: Total number of processes
+    """
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+# Selects the multi-process path: ddp_setup + DDP wrapping in main_train and
+# mp.spawn in the __main__ guard.  Set to False for a single-process run.
+use_distributed = True
+# use_distributed = False
+EPS = 1e-5
+# Added inside the sqrt() factors that scale both losses by cond_ratio.
+MSK_EPS = 0.01
+# Probability that the text embedding is passed to the model (otherwise None).
+TEXT_EMBED_PROB = 0.7
+# Probability of random_resample (vs. random_permute) on 3-D input, and of
+# applying thresh_img before the intensity jitter.
+AUG_RESAMPLE_PROB = 0.6
+LOSS_WEIGHTS_DIFF = [2.0, 1.0, 16]  # [ang, dist, reg]
+# LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.2, 1e2]  # [imgsim, imgmse, ddf]
+# The paired (registration) loader uses batchsize // DIFF_REG_BATCH_RATIO.
+DIFF_REG_BATCH_RATIO = 2
+# AUG_PERMUTE_PROB = 0.35
+# Command line: only the path of the YAML config is configurable.
+# NOTE: parse_args() runs at import time, so it also runs in every spawned worker.
+parser = argparse.ArgumentParser()
+# config_file_path = 'Config/config_cmr.yaml'
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        # default="Config/config_cmr.yaml",
+        # default="Config/config_lct.yaml",
+        default="Config/config_all.yaml",
+        required=False,
+    )
+args = parser.parse_args()
+#=======================================================================================================================
+def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
+    """Train DeformDDPM with alternating diffusion and registration updates.
+
+    Each step first runs a diffusion update on a batch of single images and
+    then, when ``step % train_mode_ratio == 0``, a registration update on a
+    batch of image pairs.  Both updates share one Adam optimizer.  Hyper
+    parameters are read from the YAML file given by ``args.config``; a
+    checkpoint (model, optimizer, epoch) is written every ``epoch_per_save``
+    epochs, and training resumes from the last ``*.pth`` found in the model
+    directory.
+
+    Args:
+        rank: process index; also used as ``gpu_id`` and as the DDP device id.
+        world_size: number of processes, forwarded to ``ddp_setup``.
+        train_mode_ratio: a registration update runs on every step whose index
+            is a multiple of this value.
+        thresh_imgsim: voxels of the target image above this value form the
+            label mask passed to the image-similarity loss.
+
+    Raises:
+        ValueError: once more than five NaN/Inf diffusion losses were counted
+            in the current epoch.
+    """
+    if use_distributed:
+        ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized():
+            print(f"World size: {torch.distributed.get_world_size()}")
+            print(f"Communication backend: {torch.distributed.get_backend()}")
+    gpu_id = rank
+    # Load the YAML file into a dictionary
+    with open(args.config, 'r') as file:
+        hyp_parameters = yaml.safe_load(file)
+        print(hyp_parameters)
+    # epoch_per_save=10
+    epoch_per_save=hyp_parameters['epoch_per_save']
+    data_name=hyp_parameters['data_name']
+    net_name = hyp_parameters['net_name']
+    Net=get_net(net_name)
+    # Checkpoints are stored as Models/<data>_<net>/<6-digit epoch>_<data>_<net>.pth
+    suffix_pth=f'_{data_name}_{net_name}.pth'
+    model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
+    model_dir=model_save_path
+    # NOTE(review): transformer is only referenced in commented-out code below.
+    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
+    # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
+    # tsfm = torchvision.transforms.Compose([
+    #             torchvision.transforms.ToTensor(),
+    #             ])
+    # dataset = Data_Loader(target_res = [hyp_parameters["img_size"]]*hyp_parameters["ndims"], transforms=None, noise_scale=hyp_parameters['noise_scale'])
+    # train_loader = DataLoader(
+    #     dataset,
+    #     batch_size=hyp_parameters['batchsize'],
+    #     # shuffle=False,
+    #     shuffle=True,
+    #     drop_last=True,
+    # )
+    # dataset = OminiDataset_v1(transform=None)
+    # Single-image dataset for the diffusion update.
+    # NOTE(review): both loaders use shuffle=True without a DistributedSampler
+    # (which is imported at the top of the file), so under DDP every rank
+    # iterates the whole dataset - confirm this is intended.
+    dataset = OMDataset_indiv(transform=None)
+    train_loader = DataLoader(
+        dataset,
+        batch_size=hyp_parameters['batchsize'],
+        shuffle=True,
+        drop_last=True,
+    )
+    # datasetp = OminiDataset_paired(transform=None)
+    # Paired dataset for the registration update, with a reduced batch size.
+    datasetp = OMDataset_pair(transform=None)
+    train_loader_p = DataLoader(
+        datasetp,
+        batch_size=hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO,
+        shuffle=True,
+        drop_last=True,
+    )
+    Deformddpm = DeformDDPM(
+        network=Net(
+            n_steps=hyp_parameters["timesteps"],
+            ndims=hyp_parameters["ndims"],
+            num_input_chn = hyp_parameters["num_input_chn"],
+            res = hyp_parameters['img_size']
+            ),
+        n_steps=hyp_parameters["timesteps"],
+        image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
+        device=hyp_parameters["device"],
+        batch_size=hyp_parameters["batchsize"],
+        img_pad_mode=hyp_parameters["img_pad_mode"],
+        v_scale=hyp_parameters["v_scale"],
+    )
+    ddf_stn = STN(
+        img_sz=hyp_parameters["img_size"],
+        ndims=hyp_parameters["ndims"],
+        # padding_mode="zeros",
+        padding_mode=hyp_parameters["padding_mode"],
+        device=hyp_parameters["device"],
+    )
+    # The model is DDP-wrapped in distributed mode; ddf_stn is only moved to the device.
+    if use_distributed:
+        Deformddpm.to(rank)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank])
+        ddf_stn.to(rank)
+    else:
+        Deformddpm.to(hyp_parameters["device"])
+        ddf_stn.to(hyp_parameters["device"])
+    # ddf_stn = DDP(ddf_stn, device_ids=[rank])
+    # mse = nn.MSELoss()
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    # loss_reg regularises the diffusion DDF, loss_reg1 (larger outrange_thresh)
+    # the composed registration DDF.
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
+    loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.LNCC()
+    loss_imgmse = losses.LMSE()
+    # One optimizer serves both the diffusion and the registration update.
+    optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
+    # hyp_parameters["lr"]=0.00000001
+    # optimizer_regist = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01)
+    # optimizer_regist = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01, momentum=0.98)
+    # optimizer = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"], momentum=0.9)
+    # # LR scheduler ----- YHM
+    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, hyp_parameters["lr"], hyp_parameters["lr"]*10, step_size_up=500, step_size_down=500, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)
+    # Deformddpm.network.load_state_dict(torch.load('/home/data/jzheng/Adaptive_Motion_Generator-master/models/1000.pth'))
+    # check for existing models
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+    model_files = glob.glob(os.path.join(model_dir, "*.pth"))
+    # File names start with a zero-padded 6-digit epoch, so the lexicographic
+    # sort puts the most recent checkpoint last.
+    model_files.sort()
+    if model_files:
+        if gpu_id == 0:
+            print(model_files)
+        # NOTE(review): use_distributed is not forwarded, so ddp_load_dict falls
+        # back to its own default (True) even when the module flag is False.
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, model_files[-1])
+    else:
+        initial_epoch = 0
+    if gpu_id == 0:
+        print('len_train_data: ',len(dataset))
+    # Training loop
+    for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        # Running per-epoch means: every step adds loss / total.
+        epoch_loss_tot = 0.0
+        epoch_loss_gen_d = 0.0
+        epoch_loss_gen_a = 0.0
+        epoch_loss_reg = 0.0
+        epoch_loss_regist = 0.0
+        epoch_loss_imgsim = 0.0
+        epoch_loss_imgmse = 0.0
+        epoch_loss_ddfreg = 0.0
+        # Set model inside to train model
+        Deformddpm.train()
+        loss_nan_step = 0  # yu: count the number of nan loss steps
+        # zip() stops at the shorter loader, so that length is the step count.
+        total = min(len(train_loader), len(train_loader_p))
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # x0, _ = batch
+            # ==========================================================================
+            # diffusion train on single image
+            # x0 = batch # for omni dataset
+            [x0,embd] = batch # for om dataset
+            x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
+            # print('embd:', embd.shape)
+            # Text conditioning is dropped (set to None) with probability 1 - TEXT_EMBED_PROB.
+            if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd = embd.to(hyp_parameters["device"]).type(torch.float32)
+            else:
+                embd = None
+            n = x0.size()[0]  # batch_size -> n
+            x0 = x0.to(hyp_parameters["device"])
+            blind_mask = utils.get_random_deformed_mask(x0.shape[2:],apply_possibility=0.6).to(hyp_parameters["device"])
+            # random deformation + rotation
+            if hyp_parameters["ndims"]>2:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = utils.random_resample(x0, deform_scale=0)
+                # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
+                else:
+                    [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
+            # Intensity augmentation: optional thresholding, then one random
+            # scale and one random shift shared by the whole batch.
+            if hyp_parameters['noise_scale']>0:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
+                x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+            # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+            t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                hyp_parameters["device"]
+            )  # pick up a seq of rand number from 0 to 'timestep'
+            # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            # 'uncon' appears three times in the list, so it is drawn with probability 3/8.
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
+            # print('proc_type:', proc_type)
+            # NOTE(review): `.module` only exists on the DDP wrapper; with
+            # use_distributed = False this access presumably fails - confirm.
+            cond_img, _, cond_ratio = Deformddpm.module.proc_cond_img(x0,proc_type=proc_type)
+            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd)  # forward diffusion process
+            # print(torch.max(torch.abs(pre_dvf_I)))
+            # print(torch.max(torch.abs(dvf_I)))
+            loss_tot=0
+            loss_ddf = loss_reg(pre_dvf_I,img=x0)
+            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+            # The diffusion loss is scaled by sqrt(1 + MSK_EPS - cond_ratio).
+            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+            # >> JZ: print nan in x0
+            if torch.isnan(x0).any():
+                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+            # >> JZ: print loss of ddf
+            if loss_ddf>0.001:
+                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+            # yu: check if loss_tot==nan or inf
+            # A bad batch skips the diffusion AND the registration update of this step.
+            if torch.isnan(loss_tot) or torch.isinf(loss_tot):
+                print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                loss_nan_step += 1
+                continue
+            # NOTE(review): this check sits after the `continue` above, so it only
+            # fires on the next step with a finite loss - confirm placement.
+            if loss_nan_step > 5:
+                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
+            optimizer.zero_grad()
+            loss_tot.backward()
+            optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
+            # print(loss_gen_a.item())
+            # if 0:
+            # if loss_gen_a.item() < -0.3 and step%train_mode_ratio == 0:
+            if step%train_mode_ratio == 0:
+                # ==========================================================================
+                # registration train on paired images
+                # x1, y1 = next(iter(train_loader_p))
+                # [x1, y1, _, embd_y] = next(iter(train_loader_p))
+                [x1, y1, _, embd_y] = batch_p
+                if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                    # embd_x = embd_x.to(hyp_parameters["device"]).type(torch.float32)
+                    embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
+                else:
+                    # embd_x = None
+                    embd_y = None
+                x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
+                y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
+                n = x1.size()[0]  # batch_size -> n
+                # random deformation + rotation
+                # if hyp_parameters["ndims"]>2:
+                #     if np.random.uniform(0,1)<0.6:
+                #         x1 = utils.random_resample(x1, deform_scale=0)
+                #         y1 = utils.random_resample(y1, deform_scale=0)
+                # x1 = transformer(x1)
+                # y1 = transformer(y1)
+                [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
+                if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    # The same scale and shift are applied to both images of the pair.
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                    # x1 = thresh_img(x1, [0, 2*hyp_parameters['noise_scale']])
+                    # x1 = x1 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    # y1 = thresh_img(y1, [0, 2*hyp_parameters['noise_scale']])
+                    # y1 = y1 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                # # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+                # t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                #     hyp_parameters["device"]
+                # )  # pick up a seq of rand number from 0 to 'timestep'
+                # scale_regist = np.random.uniform(0.6,1.)
+                # T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist) + 1), 16), reverse=True)
+                # print('T_regist (0.6,1) sampling range:', T_regist)
+                # Draw 8..16 distinct timesteps from [timesteps*scale_regist, timesteps),
+                # sorted from largest to smallest.
+                # NOTE(review): random.sample raises ValueError when that range holds
+                # fewer values than select_timestep (small `timesteps`) - confirm.
+                scale_regist = np.random.uniform(0.0,0.7)
+                select_timestep = np.random.randint(8, 17)  # select a random number of timesteps to sample, between 8 and 16
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                # print('T_regist (0.1,0.7) sampling range:', T_regist)
+                # scale_regist = np.random.uniform(0.4,1.)
+                # T_regist = [int(hyp_parameters["timesteps"]*scale_regist)]
+                # scale_regist = np.random.uniform(0.6,1.)
+                # init_T = int(hyp_parameters["timesteps"] * scale_regist)
+                # T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist)), 2)+list(range(init_T,hyp_parameters["timesteps"]+1)), reverse=True)
+                # Repeat every timestep once per sample of the paired batch.
+                # NOTE(review): the repeat count is batchsize//2 whereas the loader uses
+                # batchsize//DIFF_REG_BATCH_RATIO; they agree only while the ratio is 2.
+                T_regist = [[t for _ in range(hyp_parameters["batchsize"]//2)] for t in T_regist]
+                # print('T_regist:', T_regist)
+                # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'none'])
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                # proc_type = random.choice(['project'])
+                y1_proc, msk_tgt, cond_ratio = Deformddpm.module.proc_cond_img(y1,proc_type=proc_type)
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                # loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                # loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>0.0))  # calculate loss for the registration process
+                # loss_ddf1 = loss_reg1(ddf_comp,img=y1,msk=(msk_tgt+MSK_EPS))  # calculate loss for the registration process
+                loss_sim = loss_imgsim(img_rec, y1, label=(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
+                loss_regist = 0
+                loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
+                loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
+                loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
+                # print('proc_type:', proc_type, 'cond_ratio:', cond_ratio.item())
+                # print('loss_regist:', loss_regist.item(), 'loss_sim:', loss_sim.item(), 'loss_ddf1:', loss_ddf1.item())
+                # >> JZ: print nan in x0
+                # NOTE(review): this re-checks x0 (the diffusion input) rather than
+                # x1 / y1 used in this branch - looks like a copy-paste, confirm.
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
+                    print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                    # # Print gradients for each parameter
+                    # for name, param in Deformddpm.named_parameters():
+                    #     if param.grad is not None:
+                    #         print(f"Gradient for {name}: {param.grad.norm()}")
+                    #     else:
+                    #         print(f"Gradient for {name}: None")
+                # The registration loss is scaled by sqrt(cond_ratio + MSK_EPS).
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                optimizer.zero_grad()
+                loss_regist.backward()
+                # Gradient clipping is applied to the registration update only.
+                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.4)
+                optimizer.step()
+                epoch_loss_regist += loss_regist.item() / total
+                epoch_loss_imgsim += loss_sim.item() / total
+                epoch_loss_imgmse += loss_mse.item() / total
+                epoch_loss_ddfreg += loss_ddf1.item() / total
+            if step % 10 == 0:
+                print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+                print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
+            # break   # FOR TESTING
+            # else:
+            #     print('loss_gen_a:',loss_gen_a.item())     # FOR TESTING
+            #     pass
+        # Epoch summary is printed by every rank (`if 1`), not just rank 0.
+        if 1:
+        # if gpu_id == 0:
+            print('==================')
+            print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_regist: {epoch_loss_regist} = {epoch_loss_imgsim} (imgsim) + {epoch_loss_imgmse} (imgmse) + {epoch_loss_ddfreg} (ddf)')
+            print('==================')
+        # # LR schedular step ----- YHM
+        # scheduler.step()
+        # Checkpoint on every epoch that is a multiple of epoch_per_save.
+        if 0 == epoch % epoch_per_save:
+            save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
+            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+            # break   # FOR TESTING
+            if not use_distributed:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+            elif gpu_id == 0:
+                # Under DDP only rank 0 writes, and it saves the unwrapped module.
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.module.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.module.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+    # Resource cleanup at the end of training
+    torch.cuda.empty_cache()
+    gc.collect()
+    if use_distributed and dist.is_initialized():
+        dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
+    if gpu_id == 0:
+    # if 0:
+        utils.print_memory_usage("Before Loading Model")
+        if 1:
+            gc.collect()
+            torch.cuda.empty_cache()
+        # Deformddpm.network.load_state_dict(torch.load(latest_model_file))
+        # Deformddpm.load_state_dict(torch.load(latest_model_file), strict=False)
+        checkpoint = torch.load(model_file)
+        # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
+        if use_distributed:
+            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        else:
+            Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        if load_strict:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        utils.print_memory_usage("After Loading Checkpoint on GPU")
+    if use_distributed:
+        # Broadcast model weights from rank 0 to all other GPUs
+        dist.barrier()
+        for param in Deformddpm.parameters():
+            dist.broadcast(param.data, src=0)  # Synchronize model across ranks
+        dist.barrier()
+        for param_group in optimizer.param_groups:
+            for param in param_group['params']:
+                if param.grad is not None:
+                    dist.broadcast(param.grad, src=0)  # Sync optimizer gradients
+    # initial_epoch = checkpoint['epoch'] + 1
+    # get the epoch number from the filename and add 1 to set as initial_epoch
+    initial_epoch = int(os.path.basename(model_file).split('.')[0][:6]) + 1
+    return initial_epoch, Deformddpm, optimizer
+if __name__ == "__main__":
+    if use_distributed:
+        world_size = torch.cuda.device_count()
+        print(f"Distributed GPU number = {world_size}")
+        mp.spawn(main_train,args = (world_size,),nprocs = world_size)
+    else:
+        main_train(0,1)

OM_train_2modes.py CHANGED Viewed

@@ -1,4 +1,8 @@
-import os
 import gc
 import torch
 import torchvision
@@ -48,12 +52,11 @@ use_distributed = True
 EPS = 1e-5
 MSK_EPS = 0.01
 TEXT_EMBED_PROB = 0.7
-AUG_RESAMPLE_PROB = 0.6
-LOSS_WEIGHTS_DIFF = [2.0, 1.0, 30]  # [ang, dist, reg]
 # LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
-# LOSS_WEIGHTS_REGIST = [10.0, 1.0, 1.0]  # [imgsim, imgmse, ddf]
-# LOSS_WEIGHTS_REGIST = [2.0, 0.1, 1e3]  # [imgsim, imgmse, ddf]
-LOSS_WEIGHTS_REGIST = [2.0, 0.1, 256]  # [imgsim, imgmse, ddf]
 # AUG_PERMUTE_PROB = 0.35
@@ -130,7 +133,7 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     datasetp = OMDataset_pair(transform=None)
     train_loader_p = DataLoader(
         datasetp,
-        batch_size=hyp_parameters['batchsize']//2,
         shuffle=True,
         drop_last=True,
     )
@@ -174,12 +177,15 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     # mse = nn.MSELoss()
     # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
     loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
     loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
     loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
     # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
     loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
-    loss_imgsim = losses.LNCC()
     loss_imgmse = losses.LMSE()
     optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
@@ -220,15 +226,15 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
         epoch_loss_ddfreg = 0.0
         # Set model inside to train model
         Deformddpm.train()
         loss_nan_step = 0  # yu: count the number of nan loss steps
         total = min(len(train_loader), len(train_loader_p))
-        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in enumerate(train_loader_omni):
             # x0, _ = batch
@@ -258,10 +264,10 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
                 else:
                     [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
-            x0 = transformer(x0)
             if hyp_parameters['noise_scale']>0:
                 if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
-                    x0 = thresh_img(x0, [0, 1*hyp_parameters['noise_scale']])
                 x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
             # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
@@ -270,12 +276,15 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
             )  # pick up a seq of rand number from 0 to 'timestep'
             # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
-            proc_type = random.choice(['adding', 'downsample', 'slice', 'none', 'uncon', 'uncon', 'uncon'])
             # print('proc_type:', proc_type)
             cond_img, _, cond_ratio = Deformddpm.module.proc_cond_img(x0,proc_type=proc_type)
             pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd)  # forward diffusion process
             loss_tot=0
             loss_ddf = loss_reg(pre_dvf_I,img=x0)
@@ -302,15 +311,14 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
                 raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
             optimizer.zero_grad()
             loss_tot.backward()
             optimizer.step()
-            epoch_loss_tot += loss_tot.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_gen_d += loss_gen_d.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_gen_a += loss_gen_a.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_reg += loss_ddf.item() * len(x0) / len(train_loader.dataset)
             # print(loss_gen_a.item())
             # if 0:
@@ -336,8 +344,8 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 #     if np.random.uniform(0,1)<0.6:
                 #         x1 = utils.random_resample(x1, deform_scale=0)
                 #         y1 = utils.random_resample(y1, deform_scale=0)
-                x1 = transformer(x1)
-                y1 = transformer(y1)
                 [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
                 if hyp_parameters['noise_scale']>0:
                     [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
@@ -355,10 +363,13 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 # )  # pick up a seq of rand number from 0 to 'timestep'
-                # scale_regist = np.random.uniform(0.2,0.25)
                 # T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist) + 1), 16), reverse=True)
-                scale_regist = np.random.uniform(0.05,0.7)
-                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), 16), reverse=True)
                 # scale_regist = np.random.uniform(0.4,1.)
                 # T_regist = [int(hyp_parameters["timesteps"]*scale_regist)]
                 # scale_regist = np.random.uniform(0.6,1.)
@@ -369,33 +380,30 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 # print('T_regist:', T_regist)
                 # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'none'])
-                proc_type = random.choice(['adding', 'downsample', 'slice', 'none', 'none'])
                 # proc_type = random.choice(['project'])
                 y1_proc, msk_tgt, cond_ratio = Deformddpm.module.proc_cond_img(y1,proc_type=proc_type)
-                # msk_tgt = msk_tgt + MSK_EPS
                 [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
-                # loss_ddf1 = loss_reg1(ddf_comp,img=y1,msk=msk_tgt)  # calculate loss for the registration process
-                # loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
-                # loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>0.0))  # calculate loss for the registration process
-                loss_sim = loss_imgsim(img_rec, y1, label=(y1>thresh_imgsim))  # calculate loss for the registration process
-                loss_mse = loss_imgmse(img_rec, y1, label=(y1>0.0))  # calculate loss for the registration process
-                loss_ddf1 = loss_reg1(ddf_comp,img=y1)  # calculate loss for the registration process
                 loss_regist = 0
                 loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
                 loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
                 loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
                 # print('proc_type:', proc_type, 'cond_ratio:', cond_ratio.item())
                 # print('loss_regist:', loss_regist.item(), 'loss_sim:', loss_sim.item(), 'loss_ddf1:', loss_ddf1.item())
                 # >> JZ: print nan in x0
                 if torch.isnan(x0).any():
                     print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
                 # >> JZ: print loss of ddf
-                if loss_ddf1>0.001:
                     print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
                     # # Print gradients for each parameter
                     # for name, param in Deformddpm.named_parameters():
@@ -403,43 +411,25 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                     #         print(f"Gradient for {name}: {param.grad.norm()}")
                     #     else:
                     #         print(f"Gradient for {name}: None")
                 loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
                 optimizer.zero_grad()
                 loss_regist.backward()
-                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.1)
                 optimizer.step()
-                epoch_loss_regist += loss_regist.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_imgsim += loss_sim.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_imgmse += loss_mse.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_ddfreg += loss_ddf1.item() * len(x0) / len(train_loader.dataset)
-            print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
-            print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
-            # >> JZ: if loss_imgsim is zero
-            if loss_sim.item()>-0.001:
-                print(f"*** Zero image similarity loss at epoch {epoch}, step {step}.")
-                def save_niftiimage(tensor, filename):
-                    import nibabel as nib
-                    import numpy as np
-                    array = tensor.squeeze().cpu().detach().numpy()
-                    nifti_img = nib.Nifti1Image(array, affine=np.eye(4))
-                    nib.save(nifti_img, filename)
-                # save the x1 and y1 images for debugging
-                save_path = os.path.join('/home/data/Github/OmniMorph/Log/error_files',f"debug_images_epoch{epoch}_step{step}/")
-                os.makedirs(save_path, exist_ok=True)
-                save_niftiimage(img_rec, os.path.join(save_path, 'img_rec.nii.gz'))
-                save_niftiimage(x1, os.path.join(save_path, 'x1.nii.gz'))
-                save_niftiimage(y1, os.path.join(save_path, 'y1.nii.gz'))
-                save_niftiimage(y1_proc, os.path.join(save_path, 'y1_proc.nii.gz'))
-                exit()
-            # print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
             # break   # FOR TESTING
             # else:
             #     print('loss_gen_a:',loss_gen_a.item())     # FOR TESTING
@@ -481,7 +471,7 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     if use_distributed and dist.is_initialized():
         dist.destroy_process_group()
-def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True):
     if gpu_id == 0:
     # if 0:
@@ -494,10 +484,11 @@ def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True
         checkpoint = torch.load(model_file)
         # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
         if use_distributed:
-            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'])
         else:
-            Deformddpm.load_state_dict(checkpoint['model_state_dict'])
-        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
         utils.print_memory_usage("After Loading Checkpoint on GPU")
     if use_distributed:

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
 import gc
 import torch
 import torchvision
 EPS = 1e-5
 MSK_EPS = 0.01
 TEXT_EMBED_PROB = 0.7
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [2.0, 2.0, 4.0]  # [ang, dist, reg]
 # LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.05, 128]  # [imgsim, imgmse, ddf]
+DIFF_REG_BATCH_RATIO = 2
 # AUG_PERMUTE_PROB = 0.35
     datasetp = OMDataset_pair(transform=None)
     train_loader_p = DataLoader(
         datasetp,
+        batch_size=hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO,
         shuffle=True,
         drop_last=True,
     )
     # mse = nn.MSELoss()
     # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
     loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
     loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
     loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
     # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
     loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    # loss_imgsim = losses.LNCC()
+    loss_imgsim = losses.MSLNCC()
     loss_imgmse = losses.LMSE()
     optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
         epoch_loss_ddfreg = 0.0
         # Set model inside to train model
         Deformddpm.train()
         loss_nan_step = 0  # yu: count the number of nan loss steps
         total = min(len(train_loader), len(train_loader_p))
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
             # x0, _ = batch
                 # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
                 else:
                     [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
             if hyp_parameters['noise_scale']>0:
                 if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
                 x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
             # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
             )  # pick up a seq of rand number from 0 to 'timestep'
             # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
             # print('proc_type:', proc_type)
             cond_img, _, cond_ratio = Deformddpm.module.proc_cond_img(x0,proc_type=proc_type)
             pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd)  # forward diffusion process
+            # print(torch.max(torch.abs(pre_dvf_I)))
+            # print(torch.max(torch.abs(dvf_I)))
             loss_tot=0
             loss_ddf = loss_reg(pre_dvf_I,img=x0)
                 print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
                 raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
             optimizer.zero_grad()
             loss_tot.backward()
             optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
             # print(loss_gen_a.item())
             # if 0:
                 #     if np.random.uniform(0,1)<0.6:
                 #         x1 = utils.random_resample(x1, deform_scale=0)
                 #         y1 = utils.random_resample(y1, deform_scale=0)
+                # x1 = transformer(x1)
+                # y1 = transformer(y1)
                 [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
                 if hyp_parameters['noise_scale']>0:
                     [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
                 # )  # pick up a seq of rand number from 0 to 'timestep'
+                # scale_regist = np.random.uniform(0.6,1.)
                 # T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist) + 1), 16), reverse=True)
+                # print('T_regist (0.6,1) sampling range:', T_regist)
+                scale_regist = np.random.uniform(0.0,0.7)
+                select_timestep = np.random.randint(8, 17)  # select a random number of timesteps to sample, between 8 and 16
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                # print('T_regist (0.1,0.7) sampling range:', T_regist)
                 # scale_regist = np.random.uniform(0.4,1.)
                 # T_regist = [int(hyp_parameters["timesteps"]*scale_regist)]
                 # scale_regist = np.random.uniform(0.6,1.)
                 # print('T_regist:', T_regist)
                 # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'none'])
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
                 # proc_type = random.choice(['project'])
                 y1_proc, msk_tgt, cond_ratio = Deformddpm.module.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
                 [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                # loss_ddf1 = loss_reg1(ddf_comp,img=y1,msk=(msk_tgt+MSK_EPS))  # calculate loss for the registration process
+                # loss_sim = loss_imgsim(img_rec, y1, label=(y1>thresh_imgsim))  # calculate loss for the registration process
+                # loss_mse = loss_imgmse(img_rec, y1, label=(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
                 loss_regist = 0
                 loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
                 loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
                 loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
                 # print('proc_type:', proc_type, 'cond_ratio:', cond_ratio.item())
                 # print('loss_regist:', loss_regist.item(), 'loss_sim:', loss_sim.item(), 'loss_ddf1:', loss_ddf1.item())
                 # >> JZ: print nan in x0
                 if torch.isnan(x0).any():
                     print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
                 # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
                     print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
                     # # Print gradients for each parameter
                     # for name, param in Deformddpm.named_parameters():
                     #         print(f"Gradient for {name}: {param.grad.norm()}")
                     #     else:
                     #         print(f"Gradient for {name}: None")
                 loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
                 optimizer.zero_grad()
                 loss_regist.backward()
+                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.2)
                 optimizer.step()
+                epoch_loss_regist += loss_regist.item() / total
+                epoch_loss_imgsim += loss_sim.item() / total
+                epoch_loss_imgmse += loss_mse.item() / total
+                epoch_loss_ddfreg += loss_ddf1.item() / total
+            if step % 10 == 0:
+                print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+                print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
             # break   # FOR TESTING
             # else:
             #     print('loss_gen_a:',loss_gen_a.item())     # FOR TESTING
     if use_distributed and dist.is_initialized():
         dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
     if gpu_id == 0:
     # if 0:
         checkpoint = torch.load(model_file)
         # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
         if use_distributed:
+            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
         else:
+            Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        if load_strict:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
         utils.print_memory_usage("After Loading Checkpoint on GPU")
     if use_distributed:

OM_train_3modes-XPU.py ADDED Viewed

	@@ -0,0 +1,957 @@

+import os, sys, contextlib
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import gc
+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam, SGD
+from Diffusion.diffuser import DeformDDPM
+from Diffusion.networks import get_net, STN
+from torchvision.transforms import Lambda
+import torch.nn.functional as F
+import Diffusion.losses as losses
+import random
+import glob
+import numpy as np
+import utils
+from tqdm import tqdm
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from Dataloader.dataloader_utils import thresh_img
+import yaml
+import argparse
+# XPU support: import Intel Extension for PyTorch and oneCCL bindings if available
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError:
+    ipex = None
+try:
+    import oneccl_bindings_for_pytorch
+except (ImportError, Exception) as e:
+    print(f"WARNING: Failed to import oneccl_bindings_for_pytorch: {e}")
+####################
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+# from torch.distributed import init_process_group
+###############
+def _device_available(device_type):
+    """Return whether an accelerator of ``device_type`` is usable ('xpu' checks torch.xpu, anything else checks CUDA)."""
+    if device_type != 'xpu':
+        return torch.cuda.is_available()
+    # torch.xpu may be missing on builds without XPU support
+    return hasattr(torch, 'xpu') and torch.xpu.is_available()
+def _device_count(device_type):
+    """Return the number of devices of ``device_type`` (0 for 'xpu' when torch has no xpu module)."""
+    if device_type != 'xpu':
+        return torch.cuda.device_count()
+    if not hasattr(torch, 'xpu'):
+        return 0
+    return torch.xpu.device_count()
+def _set_device(rank, device_type):
+    """Make device index ``rank`` current on torch.xpu ('xpu') or torch.cuda (anything else)."""
+    backend_module = torch.xpu if device_type == 'xpu' else torch.cuda
+    backend_module.set_device(rank)
+def _empty_cache(device_type):
+    """Call ``empty_cache()`` on torch.xpu for 'xpu' (when present), otherwise on CUDA if it is available."""
+    wants_xpu = device_type == 'xpu' and hasattr(torch, 'xpu')
+    if wants_xpu:
+        torch.xpu.empty_cache()
+        return
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def ddp_setup(rank, world_size):
+    """
+    Initialise the default process group and select this process's device.
+
+    The backend is "ccl" when DEVICE_TYPE is "xpu" and "nccl" otherwise.
+
+    Args:
+        rank: Unique identifier of each process (local_rank when launched by torchrun)
+        world_size: Total number of processes
+    """
+    backend = "nccl" if DEVICE_TYPE != "xpu" else "ccl"
+    local_rank = os.environ.get("LOCAL_RANK")
+    if local_rank is None:
+        # Single-node mp.spawn: the rendezvous address is fixed here
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+        _set_device(rank, DEVICE_TYPE)
+        return
+    # Launched by torchrun: MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE already set
+    dist.init_process_group(backend=backend)
+    _set_device(int(local_rank), DEVICE_TYPE)
+# Module-level training constants (fixed in code rather than read from the YAML config).
+EPS = 1e-5
+MSK_EPS = 0.01
+TEXT_EMBED_PROB = 0.5
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [2.0, 1.0, 4.0]  # [ang, dist, reg]
+# LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.01, 1e2]  # [imgsim, imgmse, ddf]
+# The paired loader uses a batch size of batchsize // DIFF_REG_BATCH_RATIO (at least 1).
+DIFF_REG_BATCH_RATIO = 2
+LOSS_WEIGHT_CONTRASTIVE = 1e-4
+REGISTRATION_STEP_RATIO = 1
+CONTRASTIVE_STEP_RATIO = 1
+MID_EPOCH_SAVE_STEPS = 10  # Save mid-epoch checkpoint every N steps for crash recovery.
+                           # XPU autograd leaks ~1.0 GiB/step of device memory (Intel bug).
+                           # With gradient checkpointing, training survives ~26 steps from fresh start,
+                           # but fewer when carrying leaked memory from previous epoch.
+                           # Save every 10 steps to minimize lost work on OOM crash.
+EXIT_CODE_RESTART = 42     # Exit code signaling proactive restart (not a crash).
+# AUG_PERMUTE_PROB = 0.35
+# Command-line interface. These are top-level statements, so arguments are parsed when the module is imported.
+parser = argparse.ArgumentParser()
+# config_file_path = 'Config/config_cmr.yaml'
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        # default="Config/config_cmr.yaml",
+        # default="Config/config_lct.yaml",
+        default="Config/config_all.yaml",
+        required=False,
+    )
+parser.add_argument("--dummy-samples", type=int, default=0, help="Use dummy random data for testing (0=use real data)")
+parser.add_argument("--batchsize", type=int, default=0, help="Override batch size from config (0=use config value)")
+parser.add_argument("--max-steps-before-restart", type=int, default=0,
+                    help="Proactive restart: exit after N training steps to reset XPU memory leak. "
+                         "0=disabled (rely on OOM crash + auto-resubmit). "
+                         "Recommended: 20 for XPU (survives ~26 steps max).")
+parser.add_argument("--no-save", action="store_true",
+                    help="Disable all checkpoint saving (for diagnostic/validation runs)")
+parser.add_argument("--reset-optimizer", action="store_true",
+                    help="Skip optimizer state loading from checkpoint (use when architecture changed)")
+parser.add_argument("--eval-only", action="store_true",
+                    help="Forward pass only: compute and print losses without backward/optimizer (no memory leak)")
+args = parser.parse_args()
+# Read config early to determine device type for DDP setup
+with open(args.config, 'r') as _f:
+    _cfg = yaml.safe_load(_f)
+# Falls back to 'cuda' when the config has no 'device' key.
+DEVICE_TYPE = _cfg.get('device', 'cuda')  # 'cuda' or 'xpu'
+# Auto-detect: use DDP only when multiple devices are available
+use_distributed = _device_available(DEVICE_TYPE) and _device_count(DEVICE_TYPE) > 1
+# use_distributed = True
+# use_distributed = False
+#=======================================================================================================================
+class _DummyIndiv(torch.utils.data.Dataset):
+    """Synthetic dataset of ``n`` items: a random (1, sz, sz, sz) float64 volume plus a random float32 vector of length ``embd_dim``."""
+    def __init__(self, n, sz, embd_dim=1024):
+        self.n = n
+        self.sz = sz
+        self.embd_dim = embd_dim
+    def __len__(self):
+        return self.n
+    def __getitem__(self, i):
+        # The index is ignored: every access draws fresh random data
+        side = self.sz
+        volume = np.random.rand(1, side, side, side).astype(np.float64)
+        embedding = np.random.randn(self.embd_dim).astype(np.float32)
+        return volume, embedding
+class _DummyPair(torch.utils.data.Dataset):
+    """Synthetic paired dataset of ``n`` items: two random (1, sz, sz, sz) float64 volumes followed by two random float32 vectors of length ``embd_dim``."""
+    def __init__(self, n, sz, embd_dim=1024):
+        self.n = n
+        self.sz = sz
+        self.embd_dim = embd_dim
+    def __len__(self):
+        return self.n
+    def __getitem__(self, i):
+        # The index is ignored: every access draws fresh random data
+        shape = (1, self.sz, self.sz, self.sz)
+        first_volume = np.random.rand(*shape).astype(np.float64)
+        second_volume = np.random.rand(*shape).astype(np.float64)
+        first_embedding = np.random.randn(self.embd_dim).astype(np.float32)
+        second_embedding = np.random.randn(self.embd_dim).astype(np.float32)
+        return (first_volume, second_volume, first_embedding, second_embedding)
+def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
+    if use_distributed:
+        ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized() and rank == 0:
+            print(f"World size: {torch.distributed.get_world_size()}")
+            print(f"Communication backend: {torch.distributed.get_backend()}")
+            print(f"PYTORCH_ALLOC_CONF: {os.environ.get('PYTORCH_ALLOC_CONF', 'not set')}")
+            if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                props = torch.xpu.get_device_properties(0)
+                print(f"XPU device: {props.name}, total memory: {props.total_memory / 1024**3:.2f} GiB")
+    # gpu_id = global rank (for save/print guards); rank = local device index
+    if "RANK" in os.environ:
+        gpu_id = int(os.environ["RANK"])
+        rank = int(os.environ["LOCAL_RANK"])
+    else:
+        gpu_id = rank
+    # Load the YAML file into a dictionary
+    with open(args.config, 'r') as file:
+        hyp_parameters = yaml.safe_load(file)
+    if args.batchsize > 0:
+        hyp_parameters['batchsize'] = args.batchsize
+    if gpu_id == 0:
+        print(hyp_parameters)
+    # epoch_per_save=10
+    epoch_per_save=hyp_parameters['epoch_per_save']
+    data_name=hyp_parameters['data_name']
+    net_name = hyp_parameters['net_name']
+    Net=get_net(net_name)
+    suffix_pth=f'_{data_name}_{net_name}.pth'
+    model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
+    model_dir=model_save_path
+    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
+    # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
+    # tsfm = torchvision.transforms.Compose([
+    #             torchvision.transforms.ToTensor(),
+    #             ])
+    # dataset = Data_Loader(target_res = [hyp_parameters["img_size"]]*hyp_parameters["ndims"], transforms=None, noise_scale=hyp_parameters['noise_scale'])
+    # train_loader = DataLoader(
+    #     dataset,
+    #     batch_size=hyp_parameters['batchsize'],
+    #     # shuffle=False,
+    #     shuffle=True,
+    #     drop_last=True,
+    # )
+    if args.dummy_samples > 0:
+        dataset = _DummyIndiv(args.dummy_samples, hyp_parameters['img_size'])
+        datasetp = _DummyPair(args.dummy_samples, hyp_parameters['img_size'])
+    else:
+        # dataset = OminiDataset_v1(transform=None)
+        dataset = OMDataset_indiv(transform=None)
+        # datasetp = OminiDataset_paired(transform=None)
+        datasetp = OMDataset_pair(transform=None)
+    if use_distributed:
+        sampler = DistributedSampler(dataset, shuffle=True)
+        sampler_p = DistributedSampler(datasetp, shuffle=True)
+    else:
+        sampler = None
+        sampler_p = None
+    train_loader = DataLoader(
+        dataset,
+        batch_size=hyp_parameters['batchsize'],
+        shuffle=(sampler is None),
+        drop_last=True,
+        sampler=sampler,
+    )
+    train_loader_p = DataLoader(
+        datasetp,
+        batch_size=max(1, hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO),
+        shuffle=(sampler_p is None),
+        drop_last=True,
+        sampler=sampler_p,
+    )
+    network = Net(
+        n_steps=hyp_parameters["timesteps"],
+        ndims=hyp_parameters["ndims"],
+        num_input_chn = hyp_parameters["num_input_chn"],
+        res = hyp_parameters['img_size']
+    )
+    # Enable gradient checkpointing on XPU to reduce peak activation memory.
+    # XPU autograd leaks ~1.0 GiB/step; lower peak buys more steps before OOM.
+    if DEVICE_TYPE == 'xpu' and hasattr(network, 'use_checkpoint'):
+        network.use_checkpoint = True
+        if gpu_id == 0:
+            print("  [init] Gradient checkpointing enabled for XPU", flush=True)
+    Deformddpm = DeformDDPM(
+        network=network,
+        n_steps=hyp_parameters["timesteps"],
+        image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
+        device=hyp_parameters["device"],
+        batch_size=hyp_parameters["batchsize"],
+        img_pad_mode=hyp_parameters["img_pad_mode"],
+        v_scale=hyp_parameters["v_scale"],
+    )
+    ddf_stn = STN(
+        img_sz=hyp_parameters["img_size"],
+        ndims=hyp_parameters["ndims"],
+        # padding_mode="zeros",
+        padding_mode=hyp_parameters["padding_mode"],
+        device=hyp_parameters["device"],
+    )
+    if use_distributed:
+        device = f"{DEVICE_TYPE}:{rank}"
+        # NO pre-allocation. CCL/oneDNN accumulate ~1.4 GiB/step of device memory outside
+        # PyTorch's caching allocator. Pre-allocating steals from that budget:
+        #   92% pre-alloc → crash at step 3, 78% → step 10, none (70% cap) → step 14.
+        # Instead, use empty_cache() between training phases to release unused cached memory
+        # back to the device for CCL/oneDNN.
+        if gpu_id == 0 and DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+            total_mem = torch.xpu.get_device_properties(rank).total_memory
+            print(f"  [init] XPU device memory: {total_mem/1024**3:.1f} GiB, no pre-allocation (relying on empty_cache between phases)", flush=True)
+        Deformddpm.to(device)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank], find_unused_parameters=True)
+        ddf_stn.to(device)
+    else:
+        Deformddpm.to(hyp_parameters["device"])
+        ddf_stn.to(hyp_parameters["device"])
+    # ddf_stn = DDP(ddf_stn, device_ids=[rank])
+    # mse = nn.MSELoss()
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
+    loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.MSLNCC()
+    loss_imgmse = losses.LMSE()
+    optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
+    # hyp_parameters["lr"]=0.00000001
+    # optimizer_regist = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01)
+    # optimizer_regist = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01, momentum=0.98)
+    # optimizer = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"], momentum=0.9)
+    # # LR scheduler ----- YHM
+    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, hyp_parameters["lr"], hyp_parameters["lr"]*10, step_size_up=500, step_size_down=500, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)
+    # Deformddpm.network.load_state_dict(torch.load('/home/data/jzheng/Adaptive_Motion_Generator-master/models/1000.pth'))
+    # check for existing models
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+    # Check for checkpoints: first check tmp/ for mid-epoch, then main dir for epoch-level
+    tmp_dir = os.path.join(model_dir, "tmp")
+    tmp_files = sorted(glob.glob(os.path.join(tmp_dir, "*.pth")))
+    model_files = sorted(glob.glob(os.path.join(model_dir, "*.pth")))
+    initial_step = 0
+    # Epoch stats and RNG states to restore when resuming from mid-epoch checkpoint
+    _resume_epoch_stats = None
+    _resume_rng = None
+    if tmp_files and not args.eval_only and args.max_steps_before_restart > 0:
+        # Mid-epoch checkpoint: only use when proactive restart is enabled
+        latest = tmp_files[-1]
+        if gpu_id == 0:
+            print(f"  [resume] Found mid-epoch checkpoint: {latest}")
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, latest, use_distributed=use_distributed)
+        basename = os.path.basename(latest)
+        initial_step = int(basename.split('_step')[1].split('_')[0].split('.')[0])
+        _ckpt = torch.load(latest, map_location='cpu', weights_only=False)
+        _resume_epoch_stats = _ckpt.get('epoch_stats', None)
+        del _ckpt
+        if gpu_id == 0:
+            print(f"  [resume] Resuming epoch {initial_epoch} from step {initial_step}"
+                  f"{' (with epoch_stats)' if _resume_epoch_stats else ''}", flush=True)
+    elif model_files:
+        if gpu_id == 0:
+            print(model_files)
+        latest = model_files[-1]
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, latest, use_distributed=use_distributed)
+    else:
+        initial_epoch = 0
+    if gpu_id == 0:
+        print('len_train_data: ',len(dataset))
+    # Proactive restart: track steps since process start to exit before OOM.
+    max_steps_restart = args.max_steps_before_restart
+    steps_since_start = 0
+    # Training loop
+    for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        if use_distributed and sampler is not None:
+            sampler.set_epoch(epoch)
+            sampler_p.set_epoch(epoch)
+        epoch_loss_tot = 0.0
+        epoch_loss_gen_d = 0.0
+        epoch_loss_gen_a = 0.0
+        epoch_loss_reg = 0.0
+        epoch_loss_regist = 0.0
+        epoch_loss_imgsim = 0.0
+        epoch_loss_imgmse = 0.0
+        epoch_loss_ddfreg = 0.0
+        epoch_loss_contrastive = 0.0
+        total_contra = 0
+        total_reg_restored = None
+        total_contra_restored = None
+        # Restore epoch accumulators from mid-epoch checkpoint (only for the resumed epoch)
+        if _resume_epoch_stats is not None and epoch == initial_epoch:
+            epoch_loss_tot = _resume_epoch_stats.get('epoch_loss_tot', 0.0)
+            epoch_loss_gen_d = _resume_epoch_stats.get('epoch_loss_gen_d', 0.0)
+            epoch_loss_gen_a = _resume_epoch_stats.get('epoch_loss_gen_a', 0.0)
+            epoch_loss_reg = _resume_epoch_stats.get('epoch_loss_reg', 0.0)
+            epoch_loss_regist = _resume_epoch_stats.get('epoch_loss_regist', 0.0)
+            epoch_loss_imgsim = _resume_epoch_stats.get('epoch_loss_imgsim', 0.0)
+            epoch_loss_imgmse = _resume_epoch_stats.get('epoch_loss_imgmse', 0.0)
+            epoch_loss_ddfreg = _resume_epoch_stats.get('epoch_loss_ddfreg', 0.0)
+            epoch_loss_contrastive = _resume_epoch_stats.get('epoch_loss_contrastive', 0.0)
+            total_reg_restored = _resume_epoch_stats.get('total_reg', None)
+            total_contra_restored = _resume_epoch_stats.get('total_contra', None)
+            loss_nan_step = _resume_epoch_stats.get('loss_nan_step', 0)
+            # RNG states are restored INSIDE the skip loop (at the last skipped step)
+            # to avoid DataLoader __getitem__ calls corrupting the restored state.
+            _resume_rng = {k: _resume_epoch_stats[k] for k in
+                           ('rng_torch', 'rng_numpy', 'rng_python', 'rng_xpu', 'rng_cuda')
+                           if k in _resume_epoch_stats}
+            if gpu_id == 0:
+                print(f"  [resume] Restored epoch stats from checkpoint (loss_tot={epoch_loss_tot:.4f})", flush=True)
+            _resume_epoch_stats = None  # Only restore once
+        else:
+            loss_nan_step = 0  # only reset when NOT resuming mid-epoch
+        # Switch the model to training mode
+        Deformddpm.train()
+        total = min(len(train_loader), len(train_loader_p))
+        total_reg = total // REGISTRATION_STEP_RATIO
+        # Restore total_reg and total_contra from checkpoint if available (mid-epoch resume)
+        if total_reg_restored is not None:
+            total_reg = total_reg_restored
+            total_reg_restored = None
+        if total_contra_restored is not None:
+            total_contra = total_contra_restored
+            total_contra_restored = None
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # Skip steps already completed (mid-epoch resume).
+            # Checkpoint at step N is saved AFTER step N's training completes,
+            # so step N itself must also be skipped (use <=, not <).
+            if epoch == initial_epoch and initial_step > 0 and step <= initial_step:
+                # Restore RNG at the last skipped step, AFTER DataLoader __getitem__
+                # has consumed RNG for all skipped batches. This way the first
+                # non-skipped step starts with exactly the saved RNG state.
+                if step == initial_step and _resume_rng is not None:
+                    # Restore rank 0's RNG as base state, then re-seed per-rank
+                    # so each rank has independent RNG (matching continuous run's
+                    # divergent-per-rank behavior). Without this, all ranks would
+                    # share rank 0's RNG → correlated augmentation/dropout decisions.
+                    if 'rng_torch' in _resume_rng:
+                        torch.set_rng_state(_resume_rng['rng_torch'])
+                    if 'rng_numpy' in _resume_rng:
+                        np.random.set_state(_resume_rng['rng_numpy'])
+                    if 'rng_python' in _resume_rng:
+                        random.setstate(_resume_rng['rng_python'])
+                    if 'rng_xpu' in _resume_rng and DEVICE_TYPE == 'xpu':
+                        torch.xpu.set_rng_state(_resume_rng['rng_xpu'])
+                    elif 'rng_cuda' in _resume_rng and torch.cuda.is_available():
+                        torch.cuda.set_rng_state(_resume_rng['rng_cuda'])
+                    # Per-rank re-seed: checkpoint only has rank 0's RNG state.
+                    # Advance each rank's RNG by a deterministic offset so they
+                    # diverge (as they would in a continuous run).
+                    if gpu_id > 0:
+                        rank_seed = gpu_id * 100003 + initial_step * 31
+                        torch.manual_seed(torch.initial_seed() + rank_seed)
+                        np.random.seed((np.random.get_state()[1][0] + rank_seed) % (2**31))
+                        random.seed(random.getrandbits(32) + rank_seed)
+                        if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                            torch.xpu.manual_seed(torch.initial_seed() + rank_seed)
+                        elif torch.cuda.is_available():
+                            torch.cuda.manual_seed(torch.initial_seed() + rank_seed)
+                    _resume_rng = None
+                    if gpu_id == 0:
+                        print(f"  [resume] RNG states restored at step {step} (per-rank re-seeded)", flush=True)
+                continue
+            # Free registration tensors from previous step
+            x1 = y1 = ddf_comp = img_rec = img_diff = None
+            ddf_rand = y1_proc = msk_tgt = img_save = None
+            loss_regist = loss_sim = loss_mse = loss_ddf1 = None
+            # Memory diagnostic (one per node via local rank 0) — only warn when abnormal
+            # Normal at step start: ~16 GiB reserved, ~48 GiB free (of 64 GiB total)
+            if rank == 0 and DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                torch.xpu.reset_peak_memory_stats(rank)
+                free_mem, total_mem_dev = torch.xpu.mem_get_info(rank)
+                used_gib = (total_mem_dev - free_mem) / 1024**3
+                if used_gib > 24:  # Normal is ~16 GiB at step start; warn if accumulating
+                    alloc = torch.xpu.memory_allocated() / 1024**3
+                    reserved = torch.xpu.memory_reserved() / 1024**3
+                    free_gib = free_mem / 1024**3
+                    print(f"  [mem WARNING] gpu_id={gpu_id} epoch {epoch} step {step}: "
+                          f"{used_gib:.1f} GiB used ({alloc:.1f} alloc / {reserved:.1f} reserved), "
+                          f"{free_gib:.1f} GiB free", flush=True)
+            # ==========================================================================
+            # diffusion train on single image
+            # x0 = batch # for omni dataset
+            [x0,embd] = batch # for om dataset
+            x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
+            # print('embd:', embd.shape)
+            embd_dev = embd.to(hyp_parameters["device"]).type(torch.float32)
+            if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd_in = embd_dev
+            else:
+                embd_in = None
+            n = x0.size()[0]  # batch_size -> n
+            x0 = x0.to(hyp_parameters["device"])
+            blind_mask = utils.get_random_deformed_mask(x0.shape[2:],apply_possibility=0.6).to(hyp_parameters["device"])
+            # random deformation + rotation
+            if hyp_parameters["ndims"]>2:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = utils.random_resample(x0, deform_scale=0)
+                # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
+                else:
+                    [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
+            if hyp_parameters['noise_scale']>0:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
+                x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+            # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+            t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                hyp_parameters["device"]
+            )  # draw one random timestep index in [0, timesteps) for each of the n images in the batch
+            # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
+            # print('proc_type:', proc_type)
+            ddpm = Deformddpm.module if use_distributed else Deformddpm
+            cond_img, _, cond_ratio = ddpm.proc_cond_img(x0,proc_type=proc_type)
+            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd_in)  # forward diffusion process
+            loss_tot=0
+            loss_ddf = loss_reg(pre_dvf_I,img=x0)
+            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+            # >> JZ: print nan in x0
+            if torch.isnan(x0).any():
+                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+            # >> JZ: print loss of ddf
+            if loss_ddf>0.001:
+                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+            # yu: check if loss_tot==nan or inf
+            # Synchronize NaN skip across all DDP ranks to avoid collective desync
+            # Use broadcast from rank 0 instead of all_reduce to avoid CCL hang on single-node XPU
+            is_nan = torch.isnan(loss_tot) or torch.isinf(loss_tot)
+            if use_distributed:
+                nan_flag = torch.tensor([1.0 if is_nan else 0.0], device=f"{DEVICE_TYPE}:{rank}")
+                dist.broadcast(nan_flag, src=0)
+                is_nan = nan_flag.item() > 0
+            if is_nan:
+                if gpu_id == 0:
+                    print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                loss_nan_step += 1
+                continue
+            if loss_nan_step > 5:
+                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
+            # ==========================================================================
+            # Diffusion backward (no gradient clipping — diffusion dominates training)
+            if not args.eval_only:
+                optimizer.zero_grad()
+                loss_tot.backward()
+                optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
+            # Print running average every 20 steps in eval-only mode
+            if args.eval_only and gpu_id == 0 and (step + 1) % 20 == 0:
+                n = step + 1
+                print(f"  [eval] step {step}: running_avg ang={epoch_loss_gen_a*total/n:.4f} "
+                      f"dist={epoch_loss_gen_d*total/n:.4f} regul={epoch_loss_reg*total/n:.6f}", flush=True)
+            # Free diffusion intermediates and aggressively release all memory to device.
+            # XPU runtime leaks ~1.3 GiB/step outside the caching allocator.
+            # gc.collect() + synchronize() + empty_cache() attempts to reclaim deferred/lazy allocations.
+            loss_gen_a_val = loss_gen_a.item()
+            del pre_dvf_I, dvf_I, trm_pred, loss_tot, loss_gen_a, loss_gen_d, loss_ddf
+            gc.collect()
+            if DEVICE_TYPE == 'xpu':
+                torch.xpu.synchronize()
+                _empty_cache(DEVICE_TYPE)
+            # Sync loss_gen_a across DDP ranks for contrastive and registration gating
+            if use_distributed:
+                loss_gen_a_sync = torch.tensor([loss_gen_a_val], device=f"{DEVICE_TYPE}:{rank}")
+                dist.broadcast(loss_gen_a_sync, src=0)
+                loss_gen_a_gate = loss_gen_a_sync.item()
+            else:
+                loss_gen_a_gate = loss_gen_a_val
+            # ==========================================================================
+            # Contrastive train on single image (text-image alignment)
+            # Separate backward with gradient clipping to prevent destabilizing diffusion.
+            loss_contra_val = None
+            if step % CONTRASTIVE_STEP_RATIO == 0:
+                n_contra = x0.size()[0]
+                t_contra = torch.randint(0, hyp_parameters["timesteps"], (n_contra,)).to(hyp_parameters["device"])
+                # Route through DDP wrapper and return img_embd directly so DDP
+                # traces the correct subgraph (encoder + mid + attn + img2txt).
+                img_embd = Deformddpm(img_org=(x0 * blind_mask).detach(), cond_imgs=cond_img.detach(), T=t_contra, output_embedding=True, text=None)  # [B, 1024]
+                loss_contra = LOSS_WEIGHT_CONTRASTIVE * F.relu(1 - F.cosine_similarity(img_embd, embd_dev, dim=-1).mean()-0.25)
+                if not args.eval_only:
+                    optimizer.zero_grad()
+                    loss_contra.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=1e-3)
+                    optimizer.step()
+                loss_contra_val = loss_contra.item()
+                epoch_loss_contrastive += loss_contra_val / total * CONTRASTIVE_STEP_RATIO
+            # Free remaining intermediates and aggressively release memory before registration
+            if cond_img is not None:
+                del cond_img
+            if blind_mask is not None:
+                del blind_mask
+            gc.collect()
+            if DEVICE_TYPE == 'xpu':
+                torch.xpu.synchronize()
+            _empty_cache(DEVICE_TYPE)
+            # ==========================================================================
+            # registration train on paired images
+            # loss_gen_a_gate already synced across DDP ranks above
+            do_regist = step % REGISTRATION_STEP_RATIO == 0 and loss_gen_a_gate < -0.8
+            if do_regist:
+                [x1, y1, _, embd_y] = batch_p
+                if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                    embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
+                else:
+                    embd_y = None
+                x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
+                y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
+                n = x1.size()[0]  # batch_size -> n
+                [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
+                if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                scale_regist = np.random.uniform(0.0,0.5)
+                select_timestep = np.random.randint(12, 32)  # number of timesteps to sample; randint's upper bound is exclusive, so this is 12..31
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                T_regist = [[t for _ in range(max(1, hyp_parameters["batchsize"]//2))] for t in T_regist]
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                ddpm_inner = Deformddpm.module if use_distributed else Deformddpm
+                y1_proc, msk_tgt, cond_ratio = ddpm_inner.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
+                loss_regist = 0
+                loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
+                loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
+                loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
+                # >> JZ: print nan in x0
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
+                    print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                if not args.eval_only:
+                    optimizer.zero_grad()
+                    loss_regist.backward()
+                    # torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.1)
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.02)
+                    optimizer.step()
+                epoch_loss_regist += loss_regist.item()
+                epoch_loss_imgsim += loss_sim.item()
+                epoch_loss_imgmse += loss_mse.item()
+                epoch_loss_ddfreg += loss_ddf1.item()
+            else:
+                loss_sim = torch.tensor(0.0)
+                loss_mse = torch.tensor(0.0)
+                loss_ddf1 = torch.tensor(0.0)
+                loss_regist = torch.tensor(0.0)
+                if step % REGISTRATION_STEP_RATIO==0:
+                    total_reg = total_reg-1
+            # Mid-epoch checkpoint and proactive restart (only when --max-steps-before-restart > 0)
+            if max_steps_restart > 0 and step > 0 and step % MID_EPOCH_SAVE_STEPS == 0 and gpu_id == 0 and not args.no_save:
+                _epoch_stats = {
+                    'epoch_loss_tot': epoch_loss_tot,
+                    'epoch_loss_gen_d': epoch_loss_gen_d,
+                    'epoch_loss_gen_a': epoch_loss_gen_a,
+                    'epoch_loss_reg': epoch_loss_reg,
+                    'epoch_loss_regist': epoch_loss_regist,
+                    'epoch_loss_imgsim': epoch_loss_imgsim,
+                    'epoch_loss_imgmse': epoch_loss_imgmse,
+                    'epoch_loss_ddfreg': epoch_loss_ddfreg,
+                    'epoch_loss_contrastive': epoch_loss_contrastive,
+                    'total_reg': total_reg,
+                    'total_contra': total_contra,
+                    'loss_nan_step': loss_nan_step,
+                    'rng_torch': torch.get_rng_state(),
+                    'rng_numpy': np.random.get_state(),
+                    'rng_python': random.getstate(),
+                    **(({'rng_xpu': torch.xpu.get_rng_state()} if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu') else
+                        {'rng_cuda': torch.cuda.get_rng_state()} if torch.cuda.is_available() else {})),
+                }
+                tmp_dir = os.path.join(model_save_path, "tmp")
+                os.makedirs(tmp_dir, exist_ok=True)
+                for old_f in glob.glob(os.path.join(tmp_dir, "*.pth")):
+                    os.remove(old_f)
+                mid_save = os.path.join(tmp_dir, f"{epoch:06d}_step{step:04d}{suffix_pth}")
+                state = Deformddpm.module.state_dict() if use_distributed else Deformddpm.state_dict()
+                torch.save({
+                    'model_state_dict': state,
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch,
+                    'step': step,
+                    'epoch_stats': _epoch_stats,
+                }, mid_save)
+                print(f"  [mid-epoch] Saved checkpoint at epoch {epoch} step {step}: {mid_save}", flush=True)
+            # Proactive restart: exit cleanly after N steps to reset XPU memory leak.
+            # The bash wrapper will re-launch srun within the same SLURM allocation.
+            steps_since_start += 1
+            if max_steps_restart > 0 and steps_since_start >= max_steps_restart:
+                # Save checkpoint at current position (if not just saved above)
+                if not (step > 0 and step % MID_EPOCH_SAVE_STEPS == 0) and gpu_id == 0 and not args.no_save:
+                    _epoch_stats = {
+                        'epoch_loss_tot': epoch_loss_tot, 'epoch_loss_gen_d': epoch_loss_gen_d,
+                        'epoch_loss_gen_a': epoch_loss_gen_a, 'epoch_loss_reg': epoch_loss_reg,
+                        'epoch_loss_regist': epoch_loss_regist, 'epoch_loss_imgsim': epoch_loss_imgsim,
+                        'epoch_loss_imgmse': epoch_loss_imgmse, 'epoch_loss_ddfreg': epoch_loss_ddfreg,
+                        'epoch_loss_contrastive': epoch_loss_contrastive, 'total_reg': total_reg, 'total_contra': total_contra,
+                        'loss_nan_step': loss_nan_step,
+                        'rng_torch': torch.get_rng_state(), 'rng_numpy': np.random.get_state(),
+                        'rng_python': random.getstate(),
+                        **(({'rng_xpu': torch.xpu.get_rng_state()} if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu') else
+                            {'rng_cuda': torch.cuda.get_rng_state()} if torch.cuda.is_available() else {})),
+                    }
+                    tmp_dir = os.path.join(model_save_path, "tmp")
+                    os.makedirs(tmp_dir, exist_ok=True)
+                    for old_f in glob.glob(os.path.join(tmp_dir, "*.pth")):
+                        os.remove(old_f)
+                    mid_save = os.path.join(tmp_dir, f"{epoch:06d}_step{step:04d}{suffix_pth}")
+                    state = Deformddpm.module.state_dict() if use_distributed else Deformddpm.state_dict()
+                    torch.save({
+                        'model_state_dict': state,
+                        'optimizer_state_dict': optimizer.state_dict(),
+                        'epoch': epoch,
+                        'step': step,
+                        'epoch_stats': _epoch_stats,
+                    }, mid_save)
+                    print(f"  [restart] Saved checkpoint at epoch {epoch} step {step}: {mid_save}", flush=True)
+                if gpu_id == 0:
+                    print(f"  [restart] Proactive restart after {steps_since_start} steps "
+                          f"(limit {max_steps_restart}). Exiting with code {EXIT_CODE_RESTART}.", flush=True)
+                # Clean shutdown
+                _empty_cache(DEVICE_TYPE)
+                gc.collect()
+                if use_distributed and dist.is_initialized():
+                    dist.barrier()
+                    dist.destroy_process_group()
+                sys.exit(EXIT_CODE_RESTART)
+        if gpu_id == 0:
+            print('==================')
+            print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_contrastive: {epoch_loss_contrastive}')
+            total_reg_safe = max(total_reg, 1)
+            print(f'     loss_regist: {epoch_loss_regist/total_reg_safe} = {epoch_loss_imgsim/total_reg_safe} (imgsim) + {epoch_loss_imgmse/total_reg_safe} (imgmse) + {epoch_loss_ddfreg/total_reg_safe} (ddf)')
+            print('==================')
+        if 0 == epoch % epoch_per_save and not args.no_save:
+            save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
+            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+            # break   # FOR TESTING
+            if not use_distributed:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+            elif gpu_id == 0:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.module.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.module.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+        # Clean up tmp/ mid-epoch checkpoints after completed epoch
+        if gpu_id == 0 and not args.no_save:
+            tmp_dir = os.path.join(model_dir, "tmp")
+            tmp_pths = glob.glob(os.path.join(tmp_dir, "*.pth"))
+            if tmp_pths:
+                for f in tmp_pths:
+                    os.remove(f)
+                print(f"  [cleanup] Cleared {len(tmp_pths)} tmp/ mid-epoch checkpoints", flush=True)
+        # Reset initial_step after first epoch completes (no more skipping)
+        initial_step = 0
+        # XPU CCL workaround: restart after each epoch to avoid CCL hang on 2nd epoch.
+        # CCL's Level Zero IPC handles accumulate and cause deadlock after ~200+ collectives.
+        # A fresh process resets the L0 context. The bash loop catches exit code 42 and restarts.
+        if DEVICE_TYPE == 'xpu' and use_distributed:
+            if gpu_id == 0:
+                print(f"  [xpu-restart] Epoch {epoch} done. Restarting to reset CCL state.", flush=True)
+            _empty_cache(DEVICE_TYPE)
+            gc.collect()
+            if dist.is_initialized():
+                dist.barrier()
+                dist.destroy_process_group()
+            sys.exit(EXIT_CODE_RESTART)
+    # Resource cleanup at the end of training
+    _empty_cache(DEVICE_TYPE)
+    gc.collect()
+    if use_distributed and dist.is_initialized():
+        dist.destroy_process_group()
def _optimizer_entry_fits(entry, param):
    """Return True when every non-scalar tensor of a saved optimizer entry has ``param``'s shape."""
    return all(
        value.shape == param.shape
        for value in entry.values()
        if isinstance(value, torch.Tensor) and value.dim() > 0
    )


def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
    """Restore model weights and, where compatible, optimizer state from a checkpoint.

    Every rank reads the checkpoint itself so that the optimizer state is the
    same on all DDP processes: only the model parameters are broadcast
    afterwards, the per-parameter optimizer state is not.

    Args:
        gpu_id: Rank used to guard log output; only rank 0 prints.
        Deformddpm: Model to load into. When ``use_distributed`` is true the
            weights are loaded into ``Deformddpm.module``.
        optimizer: Optimizer to restore. Left untouched when the checkpoint has
            no ``optimizer_state_dict`` or ``--reset-optimizer`` was passed.
        model_file: Checkpoint path. The first six characters of its basename
            are parsed as the epoch number; a ``_step`` substring in the
            basename marks a mid-epoch checkpoint.
        use_distributed: Whether the model is wrapped and a process group is
            available for the barrier/broadcast.
        load_strict: Forwarded as ``strict`` to the model's ``load_state_dict``.

    Returns:
        Tuple ``(initial_epoch, Deformddpm, optimizer)``. ``initial_epoch`` is
        the epoch from the filename for a mid-epoch checkpoint and that epoch
        plus one otherwise.

    Raises:
        ValueError: If the first six characters of the basename are not an integer.
    """
    gc.collect()
    _empty_cache(DEVICE_TYPE)
    if gpu_id == 0:
        utils.print_memory_usage("Before Loading Model")

    # NOTE: weights_only=False unpickles arbitrary Python objects; only load
    # checkpoints that come from a trusted source.
    checkpoint = torch.load(model_file, map_location='cpu', weights_only=False)
    target = Deformddpm.module if use_distributed else Deformddpm
    target.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)

    # Restore optimizer state when available (needed for mid-epoch resume).
    # Entries whose tensors no longer match the parameter shapes (e.g. after a
    # layer was replaced by one with a different kernel shape) are skipped, so
    # those parameters start with fresh optimizer state.
    if 'optimizer_state_dict' in checkpoint and not args.reset_optimizer:
        saved_opt = checkpoint['optimizer_state_dict']
        saved_state = saved_opt.get('state', {})
        saved_groups = saved_opt.get('param_groups', [])
        param_list = [p for group in optimizer.param_groups for p in group['params']]

        # optimizer.load_state_dict() raises ValueError when the number of
        # groups or the number of parameters per group differs, so the fast
        # path is only safe when the layout is identical.
        same_layout = len(saved_groups) == len(optimizer.param_groups) and all(
            len(saved_g.get('params', [])) == len(group['params'])
            for saved_g, group in zip(saved_groups, optimizer.param_groups)
        )
        all_match = same_layout and all(
            _optimizer_entry_fits(entry, param_list[int(idx)])
            for idx, entry in saved_state.items()
            if int(idx) < len(param_list)
        )

        if all_match:
            optimizer.load_state_dict(saved_opt)
        else:
            # Selective load: restore param_groups settings (lr, betas, etc.)
            for saved_g, group in zip(saved_groups, optimizer.param_groups):
                for key, value in saved_g.items():
                    if key != 'params':
                        group[key] = value

            # Restore per-parameter state only where an index-matched parameter
            # with compatible shapes exists. Matching is positional, as in the
            # checkpoint's own indexing.
            restored = 0
            skipped = 0
            for idx, entry in saved_state.items():
                position = int(idx)
                if position >= len(param_list) or not _optimizer_entry_fits(entry, param_list[position]):
                    skipped += 1
                    continue
                p = param_list[position]
                # Cast state tensors to the parameter's dtype/device; 0-dim
                # tensors (such as step counters) are kept as stored.
                optimizer.state[p] = {
                    key: (value.to(dtype=p.dtype, device=p.device)
                          if isinstance(value, torch.Tensor) and value.dim() > 0 else value)
                    for key, value in entry.items()
                }
                restored += 1
            if gpu_id == 0:
                print(f"  [checkpoint] Selective optimizer load: {restored} params restored, "
                      f"{skipped} skipped (no shape-compatible parameter, fresh Adam for those)", flush=True)
    elif args.reset_optimizer and gpu_id == 0:
        print("  [checkpoint] --reset-optimizer: skipping optimizer state, starting fresh Adam", flush=True)

    # Drop the CPU copy of the checkpoint before continuing.
    del checkpoint
    if gpu_id == 0:
        utils.print_memory_usage("After Loading Checkpoint on GPU")

    if use_distributed:
        # Broadcast model weights from rank 0 to ensure exact consistency
        dist.barrier()
        for param in Deformddpm.parameters():
            dist.broadcast(param.data, src=0)

    # The epoch number is encoded in the first six characters of the filename.
    basename = os.path.basename(model_file)
    epoch_from_file = int(basename[:6])
    if '_step' in basename:
        # Mid-epoch checkpoint: resume at same epoch (don't +1)
        initial_epoch = epoch_from_file
    else:
        # End-of-epoch checkpoint: start next epoch
        initial_epoch = epoch_from_file + 1
    return initial_epoch, Deformddpm, optimizer
if __name__ == "__main__":
    # Three launch paths: an external launcher that exports LOCAL_RANK,
    # a plain single-process run, or self-spawned workers on one node.
    launched_externally = "LOCAL_RANK" in os.environ
    if launched_externally:
        # Multi-node: launched by torchrun / srun
        use_distributed = True
        local_rank = int(os.environ["LOCAL_RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        print(f"torchrun launch: LOCAL_RANK={local_rank}, RANK={os.environ.get('RANK')}, WORLD_SIZE={world_size}")
        try:
            main_train(local_rank, world_size)
        except Exception:
            # Print a clearly delimited, per-rank traceback, then propagate
            # the failure so the launcher sees a non-zero exit.
            import traceback
            print(f"\n{'='*60}\nRANK {os.environ.get('RANK')} FAILED:\n{'='*60}", flush=True)
            traceback.print_exc()
            raise
    elif not use_distributed:
        # Single process, single device.
        main_train(0,1)
    else:
        # Single-node multi-GPU: use mp.spawn, one worker per visible device.
        world_size = _device_count(DEVICE_TYPE)
        print(f"Distributed {DEVICE_TYPE.upper()} device number = {world_size}")
        mp.spawn(main_train,args = (world_size,),nprocs = world_size)

OM_train_3modes.py CHANGED Viewed

@@ -1,4 +1,8 @@
-import os
 import gc
 import torch
 import torchvision
@@ -9,21 +13,32 @@ from torch.utils.data import DataLoader
 from torch.optim import Adam, SGD
 from Diffusion.diffuser import DeformDDPM
 from Diffusion.networks import get_net, STN
-from torchvision.transforms import Lambda
 import Diffusion.losses as losses
 import random
 import glob
 import numpy as np
 import utils
-from tqdm import tqdm
-from Dataloader.dataloader0 import get_dataloader
 from Dataloader.dataLoader import *
 from Dataloader.dataloader_utils import thresh_img
 import yaml
 import argparse
 ####################
 import torch.multiprocessing as mp
 from torch.utils.data.distributed import DistributedSampler
@@ -31,27 +46,66 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 import torch.distributed as dist
 # from torch.distributed import init_process_group
 ###############
 def ddp_setup(rank, world_size):
     """
     Args:
-        rank: Unique identifier of each process
         world_size: Total number of processes
     """
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
-    torch.cuda.set_device(rank)
-use_distributed = True
-# use_distributed = False
 EPS = 1e-5
 MSK_EPS = 0.01
-TEXT_EMBED_PROB = 0.7
-AUG_RESAMPLE_PROB = 0.6
-LOSS_WEIGHTS_DIFF = [2.0, 1.0, 3.0]  # [ang, dist, reg]
 # LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
-LOSS_WEIGHTS_REGIST = [1.0, 0.2, 1e3]  # [imgsim, imgmse, ddf]
 # AUG_PERMUTE_PROB = 0.35
@@ -68,23 +122,73 @@ parser.add_argument(
         default="Config/config_all.yaml",
         required=False,
     )
 args = parser.parse_args()
 #=======================================================================================================================
 def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     if use_distributed:
         ddp_setup(rank,world_size)
-        if torch.distributed.is_initialized():
             print(f"World size: {torch.distributed.get_world_size()}")
             print(f"Communication backend: {torch.distributed.get_backend()}")
-    gpu_id = rank
     # Load the YAML file into a dictionary
     with open(args.config, 'r') as file:
         hyp_parameters = yaml.safe_load(file)
         print(hyp_parameters)
     # epoch_per_save=10
@@ -98,7 +202,7 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     suffix_pth=f'_{data_name}_{net_name}.pth'
     model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
     model_dir=model_save_path
-    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
     # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
@@ -115,33 +219,54 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     #     drop_last=True,
     # )
-    # dataset = OminiDataset_v1(transform=None)
-    dataset = OMDataset_indiv(transform=None)
     train_loader = DataLoader(
         dataset,
         batch_size=hyp_parameters['batchsize'],
-        shuffle=True,
         drop_last=True,
     )
-    # datasetp = OminiDataset_paired(transform=None)
-    datasetp = OMDataset_pair(transform=None)
     train_loader_p = DataLoader(
         datasetp,
-        batch_size=hyp_parameters['batchsize']//2,
-        shuffle=True,
         drop_last=True,
     )
     Deformddpm = DeformDDPM(
-        network=Net(
-            n_steps=hyp_parameters["timesteps"],
-            ndims=hyp_parameters["ndims"],
-            num_input_chn = hyp_parameters["num_input_chn"],
-            res = hyp_parameters['img_size']
-            ),
         n_steps=hyp_parameters["timesteps"],
         image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
         device=hyp_parameters["device"],
@@ -161,9 +286,18 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     if use_distributed:
-        Deformddpm.to(rank)
-        Deformddpm = DDP(Deformddpm, device_ids=[rank])
-        ddf_stn.to(rank)
     else:
         Deformddpm.to(hyp_parameters["device"])
         ddf_stn.to(hyp_parameters["device"])
@@ -172,12 +306,14 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     # mse = nn.MSELoss()
     # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
-    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e2)
-    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e2)
     loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
     # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
     loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
-    loss_imgsim = losses.LNCC()
     loss_imgmse = losses.LMSE()
     optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
@@ -194,19 +330,51 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     # check for existing models
     if not os.path.exists(model_dir):
         os.makedirs(model_dir, exist_ok=True)
-    model_files = glob.glob(os.path.join(model_dir, "*.pth"))
-    model_files.sort()
-    if model_files:
         if gpu_id == 0:
             print(model_files)
-        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, model_files[-1])
     else:
         initial_epoch = 0
     if gpu_id == 0:
         print('len_train_data: ',len(dataset))
     # Training loop
     for epoch in range(initial_epoch,hyp_parameters["epoch"]):
         epoch_loss_tot = 0.0
         epoch_loss_gen_d = 0.0
@@ -216,17 +384,110 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
         epoch_loss_imgsim = 0.0
         epoch_loss_imgmse = 0.0
         epoch_loss_ddfreg = 0.0
         # Set model inside to train model
         Deformddpm.train()
-        loss_nan_step = 0  # yu: count the number of nan loss steps
-        for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in enumerate(train_loader_omni):
-            # x0, _ = batch
             # ==========================================================================
             # diffusion train on single image
@@ -235,12 +496,11 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
             [x0,embd] = batch # for om dataset
             x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
             # print('embd:', embd.shape)
             if np.random.uniform(0,1)<TEXT_EMBED_PROB:
-                embd = embd.to(hyp_parameters["device"]).type(torch.float32)
             else:
-                embd = None
             n = x0.size()[0]  # batch_size -> n
             x0 = x0.to(hyp_parameters["device"])
@@ -254,10 +514,10 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                 # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
                 else:
                     [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
-            x0 = transformer(x0)
             if hyp_parameters['noise_scale']>0:
                 if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
-                    x0 = thresh_img(x0, [0, 1*hyp_parameters['noise_scale']])
                 x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
             # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
@@ -266,157 +526,301 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
             )  # pick up a seq of rand number from 0 to 'timestep'
             # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
-            proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'none', 'uncon', 'uncon', 'uncon'])
             # print('proc_type:', proc_type)
-            cond_img, _, cond_ratio = Deformddpm.module.proc_cond_img(x0,proc_type=proc_type)
-            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd)  # forward diffusion process
-            loss_tot=0
-            loss_ddf = loss_reg(pre_dvf_I,img=x0)
-            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
-            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
-            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
-            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
-            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
-            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
-            # >> JZ: print nan in x0
-            if torch.isnan(x0).any():
-                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
-            # >> JZ: print loss of ddf
-            if loss_ddf>0.001:
-                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
-            # yu: check if loss_tot==nan or inf
-            if torch.isnan(loss_tot) or torch.isinf(loss_tot):
-                print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
-                loss_nan_step += 1
-                continue
-            if loss_nan_step > 5:
-                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
-                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
-            optimizer.zero_grad()
-            loss_tot.backward()
-            optimizer.step()
-            epoch_loss_tot += loss_tot.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_gen_d += loss_gen_d.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_gen_a += loss_gen_a.item() * len(x0) / len(train_loader.dataset)
-            epoch_loss_reg += loss_ddf.item() * len(x0) / len(train_loader.dataset)
-            # print(loss_gen_a.item())
-            # if 0:
-            # if loss_gen_a.item() < -0.3 and step%train_mode_ratio == 0:
-            if step%train_mode_ratio == 0:
                 # ==========================================================================
-                # registration train on paired images
-                # x1, y1 = next(iter(train_loader_p))
-                [x1, y1, _, embd_y] = next(iter(train_loader_p))
                 if np.random.uniform(0,1)<TEXT_EMBED_PROB:
-                    # embd_x = embd_x.to(hyp_parameters["device"]).type(torch.float32)
                     embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
                 else:
-                    # embd_x = None
                     embd_y = None
                 x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
                 y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
                 n = x1.size()[0]  # batch_size -> n
-                # random deformation + rotation
-                # if hyp_parameters["ndims"]>2:
-                #     if np.random.uniform(0,1)<0.6:
-                #         x1 = utils.random_resample(x1, deform_scale=0)
-                #         y1 = utils.random_resample(y1, deform_scale=0)
-                x1 = transformer(x1)
-                y1 = transformer(y1)
                 [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
                 if hyp_parameters['noise_scale']>0:
-                    x1 = thresh_img(x1, [0, 2*hyp_parameters['noise_scale']])
-                    x1 = x1 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
-                    y1 = thresh_img(y1, [0, 2*hyp_parameters['noise_scale']])
-                    y1 = y1 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
-                # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
-                t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
-                    hyp_parameters["device"]
-                )  # pick up a seq of rand number from 0 to 'timestep'
-                scale_regist = np.random.uniform(0.6,1.)
-                T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist) + 1), 16), reverse=True)
-                # scale_regist = np.random.uniform(0.4,1.)
-                # T_regist = [int(hyp_parameters["timesteps"]*scale_regist)]
-                # scale_regist = np.random.uniform(0.6,1.)
-                # init_T = int(hyp_parameters["timesteps"] * scale_regist)
-                # T_regist = sorted(random.sample(range(0, int(hyp_parameters["timesteps"] * scale_regist)), 2)+list(range(init_T,hyp_parameters["timesteps"]+1)), reverse=True)
-                T_regist = [[t for _ in range(hyp_parameters["batchsize"]//2)] for t in T_regist]
-                # print('T_regist:', T_regist)
-                # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'none'])
-                proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'none', 'none'])
-                # proc_type = random.choice(['project'])
-                y1, msk_tgt, cond_ratio = Deformddpm.module.proc_cond_img(y1,proc_type=proc_type)
-                msk_tgt = msk_tgt + MSK_EPS
-                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
-                loss_ddf1 = loss_reg1(ddf_comp,img=y1,msk=msk_tgt)  # calculate loss for the registration process
                 loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
-                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>0.0))  # calculate loss for the registration process
                 loss_regist = 0
                 loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
                 loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
                 loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
-                # print('proc_type:', proc_type, 'cond_ratio:', cond_ratio.item())
-                # print('loss_regist:', loss_regist.item(), 'loss_sim:', loss_sim.item(), 'loss_ddf1:', loss_ddf1.item())
                 # >> JZ: print nan in x0
                 if torch.isnan(x0).any():
                     print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
                 # >> JZ: print loss of ddf
-                if loss_ddf1>0.001:
                     print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
-                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
-                optimizer.zero_grad()
-                loss_regist.backward()
-                # # Print gradients for each parameter
-                # for name, param in Deformddpm.named_parameters():
-                #     if param.grad is not None:
-                #         print(f"Gradient for {name}: {param.grad.norm()}")
-                #     else:
-                #         print(f"Gradient for {name}: None")
-                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.1)
-                optimizer.step()
-                epoch_loss_regist += loss_regist.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_imgsim += loss_sim.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_imgmse += loss_mse.item() * len(x0) / len(train_loader.dataset)
-                epoch_loss_ddfreg += loss_ddf1.item() * len(x0) / len(train_loader.dataset)
-            # print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
-            # break   # FOR TESTING
-            # else:
-            #     print('loss_gen_a:',loss_gen_a.item())     # FOR TESTING
-            #     pass
-        if 1:
-        # if gpu_id == 0:
             print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
-            print(f'     loss_regist: {epoch_loss_regist} = {epoch_loss_imgsim} (imgsim) + {epoch_loss_imgmse} (imgmse) + {epoch_loss_ddfreg} (ddf)')
-        # # LR schedular step ----- YHM
-        # scheduler.step()
-        if 0 == epoch % epoch_per_save:
             save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
             os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
             # break   # FOR TESTING
@@ -436,55 +840,150 @@ def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
                     'optimizer_state_dict': optimizer.state_dict(),
                     'epoch': epoch
                 }, save_dir)
     # Resource cleanup at the end of training
-    torch.cuda.empty_cache()
     gc.collect()
     if use_distributed and dist.is_initialized():
         dist.destroy_process_group()
-def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True):
     if gpu_id == 0:
-    # if 0:
         utils.print_memory_usage("Before Loading Model")
-        if 1:
-            gc.collect()
-            torch.cuda.empty_cache()
-        # Deformddpm.network.load_state_dict(torch.load(latest_model_file))
-        # Deformddpm.load_state_dict(torch.load(latest_model_file), strict=False)
-        checkpoint = torch.load(model_file)
-        # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
-        if use_distributed:
-            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'])
         else:
-            Deformddpm.load_state_dict(checkpoint['model_state_dict'])
-        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
         utils.print_memory_usage("After Loading Checkpoint on GPU")
     if use_distributed:
-        # Broadcast model weights from rank 0 to all other GPUs
         dist.barrier()
         for param in Deformddpm.parameters():
-            dist.broadcast(param.data, src=0)  # Synchronize model across ranks
-        dist.barrier()
-        for param_group in optimizer.param_groups:
-            for param in param_group['params']:
-                if param.grad is not None:
-                    dist.broadcast(param.grad, src=0)  # Sync optimizer gradients
-    # initial_epoch = checkpoint['epoch'] + 1
-    # get the epoch number from the filename and add 1 to set as initial_epoch
-    initial_epoch = int(os.path.basename(model_file).split('.')[0][:6]) + 1
     return initial_epoch, Deformddpm, optimizer
 if __name__ == "__main__":
-    if use_distributed:
-        world_size = torch.cuda.device_count()
-        print(f"Distributed GPU number = {world_size}")
         mp.spawn(main_train,args = (world_size,),nprocs = world_size)
     else:
         main_train(0,1)

+import os, sys, contextlib
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
 import gc
 import torch
 import torchvision
 from torch.optim import Adam, SGD
 from Diffusion.diffuser import DeformDDPM
 from Diffusion.networks import get_net, STN
+# from torchvision.transforms import Lambda
+import torch.nn.functional as F
 import Diffusion.losses as losses
 import random
 import glob
 import numpy as np
 import utils
+from tqdm import tqdm
+# from Dataloader.dataloader0 import get_dataloader
 from Dataloader.dataLoader import *
 from Dataloader.dataloader_utils import thresh_img
 import yaml
 import argparse
+# XPU support: import Intel Extension for PyTorch and oneCCL bindings if available
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError:
+    ipex = None
+try:
+    import oneccl_bindings_for_pytorch
+except (ImportError, Exception) as e:
+    print(f"WARNING: Failed to import oneccl_bindings_for_pytorch: {e}")
 ####################
 import torch.multiprocessing as mp
 from torch.utils.data.distributed import DistributedSampler
 import torch.distributed as dist
 # from torch.distributed import init_process_group
 ###############
+def _device_available(device_type):
+    if device_type == 'xpu':
+        return hasattr(torch, 'xpu') and torch.xpu.is_available()
+    return torch.cuda.is_available()
+def _device_count(device_type):
+    if device_type == 'xpu':
+        return torch.xpu.device_count() if hasattr(torch, 'xpu') else 0
+    return torch.cuda.device_count()
def _set_device(rank, device_type):
    """Make device index ``rank`` the current device for this process.

    ``'xpu'`` goes through ``torch.xpu``; any other value goes through
    ``torch.cuda``.
    """
    backend_module = torch.xpu if device_type == 'xpu' else torch.cuda
    backend_module.set_device(rank)
+def _empty_cache(device_type):
+    if device_type == 'xpu' and hasattr(torch, 'xpu'):
+        torch.xpu.empty_cache()
+    elif torch.cuda.is_available():
+        torch.cuda.empty_cache()
 def ddp_setup(rank, world_size):
     """Initialise the default process group and bind this process to its device.

     The communication backend is ``"ccl"`` when ``DEVICE_TYPE`` is ``"xpu"``
     and ``"nccl"`` otherwise.

     Args:
         rank: Unique identifier of each process (local_rank when launched by torchrun)
         world_size: Total number of processes
     """
     backend = "ccl" if DEVICE_TYPE == "xpu" else "nccl"
     if "LOCAL_RANK" in os.environ:
         # Launched by torchrun: MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE already set
         dist.init_process_group(backend=backend)
         _set_device(int(os.environ["LOCAL_RANK"]), DEVICE_TYPE)
         return

     # Single-node mp.spawn: fall back to a local rendezvous, but respect an
     # address/port the caller already exported so a busy default port (or a
     # second job on the same node) can be worked around without code changes.
     os.environ.setdefault("MASTER_ADDR", "localhost")
     os.environ.setdefault("MASTER_PORT", "12355")
     dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
     _set_device(rank, DEVICE_TYPE)
 EPS = 1e-5
 MSK_EPS = 0.01
+TEXT_EMBED_PROB = 0.5
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [4.0, 2.0, 8.0]  # [ang, dist, reg]
 # LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.01, 1e2]  # [imgsim, imgmse, ddf]
+DIFF_REG_BATCH_RATIO = 2
+# LOSS_WEIGHT_CONTRASTIVE = 1e-4
+LOSS_WEIGHT_CONTRASTIVE = 1e-1
+REGISTRATION_STEP_RATIO = 1
+CONTRASTIVE_STEP_RATIO = 1
+ACCEPT_THRESH_CONTRASTIVE = 0.1
+ACCEPT_THRESH_ANGLE = -0.8
+MID_EPOCH_SAVE_STEPS = 1e4  # Save mid-epoch checkpoint every N steps for crash recovery.
+                           # XPU autograd leaks ~1.0 GiB/step of device memory (Intel bug).
+                           # With gradient checkpointing, training survives ~26 steps from fresh start,
+                           # but fewer when carrying leaked memory from previous epoch.
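+                           # NOTE(review): the value above is 1e4 steps, which does not match the earlier advice to save every 10 steps; use a small N on XPU to minimize lost work on an OOM crash.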
+EXIT_CODE_RESTART = 42     # Exit code signaling proactive restart (not a crash).
 # AUG_PERMUTE_PROB = 0.35
         default="Config/config_all.yaml",
         required=False,
     )
+parser.add_argument("--dummy-samples", type=int, default=0, help="Use dummy random data for testing (0=use real data)")
+parser.add_argument("--batchsize", type=int, default=0, help="Override batch size from config (0=use config value)")
+parser.add_argument("--max-steps-before-restart", type=int, default=0,
+                    help="Proactive restart: exit after N training steps to reset XPU memory leak. "
+                         "0=disabled (rely on OOM crash + auto-resubmit). "
+                         "Recommended: 20 for XPU (survives ~26 steps max).")
+parser.add_argument("--no-save", action="store_true", default=False,
+                    help="Disable all checkpoint saving (for diagnostic/validation runs)")
+parser.add_argument("--reset-optimizer", action="store_true",
+                    help="Skip optimizer state loading from checkpoint (use when architecture changed)")
+parser.add_argument("--eval-only", action="store_true",
+                    help="Forward pass only: compute and print losses without backward/optimizer (no memory leak)")
 args = parser.parse_args()
+# Read the config at import time: the device type must be known before ddp_setup() chooses a backend
+with open(args.config, 'r') as _f:
+    _cfg = yaml.safe_load(_f)
+DEVICE_TYPE = _cfg.get('device', 'cuda')  # value of the config's 'device' key ('cuda' or 'xpu'); defaults to 'cuda' when the key is absent
+# Auto-detect: enable DDP only when the device type is available and more than one device is visible
+use_distributed = _device_available(DEVICE_TYPE) and _device_count(DEVICE_TYPE) > 1
+# use_distributed = True
+# use_distributed = False
 #=======================================================================================================================
+class _DummyIndiv(torch.utils.data.Dataset):
+    def __init__(self, n, sz, embd_dim=1024):
+        self.n, self.sz, self.embd_dim = n, sz, embd_dim
+    def __len__(self): return self.n
+    def __getitem__(self, i):
+        return np.random.rand(1, self.sz, self.sz, self.sz).astype(np.float64), np.random.randn(self.embd_dim).astype(np.float32)
class _DummyPair(torch.utils.data.Dataset):
    """Synthetic paired dataset yielding two random volumes and two embeddings.

    Substituted for the real paired dataset when ``args.dummy_samples > 0``.

    Args:
        n: Number of items reported by ``len()``.
        sz: Edge length of the cubic volumes; each volume has shape
            ``(1, sz, sz, sz)``.
        embd_dim: Length of each random embedding vector.
    """

    def __init__(self, n, sz, embd_dim=1024):
        self.n, self.sz, self.embd_dim = n, sz, embd_dim

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        """Return ``(volume_a, volume_b, embedding_a, embedding_b)``, freshly sampled.

        The content does not depend on ``i``; the index is only range-checked.

        Raises:
            IndexError: If ``i`` is outside ``[-n, n)``. Without this, plain
                iteration over the dataset (which relies on ``IndexError`` to
                stop) would never terminate.
        """
        if not -self.n <= i < self.n:
            raise IndexError(f"index {i} out of range for dataset of size {self.n}")
        shape = (1, self.sz, self.sz, self.sz)
        # Draw order (volume, volume, embedding, embedding) matches the
        # returned tuple order. np.random.rand already yields float64.
        volume_a = np.random.rand(*shape)
        volume_b = np.random.rand(*shape)
        embedding_a = np.random.randn(self.embd_dim).astype(np.float32)
        embedding_b = np.random.randn(self.embd_dim).astype(np.float32)
        return volume_a, volume_b, embedding_a, embedding_b
 def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
     if use_distributed:
         ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized() and rank == 0:
             print(f"World size: {torch.distributed.get_world_size()}")
             print(f"Communication backend: {torch.distributed.get_backend()}")
+            print(f"PYTORCH_ALLOC_CONF: {os.environ.get('PYTORCH_ALLOC_CONF', 'not set')}")
+            if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                props = torch.xpu.get_device_properties(0)
+                print(f"XPU device: {props.name}, total memory: {props.total_memory / 1024**3:.2f} GiB")
+    # gpu_id = global rank (for save/print guards); rank = local device index
+    if "RANK" in os.environ:
+        gpu_id = int(os.environ["RANK"])
+        rank = int(os.environ["LOCAL_RANK"])
+    else:
+        gpu_id = rank
     # Load the YAML file into a dictionary
     with open(args.config, 'r') as file:
         hyp_parameters = yaml.safe_load(file)
+    if args.batchsize > 0:
+        hyp_parameters['batchsize'] = args.batchsize
+    if gpu_id == 0:
         print(hyp_parameters)
     # epoch_per_save=10
     suffix_pth=f'_{data_name}_{net_name}.pth'
     model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
     model_dir=model_save_path
+    # transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
     # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
     #     drop_last=True,
     # )
+    if args.dummy_samples > 0:
+        dataset = _DummyIndiv(args.dummy_samples, hyp_parameters['img_size'])
+        datasetp = _DummyPair(args.dummy_samples, hyp_parameters['img_size'])
+    else:
+        # dataset = OminiDataset_v1(transform=None)
+        dataset = OMDataset_indiv(transform=None)
+        # datasetp = OminiDataset_paired(transform=None)
+        datasetp = OMDataset_pair(transform=None)
+    if use_distributed:
+        sampler = DistributedSampler(dataset, shuffle=True)
+        sampler_p = DistributedSampler(datasetp, shuffle=True)
+    else:
+        sampler = None
+        sampler_p = None
     train_loader = DataLoader(
         dataset,
         batch_size=hyp_parameters['batchsize'],
+        shuffle=(sampler is None),
         drop_last=True,
+        sampler=sampler,
     )
     train_loader_p = DataLoader(
         datasetp,
+        batch_size=max(1, hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO),
+        shuffle=(sampler_p is None),
         drop_last=True,
+        sampler=sampler_p,
     )
+    network = Net(
+        n_steps=hyp_parameters["timesteps"],
+        ndims=hyp_parameters["ndims"],
+        num_input_chn = hyp_parameters["num_input_chn"],
+        res = hyp_parameters['img_size']
+    )
+    # Enable gradient checkpointing on XPU to reduce peak activation memory.
+    # XPU autograd leaks ~1.0 GiB/step; lower peak buys more steps before OOM.
+    if DEVICE_TYPE == 'xpu' and hasattr(network, 'use_checkpoint'):
+        network.use_checkpoint = True
+        if gpu_id == 0:
+            print("  [init] Gradient checkpointing enabled for XPU", flush=True)
     Deformddpm = DeformDDPM(
+        network=network,
         n_steps=hyp_parameters["timesteps"],
         image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
         device=hyp_parameters["device"],
     if use_distributed:
+        device = f"{DEVICE_TYPE}:{rank}"
+        # NO pre-allocation. CCL/oneDNN accumulate ~1.4 GiB/step of device memory outside
+        # PyTorch's caching allocator. Pre-allocating steals from that budget:
+        #   92% pre-alloc → crash at step 3, 78% → step 10, none (70% cap) → step 14.
+        # Instead, use empty_cache() between training phases to release unused cached memory
+        # back to the device for CCL/oneDNN.
+        if gpu_id == 0 and DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+            total_mem = torch.xpu.get_device_properties(rank).total_memory
+            print(f"  [init] XPU device memory: {total_mem/1024**3:.1f} GiB, no pre-allocation (relying on empty_cache between phases)", flush=True)
+        Deformddpm.to(device)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank], find_unused_parameters=True)
+        ddf_stn.to(device)
     else:
         Deformddpm.to(hyp_parameters["device"])
         ddf_stn.to(hyp_parameters["device"])
     # mse = nn.MSELoss()
     # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
     loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
     # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
     loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.MSLNCC()
     loss_imgmse = losses.LMSE()
     optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
     # check for existing models
     if not os.path.exists(model_dir):
         os.makedirs(model_dir, exist_ok=True)
+    # Check for checkpoints: first check tmp/ for mid-epoch, then main dir for epoch-level
+    tmp_dir = os.path.join(model_dir, "tmp")
+    tmp_files = sorted(glob.glob(os.path.join(tmp_dir, "*.pth")))
+    model_files = sorted(glob.glob(os.path.join(model_dir, "*.pth")))
+    initial_step = 0
+    # Epoch stats and RNG states to restore when resuming from mid-epoch checkpoint
+    _resume_epoch_stats = None
+    _resume_rng = None
+    if tmp_files and not args.eval_only and args.max_steps_before_restart > 0:
+        # Mid-epoch checkpoint: only use when proactive restart is enabled
+        latest = tmp_files[-1]
+        if gpu_id == 0:
+            print(f"  [resume] Found mid-epoch checkpoint: {latest}")
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, latest, use_distributed=use_distributed)
+        basename = os.path.basename(latest)
+        initial_step = int(basename.split('_step')[1].split('_')[0].split('.')[0])
+        _ckpt = torch.load(latest, map_location='cpu', weights_only=False)
+        _resume_epoch_stats = _ckpt.get('epoch_stats', None)
+        del _ckpt
+        if gpu_id == 0:
+            print(f"  [resume] Resuming epoch {initial_epoch} from step {initial_step}"
+                  f"{' (with epoch_stats)' if _resume_epoch_stats else ''}", flush=True)
+    elif model_files:
         if gpu_id == 0:
             print(model_files)
+        latest = model_files[-1]
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, latest, use_distributed=use_distributed)
     else:
         initial_epoch = 0
     if gpu_id == 0:
         print('len_train_data: ',len(dataset))
+    # Proactive restart: track steps since process start to exit before OOM.
+    max_steps_restart = args.max_steps_before_restart
+    steps_since_start = 0
+    loss_contra_gate = 0.0
     # Training loop
     for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        if use_distributed and sampler is not None:
+            sampler.set_epoch(epoch)
+            sampler_p.set_epoch(epoch)
         epoch_loss_tot = 0.0
         epoch_loss_gen_d = 0.0
         epoch_loss_imgsim = 0.0
         epoch_loss_imgmse = 0.0
         epoch_loss_ddfreg = 0.0
+        epoch_loss_contrastive = 0.0
+        total_contra = 0
+        total_reg_restored = None
+        total_contra_restored = None
+        # Restore epoch accumulators from mid-epoch checkpoint (only for the resumed epoch)
+        if _resume_epoch_stats is not None and epoch == initial_epoch:
+            epoch_loss_tot = _resume_epoch_stats.get('epoch_loss_tot', 0.0)
+            epoch_loss_gen_d = _resume_epoch_stats.get('epoch_loss_gen_d', 0.0)
+            epoch_loss_gen_a = _resume_epoch_stats.get('epoch_loss_gen_a', 0.0)
+            epoch_loss_reg = _resume_epoch_stats.get('epoch_loss_reg', 0.0)
+            epoch_loss_regist = _resume_epoch_stats.get('epoch_loss_regist', 0.0)
+            epoch_loss_imgsim = _resume_epoch_stats.get('epoch_loss_imgsim', 0.0)
+            epoch_loss_imgmse = _resume_epoch_stats.get('epoch_loss_imgmse', 0.0)
+            epoch_loss_ddfreg = _resume_epoch_stats.get('epoch_loss_ddfreg', 0.0)
+            epoch_loss_contrastive = _resume_epoch_stats.get('epoch_loss_contrastive', 0.0)
+            total_reg_restored = _resume_epoch_stats.get('total_reg', None)
+            total_contra_restored = _resume_epoch_stats.get('total_contra', None)
+            loss_nan_step = _resume_epoch_stats.get('loss_nan_step', 0)
+            # RNG states are restored INSIDE the skip loop (at the last skipped step)
+            # to avoid DataLoader __getitem__ calls corrupting the restored state.
+            _resume_rng = {k: _resume_epoch_stats[k] for k in
+                           ('rng_torch', 'rng_numpy', 'rng_python', 'rng_xpu', 'rng_cuda')
+                           if k in _resume_epoch_stats}
+            if gpu_id == 0:
+                print(f"  [resume] Restored epoch stats from checkpoint (loss_tot={epoch_loss_tot:.4f})", flush=True)
+            _resume_epoch_stats = None  # Only restore once
+        else:
+            loss_nan_step = 0  # only reset when NOT resuming mid-epoch
         # Set model inside to train model
         Deformddpm.train()
+        total = min(len(train_loader), len(train_loader_p))
+        total_reg = total // REGISTRATION_STEP_RATIO
+        # Restore total_reg and total_contra from checkpoint if available (mid-epoch resume)
+        if total_reg_restored is not None:
+            total_reg = total_reg_restored
+            total_reg_restored = None
+        if total_contra_restored is not None:
+            total_contra = total_contra_restored
+            total_contra_restored = None
+        # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in tqdm(enumerate(train_loader)):
         # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # Skip steps already completed (mid-epoch resume).
+            # Checkpoint at step N is saved AFTER step N's training completes,
+            # so step N itself must also be skipped (use <=, not <).
+            if epoch == initial_epoch and initial_step > 0 and step <= initial_step:
+                # Restore RNG at the last skipped step, AFTER DataLoader __getitem__
+                # has consumed RNG for all skipped batches. This way the first
+                # non-skipped step starts with exactly the saved RNG state.
+                if step == initial_step and _resume_rng is not None:
+                    # Restore rank 0's RNG as base state, then re-seed per-rank
+                    # so each rank has independent RNG (matching continuous run's
+                    # divergent-per-rank behavior). Without this, all ranks would
+                    # share rank 0's RNG → correlated augmentation/dropout decisions.
+                    if 'rng_torch' in _resume_rng:
+                        torch.set_rng_state(_resume_rng['rng_torch'])
+                    if 'rng_numpy' in _resume_rng:
+                        np.random.set_state(_resume_rng['rng_numpy'])
+                    if 'rng_python' in _resume_rng:
+                        random.setstate(_resume_rng['rng_python'])
+                    if 'rng_xpu' in _resume_rng and DEVICE_TYPE == 'xpu':
+                        torch.xpu.set_rng_state(_resume_rng['rng_xpu'])
+                    elif 'rng_cuda' in _resume_rng and torch.cuda.is_available():
+                        torch.cuda.set_rng_state(_resume_rng['rng_cuda'])
+                    # Per-rank re-seed: checkpoint only has rank 0's RNG state.
+                    # Advance each rank's RNG by a deterministic offset so they
+                    # diverge (as they would in a continuous run).
+                    if gpu_id > 0:
+                        rank_seed = gpu_id * 100003 + initial_step * 31
+                        torch.manual_seed(torch.initial_seed() + rank_seed)
+                        np.random.seed((np.random.get_state()[1][0] + rank_seed) % (2**31))
+                        random.seed(random.getrandbits(32) + rank_seed)
+                        if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                            torch.xpu.manual_seed(torch.initial_seed() + rank_seed)
+                        elif torch.cuda.is_available():
+                            torch.cuda.manual_seed(torch.initial_seed() + rank_seed)
+                    _resume_rng = None
+                    if gpu_id == 0:
+                        print(f"  [resume] RNG states restored at step {step} (per-rank re-seeded)", flush=True)
+                continue
+            # Free registration tensors from previous step
+            x1 = y1 = ddf_comp = img_rec = img_diff = None
+            ddf_rand = y1_proc = msk_tgt = img_save = None
+            loss_regist = loss_sim = loss_mse = loss_ddf1 = None
+            # Memory diagnostic (one per node via local rank 0) — only warn when abnormal
+            # Normal at step start: ~16 GiB reserved, ~48 GiB free (of 64 GiB total)
+            if rank == 0 and DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu'):
+                torch.xpu.reset_peak_memory_stats(rank)
+                free_mem, total_mem_dev = torch.xpu.mem_get_info(rank)
+                used_gib = (total_mem_dev - free_mem) / 1024**3
+                if used_gib > 24:  # Normal is ~16 GiB at step start; warn if accumulating
+                    alloc = torch.xpu.memory_allocated() / 1024**3
+                    reserved = torch.xpu.memory_reserved() / 1024**3
+                    free_gib = free_mem / 1024**3
+                    print(f"  [mem WARNING] gpu_id={gpu_id} epoch {epoch} step {step}: "
+                          f"{used_gib:.1f} GiB used ({alloc:.1f} alloc / {reserved:.1f} reserved), "
+                          f"{free_gib:.1f} GiB free", flush=True)
             # ==========================================================================
             # diffusion train on single image
             [x0,embd] = batch # for om dataset
             x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
             # print('embd:', embd.shape)
+            embd_dev = embd.to(hyp_parameters["device"]).type(torch.float32)
             if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd_in = embd_dev
             else:
+                embd_in = None
             n = x0.size()[0]  # batch_size -> n
             x0 = x0.to(hyp_parameters["device"])
                 # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
                 else:
                     [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
             if hyp_parameters['noise_scale']>0:
                 if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
                 x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
             # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
             )  # pick up a seq of rand number from 0 to 'timestep'
             # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
             # print('proc_type:', proc_type)
+            ddpm = Deformddpm.module if use_distributed else Deformddpm
+            cond_img, _, cond_ratio = ddpm.proc_cond_img(x0,proc_type=proc_type)
+            if loss_contra_gate < ACCEPT_THRESH_CONTRASTIVE:
+                pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd_in)  # forward diffusion process
+                loss_tot=0
+                loss_ddf = loss_reg(pre_dvf_I,img=x0)
+                trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+                loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+                loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+                loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+                loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+                loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+                # >> JZ: print nan in x0
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf>0.001:
+                    print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+                # yu: check if loss_tot==nan or inf
+                # Synchronize NaN skip across all DDP ranks to avoid collective desync
+                # Use broadcast from rank 0 instead of all_reduce to avoid CCL hang on single-node XPU
+                is_nan = torch.isnan(loss_tot) or torch.isinf(loss_tot)
+                if use_distributed:
+                    nan_flag = torch.tensor([1.0 if is_nan else 0.0], device=f"{DEVICE_TYPE}:{rank}")
+                    dist.broadcast(nan_flag, src=0)
+                    is_nan = nan_flag.item() > 0
+                if is_nan:
+                    if gpu_id == 0:
+                        print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                    loss_nan_step += 1
+                    continue
+                if loss_nan_step > 5:
+                    print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                    raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
                 # ==========================================================================
+                # Diffusion backward (no gradient clipping — diffusion dominates training)
+                # print(loss_contra_gate)
+                if (not args.eval_only):  # Skip backward when contrastive loss is high to avoid destabilizing diffusion training (especially early on)
+                    optimizer.zero_grad()
+                    loss_tot.backward()
+                    optimizer.step()
+                epoch_loss_tot += loss_tot.item() / total
+                epoch_loss_gen_d += loss_gen_d.item() / total
+                epoch_loss_gen_a += loss_gen_a.item() / total
+                epoch_loss_reg += loss_ddf.item() / total
+                # Print running average every 20 steps in eval-only mode
+                if args.eval_only and gpu_id == 0 and (step + 1) % 20 == 0:
+                    n = step + 1
+                    print(f"  [eval] step {step}: running_avg ang={epoch_loss_gen_a*total/n:.4f} "
+                        f"dist={epoch_loss_gen_d*total/n:.4f} regul={epoch_loss_reg*total/n:.6f}", flush=True)
+                # Free diffusion intermediates and aggressively release all memory to device.
+                # XPU runtime leaks ~1.3 GiB/step outside the caching allocator.
+                # gc.collect() + synchronize() + empty_cache() attempts to reclaim deferred/lazy allocations.
+                loss_gen_a_val = loss_gen_a.item()
+                # del pre_dvf_I, dvf_I, trm_pred, loss_tot, loss_gen_a, loss_gen_d, loss_ddf
+                gc.collect()
+                if DEVICE_TYPE == 'xpu':
+                    torch.xpu.synchronize()
+                    _empty_cache(DEVICE_TYPE)
+                # Sync loss_gen_a across DDP ranks for contrastive and registration gating
+                if use_distributed:
+                    loss_gen_a_sync = torch.tensor([loss_gen_a_val], device=f"{DEVICE_TYPE}:{rank}")
+                    dist.broadcast(loss_gen_a_sync, src=0)
+                    loss_gen_a_gate = loss_gen_a_sync.item()
+                else:
+                    loss_gen_a_gate = loss_gen_a_val
+                LOSS_WEIGHT_CONTRASTIVE=1e-4
+            else:
+                LOSS_WEIGHT_CONTRASTIVE=1e-1
+                if gpu_id == 0:
+                    print(f"  [train] step {step}: Skipping backward (contra_gate={loss_contra_gate:.4f})", flush=True)
+            # ==========================================================================
+            # Contrastive train on single image (text-image alignment)
+            # Separate backward with gradient clipping to prevent destabilizing diffusion.
+            loss_contra_val = None
+            if step % CONTRASTIVE_STEP_RATIO == 0:
+                n_contra = x0.size()[0]
+                t_contra = torch.randint(0, hyp_parameters["timesteps"], (n_contra,)).to(hyp_parameters["device"])
+                # Route through DDP wrapper and return img_embd directly so DDP
+                # traces the correct subgraph (encoder + mid + attn + img2txt).
+                img_embd = Deformddpm(img_org=(x0 * blind_mask).detach(), cond_imgs=cond_img.detach(), T=t_contra, output_embedding=True, text=None)  # [B, 1024]
+                loss_contra_preweight = F.relu(1 - F.cosine_similarity(img_embd, embd_dev, dim=-1)-0.25).mean()
+                loss_contra = LOSS_WEIGHT_CONTRASTIVE * loss_contra_preweight
+                if not args.eval_only:
+                    optimizer.zero_grad()
+                    loss_contra.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=LOSS_WEIGHT_CONTRASTIVE*1)
+                    optimizer.step()
+                loss_contra_val = loss_contra.item()
+                epoch_loss_contrastive += loss_contra_val / total * CONTRASTIVE_STEP_RATIO
+                # else:
+                #     if gpu_id == 0:
+                #         print(f"*** Warning: Network does not have img_embd attribute for contrastive loss at epoch {epoch}, step {step}.")
+            # Free remaining intermediates and aggressively release memory before registration
+            if cond_img is not None:
+                del cond_img
+            if blind_mask is not None:
+                del blind_mask
+            gc.collect()
+            if DEVICE_TYPE == 'xpu':
+                torch.xpu.synchronize()
+                _empty_cache(DEVICE_TYPE)
+            # Sync the pre-weight contrastive loss across DDP ranks (broadcast from rank 0); it gates the next step's diffusion pass and this step's registration
+            if use_distributed:
+                loss_contra_sync = torch.tensor([loss_contra_preweight], device=f"{DEVICE_TYPE}:{rank}")
+                dist.broadcast(loss_contra_sync, src=0)
+                loss_contra_gate = loss_contra_sync.item()
+            else:
+                loss_contra_gate = loss_contra_preweight
+            # ==========================================================================
+            # registration train on paired images
+            # loss_gen_a_gate already synced across DDP ranks above
+            do_regist = step % REGISTRATION_STEP_RATIO == 0 and (loss_contra_gate < ACCEPT_THRESH_CONTRASTIVE) and loss_gen_a_gate < ACCEPT_THRESH_ANGLE
+            if do_regist:
+                [x1, y1, _, embd_y] = batch_p
                 if np.random.uniform(0,1)<TEXT_EMBED_PROB:
                     embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
                 else:
                     embd_y = None
                 x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
                 y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
                 n = x1.size()[0]  # batch_size -> n
                 [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
                 if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                scale_regist = np.random.uniform(0.0,0.5)
+                select_timestep = np.random.randint(12, 32)  # select a random number of timesteps to sample, between 8 and 16
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                T_regist = [[t for _ in range(max(1, hyp_parameters["batchsize"]//2))] for t in T_regist]
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                ddpm_inner = Deformddpm.module if use_distributed else Deformddpm
+                y1_proc, msk_tgt, cond_ratio = ddpm_inner.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
                 loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
                 loss_regist = 0
                 loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
                 loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
                 loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
                 # >> JZ: print nan in x0
                 if torch.isnan(x0).any():
                     print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
                 # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
                     print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                if not args.eval_only:
+                    optimizer.zero_grad()
+                    loss_regist.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.02)
+                    optimizer.step()
+                epoch_loss_regist += loss_regist.item()
+                epoch_loss_imgsim += loss_sim.item()
+                epoch_loss_imgmse += loss_mse.item()
+                epoch_loss_ddfreg += loss_ddf1.item()
+            else:
+                loss_sim = torch.tensor(0.0)
+                loss_mse = torch.tensor(0.0)
+                loss_ddf1 = torch.tensor(0.0)
+                loss_regist = torch.tensor(0.0)
+                if step % REGISTRATION_STEP_RATIO==0:
+                    total_reg = total_reg-1
+            # print for checking
+            if step % 10 == 0:
+                print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+                print(f'-     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
+                print(f'-     loss_contra: {loss_contra}')
+            # Mid-epoch checkpoint and proactive restart (only when --max-steps-before-restart > 0)
+            if max_steps_restart > 0 and step > 0 and step % MID_EPOCH_SAVE_STEPS == 0 and gpu_id == 0 and not args.no_save:
+                _epoch_stats = {
+                    'epoch_loss_tot': epoch_loss_tot,
+                    'epoch_loss_gen_d': epoch_loss_gen_d,
+                    'epoch_loss_gen_a': epoch_loss_gen_a,
+                    'epoch_loss_reg': epoch_loss_reg,
+                    'epoch_loss_regist': epoch_loss_regist,
+                    'epoch_loss_imgsim': epoch_loss_imgsim,
+                    'epoch_loss_imgmse': epoch_loss_imgmse,
+                    'epoch_loss_ddfreg': epoch_loss_ddfreg,
+                    'epoch_loss_contrastive': epoch_loss_contrastive,
+                    'total_reg': total_reg,
+                    'total_contra': total_contra,
+                    'loss_nan_step': loss_nan_step,
+                    'rng_torch': torch.get_rng_state(),
+                    'rng_numpy': np.random.get_state(),
+                    'rng_python': random.getstate(),
+                    **(({'rng_xpu': torch.xpu.get_rng_state()} if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu') else
+                        {'rng_cuda': torch.cuda.get_rng_state()} if torch.cuda.is_available() else {})),
+                }
+                tmp_dir = os.path.join(model_save_path, "tmp")
+                os.makedirs(tmp_dir, exist_ok=True)
+                for old_f in glob.glob(os.path.join(tmp_dir, "*.pth")):
+                    os.remove(old_f)
+                mid_save = os.path.join(tmp_dir, f"{epoch:06d}_step{step:04d}{suffix_pth}")
+                state = Deformddpm.module.state_dict() if use_distributed else Deformddpm.state_dict()
+                torch.save({
+                    'model_state_dict': state,
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch,
+                    'step': step,
+                    'epoch_stats': _epoch_stats,
+                }, mid_save)
+                print(f"  [mid-epoch] Saved checkpoint at epoch {epoch} step {step}: {mid_save}", flush=True)
+            # Proactive restart: exit cleanly after N steps to reset XPU memory leak.
+            # The bash wrapper will re-launch srun within the same SLURM allocation.
+            steps_since_start += 1
+            if max_steps_restart > 0 and steps_since_start >= max_steps_restart:
+                # Save checkpoint at current position (if not just saved above)
+                if not (step > 0 and step % MID_EPOCH_SAVE_STEPS == 0) and gpu_id == 0 and not args.no_save:
+                    _epoch_stats = {
+                        'epoch_loss_tot': epoch_loss_tot, 'epoch_loss_gen_d': epoch_loss_gen_d,
+                        'epoch_loss_gen_a': epoch_loss_gen_a, 'epoch_loss_reg': epoch_loss_reg,
+                        'epoch_loss_regist': epoch_loss_regist, 'epoch_loss_imgsim': epoch_loss_imgsim,
+                        'epoch_loss_imgmse': epoch_loss_imgmse, 'epoch_loss_ddfreg': epoch_loss_ddfreg,
+                        'epoch_loss_contrastive': epoch_loss_contrastive, 'total_reg': total_reg, 'total_contra': total_contra,
+                        'loss_nan_step': loss_nan_step,
+                        'rng_torch': torch.get_rng_state(), 'rng_numpy': np.random.get_state(),
+                        'rng_python': random.getstate(),
+                        **(({'rng_xpu': torch.xpu.get_rng_state()} if DEVICE_TYPE == 'xpu' and hasattr(torch, 'xpu') else
+                            {'rng_cuda': torch.cuda.get_rng_state()} if torch.cuda.is_available() else {})),
+                    }
+                    tmp_dir = os.path.join(model_save_path, "tmp")
+                    os.makedirs(tmp_dir, exist_ok=True)
+                    for old_f in glob.glob(os.path.join(tmp_dir, "*.pth")):
+                        os.remove(old_f)
+                    mid_save = os.path.join(tmp_dir, f"{epoch:06d}_step{step:04d}{suffix_pth}")
+                    state = Deformddpm.module.state_dict() if use_distributed else Deformddpm.state_dict()
+                    torch.save({
+                        'model_state_dict': state,
+                        'optimizer_state_dict': optimizer.state_dict(),
+                        'epoch': epoch,
+                        'step': step,
+                        'epoch_stats': _epoch_stats,
+                    }, mid_save)
+                    print(f"  [restart] Saved checkpoint at epoch {epoch} step {step}: {mid_save}", flush=True)
+                if gpu_id == 0:
+                    print(f"  [restart] Proactive restart after {steps_since_start} steps "
+                          f"(limit {max_steps_restart}). Exiting with code {EXIT_CODE_RESTART}.", flush=True)
+                # Clean shutdown
+                _empty_cache(DEVICE_TYPE)
+                gc.collect()
+                if use_distributed and dist.is_initialized():
+                    dist.barrier()
+                    dist.destroy_process_group()
+                sys.exit(EXIT_CODE_RESTART)
+        if gpu_id == 0:
+            print('==================')
             print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_contrastive: {epoch_loss_contrastive}')
+            total_reg_safe = max(total_reg, 1)
+            print(f'     loss_regist: {epoch_loss_regist/total_reg_safe} = {epoch_loss_imgsim/total_reg_safe} (imgsim) + {epoch_loss_imgmse/total_reg_safe} (imgmse) + {epoch_loss_ddfreg/total_reg_safe} (ddf)')
+            print('==================')
+        if 0 == epoch % epoch_per_save and not args.no_save:
             save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
             os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
             # break   # FOR TESTING
                     'optimizer_state_dict': optimizer.state_dict(),
                     'epoch': epoch
                 }, save_dir)
+        # Clean up tmp/ mid-epoch checkpoints after completed epoch
+        if gpu_id == 0 and not args.no_save:
+            tmp_dir = os.path.join(model_dir, "tmp")
+            tmp_pths = glob.glob(os.path.join(tmp_dir, "*.pth"))
+            if tmp_pths:
+                for f in tmp_pths:
+                    os.remove(f)
+                print(f"  [cleanup] Cleared {len(tmp_pths)} tmp/ mid-epoch checkpoints", flush=True)
+        # Reset initial_step after first epoch completes (no more skipping)
+        initial_step = 0
+        # XPU CCL workaround: restart after each epoch to avoid CCL hang on 2nd epoch.
+        # CCL's Level Zero IPC handles accumulate and cause deadlock after ~200+ collectives.
+        # A fresh process resets the L0 context. The bash loop catches exit code 42 and restarts.
+        if DEVICE_TYPE == 'xpu' and use_distributed:
+            if gpu_id == 0:
+                print(f"  [xpu-restart] Epoch {epoch} done. Restarting to reset CCL state.", flush=True)
+            _empty_cache(DEVICE_TYPE)
+            gc.collect()
+            if dist.is_initialized():
+                dist.barrier()
+                dist.destroy_process_group()
+            sys.exit(EXIT_CODE_RESTART)
     # Resource cleanup at the end of training
+    _empty_cache(DEVICE_TYPE)
     gc.collect()
     if use_distributed and dist.is_initialized():
         dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
+    # All ranks load checkpoint so optimizer state is consistent across DDP processes.
+    # (Optimizer state includes per-parameter Adam momentum/variance which are NOT
+    # broadcast — only model weights are broadcast. Without this, non-rank-0 processes
+    # would have fresh Adam state after restart.)
+    gc.collect()
+    _empty_cache(DEVICE_TYPE)
     if gpu_id == 0:
         utils.print_memory_usage("Before Loading Model")
+    # checkpoint = torch.load(model_file, map_location='cpu', weights_only=False)
+    checkpoint = torch.load(model_file, map_location='cpu')
+    if use_distributed:
+        Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+    else:
+        Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+    # Restore optimizer state when available (needed for mid-epoch resume).
+    # Selective loading: load states for parameters with matching shapes, skip mismatched ones
+    # (e.g., UpsampleConv replaced ConvTranspose3d — different kernel shapes).
+    # After one epoch, the saved checkpoint will have correct state for ALL parameters.
+    if 'optimizer_state_dict' in checkpoint and not args.reset_optimizer:
+        saved_opt = checkpoint['optimizer_state_dict']
+        saved_state = saved_opt.get('state', {})
+        param_list = [p for group in optimizer.param_groups for p in group['params']]
+        # Check if all shapes match (fast path: full load)
+        all_match = True
+        skipped = 0
+        for idx, s in saved_state.items():
+            if int(idx) < len(param_list):
+                p = param_list[int(idx)]
+                for k, v in s.items():
+                    if isinstance(v, torch.Tensor) and v.dim() > 0 and v.shape != p.shape:
+                        all_match = False
+                        break
+                if not all_match:
+                    break
+        if all_match:
+            optimizer.load_state_dict(saved_opt)
         else:
+            # Selective load: restore param_groups settings (lr, betas, etc.)
+            for saved_g, group in zip(saved_opt['param_groups'], optimizer.param_groups):
+                for k, v in saved_g.items():
+                    if k != 'params':
+                        group[k] = v
+            # Restore per-parameter state only where shapes match
+            for idx, s in saved_state.items():
+                idx_int = int(idx)
+                if idx_int < len(param_list):
+                    p = param_list[idx_int]
+                    shapes_ok = all(
+                        v.shape == p.shape for k, v in s.items()
+                        if isinstance(v, torch.Tensor) and v.dim() > 0
+                    )
+                    if shapes_ok:
+                        # Cast state tensors to match parameter dtype/device
+                        new_state = {}
+                        for k, v in s.items():
+                            if isinstance(v, torch.Tensor):
+                                new_state[k] = v.to(dtype=p.dtype, device=p.device) if v.dim() > 0 else v
+                            else:
+                                new_state[k] = v
+                        optimizer.state[p] = new_state
+                    else:
+                        skipped += 1
+            if gpu_id == 0:
+                loaded = len(saved_state) - skipped
+                print(f"  [checkpoint] Selective optimizer load: {loaded} params restored, "
+                      f"{skipped} skipped (shape mismatch, fresh Adam for those)", flush=True)
+    elif args.reset_optimizer and gpu_id == 0:
+        print("  [checkpoint] --reset-optimizer: skipping optimizer state, starting fresh Adam", flush=True)
+    del checkpoint
+    if gpu_id == 0:
         utils.print_memory_usage("After Loading Checkpoint on GPU")
     if use_distributed:
+        # Broadcast model weights from rank 0 to ensure exact consistency
         dist.barrier()
         for param in Deformddpm.parameters():
+            dist.broadcast(param.data, src=0)
+    # get the epoch number from the filename
+    basename = os.path.basename(model_file)
+    epoch_from_file = int(basename[:6])
+    if '_step' in basename:
+        # Mid-epoch checkpoint: resume at same epoch (don't +1)
+        initial_epoch = epoch_from_file
+    else:
+        # End-of-epoch checkpoint: start next epoch
+        initial_epoch = epoch_from_file + 1
     return initial_epoch, Deformddpm, optimizer
 if __name__ == "__main__":
+    # Entry point: the launch mode is chosen from the environment and the module-level flag.
+    if "LOCAL_RANK" in os.environ:
+        # Multi-node: launched by torchrun / srun
+        # The launcher has already created one process per device; rank and world size come from its env vars.
+        use_distributed = True
+        local_rank = int(os.environ["LOCAL_RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        print(f"torchrun launch: LOCAL_RANK={local_rank}, RANK={os.environ.get('RANK')}, WORLD_SIZE={world_size}")
+        try:
+            main_train(local_rank, world_size)
+        except Exception as e:
+            # Print a banner naming the failing rank plus the traceback, then re-raise
+            # so the exception still propagates to the launcher.
+            import traceback
+            print(f"\n{'='*60}\nRANK {os.environ.get('RANK')} FAILED:\n{'='*60}", flush=True)
+            traceback.print_exc()
+            raise
+    elif use_distributed:
+        # Single-node multi-GPU: use mp.spawn
+        # One worker per visible device; mp.spawn passes the process index as the first argument.
+        world_size = _device_count(DEVICE_TYPE)
+        print(f"Distributed {DEVICE_TYPE.upper()} device number = {world_size}")
         mp.spawn(main_train,args = (world_size,),nprocs = world_size)
     else:
+        # Single process: rank 0 of a world of size 1.
         main_train(0,1)

OM_train_3modes_cudaonly.py ADDED Viewed

	@@ -0,0 +1,512 @@

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import gc
+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam, SGD
+from Diffusion.diffuser import DeformDDPM
+from Diffusion.networks import get_net, STN
+from torchvision.transforms import Lambda
+import torch.nn.functional as F
+import Diffusion.losses as losses
+import random
+import glob
+import numpy as np
+import utils
+from tqdm import tqdm
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from Dataloader.dataloader_utils import thresh_img
+import yaml
+import argparse
+####################
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+# from torch.distributed import init_process_group
+###############
+def ddp_setup(rank, world_size):
+    """
+    Args:
+        rank: Unique identifier of each process
+        world_size: Total number of processes
+    """
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+# Auto-detect: use DDP only when multiple CUDA GPUs are available
+use_distributed = torch.cuda.is_available() and torch.cuda.device_count() > 1
+# use_distributed = True
+# use_distributed = False
+EPS = 1e-5
+# Small offset added to the target mask and inside the sqrt loss weightings in main_train.
+MSK_EPS = 0.01
+# Probability that the text embedding is passed to the model (otherwise text=None).
+TEXT_EMBED_PROB = 0.5
+# Probability of the resample augmentation (vs. axis permutation) and of intensity thresholding.
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [2.0, 1.0, 4.0]  # [ang, dist, reg]
+# LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.01, 1e2]  # [imgsim, imgmse, ddf]
+# The paired (registration) loader uses batchsize // DIFF_REG_BATCH_RATIO samples per batch.
+DIFF_REG_BATCH_RATIO = 2
+# Multiplier applied to the contrastive (image/text embedding) loss.
+LOSS_WEIGHT_CONTRASTIVE = 0.001
+# The registration / contrastive updates run on steps where step % ratio == 0.
+REGISTRATION_STEP_RATIO = 1
+CONTRASTIVE_STEP_RATIO = 1
+# AUG_PERMUTE_PROB = 0.35
+# Command-line arguments are parsed at import time; main_train reads them through the module-level `args`.
+parser = argparse.ArgumentParser()
+# config_file_path = 'Config/config_cmr.yaml'
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        # default="Config/config_cmr.yaml",
+        # default="Config/config_lct.yaml",
+        default="Config/config_all.yaml",
+        required=False,
+    )
+# parser.add_argument("--dummy-samples", type=int, default=0, help="Use dummy random data for testing (0=use real data)")
+parser.add_argument("--batchsize", type=int, default=0, help="Override batch size from config (0=use config value)")
+args = parser.parse_args()
+#=======================================================================================================================
+def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
+    if use_distributed:
+        ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized():
+            print(f"World size: {torch.distributed.get_world_size()}")
+            print(f"Communication backend: {torch.distributed.get_backend()}")
+    gpu_id = rank
+    # Load the YAML file into a dictionary
+    with open(args.config, 'r') as file:
+        hyp_parameters = yaml.safe_load(file)
+    if args.batchsize > 0:
+        hyp_parameters['batchsize'] = args.batchsize
+    print(hyp_parameters)
+    # epoch_per_save=10
+    epoch_per_save=hyp_parameters['epoch_per_save']
+    data_name=hyp_parameters['data_name']
+    net_name = hyp_parameters['net_name']
+    Net=get_net(net_name)
+    suffix_pth=f'_{data_name}_{net_name}.pth'
+    model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
+    model_dir=model_save_path
+    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
+    # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
+    # tsfm = torchvision.transforms.Compose([
+    #             torchvision.transforms.ToTensor(),
+    #             ])
+    # dataset = Data_Loader(target_res = [hyp_parameters["img_size"]]*hyp_parameters["ndims"], transforms=None, noise_scale=hyp_parameters['noise_scale'])
+    # train_loader = DataLoader(
+    #     dataset,
+    #     batch_size=hyp_parameters['batchsize'],
+    #     # shuffle=False,
+    #     shuffle=True,
+    #     drop_last=True,
+    # )
+    # dataset = OminiDataset_v1(transform=None)
+    dataset = OMDataset_indiv(transform=None)
+    # datasetp = OminiDataset_paired(transform=None)
+    datasetp = OMDataset_pair(transform=None)
+    train_loader = DataLoader(
+        dataset,
+        batch_size=hyp_parameters['batchsize'],
+        shuffle=True,
+        drop_last=True,
+    )
+    train_loader_p = DataLoader(
+        datasetp,
+        batch_size=max(1, hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO),
+        shuffle=True,
+        drop_last=True,
+    )
+    Deformddpm = DeformDDPM(
+        network=Net(
+            n_steps=hyp_parameters["timesteps"],
+            ndims=hyp_parameters["ndims"],
+            num_input_chn = hyp_parameters["num_input_chn"],
+            res = hyp_parameters['img_size']
+            ),
+        n_steps=hyp_parameters["timesteps"],
+        image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
+        device=hyp_parameters["device"],
+        batch_size=hyp_parameters["batchsize"],
+        img_pad_mode=hyp_parameters["img_pad_mode"],
+        v_scale=hyp_parameters["v_scale"],
+    )
+    ddf_stn = STN(
+        img_sz=hyp_parameters["img_size"],
+        ndims=hyp_parameters["ndims"],
+        # padding_mode="zeros",
+        padding_mode=hyp_parameters["padding_mode"],
+        device=hyp_parameters["device"],
+    )
+    if use_distributed:
+        Deformddpm.to(rank)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank])
+        ddf_stn.to(rank)
+    else:
+        Deformddpm.to(hyp_parameters["device"])
+        ddf_stn.to(hyp_parameters["device"])
+    # ddf_stn = DDP(ddf_stn, device_ids=[rank])
+    # mse = nn.MSELoss()
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
+    loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.MSLNCC()
+    loss_imgmse = losses.LMSE()
+    optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
+    # hyp_parameters["lr"]=0.00000001
+    # optimizer_regist = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01)
+    # optimizer_regist = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01, momentum=0.98)
+    # optimizer = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"], momentum=0.9)
+    # # LR scheduler ----- YHM
+    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, hyp_parameters["lr"], hyp_parameters["lr"]*10, step_size_up=500, step_size_down=500, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)
+    # Deformddpm.network.load_state_dict(torch.load('/home/data/jzheng/Adaptive_Motion_Generator-master/models/1000.pth'))
+    # check for existing models
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+    model_files = glob.glob(os.path.join(model_dir, "*.pth"))
+    model_files.sort()
+    if model_files:
+        if gpu_id == 0:
+            print(model_files)
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, model_files[-1], use_distributed=use_distributed)
+    else:
+        initial_epoch = 0
+    if gpu_id == 0:
+        print('len_train_data: ',len(dataset))
+    # Training loop
+    for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        epoch_loss_tot = 0.0
+        epoch_loss_gen_d = 0.0
+        epoch_loss_gen_a = 0.0
+        epoch_loss_reg = 0.0
+        epoch_loss_regist = 0.0
+        epoch_loss_imgsim = 0.0
+        epoch_loss_imgmse = 0.0
+        epoch_loss_ddfreg = 0.0
+        epoch_loss_contrastive = 0.0
+        # Set model inside to train model
+        Deformddpm.train()
+        loss_nan_step = 0  # yu: count the number of nan loss steps
+        total = min(len(train_loader), len(train_loader_p))
+        total_reg = total // REGISTRATION_STEP_RATIO
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # x0, _ = batch
+            # ==========================================================================
+            # diffusion train on single image
+            # x0 = batch # for omni dataset
+            [x0,embd] = batch # for om dataset
+            x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
+            # print('embd:', embd.shape)
+            embd_dev = embd.to(hyp_parameters["device"]).type(torch.float32)
+            if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd_in = embd_dev
+            else:
+                embd_in = None
+            n = x0.size()[0]  # batch_size -> n
+            x0 = x0.to(hyp_parameters["device"])
+            blind_mask = utils.get_random_deformed_mask(x0.shape[2:],apply_possibility=0.6).to(hyp_parameters["device"])
+            # random deformation + rotation
+            if hyp_parameters["ndims"]>2:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = utils.random_resample(x0, deform_scale=0)
+                # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
+                else:
+                    [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
+            if hyp_parameters['noise_scale']>0:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
+                x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+            # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+            t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                hyp_parameters["device"]
+            )  # pick up a seq of rand number from 0 to 'timestep'
+            # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
+            # print('proc_type:', proc_type)
+            ddpm = Deformddpm.module if use_distributed else Deformddpm
+            cond_img, _, cond_ratio = ddpm.proc_cond_img(x0,proc_type=proc_type)
+            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd_in)  # forward diffusion process
+            # print(torch.max(torch.abs(pre_dvf_I)))
+            # print(torch.max(torch.abs(dvf_I)))
+            loss_tot=0
+            loss_ddf = loss_reg(pre_dvf_I,img=x0)
+            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+            # >> JZ: print nan in x0
+            if torch.isnan(x0).any():
+                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+            # >> JZ: print loss of ddf
+            if loss_ddf>0.001:
+                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+            # yu: check if loss_tot==nan or inf
+            if torch.isnan(loss_tot) or torch.isinf(loss_tot):
+                print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                loss_nan_step += 1
+                continue
+            if loss_nan_step > 5:
+                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
+            optimizer.zero_grad()
+            loss_tot.backward()
+            optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
+            # ==========================================================================
+            # contrastive train on single image (text-image alignment)
+            loss_contra_val = None
+            if step % CONTRASTIVE_STEP_RATIO == 0:
+                raw_network = Deformddpm.module.network if use_distributed else Deformddpm.network
+                n_contra = x0.size()[0]
+                t_contra = torch.randint(0, hyp_parameters["timesteps"], (n_contra,)).to(hyp_parameters["device"])
+                _ = raw_network(x=(x0 * blind_mask).detach(), y=cond_img.detach(), t=t_contra, text=None)
+                if hasattr(raw_network, 'img_embd') and raw_network.img_embd is not None:
+                    img_embd = raw_network.img_embd  # [B, 1024]
+                    loss_contra = LOSS_WEIGHT_CONTRASTIVE * F.relu(1 - F.cosine_similarity(img_embd, embd_dev, dim=-1).mean()-0.05)  # contrastive loss to align image embedding with text embedding, with a margin of 0.02
+                    optimizer.zero_grad()
+                    loss_contra.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.02)
+                    optimizer.step()
+                    loss_contra_val = loss_contra.item()
+                    epoch_loss_contrastive += loss_contra_val / total * CONTRASTIVE_STEP_RATIO
+                else:
+                    if gpu_id == 0:
+                        print(f"*** Warning: Network does not have img_embd attribute for contrastive loss at epoch {epoch}, step {step}.")
+            # ==========================================================================
+            # registration train on paired images
+            if step%REGISTRATION_STEP_RATIO == 0 and loss_gen_a.item()<-0.6:  # only train registration on relatively well-deformed images, to avoid too large registration loss and unstable training in the early stage
+                [x1, y1, _, embd_y] = batch_p
+                if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                    embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
+                else:
+                    embd_y = None
+                x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
+                y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
+                n = x1.size()[0]  # batch_size -> n
+                [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
+                if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                scale_regist = np.random.uniform(0.0,0.7)
+                select_timestep = np.random.randint(12, 25)  # select a random number of timesteps to sample, between 8 and 16
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                T_regist = [[t for _ in range(max(1, hyp_parameters["batchsize"]//2))] for t in T_regist]
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                ddpm_inner = Deformddpm.module if use_distributed else Deformddpm
+                y1_proc, msk_tgt, cond_ratio = ddpm_inner.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
+                loss_regist = 0
+                loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
+                loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
+                loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
+                # >> JZ: print nan in x0
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
+                    print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                optimizer.zero_grad()
+                loss_regist.backward()
+                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.1)
+                optimizer.step()
+                epoch_loss_regist += loss_regist.item()
+                epoch_loss_imgsim += loss_sim.item()
+                epoch_loss_imgmse += loss_mse.item()
+                epoch_loss_ddfreg += loss_ddf1.item()
+            else:
+                loss_sim = torch.tensor(0.0)
+                loss_mse = torch.tensor(0.0)
+                loss_ddf1 = torch.tensor(0.0)
+                loss_regist = torch.tensor(0.0)
+                if step % REGISTRATION_STEP_RATIO==0:
+                    total_reg = total_reg-1
+            if step % 10 == 0:
+                print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+                if loss_contra_val is not None:
+                    print(f'     loss_contrastive: {loss_contra_val:.6f}')
+                print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
+        if 1:
+            print('==================')
+            print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_contrastive: {epoch_loss_contrastive}')
+            print(f'     loss_regist: {epoch_loss_regist/total_reg} = {epoch_loss_imgsim/total_reg} (imgsim) + {epoch_loss_imgmse/total_reg} (imgmse) + {epoch_loss_ddfreg/total_reg} (ddf)')
+            print('==================')
+        if 0 == epoch % epoch_per_save:
+            save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
+            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+            # break   # FOR TESTING
+            if not use_distributed:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+            elif gpu_id == 0:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.module.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.module.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+    # Resource cleanup at the end of training
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+    if use_distributed and dist.is_initialized():
+        dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
+    if gpu_id == 0:
+    # if 0:
+        utils.print_memory_usage("Before Loading Model")
+        if torch.cuda.is_available():
+            gc.collect()
+            torch.cuda.empty_cache()
+        # Deformddpm.network.load_state_dict(torch.load(latest_model_file))
+        # Deformddpm.load_state_dict(torch.load(latest_model_file), strict=False)
+        checkpoint = torch.load(model_file, map_location='cpu')
+        # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
+        if use_distributed:
+            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        else:
+            Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        if load_strict:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        utils.print_memory_usage("After Loading Checkpoint on GPU")
+    if use_distributed:
+        # Broadcast model weights from rank 0 to all other GPUs
+        dist.barrier()
+        for param in Deformddpm.parameters():
+            dist.broadcast(param.data, src=0)  # Synchronize model across ranks
+        dist.barrier()
+        for param_group in optimizer.param_groups:
+            for param in param_group['params']:
+                if param.grad is not None:
+                    dist.broadcast(param.grad, src=0)  # Sync optimizer gradients
+    # initial_epoch = checkpoint['epoch'] + 1
+    # get the epoch number from the filename and add 1 to set as initial_epoch
+    initial_epoch = int(os.path.basename(model_file).split('.')[0][:6]) + 1
+    return initial_epoch, Deformddpm, optimizer
+if __name__ == "__main__":
+    if use_distributed:
+        world_size = torch.cuda.device_count()
+        print(f"Distributed GPU number = {world_size}")
+        mp.spawn(main_train,args = (world_size,),nprocs = world_size)
+    else:
+        main_train(0,1)

OM_train_3modes_opt.py ADDED Viewed

	@@ -0,0 +1,513 @@

+"""
+OM_train_3modes_opt.py — Optimized 3-mode training (diffusion + contrastive + registration).
+Speed optimizations over OM_train_3modes.py (all mathematically equivalent):
+  1. DataLoader: num_workers, pin_memory, persistent_workers for I/O overlap
+  2. optimizer.zero_grad(set_to_none=True) — avoids zero-fill overhead
+  3. Fixed-length T_regist (16 steps) — avoids XPU dynamic shape recompilation
+  4. Removed redundant x0.to(device) call
+  5. Uses diffuser_opt.DeformDDPM (hoisted clone, no *0 redundancy, OptSTN, inference_mode)
+  6. Uses losses_opt.MSLNCC/LNCC (register_buffer for kernels)
+  7. Pre-compute proc_type lists to reduce Python overhead in hot loop
+  8. Uses OptRecMulModMutAttnNet (cached resample tensors, ~300 fewer CPU→GPU transfers)
+  9. Uses OptSTN for ddf_stn (register_buffer, no per-call .to())
+"""
+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import gc
+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam, SGD
+from Diffusion.diffuser_opt import DeformDDPM
+from Diffusion.networks_opt import get_net_opt, OptSTN
+from torchvision.transforms import Lambda
+import torch.nn.functional as F
+import Diffusion.losses_opt as losses
+import random
+import glob
+import numpy as np
+import utils
+from tqdm import tqdm
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from Dataloader.dataloader_utils import thresh_img
+import yaml
+import argparse
+####################
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+###############
+def ddp_setup(rank, world_size):
+    """
+    Args:
+        rank: Unique identifier of each process
+        world_size: Total number of processes
+    """
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+# Auto-detect: use DDP only when multiple CUDA GPUs are available
+use_distributed = torch.cuda.is_available() and torch.cuda.device_count() > 1
+# use_distributed = True
+# use_distributed = False
+# NOTE(review): EPS is not referenced in the visible part of this script.
+EPS = 1e-5
+# Small offset added to the target mask and inside the sqrt-based loss scaling in main_train.
+MSK_EPS = 0.01
+# Probability that the text embedding is passed to the model for a step (otherwise text=None).
+TEXT_EMBED_PROB = 0.7
+# Probability threshold used for the resample-vs-permute and intensity-threshold augmentations.
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [2.0, 2.0, 4.0]  # [ang, dist, reg]
+# LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.05, 128]  # [imgsim, imgmse, ddf]
+# The paired (registration) loader uses batchsize // DIFF_REG_BATCH_RATIO samples per batch.
+DIFF_REG_BATCH_RATIO = 2
+# Multiplier applied to (1 - cosine similarity) in the contrastive step.
+LOSS_WEIGHT_CONTRASTIVE = 1.0
+# The contrastive step runs on steps where step % CONTRASTIVE_STEP_RATIO == 0.
+CONTRASTIVE_STEP_RATIO = 2
+# OPT: Fixed registration timestep count to avoid XPU dynamic shape recompilation
+FIXED_T_REGIST_LEN = 16
+# OPT: DataLoader workers (set to 0 to disable multiprocessing if needed)
+NUM_WORKERS = 4
+# Pinned host memory is requested for the DataLoaders unless the configured device is "cpu".
+PIN_MEMORY = True
+# AUG_PERMUTE_PROB = 0.35
+# Command-line arguments are parsed at import time, so spawned worker processes
+# re-parse the same sys.argv when they import this module.
+parser = argparse.ArgumentParser()
+# config_file_path = 'Config/config_cmr.yaml'
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        # default="Config/config_cmr.yaml",
+        # default="Config/config_lct.yaml",
+        default="Config/config_all.yaml",
+        required=False,
+    )
+parser.add_argument("--dummy-samples", type=int, default=0, help="Use dummy random data for testing (0=use real data)")
+parser.add_argument("--batchsize", type=int, default=0, help="Override batch size from config (0=use config value)")
+parser.add_argument("--num-workers", type=int, default=NUM_WORKERS, help="DataLoader num_workers (default: 4)")
+args = parser.parse_args()
+#=======================================================================================================================
+class _DummyIndiv(torch.utils.data.Dataset):
+    def __init__(self, n, sz, embd_dim=1024):
+        self.n, self.sz, self.embd_dim = n, sz, embd_dim
+    def __len__(self): return self.n
+    def __getitem__(self, i):
+        return np.random.rand(1, self.sz, self.sz, self.sz).astype(np.float64), np.random.randn(self.embd_dim).astype(np.float32)
+class _DummyPair(torch.utils.data.Dataset):
+    def __init__(self, n, sz, embd_dim=1024):
+        self.n, self.sz, self.embd_dim = n, sz, embd_dim
+    def __len__(self): return self.n
+    def __getitem__(self, i):
+        return (np.random.rand(1, self.sz, self.sz, self.sz).astype(np.float64),
+                np.random.rand(1, self.sz, self.sz, self.sz).astype(np.float64),
+                np.random.randn(self.embd_dim).astype(np.float32),
+                np.random.randn(self.embd_dim).astype(np.float32))
+def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
+    if use_distributed:
+        ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized():
+            print(f"World size: {torch.distributed.get_world_size()}")
+            print(f"Communication backend: {torch.distributed.get_backend()}")
+    gpu_id = rank
+    # Load the YAML file into a dictionary
+    with open(args.config, 'r') as file:
+        hyp_parameters = yaml.safe_load(file)
+    if args.batchsize > 0:
+        hyp_parameters['batchsize'] = args.batchsize
+    print(hyp_parameters)
+    # epoch_per_save=10
+    epoch_per_save=hyp_parameters['epoch_per_save']
+    data_name=hyp_parameters['data_name']
+    net_name = hyp_parameters['net_name']
+    Net=get_net_opt(net_name)
+    suffix_pth=f'_{data_name}_{net_name}.pth'
+    model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
+    model_dir=model_save_path
+    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
+    # OPT: DataLoader with num_workers, pin_memory, persistent_workers
+    num_workers = args.num_workers
+    use_pin_memory = PIN_MEMORY and hyp_parameters["device"] != "cpu"
+    if args.dummy_samples > 0:
+        dataset = _DummyIndiv(args.dummy_samples, hyp_parameters['img_size'])
+        datasetp = _DummyPair(args.dummy_samples, hyp_parameters['img_size'])
+    else:
+        dataset = OMDataset_indiv(transform=None)
+        datasetp = OMDataset_pair(transform=None)
+    train_loader = DataLoader(
+        dataset,
+        batch_size=hyp_parameters['batchsize'],
+        shuffle=True,
+        drop_last=True,
+        num_workers=num_workers,                           # OPT
+        pin_memory=use_pin_memory,                         # OPT
+        persistent_workers=num_workers > 0,                # OPT
+    )
+    train_loader_p = DataLoader(
+        datasetp,
+        batch_size=max(1, hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO),
+        shuffle=True,
+        drop_last=True,
+        num_workers=num_workers,                           # OPT
+        pin_memory=use_pin_memory,                         # OPT
+        persistent_workers=num_workers > 0,                # OPT
+    )
+    Deformddpm = DeformDDPM(
+        network=Net(
+            n_steps=hyp_parameters["timesteps"],
+            ndims=hyp_parameters["ndims"],
+            num_input_chn = hyp_parameters["num_input_chn"],
+            res = hyp_parameters['img_size']
+            ),
+        n_steps=hyp_parameters["timesteps"],
+        image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
+        device=hyp_parameters["device"],
+        batch_size=hyp_parameters["batchsize"],
+        img_pad_mode=hyp_parameters["img_pad_mode"],
+        v_scale=hyp_parameters["v_scale"],
+    )
+    ddf_stn = OptSTN(
+        img_sz=hyp_parameters["img_size"],
+        ndims=hyp_parameters["ndims"],
+        # padding_mode="zeros",
+        padding_mode=hyp_parameters["padding_mode"],
+        device=hyp_parameters["device"],
+    )
+    if use_distributed:
+        Deformddpm.to(rank)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank])
+        ddf_stn.to(rank)
+    else:
+        Deformddpm.to(hyp_parameters["device"])
+        ddf_stn.to(hyp_parameters["device"])
+    # ddf_stn = DDP(ddf_stn, device_ids=[rank])
+    # mse = nn.MSELoss()
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
+    loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.MSLNCC()
+    loss_imgmse = losses.LMSE()
+    optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
+    # check for existing models
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+    model_files = glob.glob(os.path.join(model_dir, "*.pth"))
+    model_files.sort()
+    if model_files:
+        if gpu_id == 0:
+            print(model_files)
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, model_files[-1], use_distributed=use_distributed)
+    else:
+        initial_epoch = 0
+    if gpu_id == 0:
+        print('len_train_data: ',len(dataset))
+    # Training loop
+    for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        epoch_loss_tot = 0.0
+        epoch_loss_gen_d = 0.0
+        epoch_loss_gen_a = 0.0
+        epoch_loss_reg = 0.0
+        epoch_loss_regist = 0.0
+        epoch_loss_imgsim = 0.0
+        epoch_loss_imgmse = 0.0
+        epoch_loss_ddfreg = 0.0
+        epoch_loss_contrastive = 0.0
+        # Set model inside to train model
+        Deformddpm.train()
+        loss_nan_step = 0  # yu: count the number of nan loss steps
+        total = min(len(train_loader), len(train_loader_p))
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # ==========================================================================
+            # diffusion train on single image
+            [x0,embd] = batch # for om dataset
+            x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
+            embd_dev = embd.to(hyp_parameters["device"]).type(torch.float32)
+            if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd_in = embd_dev
+            else:
+                embd_in = None
+            n = x0.size()[0]  # batch_size -> n
+            # OPT: removed redundant x0.to(device) — already done above
+            blind_mask = utils.get_random_deformed_mask(x0.shape[2:],apply_possibility=0.6).to(hyp_parameters["device"])
+            # random deformation + rotation
+            if hyp_parameters["ndims"]>2:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = utils.random_resample(x0, deform_scale=0)
+                # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
+                else:
+                    [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
+            if hyp_parameters['noise_scale']>0:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
+                x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+            # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+            t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                hyp_parameters["device"]
+            )  # pick up a seq of rand number from 0 to 'timestep'
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
+            ddpm = Deformddpm.module if use_distributed else Deformddpm
+            cond_img, _, cond_ratio = ddpm.proc_cond_img(x0,proc_type=proc_type)
+            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd_in)  # forward diffusion process
+            loss_tot=0
+            loss_ddf = loss_reg(pre_dvf_I,img=x0)
+            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+            # >> JZ: print nan in x0
+            if torch.isnan(x0).any():
+                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+            # >> JZ: print loss of ddf
+            if loss_ddf>0.001:
+                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+            # yu: check if loss_tot==nan or inf
+            if torch.isnan(loss_tot) or torch.isinf(loss_tot):
+                print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                loss_nan_step += 1
+                continue
+            if loss_nan_step > 5:
+                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
+            optimizer.zero_grad(set_to_none=True)     # OPT: set_to_none faster than zero-fill
+            loss_tot.backward()
+            optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
+            # ==========================================================================
+            # contrastive train on single image (text-image alignment)
+            loss_contra_val = None
+            if step % CONTRASTIVE_STEP_RATIO == 0:
+                raw_network = Deformddpm.module.network if use_distributed else Deformddpm.network
+                n_contra = x0.size()[0]
+                t_contra = torch.randint(0, hyp_parameters["timesteps"], (n_contra,)).to(hyp_parameters["device"])
+                _ = raw_network(x=(x0 * blind_mask).detach(), y=cond_img.detach(), t=t_contra, text=None)
+                if hasattr(raw_network, 'img_embd') and raw_network.img_embd is not None:
+                    img_embd = raw_network.img_embd  # [B, 1024]
+                    loss_contra = LOSS_WEIGHT_CONTRASTIVE * (1 - F.cosine_similarity(img_embd, embd_dev, dim=-1).mean())
+                    optimizer.zero_grad(set_to_none=True)   # OPT
+                    loss_contra.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.05)
+                    optimizer.step()
+                    loss_contra_val = loss_contra.item()
+                    epoch_loss_contrastive += loss_contra_val / total
+                else:
+                    if gpu_id == 0:
+                        print(f"*** Warning: Network does not have img_embd attribute for contrastive loss at epoch {epoch}, step {step}.")
+            # ==========================================================================
+            # registration train on paired images
+            if step%train_mode_ratio == 0:
+                [x1, y1, _, embd_y] = batch_p
+                if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                    embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
+                else:
+                    embd_y = None
+                x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
+                y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
+                n = x1.size()[0]  # batch_size -> n
+                [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
+                if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                scale_regist = np.random.uniform(0.0,0.7)
+                # OPT: fixed-length T_regist to avoid XPU dynamic shape recompilation
+                # Sample FIXED_T_REGIST_LEN timesteps (was: random 8-16), always same loop length
+                t_pool = list(range(int(hyp_parameters["timesteps"] * scale_regist), hyp_parameters["timesteps"]))
+                select_timestep = min(FIXED_T_REGIST_LEN, len(t_pool))
+                T_regist = sorted(random.sample(t_pool, select_timestep), reverse=True)
+                T_regist = [[t for _ in range(max(1, hyp_parameters["batchsize"]//2))] for t in T_regist]
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                ddpm_inner = Deformddpm.module if use_distributed else Deformddpm
+                y1_proc, msk_tgt, cond_ratio = ddpm_inner.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
+                loss_regist = 0
+                loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
+                loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
+                loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
+                # >> JZ: print nan in x0
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
+                    print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                optimizer.zero_grad(set_to_none=True)   # OPT
+                loss_regist.backward()
+                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.2)
+                optimizer.step()
+                epoch_loss_regist += loss_regist.item() / total
+                epoch_loss_imgsim += loss_sim.item() / total
+                epoch_loss_imgmse += loss_mse.item() / total
+                epoch_loss_ddfreg += loss_ddf1.item() / total
+            if step % 10 == 0:
+                print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+                if loss_contra_val is not None:
+                    print(f'     loss_contrastive: {loss_contra_val:.6f}')
+                print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
+        if 1:
+            print('==================')
+            print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_contrastive: {epoch_loss_contrastive}')
+            print(f'     loss_regist: {epoch_loss_regist} = {epoch_loss_imgsim} (imgsim) + {epoch_loss_imgmse} (imgmse) + {epoch_loss_ddfreg} (ddf)')
+            print('==================')
+        if 0 == epoch % epoch_per_save:
+            save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
+            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+            # break   # FOR TESTING
+            if not use_distributed:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+            elif gpu_id == 0:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.module.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.module.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+    # Resource cleanup at the end of training
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+    if use_distributed and dist.is_initialized():
+        dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
+    if gpu_id == 0:
+    # if 0:
+        utils.print_memory_usage("Before Loading Model")
+        if torch.cuda.is_available():
+            gc.collect()
+            torch.cuda.empty_cache()
+        checkpoint = torch.load(model_file, map_location='cpu')
+        if use_distributed:
+            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        else:
+            Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        if load_strict:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        utils.print_memory_usage("After Loading Checkpoint on GPU")
+    if use_distributed:
+        # Broadcast model weights from rank 0 to all other GPUs
+        dist.barrier()
+        for param in Deformddpm.parameters():
+            dist.broadcast(param.data, src=0)  # Synchronize model across ranks
+        dist.barrier()
+        for param_group in optimizer.param_groups:
+            for param in param_group['params']:
+                if param.grad is not None:
+                    dist.broadcast(param.grad, src=0)  # Sync optimizer gradients
+    # initial_epoch = checkpoint['epoch'] + 1
+    # get the epoch number from the filename and add 1 to set as initial_epoch
+    initial_epoch = int(os.path.basename(model_file).split('.')[0][:6]) + 1
+    return initial_epoch, Deformddpm, optimizer
+if __name__ == "__main__":
+    if use_distributed:
+        world_size = torch.cuda.device_count()
+        print(f"Distributed GPU number = {world_size}")
+        mp.spawn(main_train,args = (world_size,),nprocs = world_size)
+    else:
+        main_train(0,1)

OM_train_3modes_original.py ADDED Viewed

	@@ -0,0 +1,585 @@

+import os, sys
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(ROOT_DIR)
+import gc
+import torch
+import torchvision
+from torch import nn
+from torchvision.utils import save_image
+from torch.utils.data import DataLoader
+from torch.optim import Adam, SGD
+from Diffusion.diffuser import DeformDDPM
+from Diffusion.networks import get_net, STN
+from torchvision.transforms import Lambda
+import torch.nn.functional as F
+import Diffusion.losses as losses
+import random
+import glob
+import numpy as np
+import utils
+from tqdm import tqdm
+from Dataloader.dataloader0 import get_dataloader
+from Dataloader.dataLoader import *
+from Dataloader.dataloader_utils import thresh_img
+import yaml
+import argparse
+# XPU support: import Intel Extension for PyTorch and oneCCL bindings if available
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError:
+    ipex = None
+try:
+    import oneccl_bindings_for_pytorch
+except (ImportError, Exception) as e:
+    print(f"WARNING: Failed to import oneccl_bindings_for_pytorch: {e}")
+####################
+import torch.multiprocessing as mp
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn.parallel import DistributedDataParallel as DDP
+import torch.distributed as dist
+# from torch.distributed import init_process_group
+###############
+def _device_available(device_type):
+    if device_type == 'xpu':
+        return hasattr(torch, 'xpu') and torch.xpu.is_available()
+    return torch.cuda.is_available()
+def _device_count(device_type):
+    if device_type == 'xpu':
+        return torch.xpu.device_count() if hasattr(torch, 'xpu') else 0
+    return torch.cuda.device_count()
+def _set_device(rank, device_type):
+    if device_type == 'xpu':
+        torch.xpu.set_device(rank)
+    else:
+        torch.cuda.set_device(rank)
+def _empty_cache(device_type):
+    if device_type == 'xpu' and hasattr(torch, 'xpu'):
+        torch.xpu.empty_cache()
+    elif torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def ddp_setup(rank, world_size):
+    """
+    Args:
+        rank: Unique identifier of each process (local_rank when launched by torchrun)
+        world_size: Total number of processes
+    """
+    backend = "ccl" if DEVICE_TYPE == "xpu" else "nccl"
+    if "LOCAL_RANK" in os.environ:
+        # Launched by torchrun: MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE already set
+        dist.init_process_group(backend=backend)
+        _set_device(int(os.environ["LOCAL_RANK"]), DEVICE_TYPE)
+    else:
+        # Single-node mp.spawn
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
+        _set_device(rank, DEVICE_TYPE)
+# Training hyper-parameters shared by the training loop below.
+# NOTE(review): most of these are consumed inside main_train, whose body is only
+# partly visible here — confirm the exact usage before changing values.
+EPS = 1e-5
+MSK_EPS = 0.01
+TEXT_EMBED_PROB = 0.5
+AUG_RESAMPLE_PROB = 0.5
+LOSS_WEIGHTS_DIFF = [2.0, 1.0, 4.0]  # [ang, dist, reg]
+# LOSS_WEIGHTS_REGIST = [9.0, 1.0, 16.0]  # [imgsim, imgmse, ddf]
+LOSS_WEIGHTS_REGIST = [1.0, 0.01, 1e2]  # [imgsim, imgmse, ddf]
+# The paired (registration) loader uses batchsize // DIFF_REG_BATCH_RATIO samples per batch.
+DIFF_REG_BATCH_RATIO = 2
+LOSS_WEIGHT_CONTRASTIVE = 0.001
+REGISTRATION_STEP_RATIO = 1
+CONTRASTIVE_STEP_RATIO = 1
+# AUG_PERMUTE_PROB = 0.35
+# Command-line arguments are parsed at import time, so spawned worker processes
+# re-parse the same sys.argv when they import this module.
+parser = argparse.ArgumentParser()
+# config_file_path = 'Config/config_cmr.yaml'
+parser.add_argument(
+        "--config",
+        "-C",
+        help="Path for the config file",
+        type=str,
+        # default="Config/config_cmr.yaml",
+        # default="Config/config_lct.yaml",
+        default="Config/config_all.yaml",
+        required=False,
+    )
+parser.add_argument("--batchsize", type=int, default=0, help="Override batch size from config (0=use config value)")
+args = parser.parse_args()
+# Read config early to determine device type for DDP setup
+with open(args.config, 'r') as _f:
+    _cfg = yaml.safe_load(_f)
+# The raw 'device' config value is used as the backend name; ddp_setup and the
+# _device_* helpers compare it against the literal 'xpu'.
+DEVICE_TYPE = _cfg.get('device', 'cuda')  # 'cuda' or 'xpu'
+# Auto-detect: use DDP only when multiple devices are available
+use_distributed = _device_available(DEVICE_TYPE) and _device_count(DEVICE_TYPE) > 1
+# use_distributed = True
+# use_distributed = False
+def main_train(rank=0,world_size=1,train_mode_ratio=1,thresh_imgsim=0.01):
+    if use_distributed:
+        ddp_setup(rank,world_size)
+        if torch.distributed.is_initialized() and rank == 0:
+            print(f"World size: {torch.distributed.get_world_size()}")
+            print(f"Communication backend: {torch.distributed.get_backend()}")
+    # gpu_id = global rank (for save/print guards); rank = local device index
+    if "RANK" in os.environ:
+        gpu_id = int(os.environ["RANK"])
+        rank = int(os.environ["LOCAL_RANK"])
+    else:
+        gpu_id = rank
+    # Load the YAML file into a dictionary
+    with open(args.config, 'r') as file:
+        hyp_parameters = yaml.safe_load(file)
+    if args.batchsize > 0:
+        hyp_parameters['batchsize'] = args.batchsize
+    if gpu_id == 0:
+        print(hyp_parameters)
+    # epoch_per_save=10
+    epoch_per_save=hyp_parameters['epoch_per_save']
+    data_name=hyp_parameters['data_name']
+    net_name = hyp_parameters['net_name']
+    Net=get_net(net_name)
+    suffix_pth=f'_{data_name}_{net_name}.pth'
+    model_save_path = os.path.join('Models',f'{data_name}_{net_name}/')
+    model_dir=model_save_path
+    transformer=utils.get_transformer(img_sz=hyp_parameters["ndims"]*[hyp_parameters['img_size']])
+    # Data_Loader=get_dataloader(data_name=hyp_parameters['data_name'], mode='train')
+    # tsfm = torchvision.transforms.Compose([
+    #             torchvision.transforms.ToTensor(),
+    #             ])
+    # dataset = Data_Loader(target_res = [hyp_parameters["img_size"]]*hyp_parameters["ndims"], transforms=None, noise_scale=hyp_parameters['noise_scale'])
+    # train_loader = DataLoader(
+    #     dataset,
+    #     batch_size=hyp_parameters['batchsize'],
+    #     # shuffle=False,
+    #     shuffle=True,
+    #     drop_last=True,
+    # )
+    # dataset = OminiDataset_v1(transform=None)
+    dataset = OMDataset_indiv(transform=None)
+    # datasetp = OminiDataset_paired(transform=None)
+    datasetp = OMDataset_pair(transform=None)
+    if use_distributed:
+        sampler = DistributedSampler(dataset, shuffle=True)
+        sampler_p = DistributedSampler(datasetp, shuffle=True)
+    else:
+        sampler = None
+        sampler_p = None
+    train_loader = DataLoader(
+        dataset,
+        batch_size=hyp_parameters['batchsize'],
+        shuffle=(sampler is None),
+        drop_last=True,
+        sampler=sampler,
+    )
+    train_loader_p = DataLoader(
+        datasetp,
+        batch_size=max(1, hyp_parameters['batchsize']//DIFF_REG_BATCH_RATIO),
+        shuffle=(sampler_p is None),
+        drop_last=True,
+        sampler=sampler_p,
+    )
+    Deformddpm = DeformDDPM(
+        network=Net(
+            n_steps=hyp_parameters["timesteps"],
+            ndims=hyp_parameters["ndims"],
+            num_input_chn = hyp_parameters["num_input_chn"],
+            res = hyp_parameters['img_size']
+            ),
+        n_steps=hyp_parameters["timesteps"],
+        image_chw=[1] + [hyp_parameters["img_size"]]*hyp_parameters["ndims"],
+        device=hyp_parameters["device"],
+        batch_size=hyp_parameters["batchsize"],
+        img_pad_mode=hyp_parameters["img_pad_mode"],
+        v_scale=hyp_parameters["v_scale"],
+    )
+    ddf_stn = STN(
+        img_sz=hyp_parameters["img_size"],
+        ndims=hyp_parameters["ndims"],
+        # padding_mode="zeros",
+        padding_mode=hyp_parameters["padding_mode"],
+        device=hyp_parameters["device"],
+    )
+    if use_distributed:
+        device = f"{DEVICE_TYPE}:{rank}"
+        Deformddpm.to(device)
+        Deformddpm = DDP(Deformddpm, device_ids=[rank])
+        ddf_stn.to(device)
+    else:
+        Deformddpm.to(hyp_parameters["device"])
+        ddf_stn.to(hyp_parameters["device"])
+    # ddf_stn = DDP(ddf_stn, device_ids=[rank])
+    # mse = nn.MSELoss()
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj'], ndims=hyp_parameters["ndims"])
+    # loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"])
+    loss_reg = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.2,outrange_weight=1e3)
+    loss_reg1 = losses.Grad(penalty=['l1', 'negdetj', 'range'], ndims=hyp_parameters["ndims"],outrange_thresh=0.6,outrange_weight=1e3)
+    loss_dist = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    # loss_ang = losses.MRSE(img_sz=hyp_parameters["img_size"])
+    loss_ang = losses.NCC(img_sz=hyp_parameters["img_size"])
+    loss_imgsim = losses.MSLNCC()
+    loss_imgmse = losses.LMSE()
+    optimizer = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"])
+    # hyp_parameters["lr"]=0.00000001
+    # optimizer_regist = Adam(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01)
+    # optimizer_regist = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"]*0.01, momentum=0.98)
+    # optimizer = SGD(Deformddpm.parameters(), lr=hyp_parameters["lr"], momentum=0.9)
+    # # LR scheduler ----- YHM
+    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, hyp_parameters["lr"], hyp_parameters["lr"]*10, step_size_up=500, step_size_down=500, mode='triangular', gamma=1.0, scale_fn=None, scale_mode='cycle', cycle_momentum=True, base_momentum=0.8, max_momentum=0.9, last_epoch=-1)
+    # Deformddpm.network.load_state_dict(torch.load('/home/data/jzheng/Adaptive_Motion_Generator-master/models/1000.pth'))
+    # check for existing models
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir, exist_ok=True)
+    model_files = glob.glob(os.path.join(model_dir, "*.pth"))
+    model_files.sort()
+    if model_files:
+        if gpu_id == 0:
+            print(model_files)
+        initial_epoch, Deformddpm, optimizer = ddp_load_dict(gpu_id, Deformddpm, optimizer, model_files[-1], use_distributed=use_distributed)
+    else:
+        initial_epoch = 0
+    if gpu_id == 0:
+        print('len_train_data: ',len(dataset))
+    # Training loop
+    for epoch in range(initial_epoch,hyp_parameters["epoch"]):
+        if use_distributed and sampler is not None:
+            sampler.set_epoch(epoch)
+            sampler_p.set_epoch(epoch)
+        epoch_loss_tot = 0.0
+        epoch_loss_gen_d = 0.0
+        epoch_loss_gen_a = 0.0
+        epoch_loss_reg = 0.0
+        epoch_loss_regist = 0.0
+        epoch_loss_imgsim = 0.0
+        epoch_loss_imgmse = 0.0
+        epoch_loss_ddfreg = 0.0
+        epoch_loss_contrastive = 0.0
+        # Set model inside to train model
+        Deformddpm.train()
+        loss_nan_step = 0  # yu: count the number of nan loss steps
+        total = min(len(train_loader), len(train_loader_p))
+        total_reg = total // REGISTRATION_STEP_RATIO
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in tqdm(enumerate(train_loader)):
+        # for step, batch in enumerate(train_loader_omni):
+        for step, (batch, batch_p) in tqdm(enumerate(zip(train_loader, train_loader_p)), total=total):
+            # x0, _ = batch
+            # ==========================================================================
+            # diffusion train on single image
+            # x0 = batch # for omni dataset
+            [x0,embd] = batch # for om dataset
+            x0 = x0.to(hyp_parameters["device"]).type(torch.float32)
+            # print('embd:', embd.shape)
+            embd_dev = embd.to(hyp_parameters["device"]).type(torch.float32)
+            if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                embd_in = embd_dev
+            else:
+                embd_in = None
+            n = x0.size()[0]  # batch_size -> n
+            x0 = x0.to(hyp_parameters["device"])
+            blind_mask = utils.get_random_deformed_mask(x0.shape[2:],apply_possibility=0.6).to(hyp_parameters["device"])
+            # random deformation + rotation
+            if hyp_parameters["ndims"]>2:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = utils.random_resample(x0, deform_scale=0)
+                # elif np.random.uniform(0,1)<AUG_RESAMPLE_PROB+AUG_PERMUTE_PROB:
+                else:
+                    [x0] = utils.random_permute([x0], select_dims=[-1,-2,-3])
+            # x0 = transformer(x0)
+            if hyp_parameters['noise_scale']>0:
+                if np.random.uniform(0,1)<AUG_RESAMPLE_PROB:
+                    x0 = thresh_img(x0, [0, 2*hyp_parameters['noise_scale']])
+                x0 = x0 * (np.random.normal(1, hyp_parameters['noise_scale'] * 1)) + np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+            # Picking some noise for each of the images in the batch, a timestep and the respective alpha_bars
+            t = torch.randint(0, hyp_parameters["timesteps"], (n,)).to(
+                hyp_parameters["device"]
+            )  # pick up a seq of rand number from 0 to 'timestep'
+            # proc_type = random.choice(['adding', 'independ', 'downsample', 'slice', 'project', 'none', 'uncon', 'uncon', 'uncon'])
+            proc_type = random.choice(['adding', 'downsample', 'slice', 'slice1', 'none', 'uncon', 'uncon', 'uncon'])
+            # print('proc_type:', proc_type)
+            ddpm = Deformddpm.module if use_distributed else Deformddpm
+            cond_img, _, cond_ratio = ddpm.proc_cond_img(x0,proc_type=proc_type)
+            pre_dvf_I,dvf_I = Deformddpm(img_org=x0, t=t, cond_imgs=cond_img, mask=blind_mask,proc_type=[],text=embd_in)  # forward diffusion process
+            # print(torch.max(torch.abs(pre_dvf_I)))
+            # print(torch.max(torch.abs(dvf_I)))
+            loss_tot=0
+            loss_ddf = loss_reg(pre_dvf_I,img=x0)
+            trm_pred = ddf_stn(pre_dvf_I, dvf_I)
+            loss_gen_d = loss_dist(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_gen_a = loss_ang(pred=trm_pred,inv_lab=dvf_I,ddf_stn=None,mask=blind_mask)
+            loss_tot += LOSS_WEIGHTS_DIFF[0] * loss_gen_a + LOSS_WEIGHTS_DIFF[1] * loss_gen_d
+            loss_tot += LOSS_WEIGHTS_DIFF[2] * loss_ddf
+            loss_tot = torch.sqrt(1.+MSK_EPS-cond_ratio) * loss_tot
+            # >> JZ: print nan in x0
+            if torch.isnan(x0).any():
+                print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+            # >> JZ: print loss of ddf
+            if loss_ddf>0.001:
+                print(f"*** High diffusion DDF loss at epoch {epoch}, step {step}: {loss_ddf.item()}.")
+            # yu: check if loss_tot==nan or inf
+            if torch.isnan(loss_tot) or torch.isinf(loss_tot):
+                print(f"*** Encountered NaN or Inf loss at epoch {epoch}, step {step}. Skipping this batch.")
+                loss_nan_step += 1
+                continue
+            if loss_nan_step > 5:
+                print(f"*** Too many NaN or Inf losses ({loss_nan_step} times) at epoch {epoch}, step {step}. Stopping training.")
+                raise ValueError("Too many NaN losses detected in loss_tot. Code terminated.")
+            optimizer.zero_grad()
+            loss_tot.backward()
+            optimizer.step()
+            epoch_loss_tot += loss_tot.item() / total
+            epoch_loss_gen_d += loss_gen_d.item() / total
+            epoch_loss_gen_a += loss_gen_a.item() / total
+            epoch_loss_reg += loss_ddf.item() / total
+            # ==========================================================================
+            # contrastive train on single image (text-image alignment)
+            loss_contra_val = None
+            if step % CONTRASTIVE_STEP_RATIO == 0:
+                raw_network = Deformddpm.module.network if use_distributed else Deformddpm.network
+                n_contra = x0.size()[0]
+                t_contra = torch.randint(0, hyp_parameters["timesteps"], (n_contra,)).to(hyp_parameters["device"])
+                _ = raw_network(x=(x0 * blind_mask).detach(), y=cond_img.detach(), t=t_contra, text=None)
+                if hasattr(raw_network, 'img_embd') and raw_network.img_embd is not None:
+                    img_embd = raw_network.img_embd  # [B, 1024]
+                    loss_contra = LOSS_WEIGHT_CONTRASTIVE * F.relu(1 - F.cosine_similarity(img_embd, embd_dev, dim=-1).mean()-0.05)  # contrastive loss to align image embedding with text embedding, with a margin of 0.02
+                    optimizer.zero_grad()
+                    loss_contra.backward()
+                    torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.02)
+                    optimizer.step()
+                    loss_contra_val = loss_contra.item()
+                    epoch_loss_contrastive += loss_contra_val / total * CONTRASTIVE_STEP_RATIO
+                else:
+                    if gpu_id == 0:
+                        print(f"*** Warning: Network does not have img_embd attribute for contrastive loss at epoch {epoch}, step {step}.")
+            # ==========================================================================
+            # registration train on paired images
+            if step%REGISTRATION_STEP_RATIO == 0 and loss_gen_a.item()<-0.6:  # only train registration on relatively well-deformed images, to avoid too large registration loss and unstable training in the early stage
+                [x1, y1, _, embd_y] = batch_p
+                if np.random.uniform(0,1)<TEXT_EMBED_PROB:
+                    embd_y = embd_y.to(hyp_parameters["device"]).type(torch.float32)
+                else:
+                    embd_y = None
+                x1 = x1.to(hyp_parameters["device"]).type(torch.float32)
+                y1 = y1.to(hyp_parameters["device"]).type(torch.float32)
+                n = x1.size()[0]  # batch_size -> n
+                [x1, y1] = utils.random_permute([x1, y1], select_dims=[-1,-2,-3])
+                if hyp_parameters['noise_scale']>0:
+                    [x1, y1] = thresh_img([x1, y1], [0, 2*hyp_parameters['noise_scale']])
+                    random_scale = np.random.normal(1, hyp_parameters['noise_scale'] * 1)
+                    random_shift = np.random.normal(0, hyp_parameters['noise_scale'] * 1)
+                    x1 = x1 * random_scale + random_shift
+                    y1 = y1 * random_scale + random_shift
+                scale_regist = np.random.uniform(0.0,0.7)
+                select_timestep = np.random.randint(12, 25)  # select a random number of timesteps to sample, between 8 and 16
+                T_regist = sorted(random.sample(range(int(hyp_parameters["timesteps"] * scale_regist),hyp_parameters["timesteps"]), select_timestep), reverse=True)
+                T_regist = [[t for _ in range(max(1, hyp_parameters["batchsize"]//2))] for t in T_regist]
+                proc_type = random.choice(['downsample', 'slice', 'slice1', 'none', 'none'])
+                ddpm_inner = Deformddpm.module if use_distributed else Deformddpm
+                y1_proc, msk_tgt, cond_ratio = ddpm_inner.proc_cond_img(y1,proc_type=proc_type)
+                msk_tgt = msk_tgt+MSK_EPS
+                [ddf_comp,ddf_rand],[img_rec,img_diff,img_save],_ = Deformddpm(img_org=x1, cond_imgs=y1_proc, T=[None, T_regist], proc_type=[],text=embd_y)  # forward diffusion process
+                loss_sim = loss_imgsim(img_rec, y1, label=msk_tgt*(y1>thresh_imgsim))  # calculate loss for the registration process
+                loss_mse = loss_imgmse(img_rec, y1, label=msk_tgt*(y1>=0.0))  # calculate loss for the registration process
+                loss_ddf1 = loss_reg1(ddf_comp, img=y1)  # calculate loss for the registration process
+                loss_regist = 0
+                loss_regist += LOSS_WEIGHTS_REGIST[0] * loss_sim
+                loss_regist += LOSS_WEIGHTS_REGIST[1] * loss_mse
+                loss_regist += LOSS_WEIGHTS_REGIST[2] * loss_ddf1
+                # >> JZ: print nan in x0
+                if torch.isnan(x0).any():
+                    print(f"*** Encountered NaN in input image x0 at epoch {epoch}, step {step}.")
+                # >> JZ: print loss of ddf
+                if loss_ddf1>0.002:
+                    print(f"*** High registration DDF loss at epoch {epoch}, step {step}: {loss_ddf1.item()}.")
+                loss_regist = torch.sqrt(cond_ratio+MSK_EPS) *loss_regist
+                optimizer.zero_grad()
+                loss_regist.backward()
+                torch.nn.utils.clip_grad_norm_(Deformddpm.parameters(), max_norm=0.1)
+                optimizer.step()
+                epoch_loss_regist += loss_regist.item()
+                epoch_loss_imgsim += loss_sim.item()
+                epoch_loss_imgmse += loss_mse.item()
+                epoch_loss_ddfreg += loss_ddf1.item()
+            else:
+                loss_sim = torch.tensor(0.0)
+                loss_mse = torch.tensor(0.0)
+                loss_ddf1 = torch.tensor(0.0)
+                loss_regist = torch.tensor(0.0)
+                if step % REGISTRATION_STEP_RATIO==0:
+                    total_reg = total_reg-1
+            # if step % 50 == 0:
+            #     print('step:',step,':', loss_tot.item(),'=',loss_gen_a.item(),'+', loss_gen_d.item(),'+',loss_ddf.item())
+            #     if loss_contra_val is not None:
+            #         print(f'     loss_contrastive: {loss_contra_val:.6f}')
+            #     print(f'     loss_regist: {loss_regist} = {loss_sim} (imgsim) + {loss_mse} (imgmse) + {loss_ddf1} (ddf)')
+        if gpu_id == 0:
+            print('==================')
+            print(epoch,':', epoch_loss_tot,'=',epoch_loss_gen_a,'+', epoch_loss_gen_d,'+',epoch_loss_reg, ' (ang+dist+regul)')
+            print(f'     loss_contrastive: {epoch_loss_contrastive}')
+            print(f'     loss_regist: {epoch_loss_regist/total_reg} = {epoch_loss_imgsim/total_reg} (imgsim) + {epoch_loss_imgmse/total_reg} (imgmse) + {epoch_loss_ddfreg/total_reg} (ddf)')
+            print('==================')
+        if 0 == epoch % epoch_per_save:
+            save_dir=model_save_path + str(epoch).rjust(6, '0') + suffix_pth
+            os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
+            # break   # FOR TESTING
+            if not use_distributed:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+            elif gpu_id == 0:
+                print(f"saved in {save_dir}")
+                # torch.save(Deformddpm.module.state_dict(), save_dir)
+                torch.save({
+                    'model_state_dict': Deformddpm.module.state_dict(),
+                    'optimizer_state_dict': optimizer.state_dict(),
+                    'epoch': epoch
+                }, save_dir)
+    # Resource cleanup at the end of training
+    _empty_cache(DEVICE_TYPE)
+    gc.collect()
+    if use_distributed and dist.is_initialized():
+        dist.destroy_process_group()
+def ddp_load_dict(gpu_id, Deformddpm, optimizer, model_file,use_distributed=True, load_strict=False):
+    if gpu_id == 0:
+    # if 0:
+        utils.print_memory_usage("Before Loading Model")
+        gc.collect()
+        _empty_cache(DEVICE_TYPE)
+        # Deformddpm.network.load_state_dict(torch.load(latest_model_file))
+        # Deformddpm.load_state_dict(torch.load(latest_model_file), strict=False)
+        checkpoint = torch.load(model_file, map_location='cpu')
+        # checkpoint = torch.load(latest_model_file, map_location=f"cuda:{rank}")
+        if use_distributed:
+            Deformddpm.module.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        else:
+            Deformddpm.load_state_dict(checkpoint['model_state_dict'], strict=load_strict)
+        if load_strict:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        utils.print_memory_usage("After Loading Checkpoint on GPU")
+    if use_distributed:
+        # Broadcast model weights from rank 0 to all other GPUs
+        dist.barrier()
+        for param in Deformddpm.parameters():
+            dist.broadcast(param.data, src=0)  # Synchronize model across ranks
+        dist.barrier()
+        for param_group in optimizer.param_groups:
+            for param in param_group['params']:
+                if param.grad is not None:
+                    dist.broadcast(param.grad, src=0)  # Sync optimizer gradients
+    # initial_epoch = checkpoint['epoch'] + 1
+    # get the epoch number from the filename and add 1 to set as initial_epoch
+    initial_epoch = int(os.path.basename(model_file).split('.')[0][:6]) + 1
+    return initial_epoch, Deformddpm, optimizer
+if __name__ == "__main__":
+    if "LOCAL_RANK" in os.environ:
+        # Multi-node: launched by torchrun / srun
+        use_distributed = True
+        local_rank = int(os.environ["LOCAL_RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        print(f"torchrun launch: LOCAL_RANK={local_rank}, RANK={os.environ.get('RANK')}, WORLD_SIZE={world_size}")
+        try:
+            main_train(local_rank, world_size)
+        except Exception as e:
+            import traceback
+            print(f"\n{'='*60}\nRANK {os.environ.get('RANK')} FAILED:\n{'='*60}", flush=True)
+            traceback.print_exc()
+            raise
+    elif use_distributed:
+        # Single-node multi-GPU: use mp.spawn
+        world_size = _device_count(DEVICE_TYPE)
+        print(f"Distributed {DEVICE_TYPE.upper()} device number = {world_size}")
+        mp.spawn(main_train,args = (world_size,),nprocs = world_size)
+    else:
+        main_train(0,1)

OMorpher/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .omorpher import OMorpher
2	+
3	+ __all__ = ['OMorpher']

OMorpher/omorpher.py ADDED Viewed

	@@ -0,0 +1,1058 @@

+"""
+OMorpher — Object-oriented wrapper for OmniMorph diffusion-based deformation.
+Stores original high-res images and composes all intermediate deformations as
+deformation fields (DDFs), resampling only once at the end to avoid blurring.
+Independent of DeformDDPM at runtime; reimplements the diffusion logic using
+the network / STN / loss building blocks from Diffusion.*.
+"""
+import os
+import glob
+import math
+import random
+from typing import Optional, Union, List, Tuple, Dict
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+import yaml
+import SimpleITK as sitk
+from skimage.transform import resize as sk_resize
+from Diffusion.networks import get_net, STN, DefRec_MutAttnNet
+from Diffusion.losses import Grad, MRSE, NCC
+EPS = 1e-8
+class OMorpher:
+    """High-level interface for OmniMorph deformation diffusion.
+    All images are kept at their original resolution internally.  Deformation
+    fields are composed at model resolution and up-scaled on demand so that the
+    original image is resampled at most *once*.
+    """
+    # ------------------------------------------------------------------
+    # Construction
+    # ------------------------------------------------------------------
+    def __init__(
+        self,
+        config: Union[str, dict],
+        checkpoint_path: Optional[str] = None,
+        device: Optional[str] = None,
+        bert_model_path: Optional[str] = None,
+    ):
+        # ---- Config ----
+        if isinstance(config, str):
+            with open(config, "r") as f:
+                config = yaml.safe_load(f)
+        self.config: dict = config
+        self.net_name: str = config.get("net_name", "recmutattnnet")
+        self.ndims: int = config.get("ndims", 3)
+        self.img_size: int = config.get("img_size", 128)
+        self.timesteps: int = config.get("timesteps", 80)
+        self.v_scale: float = config.get("v_scale", 5e-5)
+        self.noise_scale: float = config.get("noise_scale", 0.1)
+        self.condition_type: str = config.get("condition_type", "none")
+        self.num_input_chn: int = config.get("num_input_chn", 1)
+        self.img_pad_mode: str = config.get("img_pad_mode", "zeros")
+        self.ddf_pad_mode: str = config.get("ddf_pad_mode", "border")
+        self.padding_mode: str = config.get("padding_mode", "border")
+        self.resample_mode: str = config.get("resample_mode", "bilinear")
+        self.batch_size: int = config.get("batchsize", 1)
+        self.data_name: str = config.get("data_name", "all")
+        self.clamp_range: list = config.get("clamp_range", [-400, 400])
+        self.inf_mode: bool = config.get("inf_mode", True)
+        # ---- Device ----
+        if device is not None:
+            self.device = torch.device(device)
+        else:
+            self.device = self._resolve_device(config.get("device", None))
+        # ---- BERT (lazy) ----
+        self.bert_model_path = bert_model_path or os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "External", "Models", "bert_large_uncased",
+        )
+        self._bert_model = None
+        self._bert_tokenizer = None
+        # ---- Network ----
+        Net = get_net(self.net_name)
+        self.network = Net(
+            n_steps=self.timesteps,
+            ndims=self.ndims,
+            num_input_chn=self.num_input_chn,
+            res=self.img_size,
+        )
+        self.network.to(self.device)
+        # ---- STN instances ----
+        self.ctl_ratio = 4
+        self.ctl_sz = self.img_size // self.ctl_ratio
+        self.stn_full = STN(
+            img_sz=self.img_size,
+            ndims=self.ndims,
+            padding_mode=self.padding_mode,
+            device=self.device,
+        )
+        self.stn_ctl = STN(
+            img_sz=self.ctl_sz,
+            ndims=self.ndims,
+            padding_mode=self.ddf_pad_mode,
+            device=self.device,
+        )
+        self.img_stn = STN(
+            img_sz=self.img_size,
+            ndims=self.ndims,
+            padding_mode=self.img_pad_mode,
+            device=self.device,
+            resample_mode=self.resample_mode if self.resample_mode != "bilinear" else None,
+        )
+        self.msk_stn = STN(
+            img_sz=self.img_size,
+            ndims=self.ndims,
+            padding_mode=self.img_pad_mode,
+            device=self.device,
+            resample_mode="nearest",
+        )
+        # ---- Loss functions (for fine-tuning) ----
+        self._loss_grad = Grad(penalty=["l1"], ndims=self.ndims)
+        self._loss_dist = MRSE(img_sz=self.img_size)
+        self._loss_ang = NCC(img_sz=self.img_size)
+        # ---- Load checkpoint ----
+        if checkpoint_path is not None:
+            self._load_checkpoint(checkpoint_path)
+        else:
+            auto_path = self._auto_find_checkpoint()
+            if auto_path is not None:
+                self._load_checkpoint(auto_path)
+        self.network.eval()
+        # ---- State ----
+        self._init_img: Optional[torch.Tensor] = None          # [B,1,S,S,S] model-res
+        self._init_img_raw: Optional[torch.Tensor] = None       # [B,1,D,H,W] full-res
+        self._init_img_original_shape: Optional[tuple] = None
+        self._init_ddf: Optional[torch.Tensor] = None           # [B,ndims,S,S,S]
+        self._cond_img: Optional[torch.Tensor] = None           # [B,1,S,S,S]
+        self._cond_txt: Optional[torch.Tensor] = None           # [B,1024]
+        self._predicted_ddf: Optional[torch.Tensor] = None      # [B,ndims,S,S,S]
+        self._intermediate_ddfs: List[Tuple[int, torch.Tensor]] = []
+        # ---- Fine-tuning state ----
+        self._optimizer: Optional[torch.optim.Optimizer] = None
+    # ------------------------------------------------------------------
+    # Device resolution
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _resolve_device(hint: Optional[str] = None) -> torch.device:
+        if hint is not None:
+            s = str(hint).lower()
+            if s not in ("auto", ""):
+                return torch.device(s)
+        # XPU → CUDA → CPU
+        try:
+            import intel_extension_for_pytorch  # noqa: F401
+            if torch.xpu.is_available():
+                return torch.device("xpu")
+        except (ImportError, AttributeError):
+            pass
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        return torch.device("cpu")
+    # ------------------------------------------------------------------
+    # Checkpoint helpers
+    # ------------------------------------------------------------------
+    def _auto_find_checkpoint(self) -> Optional[str]:
+        pattern = os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "Models",
+            f"{self.data_name}_{self.net_name}",
+            "*.pth",
+        )
+        files = sorted(glob.glob(pattern))
+        return files[-1] if files else None
+    def _load_checkpoint(self, path: str):
+        ckpt = torch.load(path, map_location="cpu")
+        state_dict = ckpt.get("model_state_dict", ckpt)
+        # Strip DDP 'module.' prefix and DeformDDPM wrapper keys
+        cleaned = {}
+        for k, v in state_dict.items():
+            k = k.replace("module.", "")
+            if k.startswith("network."):
+                k = k[len("network."):]
+            cleaned[k] = v
+        # Only load keys that exist in the network
+        net_keys = set(self.network.state_dict().keys())
+        filtered = {k: v for k, v in cleaned.items() if k in net_keys}
+        if filtered:
+            self.network.load_state_dict(filtered, strict=False)
+    # ------------------------------------------------------------------
+    # Public — Input setters
+    # ------------------------------------------------------------------
+    def set_init_img(
+        self,
+        img,
+        modality: Optional[str] = None,
+    ) -> "OMorpher":
+        """Set the initial image. Accepts numpy, torch, path, or (img, ddf) tuple."""
+        init_ddf = None
+        if isinstance(img, (tuple, list)):
+            img, init_ddf = img[0], img[1]
+        model_tensor, fullres_tensor, orig_shape = self._standardize_img(
+            img, modality=modality, keep_raw=True,
+        )
+        self._init_img = model_tensor
+        self._init_img_raw = fullres_tensor
+        self._init_img_original_shape = orig_shape
+        if init_ddf is not None:
+            self._init_ddf = self._to_ddf_tensor(init_ddf)
+        else:
+            B = self._init_img.shape[0]
+            S = self.img_size
+            self._init_ddf = torch.zeros(
+                [B, self.ndims] + [S] * self.ndims,
+                dtype=torch.float32, device=self.device,
+            )
+        return self
+    def set_cond_img(
+        self,
+        img=None,
+        modality: Optional[str] = None,
+    ) -> "OMorpher":
+        """Set the conditioning image. Default: Gaussian noise sigma=0.1."""
+        if img is None:
+            B = self._init_img.shape[0] if self._init_img is not None else self.batch_size
+            S = self.img_size
+            self._cond_img = torch.randn(
+                [B, 1] + [S] * self.ndims,
+                dtype=torch.float32, device=self.device,
+            ) * 0.1
+        else:
+            tensor, _, _ = self._standardize_img(img, modality=modality, keep_raw=False)
+            self._cond_img = tensor
+        return self
+    def set_cond_txt(self, txt=None) -> "OMorpher":
+        """Set the text conditioning. Accepts string, numpy [1024], torch [1024], or None."""
+        self._cond_txt = self._standardize_txt(txt)
+        return self
+    def set_init_def(self, ddf=None) -> "OMorpher":
+        """Set or regenerate the initial deformation field.
+        If *ddf* is ``None``, a random DDF is generated using the forward
+        diffusion parameters (useful for data augmentation).
+        """
+        if ddf is None:
+            if self._init_img is None:
+                raise RuntimeError("set_init_img() must be called before set_init_def()")
+            t_val = self.config.get("start_noise_step", self.timesteps // 2)
+            t = torch.tensor([t_val], dtype=torch.long, device=self.device)
+            _, _, random_ddf = self._get_random_ddf(self._init_img, t)
+            self._init_ddf = random_ddf
+        else:
+            self._init_ddf = self._to_ddf_tensor(ddf)
+        return self
+    # ------------------------------------------------------------------
+    # Public — Core operations (inference)
+    # ------------------------------------------------------------------
+    def predict(
+        self,
+        T: Optional[list] = None,
+        proc_type: Optional[str] = None,
+        t_save: Optional[list] = None,
+    ) -> "OMorpher":
+        """Run reverse diffusion and store predicted DDF. Returns ``self`` for chaining."""
+        if self._init_img is None:
+            raise RuntimeError("set_init_img() must be called before predict()")
+        # Defaults
+        start_noise = self.config.get("start_noise_step", 0)
+        if T is None:
+            T = [start_noise, self.timesteps]
+        if proc_type is None:
+            proc_type = self.condition_type
+        B = self._init_img.shape[0]
+        S = self.img_size
+        # Conditioning
+        cond_img_src = self._cond_img if self._cond_img is not None else self._init_img.clone().detach()
+        cond_img, mask, cond_ratio = self._proc_cond_img(cond_img_src, proc_type=proc_type)
+        # Text embedding
+        txt = self._cond_txt
+        if txt is None:
+            txt = torch.zeros([B, 1024], dtype=torch.float32, device=self.device)
+        # Reshape text for network consumption
+        if isinstance(self.network, DefRec_MutAttnNet):
+            txt = txt.view(B, -1, *([1] * self.ndims))
+        # Initial state
+        init_ddf_is_zero = (self._init_ddf is None) or torch.all(self._init_ddf == 0)
+        if not init_ddf_is_zero:
+            ddf_comp = self._init_ddf.clone()
+            img_rec = self.img_stn(self._init_img, ddf_comp)
+        elif T[0] is not None and T[0] > 0:
+            t_start = torch.tensor(np.array([T[0]]), device=self.device)
+            img_rec, _, ddf_comp = self._get_random_ddf(self._init_img, t_start)
+        else:
+            img_rec = self._init_img.clone()
+            ddf_comp = torch.zeros(
+                [B, self.ndims] + [S] * self.ndims,
+                dtype=torch.float32, device=self.device,
+            )
+        # Reverse diffusion loop
+        self._intermediate_ddfs = []
+        rec_num = 2  # matches DeformDDPM.rec_num default
+        if isinstance(self.network, DefRec_MutAttnNet):
+            # DefRec network: pass full time list at once
+            t_list = list(range(T[1] - 1, -1, -1))
+            with torch.no_grad():
+                pre_dvf = self.network(
+                    x=img_rec, y=cond_img, t=t_list, rec_num=rec_num, text=txt,
+                )
+            ddf_comp = self.stn_full(ddf_comp, pre_dvf) + pre_dvf
+            img_rec = self.img_stn(self._init_img.clone().detach(), ddf_comp)
+            if t_save:
+                self._intermediate_ddfs.append((0, ddf_comp.clone()))
+        else:
+            # Standard iterative recovery
+            time_steps = range(T[1] - 1, -1, -1)
+            for i in time_steps:
+                t = torch.tensor(np.array([i]), device=self.device)
+                with torch.no_grad():
+                    pre_dvf = self.network(
+                        x=img_rec, y=cond_img, t=t, rec_num=rec_num, text=txt,
+                    )
+                ddf_comp = self.stn_full(ddf_comp, pre_dvf) + pre_dvf
+                img_rec = self.img_stn(self._init_img.clone().detach(), ddf_comp)
+                if t_save is not None and i in t_save:
+                    self._intermediate_ddfs.append((i, ddf_comp.clone()))
+        self._predicted_ddf = ddf_comp
+        return self
+    def get_def(
+        self,
+        t_list: Optional[list] = None,
+    ) -> Union[torch.Tensor, Dict[int, torch.Tensor]]:
+        """Return the final predicted DDF, or intermediate DDFs for given timesteps."""
+        if t_list is None:
+            if self._predicted_ddf is None:
+                raise RuntimeError("predict() must be called before get_def()")
+            return self._predicted_ddf
+        out = {}
+        for t, ddf in self._intermediate_ddfs:
+            if t in t_list:
+                out[t] = ddf
+        return out
+    def apply_def(
+        self,
+        img=None,
+        ddf: Optional[torch.Tensor] = None,
+        padding_mode: Optional[str] = None,
+        resample_mode: Optional[str] = None,
+    ) -> torch.Tensor:
+        """Apply a DDF to an image. Auto-upscales DDF when sizes differ.
+        Defaults: init image at full resolution, predicted DDF.
+        """
+        if padding_mode is None:
+            padding_mode = self.padding_mode
+        if resample_mode is None:
+            resample_mode = "bilinear"
+        # Default DDF
+        if ddf is None:
+            if self._predicted_ddf is None:
+                raise RuntimeError("predict() must be called before apply_def()")
+            ddf = self._predicted_ddf
+        # Default image: full-res init image tensor
+        if img is None:
+            if self._init_img_raw is not None:
+                vol_tensor = self._init_img_raw
+            else:
+                vol_tensor = self._init_img
+        else:
+            vol_tensor = self._ensure_tensor(img)
+        # Upscale DDF if sizes differ
+        target_sz = list(vol_tensor.shape[2:])
+        ddf_sz = list(ddf.shape[2:])
+        if target_sz != ddf_sz:
+            ddf = F.interpolate(
+                ddf, size=target_sz,
+                mode="bilinear" if self.ndims == 2 else "trilinear",
+                align_corners=False,
+            )
+        return self._apply_ddf(vol_tensor, ddf, padding_mode=padding_mode, resample_mode=resample_mode)
+    # ------------------------------------------------------------------
+    # Public — Fine-tuning
+    # ------------------------------------------------------------------
+    def finetune_setup(
+        self,
+        lr: float = 1e-4,
+        optimizer_cls=None,
+    ) -> "OMorpher":
+        """Switch to training mode and create an optimizer."""
+        self.network.train()
+        self.inf_mode = False
+        if optimizer_cls is None:
+            optimizer_cls = torch.optim.Adam
+        self._optimizer = optimizer_cls(self.network.parameters(), lr=lr)
+        return self
+    def finetune_step(
+        self,
+        img_batch,
+        cond_batch=None,
+        text_batch=None,
+        t=None,
+        proc_type=None,
+    ) -> dict:
+        """Run one optimization step and return the losses as Python floats.
+        The image batch is randomly deformed at timestep ``t``, the network
+        predicts a deformation field from the deformed image and the processed
+        conditioning image, and three losses (angle, distance, gradient) are
+        combined and back-propagated.
+        Args:
+            img_batch: input accepted by ``_standardize_img``.
+            cond_batch: optional conditioning input; a clone of the
+                standardized image is used when omitted.
+            text_batch: optional input accepted by ``_standardize_txt``.
+            t: optional timestep(s); sampled uniformly from
+                ``[0, self.timesteps)`` per batch element when omitted.
+            proc_type: conditioning strategy; falls back to
+                ``self.condition_type`` when falsy.
+        Returns:
+            dict with keys ``loss_total``, ``loss_grad``, ``loss_dist``,
+            ``loss_ang``.
+        Raises:
+            RuntimeError: if no optimizer has been created yet.
+        """
+        if self._optimizer is None:
+            raise RuntimeError("finetune_setup() must be called first")
+        img, _, _ = self._standardize_img(img_batch, keep_raw=False)
+        cond = self._standardize_img(cond_batch, keep_raw=False)[0] if cond_batch is not None else img.clone()
+        text = self._standardize_txt(text_batch)
+        B = img.shape[0]
+        if t is None:
+            t = torch.randint(0, self.timesteps, (B,), device=self.device)
+        else:
+            t = torch.tensor(t, device=self.device) if not isinstance(t, torch.Tensor) else t.to(self.device)
+        proc_type = proc_type or self.condition_type
+        # cond_ratio is returned by the conditioning step but not used below.
+        cond_img, mask, cond_ratio = self._proc_cond_img(cond, proc_type=proc_type)
+        # Forward diffusion: deformed image plus the DVF used as regression target.
+        noisy_img, dvf_gt, _ = self._get_random_ddf(img, t)
+        # Reshape text for network
+        if isinstance(self.network, DefRec_MutAttnNet):
+            # This network type receives text as [B, -1, 1, ...] and t wrapped in a list.
+            if text is not None:
+                text = text.view(B, -1, *([1] * self.ndims))
+            t_input = [t]
+        else:
+            t_input = t
+        pre_dvf = self.network(x=noisy_img * mask, y=cond_img, t=t_input, rec_num=2, text=text)
+        loss_grad = self._loss_grad(y_pred=pre_dvf, img=img)
+        # The prediction is resampled by the ground-truth DVF before being compared to it.
+        trm_pred = self.stn_full(pre_dvf, dvf_gt)
+        loss_dist = self._loss_dist(pred=trm_pred, inv_lab=dvf_gt)
+        loss_ang = self._loss_ang(pred=trm_pred, inv_lab=dvf_gt)
+        # Fixed loss weighting: 2 x angle + 1 x distance + 16 x gradient.
+        loss_total = 2.0 * loss_ang + 1.0 * loss_dist + 16.0 * loss_grad
+        self._optimizer.zero_grad()
+        loss_total.backward()
+        self._optimizer.step()
+        return {
+            "loss_total": loss_total.item(),
+            "loss_grad": loss_grad.item(),
+            "loss_dist": loss_dist.item(),
+            "loss_ang": loss_ang.item(),
+        }
+    def finetune_save(self, path: str, epoch: int = 0):
+        """Save checkpoint in the standard OmniMorph format."""
+        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+        torch.save(
+            {
+                "model_state_dict": self.network.state_dict(),
+                "optimizer_state_dict": self._optimizer.state_dict() if self._optimizer else None,
+                "epoch": epoch,
+            },
+            path,
+        )
+    def finetune_teardown(self) -> "OMorpher":
+        """Switch back to eval mode."""
+        self.network.eval()
+        self.inf_mode = True
+        self._optimizer = None
+        return self
+    # ------------------------------------------------------------------
+    # Private — Diffusion logic
+    # ------------------------------------------------------------------
+    def _get_ddf_scale(
+        self, t: torch.Tensor, divide_num: int = 1, max_ddf_num: int = 200,
+    ) -> Tuple[int, torch.Tensor, torch.Tensor]:
+        """Timestep-dependent deformation magnitude. Mirrors DeformDDPM._get_ddf_scale()."""
+        rec_num = 1
+        mul_num_ddf = torch.floor_divide(2 * torch.pow(t.float(), 1.3), 3 * divide_num).int()
+        mul_num_dvf = torch.floor_divide(torch.pow(t.float(), 0.6), divide_num).int()
+        mul_num_ddf = torch.clamp(mul_num_ddf, min=1, max=max_ddf_num)
+        mul_num_dvf = torch.clamp(mul_num_dvf, min=0, max=max_ddf_num)
+        return rec_num, mul_num_ddf, mul_num_dvf
+    def _sample_random_uniform_multi_order(
+        self, high=None, low=0.0, order_num=3,
+    ) -> float:
+        sample_value = low
+        for _ in range(order_num):
+            sample_value = np.random.uniform(low=sample_value, high=high)
+        return sample_value
+    def _multiscale_dvf_generate(
+        self, v_scale: float, ctl_szs: list = None, rand_v_scale: bool = True,
+    ) -> torch.Tensor:
+        """Multi-scale Gaussian DVF at control-point sizes."""
+        if ctl_szs is None:
+            ctl_szs = [4, 8, 16, 32, 64]
+        dvf = 0
+        for ctl_sz in ctl_szs:
+            _v = (
+                self._sample_random_uniform_multi_order(high=v_scale, low=1e-8, order_num=2)
+                if rand_v_scale
+                else v_scale
+            )
+            if ctl_sz <= 2:
+                _v = _v / 2
+            dvf_comp = torch.randn(
+                [self.batch_size, self.ndims] + [ctl_sz] * self.ndims
+            ) * _v
+            dvf_comp = F.interpolate(
+                dvf_comp * self.ctl_sz / ctl_sz,
+                [self.ctl_sz] * self.ndims,
+                align_corners=False,
+                mode="bilinear" if self.ndims == 2 else "trilinear",
+            )
+            dvf = dvf + dvf_comp
+        return dvf
+    def _random_ddf_generate(
+        self,
+        rec_num: int = 3,
+        mul_num: list = None,
+        noise_ratio: float = 0.08,
+        select_num: int = 4,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compose DVFs to build a DDF. Mirrors DeformDDPM._random_ddf_generate().
+        Args:
+            rec_num: number of outer rounds; each round draws a fresh
+                multi-scale DVF.
+            mul_num: two tensors giving, per batch element, how many times the
+                round's DVF is composed into the first and the second output.
+                Defaults to 5 for both.
+            noise_ratio: relative amplitude of the extra perturbation added to
+                the DVF used for the first output; 0 disables it.
+            select_num: maximum number of control-grid scales sampled per round.
+        Returns:
+            ``(ddf, dddf)``, both upsampled to twice ``self.img_size`` and
+            center-cropped back to ``self.img_size``.  ``ddf`` is composed from
+            the perturbed DVF, ``dddf`` from the unperturbed one.
+        """
+        if mul_num is None:
+            mul_num = [torch.tensor([5]), torch.tensor([5])]
+        crop_rate = 2
+        # Append ndims + 1 trailing singleton dims so the counts broadcast
+        # against fields of shape [B, ndims, *spatial].
+        for _ in range(self.ndims + 1):
+            mul_num = [torch.unsqueeze(n, -1) for n in mul_num]
+        ctl_ddf_sz = [self.batch_size, self.ndims] + [self.ctl_sz] * self.ndims
+        # NOTE(review): these accumulators are created on the default device
+        # while the DVFs below are moved to self.device — confirm stn_ctl
+        # tolerates that on GPU.
+        ddf = torch.zeros(ctl_ddf_sz)
+        dddf = torch.zeros(ctl_ddf_sz)
+        # Candidate control-grid sizes: ctl_sz, ctl_sz/2, ... (at most 8 levels).
+        scale_num = min(8, int(math.log2(self.ctl_sz)))
+        ctl_szs_all = [self.ctl_sz // (2 ** i) for i in range(scale_num)]
+        for _i in range(rec_num):
+            if len(ctl_szs_all) > select_num:
+                ctl_szs = random.sample(ctl_szs_all, select_num)
+            else:
+                ctl_szs = ctl_szs_all
+            dvf = self._multiscale_dvf_generate(self.v_scale, ctl_szs=ctl_szs).to(self.device)
+            if noise_ratio == 0:
+                dvf0 = dvf
+            else:
+                # Perturbed copy: a fixed-amplitude noise field passed through
+                # stn_ctl with dvf, then added to dvf.
+                dvf0 = dvf + self.stn_ctl(
+                    self._multiscale_dvf_generate(
+                        self.v_scale * noise_ratio, ctl_szs=ctl_szs, rand_v_scale=False,
+                    ).to(self.device),
+                    dvf,
+                )
+            # The iteration count follows the largest entry of mul_num[0] only;
+            # the per-sample flags zero out the update once j reaches that
+            # sample's own count.
+            for j in range(torch.max(mul_num[0]).item()):
+                flag = [(n > j).int().to(self.device) for n in mul_num]
+                ddf = dvf0 * flag[0] + self.stn_ctl(ddf, dvf0 * flag[0])
+                dddf = dvf * flag[1] + self.stn_ctl(dddf, dvf * flag[1])
+        # Upscale to crop_rate x img_size (values rescaled by img_size / ctl_sz),
+        # then keep the central img_size window.
+        interp_mode = "bilinear" if self.ndims == 2 else "trilinear"
+        ddf = F.interpolate(
+            ddf * self.img_size / self.ctl_sz,
+            self.img_size * crop_rate,
+            mode=interp_mode,
+        )
+        dddf = F.interpolate(
+            dddf * self.img_size / self.ctl_sz,
+            self.img_size * crop_rate,
+            mode=interp_mode,
+        )
+        half = self.img_size // 2
+        three_half = self.img_size * 3 // 2
+        if self.ndims == 2:
+            ddf = ddf[..., half:three_half, half:three_half]
+            dddf = dddf[..., half:three_half, half:three_half]
+        else:
+            ddf = ddf[..., half:three_half, half:three_half, half:three_half]
+            dddf = dddf[..., half:three_half, half:three_half, half:three_half]
+        return ddf, dddf
+    def _get_random_ddf(
+        self, img: torch.Tensor, t: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Forward-diffuse: generate random DDF and warp image."""
+        rec_num, mul_num_ddf, mul_num_dvf = self._get_ddf_scale(t=t)
+        ddf_forward, dvf_forward = self._random_ddf_generate(
+            rec_num=rec_num, mul_num=[mul_num_ddf, mul_num_dvf],
+        )
+        warped_img = self.img_stn(img, ddf_forward)
+        return warped_img, dvf_forward, ddf_forward
+    # ------------------------------------------------------------------
+    # Private — Conditioning processing
+    # ------------------------------------------------------------------
+    def _proc_cond_img(
+        self,
+        img: torch.Tensor,
+        proc_type: Optional[str] = None,
+        noise_scale: float = 0.1,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Conditioning strategies. Mirrors DeformDDPM.proc_cond_img().
+        Degrades a detached copy of ``img`` according to ``proc_type`` and
+        returns ``(proc_img, mask, cond_ratio)``.
+        Strategies:
+            * ``"none"`` / ``""`` / ``"None"``: image unchanged, mask 1, ratio 1.
+            * ``"uncon"``: image replaced by a noise map, mask 0, ratio 0.
+            * ``"adding"``: blend with noise using a random weight; ratio is
+              the remaining image weight.
+            * ``"independ"``: multiply by a random binary mask; ratio is the
+              mask mean.
+            * ``"downsample"``: random per-axis downsampling followed by a
+              resize to ``self.img_size``; ratio is sqrt of the product of the
+              per-axis factors.
+            * ``"slice"``: keep only slices selected by ``_get_slice_mask``.
+            * ``"project"``: mean projections along randomly chosen axes.
+        When ``proc_type`` is ``None`` a strategy is drawn at random from the
+        first six names above ("project" is never drawn), with "uncon"
+        weighted three times as heavily as each of the others.
+        """
+        proc_img = img.clone().detach()
+        if proc_type is None:
+            proc_type = random.choices(
+                ["adding", "independ", "downsample", "slice", "none", "uncon"],
+                weights=[1, 1, 1, 1, 1, 3],
+                k=1,
+            )[0]
+        # Defaults for strategies that do not set their own mask / ratio.
+        mask = torch.tensor(1, device=img.device)
+        cond_ratio = torch.tensor(1.0, device=img.device)
+        if proc_type in ["none", None, "", "None"]:
+            return proc_img, mask, cond_ratio
+        # "none" here means an all-zero noise map.
+        noise_type = random.choice(["gaussian", "uniform", "none"])
+        if proc_type == "uncon":
+            noise_map = self._create_noise_map(img, noise_type=noise_type, noise_scale=noise_scale)
+            return noise_map, torch.tensor(0, device=img.device), torch.tensor(0, device=img.device)
+        noise_map = None
+        # The map is generated for three strategies, but only "adding" reads it below.
+        if proc_type in ["adding", "independ", "slice"]:
+            noise_map = self._create_noise_map(img, noise_type=noise_type, noise_scale=noise_scale)
+        if proc_type == "adding":
+            noise_ratio = np.random.uniform(0.0, 1.0)
+            proc_img = proc_img * (1 - noise_ratio) + noise_map * noise_ratio
+            cond_ratio = torch.tensor(1 - noise_ratio, device=img.device)
+        elif proc_type == "independ":
+            mask = self._create_noise_map(img, noise_type="binary")
+            proc_img = img * mask
+            cond_ratio = mask.float().mean()
+        elif proc_type == "downsample":
+            # One independent factor in [1/64, 1) per spatial axis.
+            down_ratio = list(np.random.uniform(1.0 / 64, 1, [self.ndims]))
+            down_img = F.interpolate(
+                proc_img, scale_factor=down_ratio,
+                mode="bilinear" if self.ndims == 2 else "trilinear",
+            )
+            proc_img = F.interpolate(
+                down_img, size=[self.img_size] * self.ndims,
+                mode="bilinear" if self.ndims == 2 else "trilinear",
+                align_corners=False,
+            )
+            cond_ratio = torch.tensor(np.sqrt(np.prod(down_ratio)), device=img.device)
+        elif proc_type == "slice":
+            # Two nested draws pick the upper bound on the slice count.
+            slice_num_max = random.randint(1, 64)
+            slice_num_max = random.randint(1, slice_num_max)
+            mask, sample_ratio = self._get_slice_mask(img, slice_num_range=[0, slice_num_max])
+            proc_img = img * mask
+            cond_ratio = torch.tensor(sample_ratio, device=img.device)
+        elif proc_type == "project":
+            proj_img = torch.zeros_like(img)
+            # One 0/1 flag per spatial axis: project along that axis or not.
+            rand_bourn = np.random.randint(0, 2, size=[self.ndims])
+            proj_dim_num = np.sum(rand_bourn)
+            for i, pflag in zip(range(2, 2 + self.ndims), rand_bourn):
+                if pflag:
+                    proj_img += torch.mean(img, dim=i, keepdim=True)
+            # EPS guards the division when no axis was selected.
+            proc_img = proj_img / (proj_dim_num + EPS)
+            # NOTE(review): 128 is hard-coded here — presumably the training
+            # image size; confirm it should not be self.img_size.
+            cond_ratio = torch.tensor(proj_dim_num / (128 * self.ndims), device=img.device)
+        return proc_img, mask, cond_ratio
+    def _create_noise_map(
+        self,
+        img: torch.Tensor,
+        noise_type: str = "gaussian",
+        noise_scale: float = 0.1,
+    ) -> torch.Tensor:
+        if noise_type == "gaussian":
+            return (torch.randn_like(img) * noise_scale).to(img.device)
+        elif noise_type == "uniform":
+            return (torch.rand_like(img) * noise_scale * 2 - noise_scale).to(img.device)
+        elif noise_type == "binary":
+            return torch.bernoulli(torch.rand_like(img)).to(img.device)
+        return torch.zeros_like(img).to(img.device)
+    def _get_slice_mask(
+        self,
+        img: torch.Tensor,
+        slice_num_range: list = None,
+    ) -> Tuple[torch.Tensor, float]:
+        if slice_num_range is None:
+            slice_num_range = [0, 32]
+        slice_num_range[1] = min(slice_num_range[1], self.img_size)
+        mask = torch.zeros_like(img)
+        sample_ratio = 0.0
+        for i in range(self.ndims):
+            if self.inf_mode:
+                slice_num = 1
+                slice_idx = [self.img_size // 2]
+            else:
+                slice_num = random.randint(slice_num_range[0], slice_num_range[1])
+                slice_idx = random.sample(range(self.img_size), slice_num)
+            transpose_list = [0, 1, 1 + self.ndims] + list(range(2, 1 + self.ndims))
+            for idx in slice_idx:
+                mask[..., idx] = 1
+            mask = mask.permute(*transpose_list)
+            sample_ratio += np.sqrt(slice_num / self.img_size) / self.ndims
+        return mask, sample_ratio
+    # ------------------------------------------------------------------
+    # Private — Standardization
+    # ------------------------------------------------------------------
+    def _standardize_img(
+        self,
+        img,
+        modality: Optional[str] = None,
+        keep_raw: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple]]:
+        """Deterministic inference variant of the dataloader pipeline.
+        Returns ``(model_tensor, fullres_tensor_or_None, orig_shape_or_None)``.
+        * *model_tensor*: ``[B, C, S, S, S]`` at model resolution.
+        * *fullres_tensor*: ``[B, C, D, H, W]`` at original padded resolution
+          (only when *keep_raw=True*).
+        * *orig_shape*: spatial dims of padded volume before resize.
+        Accepts numpy arrays, torch tensors (any dimensionality), or a
+        file path (loaded via SimpleITK).  Torch tensors with >= 4 dims
+        are treated as already-batched and are passed through with
+        appropriate device/dtype conversion.
+        """
+        fullres_tensor = None
+        orig_shape = None
+        # 1. Load from path
+        if isinstance(img, str):
+            sitk_img = sitk.ReadImage(img)
+            vol = sitk.GetArrayFromImage(sitk_img)
+            vol = self._reverse_axis_order(vol)
+        elif isinstance(img, np.ndarray):
+            vol = img.copy()
+        elif isinstance(img, torch.Tensor):
+            # If already a batched tensor [B,C,...], pass through
+            if img.ndim >= 4:
+                t = img.float().to(self.device)
+                if keep_raw:
+                    fullres_tensor = t.clone()
+                return t, fullres_tensor, None
+            # 1-3D tensor — treat as spatial-only numpy
+            vol = img.numpy()
+        else:
+            raise TypeError(f"Unsupported image type: {type(img)}")
+        # 2. Extract 3D from 4D
+        if vol.ndim == 4:
+            vol = vol[:, :, :, 0]
+        # 3. CT clamping
+        if modality is not None and modality.upper() == "CT" and self.clamp_range is not None:
+            vol = np.clip(vol, self.clamp_range[0], self.clamp_range[1])
+        # 4. Normalize [0, 1]
+        vol = vol.astype(np.float64)
+        vol = (vol - np.min(vol)) / (np.ptp(vol) + 1e-7)
+        # 5. Center-pad to cube
+        vol = self._center_pad_to_cube(vol)
+        orig_shape = vol.shape[:3]
+        # 6. Full-res tensor (before resize)
+        if keep_raw:
+            fullres_tensor = torch.tensor(
+                vol[None, None, ...], dtype=torch.float32, device=self.device,
+            )
+        # 7. Resize to model resolution
+        target_sz = [self.img_size] * self.ndims
+        vol_resized = sk_resize(
+            vol, target_sz, anti_aliasing=True, preserve_range=True,
+        )
+        # 8. Add batch + channel dims
+        model_tensor = torch.tensor(
+            vol_resized[None, None, ...], dtype=torch.float32, device=self.device,
+        )
+        return model_tensor, fullres_tensor, orig_shape
+    def _standardize_label(
+        self,
+        label,
+        fill_value: float = -1,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Standardize a label volume for inference.
+        Returns ``(model_tensor, fullres_tensor)``.
+        * *model_tensor*: ``[1, C, S, S, S]`` at model resolution
+          (nearest-neighbor resize, no anti-aliasing).
+        * *fullres_tensor*: ``[1, C, D, H, W]`` at original padded resolution.
+        If *label* is ``None``, returns *fill_value*-filled placeholders
+        shaped to match the current init image (model-res and full-res).
+        Accepts numpy arrays or torch tensors.  Does NOT apply
+        normalization or clamping (labels are discrete indices).
+        """
+        # --- Placeholder for missing labels ---
+        if label is None:
+            model_sz = [self.img_size] * self.ndims
+            model_t = torch.full(
+                [1, 1] + model_sz, fill_value,
+                dtype=torch.float32, device=self.device,
+            )
+            if self._init_img_raw is not None:
+                fullres_sz = list(self._init_img_raw.shape[2:])
+            else:
+                fullres_sz = model_sz
+            fullres_t = torch.full(
+                [1, 1] + fullres_sz, fill_value,
+                dtype=torch.float32, device=self.device,
+            )
+            return model_t, fullres_t
+        # --- Convert to numpy if needed ---
+        if isinstance(label, torch.Tensor):
+            if label.ndim >= 4:
+                # Already batched tensor — pass through
+                fullres_t = label.float().to(self.device)
+                target_sz = [self.img_size] * self.ndims
+                model_t = F.interpolate(
+                    fullres_t, size=target_sz, mode="nearest",
+                )
+                return model_t, fullres_t
+            lab = label.numpy()
+        elif isinstance(label, np.ndarray):
+            lab = label.copy()
+        else:
+            raise TypeError(f"Unsupported label type: {type(label)}")
+        # --- Center-pad to cube ---
+        lab = self._center_pad_to_cube(lab)
+        # --- Channel dim: 3D→[C=1,...], 4D→channels-first [C,...] ---
+        if lab.ndim == 3:
+            lab = lab[None, :, :, :]       # [1, D, H, W]
+        elif lab.ndim > 3:
+            lab = np.transpose(lab, (3, 0, 1, 2))  # [C, D, H, W]
+        # --- Full-res tensor ---
+        fullres_t = torch.tensor(
+            lab[None, ...], dtype=torch.float32, device=self.device,
+        )  # [1, C, D, H, W]
+        # --- Resize to model resolution (nearest-neighbor) ---
+        target_sz = [self.img_size] * self.ndims
+        # Resize each channel separately to avoid resizing the channel dim
+        channels = []
+        for c in range(lab.shape[0]):
+            ch = sk_resize(
+                lab[c], target_sz,
+                anti_aliasing=False, preserve_range=True, order=0,
+            )
+            channels.append(ch)
+        lab_model = np.stack(channels, axis=0)  # [C, S, S, S]
+        model_t = torch.tensor(
+            lab_model[None, ...], dtype=torch.float32, device=self.device,
+        )  # [1, C, S, S, S]
+        return model_t, fullres_t
+    def _standardize_txt(self, txt) -> Optional[torch.Tensor]:
+        """Convert text input to [B, 1024] tensor."""
+        if txt is None:
+            return None
+        if isinstance(txt, str):
+            self._ensure_bert()
+            from Dataloader.bert_helper import str2emb
+            emb = str2emb(
+                txt, max_words_num=100,
+                embeder=self._bert_model, tokenizer=self._bert_tokenizer,
+                reduce_method="mean",
+            )
+            return emb.to(self.device)  # [1, 1024]
+        if isinstance(txt, np.ndarray):
+            t = torch.tensor(txt, dtype=torch.float32, device=self.device)
+            if t.ndim == 1:
+                t = t.unsqueeze(0)
+            return t
+        if isinstance(txt, torch.Tensor):
+            t = txt.float().to(self.device)
+            if t.ndim == 1:
+                t = t.unsqueeze(0)
+            return t
+        raise TypeError(f"Unsupported text type: {type(txt)}")
+    def _ensure_bert(self):
+        if self._bert_model is None:
+            from Dataloader.bert_helper import get_frozen_embeder
+            self._bert_model, self._bert_tokenizer = get_frozen_embeder(self.bert_model_path)
+    # ------------------------------------------------------------------
+    # Private — Spatial utilities
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _reverse_axis_order(arr: np.ndarray) -> np.ndarray:
+        """SimpleITK → NumPy axis order."""
+        return np.ascontiguousarray(arr.transpose(tuple(range(arr.ndim)[::-1])))
+    @staticmethod
+    def _center_pad_to_cube(volume: np.ndarray) -> np.ndarray:
+        """Pad volume to a cube using the max dimension, with symmetric padding."""
+        max_dim = max(volume.shape[:3])
+        pad_width = []
+        for s in volume.shape[:3]:
+            total_pad = max_dim - s
+            pad_before = total_pad // 2
+            pad_after = total_pad - pad_before
+            pad_width.append((pad_before, pad_after))
+        for _ in range(volume.ndim - 3):
+            pad_width.append((0, 0))
+        return np.pad(volume, pad_width, mode="constant", constant_values=0)
+    def _apply_ddf(
+        self,
+        volume_tensor: torch.Tensor,
+        ddf: torch.Tensor,
+        padding_mode: str = "border",
+        resample_mode: str = "bilinear",
+    ) -> torch.Tensor:
+        """Apply DDF to volume tensor at any resolution via grid_sample."""
+        device = ddf.device
+        ndims = self.ndims
+        img_sz = list(volume_tensor.shape[2:])
+        max_sz = torch.reshape(
+            torch.tensor(img_sz, dtype=torch.float32, device=device),
+            [1, ndims] + [1] * ndims,
+        )
+        ref_grid = torch.reshape(
+            torch.stack(
+                torch.meshgrid(
+                    [torch.arange(s, device=device, dtype=torch.float32) for s in img_sz],
+                    indexing="ij",
+                ),
+                0,
+            ),
+            [1, ndims] + img_sz,
+        )
+        img_shape = torch.reshape(
+            torch.tensor(
+                [(s - 1) / 2.0 for s in img_sz], dtype=torch.float32, device=device,
+            ),
+            [1] + [1] * ndims + [ndims],
+        )
+        grid = torch.flip(
+            (ddf * max_sz + ref_grid).permute(
+                [0] + list(range(2, 2 + ndims)) + [1]
+            )
+            / img_shape
+            - 1,
+            dims=[-1],
+        )
+        return F.grid_sample(
+            volume_tensor.to(device),
+            grid.float(),
+            mode=resample_mode,
+            padding_mode=padding_mode,
+            align_corners=True,
+        )
+    def _ensure_tensor(self, img) -> torch.Tensor:
+        """Convert numpy/torch input to a [B, C, ...] float tensor on device."""
+        if isinstance(img, np.ndarray):
+            t = torch.tensor(img, dtype=torch.float32, device=self.device)
+        elif isinstance(img, torch.Tensor):
+            t = img.float().to(self.device)
+        else:
+            raise TypeError(f"Unsupported image type: {type(img)}")
+        if t.ndim == self.ndims:          # spatial only → [B=1, C=1, ...]
+            t = t[None, None, ...]
+        elif t.ndim == self.ndims + 1:    # [C, ...] → [B=1, C, ...]
+            t = t[None, ...]
+        return t
+    def _to_ddf_tensor(self, ddf) -> torch.Tensor:
+        """Convert ddf input to proper tensor on device."""
+        if isinstance(ddf, np.ndarray):
+            ddf = torch.tensor(ddf, dtype=torch.float32)
+        ddf = ddf.float().to(self.device)
+        if ddf.ndim == self.ndims + 1:
+            ddf = ddf.unsqueeze(0)
+        # Resize to model resolution if needed
+        model_sz = [self.img_size] * self.ndims
+        if list(ddf.shape[2:]) != model_sz:
+            ddf = F.interpolate(
+                ddf, size=model_sz,
+                mode="bilinear" if self.ndims == 2 else "trilinear",
+                align_corners=False,
+            )
+        return ddf
+    # ------------------------------------------------------------------
+    # Convenience / repr
+    # ------------------------------------------------------------------
+    def __repr__(self) -> str:
+        status_parts = []
+        if self._init_img is not None:
+            status_parts.append(f"init_img={list(self._init_img.shape)}")
+        if self._cond_img is not None:
+            status_parts.append(f"cond_img={list(self._cond_img.shape)}")
+        if self._predicted_ddf is not None:
+            status_parts.append(f"predicted_ddf={list(self._predicted_ddf.shape)}")
+        status = ", ".join(status_parts) if status_parts else "empty"
+        return (
+            f"OMorpher(net={self.net_name}, ndims={self.ndims}, "
+            f"img_size={self.img_size}, device={self.device}, {status})"
+        )

README.md CHANGED Viewed

@@ -1,80 +1,129 @@
-# OmniMorph: Deform All-in-One Framework for Medical Image Generation, Restoration and Registration based on conditional Deformation-Recovery Diffusion Model
-## Links
-- **Google Drive**: [Dataset & Resources](https://drive.google.com/drive/folders/1N72SeYKwnaMmFq9_NqqEXxZ1jUcw2SwG?usp=drive_link)
-- **Notion**: [Dataset Documentation](https://www.notion.so/Dataset-2bc2300266fe48dfafef580dacf16d50?pvs=4)
-- **Overleaf**: [Paper Draft](https://www.overleaf.com/4489753418kstfhwsxgtkw#a0dbad)
-- **Discord**: [Channel Invite](https://discord.gg/6HrD29T2)
-- **GitHub Repository**: `/home/data/Github/OmniMorph`
-## Environments
-### Data Processing
-- Library: **SimpleITK**
-- Environment:
-```bash
-conda activate torch
-conda deactivate
-```
-### Diffusion Model / DataEngineer (with BERT)
-> Note: 暂不更新，等 MIA 审稿
-```bash
-source /home/data/jzheng/Adaptive_Motion_Generator-master/pipenv/bin/activate
-deactivate
-```
-Or:
-```bash
-source /home/data/Github/OmniMorph/ominenv/bin/activate
-```
-### nnUNet
-```bash
-source ~/PycharmProjects/pythonProject/venv/bin/activate
-```
-### Masking CUDA
-```bash
-CUDA_VISIBLE_DEVICES=0,1,3 python ...
-```
-## Rental Server (租赁服务器)
-```bash
-ssh -p 49419 root@i-2.gpushare.com
-# Password: aFwd98tamsHPtDDhWzUqvXfTagUqfNg8
-```
-SSH Config:
-```
-Host gpushare
-    HostName i-2.gpushare.com
-    User root
-    Port 49419
-```
-Conda environments on server:
-```bash
-conda activate OM
-conda activate unigrad
-```
-Data path: `/hy-tmp`
-## Data Paths
-| Item | Path |
-|------|------|
-| Dataset | `/home/data/Github/data/data_gen_def/DATASETS` |
-| Processed Data | `/home/data/Github/data/data_gen_def/DATASETS_processed` |
-| Data Processing Template | `/home/data/jzheng/Data_Engineering/dataclean_TotSeg.py` |
-## Related Documentation
-1. **DataEngineer**:
-   - `/home/data/jzheng/Data_Engineering/README.md`
-   - `/home/data/jzheng/data_process`
-2. **OmniMorph**: `/home/data/Github/OmniMorph/README.md`

+---
+license: mit
+tags:
+  - medical-imaging
+  - registration
+  - diffusion
+  - 3d
+  - image-generation
+  - image-restoration
+  - pytorch
+library_name: pytorch
+---
+# OmniMorph
+**Deform All-in-One Framework for Medical Image Generation, Restoration and Registration based on a conditional Deformation-Recovery Diffusion Model (DeformDDPM).**
+OmniMorph is a unified framework for 2D/3D multi-modal medical imaging (CT, MRI, PET) supporting:
+- **Generation** — text-conditioned image synthesis via BERT embeddings.
+- **Restoration** — recover anatomically plausible images from degraded inputs.
+- **Registration** — paired / unpaired / flexible-resolution registration via diffused deformation vector fields.
+## Repository Contents
+| Path | Description |
+|---|---|
+| `OM_train*.py` | Training entrypoints (single-/2-/3-mode variants, CUDA + Intel XPU) |
+| `OM_aug*.py`, `OM_reg*.py`, `OM_contrastive*.py` | Inference / augmentation / registration / contrastive scripts |
+| `Diffusion/` | DeformDDPM core: `diffuser.py`, networks, losses, spatial utils |
+| `OMorpher/` | Higher-level model wrapper |
+| `Dataloader/` | Multi-modality dataloaders + dataset mappings (16 datasets) |
+| `Config/` | YAML training/inference configs |
+| `Scripts/` | Auxiliary scripts (registration, evaluation) |
+| `tests/` | Pytest suite for `OMorpher` and loss functions |
+| `bash_*.sh`, `*.slurm` | SLURM submission scripts (CUDA + Intel XPU/Dawn) |
+| `Models/all_om_net/000110_all_om_net.pth` | Trained checkpoint (epoch 110, multi-modal `recmulmodmutattnnet`) |
+> **Note** Only the final checkpoint (epoch 110) is shipped here. Earlier epochs and the `bert_large_uncased` weights are not bundled — download `bert-large-uncased` from the official Hugging Face repo if you need the contrastive text encoder.
+## Setup
+```bash
+git clone https://huggingface.co/DRDMsig/Omini3D
+cd Omini3D
+pip install -r requirements.txt
+```
+For Intel XPU / Dawn cluster, install the matching `intel-extension-for-pytorch` build before installing the rest of the requirements.
+## Quick Start
+### Training
+```bash
+# Single-mode diffusion
+CUDA_VISIBLE_DEVICES=0 python OM_train.py -C Config/config_om.yaml
+# Dual mode (diffusion + registration)
+CUDA_VISIBLE_DEVICES=0,1 python OM_train_2modes.py -C Config/config_om.yaml
+# Triple mode (diffusion + contrastive + registration)
+CUDA_VISIBLE_DEVICES=0,1 python OM_train_3modes.py -C Config/config_om.yaml
+# Intel XPU (single node)
+sbatch bash_train_single_node.sh
+```
+### Inference
+```bash
+# Augmentation / restoration with a trained model
+python OM_aug.py -C Config/config_om.yaml
+# Paired registration
+python OM_reg.py -C Config/config_om.yaml
+# Flexible-resolution registration
+python OM_reg_flexres.py -C Config/config_om.yaml
+```
+### Loading the checkpoint
+```python
+import torch
+from Diffusion.networks import get_net
+# Production network (multi-modal recmulmodmutattnnet)
+net = get_net("recmulmodmutattnnet")
+state = torch.load("Models/all_om_net/000110_all_om_net.pth", map_location="cpu")
+net.load_state_dict(state["model"] if "model" in state else state)
+net.eval()
+```
+## Architecture
+```
+Config YAML → DataLoader(s) → DeformDDPM(Network, STN) → Loss → Checkpoint
+```
+- **`DeformDDPM`** (`Diffusion/diffuser.py`) — forward/reverse diffusion over deformation vector fields (DVFs); multi-scale DDFs at control-point ratios `[4, 8, 16, 32, 64]`.
+- **Networks** (`Diffusion/networks.py`) — selectable via `get_net(name)`:
+  - `recmulmodmutattnnet` — current production multi-modal multi-head-attention net (used by `000110_all_om_net.pth`)
+  - `recmutattnnet`, `recmutattnnet_contrastive`, `recresacnet`, `defrecmutattnnet`
+- **`STN`** — Spatial Transformer for differentiable warping; composes deformations as `comp_ddf = dvf + stn(ddf, dvf)`.
+- **Losses** (`Diffusion/losses.py`, `losses_ncc0.py`) — `Grad`, `LNCC`, `LMSE`, `NCC`, `MRSE`, `RMSE`.
+## Datasets Supported
+`Dataloader/nifty_mappings/` contains pre-computed mappings for 16 public medical-imaging datasets, including:
+AbdomenAtlas, AbdomenCT-1k, BraTS 2019/2020/2021, MSD, OASIS-1/2, OAI-ZIB, MnMs, Kaggle OSIC, TotalSegmentator (CT+MRI), PSMA-FDG-PET-CT-Lesion, CIA.
+The dataset files themselves are **not** included; obtain them from their respective sources and update the mapping paths.
+## Citation
+```bibtex
+@article{omnimorph,
+  title  = {OmniMorph: Deform All-in-One Framework for Medical Image Generation,
+            Restoration and Registration via Conditional Deformation-Recovery
+            Diffusion Models},
+  author = {Zheng, J. and Mo, M. and others},
+  year   = {2025}
+}
+```
+## License
+MIT — see `LICENSE`.

Scripts/OM_aug_om.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""
+OM_aug_om.py — Augmentation using OMorpher.
+Drop-in replacement for OM_aug.py. Produces identical outputs but uses
+OMorpher instead of DeformDDPM + STN + standalone apply_ddf().
+Usage:
+    python Scripts/OM_aug_om.py -C Config/config_om.yaml
+"""
+import os
+import sys
+import argparse
+# Add project root to path so imports work from Scripts/
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import numpy as np
+import torch
+import nibabel as nib
+import yaml
+from tqdm import tqdm
+import utils
+from Dataloader.dataLoader import OminiDataset_inference_w_all
+from torch.utils.data import DataLoader
+from OMorpher import OMorpher
+# ========== CLI ==========
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--config", "-C",
+    help="Path for the config file",
+    type=str,
+    default="Config/config_cmr.yaml",
+    required=False,
+)
+args = parser.parse_args()
+# ========== Config ==========
+with open(args.config, "r") as file:
+    hyp_parameters = yaml.safe_load(file)
+print(hyp_parameters)
+# Create the config-declared output folders; exist_ok avoids the check-then-create race.
+# NOTE(review): the three save paths are overridden in the dataset section below,
+# so the folders created here may end up unused.
+for _key in ("aug_img_savepath", "aug_msk_savepath", "aug_ddf_savepath"):
+    os.makedirs(hyp_parameters[_key], exist_ok=True)
+print(hyp_parameters["aug_img_savepath"])
+# Inference runs one sample at a time.
+hyp_parameters["batchsize"] = 1
+# ========== Dataset (identical to OM_aug.py) ==========
+min_crop_ratio = 0.9
+label_keys = ["heart"]
+database = ["MnMs"]
+subtype = "es"
+# Hard-coded output locations for this dataset/subtype replace the config values.
+hyp_parameters["aug_img_savepath"] = f"Data/Aug_data/mnms_{subtype}/img/"
+hyp_parameters["aug_msk_savepath"] = f"Data/Aug_data/mnms_{subtype}/msk/"
+hyp_parameters["aug_ddf_savepath"] = f"Data/Aug_data/mnms_{subtype}/ddf/"
+select_channels_dict = {"ImgDict": [subtype]}
+dataset = OminiDataset_inference_w_all(
+    transform=None,
+    min_crop_ratio=min_crop_ratio,
+    label_key=label_keys,
+    database=database,
+    select_channels_dict=select_channels_dict,
+)
+Infer_Loader = DataLoader(
+    dataset,
+    batch_size=hyp_parameters["batchsize"],
+    shuffle=False,
+)
+# ========== OMorpher setup ==========
+# Checkpoint path: Models/<data_name>_<net_name>/<model_id_str>_<data_name>_<net_name>.pth
+epoch = f'{hyp_parameters["model_id_str"]}_{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}'
+model_save_path = os.path.join(
+    f'Models/{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}/',
+    str(epoch) + ".pth",
+)
+print("Loading model from:", model_save_path)
+om = OMorpher(
+    config=hyp_parameters,
+    checkpoint_path=model_save_path,
+    device=str(hyp_parameters.get("device", "cpu")),
+)
+print(om)
+# ========== Output directories ==========
+os.makedirs(hyp_parameters["aug_img_savepath"], exist_ok=True)
+os.makedirs(hyp_parameters["aug_msk_savepath"], exist_ok=True)
+os.makedirs(hyp_parameters["aug_ddf_savepath"], exist_ok=True)
+# ========== Main inference loop ==========
+device = om.device
+print("total num of image:", len(Infer_Loader))
+# Wrap the loader itself (not the enumerate object) so tqdm can read its length
+# and display a total / ETA.
+for e, d in enumerate(tqdm(Infer_Loader)):
+    img = d["img"]
+    mask = d["labels"]
+    label_str = str(d["label_channels"])
+    # The loader index doubles as the patient id in output file names.
+    pid = e
+    print("Processing to patient:", pid, " image:", e)
+    img = img.type(torch.float32).to(device)
+    image_original = img.cpu().detach().numpy()
+    mask = mask.type(torch.float32).to(device)
+    mask_original = mask.cpu().detach().numpy()
+    # Save original image and mask
+    nifti_img = utils.converet_to_nibabel(image_original, ndims=hyp_parameters["ndims"])
+    nifti_mask = utils.converet_to_nibabel(mask_original, ndims=hyp_parameters["ndims"])
+    nib.save(
+        nifti_img,
+        os.path.join(
+            hyp_parameters["aug_img_savepath"],
+            utils.get_barcode([pid, e]) + ".nii.gz",
+        ),
+    )
+    nib.save(
+        nifti_mask,
+        os.path.join(
+            hyp_parameters["aug_msk_savepath"],
+            utils.get_barcode([pid, e]) + "_GT.nii.gz",
+        ),
+    )
+    # Augmentation loop
+    noise_step = hyp_parameters["start_noise_step"]
+    with torch.no_grad():
+        for im in range(hyp_parameters["aug_coe"]):
+            print(
+                f"Generating -> Subject-{pid}, Scan-{e} "
+                f'({im}/{hyp_parameters["aug_coe"]})',
+                end="\r",
+            )
+            # 1. Set init image (DataLoader tensor passes through)
+            om.set_init_img(img)
+            # 2. Self-conditioning (matches: cond_imgs = img_org.clone().detach())
+            om.set_cond_img(img)
+            # 3. Forward diffuse to get noisy image + random DDF
+            t_start = torch.tensor(np.array([noise_step]), device=device)
+            img_diff, _, ddf_rand = om._get_random_ddf(om._init_img, t_start)
+            # 4. Get noisy mask
+            msk_diff = om.apply_def(
+                img=mask, ddf=ddf_rand,
+                padding_mode="zeros", resample_mode="nearest",
+            )
+            # 5. Set random DDF as initial DDF
+            om.set_init_def(ddf=ddf_rand.clone().detach())
+            # 6. Run reverse diffusion
+            om.predict(
+                T=[noise_step, hyp_parameters["timesteps"]],
+                proc_type=hyp_parameters["condition_type"],
+            )
+            # 7. Get recovered outputs
+            ddf_comp = om.get_def()
+            img_rec = om.apply_def(img=img, ddf=ddf_comp, padding_mode="zeros")
+            msk_rec = om.apply_def(
+                img=mask, ddf=ddf_comp,
+                padding_mode="zeros", resample_mode="nearest",
+            )
+            # Convert to numpy for saving
+            denoise_imgs = img_rec.cpu().detach().numpy()
+            denoise_msks = msk_rec.cpu().detach().numpy()
+            noisy_imgs_np = img_diff.cpu().detach().numpy()
+            noisy_msks_np = msk_diff.cpu().detach().numpy()
+            # Save augmented (recovered) outputs
+            nifti_img_aug = utils.converet_to_nibabel(denoise_imgs, ndims=hyp_parameters["ndims"])
+            nifti_mask_aug = utils.converet_to_nibabel(denoise_msks, ndims=hyp_parameters["ndims"])
+            nifti_img = utils.converet_to_nibabel(noisy_imgs_np, ndims=hyp_parameters["ndims"])
+            nifti_mask = utils.converet_to_nibabel(noisy_msks_np, ndims=hyp_parameters["ndims"])
+            nib.save(
+                nifti_img_aug,
+                os.path.join(
+                    hyp_parameters["aug_img_savepath"],
+                    utils.get_barcode([pid, e, im, noise_step]) + ".nii.gz",
+                ),
+            )
+            nib.save(
+                nifti_mask_aug,
+                os.path.join(
+                    hyp_parameters["aug_msk_savepath"],
+                    utils.get_barcode([pid, e, im, noise_step]) + "_GT.nii.gz",
+                ),
+            )
+            # Save noisy image/mask
+            nib.save(
+                nifti_img,
+                os.path.join(
+                    hyp_parameters["aug_img_savepath"],
+                    utils.get_barcode(
+                        [pid, e, im, noise_step],
+                        header=["Patient", "Slice", "NoiseImg", "NoiseStep"],
+                    ) + ".nii.gz",
+                ),
+            )
+            nib.save(
+                nifti_mask,
+                os.path.join(
+                    hyp_parameters["aug_msk_savepath"],
+                    utils.get_barcode(
+                        [pid, e, im, noise_step],
+                        header=["Patient", "Slice", "NoiseImg", "NoiseStep"],
+                    ) + "_GT.nii.gz",
+                ),
+            )
+            # Advance the noise level used for the next augmentation.
+            if (im - hyp_parameters["start_noise_step"]) % 2 == 0:
+                noise_step = noise_step + hyp_parameters["noise_step"]
+    # NOTE(review): this ends the script after the first sample -- confirm it is intended.
+    # sys.exit() is used because the bare exit() helper is only installed by the site module.
+    if e >= 0:
+        sys.exit()

Scripts/OM_reg_flexres_om.py ADDED Viewed

	@@ -0,0 +1,315 @@

+"""
+OM_reg_flexres_om.py — Full-resolution registration using OMorpher.
+Drop-in replacement for OM_reg_flexres.py.  Produces identical outputs but
+uses OMorpher instead of DeformDDPM + STN + standalone apply_ddf().
+Usage:
+    python Scripts/OM_reg_flexres_om.py -C Config/config_om.yaml
+"""
+import os
+import sys
+import argparse
+# Add project root to path so imports work from Scripts/
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import numpy as np
+import torch
+import torch.nn.functional as F
+import nibabel as nib
+import yaml
+import SimpleITK as sitk
+from tqdm import tqdm
+import utils
+from Dataloader.dataLoader import OminiDataset_inference_w_all, reverse_axis_order
+from OMorpher import OMorpher
+# ========== CLI ==========
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--config", "-C",
+    help="Path for the config file",
+    type=str,
+    default="Config/config_om.yaml",
+    required=False,
+)
+args = parser.parse_args()
+# ========== Config ==========
+with open(args.config, "r") as file:
+    hyp_parameters = yaml.safe_load(file)
+print(hyp_parameters)
+# Create the config-declared augmentation folders; exist_ok avoids the
+# check-then-create race of `if not os.path.exists(...)`.
+for _key in ("aug_img_savepath", "aug_msk_savepath", "aug_ddf_savepath"):
+    os.makedirs(hyp_parameters[_key], exist_ok=True)
+print(hyp_parameters["aug_img_savepath"])
+# Inference runs one sample at a time.
+hyp_parameters["batchsize"] = 1
+model_img_sz = hyp_parameters["img_size"]
+# ========== Dataset (unchanged — used only for filtering/metadata) ==========
+label_keys = ["brain"]
+database = ["Brats2019"]
+dataset = OminiDataset_inference_w_all(
+    transform=None, min_crop_ratio=1.0, label_key=label_keys, database=database,
+)
+# ========== OMorpher setup ==========
+# Checkpoint path: Models/<data_name>_<net_name>/<model_id_str>_<data_name>_<net_name>.pth
+epoch = f'{hyp_parameters["model_id_str"]}_{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}'
+model_save_path = os.path.join(
+    f'Models/{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}/',
+    str(epoch) + ".pth",
+)
+print("Loading model from:", model_save_path)
+om = OMorpher(
+    config=hyp_parameters,
+    checkpoint_path=model_save_path,
+    device=str(hyp_parameters.get("device", "cpu")),
+)
+print(om)
+# ========== Output directories ==========
+# Full-resolution outputs go to sibling folders suffixed with "_fullres".
+reg_img_savepath_fullres = hyp_parameters["reg_img_savepath"].rstrip("/") + "_fullres/"
+reg_msk_savepath_fullres = hyp_parameters["reg_msk_savepath"].rstrip("/") + "_fullres/"
+reg_ddf_savepath_fullres = hyp_parameters["reg_ddf_savepath"].rstrip("/") + "_fullres/"
+for p in [
+    hyp_parameters["reg_img_savepath"],
+    hyp_parameters["reg_msk_savepath"],
+    hyp_parameters["reg_ddf_savepath"],
+    reg_img_savepath_fullres,
+    reg_msk_savepath_fullres,
+    reg_ddf_savepath_fullres,
+]:
+    os.makedirs(p, exist_ok=True)
+# ========== Helper: load full-res data (same as original) ==========
+def center_pad_to_cube(volume):
+    """Zero-pad the first three axes of ``volume`` to their common maximum length.
+    Padding is centred; when the amount is odd the extra element goes after the data.
+    Axes beyond the third are left unpadded.
+    """
+    spatial = volume.shape[:3]
+    target = max(spatial)
+    padding = [
+        ((target - n) // 2, (target - n) - (target - n) // 2)
+        for n in spatial
+    ]
+    # Trailing (non-spatial) axes keep their size.
+    padding.extend([(0, 0)] * (volume.ndim - 3))
+    return np.pad(volume, padding, mode="constant", constant_values=0)
+def load_fullres_volume(key, ds):
+    """Read the image at path ``key`` and prepare it at its original resolution.
+    Steps: read with SimpleITK, reorder axes, keep one channel of a 4-D volume,
+    clip CT intensities to ``ds.clamp_range`` (when set), normalise with
+    ``ds.normalize`` and centre-pad to a cube.
+    """
+    volume = reverse_axis_order(sitk.GetArrayFromImage(sitk.ReadImage(key)))
+    if volume.ndim == 4:
+        # Keep the first channel id reported by the dataset, or channel 0 if none.
+        channel_ids = ds.get_channel_ids(key)
+        channel_id = channel_ids[0] if len(channel_ids) > 0 else 0
+        volume = volume[:, :, :, channel_id]
+    # Clamping only applies to entries whose metadata marks them as CT.
+    if ds.clamp_range is not None and ds.ALLdata_filtered[key].get("Modality", None) == "CT":
+        volume = np.clip(volume, ds.clamp_range[0], ds.clamp_range[1])
+    return center_pad_to_cube(ds.normalize(volume))
+def load_fullres_label(key, ds, label_key):
+    """Read the ``label_key`` segmentation for ``key`` at its original resolution.
+    Returns ``None`` when the dataset entry has no segmentation path for
+    ``label_key``; otherwise the axis-reordered label, centre-padded to a cube.
+    """
+    seg_paths = ds.ALLdata_filtered[key].get("Label_path", {}).get("segmentation", {})
+    if label_key not in seg_paths:
+        return None
+    label = reverse_axis_order(
+        sitk.GetArrayFromImage(sitk.ReadImage(seg_paths[label_key]))
+    )
+    if label.ndim > 3:
+        # Keep only the channels selected for this entry, if any are listed.
+        channel_ids = ds.get_channel_ids(key)
+        if len(channel_ids) != 0:
+            label = label[..., channel_ids]
+    return center_pad_to_cube(label)
+# ========== Main inference loop ==========
+keys = list(dataset.ALLdata_filtered.keys())
+print("total num of images:", len(keys))
+device = om.device
+for e, key in enumerate(tqdm(keys)):
+    # The loop index doubles as the patient id in output file names.
+    pid = e
+    print(f"Processing patient {pid}, image {e}, key: {key}")
+    # --- Load & standardize volume via OMorpher ---
+    fullres_vol = load_fullres_volume(key, dataset)
+    om.set_init_img(fullres_vol)
+    img = om._init_img                    # [1, 1, model_sz, model_sz, model_sz]
+    fullres_img_tensor = om._init_img_raw  # [1, 1, D, H, W] full-res tensor
+    orig_sz = list(fullres_img_tensor.shape[2:])
+    print(f"  Full-res padded shape: {orig_sz}")
+    # --- Load & standardize labels via OMorpher ---
+    masks_model = []
+    masks_fullres = []
+    for lk in label_keys:
+        lab = load_fullres_label(key, dataset, lk)
+        model_t, fullres_t = om._standardize_label(lab)   # None → -1 placeholder
+        masks_model.append(model_t)
+        masks_fullres.append(fullres_t)
+    if masks_model:
+        mask = torch.cat(masks_model, dim=1)                # [1, C_total, S, S, S]
+        fullres_msk_tensor = torch.cat(masks_fullres, dim=1)  # [1, C_total, D, H, W]
+    else:
+        mask = None
+        fullres_msk_tensor = None
+    # --- Save target conditioning image (first subject) ---
+    if e <= 0:
+        target_img = img.clone().detach()
+    # --- Save original images at model resolution ---
+    image_original = img.cpu().numpy()
+    nib.save(
+        utils.converet_to_nibabel(image_original, ndims=hyp_parameters["ndims"]),
+        os.path.join(hyp_parameters["reg_img_savepath"],
+                     utils.get_barcode([pid, e]) + ".nii.gz"),
+    )
+    if mask is not None:
+        mask_original = mask.cpu().numpy()
+        nib.save(
+            utils.converet_to_nibabel(mask_original, ndims=hyp_parameters["ndims"]),
+            os.path.join(hyp_parameters["reg_msk_savepath"],
+                         utils.get_barcode([pid, e]) + "_GT.nii.gz"),
+        )
+    # --- Save original at full-res ---
+    # Tensors are moved to host memory and converted to numpy before saving,
+    # matching the model-resolution saves above.
+    nib.save(
+        utils.converet_to_nibabel(
+            fullres_img_tensor.detach().cpu().numpy(), ndims=hyp_parameters["ndims"],
+        ),
+        os.path.join(reg_img_savepath_fullres,
+                     utils.get_barcode([pid, e]) + ".nii.gz"),
+    )
+    if fullres_msk_tensor is not None:
+        nib.save(
+            utils.converet_to_nibabel(
+                fullres_msk_tensor.detach().cpu().numpy(), ndims=hyp_parameters["ndims"],
+            ),
+            os.path.join(reg_msk_savepath_fullres,
+                         utils.get_barcode([pid, e]) + "_GT.nii.gz"),
+        )
+    # --- Diffusion recovery via OMorpher ---
+    noise_step = hyp_parameters["start_noise_step"]
+    with torch.no_grad():
+        for im in range(1):
+            print(
+                f"  Generating -> Subject-{pid}, Scan-{e} "
+                f'({im}/{hyp_parameters["aug_coe"]})',
+                end="\r",
+            )
+            # Set up OMorpher inputs
+            om.set_init_img(img)
+            om.set_cond_img(target_img.clone().detach())
+            # Run diffusion recovery
+            # T=[None, timesteps] in original means: no initial noise, full reverse diffusion
+            om.predict(
+                T=[None, hyp_parameters["timesteps"]],
+                proc_type=hyp_parameters["condition_type"],
+            )
+            ddf_comp = om.get_def()
+            # Reconstruct images at model resolution using OMorpher
+            img_rec = om.apply_def(img=img, ddf=ddf_comp, padding_mode="zeros")
+            # --- Save model-resolution results ---
+            denoise_imgs = img_rec.cpu().numpy()
+            nib.save(
+                utils.converet_to_nibabel(denoise_imgs, ndims=hyp_parameters["ndims"]),
+                os.path.join(
+                    hyp_parameters["reg_img_savepath"],
+                    utils.get_barcode([pid, e, im, noise_step]) + ".nii.gz",
+                ),
+            )
+            if mask is not None:
+                msk_rec = om.apply_def(
+                    img=mask, ddf=ddf_comp,
+                    padding_mode="zeros", resample_mode="nearest",
+                )
+                denoise_msks = msk_rec.cpu().numpy()
+                nib.save(
+                    utils.converet_to_nibabel(denoise_msks, ndims=hyp_parameters["ndims"]),
+                    os.path.join(
+                        hyp_parameters["reg_msk_savepath"],
+                        utils.get_barcode([pid, e, im, noise_step]) + "_GT.nii.gz",
+                    ),
+                )
+            # --- Upscale DDF and apply at full resolution via OMorpher ---
+            img_rec_fullres = om.apply_def(
+                img=fullres_img_tensor, ddf=ddf_comp, padding_mode="border",
+            )
+            if fullres_msk_tensor is not None:
+                msk_rec_fullres = om.apply_def(
+                    img=fullres_msk_tensor, ddf=ddf_comp,
+                    padding_mode="zeros", resample_mode="nearest",
+                )
+            # Upscale DDF for saving
+            ddf_fullres = F.interpolate(
+                ddf_comp, size=orig_sz, mode="trilinear", align_corners=False,
+            )
+            # --- Save full-res results ---
+            nib.save(
+                utils.converet_to_nibabel(
+                    img_rec_fullres.detach().cpu().numpy(), ndims=hyp_parameters["ndims"],
+                ),
+                os.path.join(
+                    reg_img_savepath_fullres,
+                    utils.get_barcode([pid, e, im, noise_step]) + ".nii.gz",
+                ),
+            )
+            if fullres_msk_tensor is not None:
+                nib.save(
+                    utils.converet_to_nibabel(
+                        msk_rec_fullres.detach().cpu().numpy(), ndims=hyp_parameters["ndims"],
+                    ),
+                    os.path.join(
+                        reg_msk_savepath_fullres,
+                        utils.get_barcode([pid, e, im, noise_step]) + "_GT.nii.gz",
+                    ),
+                )
+            nib.save(
+                utils.converet_to_nibabel(
+                    ddf_fullres.detach().cpu().numpy(), ndims=hyp_parameters["ndims"],
+                ),
+                os.path.join(
+                    reg_ddf_savepath_fullres,
+                    utils.get_barcode([pid, e, im, noise_step]) + ".nii.gz",
+                ),
+            )
+            if (im - hyp_parameters["start_noise_step"]) % 2 == 0:
+                noise_step = noise_step + hyp_parameters["noise_step"]
+    # NOTE(review): processing stops once the index exceeds 5 -- confirm this cap is intended.
+    if e > 5:
+        break

Scripts/OM_reg_pair_ext.py ADDED Viewed

	@@ -0,0 +1,676 @@

+"""
+OM_reg_pair_ext.py — Paired registration using OMorpher with an external dataset.
+Loads fixed/moving pairs from a Learn2Reg-style JSON dataset file
+(e.g. HippocampusMR_dataset.json) and registers each moving image to its
+paired fixed image. Saves registered images, masks, DDFs, source originals,
+and evaluation metrics (DSC, ASD, HD) per organ label.
+Usage:
+    python Scripts/OM_reg_pair_ext.py -C Config/config_om.yaml \
+        --dataset-json /path/to/HippocampusMR_dataset.json \
+        --split val
+    python Scripts/OM_reg_pair_ext.py -C Config/config_om.yaml \
+        --dataset-json /path/to/HippocampusMR_dataset.json \
+        --split test -N 10
+"""
+import os
+import sys
+# Add project root to path so imports work from Scripts/
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import csv
+import json
+import numpy as np
+import torch
+import torch.nn.functional as F
+import nibabel as nib
+import yaml
+import SimpleITK as sitk
+from scipy.ndimage import distance_transform_edt, binary_erosion
+from tqdm import tqdm
+import utils
+from Dataloader.dataLoader import reverse_axis_order
+from OMorpher import OMorpher
+# ========== CLI ==========
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--config", "-C",
+    help="Path for the config file",
+    type=str,
+    default="Config/config_om.yaml",
+    required=False,
+)
+parser.add_argument(
+    "--dataset-json",
+    help="Path to the Learn2Reg-style dataset JSON",
+    type=str,
+    default="~/rds/rds-airr-p51-TWhPgQVLKbA/Code/Registration/Dataset/HippocampusMR/HippocampusMR_dataset.json",
+)
+parser.add_argument(
+    "--split",
+    help="Which registration split to use: 'val' or 'test'",
+    type=str,
+    choices=["val", "test"],
+    default="val",
+)
+parser.add_argument(
+    "--max-samples", "-N",
+    help="Max number of pairs to register (0 = all)",
+    type=int,
+    default=0,
+)
+args = parser.parse_args()
+# ========== Config ==========
+with open(args.config, "r") as file:
+    hyp_parameters = yaml.safe_load(file)
+print(hyp_parameters)
+# Inference runs one sample at a time.
+hyp_parameters["batchsize"] = 1
+model_img_sz = hyp_parameters["img_size"]
+timesteps = hyp_parameters["timesteps"]
+condition_type = hyp_parameters["condition_type"]
+ndims = hyp_parameters["ndims"]
+# ========== Load external dataset JSON ==========
+dataset_json_path = os.path.expanduser(args.dataset_json)
+# Relative paths inside the JSON are resolved against the JSON's own folder.
+dataset_root = os.path.dirname(dataset_json_path)
+with open(dataset_json_path, "r") as f:
+    dataset_meta = json.load(f)
+dataset_name = dataset_meta.get("name", "UnknownDataset")
+print(f"Dataset: {dataset_name}")
+# Select registration split. argparse `choices` already restricts --split to
+# "val" / "test", so the JSON key can be derived directly.
+pairs = dataset_meta.get(f"registration_{args.split}", [])
+if args.max_samples > 0:
+    pairs = pairs[: args.max_samples]
+print(f"Split: {args.split}, Pairs: {len(pairs)}")
+# Build label lookup: image basename -> label relative path
+# from the "training" entries in the JSON
+_label_lookup = {}
+for entry in dataset_meta.get("training", []):
+    img_base = os.path.basename(entry["image"])
+    _label_lookup[img_base] = entry.get("label")
+# Label class names are read from labels["0"] in the JSON,
+# e.g. {"0": "background", "1": "head", "2": "tail"}.
+_label_names = dataset_meta.get("labels", {}).get("0", {})
+# Organ labels are all non-background classes
+organ_label_ids = {int(k): v for k, v in _label_names.items() if int(k) > 0}
+print(f"Organ labels for evaluation: {organ_label_ids}")
+# ========== OMorpher setup ==========
+# Checkpoint path: Models/<data_name>_<net_name>/<model_id_str>_<data_name>_<net_name>.pth
+epoch = f'{hyp_parameters["model_id_str"]}_{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}'
+model_save_path = os.path.join(
+    f'Models/{hyp_parameters["data_name"]}_{hyp_parameters["net_name"]}/',
+    str(epoch) + ".pth",
+)
+print("Loading model from:", model_save_path)
+om = OMorpher(
+    config=hyp_parameters,
+    checkpoint_path=model_save_path,
+    device=str(hyp_parameters.get("device", "cpu")),
+)
+print(om)
+# ========== Output directories ==========
+reg_img_savepath = hyp_parameters["reg_img_savepath"]
+reg_msk_savepath = hyp_parameters["reg_msk_savepath"]
+reg_ddf_savepath = hyp_parameters["reg_ddf_savepath"]
+# Full-resolution outputs go to sibling folders suffixed with "_fullres".
+reg_img_savepath_fullres = reg_img_savepath.rstrip("/") + "_fullres/"
+reg_msk_savepath_fullres = reg_msk_savepath.rstrip("/") + "_fullres/"
+reg_ddf_savepath_fullres = reg_ddf_savepath.rstrip("/") + "_fullres/"
+eval_dir = os.path.join(reg_img_savepath, "..", "eval")
+for p in [
+    reg_img_savepath, reg_msk_savepath, reg_ddf_savepath,
+    reg_img_savepath_fullres, reg_msk_savepath_fullres, reg_ddf_savepath_fullres,
+    eval_dir,
+]:
+    os.makedirs(p, exist_ok=True)
+# ========== Helper functions ==========
+def resolve_path(rel_path):
+    """Return ``rel_path`` unchanged if absolute, else normalised under ``dataset_root``."""
+    return (
+        rel_path
+        if os.path.isabs(rel_path)
+        else os.path.normpath(os.path.join(dataset_root, rel_path))
+    )
+def load_volume(nifti_path):
+    """Read an image with SimpleITK and reorder its axes; no intensity processing.
+    A 4-D result is reduced to its first channel along the last axis.
+    Normalisation, cube padding and resizing are left to OMorpher, per the
+    original note on ``OMorpher._standardize_img``.
+    """
+    array = reverse_axis_order(sitk.GetArrayFromImage(sitk.ReadImage(nifti_path)))
+    return array[:, :, :, 0] if array.ndim == 4 else array
+def load_label(nifti_path):
+    """Read a label map with SimpleITK and reorder its axes.
+    A result with more than three dimensions is reduced with ``[:, :, :, 0]``.
+    Cube padding and nearest-neighbour resizing are left to OMorpher, per the
+    original note on ``OMorpher._standardize_label``.
+    """
+    array = reverse_axis_order(sitk.GetArrayFromImage(sitk.ReadImage(nifti_path)))
+    return array[:, :, :, 0] if array.ndim > 3 else array
+def get_label_path_for_image(image_rel_path):
+    """Return the resolved label path registered for this image's basename, or ``None``."""
+    label_rel = _label_lookup.get(os.path.basename(image_rel_path))
+    return None if label_rel is None else resolve_path(label_rel)
+def split_label_classes(label_map, class_ids):
+    """Build one binary float32 mask per requested class id.
+    Returns a dict {class_id: mask}, where each mask is 1.0 wherever
+    ``label_map`` equals that id and 0.0 elsewhere.
+    """
+    return {cid: (label_map == cid).astype(np.float32) for cid in class_ids}
+def get_volume_name(path):
+    """Return the file name of a NIfTI path with its extension removed.
+    Only a trailing ".nii.gz" or ".nii" is stripped; other names are
+    returned as their plain base name.
+    """
+    base = os.path.basename(path)
+    # ".nii.gz" is tried before ".nii"; only the first matching suffix is cut.
+    suffix = next((ext for ext in (".nii.gz", ".nii") if base.endswith(ext)), "")
+    return base[: len(base) - len(suffix)]
+# ---------- Evaluation metrics ----------
+def _surface_distances(pred, gt):
+    """Return the two directed surface-distance arrays for a pair of masks.
+    Both inputs are binarised at 0.5. The result is
+    (distances from pred surface to gt surface,
+    distances from gt surface to pred surface), or (None, None) when
+    either mask has no foreground.
+    """
+    fg_pred, fg_gt = pred > 0.5, gt > 0.5
+    if not (np.any(fg_pred) and np.any(fg_gt)):
+        return None, None
+    def _boundary(mask):
+        # Foreground voxels removed by one erosion form the surface; if that
+        # set is empty, the whole mask stands in for it.
+        shell = mask ^ binary_erosion(mask, structure=None)
+        return shell if np.any(shell) else mask
+    edge_pred = _boundary(fg_pred)
+    edge_gt = _boundary(fg_gt)
+    # Distance of every voxel to the nearest surface voxel of each mask.
+    to_gt = distance_transform_edt(~edge_gt)
+    to_pred = distance_transform_edt(~edge_pred)
+    return to_gt[edge_pred], to_pred[edge_gt]
+def compute_dsc(pred, gt):
+    """Dice Similarity Coefficient of two masks binarised at 0.5.
+    Two empty masks count as a perfect match and give 1.0.
+    """
+    fg_pred = pred > 0.5
+    fg_gt = gt > 0.5
+    total = np.count_nonzero(fg_pred) + np.count_nonzero(fg_gt)
+    if total == 0:
+        return 1.0
+    overlap = np.count_nonzero(fg_pred & fg_gt)
+    return 2.0 * float(overlap) / float(total)
+def compute_asd(pred, gt):
+    """Average (symmetric) Surface Distance.
+    This is the mean of the two directed mean surface distances; NaN is
+    returned when ``_surface_distances`` reports an empty mask.
+    """
+    forward, backward = _surface_distances(pred, gt)
+    if forward is None:
+        return float("nan")
+    directed_means = (np.mean(forward), np.mean(backward))
+    return (directed_means[0] + directed_means[1]) / 2.0
+def compute_hd(pred, gt):
+    """Hausdorff Distance: the larger of the two directed maxima.
+    NaN is returned when ``_surface_distances`` reports an empty mask.
+    """
+    forward, backward = _surface_distances(pred, gt)
+    if forward is None:
+        return float("nan")
+    worst_forward = np.max(forward)
+    worst_backward = np.max(backward)
+    return float(max(worst_forward, worst_backward))
+def compute_negdetj_pct(ddf, ndims=3):
+    """Percent of voxels with negative Jacobian determinant.
+    The Jacobian is identity plus the forward finite-difference gradient of
+    the displacement. The last slice along each axis is differenced against
+    itself, so its gradient there is zero.
+    Args:
+        ddf: displacement field tensor [1, ndims, ...] or numpy array
+            (with or without the leading batch axis).
+        ndims: 2 or 3.
+    Returns:
+        Percentage of voxels where det(Jacobian) < 0.
+    Raises:
+        ValueError: if ndims is neither 2 nor 3.
+    """
+    if isinstance(ddf, torch.Tensor):
+        ddf = ddf.detach().cpu().numpy()
+    if ddf.ndim == ndims + 2:
+        ddf = ddf[0]  # drop the batch axis -> [C, ...]
+    if ndims not in (2, 3):
+        raise ValueError(f"Unsupported ndims={ndims}")
+    def _fwd(comp, axis):
+        # Forward difference of one displacement component along one axis;
+        # the last slice is appended so the output keeps the input shape.
+        field = ddf[comp]
+        return np.diff(field, axis=axis, append=np.take(field, [-1], axis=axis))
+    # jac[i][j] = delta_ij + d(u_i) / d(x_j)
+    jac = [
+        [(1.0 + _fwd(i, j)) if i == j else _fwd(i, j) for j in range(ndims)]
+        for i in range(ndims)
+    ]
+    if ndims == 3:
+        (j11, j12, j13), (j21, j22, j23), (j31, j32, j33) = jac
+        # Cofactor expansion along the first row.
+        detj = (
+            j11 * (j22 * j33 - j23 * j32)
+            - j12 * (j21 * j33 - j23 * j31)
+            + j13 * (j21 * j32 - j22 * j31)
+        )
+    else:
+        (j11, j12), (j21, j22) = jac
+        detj = j11 * j22 - j12 * j21
+    negative_count = np.sum(detj < 0)
+    return 100.0 * float(negative_count) / float(detj.size)
+# ========== Prepare evaluation structures ==========
+# Both containers share one layout: container[class_id][metric_name][pair_idx] = value.
+# metrics holds post-registration scores (warped source vs target).
+metrics = {
+    cid: {metric_key: {} for metric_key in ("dsc", "asd", "hd")}
+    for cid in organ_label_ids
+}
+# metrics_pre holds pre-registration scores (source vs target, no deformation).
+metrics_pre = {
+    cid: {metric_key: {} for metric_key in ("dsc", "asd", "hd")}
+    for cid in organ_label_ids
+}
+# DDF quality is tracked per pair rather than per class:
+# pair_idx -> percentage of negative Jacobian determinant.
+negdetj_pct = dict()
+# Rows of (pair_idx, fixed_name, moving_name), reused when writing the CSVs.
+pair_info = list()
+# ========== Paired registration ==========
+# For every (fixed, moving) pair this loop:
+#   1. loads both volumes and, when available, their label maps;
+#   2. standardizes them through OMorpher (model-resolution and full-resolution tensors);
+#   3. saves the untouched originals;
+#   4. predicts the deformation (DDF) from moving to fixed;
+#   5. saves warped images/masks and the DDF at model and full resolution;
+#   6. records %|J|<0 plus per-class DSC/ASD/HD before and after registration.
+# The interpolation mode used to resize the DDF has to match the spatial rank:
+# "trilinear" only accepts 3-D spatial input, so 2-D runs need "bilinear".
+ddf_interp_mode = "trilinear" if ndims == 3 else "bilinear"
+with torch.no_grad():
+    for pair_idx, pair in enumerate(tqdm(pairs, desc="Pairs")):
+        fixed_rel = pair["fixed"]
+        moving_rel = pair["moving"]
+        fixed_path = resolve_path(fixed_rel)
+        moving_path = resolve_path(moving_rel)
+        fixed_name = get_volume_name(fixed_rel)
+        moving_name = get_volume_name(moving_rel)
+        # Target and source share the pair index in every output file name.
+        pair_tag = f"Tgt{pair_idx:04d}_Src{pair_idx:04d}"
+        pair_info.append((pair_idx, fixed_name, moving_name))
+        print(f"\n  [{pair_idx}] Fixed: {fixed_name}, Moving: {moving_name}")
+        # --- Load volumes ---
+        fixed_vol = load_volume(fixed_path)
+        moving_vol = load_volume(moving_path)
+        # --- Load labels (if available) ---
+        fixed_label_path = get_label_path_for_image(fixed_rel)
+        moving_label_path = get_label_path_for_image(moving_rel)
+        fixed_label_map = None
+        moving_label_map = None
+        if fixed_label_path is not None and os.path.exists(fixed_label_path):
+            fixed_label_map = load_label(fixed_label_path)
+        if moving_label_path is not None and os.path.exists(moving_label_path):
+            moving_label_map = load_label(moving_label_path)
+        # --- Prepare tensors via OMorpher ---
+        # set_init_img is used here as a standardizer: after each call the
+        # model-resolution and raw tensors are copied out of the OMorpher
+        # object before the next call overwrites them.
+        # Set moving image as init (source to be deformed)
+        om.set_init_img(moving_vol)
+        src_img_model = om._init_img.clone()
+        src_img_fullres = om._init_img_raw.clone()
+        src_orig_sz = list(src_img_fullres.shape[2:])
+        # Set fixed image as conditioning (target)
+        om.set_init_img(fixed_vol)
+        tgt_img_model = om._init_img.clone()
+        tgt_img_fullres = om._init_img_raw.clone()
+        # Standardize labels through OMorpher
+        src_mask_model, src_mask_fullres = None, None
+        tgt_mask_model, tgt_mask_fullres = None, None
+        if moving_label_map is not None:
+            # Split into per-class binary masks, stack as channels
+            src_class_masks = split_label_classes(moving_label_map, organ_label_ids.keys())
+            src_masks_model = []
+            src_masks_fullres = []
+            om.set_init_img(moving_vol)  # reset so _standardize_label uses correct shape
+            # Channels are stacked in ascending class-id order; the metric
+            # loop below indexes channels with the same ordering.
+            for cid in sorted(organ_label_ids.keys()):
+                m_model, m_fullres = om._standardize_label(src_class_masks[cid])
+                src_masks_model.append(m_model)
+                src_masks_fullres.append(m_fullres)
+            src_mask_model = torch.cat(src_masks_model, dim=1)
+            src_mask_fullres = torch.cat(src_masks_fullres, dim=1)
+        if fixed_label_map is not None:
+            tgt_class_masks = split_label_classes(fixed_label_map, organ_label_ids.keys())
+            tgt_masks_model = []
+            tgt_masks_fullres = []
+            om.set_init_img(fixed_vol)  # reset so _standardize_label uses correct shape
+            for cid in sorted(organ_label_ids.keys()):
+                m_model, m_fullres = om._standardize_label(tgt_class_masks[cid])
+                tgt_masks_model.append(m_model)
+                tgt_masks_fullres.append(m_fullres)
+            tgt_mask_model = torch.cat(tgt_masks_model, dim=1)
+            tgt_mask_fullres = torch.cat(tgt_masks_fullres, dim=1)
+        # --- Save target (fixed) original at model resolution ---
+        nib.save(
+            utils.converet_to_nibabel(tgt_img_model, ndims=ndims),
+            os.path.join(reg_img_savepath, f"{pair_tag}_TGT_ORG.nii.gz"),
+        )
+        if tgt_mask_model is not None:
+            nib.save(
+                utils.converet_to_nibabel(tgt_mask_model, ndims=ndims),
+                os.path.join(reg_msk_savepath, f"{pair_tag}_TGT_ORG_GT.nii.gz"),
+            )
+        # --- Save source (moving) original at model resolution ---
+        nib.save(
+            utils.converet_to_nibabel(src_img_model, ndims=ndims),
+            os.path.join(reg_img_savepath, f"Src{pair_idx:04d}_ORG.nii.gz"),
+        )
+        if src_mask_model is not None:
+            nib.save(
+                utils.converet_to_nibabel(src_mask_model, ndims=ndims),
+                os.path.join(reg_msk_savepath, f"Src{pair_idx:04d}_ORG_GT.nii.gz"),
+            )
+        # --- Save target original at full resolution ---
+        nib.save(
+            utils.converet_to_nibabel(tgt_img_fullres, ndims=ndims),
+            os.path.join(reg_img_savepath_fullres, f"{pair_tag}_TGT_ORG.nii.gz"),
+        )
+        if tgt_mask_fullres is not None:
+            nib.save(
+                utils.converet_to_nibabel(tgt_mask_fullres, ndims=ndims),
+                os.path.join(reg_msk_savepath_fullres, f"{pair_tag}_TGT_ORG_GT.nii.gz"),
+            )
+        # --- Save source original at full resolution ---
+        nib.save(
+            utils.converet_to_nibabel(src_img_fullres, ndims=ndims),
+            os.path.join(reg_img_savepath_fullres, f"Src{pair_idx:04d}_ORG.nii.gz"),
+        )
+        if src_mask_fullres is not None:
+            nib.save(
+                utils.converet_to_nibabel(src_mask_fullres, ndims=ndims),
+                os.path.join(reg_msk_savepath_fullres, f"Src{pair_idx:04d}_ORG_GT.nii.gz"),
+            )
+        # --- Register moving to fixed ---
+        om.set_init_img(src_img_model)
+        om.set_cond_img(tgt_img_model.clone().detach())
+        om.predict(
+            T=[None, timesteps],
+            proc_type=condition_type,
+        )
+        ddf_comp = om.get_def()
+        # --- DDF quality: percent negative Jacobian determinant ---
+        neg_pct = compute_negdetj_pct(ddf_comp, ndims=ndims)
+        negdetj_pct[pair_idx] = neg_pct
+        print(f"    %|J|<0 = {neg_pct:.4f}%")
+        # --- Model-resolution registered image ---
+        img_rec = om.apply_def(
+            img=src_img_model, ddf=ddf_comp, padding_mode="zeros",
+        )
+        nib.save(
+            utils.converet_to_nibabel(img_rec, ndims=ndims),
+            os.path.join(reg_img_savepath, f"{pair_tag}.nii.gz"),
+        )
+        # --- Model-resolution registered mask ---
+        msk_rec = None
+        if src_mask_model is not None:
+            # Nearest-neighbour resampling keeps the warped mask binary.
+            msk_rec = om.apply_def(
+                img=src_mask_model, ddf=ddf_comp,
+                padding_mode="zeros", resample_mode="nearest",
+            )
+            nib.save(
+                utils.converet_to_nibabel(msk_rec, ndims=ndims),
+                os.path.join(reg_msk_savepath, f"{pair_tag}_GT.nii.gz"),
+            )
+        # --- Model-resolution DDF ---
+        nib.save(
+            utils.converet_to_nibabel(ddf_comp, ndims=ndims),
+            os.path.join(reg_ddf_savepath, f"{pair_tag}.nii.gz"),
+        )
+        # --- Full-resolution registered image ---
+        # NOTE(review): this uses "border" padding while the model-resolution
+        # image above uses "zeros" — confirm the difference is intentional.
+        img_rec_fullres = om.apply_def(
+            img=src_img_fullres, ddf=ddf_comp, padding_mode="border",
+        )
+        nib.save(
+            utils.converet_to_nibabel(img_rec_fullres, ndims=ndims),
+            os.path.join(reg_img_savepath_fullres, f"{pair_tag}.nii.gz"),
+        )
+        # --- Full-resolution registered mask ---
+        msk_rec_fullres = None
+        if src_mask_fullres is not None:
+            msk_rec_fullres = om.apply_def(
+                img=src_mask_fullres, ddf=ddf_comp,
+                padding_mode="zeros", resample_mode="nearest",
+            )
+            nib.save(
+                utils.converet_to_nibabel(msk_rec_fullres, ndims=ndims),
+                os.path.join(reg_msk_savepath_fullres, f"{pair_tag}_GT.nii.gz"),
+            )
+        # --- Full-resolution DDF ---
+        # The mode follows ndims so that 2-D fields are resized with
+        # "bilinear" instead of failing on the 3-D-only "trilinear".
+        # NOTE(review): only the grid is resized; displacement values are not
+        # rescaled — confirm this matches the DDF's unit convention.
+        ddf_fullres = F.interpolate(
+            ddf_comp, size=src_orig_sz, mode=ddf_interp_mode, align_corners=False,
+        )
+        nib.save(
+            utils.converet_to_nibabel(ddf_fullres, ndims=ndims),
+            os.path.join(reg_ddf_savepath_fullres, f"{pair_tag}.nii.gz"),
+        )
+        # --- Evaluation metrics (full-res organ labels) ---
+        if (
+            organ_label_ids
+            and src_mask_fullres is not None
+            and tgt_mask_fullres is not None
+        ):
+            for ch_idx, cid in enumerate(sorted(organ_label_ids.keys())):
+                lk = organ_label_ids[cid]
+                tgt_mask_np = tgt_mask_fullres[0, ch_idx].cpu().numpy()
+                src_mask_np = src_mask_fullres[0, ch_idx].cpu().numpy()
+                # NOTE(review): split_label_classes yields 0/1 masks, so this
+                # guard only fires if _standardize_label introduces negative
+                # values — confirm what it is meant to skip.
+                if np.all(tgt_mask_np < 0) or np.all(src_mask_np < 0):
+                    continue
+                # Pre-registration: source vs target (no deformation)
+                pre_dsc = compute_dsc(src_mask_np, tgt_mask_np)
+                pre_asd = compute_asd(src_mask_np, tgt_mask_np)
+                pre_hd = compute_hd(src_mask_np, tgt_mask_np)
+                metrics_pre[cid]["dsc"][pair_idx] = pre_dsc
+                metrics_pre[cid]["asd"][pair_idx] = pre_asd
+                metrics_pre[cid]["hd"][pair_idx] = pre_hd
+                # Post-registration: registered mask vs target
+                if msk_rec_fullres is not None:
+                    reg_mask_np = msk_rec_fullres[0, ch_idx].cpu().numpy()
+                    post_dsc = compute_dsc(reg_mask_np, tgt_mask_np)
+                    post_asd = compute_asd(reg_mask_np, tgt_mask_np)
+                    post_hd = compute_hd(reg_mask_np, tgt_mask_np)
+                else:
+                    post_dsc = float("nan")
+                    post_asd = float("nan")
+                    post_hd = float("nan")
+                metrics[cid]["dsc"][pair_idx] = post_dsc
+                metrics[cid]["asd"][pair_idx] = post_asd
+                metrics[cid]["hd"][pair_idx] = post_hd
+                print(
+                    f"    [{lk}] PRE  DSC={pre_dsc:.4f}  ASD={pre_asd:.2f}  HD={pre_hd:.2f}"
+                )
+                print(
+                    f"    [{lk}] POST DSC={post_dsc:.4f}  ASD={post_asd:.2f}  HD={post_hd:.2f}"
+                )
+print("\nPaired registration complete.")
+# ========== Write evaluation CSVs ==========
+# Length of the pair list that was iterated by the registration loop.
+n_pairs = len(pairs)
+def _fmt(val):
+    """Render a metric value as CSV cell text.
+    None becomes an empty cell, NaN becomes "NaN", and any other value is
+    printed with six decimal places.
+    """
+    if val is None:
+        return ""
+    return "NaN" if np.isnan(val) else f"{val:.6f}"
+# --- Per-pair %|J|<0 CSV ---
+# One row per registered pair; pairs without a recorded value get an empty cell.
+negdetj_csv_path = os.path.join(eval_dir, "negdetj_pct.csv")
+with open(negdetj_csv_path, "w", newline="") as f:
+    writer = csv.writer(f)
+    writer.writerow(["pair_idx", "fixed", "moving", "negdetj_pct"])
+    writer.writerows(
+        [pi, fixed_name, moving_name, _fmt(negdetj_pct.get(pi))]
+        for pi, fixed_name, moving_name in pair_info
+    )
+print(f"Saved {negdetj_csv_path}")
+# One CSV per (class, metric) with pre- and post-registration columns.
+# The class name is prefixed to the file name only when several classes exist.
+for cid in sorted(organ_label_ids.keys()):
+    lk = organ_label_ids[cid]
+    prefix = "" if len(organ_label_ids) <= 1 else f"{lk}_"
+    for metric_name in ("dsc", "asd", "hd"):
+        mn_upper = metric_name.upper()
+        csv_path = os.path.join(eval_dir, f"{prefix}{metric_name}.csv")
+        with open(csv_path, "w", newline="") as f:
+            writer = csv.writer(f)
+            header = ["pair_idx", "fixed", "moving"]
+            header += [f"pre_{mn_upper}", f"post_{mn_upper}"]
+            writer.writerow(header)
+            pre_by_pair = metrics_pre[cid][metric_name]
+            post_by_pair = metrics[cid][metric_name]
+            # Pairs skipped during evaluation have no entry and yield empty cells.
+            writer.writerows(
+                [
+                    pi, fixed_name, moving_name,
+                    _fmt(pre_by_pair.get(pi)), _fmt(post_by_pair.get(pi)),
+                ]
+                for pi, fixed_name, moving_name in pair_info
+            )
+        print(f"Saved {csv_path}")
+# --- Overall summary ---
+def _without_nan(values):
+    """Return the values as a list with NaN entries dropped."""
+    return [v for v in values if not np.isnan(v)]
+def _mean_std(values):
+    """Return (mean, std) of a list, or (nan, nan) when the list is empty."""
+    if not values:
+        return float("nan"), float("nan")
+    return np.mean(values), np.std(values)
+overall_path = os.path.join(eval_dir, "overall.csv")
+with open(overall_path, "w", newline="") as f:
+    writer = csv.writer(f)
+    writer.writerow([
+        "label", "metric",
+        "pre_mean", "pre_std",
+        "post_mean", "post_std",
+        "n_pairs",
+    ])
+    # %|J|<0 summary (not per-label); it has no pre-registration value, so the
+    # two "pre" columns stay empty.
+    negdetj_vals = _without_nan(negdetj_pct.values())
+    negdetj_mean, negdetj_std = _mean_std(negdetj_vals)
+    writer.writerow([
+        "ALL",
+        "%|J|<0",
+        "", "",
+        _fmt(negdetj_mean),
+        _fmt(negdetj_std),
+        len(negdetj_vals),
+    ])
+    # One row per (class, metric), aggregated over all pairs with a non-NaN value.
+    for cid in sorted(organ_label_ids.keys()):
+        lk = organ_label_ids[cid]
+        for metric_name in ("dsc", "asd", "hd"):
+            pre_vals = _without_nan(metrics_pre[cid][metric_name].values())
+            post_vals = _without_nan(metrics[cid][metric_name].values())
+            pre_mean, pre_std = _mean_std(pre_vals)
+            post_mean, post_std = _mean_std(post_vals)
+            # The pair count is the larger of the two sample sizes.
+            n = max(len(pre_vals), len(post_vals))
+            writer.writerow([
+                lk,
+                metric_name.upper(),
+                _fmt(pre_mean), _fmt(pre_std),
+                _fmt(post_mean), _fmt(post_std),
+                n,
+            ])
+print(f"Saved {overall_path}")