spc819 committed
Commit 7f3dfd7 · verified · 1 Parent(s): f249a24

Upload 69 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. CVPR25_TextSegFMData_with_class.json +0 -0
  3. config_CT.json +93 -0
  4. config_nonCT.json +13 -0
  5. data/__init__.py +0 -0
  6. data/default_resampling.py +208 -0
  7. data/resample_torch.py +162 -0
  8. data/resampling_test.py +593 -0
  9. environment.yml +211 -0
  10. evaluate/SurfaceDice.py +492 -0
  11. evaluate/__init__.py +0 -0
  12. evaluate/evaluator.py +379 -0
  13. evaluate/merge_after_evaluate.py +198 -0
  14. evaluate/metric.py +46 -0
  15. evaluate/params.py +153 -0
  16. inference_medals_nifti.py +1885 -0
  17. model/SwinUNETR.py +1116 -0
  18. model/__init__.py +0 -0
  19. model/base_bert.py +26 -0
  20. model/build_model.py +103 -0
  21. model/dynamic-network-architectures-main/.gitignore +113 -0
  22. model/dynamic-network-architectures-main/LICENCE +201 -0
  23. model/dynamic-network-architectures-main/README.md +25 -0
  24. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/PKG-INFO +16 -0
  25. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/SOURCES.txt +24 -0
  26. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/dependency_links.txt +1 -0
  27. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/not-zip-safe +1 -0
  28. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/requires.txt +2 -0
  29. model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/top_level.txt +1 -0
  30. model/dynamic-network-architectures-main/dynamic_network_architectures/__init__.py +0 -0
  31. model/dynamic-network-architectures-main/dynamic_network_architectures/__pycache__/__init__.cpython-310.pyc +0 -0
  32. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__init__.py +0 -0
  33. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__pycache__/__init__.cpython-310.pyc +0 -0
  34. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__pycache__/unet.cpython-310.pyc +0 -0
  35. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/resnet.py +236 -0
  36. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/unet.py +220 -0
  37. model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/vgg.py +85 -0
  38. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__init__.py +0 -0
  39. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/__init__.cpython-310.pyc +0 -0
  40. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/helper.cpython-310.pyc +0 -0
  41. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/plain_conv_encoder.cpython-310.pyc +0 -0
  42. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/regularization.cpython-310.pyc +0 -0
  43. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/residual.cpython-310.pyc +0 -0
  44. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/residual_encoders.cpython-310.pyc +0 -0
  45. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/simple_conv_blocks.cpython-310.pyc +0 -0
  46. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/unet_decoder.cpython-310.pyc +0 -0
  47. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/helper.py +242 -0
  48. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/plain_conv_encoder.py +105 -0
  49. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/regularization.py +86 -0
  50. model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/residual.py +371 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model/dynamic-network-architectures-main/imgs/Logos/HI_Logo.png filter=lfs diff=lfs merge=lfs -text
CVPR25_TextSegFMData_with_class.json ADDED
The diff for this file is too large to render. See raw diff
 
config_CT.json ADDED
@@ -0,0 +1,93 @@
+ {
+     "texts_soft_tissue": [
+         "Aorta in whole body CT",
+         "gallbladder in whole body CT",
+         "left kidney in whole body CT",
+         "right kidney in whole body CT",
+         "liver in whole body CT",
+         "Pancreas in whole body CT",
+         "Spleen in whole body CT",
+         "stomach in whole body CT",
+         "Left adrenal gland in whole body CT",
+         "right adrenal gland in whole body CT",
+         "Bladder in whole body CT",
+         "Esophagus in whole body CT",
+         "Heart in whole body CT",
+         "Pulmonary vein in whole body CT",
+         "Brachiocephalic trunk in whole body CT",
+         "Right subclavian artery in whole body CT",
+         "Left subclavian artery in whole body CT",
+         "Right common carotid artery in whole body CT",
+         "Left common carotid artery in whole body CT",
+         "Left brachiocephalic vein in whole body CT",
+         "Right brachiocephalic vein in whole body CT",
+         "Left atrial appendage in whole body CT",
+         "Superior vena cava in whole body CT",
+         "Inferior vena cava in whole body CT",
+         "Portal vein and splenic vein in whole body CT",
+         "Left iliac artery in whole body CT",
+         "Right iliac artery in whole body CT",
+         "Left iliac vena in whole body CT",
+         "Right iliac vena in whole body CT",
+         "Spinal cord in whole body CT",
+         "Left gluteus Maximus in whole body CT",
+         "Right gluteus Maximus in whole body CT",
+         "Left gluteus Medius in whole body CT",
+         "Right gluteus Medius in whole body CT",
+         "Left gluteus Minimus in whole body CT",
+         "Right gluteus Minimus in whole body CT",
+         "Left autochthon in whole body CT",
+         "Right autochthon in whole body CT",
+         "Left iliopsoas in whole body CT",
+         "Right iliopsoas in whole body CT"
+     ],
+     "texts_bone": [
+         "Vertebrae C7 in whole body CT",
+         "Vertebrae C6 in whole body CT",
+         "Vertebrae C5 in whole body CT",
+         "Vertebrae C4 in whole body CT",
+         "Vertebrae C3 in whole body CT",
+         "Vertebrae C2 in whole body CT",
+         "Vertebrae C1 in whole body CT",
+         "Vertebrae T12 in whole body CT",
+         "Vertebrae T11 in whole body CT",
+         "Vertebrae T10 in whole body CT",
+         "Vertebrae T9 in whole body CT",
+         "Vertebrae T8 in whole body CT",
+         "Vertebrae T7 in whole body CT",
+         "Vertebrae T6 in whole body CT",
+         "Vertebrae T5 in whole body CT",
+         "Vertebrae T4 in whole body CT",
+         "Vertebrae T3 in whole body CT",
+         "Vertebrae T2 in whole body CT",
+         "Vertebrae T1 in whole body CT",
+         "Left humerus in whole body CT",
+         "Right humerus in whole body CT",
+         "Left clavicula in whole body CT",
+         "Right clavicula in whole body CT",
+         "Left femur in whole body CT",
+         "Right femur in whole body CT",
+         "Left hip in whole body CT",
+         "Right hip in whole body CT"
+     ],
+     "texts_lung": [
+         "Left lung in whole body CT",
+         "Right lung in whole body CT"
+     ],
+     "window_settings": {
+         "soft_tissue": {
+             "window_level": 40,
+             "window_width": 400
+         },
+         "bone": {
+             "window_level": 500,
+             "window_width": 1500
+         },
+         "lung": {
+             "window_level": -600,
+             "window_width": 1500
+         }
+     },
+     "modality": "CT",
+     "instance_label": 0
+ }
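Note: `window_settings` follows the standard CT windowing convention, where `window_level` is the centre and `window_width` the range of Hounsfield units kept before rescaling, so the `soft_tissue` preset keeps roughly [-160, 240] HU and `lung` keeps [-1350, 150] HU. A minimal sketch of how such a preset could be applied (the helper name `apply_ct_window` is illustrative, not part of this repository):

```python
import numpy as np

def apply_ct_window(volume: np.ndarray, window_level: float, window_width: float) -> np.ndarray:
    """Clip a CT volume to [level - width/2, level + width/2] and rescale to [0, 1]."""
    lower = window_level - window_width / 2.0
    upper = window_level + window_width / 2.0
    windowed = np.clip(volume, lower, upper)
    return (windowed - lower) / (upper - lower)

# e.g. the "soft_tissue" preset above: level 40, width 400 -> clip to [-160, 240] HU
# windowed = apply_ct_window(ct_volume_in_hu, 40, 400)
```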
config_nonCT.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "texts": [
+         "Spleen in MRI"
+     ],
+     "normalization_settings": {
+         "percentile_lower": 0.5,
+         "percentile_upper": 99.5,
+         "preserve_zero": true
+     },
+     "modality": "MRI",
+     "instance_label": 0
+ }
+
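Note: `normalization_settings` describes percentile-based intensity normalization for non-CT modalities. A small sketch of one plausible reading of these keys (the helper name is illustrative; the repository's own preprocessing may differ in detail):

```python
import numpy as np

def percentile_normalize(volume: np.ndarray,
                         percentile_lower: float = 0.5,
                         percentile_upper: float = 99.5,
                         preserve_zero: bool = True) -> np.ndarray:
    """Clip to the given intensity percentiles and min-max rescale to [0, 1]."""
    # With preserve_zero, statistics are taken over non-zero voxels so a large
    # zero-valued background does not dominate the percentiles, and exact zeros stay zero.
    values = volume[volume != 0] if preserve_zero and np.any(volume != 0) else volume
    lo, hi = np.percentile(values, [percentile_lower, percentile_upper])
    out = np.clip(volume.astype(np.float32), lo, hi)
    out = (out - lo) / max(float(hi - lo), 1e-8)
    if preserve_zero:
        out[volume == 0] = 0.0
    return out
```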
data/__init__.py ADDED
File without changes
data/default_resampling.py ADDED
@@ -0,0 +1,208 @@
+ from collections import OrderedDict
+ from copy import deepcopy
+ from typing import Union, Tuple, List
+
+ import numpy as np
+ import pandas as pd
+ import sklearn
+ import torch
+ from batchgenerators.augmentations.utils import resize_segmentation
+ from scipy.ndimage import map_coordinates
+ from skimage.transform import resize
+
+ ANISO_THRESHOLD = 3  # determines when a sample is considered anisotropic (3 means that the spacing in the low
+ # resolution axis must be 3x as large as the next largest spacing)
+
+ def get_do_separate_z(spacing: Union[Tuple[float, ...], List[float], np.ndarray], anisotropy_threshold=ANISO_THRESHOLD):
+     do_separate_z = (np.max(spacing) / np.min(spacing)) > anisotropy_threshold
+     return do_separate_z
+
+
+ def get_lowres_axis(new_spacing: Union[Tuple[float, ...], List[float], np.ndarray]):
+     axis = np.where(max(new_spacing) / np.array(new_spacing) == 1)[0]  # find which axis is anisotropic
+     return axis
+
+
+ def compute_new_shape(old_shape: Union[Tuple[int, ...], List[int], np.ndarray],
+                       old_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+                       new_spacing: Union[Tuple[float, ...], List[float], np.ndarray]) -> np.ndarray:
+     assert len(old_spacing) == len(old_shape)
+     assert len(old_shape) == len(new_spacing)
+     new_shape = np.array([int(round(i / j * k)) for i, j, k in zip(old_spacing, new_spacing, old_shape)])
+     return new_shape
+
+
+
+ def determine_do_sep_z_and_axis(
+         force_separate_z: bool,
+         current_spacing,
+         new_spacing,
+         separate_z_anisotropy_threshold: float = ANISO_THRESHOLD) -> Tuple[bool, Union[int, None]]:
+     if force_separate_z is not None:
+         do_separate_z = force_separate_z
+         if force_separate_z:
+             axis = get_lowres_axis(current_spacing)
+         else:
+             axis = None
+     else:
+         if get_do_separate_z(current_spacing, separate_z_anisotropy_threshold):
+             do_separate_z = True
+             axis = get_lowres_axis(current_spacing)
+         elif get_do_separate_z(new_spacing, separate_z_anisotropy_threshold):
+             do_separate_z = True
+             axis = get_lowres_axis(new_spacing)
+         else:
+             do_separate_z = False
+             axis = None
+
+     if axis is not None:
+         if len(axis) == 3:
+             do_separate_z = False
+             axis = None
+         elif len(axis) == 2:
+             # this happens for spacings like (0.24, 1.25, 1.25) for example. In that case we do not want to resample
+             # separately in the out of plane axis
+             do_separate_z = False
+             axis = None
+         else:
+             axis = axis[0]
+     return do_separate_z, axis
+
+
+ def resample_data_or_seg_to_spacing(data: np.ndarray,
+                                     current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+                                     new_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+                                     is_seg: bool = False,
+                                     order: int = 3, order_z: int = 0,
+                                     force_separate_z: Union[bool, None] = False,
+                                     separate_z_anisotropy_threshold: float = ANISO_THRESHOLD):
+     do_separate_z, axis = determine_do_sep_z_and_axis(force_separate_z, current_spacing, new_spacing,
+                                                       separate_z_anisotropy_threshold)
+
+     if data is not None:
+         assert data.ndim == 4, "data must be c x y z"
+
+     shape = np.array(data.shape)
+     new_shape = compute_new_shape(shape[1:], current_spacing, new_spacing)
+
+     data_reshaped = resample_data_or_seg(data, new_shape, is_seg, axis, order, do_separate_z, order_z=order_z)
+     return data_reshaped
+
+
+ def resample_data_or_seg_to_shape(data: Union[torch.Tensor, np.ndarray],
+                                   new_shape: Union[Tuple[int, ...], List[int], np.ndarray],
+                                   current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+                                   new_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+                                   is_seg: bool = False,
+                                   order: int = 3, order_z: int = 0,
+                                   force_separate_z: Union[bool, None] = False,
+                                   separate_z_anisotropy_threshold: float = ANISO_THRESHOLD):
+     """
+     needed for segmentation export. Stupid, I know
+     """
+     if isinstance(data, torch.Tensor):
+         data = data.numpy()
+
+     do_separate_z, axis = determine_do_sep_z_and_axis(force_separate_z, current_spacing, new_spacing,
+                                                       separate_z_anisotropy_threshold)
+
+     if data is not None:
+         assert data.ndim == 4, "data must be c x y z"
+
+     data_reshaped = resample_data_or_seg(data, new_shape, is_seg, axis, order, do_separate_z, order_z=order_z)
+     return data_reshaped
+
+
+ def resample_data_or_seg(data: np.ndarray, new_shape: Union[Tuple[float, ...], List[float], np.ndarray],
+                          is_seg: bool = False, axis: Union[None, int] = None, order: int = 3,
+                          do_separate_z: bool = False, order_z: int = 0, dtype_out=None):
+     """
+     separate_z=True will resample with order 0 along z
+     :param data:
+     :param new_shape:
+     :param is_seg:
+     :param axis:
+     :param order:
+     :param do_separate_z:
+     :param order_z: only applies if do_separate_z is True
+     :return:
+     """
+     assert data.ndim == 4, "data must be (c, x, y, z)"
+     assert len(new_shape) == data.ndim - 1
+
+     if is_seg:
+         resize_fn = resize_segmentation
+         kwargs = OrderedDict()
+     else:
+         resize_fn = resize
+         kwargs = {'mode': 'edge', 'anti_aliasing': False}
+     shape = np.array(data[0].shape)
+     new_shape = np.array(new_shape)
+     if dtype_out is None:
+         dtype_out = data.dtype
+     reshaped_final = np.zeros((data.shape[0], *new_shape), dtype=dtype_out)
+     if np.any(shape != new_shape):
+         data = data.astype(float, copy=False)
+         if do_separate_z:
+             # print("separate z, order in z is", order_z, "order inplane is", order)
+             assert axis is not None, 'If do_separate_z, we need to know what axis is anisotropic'
+             if axis == 0:
+                 new_shape_2d = new_shape[1:]
+             elif axis == 1:
+                 new_shape_2d = new_shape[[0, 2]]
+             else:
+                 new_shape_2d = new_shape[:-1]
+
+             for c in range(data.shape[0]):
+                 tmp = deepcopy(new_shape)
+                 tmp[axis] = shape[axis]
+                 reshaped_here = np.zeros(tmp)
+                 for slice_id in range(shape[axis]):
+                     if axis == 0:
+                         reshaped_here[slice_id] = resize_fn(data[c, slice_id], new_shape_2d, order, **kwargs)
+                     elif axis == 1:
+                         reshaped_here[:, slice_id] = resize_fn(data[c, :, slice_id], new_shape_2d, order, **kwargs)
+                     else:
+                         reshaped_here[:, :, slice_id] = resize_fn(data[c, :, :, slice_id], new_shape_2d, order, **kwargs)
+                 if shape[axis] != new_shape[axis]:
+
+                     # The following few lines are blatantly copied and modified from sklearn's resize()
+                     rows, cols, dim = new_shape[0], new_shape[1], new_shape[2]
+                     orig_rows, orig_cols, orig_dim = reshaped_here.shape
+
+                     # align_corners=False
+                     row_scale = float(orig_rows) / rows
+                     col_scale = float(orig_cols) / cols
+                     dim_scale = float(orig_dim) / dim
+
+                     map_rows, map_cols, map_dims = np.mgrid[:rows, :cols, :dim]
+                     map_rows = row_scale * (map_rows + 0.5) - 0.5
+                     map_cols = col_scale * (map_cols + 0.5) - 0.5
+                     map_dims = dim_scale * (map_dims + 0.5) - 0.5
+
+                     coord_map = np.array([map_rows, map_cols, map_dims])
+                     if not is_seg or order_z == 0:
+                         reshaped_final[c] = map_coordinates(reshaped_here, coord_map, order=order_z, mode='nearest')[None]
+                     else:
+                         unique_labels = np.sort(pd.unique(reshaped_here.ravel()))  # np.unique(reshaped_data)
+                         for i, cl in enumerate(unique_labels):
+                             reshaped_final[c][np.round(
+                                 map_coordinates((reshaped_here == cl).astype(float), coord_map, order=order_z,
+                                                 mode='nearest')) > 0.5] = cl
+                 else:
+                     reshaped_final[c] = reshaped_here
+         else:
+             # print("no separate z, order", order)
+             for c in range(data.shape[0]):
+                 reshaped_final[c] = resize_fn(data[c], new_shape, order, **kwargs)
+         return reshaped_final
+     else:
+         # print("no resampling necessary")
+         return data
+
+
+ if __name__ == '__main__':
+     input_array = np.random.random((1, 42, 231, 142))
+     output_shape = (52, 256, 256)
+     out = resample_data_or_seg(input_array, output_shape, is_seg=False, axis=3, order=1, order_z=0, do_separate_z=True)
+     print(out.shape, input_array.shape)
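A brief, hypothetical usage sketch for the helpers above (shapes and spacings are invented). Passing `force_separate_z=None` lets the `ANISO_THRESHOLD` heuristic decide; here 5.0 / 0.8 > 3, so the low-resolution axis is resampled slice-wise in-plane and then interpolated along that axis with `order_z`:

```python
import numpy as np
from data.default_resampling import compute_new_shape, resample_data_or_seg_to_spacing

# (c, x, y, z) volume with strongly anisotropic spacing
image = np.random.random((1, 40, 256, 256)).astype(np.float32)
current_spacing = (5.0, 0.8, 0.8)
target_spacing = (1.5, 1.5, 1.5)

print(compute_new_shape(image.shape[1:], current_spacing, target_spacing))  # [133 137 137]
resampled = resample_data_or_seg_to_spacing(image, current_spacing, target_spacing,
                                            is_seg=False, order=3, order_z=0,
                                            force_separate_z=None)
print(resampled.shape)  # (1, 133, 137, 137)
```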
data/resample_torch.py ADDED
@@ -0,0 +1,162 @@
+ from copy import deepcopy
+ from typing import Union, Tuple, List
+
+ import numpy as np
+ import torch
+ from einops import rearrange
+ from torch.nn import functional as F
+
+ from data.default_resampling import determine_do_sep_z_and_axis
+
+ ANISO_THRESHOLD = 3  # determines when a sample is considered anisotropic (3 means that the spacing in the low
+ # resolution axis must be 3x as large as the next largest spacing)
+
+ def resample_torch_simple(
+         data: Union[torch.Tensor, np.ndarray],
+         new_shape: Union[Tuple[int, ...], List[int], np.ndarray],
+         is_seg: bool = False,
+         num_threads: int = 4,
+         device: torch.device = torch.device('cpu'),
+         memefficient_seg_resampling: bool = False,
+         mode='linear'
+ ):
+     if mode == 'linear':
+         if data.ndim == 4:
+             torch_mode = 'trilinear'
+         elif data.ndim == 3:
+             torch_mode = 'bilinear'
+         else:
+             raise RuntimeError
+     else:
+         torch_mode = mode
+
+     if isinstance(new_shape, np.ndarray):
+         new_shape = [int(i) for i in new_shape]
+
+     if all([i == j for i, j in zip(new_shape, data.shape[1:])]):
+         return data
+     else:
+         n_threads = torch.get_num_threads()
+         torch.set_num_threads(num_threads)
+         new_shape = tuple(new_shape)
+         with torch.no_grad():
+
+             input_was_numpy = isinstance(data, np.ndarray)
+             if input_was_numpy:
+                 data = torch.from_numpy(data).to(device)
+             else:
+                 orig_device = deepcopy(data.device)
+                 data = data.to(device)
+
+             if is_seg:
+                 unique_values = torch.unique(data)
+                 result_dtype = torch.int8 if max(unique_values) < 127 else torch.int16
+                 result = torch.zeros((data.shape[0], *new_shape), dtype=result_dtype, device=device)
+                 if not memefficient_seg_resampling:
+                     # believe it or not, the implementation below is 3x as fast (at least on Liver CT and on CPU)
+                     # Why? Because argmax is slow. The implementation below immediately sets most locations and only lets the
+                     # uncertain ones be determined by argmax
+
+                     # unique_values = torch.unique(data)
+                     # result = torch.zeros((len(unique_values), data.shape[0], *new_shape), dtype=torch.float16)
+                     # for i, u in enumerate(unique_values):
+                     #     result[i] = F.interpolate((data[None] == u).float() * 1000, new_shape, mode='trilinear', antialias=False)[0]
+                     # result = unique_values[result.argmax(0)]
+
+                     result_tmp = torch.zeros((len(unique_values), data.shape[0], *new_shape), dtype=torch.float16,
+                                              device=device)
+                     scale_factor = 1000
+                     done_mask = torch.zeros_like(result, dtype=torch.bool, device=device)
+                     for i, u in enumerate(unique_values):
+                         result_tmp[i] = \
+                             F.interpolate((data[None] == u).float() * scale_factor, new_shape, mode=torch_mode,
+                                           antialias=False)[0]
+                         mask = result_tmp[i] > (0.7 * scale_factor)
+                         result[mask] = u.item()
+                         done_mask |= mask
+                     if not torch.all(done_mask):
+                         # print('resolving argmax', torch.sum(~done_mask), "voxels to go")
+                         result[~done_mask] = unique_values[result_tmp[:, ~done_mask].argmax(0)].to(result_dtype)
+                 else:
+                     for i, u in enumerate(unique_values):
+                         if u == 0:
+                             pass
+                         result[F.interpolate((data[None] == u).float(), new_shape, mode=torch_mode, antialias=False)[
+                                    0] > 0.5] = u
+             else:
+                 result = F.interpolate(data[None].float(), new_shape, mode=torch_mode, antialias=False)[0]
+             if input_was_numpy:
+                 result = result.cpu().numpy()
+             else:
+                 result = result.to(orig_device)
+         torch.set_num_threads(n_threads)
+         return result
+
+
+ def resample_torch_fornnunet(
+         data: Union[torch.Tensor, np.ndarray],
+         new_shape: Union[Tuple[int, ...], List[int], np.ndarray],
+         current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+         new_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
+         is_seg: bool = False,
+         num_threads: int = 4,
+         device: torch.device = torch.device('cpu'),
+         memefficient_seg_resampling: bool = False,
+         force_separate_z: Union[bool, None] = None,
+         separate_z_anisotropy_threshold: float = ANISO_THRESHOLD,
+         mode='linear',
+         aniso_axis_mode='nearest-exact'
+ ):
+     """
+     data must be c, x, y, z
+     """
+     assert data.ndim == 4, "data must be c, x, y, z"
+     new_shape = [int(i) for i in new_shape]
+     orig_shape = data.shape
+
+     do_separate_z, axis = determine_do_sep_z_and_axis(force_separate_z, current_spacing, new_spacing,
+                                                       separate_z_anisotropy_threshold)
+     # print('shape', data.shape, 'current_spacing', current_spacing, 'new_spacing', new_spacing, 'do_separate_z', do_separate_z, 'axis', axis)
+
+     if do_separate_z:
+         was_numpy = isinstance(data, np.ndarray)
+         if was_numpy:
+             data = torch.from_numpy(data)
+
+         if isinstance(axis, list):
+             assert len(axis) == 1
+             axis = axis[0]
+         else:
+             pass
+
+         tmp = "xyz"
+         axis_letter = tmp[axis]
+         others_int = [i for i in range(3) if i != axis]
+         others = [tmp[i] for i in others_int]
+
+         # reshape by overloading c channel
+         data = rearrange(data, f"c x y z -> (c {axis_letter}) {others[0]} {others[1]}")
+
+         # reshape in-plane
+         tmp_new_shape = [new_shape[i] for i in others_int]
+         data = resample_torch_simple(data, tmp_new_shape, is_seg=is_seg, num_threads=num_threads, device=device,
+                                      memefficient_seg_resampling=memefficient_seg_resampling, mode=mode)
+         data = rearrange(data, f"(c {axis_letter}) {others[0]} {others[1]} -> c x y z",
+                          **{
+                              axis_letter: orig_shape[axis + 1],
+                              others[0]: tmp_new_shape[0],
+                              others[1]: tmp_new_shape[1]
+                          }
+                          )
+         # reshape out of plane w/ nearest
+         data = resample_torch_simple(data, new_shape, is_seg=is_seg, num_threads=num_threads, device=device,
+                                      memefficient_seg_resampling=memefficient_seg_resampling, mode=aniso_axis_mode)
+         if was_numpy:
+             data = data.numpy()
+         return data
+     else:
+         return resample_torch_simple(data, new_shape, is_seg, num_threads, device, memefficient_seg_resampling)
+
+
+ if __name__ == '__main__':
+     torch.set_num_threads(16)
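A small, hypothetical usage example of `resample_torch_fornnunet` on an anisotropic segmentation (all values are invented). The spacing ratio 5.0 / 0.8 > 3 triggers the separate-z path: the volume is resampled in-plane first and then along the low-resolution axis with `aniso_axis_mode='nearest-exact'`:

```python
import numpy as np
import torch
from data.resample_torch import resample_torch_fornnunet

seg = np.zeros((1, 20, 64, 64), dtype=np.int8)  # (c, x, y, z)
seg[0, 8:12, 20:40, 20:40] = 1

out = resample_torch_fornnunet(
    seg,
    new_shape=(40, 51, 51),            # roughly matches the spacing change below
    current_spacing=(5.0, 0.8, 0.8),
    new_spacing=(2.5, 1.0, 1.0),
    is_seg=True,
    device=torch.device('cpu'),
)
print(out.shape, out.dtype)  # (1, 40, 51, 51) int8
```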
data/resampling_test.py ADDED
@@ -0,0 +1,593 @@
1
+ from typing import Union, Tuple, List
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+ import time
7
+ from copy import deepcopy
8
+ from default_resampling import determine_do_sep_z_and_axis
9
+ import psutil
10
+ import nibabel as nib
11
+ import os
12
+ from pathlib import Path
13
+
14
+ ANISO_THRESHOLD = 3
15
+
16
+ def compute_new_shape(current_shape: Union[Tuple[int, ...], List[int], np.ndarray],
17
+ current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
18
+ target_spacing: Union[Tuple[float, ...], List[float], np.ndarray]) -> List[int]:
19
+ """Compute new shape based on spacing ratios."""
20
+ current_shape = np.array(current_shape)
21
+ current_spacing = np.array(current_spacing)
22
+ target_spacing = np.array(target_spacing)
23
+ return [int(round(s * (cs / ts))) for s, cs, ts in zip(current_shape, current_spacing, target_spacing)]
24
+
25
+ def optimized_3d_resample(
26
+ data: Union[torch.Tensor, np.ndarray],
27
+ current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
28
+ target_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
29
+ is_seg: bool = False,
30
+ device: torch.device = torch.device('cpu'),
31
+ num_threads: int = 8,
32
+ chunk_size: int = 64,
33
+ force_separate_z: Union[bool, None] = None,
34
+ separate_z_anisotropy_threshold: float = ANISO_THRESHOLD,
35
+ preserve_range: bool = True
36
+ ) -> Union[torch.Tensor, np.ndarray]:
37
+ """
38
+ Optimized 3D image resampling with adaptive interpolation and chunked processing.
39
+
40
+ Args:
41
+ data: Input 3D volume [C, D, H, W] or [D, H, W]
42
+ current_spacing: Current voxel spacing (z, y, x)
43
+ target_spacing: Target voxel spacing (z, y, x)
44
+ is_seg: Whether the input is a segmentation mask
45
+ device: Torch device for computation
46
+ num_threads: Number of threads for CPU operations
47
+ chunk_size: Size of chunks for large volume processing
48
+ force_separate_z: Force separate z resampling
49
+ separate_z_anisotropy_threshold: Threshold for anisotropic resampling
50
+ preserve_range: Preserve original value range for non-segmentation data
51
+
52
+ Returns:
53
+ Resampled 3D volume
54
+ """
55
+ print(f"\nStarting optimized_3d_resample with input shape: {data.shape}, is_seg: {is_seg}")
56
+ input_was_numpy = isinstance(data, np.ndarray)
57
+ if input_was_numpy:
58
+ data = torch.from_numpy(data).to(device)
59
+ else:
60
+ data = data.to(device)
61
+ print(f"Input converted to tensor on {device}, shape: {data.shape}")
62
+
63
+ if data.ndim == 3:
64
+ data = data.unsqueeze(0)
65
+ assert data.ndim == 4, "Data must be 3D or 4D (C, D, H, W)"
66
+
67
+ new_shape = compute_new_shape(data.shape[1:], current_spacing, target_spacing)
68
+ print(f"Computed new shape: {new_shape} from current_spacing: {current_spacing}, target_spacing: {target_spacing}")
69
+
70
+ if all(i == j for i, j in zip(new_shape, data.shape[1:])):
71
+ print("No resampling needed, shapes identical.")
72
+ return data.cpu().numpy() if input_was_numpy else data
73
+
74
+ mode = 'nearest' if is_seg else 'trilinear'
75
+ aniso_axis_mode = 'nearest-exact' if is_seg else 'linear'
76
+ print(f"Interpolation mode: {mode}, Anisotropic axis mode: {aniso_axis_mode}")
77
+
78
+ do_separate_z, axis = determine_do_sep_z_and_axis(force_separate_z, current_spacing,
79
+ target_spacing, separate_z_anisotropy_threshold)
80
+ print(f"Do separate Z: {do_separate_z}, Axis: {axis}")
81
+
82
+ if preserve_range and not is_seg:
83
+ v_min, v_max = data.min(), data.max()
84
+ print(f"Preserving range for non-segmentation data: min={v_min.item():.4f}, max={v_max.item():.4f}")
85
+
86
+ torch.set_num_threads(num_threads)
87
+ print(f"Set number of threads to {num_threads}")
88
+
89
+ start_time = time.time()
90
+ if do_separate_z:
91
+ tmp = "xyz"
92
+ axis_letter = tmp[axis]
93
+ others_int = [i for i in range(3) if i != axis]
94
+ others = [tmp[i] for i in others_int]
95
+ print(f"Separate Z resampling along axis {axis_letter}, others: {others}")
96
+
97
+ tmp_new_shape = [new_shape[i] for i in others_int]
98
+ print(f"First pass: Resampling to shape {tmp_new_shape} for axes {others}")
99
+ data = rearrange(data, f"c x y z -> (c {axis_letter}) {others[0]} {others[1]}")
100
+ print(f"Rearranged data shape: {data.shape}")
101
+ data = _chunked_resample(data, tmp_new_shape, mode, chunk_size, device, is_seg)
102
+ print(f"After first pass resampling, shape: {data.shape}")
103
+
104
+ data = rearrange(data, f"(c {axis_letter}) {others[0]} {others[1]} -> c x y z",
105
+ **{axis_letter: data.shape[1], others[0]: tmp_new_shape[0], others[1]: tmp_new_shape[1]})
106
+ print(f"Rearranged back to shape: {data.shape}")
107
+ data = _chunked_resample(data, new_shape, aniso_axis_mode, chunk_size, device, is_seg)
108
+ print(f"After second pass resampling, final shape: {data.shape}")
109
+ else:
110
+ print(f"Direct resampling to shape: {new_shape}")
111
+ data = _chunked_resample(data, new_shape, mode, chunk_size, device, is_seg)
112
+ print(f"After direct resampling, final shape: {data.shape}")
113
+ resample_time = time.time() - start_time
114
+ print(f"Resampling completed in {resample_time:.3f}s")
115
+
116
+ if is_seg:
117
+ unique_values = torch.unique(data)
118
+ result_dtype = torch.int8 if max(unique_values) < 127 else torch.int16
119
+ data = data.round().to(result_dtype)
120
+ print(f"Segmentation data rounded and converted to {result_dtype}, unique values: {unique_values.tolist()}")
121
+
122
+ if preserve_range and not is_seg:
123
+ data = torch.clamp(data, v_min, v_max)
124
+ print(f"Clamped data to original range: min={v_min.item():.4f}, max={v_max.item():.4f}")
125
+
126
+ output = data.cpu().numpy() if input_was_numpy else data
127
+ print(f"Output shape: {output.shape}, type: {type(output)}")
128
+ return output
129
+
130
+ def _chunked_resample(
131
+ volume: torch.Tensor,
132
+ target_shape: Tuple[int, ...],
133
+ mode: str,
134
+ chunk_size: int,
135
+ device: torch.device,
136
+ is_seg: bool
137
+ ) -> torch.Tensor:
138
+ """Chunked resampling for large volumes with adaptive chunk sizing."""
139
+ print(f"\nStarting _chunked_resample with input shape: {volume.shape}, target shape: {target_shape}")
140
+ C, D, H, W = volume.shape
141
+ tD, tH, tW = target_shape
142
+
143
+ # Adaptive chunk size based on available memory
144
+ if device.type == 'cpu':
145
+ available_memory = psutil.virtual_memory().available / 1024**2 # in MB
146
+ else:
147
+ total_memory = torch.cuda.get_device_properties(device).total_memory / 1024**2 # in MB
148
+ allocated_memory = torch.cuda.memory_allocated(device) / 1024**2
149
+ available_memory = total_memory - allocated_memory
150
+
151
+ mem_per_voxel = volume.element_size() * volume.nelement() / volume.numel()
152
+ target_voxel_count = C * tD * tH * tW
153
+ chunk_mem_ratio = 0.5 if device.type == 'cpu' else 0.3
154
+ adaptive_chunk_size = max(
155
+ 32,
156
+ min(chunk_size, int((available_memory * chunk_mem_ratio / mem_per_voxel / C) ** (1/3)))
157
+ )
158
+
159
+ # Early return for small volumes
160
+ if D * H * W <= 128**3:
161
+ with torch.cuda.amp.autocast(enabled=not is_seg):
162
+ start_time = time.time()
163
+ # Cast to float for interpolation if is_seg and mode is nearest
164
+ input_tensor = volume.float() if is_seg and mode == 'nearest' else volume
165
+ result = F.interpolate(
166
+ input_tensor.unsqueeze(0),
167
+ size=target_shape,
168
+ mode=mode,
169
+ align_corners=False if mode != 'nearest' else None
170
+ ).squeeze(0)
171
+ # Convert back to original dtype for segmentation
172
+ if is_seg:
173
+ result = result.round().to(volume.dtype)
174
+ # print(f"Direct interpolation completed in {time.time() - start_time:.3f}s, output shape: {result.shape}")
175
+ return result
176
+
177
+ result = torch.zeros((C, tD, tH, tW), device=device, dtype=volume.dtype)
178
+
179
+ out_chunk_size = max(1, int(adaptive_chunk_size * min(tD/D, tH/H, tW/W)))
180
+
181
+ for c in range(C):
182
+ for z in range(0, tD, out_chunk_size):
183
+ z_end = min(z + out_chunk_size, tD)
184
+ for y in range(0, tH, out_chunk_size):
185
+ y_end = min(y + out_chunk_size, tH)
186
+ for x in range(0, tW, out_chunk_size):
187
+ x_end = min(x + out_chunk_size, tW)
188
+
189
+ in_z = max(0, int(z * D / tD) - 1)
190
+ in_z_end = min(D, int(z_end * D / tD) + 2)
191
+ in_y = max(0, int(y * H / tH) - 1)
192
+ in_y_end = min(H, int(y_end * H / tH) + 2)
193
+ in_x = max(0, int(x * W / tW) - 1)
194
+ in_x_end = min(W, int(x_end * W / tW) + 2)
195
+
196
+ chunk = volume[c:c+1, in_z:in_z_end, in_y:in_y_end, in_x:in_x_end]
197
+ chunk_target = (z_end - z, y_end - y, x_end - x)
198
+
199
+ with torch.cuda.amp.autocast(enabled=not is_seg):
200
+ start_time = time.time()
201
+ # Cast to float for interpolation if is_seg and mode is nearest
202
+ input_chunk = chunk.float() if is_seg and mode == 'nearest' else chunk
203
+ resampled_chunk = F.interpolate(
204
+ input_chunk.unsqueeze(0),
205
+ size=chunk_target,
206
+ mode=mode,
207
+ align_corners=False if mode != 'nearest' else None
208
+ ).squeeze(0)
209
+ # Convert back to original dtype for segmentation
210
+ if is_seg:
211
+ resampled_chunk = resampled_chunk.round().to(volume.dtype)
212
+ # print(f"Chunk interpolation completed in {time.time() - start_time:.3f}s, shape: {resampled_chunk.shape}")
213
+
214
+ result[c, z:z_end, y:y_end, x:x_end] = resampled_chunk
215
+ del chunk, resampled_chunk
216
+ if device.type == 'cuda':
217
+ torch.cuda.empty_cache()
218
+
219
+ return result
220
+
221
+ def resample_torch_simple(
222
+ data: Union[torch.Tensor, np.ndarray],
223
+ new_shape: Union[Tuple[int, ...], List[int], np.ndarray],
224
+ is_seg: bool = False,
225
+ num_threads: int = 4,
226
+ device: torch.device = torch.device('cpu'),
227
+ memefficient_seg_resampling: bool = False,
228
+ mode: str = 'linear'
229
+ ) -> Union[torch.Tensor, np.ndarray]:
230
+ if mode == 'linear':
231
+ torch_mode = 'trilinear' if data.ndim == 4 else 'bilinear'
232
+ else:
233
+ torch_mode = mode
234
+
235
+ if isinstance(new_shape, np.ndarray):
236
+ new_shape = [int(i) for i in new_shape]
237
+
238
+ if all([i == j for i, j in zip(new_shape, data.shape[1:])]):
239
+ return data
240
+
241
+ n_threads = torch.get_num_threads()
242
+ torch.set_num_threads(num_threads)
243
+ new_shape = tuple(new_shape)
244
+ with torch.no_grad():
245
+ input_was_numpy = isinstance(data, np.ndarray)
246
+ if input_was_numpy:
247
+ data = torch.from_numpy(data).to(device)
248
+ else:
249
+ orig_device = deepcopy(data.device)
250
+ data = data.to(device)
251
+
252
+ if is_seg:
253
+ unique_values = torch.unique(data)
254
+ result_dtype = torch.int8 if max(unique_values) < 127 else torch.int16
255
+ result = torch.zeros((data.shape[0], *new_shape), dtype=result_dtype, device=device)
256
+ if not memefficient_seg_resampling:
257
+ result_tmp = torch.zeros((len(unique_values), data.shape[0], *new_shape), dtype=torch.float16,
258
+ device=device)
259
+ scale_factor = 1000
260
+ done_mask = torch.zeros_like(result, dtype=torch.bool, device=device)
261
+ for i, u in enumerate(unique_values):
262
+ result_tmp[i] = F.interpolate((data[None] == u).float() * scale_factor, new_shape, mode=torch_mode,
263
+ antialias=False)[0]
264
+ mask = result_tmp[i] > (0.7 * scale_factor)
265
+ result[mask] = u.item()
266
+ done_mask |= mask
267
+ if not torch.all(done_mask):
268
+ result[~done_mask] = unique_values[result_tmp[:, ~done_mask].argmax(0)].to(result_dtype)
269
+ else:
270
+ for i, u in enumerate(unique_values):
271
+ if u == 0:
272
+ continue
273
+ result[F.interpolate((data[None] == u).float(), new_shape, mode=torch_mode, antialias=False)[0] > 0.5] = u
274
+ else:
275
+ result = F.interpolate(data[None].float(), new_shape, mode=torch_mode, antialias=False)[0]
276
+
277
+ if input_was_numpy:
278
+ result = result.cpu().numpy()
279
+ else:
280
+ result = result.to(orig_device)
281
+
282
+ torch.set_num_threads(n_threads)
283
+ return result
284
+
285
+ def resample_torch_fornnunet(
286
+ data: Union[torch.Tensor, np.ndarray],
287
+ new_shape: Union[Tuple[int, ...], List[int], np.ndarray],
288
+ current_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
289
+ new_spacing: Union[Tuple[float, ...], List[float], np.ndarray],
290
+ is_seg: bool = False,
291
+ num_threads: int = 4,
292
+ device: torch.device = torch.device('cpu'),
293
+ memefficient_seg_resampling: bool = False,
294
+ force_separate_z: Union[bool, None] = None,
295
+ separate_z_anisotropy_threshold: float = ANISO_THRESHOLD,
296
+ mode: str = 'linear',
297
+ aniso_axis_mode: str = 'nearest-exact'
298
+ ) -> Union[torch.Tensor, np.ndarray]:
299
+ assert data.ndim == 4, "data must be c, x, y, z"
300
+ new_shape = [int(i) for i in new_shape]
301
+ orig_shape = data.shape
302
+
303
+ do_separate_z, axis = determine_do_sep_z_and_axis(force_separate_z, current_spacing, new_spacing,
304
+ separate_z_anisotropy_threshold)
305
+
306
+ if do_separate_z:
307
+ was_numpy = isinstance(data, np.ndarray)
308
+ if was_numpy:
309
+ data = torch.from_numpy(data)
310
+
311
+ if isinstance(axis, list):
312
+ axis = axis[0]
313
+
314
+ tmp = "xyz"
315
+ axis_letter = tmp[axis]
316
+ others_int = [i for i in range(3) if i != axis]
317
+ others = [tmp[i] for i in others_int]
318
+
319
+ data = rearrange(data, f"c x y z -> (c {axis_letter}) {others[0]} {others[1]}")
320
+ tmp_new_shape = [new_shape[i] for i in others_int]
321
+ data = resample_torch_simple(data, tmp_new_shape, is_seg=is_seg, num_threads=num_threads, device=device,
322
+ memefficient_seg_resampling=memefficient_seg_resampling, mode=mode)
323
+ data = rearrange(data, f"(c {axis_letter}) {others[0]} {others[1]} -> c x y z",
324
+ **{axis_letter: orig_shape[axis + 1], others[0]: tmp_new_shape[0], others[1]: tmp_new_shape[1]})
325
+ data = resample_torch_simple(data, new_shape, is_seg=is_seg, num_threads=num_threads, device=device,
326
+ memefficient_seg_resampling=memefficient_seg_resampling, mode=aniso_axis_mode)
327
+ if was_numpy:
328
+ data = data.numpy()
329
+ return data
330
+ else:
331
+ return resample_torch_simple(data, new_shape, is_seg, num_threads, device, memefficient_seg_resampling)
332
+
333
+ def dice_score(pred: np.ndarray, true: np.ndarray) -> float:
334
+ """Compute Dice score for segmentation masks."""
335
+ pred = pred.flatten()
336
+ true = true.flatten()
337
+ intersection = np.sum(pred * true)
338
+ return (2. * intersection) / (np.sum(pred) + np.sum(true) + 1e-8)
339
+
340
+ # Placeholder for compute_new_shape if not provided
341
+ def compute_new_shape(original_shape, current_spacing, target_spacing):
342
+ """
343
+ Compute the new shape based on the spacing ratio.
344
+ original_shape: (z, y, x)
345
+ current_spacing: (z, y, x)
346
+ target_spacing: (z, y, x)
347
+ """
348
+ zoom_factors = [c / t for c, t in zip(current_spacing, target_spacing)]
349
+ new_shape = [int(round(s * z)) for s, z in zip(original_shape, zoom_factors)]
350
+ return tuple(new_shape)
351
+
352
+ # Function to save as NIfTI
353
+ def save_nii(array, spacing, output_path, is_seg=False):
354
+ """
355
+ Save numpy array as NIfTI file with specified spacing.
356
+ is_seg: If True, convert to int32 for segmentation masks.
357
+ """
358
+ # Convert torch tensor to numpy if necessary
359
+ if isinstance(array, torch.Tensor):
360
+ array = array.cpu().numpy()
361
+
362
+ # Convert data type for NIfTI compatibility
363
+ if is_seg:
364
+ array = array.astype(np.int32) # Convert segmentation to int32
365
+ else:
366
+ array = array.astype(np.float32) # Ensure image is float32
367
+
368
+ # Transpose to (X, Y, Z, C) for NIfTI
369
+ if array.ndim == 4:
370
+ array = array.transpose(2, 3, 1, 0) # From (C, Z, Y, X) to (X, Y, Z, C)
371
+ else:
372
+ array = array.transpose(2, 3, 1) # From (Z, Y, X) to (X, Y, Z)
373
+
374
+ # Create NIfTI image with affine based on spacing
375
+ affine = np.diag(list(spacing) + [1.0])
376
+ nii_img = nib.Nifti1Image(array, affine=affine)
377
+ nib.save(nii_img, output_path)
378
+ print(f"Saved: {output_path}")
379
+
380
+ # Main resampling function
381
+ def main():
382
+ torch.set_num_threads(4)
383
+ device = torch.device('cuda') #torch.device('cpu') # Force CPU as per provided code
384
+ print(f"\nRunning tests on device: {device}")
385
+
386
+ # Define paths
387
+ npz_file_path = "/media/shipc/hhd_8T/spc/code/CVPR2025_Text_guided_seg_submission/inputs/Microscopy_cremi_000_sc.npz"
388
+ gt_path = "/media/shipc/hhd_8T/spc/code/CVPR2025_Text_guided_seg_submission/gts/Microscopy_cremi_000_sc.npz"
389
+ output_dir = "/media/shipc/hhd_8T/spc/code/CVPR2025_Text_guided_seg_submission/workspace_teamx/outputs_test_resample"
390
+
391
+ # Ensure output directory exists
392
+ if not os.path.exists(output_dir):
393
+ os.makedirs(output_dir)
394
+
395
+ # Load input data
396
+ data = np.load(npz_file_path, allow_pickle=True)
397
+ img_array = data['imgs'] # Shape: (C, Z, Y, X) or (Z, Y, X)
398
+ img_spacing = data['spacing'] # (z, y, x)
399
+ img_spacing = [1.0, 1.0, 1.0] # Override as per provided code
400
+ gt_data = np.load(gt_path, allow_pickle=True)
401
+ gt_array = gt_data['gts'] # Shape: (C, Z, Y, X) or (Z, Y, X)
402
+
403
+ # Convert data types to PyTorch-compatible types
404
+ img_array = img_array.astype(np.float32) # Convert image to float32
405
+ gt_array = gt_array.astype(np.int32) # Convert segmentation mask to int32
406
+
407
+ # Ensure img_array and gt_array have channel dimension
408
+ if img_array.ndim == 3:
409
+ img_array = img_array[np.newaxis, ...] # Add channel dimension: (1, Z, Y, X)
410
+ if gt_array.ndim == 3:
411
+ gt_array = gt_array[np.newaxis, ...] # Add channel dimension: (1, Z, Y, X)
412
+
413
+ # Define target spacings to test
414
+ target_spacings = [
415
+ (1.2, 1.2, 1.2),
416
+ (1.5, 1.5, 1.5),
417
+ (2.0, 2.0, 2.0),
418
+ ]
419
+
420
+ # Original shape and spacing
421
+ original_shape = img_array.shape[1:] # (Z, Y, X)
422
+ current_spacing = img_spacing
423
+ print(f"\nOriginal image shape: {original_shape}, Current spacing (z,y,x): {current_spacing}")
424
+
425
+ for target_spacing in target_spacings:
426
+ print(f"\n=== Resampling to Target Spacing: {target_spacing} ===")
427
+
428
+ # Compute new shape
429
+ new_shape = compute_new_shape(original_shape, current_spacing, target_spacing)
430
+ print(f"Computed target shape: {new_shape}")
431
+
432
+ # === Image Resampling ===
433
+ print("\nResampling image...")
434
+
435
+ # Ground truth resampling
436
+ print("Computing ground truth with resample_torch_simple...")
437
+ start_time = time.time()
438
+ if device.type == 'cuda':
439
+ torch.cuda.synchronize() # Ensure GPU operations are complete
440
+ gt_img = resample_torch_simple(
441
+ img_array,
442
+ new_shape=new_shape,
443
+ is_seg=False,
444
+ num_threads=4,
445
+ device=device
446
+ )
447
+ if device.type == 'cuda':
448
+ torch.cuda.synchronize() # Ensure GPU operations are complete
449
+ gt_time = time.time() - start_time
450
+ output_path = os.path.join(output_dir, f"img_gt_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
451
+ print(f"Ground truth image shape: {gt_img.shape}, Time: {gt_time:.3f}s")
452
+ save_nii(gt_img, target_spacing, output_path, is_seg=False)
453
+
454
+ # Optimized resampling
455
+ print("Running optimized_3d_resample...")
456
+ start_time = time.time()
457
+ if device.type == 'cuda':
458
+ torch.cuda.synchronize()
459
+ mem_before = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
460
+ resampled_img_opt = optimized_3d_resample(
461
+ img_array,
462
+ current_spacing,
463
+ target_spacing,
464
+ is_seg=False,
465
+ device=device,
466
+ num_threads=4,
467
+ chunk_size=64
468
+ )
469
+ if device.type == 'cuda':
470
+ torch.cuda.synchronize()
471
+
472
+ opt_time = time.time() - start_time
473
+ mem_after = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
474
+ opt_mae = np.mean(np.abs(resampled_img_opt - gt_img))
475
+ output_path = os.path.join(output_dir, f"img_opt_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
476
+ print(f"Optimized image shape: {resampled_img_opt.shape}, Time: {opt_time:.3f}s, "
477
+ f"Memory used: {mem_after - mem_before:.2f} MB, MAE: {opt_mae:.6f}")
478
+ save_nii(resampled_img_opt, target_spacing, output_path, is_seg=False)
479
+
480
+ # Original resampling
481
+ print("Running resample_torch_fornnunet...")
482
+ start_time = time.time()
483
+ if device.type == 'cuda':
484
+ torch.cuda.synchronize()
485
+ mem_before = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
486
+ resampled_img_orig = resample_torch_fornnunet(
487
+ img_array,
488
+ new_shape,
489
+ current_spacing,
490
+ target_spacing,
491
+ is_seg=False,
492
+ num_threads=4,
493
+ device=device
494
+ )
495
+ if device.type == 'cuda':
496
+ torch.cuda.synchronize()
497
+ orig_time = time.time() - start_time
498
+ mem_after = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
499
+ orig_mae = np.mean(np.abs(resampled_img_orig - gt_img))
500
+ output_path = os.path.join(output_dir, f"img_orig_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
501
+ print(f"Original image shape: {resampled_img_orig.shape}, Time: {orig_time:.3f}s, "
502
+ f"Memory used: {mem_after - mem_before:.2f} MB, MAE: {orig_mae:.6f}")
503
+ save_nii(resampled_img_orig, target_spacing, output_path, is_seg=False)
504
+
505
+ # === Segmentation Mask Resampling ===
506
+ print("\nResampling segmentation mask...")
507
+
508
+ # Ground truth resampling
509
+ print("Computing ground truth with resample_torch_simple...")
510
+ start_time = time.time()
511
+ if device.type == 'cuda':
512
+ torch.cuda.synchronize()
513
+ gt_seg = resample_torch_simple(
514
+ gt_array,
515
+ new_shape=new_shape,
516
+ is_seg=True,
517
+ num_threads=4,
518
+ device=device
519
+ )
520
+ if device.type == 'cuda':
521
+ torch.cuda.synchronize()
522
+ gt_seg_time = time.time() - start_time
523
+ output_path = os.path.join(output_dir, f"seg_gt_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
524
+ print(f"Ground truth segmentation shape: {gt_seg.shape}, Time: {gt_seg_time:.3f}s")
525
+ save_nii(gt_seg, target_spacing, output_path, is_seg=True)
526
+
527
+ # Optimized resampling
528
+ print("Running optimized_3d_resample for segmentation...")
529
+ start_time = time.time()
530
+ if device.type == 'cuda':
531
+ torch.cuda.synchronize()
532
+ mem_before = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
533
+ resampled_seg_opt = optimized_3d_resample(
534
+ gt_array,
535
+ current_spacing,
536
+ target_spacing,
537
+ is_seg=True,
538
+ device=device,
539
+ num_threads=4,
540
+ chunk_size=64
541
+ )
542
+ if device.type == 'cuda':
543
+ torch.cuda.synchronize()
544
+
545
+ opt_seg_time = time.time() - start_time
546
+ mem_after = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
547
+ opt_dice = dice_score(resampled_seg_opt, gt_seg)
548
+ output_path = os.path.join(output_dir, f"seg_opt_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
549
+ print(f"Optimized segmentation shape: {resampled_seg_opt.shape}, Time: {opt_seg_time:.3f}s, "
550
+ f"Memory used: {mem_after - mem_before:.2f} MB, Dice: {opt_dice:.6f}")
551
+ save_nii(resampled_seg_opt, target_spacing, output_path, is_seg=True)
552
+
553
+ # Original resampling
554
+ print("Running resample_torch_fornnunet for segmentation...")
555
+ start_time = time.time()
556
+ if device.type == 'cuda':
557
+ torch.cuda.synchronize()
558
+ mem_before = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
559
+ resampled_seg_orig = resample_torch_fornnunet(
560
+ gt_array,
561
+ new_shape,
562
+ current_spacing,
563
+ target_spacing,
564
+ is_seg=True,
565
+ num_threads=4,
566
+ device=device
567
+ )
568
+ if device.type == 'cuda':
569
+ torch.cuda.synchronize()
570
+
571
+ orig_seg_time = time.time() - start_time
572
+ mem_after = psutil.virtual_memory().used / 1024**2 if device.type == 'cpu' else torch.cuda.memory_allocated(device) / 1024**2
573
+ orig_dice = dice_score(resampled_seg_orig, gt_seg)
574
+ output_path = os.path.join(output_dir, f"seg_orig_spacing_{target_spacing[0]}_{target_spacing[1]}_{target_spacing[2]}.nii.gz")
575
+ print(f"Original segmentation shape: {resampled_seg_orig.shape}, Time: {orig_seg_time:.3f}s, "
576
+ f"Memory used: {mem_after - mem_before:.2f} MB, Dice: {orig_dice:.6f}")
577
+ save_nii(resampled_seg_orig, target_spacing, output_path, is_seg=True)
578
+
579
+ # Summary
580
+ print(f"\n=== Summary for Target Spacing: {target_spacing} ===")
581
+ print("Image Resampling Metrics:")
582
+ print(f"Optimized - Shape: {resampled_img_opt.shape}, Time: {opt_time:.3f}s, MAE: {opt_mae:.6f}")
583
+ print(f"Original - Shape: {resampled_img_orig.shape}, Time: {orig_time:.3f}s, MAE: {orig_mae:.6f}")
584
+ print(f"Time Improvement: {(orig_time - opt_time) / orig_time * 100:.2f}%")
585
+ print(f"MAE Improvement: {(orig_mae - opt_mae) / orig_mae * 100:.2f}%")
586
+ print("Segmentation Mask Resampling Metrics:")
587
+ print(f"Optimized - Shape: {resampled_seg_opt.shape}, Time: {opt_seg_time:.3f}s, Dice: {opt_dice:.6f}")
588
+ print(f"Original - Shape: {resampled_seg_orig.shape}, Time: {orig_seg_time:.3f}s, Dice: {orig_dice:.6f}")
589
+ print(f"Time Improvement: {(orig_seg_time - opt_seg_time) / orig_seg_time * 100:.2f}%")
590
+ print(f"Dice Improvement: {(opt_dice - orig_dice) / orig_dice * 100:.2f}%")
591
+
592
+ if __name__ == '__main__':
593
+ main()
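For reference, the `dice_score` helper defined above is the standard binary Dice overlap; a tiny worked example with made-up masks:

```python
import numpy as np

pred = np.array([1, 1, 1, 0, 0])
true = np.array([0, 1, 1, 1, 0])
# |A| = 3, |B| = 3, |A ∩ B| = 2  ->  Dice = 2*2 / (3 + 3) ≈ 0.667
dice = 2.0 * np.sum(pred * true) / (np.sum(pred) + np.sum(true) + 1e-8)
print(round(float(dice), 3))  # 0.667
```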
environment.yml ADDED
@@ -0,0 +1,211 @@
1
+ name: medals_local_test
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1=main
8
+ - _openmp_mutex=5.1=1_gnu
9
+ - aom=3.12.1=h7934f7d_0
10
+ - blas=1.0=mkl
11
+ - brotlicffi=1.2.0.0=py310h7354ed3_0
12
+ - bzip2=1.0.8=h5eee18b_6
13
+ - ca-certificates=2025.12.2=h06a4308_0
14
+ - cairo=1.18.4=h44eff21_0
15
+ - certifi=2025.11.12=py310h06a4308_0
16
+ - cffi=2.0.0=py310h4eded50_1
17
+ - charset-normalizer=3.4.4=py310h06a4308_0
18
+ - cuda-cudart=12.1.105=0
19
+ - cuda-cupti=12.1.105=0
20
+ - cuda-libraries=12.1.0=0
21
+ - cuda-nvrtc=12.1.105=0
22
+ - cuda-nvtx=12.1.105=0
23
+ - cuda-opencl=12.9.19=0
24
+ - cuda-runtime=12.1.0=0
25
+ - cuda-version=12.9=3
26
+ - dav1d=1.2.1=h5eee18b_0
27
+ - expat=2.7.3=h7354ed3_4
28
+ - ffmpeg=6.1.1=hecf7045_5
29
+ - filelock=3.20.0=py310h06a4308_0
30
+ - fontconfig=2.15.0=h2c49b7f_0
31
+ - freetype=2.13.3=h4a9f257_0
32
+ - fribidi=1.0.10=h7b6447c_0
33
+ - giflib=5.2.2=h5eee18b_0
34
+ - gmp=6.3.0=h6a678d5_0
35
+ - gmpy2=2.2.2=py310ha78e65c_0
36
+ - graphite2=1.3.14=h295c915_1
37
+ - harfbuzz=10.2.0=hdfddeaa_1
38
+ - icu=73.1=h6a678d5_0
39
+ - idna=3.11=py310h06a4308_0
40
+ - intel-openmp=2025.0.0=h06a4308_1171
41
+ - jinja2=3.1.6=py310h06a4308_0
42
+ - jpeg=9f=h5ce9db8_0
43
+ - lame=3.100=h7b6447c_0
44
+ - lcms2=2.17=heab6991_0
45
+ - ld_impl_linux-64=2.44=h153f514_2
46
+ - leptonica=1.82.0=hfdeec58_3
47
+ - lerc=4.0.0=h6a678d5_0
48
+ - libarchive=3.8.2=h3ec8f01_0
49
+ - libavif=1.3.0=h3539ee5_0
50
+ - libcublas=12.1.0.26=0
51
+ - libcufft=11.0.2.4=0
52
+ - libcufile=1.14.1.1=4
53
+ - libcurand=10.3.10.19=0
54
+ - libcusolver=11.4.4.55=0
55
+ - libcusparse=12.0.2.55=0
56
+ - libdeflate=1.22=h5eee18b_0
57
+ - libexpat=2.7.3=h7354ed3_4
58
+ - libffi=3.4.4=h6a678d5_1
59
+ - libgcc=15.2.0=h69a1729_7
60
+ - libgcc-ng=15.2.0=h166f726_7
61
+ - libglib=2.84.4=h77a78f3_0
62
+ - libgomp=15.2.0=h4751f2c_7
63
+ - libhwloc=2.12.1=default_hf1bbc79_1000
64
+ - libiconv=1.16=h5eee18b_3
65
+ - libjpeg-turbo=2.0.0=h9bf148f_0
66
+ - libnpp=12.0.2.50=0
67
+ - libnsl=2.0.0=h5eee18b_0
68
+ - libnvjitlink=12.1.105=0
69
+ - libnvjpeg=12.1.1.14=0
70
+ - libogg=1.3.5=h27cfd23_1
71
+ - libopenjpeg=2.5.4=hee96239_1
72
+ - libopus=1.3.1=h5eee18b_1
73
+ - libpng=1.6.50=h2ed474d_0
74
+ - libstdcxx=15.2.0=h39759b7_7
75
+ - libstdcxx-ng=15.2.0=hc03a8fd_7
76
+ - libtheora=1.2.0=h32ad74f_1
77
+ - libtiff=4.7.1=h029b1ac_0
78
+ - libuuid=1.41.5=h5eee18b_0
79
+ - libvorbis=1.3.7=h7b6447c_0
80
+ - libvpx=1.15.2=h4cb591d_0
81
+ - libwebp=1.6.0=h089d785_0
82
+ - libwebp-base=1.6.0=hb7bb969_0
83
+ - libxcb=1.17.0=h9b100fa_0
84
+ - libxml2=2.13.9=h2c43086_0
85
+ - libzlib=1.3.1=hb25bd0a_0
86
+ - llvm-openmp=14.0.6=h9e868ea_0
87
+ - lz4-c=1.9.4=h6a678d5_1
88
+ - markupsafe=3.0.2=py310h5eee18b_0
89
+ - mkl=2025.0.0=hacee8c2_941
90
+ - mkl-service=2.5.2=py310hacdc0fc_0
91
+ - mkl_fft=2.1.1=py310h8fe796d_0
92
+ - mkl_random=1.3.0=py310h505adc9_0
93
+ - mpc=1.3.1=h5eee18b_0
94
+ - mpfr=4.2.1=h5eee18b_0
95
+ - mpmath=1.3.0=py310h06a4308_0
96
+ - ncurses=6.5=h7934f7d_0
97
+ - networkx=3.4.2=py310h06a4308_0
98
+ - ocl-icd=2.3.3=h47b2149_0
99
+ - opencl-headers=2025.07.22=hfb20e49_0
100
+ - openh264=2.6.0=he621ea3_0
101
+ - openjpeg=2.5.4=h4e0627c_1
102
+ - openssl=3.0.18=hd6dcaed_0
103
+ - pcre2=10.46=hf426167_0
104
+ - pillow=12.0.0=py310h3b88751_1
105
+ - pip=25.3=pyhc872135_0
106
+ - pixman=0.46.4=h7934f7d_0
107
+ - pthread-stubs=0.3=h0ce48e5_1
108
+ - pycparser=2.23=py310h06a4308_0
109
+ - pysocks=1.7.1=py310h06a4308_1
110
+ - python=3.10.19=h6fa692b_0
111
+ - pytorch-cuda=12.1=ha16c6d3_6
112
+ - pytorch-mutex=1.0=cuda
113
+ - pyyaml=6.0.3=py310h591646f_0
114
+ - readline=8.3=hc2a1206_0
115
+ - requests=2.32.5=py310h06a4308_1
116
+ - setuptools=80.9.0=py310h06a4308_0
117
+ - sqlite=3.51.0=h2a70700_0
118
+ - sympy=1.14.0=py310h06a4308_1
119
+ - tbb=2022.3.0=h698db13_0
120
+ - tbb-devel=2022.3.0=h698db13_0
121
+ - tesseract=5.2.0=hb0d2e87_3
122
+ - tk=8.6.15=h54e0aa7_0
123
+ - typing_extensions=4.15.0=py310h06a4308_0
124
+ - urllib3=2.6.1=py310h06a4308_0
125
+ - wheel=0.45.1=py310h06a4308_0
126
+ - xorg-libx11=1.8.12=h9b100fa_1
127
+ - xorg-libxau=1.0.12=h9b100fa_0
128
+ - xorg-libxdmcp=1.1.5=h9b100fa_0
129
+ - xorg-libxext=1.3.6=h9b100fa_0
130
+ - xorg-libxrender=0.9.12=h9b100fa_0
131
+ - xorg-xorgproto=2024.1=h5eee18b_1
132
+ - xz=5.6.4=h5eee18b_1
133
+ - yaml=0.2.5=h7b6447c_0
134
+ - zlib=1.3.1=hb25bd0a_0
135
+ - zstd=1.5.7=h11fc155_0
136
+ - pip:
137
+ - acvl-utils==0.2.5
138
+ - argparse==1.4.0
139
+ - batchgenerators==0.25.1
140
+ - blosc2==3.12.2
141
+ - connected-components-3d==3.26.1
142
+ - contourpy==1.3.2
143
+ - cycler==0.12.1
144
+ - dicom2nifti==2.6.2
145
+ - dynamic-network-architectures==0.2
146
+ - einops==0.8.1
147
+ - fonttools==4.61.1
148
+ - fsspec==2025.12.0
149
+ - future==1.0.0
150
+ - hf-xet==1.2.0
151
+ - huggingface-hub==0.36.0
152
+ - imagecodecs==2025.3.30
153
+ - imageio==2.37.2
154
+ - importlib-resources==6.5.2
155
+ - joblib==1.5.3
156
+ - kiwisolver==1.4.9
157
+ - lazy-loader==0.4
158
+ - linecache2==1.0.0
159
+ - matplotlib==3.10.8
160
+ - monai==1.4.0
161
+ - msgpack==1.1.2
162
+ - ndindex==1.10.1
163
+ - nibabel==5.3.2
164
+ - nnunetv2==2.4.1
165
+ - numexpr==2.14.1
166
+ - numpy==1.26.4
167
+ - nvidia-cublas-cu12==12.1.3.1
168
+ - nvidia-cuda-cupti-cu12==12.1.105
169
+ - nvidia-cuda-nvrtc-cu12==12.1.105
170
+ - nvidia-cuda-runtime-cu12==12.1.105
171
+ - nvidia-cudnn-cu12==8.9.2.26
172
+ - nvidia-cufft-cu12==11.0.2.54
173
+ - nvidia-curand-cu12==10.3.2.106
174
+ - nvidia-cusolver-cu12==11.4.5.107
175
+ - nvidia-cusparse-cu12==12.1.0.106
176
+ - nvidia-nccl-cu12==2.19.3
177
+ - nvidia-nvjitlink-cu12==12.9.86
178
+ - nvidia-nvtx-cu12==12.1.105
179
+ - packaging==25.0
180
+ - pandas==2.3.3
181
+ - platformdirs==4.5.1
182
+ - positional-encodings==6.0.3
183
+ - py-cpuinfo==9.0.0
184
+ - pydicom==3.0.1
185
+ - pyparsing==3.2.5
186
+ - python-dateutil==2.9.0.post0
187
+ - python-gdcm==3.2.2
188
+ - python-graphviz==0.21
189
+ - pytz==2025.2
190
+ - regex==2025.11.3
191
+ - safetensors==0.7.0
192
+ - scikit-image==0.25.2
193
+ - scikit-learn==1.7.2
194
+ - scipy==1.15.3
195
+ - seaborn==0.13.2
196
+ - simpleitk==2.5.3
197
+ - six==1.17.0
198
+ - threadpoolctl==3.6.0
199
+ - tifffile==2025.5.10
200
+ - tokenizers==0.21.4
201
+ - torch==2.2.0+cu121
202
+ - torchaudio==2.2.0+cu121
203
+ - torchvision==0.17.0+cu121
204
+ - tqdm==4.67.1
205
+ - traceback2==1.4.0
206
+ - transformers==4.51.3
207
+ - triton==2.2.0
208
+ - tzdata==2025.3
209
+ - unittest2==1.1.0
210
+ - yacs==0.1.8
211
+ prefix: /yinghepool/shipengcheng/.conda/envs/medals_local_test
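
For reference, a minimal sanity check of the pinned builds after the environment is created (this snippet is illustrative and not part of the uploaded files; the expected values are taken from the pins above):

    import torch
    import numpy as np
    import scipy

    print(torch.__version__)              # expected: 2.2.0+cu121
    print(torch.version.cuda)             # expected: 12.1
    print(torch.cuda.is_available())      # True on a machine with a matching driver
    print(np.__version__)                 # expected: 1.26.4
    print(scipy.__version__)              # expected: 1.15.3
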
evaluate/SurfaceDice.py ADDED
@@ -0,0 +1,492 @@
1
+ import numpy as np
2
+ import scipy.ndimage
3
+
4
+ # neighbour_code_to_normals is a lookup table.
5
+ # For every binary neighbour code
6
+ # (2x2x2 neighbourhood = 8 neighbours = 8 bits = 256 codes)
7
+ # it contains the surface normals of the triangles (called "surfel" for
8
+ # "surface element" in the following). The length of the normal
9
+ # vector encodes the surfel area.
10
+ #
11
+ # created by compute_surface_area_lookup_table.ipynb using the
12
+ # marching cubes algorithm, see e.g. https://en.wikipedia.org/wiki/Marching_cubes
13
+ # credit to: http://medicaldecathlon.com/files/Surface_distance_based_measures.ipynb
14
+ neighbour_code_to_normals = [
15
+ [[0,0,0]],
16
+ [[0.125,0.125,0.125]],
17
+ [[-0.125,-0.125,0.125]],
18
+ [[-0.25,-0.25,0.0],[0.25,0.25,-0.0]],
19
+ [[0.125,-0.125,0.125]],
20
+ [[-0.25,-0.0,-0.25],[0.25,0.0,0.25]],
21
+ [[0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
22
+ [[0.5,0.0,-0.0],[0.25,0.25,0.25],[0.125,0.125,0.125]],
23
+ [[-0.125,0.125,0.125]],
24
+ [[0.125,0.125,0.125],[-0.125,0.125,0.125]],
25
+ [[-0.25,0.0,0.25],[-0.25,0.0,0.25]],
26
+ [[0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
27
+ [[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
28
+ [[0.5,0.0,0.0],[0.25,-0.25,0.25],[-0.125,0.125,-0.125]],
29
+ [[-0.5,0.0,0.0],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
30
+ [[0.5,0.0,0.0],[0.5,0.0,0.0]],
31
+ [[0.125,-0.125,-0.125]],
32
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25]],
33
+ [[-0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
34
+ [[0.0,-0.5,0.0],[0.25,0.25,0.25],[0.125,0.125,0.125]],
35
+ [[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
36
+ [[0.0,0.0,-0.5],[0.25,0.25,0.25],[-0.125,-0.125,-0.125]],
37
+ [[-0.125,-0.125,0.125],[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
38
+ [[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25],[0.25,0.25,0.25],[0.125,0.125,0.125]],
39
+ [[-0.125,0.125,0.125],[0.125,-0.125,-0.125]],
40
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25],[-0.125,0.125,0.125]],
41
+ [[-0.25,0.0,0.25],[-0.25,0.0,0.25],[0.125,-0.125,-0.125]],
42
+ [[0.125,0.125,0.125],[0.375,0.375,0.375],[0.0,-0.25,0.25],[-0.25,0.0,0.25]],
43
+ [[0.125,-0.125,-0.125],[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
44
+ [[0.375,0.375,0.375],[0.0,0.25,-0.25],[-0.125,-0.125,-0.125],[-0.25,0.25,0.0]],
45
+ [[-0.5,0.0,0.0],[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25],[0.125,0.125,0.125]],
46
+ [[-0.5,0.0,0.0],[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25]],
47
+ [[0.125,-0.125,0.125]],
48
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125]],
49
+ [[0.0,-0.25,0.25],[0.0,0.25,-0.25]],
50
+ [[0.0,-0.5,0.0],[0.125,0.125,-0.125],[0.25,0.25,-0.25]],
51
+ [[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
52
+ [[0.125,-0.125,0.125],[-0.25,-0.0,-0.25],[0.25,0.0,0.25]],
53
+ [[0.0,-0.25,0.25],[0.0,0.25,-0.25],[0.125,-0.125,0.125]],
54
+ [[-0.375,-0.375,0.375],[-0.0,0.25,0.25],[0.125,0.125,-0.125],[-0.25,-0.0,-0.25]],
55
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125]],
56
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,0.125,0.125]],
57
+ [[-0.0,0.0,0.5],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
58
+ [[0.25,0.25,-0.25],[0.25,0.25,-0.25],[0.125,0.125,-0.125],[-0.125,-0.125,0.125]],
59
+ [[0.125,-0.125,0.125],[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
60
+ [[0.5,0.0,0.0],[0.25,-0.25,0.25],[-0.125,0.125,-0.125],[0.125,-0.125,0.125]],
61
+ [[0.0,0.25,-0.25],[0.375,-0.375,-0.375],[-0.125,0.125,0.125],[0.25,0.25,0.0]],
62
+ [[-0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
63
+ [[0.25,-0.25,0.0],[-0.25,0.25,0.0]],
64
+ [[0.0,0.5,0.0],[-0.25,0.25,0.25],[0.125,-0.125,-0.125]],
65
+ [[0.0,0.5,0.0],[0.125,-0.125,0.125],[-0.25,0.25,-0.25]],
66
+ [[0.0,0.5,0.0],[0.0,-0.5,0.0]],
67
+ [[0.25,-0.25,0.0],[-0.25,0.25,0.0],[0.125,-0.125,0.125]],
68
+ [[-0.375,-0.375,-0.375],[-0.25,0.0,0.25],[-0.125,-0.125,-0.125],[-0.25,0.25,0.0]],
69
+ [[0.125,0.125,0.125],[0.0,-0.5,0.0],[-0.25,-0.25,-0.25],[-0.125,-0.125,-0.125]],
70
+ [[0.0,-0.5,0.0],[-0.25,-0.25,-0.25],[-0.125,-0.125,-0.125]],
71
+ [[-0.125,0.125,0.125],[0.25,-0.25,0.0],[-0.25,0.25,0.0]],
72
+ [[0.0,0.5,0.0],[0.25,0.25,-0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
73
+ [[-0.375,0.375,-0.375],[-0.25,-0.25,0.0],[-0.125,0.125,-0.125],[-0.25,0.0,0.25]],
74
+ [[0.0,0.5,0.0],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
75
+ [[0.25,-0.25,0.0],[-0.25,0.25,0.0],[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
76
+ [[-0.25,-0.25,0.0],[-0.25,-0.25,0.0],[-0.125,-0.125,0.125]],
77
+ [[0.125,0.125,0.125],[-0.25,-0.25,0.0],[-0.25,-0.25,0.0]],
78
+ [[-0.25,-0.25,0.0],[-0.25,-0.25,0.0]],
79
+ [[-0.125,-0.125,0.125]],
80
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125]],
81
+ [[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
82
+ [[-0.125,-0.125,0.125],[-0.25,-0.25,0.0],[0.25,0.25,-0.0]],
83
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25]],
84
+ [[0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125]],
85
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25],[-0.125,-0.125,0.125]],
86
+ [[0.375,-0.375,0.375],[0.0,-0.25,-0.25],[-0.125,0.125,-0.125],[0.25,0.25,0.0]],
87
+ [[-0.125,-0.125,0.125],[-0.125,0.125,0.125]],
88
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[-0.125,0.125,0.125]],
89
+ [[-0.125,-0.125,0.125],[-0.25,0.0,0.25],[-0.25,0.0,0.25]],
90
+ [[0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
91
+ [[-0.0,0.5,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125]],
92
+ [[-0.25,0.25,-0.25],[-0.25,0.25,-0.25],[-0.125,0.125,-0.125],[-0.125,0.125,-0.125]],
93
+ [[-0.25,0.0,-0.25],[0.375,-0.375,-0.375],[0.0,0.25,-0.25],[-0.125,0.125,0.125]],
94
+ [[0.5,0.0,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125]],
95
+ [[-0.25,0.0,0.25],[0.25,0.0,-0.25]],
96
+ [[-0.0,0.0,0.5],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
97
+ [[-0.125,-0.125,0.125],[-0.25,0.0,0.25],[0.25,0.0,-0.25]],
98
+ [[-0.25,-0.0,-0.25],[-0.375,0.375,0.375],[-0.25,-0.25,0.0],[-0.125,0.125,0.125]],
99
+ [[0.0,0.0,-0.5],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
100
+ [[-0.0,0.0,0.5],[0.0,0.0,0.5]],
101
+ [[0.125,0.125,0.125],[0.125,0.125,0.125],[0.25,0.25,0.25],[0.0,0.0,0.5]],
102
+ [[0.125,0.125,0.125],[0.25,0.25,0.25],[0.0,0.0,0.5]],
103
+ [[-0.25,0.0,0.25],[0.25,0.0,-0.25],[-0.125,0.125,0.125]],
104
+ [[-0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
105
+ [[-0.25,0.0,0.25],[-0.25,0.0,0.25],[-0.25,0.0,0.25],[0.25,0.0,-0.25]],
106
+ [[0.125,-0.125,0.125],[0.25,0.0,0.25],[0.25,0.0,0.25]],
107
+ [[0.25,0.0,0.25],[-0.375,-0.375,0.375],[-0.25,0.25,0.0],[-0.125,-0.125,0.125]],
108
+ [[-0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125]],
109
+ [[0.125,0.125,0.125],[0.25,0.0,0.25],[0.25,0.0,0.25]],
110
+ [[0.25,0.0,0.25],[0.25,0.0,0.25]],
111
+ [[-0.125,-0.125,0.125],[0.125,-0.125,0.125]],
112
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[0.125,-0.125,0.125]],
113
+ [[-0.125,-0.125,0.125],[0.0,-0.25,0.25],[0.0,0.25,-0.25]],
114
+ [[0.0,-0.5,0.0],[0.125,0.125,-0.125],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
115
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25],[0.125,-0.125,0.125]],
116
+ [[0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
117
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25],[0.0,-0.25,0.25],[0.0,0.25,-0.25]],
118
+ [[0.0,0.25,0.25],[0.0,0.25,0.25],[0.125,-0.125,-0.125]],
119
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
120
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,-0.125,0.125],[0.125,0.125,0.125]],
121
+ [[-0.0,0.0,0.5],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
122
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
123
+ [[-0.0,0.5,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
124
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
125
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25],[0.125,0.125,0.125]],
126
+ [[0.125,0.125,0.125],[0.125,-0.125,-0.125]],
127
+ [[0.5,0.0,-0.0],[0.25,-0.25,-0.25],[0.125,-0.125,-0.125]],
128
+ [[-0.25,0.25,0.25],[-0.125,0.125,0.125],[-0.25,0.25,0.25],[0.125,-0.125,-0.125]],
129
+ [[0.375,-0.375,0.375],[0.0,0.25,0.25],[-0.125,0.125,-0.125],[-0.25,0.0,0.25]],
130
+ [[0.0,-0.5,0.0],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
131
+ [[-0.375,-0.375,0.375],[0.25,-0.25,0.0],[0.0,0.25,0.25],[-0.125,-0.125,0.125]],
132
+ [[-0.125,0.125,0.125],[-0.25,0.25,0.25],[0.0,0.0,0.5]],
133
+ [[0.125,0.125,0.125],[0.0,0.25,0.25],[0.0,0.25,0.25]],
134
+ [[0.0,0.25,0.25],[0.0,0.25,0.25]],
135
+ [[0.5,0.0,-0.0],[0.25,0.25,0.25],[0.125,0.125,0.125],[0.125,0.125,0.125]],
136
+ [[0.125,-0.125,0.125],[-0.125,-0.125,0.125],[0.125,0.125,0.125]],
137
+ [[-0.25,-0.0,-0.25],[0.25,0.0,0.25],[0.125,0.125,0.125]],
138
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125]],
139
+ [[-0.25,-0.25,0.0],[0.25,0.25,-0.0],[0.125,0.125,0.125]],
140
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125]],
141
+ [[0.125,0.125,0.125],[0.125,0.125,0.125]],
142
+ [[0.125,0.125,0.125]],
143
+ [[0.125,0.125,0.125]],
144
+ [[0.125,0.125,0.125],[0.125,0.125,0.125]],
145
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125]],
146
+ [[-0.25,-0.25,0.0],[0.25,0.25,-0.0],[0.125,0.125,0.125]],
147
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125]],
148
+ [[-0.25,-0.0,-0.25],[0.25,0.0,0.25],[0.125,0.125,0.125]],
149
+ [[0.125,-0.125,0.125],[-0.125,-0.125,0.125],[0.125,0.125,0.125]],
150
+ [[0.5,0.0,-0.0],[0.25,0.25,0.25],[0.125,0.125,0.125],[0.125,0.125,0.125]],
151
+ [[0.0,0.25,0.25],[0.0,0.25,0.25]],
152
+ [[0.125,0.125,0.125],[0.0,0.25,0.25],[0.0,0.25,0.25]],
153
+ [[-0.125,0.125,0.125],[-0.25,0.25,0.25],[0.0,0.0,0.5]],
154
+ [[-0.375,-0.375,0.375],[0.25,-0.25,0.0],[0.0,0.25,0.25],[-0.125,-0.125,0.125]],
155
+ [[0.0,-0.5,0.0],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
156
+ [[0.375,-0.375,0.375],[0.0,0.25,0.25],[-0.125,0.125,-0.125],[-0.25,0.0,0.25]],
157
+ [[-0.25,0.25,0.25],[-0.125,0.125,0.125],[-0.25,0.25,0.25],[0.125,-0.125,-0.125]],
158
+ [[0.5,0.0,-0.0],[0.25,-0.25,-0.25],[0.125,-0.125,-0.125]],
159
+ [[0.125,0.125,0.125],[0.125,-0.125,-0.125]],
160
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25],[0.125,0.125,0.125]],
161
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
162
+ [[-0.0,0.5,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
163
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
164
+ [[-0.0,0.0,0.5],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
165
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,-0.125,0.125],[0.125,0.125,0.125]],
166
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
167
+ [[0.0,0.25,0.25],[0.0,0.25,0.25],[0.125,-0.125,-0.125]],
168
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25],[0.0,0.25,0.25],[0.0,0.25,0.25]],
169
+ [[0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
170
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25],[0.125,-0.125,0.125]],
171
+ [[0.0,-0.5,0.0],[0.125,0.125,-0.125],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
172
+ [[-0.125,-0.125,0.125],[0.0,-0.25,0.25],[0.0,0.25,-0.25]],
173
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[0.125,-0.125,0.125]],
174
+ [[-0.125,-0.125,0.125],[0.125,-0.125,0.125]],
175
+ [[0.25,0.0,0.25],[0.25,0.0,0.25]],
176
+ [[0.125,0.125,0.125],[0.25,0.0,0.25],[0.25,0.0,0.25]],
177
+ [[-0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125]],
178
+ [[0.25,0.0,0.25],[-0.375,-0.375,0.375],[-0.25,0.25,0.0],[-0.125,-0.125,0.125]],
179
+ [[0.125,-0.125,0.125],[0.25,0.0,0.25],[0.25,0.0,0.25]],
180
+ [[-0.25,-0.0,-0.25],[0.25,0.0,0.25],[0.25,0.0,0.25],[0.25,0.0,0.25]],
181
+ [[-0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
182
+ [[-0.25,0.0,0.25],[0.25,0.0,-0.25],[-0.125,0.125,0.125]],
183
+ [[0.125,0.125,0.125],[0.25,0.25,0.25],[0.0,0.0,0.5]],
184
+ [[0.125,0.125,0.125],[0.125,0.125,0.125],[0.25,0.25,0.25],[0.0,0.0,0.5]],
185
+ [[-0.0,0.0,0.5],[0.0,0.0,0.5]],
186
+ [[0.0,0.0,-0.5],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
187
+ [[-0.25,-0.0,-0.25],[-0.375,0.375,0.375],[-0.25,-0.25,0.0],[-0.125,0.125,0.125]],
188
+ [[-0.125,-0.125,0.125],[-0.25,0.0,0.25],[0.25,0.0,-0.25]],
189
+ [[-0.0,0.0,0.5],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
190
+ [[-0.25,0.0,0.25],[0.25,0.0,-0.25]],
191
+ [[0.5,0.0,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125]],
192
+ [[-0.25,0.0,-0.25],[0.375,-0.375,-0.375],[0.0,0.25,-0.25],[-0.125,0.125,0.125]],
193
+ [[-0.25,0.25,-0.25],[-0.25,0.25,-0.25],[-0.125,0.125,-0.125],[-0.125,0.125,-0.125]],
194
+ [[-0.0,0.5,0.0],[-0.25,0.25,-0.25],[0.125,-0.125,0.125]],
195
+ [[0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
196
+ [[-0.125,-0.125,0.125],[-0.25,0.0,0.25],[-0.25,0.0,0.25]],
197
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125],[-0.125,0.125,0.125]],
198
+ [[-0.125,-0.125,0.125],[-0.125,0.125,0.125]],
199
+ [[0.375,-0.375,0.375],[0.0,-0.25,-0.25],[-0.125,0.125,-0.125],[0.25,0.25,0.0]],
200
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25],[-0.125,-0.125,0.125]],
201
+ [[0.0,0.0,0.5],[0.25,-0.25,0.25],[0.125,-0.125,0.125]],
202
+ [[0.0,-0.25,0.25],[0.0,-0.25,0.25]],
203
+ [[-0.125,-0.125,0.125],[-0.25,-0.25,0.0],[0.25,0.25,-0.0]],
204
+ [[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
205
+ [[0.125,0.125,0.125],[-0.125,-0.125,0.125]],
206
+ [[-0.125,-0.125,0.125]],
207
+ [[-0.25,-0.25,0.0],[-0.25,-0.25,0.0]],
208
+ [[0.125,0.125,0.125],[-0.25,-0.25,0.0],[-0.25,-0.25,0.0]],
209
+ [[-0.25,-0.25,0.0],[-0.25,-0.25,0.0],[-0.125,-0.125,0.125]],
210
+ [[-0.25,-0.25,0.0],[-0.25,-0.25,0.0],[-0.25,-0.25,0.0],[0.25,0.25,-0.0]],
211
+ [[0.0,0.5,0.0],[0.25,0.25,-0.25],[-0.125,-0.125,0.125]],
212
+ [[-0.375,0.375,-0.375],[-0.25,-0.25,0.0],[-0.125,0.125,-0.125],[-0.25,0.0,0.25]],
213
+ [[0.0,0.5,0.0],[0.25,0.25,-0.25],[-0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
214
+ [[-0.125,0.125,0.125],[0.25,-0.25,0.0],[-0.25,0.25,0.0]],
215
+ [[0.0,-0.5,0.0],[-0.25,-0.25,-0.25],[-0.125,-0.125,-0.125]],
216
+ [[0.125,0.125,0.125],[0.0,-0.5,0.0],[-0.25,-0.25,-0.25],[-0.125,-0.125,-0.125]],
217
+ [[-0.375,-0.375,-0.375],[-0.25,0.0,0.25],[-0.125,-0.125,-0.125],[-0.25,0.25,0.0]],
218
+ [[0.25,-0.25,0.0],[-0.25,0.25,0.0],[0.125,-0.125,0.125]],
219
+ [[0.0,0.5,0.0],[0.0,-0.5,0.0]],
220
+ [[0.0,0.5,0.0],[0.125,-0.125,0.125],[-0.25,0.25,-0.25]],
221
+ [[0.0,0.5,0.0],[-0.25,0.25,0.25],[0.125,-0.125,-0.125]],
222
+ [[0.25,-0.25,0.0],[-0.25,0.25,0.0]],
223
+ [[-0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
224
+ [[0.0,0.25,-0.25],[0.375,-0.375,-0.375],[-0.125,0.125,0.125],[0.25,0.25,0.0]],
225
+ [[0.5,0.0,0.0],[0.25,-0.25,0.25],[-0.125,0.125,-0.125],[0.125,-0.125,0.125]],
226
+ [[0.125,-0.125,0.125],[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
227
+ [[0.25,0.25,-0.25],[0.25,0.25,-0.25],[0.125,0.125,-0.125],[-0.125,-0.125,0.125]],
228
+ [[-0.0,0.0,0.5],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
229
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125],[-0.125,0.125,0.125]],
230
+ [[-0.125,0.125,0.125],[0.125,-0.125,0.125]],
231
+ [[-0.375,-0.375,0.375],[-0.0,0.25,0.25],[0.125,0.125,-0.125],[-0.25,-0.0,-0.25]],
232
+ [[0.0,-0.25,0.25],[0.0,0.25,-0.25],[0.125,-0.125,0.125]],
233
+ [[0.125,-0.125,0.125],[-0.25,-0.0,-0.25],[0.25,0.0,0.25]],
234
+ [[0.125,-0.125,0.125],[0.125,-0.125,0.125]],
235
+ [[0.0,-0.5,0.0],[0.125,0.125,-0.125],[0.25,0.25,-0.25]],
236
+ [[0.0,-0.25,0.25],[0.0,0.25,-0.25]],
237
+ [[0.125,0.125,0.125],[0.125,-0.125,0.125]],
238
+ [[0.125,-0.125,0.125]],
239
+ [[-0.5,0.0,0.0],[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25]],
240
+ [[-0.5,0.0,0.0],[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25],[0.125,0.125,0.125]],
241
+ [[0.375,0.375,0.375],[0.0,0.25,-0.25],[-0.125,-0.125,-0.125],[-0.25,0.25,0.0]],
242
+ [[0.125,-0.125,-0.125],[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
243
+ [[0.125,0.125,0.125],[0.375,0.375,0.375],[0.0,-0.25,0.25],[-0.25,0.0,0.25]],
244
+ [[-0.25,0.0,0.25],[-0.25,0.0,0.25],[0.125,-0.125,-0.125]],
245
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25],[-0.125,0.125,0.125]],
246
+ [[-0.125,0.125,0.125],[0.125,-0.125,-0.125]],
247
+ [[-0.125,-0.125,-0.125],[-0.25,-0.25,-0.25],[0.25,0.25,0.25],[0.125,0.125,0.125]],
248
+ [[-0.125,-0.125,0.125],[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
249
+ [[0.0,0.0,-0.5],[0.25,0.25,0.25],[-0.125,-0.125,-0.125]],
250
+ [[0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
251
+ [[0.0,-0.5,0.0],[0.25,0.25,0.25],[0.125,0.125,0.125]],
252
+ [[-0.125,-0.125,0.125],[0.125,-0.125,-0.125]],
253
+ [[0.0,-0.25,-0.25],[0.0,0.25,0.25]],
254
+ [[0.125,-0.125,-0.125]],
255
+ [[0.5,0.0,0.0],[0.5,0.0,0.0]],
256
+ [[-0.5,0.0,0.0],[-0.25,0.25,0.25],[-0.125,0.125,0.125]],
257
+ [[0.5,0.0,0.0],[0.25,-0.25,0.25],[-0.125,0.125,-0.125]],
258
+ [[0.25,-0.25,0.0],[0.25,-0.25,0.0]],
259
+ [[0.5,0.0,0.0],[-0.25,-0.25,0.25],[-0.125,-0.125,0.125]],
260
+ [[-0.25,0.0,0.25],[-0.25,0.0,0.25]],
261
+ [[0.125,0.125,0.125],[-0.125,0.125,0.125]],
262
+ [[-0.125,0.125,0.125]],
263
+ [[0.5,0.0,-0.0],[0.25,0.25,0.25],[0.125,0.125,0.125]],
264
+ [[0.125,-0.125,0.125],[-0.125,-0.125,0.125]],
265
+ [[-0.25,-0.0,-0.25],[0.25,0.0,0.25]],
266
+ [[0.125,-0.125,0.125]],
267
+ [[-0.25,-0.25,0.0],[0.25,0.25,-0.0]],
268
+ [[-0.125,-0.125,0.125]],
269
+ [[0.125,0.125,0.125]],
270
+ [[0,0,0]]]
271
+
272
+
273
+ def compute_surface_distances(mask_gt, mask_pred, spacing_mm):
274
+ """Compute closest distances from all surface points to the other surface.
275
+
276
+ Finds all surface elements "surfels" in the ground truth mask `mask_gt` and
277
+ the predicted mask `mask_pred`, computes their area in mm^2 and the distance
278
+ to the closest point on the other surface. It returns two sorted lists of
279
+ distances together with the corresponding surfel areas. If one of the masks
280
+ is empty, the corresponding lists are empty and all distances in the other
281
+ list are `inf`
282
+
283
+ Args:
284
+ mask_gt: 3-dim Numpy array of type bool. The ground truth mask.
285
+ mask_pred: 3-dim Numpy array of type bool. The predicted mask.
286
+ spacing_mm: 3-element list-like structure. Voxel spacing in x0, x1 and x2
287
+ direction
288
+
289
+ Returns:
290
+ A dict with
291
+ "distances_gt_to_pred": 1-dim numpy array of type float. The distances in mm
292
+ from all ground truth surface elements to the predicted surface,
293
+ sorted from smallest to largest
294
+ "distances_pred_to_gt": 1-dim numpy array of type float. The distances in mm
295
+ from all predicted surface elements to the ground truth surface,
296
+ sorted from smallest to largest
297
+ "surfel_areas_gt": 1-dim numpy array of type float. The area in mm^2 of
298
+ the ground truth surface elements in the same order as
299
+ distances_gt_to_pred
300
+ "surfel_areas_pred": 1-dim numpy array of type float. The area in mm^2 of
301
+ the predicted surface elements in the same order as
302
+ distances_pred_to_gt
303
+
304
+ """
305
+
306
+ # compute the area for all 256 possible surface elements
307
+ # (given a 2x2x2 neighbourhood) according to the spacing_mm
308
+ neighbour_code_to_surface_area = np.zeros([256])
309
+ for code in range(256):
310
+ normals = np.array(neighbour_code_to_normals[code])
311
+ sum_area = 0
312
+ for normal_idx in range(normals.shape[0]):
313
+ # normal vector
314
+ n = np.zeros([3])
315
+ n[0] = normals[normal_idx,0] * spacing_mm[1] * spacing_mm[2]
316
+ n[1] = normals[normal_idx,1] * spacing_mm[0] * spacing_mm[2]
317
+ n[2] = normals[normal_idx,2] * spacing_mm[0] * spacing_mm[1]
318
+ area = np.linalg.norm(n)
319
+ sum_area += area
320
+ neighbour_code_to_surface_area[code] = sum_area
321
+
322
+ # compute the bounding box of the masks to trim
323
+ # the volume to the smallest possible processing subvolume
324
+ mask_all = mask_gt | mask_pred
325
+ bbox_min = np.zeros(3, np.int64)
326
+ bbox_max = np.zeros(3, np.int64)
327
+
328
+ # max projection to the x0-axis
329
+ proj_0 = np.max(np.max(mask_all, axis=2), axis=1)
330
+ idx_nonzero_0 = np.nonzero(proj_0)[0]
331
+ if len(idx_nonzero_0) == 0:
332
+ return {"distances_gt_to_pred": np.array([]),
333
+ "distances_pred_to_gt": np.array([]),
334
+ "surfel_areas_gt": np.array([]),
335
+ "surfel_areas_pred": np.array([])}
336
+
337
+ bbox_min[0] = np.min(idx_nonzero_0)
338
+ bbox_max[0] = np.max(idx_nonzero_0)
339
+
340
+ # max projection to the x1-axis
341
+ proj_1 = np.max(np.max(mask_all, axis=2), axis=0)
342
+ idx_nonzero_1 = np.nonzero(proj_1)[0]
343
+ bbox_min[1] = np.min(idx_nonzero_1)
344
+ bbox_max[1] = np.max(idx_nonzero_1)
345
+
346
+ # max projection to the x2-axis
347
+ proj_2 = np.max(np.max(mask_all, axis=1), axis=0)
348
+ idx_nonzero_2 = np.nonzero(proj_2)[0]
349
+ bbox_min[2] = np.min(idx_nonzero_2)
350
+ bbox_max[2] = np.max(idx_nonzero_2)
351
+
352
+ # print("bounding box min = {}".format(bbox_min))
353
+ # print("bounding box max = {}".format(bbox_max))
354
+
355
+ # crop the processing subvolume.
356
+ # we need to zeropad the cropped region with 1 voxel at the lower,
357
+ # the right and the back side. This is required to obtain the "full"
358
+ # convolution result with the 2x2x2 kernel
359
+ cropmask_gt = np.zeros((bbox_max - bbox_min)+2, np.uint8)
360
+ cropmask_pred = np.zeros((bbox_max - bbox_min)+2, np.uint8)
361
+
362
+ cropmask_gt[0:-1, 0:-1, 0:-1] = mask_gt[bbox_min[0]:bbox_max[0]+1,
363
+ bbox_min[1]:bbox_max[1]+1,
364
+ bbox_min[2]:bbox_max[2]+1]
365
+
366
+ cropmask_pred[0:-1, 0:-1, 0:-1] = mask_pred[bbox_min[0]:bbox_max[0]+1,
367
+ bbox_min[1]:bbox_max[1]+1,
368
+ bbox_min[2]:bbox_max[2]+1]
369
+
370
+ # compute the neighbour code (local binary pattern) for each voxel
371
+ # the resulting arrays are spatially shifted by minus half a voxel in each axis.
372
+ # i.e. the points are located at the corners of the original voxels
373
+ kernel = np.array([[[128,64],
374
+ [32,16]],
375
+ [[8,4],
376
+ [2,1]]])
377
+ neighbour_code_map_gt = scipy.ndimage.correlate(cropmask_gt.astype(np.uint8), kernel, mode="constant", cval=0)
378
+ neighbour_code_map_pred = scipy.ndimage.correlate(cropmask_pred.astype(np.uint8), kernel, mode="constant", cval=0)
379
+
380
+ # create masks with the surface voxels
381
+ borders_gt = ((neighbour_code_map_gt != 0) & (neighbour_code_map_gt != 255))
382
+ borders_pred = ((neighbour_code_map_pred != 0) & (neighbour_code_map_pred != 255))
383
+
384
+ # compute the distance transform (closest distance of each voxel to the surface voxels)
385
+ if borders_gt.any():
386
+ distmap_gt = scipy.ndimage.distance_transform_edt(~borders_gt, sampling=spacing_mm)
387
+ else:
388
+ distmap_gt = np.inf * np.ones(borders_gt.shape)
389
+
390
+ if borders_pred.any():
391
+ distmap_pred = scipy.ndimage.distance_transform_edt(~borders_pred, sampling=spacing_mm)
392
+ else:
393
+ distmap_pred = np.inf * np.ones(borders_pred.shape)
394
+
395
+ # compute the area of each surface element
396
+ surface_area_map_gt = neighbour_code_to_surface_area[neighbour_code_map_gt]
397
+ surface_area_map_pred = neighbour_code_to_surface_area[neighbour_code_map_pred]
398
+
399
+ # create a list of all surface elements with distance and area
400
+ distances_gt_to_pred = distmap_pred[borders_gt]
401
+ distances_pred_to_gt = distmap_gt[borders_pred]
402
+ surfel_areas_gt = surface_area_map_gt[borders_gt]
403
+ surfel_areas_pred = surface_area_map_pred[borders_pred]
404
+
405
+ # sort them by distance
406
+ if distances_gt_to_pred.shape != (0,):
407
+ sorted_surfels_gt = np.array(sorted(zip(distances_gt_to_pred, surfel_areas_gt)))
408
+ distances_gt_to_pred = sorted_surfels_gt[:,0]
409
+ surfel_areas_gt = sorted_surfels_gt[:,1]
410
+
411
+ if distances_pred_to_gt.shape != (0,):
412
+ sorted_surfels_pred = np.array(sorted(zip(distances_pred_to_gt, surfel_areas_pred)))
413
+ distances_pred_to_gt = sorted_surfels_pred[:,0]
414
+ surfel_areas_pred = sorted_surfels_pred[:,1]
415
+
416
+
417
+ return {"distances_gt_to_pred": distances_gt_to_pred,
418
+ "distances_pred_to_gt": distances_pred_to_gt,
419
+ "surfel_areas_gt": surfel_areas_gt,
420
+ "surfel_areas_pred": surfel_areas_pred}
421
+
422
+
423
+ def compute_average_surface_distance(surface_distances):
424
+ distances_gt_to_pred = surface_distances["distances_gt_to_pred"]
425
+ distances_pred_to_gt = surface_distances["distances_pred_to_gt"]
426
+ surfel_areas_gt = surface_distances["surfel_areas_gt"]
427
+ surfel_areas_pred = surface_distances["surfel_areas_pred"]
428
+ average_distance_gt_to_pred = np.sum( distances_gt_to_pred * surfel_areas_gt) / np.sum(surfel_areas_gt)
429
+ average_distance_pred_to_gt = np.sum( distances_pred_to_gt * surfel_areas_pred) / np.sum(surfel_areas_pred)
430
+ return (average_distance_gt_to_pred, average_distance_pred_to_gt)
431
+
432
+ def compute_robust_hausdorff(surface_distances, percent):
433
+ distances_gt_to_pred = surface_distances["distances_gt_to_pred"]
434
+ distances_pred_to_gt = surface_distances["distances_pred_to_gt"]
435
+ surfel_areas_gt = surface_distances["surfel_areas_gt"]
436
+ surfel_areas_pred = surface_distances["surfel_areas_pred"]
437
+ if len(distances_gt_to_pred) > 0:
438
+ surfel_areas_cum_gt = np.cumsum(surfel_areas_gt) / np.sum(surfel_areas_gt)
439
+ idx = np.searchsorted(surfel_areas_cum_gt, percent/100.0)
440
+ perc_distance_gt_to_pred = distances_gt_to_pred[min(idx, len(distances_gt_to_pred)-1)]
441
+ else:
442
+ perc_distance_gt_to_pred = np.inf
443
+
444
+ if len(distances_pred_to_gt) > 0:
445
+ surfel_areas_cum_pred = np.cumsum(surfel_areas_pred) / np.sum(surfel_areas_pred)
446
+ idx = np.searchsorted(surfel_areas_cum_pred, percent/100.0)
447
+ perc_distance_pred_to_gt = distances_pred_to_gt[min(idx, len(distances_pred_to_gt)-1)]
448
+ else:
449
+ perc_distance_pred_to_gt = np.inf
450
+
451
+ return max( perc_distance_gt_to_pred, perc_distance_pred_to_gt)
452
+
453
+ def compute_surface_overlap_at_tolerance(surface_distances, tolerance_mm):
454
+ distances_gt_to_pred = surface_distances["distances_gt_to_pred"]
455
+ distances_pred_to_gt = surface_distances["distances_pred_to_gt"]
456
+ surfel_areas_gt = surface_distances["surfel_areas_gt"]
457
+ surfel_areas_pred = surface_distances["surfel_areas_pred"]
458
+ rel_overlap_gt = np.sum(surfel_areas_gt[distances_gt_to_pred <= tolerance_mm]) / np.sum(surfel_areas_gt)
459
+ rel_overlap_pred = np.sum(surfel_areas_pred[distances_pred_to_gt <= tolerance_mm]) / np.sum(surfel_areas_pred)
460
+ return (rel_overlap_gt, rel_overlap_pred)
461
+
462
+ def compute_surface_dice_at_tolerance(surface_distances, tolerance_mm):
463
+ distances_gt_to_pred = surface_distances["distances_gt_to_pred"]
464
+ distances_pred_to_gt = surface_distances["distances_pred_to_gt"]
465
+ surfel_areas_gt = surface_distances["surfel_areas_gt"]
466
+ surfel_areas_pred = surface_distances["surfel_areas_pred"]
467
+ overlap_gt = np.sum(surfel_areas_gt[distances_gt_to_pred <= tolerance_mm])
468
+ overlap_pred = np.sum(surfel_areas_pred[distances_pred_to_gt <= tolerance_mm])
469
+ surface_dice = (overlap_gt + overlap_pred) / (
470
+ np.sum(surfel_areas_gt) + np.sum(surfel_areas_pred))
471
+ return surface_dice
472
+
473
+
474
+ def compute_dice_coefficient(mask_gt, mask_pred):
475
+ """Compute soerensen-dice coefficient.
476
+
477
+ compute the soerensen-dice coefficient between the ground truth mask `mask_gt`
478
+ and the predicted mask `mask_pred`.
479
+
480
+ Args:
481
+ mask_gt: 3-dim Numpy array of type bool. The ground truth mask.
482
+ mask_pred: 3-dim Numpy array of type bool. The predicted mask.
483
+
484
+ Returns:
485
+ the dice coefficient as float. If both masks are empty, the result is NaN.
486
+ """
487
+ volume_sum = mask_gt.sum() + mask_pred.sum()
488
+ if volume_sum == 0:
489
+ return np.nan
490
+ volume_intersect = (mask_gt & mask_pred).sum()
491
+ return 2*volume_intersect / volume_sum
492
+
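
The helpers above follow the Medical Decathlon surface-distance recipe credited in the header: compute_surface_distances builds the sorted surfel distance/area lists once, and the other functions reduce them to NSD, robust Hausdorff, or volumetric Dice. A minimal usage sketch (toy masks, assuming the repository root is on PYTHONPATH):

    import numpy as np
    from evaluate.SurfaceDice import (compute_surface_distances,
                                      compute_surface_dice_at_tolerance,
                                      compute_robust_hausdorff,
                                      compute_dice_coefficient)

    # toy 3D masks, made up purely for illustration
    gt = np.zeros((32, 32, 16), dtype=bool)
    pred = np.zeros((32, 32, 16), dtype=bool)
    gt[8:20, 8:20, 4:10] = True
    pred[9:21, 8:20, 4:10] = True

    sd = compute_surface_distances(gt, pred, spacing_mm=(1.0, 1.0, 3.0))
    print(compute_surface_dice_at_tolerance(sd, tolerance_mm=1.0))  # NSD at 1 mm
    print(compute_robust_hausdorff(sd, percent=95))                 # robust (95%) Hausdorff in mm
    print(compute_dice_coefficient(gt, pred))                       # volumetric Dice
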
evaluate/__init__.py ADDED
File without changes
evaluate/evaluator.py ADDED
@@ -0,0 +1,379 @@
1
+ import os
2
+ import time
3
+
4
+ import torch
5
+ from torch.cuda.amp import autocast as autocast
6
+ from tqdm import tqdm
7
+ from einops import rearrange, repeat, reduce
8
+ import numpy as np
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ import nibabel as nib
12
+ import shutil
13
+ import pickle
14
+ from scipy.ndimage import gaussian_filter
15
+ import torch.distributed as dist
16
+
17
+ from evaluate.metric import calculate_metric_percase
18
+ from evaluate.merge_after_evaluate import merge
19
+ from train.dist import is_master
20
+
21
+ def compute_gaussian(tile_size, sigma_scale: float = 1. / 8, value_scaling_factor: float = 10, dtype=np.float16):
22
+ tmp = np.zeros(tile_size)
23
+ center_coords = [i // 2 for i in tile_size]
24
+ sigmas = [i * sigma_scale for i in tile_size]
25
+ tmp[tuple(center_coords)] = 1
26
+ gaussian_importance_map = gaussian_filter(tmp, sigmas, 0, mode='constant', cval=0)
27
+
28
+ # gaussian_importance_map = torch.from_numpy(gaussian_importance_map)
29
+
30
+ gaussian_importance_map = gaussian_importance_map / np.max(gaussian_importance_map) * value_scaling_factor
31
+ gaussian_importance_map = gaussian_importance_map.astype(dtype)
32
+
33
+ # gaussian_importance_map cannot be 0, otherwise we may end up with nans!
34
+ gaussian_importance_map[gaussian_importance_map == 0] = np.min(
35
+ gaussian_importance_map[gaussian_importance_map != 0])
36
+
37
+ return gaussian_importance_map
38
+
39
+ def evaluate(model,
40
+ text_encoder,
41
+ device,
42
+ testset,
43
+ testloader,
44
+ dice_score,
45
+ nsd_score,
46
+ csv_path,
47
+ resume,
48
+ save_interval,
49
+ visualization):
50
+
51
+ # whether to store pred, gt and img (as nii.gz)
52
+ if visualization:
53
+ nib_dir = csv_path.replace('.csv', '')
54
+
55
+ # collate in master process
56
+ if is_master():
57
+ # datasets --> labels --> metrics
58
+ datasets_labels_metrics = {} # {'COVID19':{'covid19_infection':{'dice':[0.8, 0.9, ...], ...} ...}, ...}
59
+
60
+ # datasets --> samples --> labels --> metrics
61
+ samples_labels_metrics = {} # {'COVID19':{'0.npy':{'covid19_infection':{'dice':0.8, ...} ...}, ...} records every sample (row) in each dataset
62
+
63
+ # datasets --> labels
64
+ datasets_labels_sets = {} # {'COVID19':set('covid19_infection', ...), ...} records the label types (columns) in each dataset
65
+
66
+ # accumulate scores of each sample in each process
67
+ results_of_samples = [] # each element : [dataset_name, modality, sample_id, scores_of_labels(dict), label_names]
68
+
69
+ # load results from an interrupted eval (only in master process)
70
+ if resume and is_master():
71
+ root_dir = os.path.dirname(csv_path)
72
+ prefix = os.path.basename(csv_path).replace('.csv', '_tmp_rank') # xxx/test/step_xxx.csv --> step_xxx_tmp_rank
73
+ pkl_to_del = []
74
+ for f in os.listdir(root_dir):
75
+ if prefix in f:
76
+ # load list of results
77
+ pkl_path = f'{root_dir}/{f}'
78
+ with open(pkl_path, 'rb') as f:
79
+ results_of_samples += pickle.load(f)
80
+ print(f'Load results from {pkl_path}')
81
+ pkl_to_del.append(pkl_path)
82
+
83
+ # there may be duplication? We leave the deduplication to the final merge
84
+ # merge all the loaded samples, del the tmp pickle files in previous evaluation task
85
+ for pkl_path in pkl_to_del:
86
+ os.remove(pkl_path)
87
+ print(f'Del {pkl_path}')
88
+ merge_pkl = csv_path.replace('.csv', f'_tmp_rank0.pkl')
89
+ with open(merge_pkl, 'wb') as f:
90
+ pickle.dump(results_of_samples, f)
91
+ print(f'Load results of {len(results_of_samples)} samples, Merge into {merge_pkl}')
92
+
93
+ model.eval()
94
+ text_encoder.eval()
95
+
96
+ with torch.no_grad():
97
+
98
+ data_time = 0
99
+ pred_time = 0
100
+ metric_time = 0
101
+
102
+ avg_patch_batch_num = 0
103
+ avg_query_batch_num = 0
104
+
105
+ # in ddp, only master process display the progress bar
106
+ if is_master():
107
+ testloader = tqdm(testloader, disable=False)
108
+ else:
109
+ testloader = tqdm(testloader, disable=True)
110
+
111
+ # gaussian kernel to accumulate predictions
112
+ gaussian = torch.tensor(compute_gaussian((288, 288, 96))).to(device) # hwd
113
+
114
+ end_time = time.time()
115
+ for sample in testloader: # in evaluation/inference, a "batch" in loader is a volume
116
+ # data loading
117
+ dataset_name = sample['dataset_name']
118
+ sample_id = sample['sample_id']
119
+ batched_patches = sample['batched_patches']
120
+ batched_y1y2_x1x2_z1z2 = sample['batched_y1y2_x1x2_z1z2']
121
+ labels = sample['labels']
122
+ gt_segmentation = sample['gt_segmentation'].numpy() # n h w d
123
+ modality = sample['modality']
124
+ image_path = sample['image_path']
125
+
126
+ n,h,w,d = gt_segmentation.shape
127
+ prediction = torch.zeros((n, h, w, d))
128
+ accumulation = torch.zeros((n, h, w, d))
129
+
130
+ data_time += (time.time()-end_time)
131
+ end_time = time.time()
132
+
133
+ with autocast():
134
+
135
+ queries = text_encoder(labels, modality)
136
+
137
+ # for each batch of patches, query with all labels
138
+ for patches, y1y2_x1x2_z1z2_ls in zip(batched_patches, batched_y1y2_x1x2_z1z2): # [b, c, h, w, d]
139
+ patches = patches.to(device=device)
140
+ prediction_patch = model(queries=queries, image_input=patches, train_mode=False)
141
+ prediction_patch = torch.sigmoid(prediction_patch) # bnhwd
142
+ prediction_patch = prediction_patch.detach() # .cpu().numpy()
143
+
144
+ # fill in
145
+ for b in range(len(y1y2_x1x2_z1z2_ls)):
146
+ y1, y2, x1, x2, z1, z2 = y1y2_x1x2_z1z2_ls[b]
147
+
148
+ # gaussian accumulation
149
+ tmp = prediction_patch[b, :, :y2-y1, :x2-x1, :z2-z1] * gaussian[:y2-y1, :x2-x1, :z2-z1] # on gpu
150
+ prediction[:, y1:y2, x1:x2, z1:z2] += tmp.cpu()
151
+ accumulation[:, y1:y2, x1:x2, z1:z2] += gaussian[:y2-y1, :x2-x1, :z2-z1].cpu()
152
+
153
+ pred_time += (time.time()-end_time)
154
+ end_time = time.time()
155
+
156
+ # avg
157
+ prediction = prediction / accumulation
158
+ prediction = torch.where(prediction>0.5, 1.0, 0.0)
159
+ prediction = prediction.numpy()
160
+
161
+ # cal metrics : [{'dice':x, ...}, ...]
162
+ scores = []
163
+ for j in range(len(labels)):
164
+ scores.append(calculate_metric_percase(prediction[j, :, :, :], gt_segmentation[j, :, :, :], dice_score, nsd_score)) # {'dice':0.9, 'nsd':0.8} one dict per label
165
+
166
+ # visualization
167
+ if visualization:
168
+ Path(f'{nib_dir}/{dataset_name}').mkdir(exist_ok=True, parents=True)
169
+ # save the image, gt and prediction
170
+ results = np.zeros((h, w, d)) # hwd
171
+ for j, label in enumerate(labels):
172
+ results += prediction[j, :, :, :] * (j+1) # 0 --> 1 (skip background)
173
+ Path(f'{nib_dir}/{dataset_name}/seg_{sample_id}').mkdir(exist_ok=True, parents=True)
174
+ # one separate nii.gz per label
175
+ segobj = nib.nifti2.Nifti1Image(prediction[j, :, :, :], np.eye(4))
176
+ nib.save(segobj, f'{nib_dir}/{dataset_name}/seg_{sample_id}/{label}.nii.gz')
177
+ segobj = nib.nifti2.Nifti1Image(results, np.eye(4))
178
+ nib.save(segobj, f'{nib_dir}/{dataset_name}/seg_{sample_id}.nii.gz')
179
+
180
+ image = testset.load_image(image_path)
181
+ image = np.squeeze(image)
182
+ imgobj = nib.nifti2.Nifti1Image(image, np.eye(4))
183
+ nib.save(imgobj, f'{nib_dir}/{dataset_name}/img_{sample_id}.nii.gz')
184
+
185
+ gt = np.zeros((h, w, d)) # hwd
186
+ for j, label in enumerate(labels):
187
+ gt += gt_segmentation[j, :, :, :] * (j+1) # 0 --> 1 (skip background)
188
+ Path(f'{nib_dir}/{dataset_name}/gt_{sample_id}').mkdir(exist_ok=True, parents=True)
189
+ # one separate nii.gz per label
190
+ segobj = nib.nifti2.Nifti1Image(gt_segmentation[j, :, :, :], np.eye(4))
191
+ nib.save(segobj, f'{nib_dir}/{dataset_name}/gt_{sample_id}/{label}.nii.gz')
192
+ gtobj = nib.nifti2.Nifti1Image(gt, np.eye(4))
193
+ nib.save(gtobj, f'{nib_dir}/{dataset_name}/gt_{sample_id}.nii.gz')
194
+
195
+ metric_time += (time.time()-end_time)
196
+ end_time = time.time()
197
+
198
+ # accumulate
199
+ results_of_samples.append([dataset_name, modality, sample_id, scores, labels])
200
+
201
+ # save in each process regularly in case of interruption
202
+ if len(results_of_samples) % save_interval == 0:
203
+ with open(csv_path.replace('.csv', f'_tmp_rank{dist.get_rank()}.pkl'), 'wb') as f:
204
+ pickle.dump(results_of_samples, f)
205
+
206
+ """
207
+ # gather results from all device to rank-0 (solution 1)
208
+ gather_results = [None for i in range(dist.get_world_size())]
209
+ dist.gather_object(
210
+ results_of_samples,
211
+ gather_results if dist.get_rank() == 0 else None,
212
+ dst = 0
213
+ )
214
+
215
+ if int(dist.get_rank()) == 0:
216
+ results_of_samples = [tmp for ls in results_of_samples for tmp in ls]
217
+ """
218
+
219
+ avg_patch_batch_num /= len(testloader)
220
+ avg_query_batch_num /= len(testloader)
221
+ data_time /= len(testloader)
222
+ pred_time /= len(testloader)
223
+ metric_time /= len(testloader)
224
+ print(f'On Rank {dist.get_rank()}, each sample has {avg_patch_batch_num} batch of patches and {avg_query_batch_num} batch of queries, Data Time: {data_time}, Pred Time: {pred_time}, Dice Time: {metric_time}')
225
+
226
+ torch.cuda.empty_cache()
227
+
228
+ # save in each process (to a fnl pickle, also denoting this process ends)
229
+ with open(csv_path.replace('.csv', f'_fnl_rank{dist.get_rank()}.pkl'), 'wb') as f:
230
+ pickle.dump(results_of_samples, f)
231
+
232
+ # gather and record in rank 0 (solution 2)
233
+ if is_master():
234
+
235
+ # detect the finish of each process
236
+ while True:
237
+ all_process_finished = True
238
+ for rank_id in range(torch.distributed.get_world_size()):
239
+ if not os.path.exists(csv_path.replace('.csv', f'_fnl_rank{rank_id}.pkl')): # xxx_tmp_rankx.pkl
240
+ all_process_finished = False
241
+ break
242
+ if all_process_finished:
243
+ break
244
+ else:
245
+ time.sleep(10)
246
+
247
+ # read results of each process (samples may be duplicated due to the even distribution of ddp, check)
248
+ results_of_samples = []
249
+ for rank_id in range(torch.distributed.get_world_size()):
250
+ fnl_results_file = csv_path.replace('.csv', f'_fnl_rank{rank_id}.pkl')
251
+ tmp_results_file = csv_path.replace('.csv', f'_tmp_rank{rank_id}.pkl')
252
+ with open(fnl_results_file, 'rb') as f:
253
+ results_of_samples += pickle.load(f)
254
+ os.remove(fnl_results_file)
255
+ if os.path.exists(tmp_results_file):
256
+ os.remove(tmp_results_file)
257
+
258
+ # check duplication
259
+ unique_set = set()
260
+ deduplicated_results_of_samples = []
261
+ for dataset_name, modality, sample_id, scores, labels in results_of_samples:
262
+ if f'{dataset_name}/{sample_id}' not in unique_set:
263
+ unique_set.add(f'{dataset_name}/{sample_id}')
264
+ deduplicated_results_of_samples.append([dataset_name, modality, sample_id, scores, labels])
265
+ results_of_samples = deduplicated_results_of_samples
266
+
267
+ # save for tmp
268
+ with open(csv_path.replace('.csv', '.pkl'), 'wb') as f:
269
+ pickle.dump(results_of_samples, f)
270
+
271
+ # collate results
272
+ for dataset_name, modality, sample_id, scores, labels in results_of_samples: # [[dataset_name, modality, sample_id, scores_of_labels(dict), label_names], ...]
273
+ dataset_name = f'{dataset_name}({modality})'
274
+
275
+ if dataset_name not in datasets_labels_metrics:
276
+ datasets_labels_metrics[dataset_name] = {} # {'COVID19(CT)':{}}
277
+ if dataset_name not in datasets_labels_sets:
278
+ datasets_labels_sets[dataset_name] = set() # {'COVID19(CT)':set()}
279
+ if dataset_name not in samples_labels_metrics:
280
+ samples_labels_metrics[dataset_name] = {}
281
+ samples_labels_metrics[dataset_name][sample_id] = {} # {'COVID19(CT)':{'0':{}}}
282
+
283
+ for metric_dict, label in zip(scores, labels):
284
+ # accumulate metrics (per dataset, per class)
285
+ # {'COVID19(CT)':{'covid19_infection':{'dice':[0.8, 0.9, ...], 'nsd':[0.8, 0.9, ...], ...} ...}, ...}
286
+ if label not in datasets_labels_metrics[dataset_name]:
287
+ datasets_labels_metrics[dataset_name][label] = {k:[v] for k,v in metric_dict.items()}
288
+ else:
289
+ for k,v in metric_dict.items():
290
+ datasets_labels_metrics[dataset_name][label][k].append(v)
291
+
292
+ # statistic labels
293
+ # {'COVID19(CT)':set('covid19_infection', ...)}
294
+ if label not in datasets_labels_sets[dataset_name]:
295
+ datasets_labels_sets[dataset_name].add(label)
296
+
297
+ # record metrics (per dataset, per sample, per class)
298
+ # {'COVID19':{'0.npy':{'covid19_infection':{'dice':0.8, 'nsd':0.9, ...} ...}, ...}
299
+ samples_labels_metrics[dataset_name][sample_id][label] = {k:v for k,v in metric_dict.items()}
300
+
301
+ # average and log (columns are metrics, e.g. dice, nsd, ...)
302
+ # create a df like:
303
+ # {
304
+ # 'TotalSegmentator': [0.xx, 0.xx, ...] # before the transpose (.T), this is a column
305
+ # 'TotalSegmentator, Lung': [0.68, 0.72, ...]
306
+ # }
307
+ # by default, print the dice (1st metric) of each dataset
308
+ info = 'Metrics of Each Dataset:\n'
309
+ avg_df = {}
310
+ for dataset in datasets_labels_metrics.keys():
311
+ avg_df[dataset] = {k:[] for k in metric_dict.keys()} # 'TotalSegmentator(CT)': {'dice':[0.8, ...] 'nsd':[0.5, ...], ...}
312
+ for label in datasets_labels_metrics[dataset].keys():
313
+ avg_df[f'{dataset}, {label}'] = []
314
+ for metric in datasets_labels_metrics[dataset][label].keys():
315
+ label_metric = np.average(datasets_labels_metrics[dataset][label][metric])
316
+ avg_df[f'{dataset}, {label}'].append(label_metric) # 'TotalSegmentator, Lung': [0.68, 0.72, ...] list of num_metrics
317
+ avg_df[dataset][metric].append(label_metric)
318
+ avg_df[dataset] = {k:np.average(v) for k,v in avg_df[dataset].items()} # 'TotalSegmentator': {'dice':[0.8, ...] 'nsd':[0.5, ...], ...} --> 'TotalSegmentator': {'dice':0.x, 'nsd':0.x, ...}
319
+ info += f'{dataset} | '
320
+ for k ,v in avg_df[dataset].items():
321
+ info += f'{v}({k}) | '
322
+ info += '\n'
323
+ avg_df[dataset] = list(avg_df[dataset].values())
324
+ avg_df = pd.DataFrame(avg_df).T
325
+ avg_df.columns = list(metric_dict.keys()) # ['dice', 'nsd']
326
+ avg_df.to_csv(csv_path)
327
+ print(info)
328
+
329
+ # detailed log (nsd and dice, columns are class labels)
330
+ # multi-sheet, two for each dataset
331
+ df_list = [['summary', avg_df]]
332
+ for dataset, label_set in datasets_labels_sets.items():
333
+ metric_df ={}
334
+ if dice_score:
335
+ metric_df['dice'] = {}
336
+ if nsd_score:
337
+ metric_df['nsd'] = {}
338
+
339
+ # create dfs like:
340
+ # {
341
+ # '0.npy': [0.xx, 0.xx, ...]
342
+ # ......
343
+ # }
344
+
345
+ # {'COVID19':{'0.npy':{'covid19_infection':{'dice':0.8, ...} ...}, ...}
346
+ for image_id, label_dict in samples_labels_metrics[dataset].items():
347
+ for metric in metric_df:
348
+ tmp = [] # one dice for each label in this dataset
349
+ for label in label_set:
350
+ score = label_dict[label][metric] if label in label_dict else -1
351
+ tmp.append(score)
352
+ metric_df[metric][image_id] = tmp
353
+
354
+ for metric, metric_df in metric_df.items():
355
+ metric_df = pd.DataFrame(metric_df).T
356
+ metric_df.columns = list(label_set)
357
+ df_list.append([dataset+f'({metric})', metric_df])
358
+
359
+ xlsx_path = csv_path.replace('.csv', '.xlsx')
360
+ with pd.ExcelWriter(xlsx_path) as writer:
361
+ for name, df in df_list:
362
+ # write each DataFrame to its own sheet (sheet name must be < 31 chars)
363
+ if len(name) > 31:
364
+ name = name[len(name)-31:]
365
+ df.to_excel(writer, sheet_name=name, index=True)
366
+
367
+ # avg_dice_over_merged_labels, avg_nsd_over_merged_labels = merge(region_split_json, label_statistic_json, xlsx_path, xlsx_path)
368
+
369
+ os.remove(csv_path.replace('.csv', '.pkl'))
370
+
371
+ else:
372
+
373
+ pass
374
+
375
+ # avg_dice_over_merged_labels = avg_nsd_over_merged_labels = 0
376
+
377
+ return # avg_dice_over_merged_labels, avg_nsd_over_merged_labels
378
+
379
+
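
evaluate() blends overlapping patch predictions with the centre-peaked weight from compute_gaussian before thresholding at 0.5. Below is a self-contained sketch of that accumulation (patch size, grid and "predictions" are made up; only the weighting scheme mirrors the code above):

    from itertools import product
    import numpy as np
    from scipy.ndimage import gaussian_filter

    def gaussian_weight(tile_size, sigma_scale=1. / 8):
        # same idea as compute_gaussian(): a centre-peaked, strictly positive weight
        tmp = np.zeros(tile_size)
        tmp[tuple(i // 2 for i in tile_size)] = 1
        g = gaussian_filter(tmp, [i * sigma_scale for i in tile_size], 0, mode='constant', cval=0)
        g = g / g.max()
        g[g == 0] = g[g != 0].min()   # never zero, so the division below stays finite
        return g.astype(np.float32)

    patch = (96, 96, 32)                                   # hypothetical patch size
    vol = np.zeros((1, 160, 160, 48), dtype=np.float32)    # weighted probability accumulator
    acc = np.zeros_like(vol)                               # weight accumulator
    g = gaussian_weight(patch)

    rng = np.random.default_rng(0)
    for y1, x1, z1 in product((0, 64), (0, 64), (0, 16)):  # grid that tiles the whole volume
        p = rng.random((1,) + patch).astype(np.float32)    # stand-in for sigmoid(model(patch))
        vol[:, y1:y1+patch[0], x1:x1+patch[1], z1:z1+patch[2]] += p * g
        acc[:, y1:y1+patch[0], x1:x1+patch[1], z1:z1+patch[2]] += g

    prob = vol / acc                                       # Gaussian-weighted average over overlaps
    mask = (prob > 0.5).astype(np.uint8)                   # hard mask, as in evaluate()
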
evaluate/merge_after_evaluate.py ADDED
@@ -0,0 +1,198 @@
1
+ import json
2
+
3
+ import pandas as pd
4
+ import openpyxl
5
+
6
+ def merge(mod_label_json, mod_label_statistic, xlsx2load, xlsx2save):
7
+ mod_lab2dice = {}
8
+
9
+ # Load the first sheet of the Excel file
10
+ excel_file_path = xlsx2load
11
+ df = pd.read_excel(excel_file_path, sheet_name=0)
12
+ has_nsd = len(df.columns) > 2
13
+
14
+ # 将Dataset Merged 写入新的工作表
15
+ workbook = openpyxl.load_workbook(xlsx2load)
16
+ new_sheet = workbook.create_sheet(title='Dataset Merge', index=1)
17
+ new_sheet.cell(row=1, column=1, value='Dataset')
18
+ new_sheet.cell(row=1, column=2, value='Dice')
19
+ new_sheet.cell(row=1, column=3, value='NSD')
20
+ row = 2
21
+ for i in range(0, len(df)):
22
+ if ',' not in df.iloc[i, 0]:
23
+ new_sheet.cell(row=row, column=1, value=df.iloc[i, 0])
24
+ new_sheet.cell(row=row, column=2, value=df.iloc[i, 1])
25
+ if has_nsd:
26
+ new_sheet.cell(row=row, column=3, value=df.iloc[i, 2])
27
+ row += 1
28
+
29
+ # with pd.ExcelWriter(xlsx2save, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
30
+ # filtered_df.to_excel(writer, sheet_name='Dataset Merge', index=False)
31
+
32
+ # 选取前两列
33
+ dataset_label_ls = df.iloc[:, 0]
34
+ dice_ls = df.iloc[:, 1]
35
+ nsd_ls = df.iloc[:, 2] if has_nsd else [0] * len(df)
36
+
37
+ for dataset_modality_label, dice, nsd in zip(dataset_label_ls, dice_ls, nsd_ls): # MSD_Pancreas(ct), pancreas 0.89
38
+ if ', ' not in dataset_modality_label:
39
+ continue
40
+ dataset_modality, label = dataset_modality_label.split(', ')
41
+ label = label.lower() # pancreas
42
+ # label = merge_label(label)
43
+ modality = dataset_modality.split('(')[-1].split(')')[0] # ct
44
+
45
+ # unique id : modality_label
46
+ mod_lab = f'{modality}_{label}'
47
+
48
+ # accumulate : dice and where the dice comes from (dataset, label, modality)
49
+ if mod_lab not in mod_lab2dice:
50
+ mod_lab2dice[mod_lab] = {'dice':[], 'nsd':[], 'merge':[]}
51
+ mod_lab2dice[mod_lab]['dice'].append(dice)
52
+ mod_lab2dice[mod_lab]['nsd'].append(nsd)
53
+ mod_lab2dice[mod_lab]['merge'].append(dataset_modality_label)
54
+
55
+ # retrieval regions
56
+ with open(mod_label_json, 'r') as f:
57
+ dict = json.load(f)
58
+ region2label = dict['region_based']
59
+ for region, label_ls in region2label.items():
60
+ region2label[region] = [mod_lab.split('_')[-1] for mod_lab in label_ls] # 去除modality
61
+ region2label['abnormal'] = [mod_lab.split('_')[-1] for mod_lab in dict['abnormal']]
62
+
63
+ region_dice_ls = {k:[] for k in region2label.keys()} # {'brain':[0.9, ...], ...}
64
+ region_nsd_ls = {k:[] for k in region2label.keys()} # {'brain':[0.9, ...], ...}
65
+ region_merge_ls = {k:[] for k in region2label.keys()} # {'brain':['frontal lobe', ...], ...}
66
+
67
+ mod_lab_ls = []
68
+ dice_ls = []
69
+ nsd_ls = []
70
+ merge_ls = []
71
+ region_ls = []
72
+ for mod_lab, dict in mod_lab2dice.items():
73
+ label = mod_lab.split('_')[-1]
74
+ mod_lab_ls.append(mod_lab)
75
+ dice_ls.append(sum(dict['dice'])/len(dict['dice']))
76
+ nsd_ls.append(sum(dict['nsd'])/len(dict['nsd']))
77
+ merge_ls.append(' / '.join(dict['merge']))
78
+
79
+ # find region
80
+ if label in region2label['abnormal']:
81
+ region_dice_ls['abnormal'].append(dice_ls[-1])
82
+ region_nsd_ls['abnormal'].append(nsd_ls[-1])
83
+ region_merge_ls['abnormal'].append(mod_lab)
84
+ region_ls.append('abnormal')
85
+ else:
86
+ found = False
87
+ for region, labels_in_region in region2label.items():
88
+ if label in labels_in_region:
89
+ region_dice_ls[region].append(dice_ls[-1])
90
+ region_nsd_ls[region].append(nsd_ls[-1])
91
+ region_merge_ls[region].append(mod_lab)
92
+ region_ls.append(region)
93
+ found = True
94
+ break
95
+ if not found:
96
+ print(label)
97
+ region_ls.append('unknown')
98
+
99
+ df = pd.DataFrame({
100
+ 'Modality_Label': mod_lab_ls,
101
+ 'Dice': dice_ls,
102
+ 'NSD': nsd_ls,
103
+ 'Merge': merge_ls,
104
+ 'Region': region_ls
105
+ })
106
+
107
+ #book = openpyxl.load_workbook(xlsx2save)
108
+ #writer = pd.ExcelWriter(xlsx2save, engine='openpyxl')
109
+ #writer.book = book
110
+
111
+ # with pd.ExcelWriter(xlsx2save, engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
112
+ # df.to_excel(writer, sheet_name='Label Merge', index=False)
113
+
114
+ # 写上anno num和repeat ratio
115
+ with open(mod_label_statistic, 'r') as f:
116
+ statistic_dict = json.load(f)
117
+
118
+ # 将Label Merged DataFrame写入新的工作表
119
+ new_sheet = workbook.create_sheet(title='Label Merge', index=1)
120
+ new_sheet.cell(row=1, column=1, value='Modality_Label')
121
+ new_sheet.cell(row=1, column=2, value='Dice')
122
+ new_sheet.cell(row=1, column=3, value='NSD')
123
+ new_sheet.cell(row=1, column=4, value='Merge')
124
+ new_sheet.cell(row=1, column=5, value='Region')
125
+ new_sheet.cell(row=1, column=6, value='Total_Num')
126
+ new_sheet.cell(row=1, column=7, value='Aug_Ratio')
127
+ row = 2
128
+ for mod_lab, dice, nsd, merge, region in zip(mod_lab_ls, dice_ls, nsd_ls, merge_ls, region_ls):
129
+ if mod_lab in statistic_dict:
130
+ _, total_num, aug_ratio = statistic_dict[mod_lab]
131
+ else:
132
+ total_num = aug_ratio = 0
133
+ new_sheet.cell(row=row, column=1, value=mod_lab)
134
+ new_sheet.cell(row=row, column=2, value=dice)
135
+ new_sheet.cell(row=row, column=3, value=nsd)
136
+ new_sheet.cell(row=row, column=4, value=merge)
137
+ new_sheet.cell(row=row, column=5, value=region)
138
+ new_sheet.cell(row=row, column=6, value=total_num)
139
+ new_sheet.cell(row=row, column=7, value=aug_ratio)
140
+ row += 1
141
+ new_sheet.cell(row=row, column=2, value=sum(dice_ls)/len(dice_ls)) # avg over all labels
142
+ new_sheet.cell(row=row, column=3, value=sum(nsd_ls)/len(nsd_ls))
143
+
144
+ # 将Region Merged 写入新的工作表
145
+ new_sheet = workbook.create_sheet(title='Region Merge', index=1)
146
+ new_sheet.cell(row=1, column=1, value='Region')
147
+ new_sheet.cell(row=1, column=2, value='Dice')
148
+ new_sheet.cell(row=1, column=3, value='NSD')
149
+ new_sheet.cell(row=1, column=4, value='Merge')
150
+ row = 2
151
+ for key in region_dice_ls.keys():
152
+ if len(region_dice_ls[key]) == 0:
153
+ dice = nsd = 0
154
+ merge = None
155
+ else:
156
+ dice = sum(region_dice_ls[key])/len(region_dice_ls[key])
157
+ nsd = sum(region_nsd_ls[key])/len(region_nsd_ls[key])
158
+ merge = ','.join(region_merge_ls[key])
159
+ class_name = f'{key}({len(region_dice_ls[key])})'
160
+ new_sheet.cell(row=row, column=1, value=class_name)
161
+ new_sheet.cell(row=row, column=2, value=dice)
162
+ new_sheet.cell(row=row, column=3, value=nsd)
163
+ new_sheet.cell(row=row, column=4, value=merge)
164
+ row += 1
165
+
166
+ workbook.save(xlsx2save)
167
+
168
+ # 返回所有 label 的 avg
169
+ avg_dice_over_merged_labels = sum(dice_ls) / len(dice_ls)
170
+ avg_nsd_over_merged_labels = sum(nsd_ls) / len(nsd_ls)
171
+
172
+ return avg_dice_over_merged_labels, avg_nsd_over_merged_labels
173
+
174
+ if __name__ == '__main__':
175
+ import argparse
176
+
177
+ def str2bool(v):
178
+ if isinstance(v, bool):
179
+ return v
180
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
181
+ return True
182
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
183
+ return False
184
+ else:
185
+ raise argparse.ArgumentTypeError('Boolean value expected.')
186
+
187
+ parser = argparse.ArgumentParser()
188
+ parser.add_argument('--xlsx2load', type=str)
189
+ parser.add_argument('--xlsx2save', type=str)
190
+ parser.add_argument('--mod_lab_json', type=str, default='/mnt/petrelfs/share_data/wuchaoyi/SAM/processed_files_v4/mod_lab(72).json')
191
+ parser.add_argument('--mod_label_statistic', type=str, default='/mnt/petrelfs/share_data/wuchaoyi/SAM/processed_files_v4/mod_lab_accum_statis(49).json')
192
+
193
+ config = parser.parse_args()
194
+
195
+ if not config.xlsx2save:
196
+ config.xlsx2save = config.xlsx2load
197
+
198
+ merge(config.mod_lab_json, config.mod_label_statistic, config.xlsx2load, config.xlsx2save)
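
merge() documents its two json inputs only implicitly, through how it reads them: the region-split file is expected to hold a 'region_based' dict of region -> list of 'modality_label' ids plus an 'abnormal' list, and the statistics file maps each 'modality_label' to a 3-element entry whose last two values become Total_Num and Aug_Ratio. A hypothetical example of those shapes (keys, labels and numbers are invented; only the structure is inferred from the code above):

    import json

    mod_lab_split = {
        "region_based": {
            "brain": ["mri_frontal lobe", "ct_brainstem"],
            "abdomen": ["ct_pancreas", "ct_liver"],
        },
        "abnormal": ["ct_covid19 infection"],
    }
    mod_lab_statistic = {
        "ct_pancreas": ["", 420, 1.5],   # [*, Total_Num, Aug_Ratio]
    }

    with open("mod_lab_example.json", "w") as f:
        json.dump(mod_lab_split, f, indent=2)
    with open("mod_lab_statistic_example.json", "w") as f:
        json.dump(mod_lab_statistic, f, indent=2)
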
evaluate/metric.py ADDED
@@ -0,0 +1,46 @@
1
+ import torch
2
+ import numpy as np
3
+ import time
4
+ from medpy import metric
5
+ from .SurfaceDice import compute_surface_distances, compute_surface_dice_at_tolerance
6
+
7
+ def calculate_metric_percase(pred, gt, dice=True, nsd=True):
8
+ pred = pred.astype(bool)
9
+ gt = gt.astype(bool)
10
+
11
+ metrics = {}
12
+
13
+ if np.sum(gt) == 0.0:
14
+ if np.sum(pred) == 0.0:
15
+ if dice:
16
+ metrics['dice'] = 1.0
17
+ if nsd:
18
+ metrics['nsd'] = 1.0
19
+ else:
20
+ if dice:
21
+ metrics['dice'] = 0.0
22
+ if nsd:
23
+ metrics['nsd'] = 0.0
24
+ return metrics
25
+
26
+ if dice:
27
+ dice_score = metric.binary.dc(pred, gt)
28
+ metrics['dice'] = dice_score
29
+
30
+ if nsd:
31
+ surface_distances = compute_surface_distances(gt, pred, [1, 1, 3])
32
+ nsd_score = compute_surface_dice_at_tolerance(surface_distances, 1)
33
+ metrics['nsd'] = nsd_score
34
+
35
+ return metrics
36
+
37
+ if __name__ == '__main__':
38
+ pred = torch.zeros((3, 256, 256, 16)).numpy()
39
+ pred[:, 0:128, 0:128, :] = 1.0
40
+ gt = torch.zeros((3, 256, 256, 16)).numpy()
41
+ gt[:, 0:64, 0:64, :] = 1.0
42
+ dice = calculate_metric_percase(pred, gt)['dice']
43
+ print(dice)
44
+
45
+
46
+
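
calculate_metric_percase hard-codes the empty-mask convention (both masks empty scores 1.0; an empty ground truth with any predicted foreground scores 0.0) and an NSD spacing of [1, 1, 3] mm with a 1 mm tolerance. A small illustration (toy arrays; assumes medpy and the repository root are importable):

    import numpy as np
    from evaluate.metric import calculate_metric_percase

    empty = np.zeros((8, 8, 4))
    blob = np.zeros((8, 8, 4))
    blob[2:5, 2:5, 1:3] = 1.0

    print(calculate_metric_percase(empty, empty))  # {'dice': 1.0, 'nsd': 1.0}  both empty
    print(calculate_metric_percase(blob, empty))   # {'dice': 0.0, 'nsd': 0.0}  gt empty, pred not
    print(calculate_metric_percase(blob, blob))    # perfect overlap -> dice 1.0, nsd 1.0
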
evaluate/params.py ADDED
@@ -0,0 +1,153 @@
1
+ import argparse
2
+
3
+ def str2bool(v):
4
+ return v.lower() in ('true', 't')
5
+
6
+ def parse_args():
7
+ parser = argparse.ArgumentParser()
8
+
9
+ # Exp Controller
10
+
11
+ parser.add_argument(
12
+ "--rcd_dir",
13
+ type=str,
14
+ help="save the evaluation results (in a directory)",
15
+ )
16
+ parser.add_argument(
17
+ "--rcd_file",
18
+ type=str,
19
+ help="save the evaluation results (in a csv/xlsx file)",
20
+ )
21
+ parser.add_argument(
22
+ "--visualization",
23
+ type=str2bool,
24
+ default=False,
25
+ help="save the visualization for each case (img, gt, pred)",
26
+ )
27
+ parser.add_argument(
28
+ "--checkpoint",
29
+ type=str,
30
+ help="Checkpoint path",
31
+ )
32
+ parser.add_argument(
33
+ "--partial_load",
34
+ type=str2bool,
35
+ default=True,
36
+ help="Allow to load partial paramters from checkpoint",
37
+ )
38
+ parser.add_argument(
39
+ "--gpu",
40
+ type=str,
41
+ default=None,
42
+ )
43
+ parser.add_argument(
44
+ "--resume",
45
+ type=str2bool,
46
+ default=True,
47
+ help="Inherit medial results from an interrupted evaluation (no harm even if you evaluate from scratch)",
48
+ )
49
+ parser.add_argument(
50
+ "--save_interval",
51
+ type=int,
52
+ default=100
53
+ )
54
+
55
+ # Metrics
56
+
57
+ parser.add_argument(
58
+ "--dice",
59
+ type=str2bool,
60
+ default=True,
61
+ )
62
+ parser.add_argument(
63
+ "--nsd",
64
+ type=str2bool,
65
+ default=True,
66
+ )
67
+
68
+ # Med SAM Dataset
69
+
70
+ parser.add_argument(
71
+ "--datasets_jsonl",
72
+ type=str,
73
+ )
74
+ parser.add_argument(
75
+ "--text_prompts_json",
76
+ type=str,
77
+ help='This is needed for the CVPR25 challenge, where multiple prompts (synonyms) are required.'
78
+ )
79
+
80
+ # Sampler and Loader
81
+
82
+ parser.add_argument(
83
+ "--online_crop",
84
+ type=str2bool,
85
+ default='False',
86
+ help='load pre-cropped image patches directly, or crop online',
87
+ )
88
+ parser.add_argument(
89
+ "--crop_size",
90
+ type=int,
91
+ nargs='+',
92
+ default=[288, 288, 96],
93
+ )
94
+ parser.add_argument(
95
+ "--max_queries",
96
+ type=int,
97
+ default=256,
98
+ )
99
+ parser.add_argument(
100
+ "--batchsize_3d",
101
+ type=int,
102
+ default=2,
103
+ )
104
+ parser.add_argument(
105
+ "--pin_memory",
106
+ type=str2bool,
107
+ default=False,
108
+ help='pin host memory to speed up host-to-GPU data transfer'
109
+ )
110
+ parser.add_argument(
111
+ "--num_workers",
112
+ type=int,
113
+ default=4
114
+ )
115
+
116
+ # Knowledge Encoder
117
+ parser.add_argument(
118
+ "--text_encoder_partial_load",
119
+ type=str2bool,
120
+ default=True,
121
+ help="Allow to load partial paramters from checkpoint",
122
+ )
123
+ parser.add_argument(
124
+ "--text_encoder_checkpoint",
125
+ type=str,
126
+ )
127
+ parser.add_argument(
128
+ "--text_encoder",
129
+ type=str,
130
+ )
131
+
132
+ # MaskFormer
133
+
134
+ parser.add_argument(
135
+ "--vision_backbone",
136
+ type=str,
137
+ help='UNET or UNET-H'
138
+ )
139
+ parser.add_argument(
140
+ "--patch_size",
141
+ type=int,
142
+ nargs='+',
143
+ default=[32, 32, 32],
144
+ help='patch size along h, w and d'
145
+ )
146
+ parser.add_argument(
147
+ "--deep_supervision",
148
+ type=str2bool,
149
+ default=False,
150
+ )
151
+
152
+ args = parser.parse_args()
153
+ return args
inference_medals_nifti.py ADDED
@@ -0,0 +1,1885 @@
1
+ """
2
+ Medal-S inference script for generic raw image segmentation.
3
+
4
+ This script provides an interface for running Medal-S inference
5
+ on raw NIfTI images. It supports both single-stage (Stage 2 only) and
6
+ two-stage (Stage 1 + Stage 2) inference modes.
7
+
8
+ Usage:
9
+ python inference_medals_nifti.py --input input.nii.gz --output output.nii.gz \\
10
+ --modality CT --texts "Aorta observed in abdominal CT scans" --labels 1
11
+
12
+ # Or use JSON configuration file:
13
+ python inference_medals_nifti.py --input input.nii.gz --output output.nii.gz \\
14
+ --config config.json --mode stage1+stage2
15
+
16
+ Author: Pengcheng Shi
17
+ Institute: Medical Image Insights, Inc., Shanghai, China
18
+ Email: shipc1220@gmail.com
19
+ License: Apache License 2.0
20
+ """
21
+
22
+ import os
23
+ import argparse
24
+ import json
25
+ import time
26
+ import math
27
+ import random
28
+ import itertools
29
+ import gc
30
+ import numpy as np
31
+ import SimpleITK as sitk
32
+ import torch
33
+ import torch.nn.functional as F
34
+ from typing import List
35
+ from scipy.ndimage import label, gaussian_filter
36
+ from einops import rearrange
37
+ from tqdm import tqdm
38
+ from torch.cuda.amp import autocast
39
+
40
+ from data.default_resampling import resample_data_or_seg, compute_new_shape, resample_data_or_seg_to_spacing
41
+ from data.resample_torch import resample_torch_fornnunet, resample_torch_simple
42
+ from model.maskformer import Maskformer
43
+ from model.knowledge_encoder import Knowledge_Encoder
44
+
45
+ def adjust_spacing(img_array, img_spacing):
46
+ """
47
+ Adjust spacing based on image dimensions.
48
+
49
+ This function swaps spacing values if the dimension with minimum size
50
+ doesn't match the dimension with maximum spacing.
51
+
52
+ Args:
53
+ img_array: Image array (used for shape reference)
54
+ img_spacing: Spacing array
55
+
56
+ Returns:
57
+ Adjusted spacing array
58
+ """
59
+ img_spacing = np.asarray(img_spacing)
60
+ min_dim_index = np.argmin(img_array.shape)
61
+ max_spacing_index = np.argmax(img_spacing)
62
+
63
+ if (min_dim_index != max_spacing_index) and (img_spacing[max_spacing_index] > 0.5):
64
+ new_order = list(range(len(img_spacing)))
65
+ new_order[min_dim_index], new_order[max_spacing_index] = new_order[max_spacing_index], new_order[min_dim_index]
66
+ img_spacing = img_spacing[new_order]
67
+
68
+ return img_spacing
69
+
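# --- Editorial usage sketch (not part of the uploaded file); assumes adjust_spacing above ---
# Typical thick-slice case: the axis with the fewest voxels should carry the largest spacing.
import numpy as np
vol = np.zeros((512, 512, 40))                              # only 40 slices along the last axis
print(adjust_spacing(vol, np.array([3.0, 0.7, 0.7])))       # -> [0.7 0.7 3.0]; values swapped to match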
70
+
71
+ def remove_small_objects_binary(binary_data, min_size=10):
72
+ """
73
+ Remove small objects from binary data.
74
+
75
+ Args:
76
+ binary_data: Binary array
77
+ min_size: Minimum size threshold for objects to keep
78
+
79
+ Returns:
80
+ Binary array with small objects removed
81
+ """
82
+ labeled_array, num_features = label(binary_data)
83
+ sizes = np.bincount(labeled_array.ravel())
84
+ remove = sizes < min_size
85
+ remove[0] = False # Ensure the background (label 0) is not removed
86
+ labeled_array[remove[labeled_array]] = 0
87
+ return labeled_array > 0
88
+
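# --- Editorial usage sketch (not part of the uploaded file); assumes remove_small_objects_binary above ---
import numpy as np
noisy = np.zeros((32, 32, 32), dtype=bool)
noisy[2:20, 2:20, 2:20] = True                 # large connected component: kept
noisy[30, 30, 30] = True                       # isolated voxel below min_size: removed
print(remove_small_objects_binary(noisy, min_size=10).sum())   # 18**3 = 5832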
89
+
90
+ def respace_image(image: np.ndarray, current_spacing: np.ndarray, target_spacing: np.ndarray, device: torch.device) -> np.ndarray:
91
+ """
92
+ Resample image to target spacing.
93
+
94
+ Args:
95
+ image: Input image array with shape (C, H, W, D)
96
+ current_spacing: Current spacing array
97
+ target_spacing: Target spacing array
98
+ device: PyTorch device for resampling
99
+
100
+ Returns:
101
+ Resampled image array
102
+ """
103
+ new_shape = compute_new_shape(image.shape[1:], current_spacing, target_spacing)
104
+ resampled_image = resample_torch_fornnunet(
105
+ image, new_shape, current_spacing, target_spacing,
106
+ is_seg=False, num_threads=8, device=device,
107
+ memefficient_seg_resampling=False,
108
+ force_separate_z=None,
109
+ separate_z_anisotropy_threshold=3.0
110
+ )
111
+ return resampled_image
112
+
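# --- Editorial note (not part of the uploaded file) ---
# compute_new_shape scales each axis by current_spacing / target_spacing, so a (1, 512, 512, 40)
# volume at (0.7, 0.7, 3.0) mm resampled to (1.5, 1.5, 3.0) mm comes out at roughly
# round(512 * 0.7 / 1.5) = 239 voxels in-plane, with the 40 slices left unchanged.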
113
+
114
+ def respace_mask(mask: np.ndarray, current_spacing: np.ndarray, target_spacing: np.ndarray, device: torch.device) -> np.ndarray:
115
+ """
116
+ Resample mask to target spacing.
117
+
118
+ Args:
119
+ mask: Input mask array with shape (C, H, W, D)
120
+ current_spacing: Current spacing array
121
+ target_spacing: Target spacing array
122
+ device: PyTorch device for resampling
123
+
124
+ Returns:
125
+ Resampled mask array
126
+ """
127
+ new_shape = compute_new_shape(mask.shape[1:], current_spacing, target_spacing)
128
+ resampled_mask = resample_torch_fornnunet(
129
+ mask, new_shape, current_spacing, target_spacing,
130
+ is_seg=True, num_threads=8, device=device,
131
+ memefficient_seg_resampling=False,
132
+ force_separate_z=None,
133
+ separate_z_anisotropy_threshold=3.0
134
+ )
135
+ return resampled_mask
136
+
137
+
138
+ def split_3d(image_tensor, crop_size=[288, 288, 96]):
139
+ """
140
+ Split 3D image into overlapping patches.
141
+
142
+ Patches are extracted with 50% overlap (stride = crop_size / 2) to ensure
143
+ complete coverage of the image volume.
144
+
145
+ Args:
146
+ image_tensor: Input image tensor with shape (C, H, W, D)
147
+ crop_size: Size of each patch [h, w, d]
148
+
149
+ Returns:
150
+ split_patch: List of patch tensors
151
+ split_idx: List of patch indices [h_s, h_e, w_s, w_e, d_s, d_e]
152
+ """
153
+ interval_h, interval_w, interval_d = crop_size[0] // 2, crop_size[1] // 2, crop_size[2] // 2
154
+ split_idx = []
155
+ split_patch = []
156
+
157
+ c, h, w, d = image_tensor.shape
158
+ h_crop = max(math.ceil(h / interval_h) - 1, 1)
159
+ w_crop = max(math.ceil(w / interval_w) - 1, 1)
160
+ d_crop = max(math.ceil(d / interval_d) - 1, 1)
161
+
162
+ for i in range(h_crop):
163
+ h_s = i * interval_h
164
+ h_e = h_s + crop_size[0]
165
+ if h_e > h:
166
+ h_s = h - crop_size[0]
167
+ h_e = h
168
+ if h_s < 0:
169
+ h_s = 0
170
+ for j in range(w_crop):
171
+ w_s = j * interval_w
172
+ w_e = w_s + crop_size[1]
173
+ if w_e > w:
174
+ w_s = w - crop_size[1]
175
+ w_e = w
176
+ if w_s < 0:
177
+ w_s = 0
178
+ for k in range(d_crop):
179
+ d_s = k * interval_d
180
+ d_e = d_s + crop_size[2]
181
+ if d_e > d:
182
+ d_s = d - crop_size[2]
183
+ d_e = d
184
+ if d_s < 0:
185
+ d_s = 0
186
+ split_idx.append([h_s, h_e, w_s, w_e, d_s, d_e])
187
+ split_patch.append(image_tensor[:, h_s:h_e, w_s:w_e, d_s:d_e])
188
+
189
+ return split_patch, split_idx
190
+
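# --- Editorial usage sketch (not part of the uploaded file); assumes split_3d above ---
import torch
vol = torch.zeros((1, 320, 320, 100))
patches, idxs = split_3d(vol, crop_size=[288, 288, 96])
print(len(patches), idxs[0])    # 8 patches with ~50% overlap (stride = crop_size // 2); first index is [0, 288, 0, 288, 0, 96]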
191
+
192
+ def pad_if_necessary(image, crop_size=[288, 288, 96]):
193
+ """
194
+ Pad image if necessary to meet crop size requirements.
195
+
196
+ Args:
197
+ image: Input image tensor with shape (C, H, W, D)
198
+ crop_size: Minimum size requirements [h, w, d]
199
+
200
+ Returns:
201
+ padded_image: Padded image tensor
202
+ padding_info: Tuple of padding amounts (pad_h, pad_w, pad_d)
203
+ """
204
+ c, h, w, d = image.shape
205
+ croph, cropw, cropd = crop_size
206
+ pad_in_h = 0 if h >= croph else croph - h
207
+ pad_in_w = 0 if w >= cropw else cropw - w
208
+ pad_in_d = 0 if d >= cropd else cropd - d
209
+
210
+ padding_info = (pad_in_h, pad_in_w, pad_in_d)
211
+
212
+ if pad_in_h + pad_in_w + pad_in_d > 0:
213
+ pad = (0, pad_in_d, 0, pad_in_w, 0, pad_in_h)
214
+ image = F.pad(image, pad, 'constant', 0)
215
+
216
+ return image, padding_info
217
+
218
+
219
+ def remove_padding(padded_image, padding_info):
220
+ """
221
+ Remove padding from image.
222
+
223
+ Args:
224
+ padded_image: Padded image (can be torch.Tensor or numpy array)
225
+ padding_info: Tuple of padding amounts (pad_h, pad_w, pad_d)
226
+
227
+ Returns:
228
+ Image with padding removed
229
+ """
230
+ pad_in_h, pad_in_w, pad_in_d = padding_info
231
+
232
+ if len(padded_image.shape) == 4:
233
+ if isinstance(padded_image, torch.Tensor):
234
+ return padded_image[:, :padded_image.shape[1]-pad_in_h, :padded_image.shape[2]-pad_in_w, :padded_image.shape[3]-pad_in_d]
235
+ else:
236
+ return padded_image[:, :padded_image.shape[1]-pad_in_h, :padded_image.shape[2]-pad_in_w, :padded_image.shape[3]-pad_in_d]
237
+ else:
238
+ if isinstance(padded_image, torch.Tensor):
239
+ return padded_image[:padded_image.shape[0]-pad_in_h, :padded_image.shape[1]-pad_in_w, :padded_image.shape[2]-pad_in_d]
240
+ else:
241
+ return padded_image[:padded_image.shape[0]-pad_in_h, :padded_image.shape[1]-pad_in_w, :padded_image.shape[2]-pad_in_d]
242
+
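# --- Editorial usage sketch (not part of the uploaded file); assumes pad_if_necessary / remove_padding above ---
import torch
small = torch.zeros((1, 200, 200, 60))
padded, pad_info = pad_if_necessary(small, crop_size=[288, 288, 96])
print(padded.shape, pad_info)                   # torch.Size([1, 288, 288, 96]) (88, 88, 36)
print(remove_padding(padded, pad_info).shape)   # padding stripped: torch.Size([1, 200, 200, 60])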
243
+
244
+ def internal_maybe_mirror_and_predict(model=None, queries=None, image_input=None, simulated_lowres_sc_pred=None,
245
+ simulated_lowres_mc_pred=None, mirror_axes=(0, 1, 2)):
246
+ """
247
+ Apply test-time augmentation with mirroring.
248
+
249
+ This function performs inference with multiple mirroring combinations
250
+ and averages the results for improved robustness.
251
+
252
+ Args:
253
+ model: Model to use for prediction
254
+ queries: Query tensor
255
+ image_input: Input image tensor
256
+ simulated_lowres_sc_pred: Simulated low-res single-channel prediction
257
+ simulated_lowres_mc_pred: Simulated low-res multi-channel prediction
258
+ mirror_axes: Axes to mirror (0, 1, 2 for spatial dimensions)
259
+
260
+ Returns:
261
+ Averaged prediction tensor
262
+ """
263
+ prediction = model(queries=queries,
264
+ image_input=image_input,
265
+ simulated_lowres_sc_pred=simulated_lowres_sc_pred,
266
+ simulated_lowres_mc_pred=simulated_lowres_mc_pred,
267
+ train_mode=False)
268
+
269
+ if mirror_axes is not None:
270
+ assert max(mirror_axes) <= image_input.ndim - 3, 'mirror_axes does not match the dimension of the input!'
271
+ mirror_axes = [m + 2 for m in mirror_axes]
272
+ axes_combinations = [
273
+ c for i in range(len(mirror_axes)) for c in itertools.combinations(mirror_axes, i + 1)
274
+ ]
275
+ for axes in axes_combinations:
276
+ image_input_fliped = torch.flip(image_input, axes)
277
+ simulated_lowres_sc_pred_fliped = torch.flip(simulated_lowres_sc_pred.unsqueeze(0), axes).squeeze(0) if simulated_lowres_sc_pred is not None else None
278
+ simulated_lowres_mc_pred_fliped = torch.flip(simulated_lowres_mc_pred.unsqueeze(0), axes).squeeze(0) if simulated_lowres_mc_pred is not None else None
279
+ prediction_fliped = model(queries=queries,
280
+ image_input=image_input_fliped,
281
+ simulated_lowres_sc_pred=simulated_lowres_sc_pred_fliped,
282
+ simulated_lowres_mc_pred=simulated_lowres_mc_pred_fliped,
283
+ train_mode=False)
284
+ prediction += torch.flip(prediction_fliped, axes)
285
+ prediction /= (len(axes_combinations) + 1)
286
+ return prediction
287
+
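# --- Editorial note (not part of the uploaded file) ---
# With mirror_axes=(0, 1, 2) the loop above covers every non-empty subset of the three spatial
# axes, i.e. 2**3 - 1 = 7 flipped passes plus the identity pass = 8 forward passes per patch:
import itertools
axes = [m + 2 for m in (0, 1, 2)]   # offset by 2 to skip the batch and channel dims
print(len([c for i in range(len(axes)) for c in itertools.combinations(axes, i + 1)]))   # 7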
288
+
289
+ def compute_patch_prediction(
290
+ queries: torch.Tensor,
291
+ patches: torch.Tensor,
292
+ lowres_single_channel_pred: torch.Tensor,
293
+ lowres_multi_channel_pred: torch.Tensor,
294
+ model: torch.nn.Module,
295
+ possible_block_sizes: List[int],
296
+ n_repeats: int = 1,
297
+ disable_tta: bool = True
298
+ ) -> torch.Tensor:
299
+ """
300
+ Compute patch predictions using complementary masking.
301
+
302
+ This function splits the volume into blocks, processes complementary halves
303
+ using random masks, and combines results. The process is repeated n_repeats
304
+ times with different random masks, and results are averaged.
305
+
306
+ Args:
307
+ queries: Input query tensor, shape (batch, query_dim)
308
+ patches: Image patch tensor, shape (batch, channels, h, w, d)
309
+ lowres_single_channel_pred: Low-res single-channel prediction, shape (1, 1, h, w, d)
310
+ lowres_multi_channel_pred: Low-res multi-channel prediction, shape (1, c, h, w, d)
311
+ model: Trained neural network model
312
+ possible_block_sizes: List of possible block sizes (e.g., [8, 16, 32])
313
+ n_repeats: Number of times to repeat prediction with different masks
314
+ disable_tta: Whether to disable test-time augmentation
315
+
316
+ Returns:
317
+ Averaged patch prediction, shape (1, c, h, w, d)
318
+ """
319
+ # Validate inputs
320
+ if not possible_block_sizes:
321
+ raise ValueError("possible_block_sizes cannot be empty")
322
+ if n_repeats < 1:
323
+ raise ValueError("n_repeats must be at least 1")
324
+
325
+ _, _, h, w, d = lowres_single_channel_pred.shape
326
+ device = lowres_single_channel_pred.device
327
+ prediction_sum = torch.zeros_like(lowres_multi_channel_pred, device=device)
328
+
329
+ def upsample_block_mask(block_mask: torch.Tensor, block_size: int) -> torch.Tensor:
330
+ """Upsample a block mask to full resolution."""
331
+ upsampled = (
332
+ block_mask.unsqueeze(0).unsqueeze(0)
333
+ .repeat_interleave(block_size, dim=2)
334
+ .repeat_interleave(block_size, dim=3)
335
+ .repeat_interleave(block_size, dim=4)
336
+ [:, :, :h, :w, :d]
337
+ ).float()
338
+ return upsampled
339
+
340
+ for _ in range(n_repeats):
341
+ block_size = random.choice(possible_block_sizes)
342
+ n_blocks_h = (h + block_size - 1) // block_size
343
+ n_blocks_w = (w + block_size - 1) // block_size
344
+ n_blocks_d = (d + block_size - 1) // block_size
345
+ total_blocks = n_blocks_h * n_blocks_w * n_blocks_d
346
+
347
+ num_selected = max(1, total_blocks // 2)
348
+ block_mask = torch.zeros(n_blocks_h, n_blocks_w, n_blocks_d, dtype=torch.bool, device=device)
349
+ indices = torch.randperm(total_blocks, device=device)[:num_selected]
350
+ block_mask.view(-1)[indices] = True
351
+
352
+ mask = upsample_block_mask(block_mask, block_size)
353
+ complementary_mask = 1.0 - mask
354
+
355
+ masked_sc_pred = lowres_single_channel_pred * mask
356
+ masked_mc_pred = lowres_multi_channel_pred * mask
357
+
358
+ if disable_tta:
359
+ first_half_pred = model(
360
+ queries=queries,
361
+ image_input=patches,
362
+ simulated_lowres_sc_pred=masked_sc_pred,
363
+ simulated_lowres_mc_pred=masked_mc_pred,
364
+ train_mode=False
365
+ )
366
+ else:
367
+ first_half_pred = internal_maybe_mirror_and_predict(
368
+ model=model,
369
+ queries=queries,
370
+ image_input=patches,
371
+ simulated_lowres_sc_pred=masked_sc_pred,
372
+ simulated_lowres_mc_pred=masked_mc_pred,
373
+ mirror_axes=(0, 1, 2)
374
+ )
375
+
376
+ masked_sc_pred_comp = lowres_single_channel_pred * complementary_mask
377
+ masked_mc_pred_comp = lowres_multi_channel_pred * complementary_mask
378
+
379
+ if disable_tta:
380
+ second_half_pred = model(
381
+ queries=queries,
382
+ image_input=patches,
383
+ simulated_lowres_sc_pred=masked_sc_pred_comp,
384
+ simulated_lowres_mc_pred=masked_mc_pred_comp,
385
+ train_mode=False
386
+ )
387
+ else:
388
+ second_half_pred = internal_maybe_mirror_and_predict(
389
+ model=model,
390
+ queries=queries,
391
+ image_input=patches,
392
+ simulated_lowres_sc_pred=masked_sc_pred_comp,
393
+ simulated_lowres_mc_pred=masked_mc_pred_comp,
394
+ mirror_axes=(0, 1, 2)
395
+ )
396
+
397
+ final_prediction = first_half_pred * complementary_mask + second_half_pred * mask
398
+ prediction_sum += final_prediction
399
+
400
+ return prediction_sum / n_repeats
401
+
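# --- Editorial sketch (not part of the uploaded file) of the complementary masking used above ---
# Half of the coarse blocks are selected at random; the pass prompted with `mask` fills the
# complementary region and vice versa, so every voxel is predicted exactly once per repeat.
import torch
h = w = d = 16; block = 8
nb = (h + block - 1) // block
blocks = torch.zeros(nb, nb, nb, dtype=torch.bool)
blocks.view(-1)[torch.randperm(nb ** 3)[: nb ** 3 // 2]] = True
mask = (blocks.float()
        .repeat_interleave(block, 0).repeat_interleave(block, 1).repeat_interleave(block, 2))[:h, :w, :d]
assert torch.all(mask + (1.0 - mask) == 1.0)   # the mask and its complement tile the whole volume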
402
+
403
+ def read_npz_data(raw_image, raw_spacing, crop_size=[288, 288, 96],
404
+ target_spacing=[1.5, 1.5, 3.0], scaled_roi_lowres_pred_array=None,
405
+ class_name_list=[], stage_1_flag=False, device=torch.device("cuda", 0), verbose=True):
406
+ """
407
+ Read and preprocess image data for inference.
408
+
409
+ This function handles spacing adjustments, image resampling, padding,
410
+ and patch splitting for the inference pipeline.
411
+
412
+ Args:
413
+ raw_image: Input image array with shape (d, h, w)
414
+ raw_spacing: Spacing array with shape (3,)
415
+ crop_size: Target crop size [h, w, d]
416
+ target_spacing: Target spacing [h, w, d]
417
+ scaled_roi_lowres_pred_array: Optional low-res prediction for ROI-based inference
418
+ class_name_list: List of class names (kept for compatibility, not used)
419
+ stage_1_flag: Whether this is Stage 1 inference (kept for compatibility, not used)
420
+ device: PyTorch device for resampling
421
+ verbose: Whether to print detailed information (default: True)
422
+
423
+ Returns:
424
+ data_dict: Dictionary containing preprocessed patches and metadata
425
+ """
426
+ raw_d, raw_h, raw_w = raw_image.shape
427
+ image = rearrange(raw_image, 'd h w -> h w d')
428
+ spacing = raw_spacing.astype(np.float32)
429
+
430
+ # Simplified spacing adjustment following the provided steps
431
+ # Step 1: Handle very small spacing values
432
+ for i in range(3):
433
+ if spacing[i] <= 0.1:
434
+ spacing[i] = 1.0
435
+
436
+ # Step 2: Adjust spacing based on image dimensions
437
+ spacing = adjust_spacing(image, spacing)
438
+
439
+ # Step 3: Initialize parameters for spacing adjustment
440
+ max_dims = [1000, 1000, 700]
441
+ min_dims = crop_size
442
+ thresholds = []
443
+ current = 1.25
444
+ while current <= 50:
445
+ thresholds.append(current)
446
+ current *= 1.25
447
+ raw_target_spacing = target_spacing.copy()
448
+
449
+ # Step 4: Adjust spacing based on constraints
450
+ for i in range(3):
451
+ # If spacing is less than 1.0 and image dimension is within max_dims, set to 1.0
452
+ if spacing[i] < 1.0 and image.shape[i] <= max_dims[i]:
453
+ spacing[i] = 1.0 # second stage model resolution
454
+
455
+ # If physical dimension exceeds max_dims and spacing is greater than target, use target spacing
456
+ if spacing[i] * image.shape[i] > max_dims[i] * target_spacing[i] and spacing[i] > target_spacing[i]:
457
+ spacing[i] = target_spacing[i]
458
+ # If physical dimension is less than min_dims threshold, adjust target_spacing
459
+ elif spacing[i] * image.shape[i] < min_dims[i] * target_spacing[i]:
460
+ alpha_spacing = 1
461
+ for threshold in reversed(thresholds):
462
+ if image.shape[i] <= (min_dims[i] / threshold):
463
+ alpha_spacing = threshold
464
+ break
465
+
466
+ raw_target_spacing[i] = target_spacing[i]
467
+ target_spacing[i] = max(spacing[i] * image.shape[i] / min_dims[i], spacing[i] / alpha_spacing)
468
+ if verbose:
469
+ print("alpha_spacing: ", alpha_spacing)
470
+ print("spacing[i] * image.shape[i] / min_dims[i], spacing[i] / alpha_spacing: ", spacing[i] * image.shape[i] / min_dims[i], spacing[i] / alpha_spacing)
471
+ print("raw_target_spacing[i], target_spacing[i]: ", raw_target_spacing[i], target_spacing[i])
472
+ target_spacing[i] = min(raw_target_spacing[i], target_spacing[i])
473
+ if verbose:
474
+ print("image.shape[i], min_dims[i], target_spacing[i], spacing[i]: ", image.shape[i], min_dims[i], target_spacing[i], spacing[i])
475
+
476
+ # Set default num_iterations (no special class handling)
477
+ num_iterations = 1
478
+
479
+ image = image[np.newaxis, ...].astype(np.float32)
480
+ if verbose:
481
+ print("image.shape: ", image.shape)
482
+ print("spacing: ", spacing)
483
+ print("target_spacing: ", target_spacing)
484
+ image = respace_image(image, spacing, target_spacing, torch.device('cpu'))
485
+ if verbose:
486
+ print("respace image.shape: ", image.shape)
487
+ image = torch.tensor(image)
488
+ image, padding_info = pad_if_necessary(image, crop_size=crop_size)
489
+ _, h, w, d = image.shape
490
+
491
+ patches, y1y2_x1x2_z1z2_ls = split_3d(image, crop_size=crop_size)
492
+
493
+ data_dict = {
494
+ 'spacing': spacing,
495
+ 'original_shape': (raw_h, raw_w, raw_d),
496
+ 'current_shape': (h, w, d),
497
+ 'patches': patches,
498
+ 'y1y2_x1x2_z1z2_ls': y1y2_x1x2_z1z2_ls,
499
+ 'padding_info': padding_info,
500
+ 'raw_image': raw_image,
501
+ 'num_iterations': num_iterations
502
+ }
503
+
504
+ if scaled_roi_lowres_pred_array is not None:
505
+ lowres_pred = rearrange(scaled_roi_lowres_pred_array, 'd h w -> h w d')
506
+ lowres_pred = lowres_pred[np.newaxis, ...].astype(np.float32)
507
+ lowres_pred = respace_mask(lowres_pred, spacing, target_spacing, torch.device('cpu'))
508
+ lowres_pred = torch.tensor(lowres_pred)
509
+ lowres_pred, padding_info = pad_if_necessary(lowres_pred, crop_size=crop_size)
510
+ lowres_pred_patches, _ = split_3d(lowres_pred, crop_size=crop_size)
511
+ data_dict['lowres_pred_patches'] = lowres_pred_patches
512
+ data_dict['padding_info'] = padding_info
513
+
514
+ return data_dict
515
+
516
+
517
+ def compute_gaussian(tile_size, sigma_scale: float = 1. / 8, value_scaling_factor: float = 10, dtype=np.float16):
518
+ """
519
+ Compute Gaussian importance map for patch weighting.
520
+
521
+ This creates a Gaussian weight map centered at the patch center, used for
522
+ weighted averaging of overlapping patch predictions.
523
+
524
+ Args:
525
+ tile_size: Size of the tile (crop_size)
526
+ sigma_scale: Scale factor for Gaussian sigma (relative to tile size)
527
+ value_scaling_factor: Scaling factor for the Gaussian values
528
+ dtype: Data type for the output array
529
+
530
+ Returns:
531
+ Gaussian importance map array
532
+ """
533
+ tmp = np.zeros(tile_size)
534
+ center_coords = [i // 2 for i in tile_size]
535
+ sigmas = [i * sigma_scale for i in tile_size]
536
+ tmp[tuple(center_coords)] = 1
537
+ gaussian_importance_map = gaussian_filter(tmp, sigmas, 0, mode='constant', cval=0)
538
+ gaussian_importance_map = gaussian_importance_map / np.max(gaussian_importance_map) * value_scaling_factor
539
+ gaussian_importance_map = gaussian_importance_map.astype(dtype)
540
+ gaussian_importance_map[gaussian_importance_map == 0] = np.min(
541
+ gaussian_importance_map[gaussian_importance_map != 0])
542
+ return gaussian_importance_map
543
+
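# --- Editorial usage sketch (not part of the uploaded file); assumes compute_gaussian above ---
g = compute_gaussian((288, 288, 96))
print(g.shape, float(g.max()), bool((g > 0).all()))   # peak value 10 at the patch centre, strictly positive weights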
544
+
545
+ def sc_mask_to_mc_mask(sc_mask, label_values_ls):
546
+ """
547
+ Convert single-channel mask to multi-channel mask.
548
+
549
+ Args:
550
+ sc_mask: Single-channel mask with shape (1, 1, h, w, d) or (h, w, d)
551
+ label_values_ls: List of label values to create channels for
552
+
553
+ Returns:
554
+ Multi-channel mask with shape (1, n_classes, h, w, d)
555
+ """
556
+ sc_mask = sc_mask.squeeze(0).squeeze(0)
557
+ assert sc_mask.ndim == 3
558
+ h, w, d = sc_mask.shape
559
+ n = len(label_values_ls)
560
+ mc_mask = torch.zeros((n, h, w, d), dtype=bool).to(sc_mask.device)
561
+ for i, label_value in enumerate(label_values_ls):
562
+ mc_mask[i] = torch.where(sc_mask == label_value, 1, 0)
563
+ mc_mask = mc_mask.to(torch.float32)
564
+ mc_mask = mc_mask.unsqueeze(0)
565
+ return mc_mask
566
+
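# --- Editorial usage sketch (not part of the uploaded file); assumes sc_mask_to_mc_mask above ---
import torch
sc = torch.zeros((1, 1, 2, 2, 2))
sc[0, 0, 0] = 3.0                         # voxels carrying label value 3
sc[0, 0, 1] = 7.0                         # voxels carrying label value 7
mc = sc_mask_to_mc_mask(sc, [3, 7])
print(mc.shape, mc[0, 0].sum(), mc[0, 1].sum())   # torch.Size([1, 2, 2, 2, 2]) tensor(4.) tensor(4.)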
567
+
568
+ class MedicalSegmentationPipeline:
569
+ """
570
+ Pipeline for medical image segmentation.
571
+
572
+ This class handles model loading, data preprocessing, and inference execution
573
+ for the Medal-S segmentation pipeline.
574
+ """
575
+
576
+ def __init__(self, config):
577
+ """
578
+ Initialize the segmentation pipeline.
579
+
580
+ Args:
581
+ config: Dictionary containing pipeline configuration parameters
582
+ """
583
+ self.config = config
584
+ self.device = torch.device(config['device'])
585
+
586
+ def _load_model(self):
587
+ """
588
+ Load vision model and text encoder from checkpoints.
589
+
590
+ Returns:
591
+ model: Loaded vision model (Maskformer)
592
+ text_encoder: Loaded text encoder (Knowledge_Encoder)
593
+ """
594
+ crop_str = '_'.join(map(str, self.config['crop_size']))
595
+ spacing_str = '_'.join(map(str, self.config['target_spacing_model']))
596
+
597
+ vision_backbone_checkpoint = os.path.join(
598
+ self.config['checkpoints_path'],
599
+ f"nano_UNet_CVPR2025_crop_size_{crop_str}_spacing_{spacing_str}_step_{self.config['model_step']}.pth")
600
+
601
+ model = Maskformer(
602
+ self.config['vision_backbone'],
603
+ self.config['input_channels'],
604
+ self.config['crop_size'],
605
+ self.config['patch_size'],
606
+ False
607
+ )
608
+ model = model.to(self.device)
609
+ checkpoint = torch.load(vision_backbone_checkpoint, map_location=self.device)
610
+ new_state_dict = {
611
+ k[7:] if k.startswith('module.') else k: v
612
+ for k, v in checkpoint['model_state_dict'].items()
613
+ if 'mid_mask_embed_proj' not in k
614
+ }
615
+ model.load_state_dict(new_state_dict)
616
+ model.eval()
617
+
618
+ text_encoder = Knowledge_Encoder(
619
+ biolord_checkpoint=os.path.join(
620
+ self.config['checkpoints_path'],
621
+ 'BioLORD-2023-C'
622
+ )
623
+ )
624
+ text_encoder = text_encoder.to(self.device)
625
+ checkpoint = torch.load(
626
+ os.path.join(self.config['checkpoints_path'], 'text_encoder.pth'),
627
+ map_location=self.device
628
+ )
629
+ new_state_dict = {
630
+ k[7:] if k.startswith('module.') else k: v
631
+ for k, v in checkpoint['model_state_dict'].items()
632
+ }
633
+ text_encoder.load_state_dict(new_state_dict, strict=False)
634
+ text_encoder.eval()
635
+
636
+ return model, text_encoder
637
+
638
+ def run_inference(self, raw_image, raw_spacing, verbose=True):
639
+ """
640
+ Run inference on the input image.
641
+
642
+ This method performs the complete inference pipeline:
643
+ 1. Load models (vision backbone and text encoder)
644
+ 2. Preprocess image data (resampling, padding, patch splitting)
645
+ 3. Encode text prompts
646
+ 4. Process patches and aggregate predictions
647
+ 5. Post-process results (remove padding, resample to original shape)
648
+
649
+ Args:
650
+ raw_image: Input image array with shape (d, h, w)
651
+ raw_spacing: Spacing array with shape (3,)
652
+ verbose: Whether to print detailed information (default: True)
653
+
654
+ Returns:
655
+ pred_array: Segmentation array with shape (d, h, w), dtype int16
656
+ max_prob_array: Maximum probability array (if return_max_prob=True), or None
657
+ """
658
+ model, text_encoder = self._load_model()
659
+ pred_array = None
660
+ crop_size = self.config['crop_size']
661
+ disable_tta = self.config['disable_tta']
662
+ instance_label = self.config['instance_label']
663
+ modality = self.config['modality']
664
+ text_prompts = self.config['texts']
665
+ label_values = self.config['label_values']
666
+ return_max_prob = self.config['return_max_prob']
667
+ class_name_list = self.config['class_name_list']
668
+ stage_1_flag = self.config['stage_1_flag']
669
+ with torch.no_grad():
670
+ # Gaussian is kept on CPU, as accumulation will now happen on CPU
671
+ gaussian = torch.tensor(compute_gaussian(tuple(crop_size)), dtype=torch.float32).cpu()
672
+
673
+ data_dict = read_npz_data(
674
+ raw_image=raw_image,
675
+ raw_spacing=raw_spacing,
676
+ crop_size=crop_size,
677
+ target_spacing=self.config['target_spacing'],
678
+ scaled_roi_lowres_pred_array=self.config['scaled_roi_lowres_pred_array'],
679
+ class_name_list=class_name_list,
680
+ stage_1_flag=stage_1_flag,
681
+ device=self.device,
682
+ verbose=verbose
683
+ )
684
+
685
+ spacing = data_dict['spacing']
686
+ original_shape = data_dict['original_shape']
687
+ current_shape = data_dict['current_shape']
688
+ batched_patches = data_dict['patches']
689
+ batched_y1y2_x1x2_z1z2 = data_dict['y1y2_x1x2_z1z2_ls']
690
+ padding_info = data_dict['padding_info']
691
+ raw_image = data_dict['raw_image']
692
+ num_iterations = data_dict['num_iterations']
693
+ batched_lowres_pred_patches = data_dict.get('lowres_pred_patches')
694
+
695
+ modality_code = torch.tensor([{
696
+ 'ct': 0, 'mri': 1, 'us': 2, 'pet': 3, 'microscopy': 4
697
+ }[modality]]).to(self.device) # Keep modality_code on GPU if text_encoder needs it on GPU
698
+
699
+ h, w, d = current_shape
700
+ n_total_classes = len(text_prompts)
701
+
702
+ # Get category batch size from config, default to 24
703
+ category_batch_size = self.config.get('category_batch_size', 24)
704
+ background_threshold = self.config.get('background_threshold', 0.5)
705
+
706
+ # Initialize max_prob and max_class_label_value on CPU to save GPU memory
707
+ max_prob = torch.zeros((h, w, d), dtype=torch.float32, device='cpu')
708
+ max_class_label_value = torch.zeros((h, w, d), dtype=torch.int16, device='cpu')
709
+
710
+ # Process categories in batches to avoid OOM
711
+ category_range = range(0, n_total_classes, category_batch_size)
712
+ pbar = tqdm(category_range, desc="Processing Categories")
713
+ for i in pbar:
714
+ current_category_texts = text_prompts[i:i + category_batch_size]
715
+ current_label_values = label_values[i:i + category_batch_size]
716
+ current_n = len(current_category_texts)
717
+ end_idx = min(i + current_n - 1, n_total_classes - 1)
718
+
719
+ # Update progress bar description with current category range
720
+ pbar.set_description(f"Processing Categories {i}-{end_idx}")
721
+
722
+ # Keep these large tensors on CPU for accumulation
723
+ temp_prediction_batch_cpu = torch.zeros((current_n, h, w, d), dtype=torch.float32, device='cpu')
724
+ temp_accumulation_batch_cpu = torch.zeros((current_n, h, w, d), dtype=torch.float32, device='cpu')
725
+
726
+ # Encode text prompts for current batch
727
+ with autocast(enabled=False):
728
+ queries = text_encoder(current_category_texts, modality_code, self.device) # queries remain on GPU for model input
729
+
730
+ # Process patches for current category batch
731
+ for patches, lowres_pred_patches, y1y2_x1x2_z1z2_ls in tqdm(
732
+ zip(batched_patches, batched_lowres_pred_patches if batched_lowres_pred_patches is not None else [None]*len(batched_patches), batched_y1y2_x1x2_z1z2),
733
+ total=len(batched_patches),
734
+ desc="Processing",
735
+ ncols=100,
736
+ bar_format="{l_bar}{bar:20}{r_bar}",
737
+ colour="green",
738
+ leave=False
739
+ ):
740
+ patches = patches.unsqueeze(0).to(device=self.device, dtype=torch.float32) # patches on GPU for model input
741
+ y1, y2, x1, x2, z1, z2 = y1y2_x1x2_z1z2_ls
742
+
743
+ simulated_lowres_sc_pred = None
744
+ simulated_lowres_mc_pred = None
745
+
746
+ if not self.config['w_lowres_pred_prompts']:
747
+ simulated_lowres_sc_pred = torch.zeros((1, 1, *crop_size), device=self.device, dtype=torch.float32)
748
+ simulated_lowres_mc_pred = torch.zeros((1, current_n, *crop_size), device=self.device, dtype=torch.float32)
749
+ prediction_patch = model(
750
+ queries=queries,
751
+ image_input=patches,
752
+ simulated_lowres_sc_pred=simulated_lowres_sc_pred,
753
+ simulated_lowres_mc_pred=simulated_lowres_mc_pred,
754
+ train_mode=False
755
+ ) if self.config['disable_tta'] else internal_maybe_mirror_and_predict(
756
+ model=model,
757
+ queries=queries,
758
+ image_input=patches,
759
+ simulated_lowres_sc_pred=simulated_lowres_sc_pred,
760
+ simulated_lowres_mc_pred=simulated_lowres_mc_pred,
761
+ mirror_axes=(0, 1, 2)
762
+ )
763
+ else:
764
+ lowres_pred_patches = lowres_pred_patches.unsqueeze(0).to(device=self.device, dtype=torch.float32)
765
+ simulated_lowres_sc_pred = torch.where(lowres_pred_patches > 0, torch.ones_like(lowres_pred_patches), torch.zeros_like(lowres_pred_patches))
766
+ simulated_lowres_mc_pred = sc_mask_to_mc_mask(lowres_pred_patches, [int(val) for val in current_label_values])
767
+
768
+ possible_block_sizes = [8]
769
+ if instance_label == 1:
770
+ n_repeats = 1
771
+ else:
772
+ n_repeats = 1
773
+ prediction_patch = compute_patch_prediction(queries, patches, simulated_lowres_sc_pred, simulated_lowres_mc_pred, model, possible_block_sizes, n_repeats, disable_tta)
774
+
775
+ if instance_label == 1: # Instance segmentation mode
776
+ for _ in range(num_iterations):
777
+ prediction_patch_prob = torch.sigmoid(prediction_patch).detach()
778
+ simulated_lowres_mc_pred = torch.where(prediction_patch_prob > 0.5, 1.0, 0.0)
779
+ simulated_lowres_sc_pred = (simulated_lowres_mc_pred.sum(dim=1, keepdim=True) > 0).float()
780
+ possible_block_sizes = [4]
781
+ n_repeats = 1
782
+ prediction_patch = compute_patch_prediction(queries, patches, simulated_lowres_sc_pred, simulated_lowres_mc_pred, model, possible_block_sizes, n_repeats, disable_tta)
783
+
784
+ prediction_patch_prob_gpu = torch.sigmoid(prediction_patch).detach()
785
+ current_gaussian_slice = gaussian[:y2-y1, :x2-x1, :z2-z1] # Already on CPU
786
+
787
+ # Perform accumulation on CPU. Move prediction_patch_prob_gpu to CPU here.
788
+ temp_prediction_batch_cpu[:, y1:y2, x1:x2, z1:z2] += (prediction_patch_prob_gpu[0, :, :y2-y1, :x2-x1, :z2-z1].cpu() * current_gaussian_slice)
789
+ temp_accumulation_batch_cpu[:, y1:y2, x1:x2, z1:z2] += current_gaussian_slice
790
+
791
+ # Explicitly delete GPU tensors to free up memory immediately
792
+ del prediction_patch, prediction_patch_prob_gpu, patches
793
+ if simulated_lowres_sc_pred is not None:
794
+ del simulated_lowres_sc_pred
795
+ if simulated_lowres_mc_pred is not None:
796
+ del simulated_lowres_mc_pred
797
+ torch.cuda.empty_cache() # Clear any cached GPU memory after each patch processing
798
+ gc.collect() # Python garbage collection
799
+
800
+ # Normalize predictions by accumulation
801
+ batch_accumulation_cpu = temp_accumulation_batch_cpu
802
+ batch_accumulation_cpu[batch_accumulation_cpu == 0] = 1e-8
803
+ batch_prediction_prob_cpu = temp_prediction_batch_cpu / batch_accumulation_cpu
804
+
805
+ # Update max_prob and max_class_label_value on CPU
806
+ for j in range(current_n):
807
+ class_prob_cpu = batch_prediction_prob_cpu[j, ...] # Already on CPU
808
+ class_label_value_cpu_scalar = torch.tensor(int(current_label_values[j]), dtype=torch.int16, device='cpu') # Already on CPU
809
+
810
+ update_mask_cpu = class_prob_cpu > max_prob
811
+ max_prob[update_mask_cpu] = class_prob_cpu[update_mask_cpu]
812
+ max_class_label_value[update_mask_cpu] = class_label_value_cpu_scalar
813
+
814
+ # Clean up batch tensors
815
+ del temp_prediction_batch_cpu, temp_accumulation_batch_cpu, batch_accumulation_cpu, batch_prediction_prob_cpu, queries
816
+ # Previous patch-level deletions handle GPU memory
817
+
818
+ # Final operations on CPU
819
+ background_indices = max_prob < background_threshold
820
+ max_class_label_value[background_indices] = 0
821
+ results = max_class_label_value.numpy() # Already on CPU, just convert to numpy
822
+
823
+ results = remove_padding(results, padding_info)
824
+ current_h, current_w, current_d = results.shape
825
+ if results.shape != original_shape:
826
+ results = resample_torch_simple(
827
+ results[np.newaxis, ...],
828
+ new_shape=original_shape,
829
+ is_seg=True,
830
+ num_threads=4,
831
+ device=torch.device('cpu'),
832
+ memefficient_seg_resampling=False).squeeze(0)
833
+
834
+ if verbose:
835
+ print(f"Resized segmentation from {current_h, current_w, current_d} to {original_shape}")
836
+
837
+ pred_array = rearrange(results, 'h w d -> d h w').astype(np.int16)
838
+
839
+ if return_max_prob and instance_label == 0:
840
+ # max_prob is already on CPU, just convert to numpy for post-processing
841
+ max_prob_numpy = max_prob.numpy()
842
+ max_prob_numpy = remove_padding(max_prob_numpy, padding_info)
843
+ current_h, current_w, current_d = max_prob_numpy.shape
844
+ if max_prob_numpy.shape != original_shape:
845
+ max_prob_numpy = resample_torch_simple(
846
+ max_prob_numpy[np.newaxis, ...],
847
+ new_shape=original_shape,
848
+ is_seg=False,
849
+ num_threads=4,
850
+ device=torch.device('cpu'),
851
+ memefficient_seg_resampling=False).squeeze(0)
852
+
853
+ if verbose:
854
+ print(f"Resized max probability from {current_h, current_w, current_d} to {original_shape}")
855
+ max_prob = rearrange(max_prob_numpy, 'h w d -> d h w').astype(np.float32)
856
+
857
+ if return_max_prob and instance_label == 0:
858
+ return pred_array, max_prob
859
+ else:
860
+ return pred_array, None
861
+
862
+
863
+ def run_segmentation(
864
+ raw_image,
865
+ raw_spacing,
866
+ crop_size=[192, 192, 96],
867
+ target_spacing=[1.5, 1.5, 3.0],
868
+ target_spacing_model=[1.5, 1.5, 3.0],
869
+ w_lowres_pred_prompts=False,
870
+ scaled_roi_lowres_pred_array=None,
871
+ disable_tta=True,
872
+ model_step=100000,
873
+ vision_backbone="UNET",
874
+ input_channels=2,
875
+ patch_size=[32, 32, 32],
876
+ modality='CT',
877
+ instance_label=0,
878
+ texts=[],
879
+ label_values=[],
880
+ return_max_prob=False,
881
+ class_name_list=[],
882
+ stage_1_flag=False,
883
+ device="cuda:0",
884
+ checkpoints_path="./checkpoints",
885
+ category_batch_size=24,
886
+ background_threshold=0.5,
887
+ verbose=True,
888
+ ):
889
+ """
890
+ Main segmentation function.
891
+
892
+ This function orchestrates the entire segmentation pipeline including
893
+ model loading, data preprocessing, patch-based inference, and result aggregation.
894
+
895
+ Args:
896
+ raw_image: Input image array with shape (d, h, w), dtype uint8, values in [0, 255]
897
+ raw_spacing: Spacing array with shape (3,)
898
+ crop_size: Crop size for patch processing [h, w, d]
899
+ target_spacing: Target spacing for resampling [h, w, d]
900
+ target_spacing_model: Target spacing for model (should match target_spacing)
901
+ w_lowres_pred_prompts: Whether to use low-res predictions as spatial prompts
902
+ scaled_roi_lowres_pred_array: Low-res prediction array for spatial prompts
903
+ disable_tta: Disable test-time augmentation
904
+ model_step: Model checkpoint step number
905
+ vision_backbone: Vision backbone architecture name
906
+ input_channels: Number of input channels
907
+ patch_size: Patch size for the model
908
+ modality: Imaging modality ('CT', 'MRI', 'US', 'PET', 'microscopy')
909
+ instance_label: 0 for semantic segmentation, 1 for instance segmentation
910
+ texts: List of text prompts (one per class)
911
+ label_values: List of label values (one per class)
912
+ return_max_prob: Whether to return maximum probability map
913
+ class_name_list: List of class names for class-specific adjustments
914
+ stage_1_flag: Whether this is Stage 1 inference
915
+ device: Device string (e.g., 'cuda:0' or 'cpu')
916
+ checkpoints_path: Path to model checkpoints directory
917
+ category_batch_size: Number of categories to process in each batch (default: 24)
918
+ Adjust based on GPU memory. Larger 3D images require smaller batch sizes.
919
+ Accumulation operations are performed on CPU for more stable memory usage.
920
+ background_threshold: Probability threshold for background (default: 0.5)
921
+ Voxels with max probability below this threshold will be labeled as background.
922
+ verbose: Whether to print detailed information (default: True)
923
+
924
+ Returns:
925
+ pred_array: Segmentation array with shape (d, h, w), dtype int16
926
+ max_prob_array: Maximum probability array (if return_max_prob=True), or None
927
+ """
928
+ w_lowres_pred_prompts = scaled_roi_lowres_pred_array is not None
929
+ config = {
930
+ 'device': device,
931
+ 'modality': modality,
932
+ 'instance_label': instance_label,
933
+ 'texts': texts,
934
+ 'label_values': label_values,
935
+ 'vision_backbone': vision_backbone,
936
+ 'crop_size': crop_size,
937
+ 'patch_size': patch_size,
938
+ 'target_spacing': target_spacing,
939
+ 'target_spacing_model': target_spacing_model,
940
+ 'model_step': model_step,
941
+ 'input_channels': input_channels,
942
+ 'w_lowres_pred_prompts': w_lowres_pred_prompts,
943
+ 'scaled_roi_lowres_pred_array': scaled_roi_lowres_pred_array,
944
+ 'disable_tta': disable_tta,
945
+ 'checkpoints_path': checkpoints_path,
946
+ 'return_max_prob': return_max_prob,
947
+ 'class_name_list': class_name_list,
948
+ 'stage_1_flag': stage_1_flag,
949
+ 'category_batch_size': category_batch_size,
950
+ 'background_threshold': background_threshold,
951
+ }
952
+
953
+ pipeline = MedicalSegmentationPipeline(config)
954
+ return pipeline.run_inference(raw_image, raw_spacing, verbose=verbose)
955
+
956
+
957
+ # ============================================================================
958
+ # Main Inference Functions
959
+ # ============================================================================
960
+ # These functions provide the high-level interface for running inference
961
+ # on raw NIfTI images with proper preprocessing and post-processing.
962
+ # ============================================================================
963
+
964
+
965
+ def normalize_image_ct(image_data, window_level=40, window_width=400, window_type='soft_tissue'):
966
+ """
967
+ Normalize CT image using window/level technique.
968
+
969
+ Args:
970
+ image_data: Input CT image array
971
+ window_level: Window level (center of the window). If None, will use default based on window_type
972
+ window_width: Window width (range of the window). If None, will use default based on window_type
973
+ window_type: Type of window ('soft_tissue', 'bone', 'lung'). Used if window_level/window_width are None
974
+
975
+ Returns:
976
+ Normalized image array with dtype uint8, values in [0, 255]
977
+ """
978
+ # Default window settings for different window types
979
+ default_windows = {
980
+ 'soft_tissue': {'window_level': 40, 'window_width': 400},
981
+ 'bone': {'window_level': 500, 'window_width': 1500},
982
+ 'lung': {'window_level': -600, 'window_width': 1500}
983
+ }
984
+
985
+ # Use defaults if not provided
986
+ if window_level is None or window_width is None:
987
+ if window_type in default_windows:
988
+ window_level = default_windows[window_type]['window_level']
989
+ window_width = default_windows[window_type]['window_width']
990
+ else:
991
+ # Fallback to soft_tissue defaults
992
+ window_level = default_windows['soft_tissue']['window_level']
993
+ window_width = default_windows['soft_tissue']['window_width']
994
+
995
+ lower_bound = window_level - window_width / 2
996
+ upper_bound = window_level + window_width / 2
997
+ image_data_pre = np.clip(image_data, lower_bound, upper_bound)
998
+ image_data_pre = (
999
+ (image_data_pre - np.min(image_data_pre))
1000
+ / (np.max(image_data_pre) - np.min(image_data_pre) + 1e-8)
1001
+ * 255.0
1002
+ )
1003
+ return image_data_pre.astype(np.uint8)
1004
+
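# --- Editorial usage sketch (not part of the uploaded file); assumes normalize_image_ct above ---
import numpy as np
ct_hu = np.array([-1000.0, -160.0, 40.0, 240.0, 1000.0])
# soft-tissue window (level 40, width 400) clips HU to [-160, 240] before rescaling to uint8
print(normalize_image_ct(ct_hu, window_level=40, window_width=400))   # -> [0 0 127 254 254] (the +1e-8 guard truncates the top of the window)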
1005
+
1006
+ def normalize_image_other(image_data, percentile_lower=None, percentile_upper=None, preserve_zero=None, normalization_settings=None):
1007
+ """
1008
+ Normalize non-CT images using percentile-based normalization.
1009
+
1010
+ This method clips values to specified percentiles, then
1011
+ normalizes to [0, 255] range while optionally preserving zero values.
1012
+
1013
+ Args:
1014
+ image_data: Input image array
1015
+ percentile_lower: Lower percentile for clipping. If None, will use default or value from normalization_settings
1016
+ percentile_upper: Upper percentile for clipping. If None, will use default or value from normalization_settings
1017
+ preserve_zero: Whether to preserve zero values. If None, will use default or value from normalization_settings
1018
+ normalization_settings: Dictionary containing normalization settings from config.
1019
+ Format: {'percentile_lower': 0.5, 'percentile_upper': 99.5, 'preserve_zero': True}
1020
+
1021
+ Returns:
1022
+ Normalized image array with dtype uint8, values in [0, 255]
1023
+ """
1024
+ # Default normalization settings
1025
+ default_percentile_lower = 0.5
1026
+ default_percentile_upper = 99.5
1027
+ default_preserve_zero = True
1028
+
1029
+ # Use settings from config if provided
1030
+ if normalization_settings is not None:
1031
+ if percentile_lower is None:
1032
+ percentile_lower = normalization_settings.get('percentile_lower', default_percentile_lower)
1033
+ if percentile_upper is None:
1034
+ percentile_upper = normalization_settings.get('percentile_upper', default_percentile_upper)
1035
+ if preserve_zero is None:
1036
+ preserve_zero = normalization_settings.get('preserve_zero', default_preserve_zero)
1037
+ else:
1038
+ # Use defaults if not provided
1039
+ if percentile_lower is None:
1040
+ percentile_lower = default_percentile_lower
1041
+ if percentile_upper is None:
1042
+ percentile_upper = default_percentile_upper
1043
+ if preserve_zero is None:
1044
+ preserve_zero = default_preserve_zero
1045
+
1046
+ # Calculate percentiles from non-zero values
1047
+ non_zero_data = image_data[image_data > 0]
1048
+ if len(non_zero_data) > 0:
1049
+ lower_bound, upper_bound = np.percentile(
1050
+ non_zero_data, [percentile_lower, percentile_upper]
1051
+ )
1052
+ else:
1053
+ # If all values are zero, use min/max
1054
+ lower_bound = np.min(image_data)
1055
+ upper_bound = np.max(image_data)
1056
+
1057
+ image_data_pre = np.clip(image_data, lower_bound, upper_bound)
1058
+ image_data_pre = (
1059
+ (image_data_pre - np.min(image_data_pre))
1060
+ / (np.max(image_data_pre) - np.min(image_data_pre) + 1e-8)
1061
+ * 255.0
1062
+ )
1063
+
1064
+ if preserve_zero:
1065
+ image_data_pre[image_data == 0] = 0
1066
+
1067
+ return image_data_pre.astype(np.uint8)
1068
+
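# --- Editorial usage sketch (not part of the uploaded file); assumes normalize_image_other above ---
import numpy as np
mri = np.random.default_rng(0).gamma(2.0, 100.0, size=(8, 64, 64)).astype(np.float32)
mri[:, :8, :8] = 0.0                                    # background voxels
norm = normalize_image_other(mri)                       # default 0.5 / 99.5 percentile clipping
print(norm.dtype, int(norm[0, 0, 0]), int(norm.min()))  # uint8 0 0 -- zero background is preserved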
1069
+
1070
+ def load_nifti_image(image_path):
1071
+ """
1072
+ Load NIfTI image and extract data, spacing, and metadata.
1073
+
1074
+ Args:
1075
+ image_path: Path to NIfTI image file
1076
+
1077
+ Returns:
1078
+ image_data: Image array with shape (d, h, w)
1079
+ spacing_xyz: Spacing tuple (x, y, z) from SimpleITK
1080
+ metadata: Dictionary containing origin, direction, and spacing_xyz
1081
+ """
1082
+ img_sitk = sitk.ReadImage(image_path)
1083
+ image_data = sitk.GetArrayFromImage(img_sitk) # Shape: (d, h, w)
1084
+ spacing_xyz = img_sitk.GetSpacing() # (x, y, z)
1085
+
1086
+ # Save metadata for output
1087
+ metadata = {
1088
+ 'origin': img_sitk.GetOrigin(),
1089
+ 'direction': img_sitk.GetDirection(),
1090
+ 'spacing_xyz': spacing_xyz
1091
+ }
1092
+
1093
+ return image_data, spacing_xyz, metadata
1094
+
1095
+
1096
+ def convert_spacing(spacing_xyz, image_shape):
1097
+ """
1098
+ Convert spacing from SimpleITK format (x, y, z) to format expected by run_segmentation.
1099
+
1100
+ Following the conversion logic from inference_raw_nifti_2.py:
1101
+ 1. SimpleITK returns (x, y, z)
1102
+ 2. Image from SimpleITK is (d, h, w) where d=z, h=y, w=x
1103
+ 3. Convert to (d, h, w) spacing: (z, x, y) = (d, h, w)
1104
+ 4. Then convert to format expected by run_segmentation: (h, w, d)
1105
+
1106
+ Args:
1107
+ spacing_xyz: Spacing tuple from SimpleITK (x, y, z)
1108
+ image_shape: Image shape (d, h, w)
1109
+
1110
+ Returns:
1111
+ img_spacing: Spacing array in format expected by run_segmentation
1112
+ """
1113
+ img_spacing = np.array(spacing_xyz, dtype=np.float32)
1114
+
1115
+ # Step 1: Convert from (x, y, z) to (d, h, w) spacing
1116
+ # SimpleITK: (x, y, z) -> Image: (d, h, w) where d=z, h=y, w=x
1117
+ # So spacing (x, y, z) -> (z, x, y) = (d, h, w)
1118
+ img_spacing_transposed = img_spacing[[2, 0, 1]] # (z, x, y) = (d, h, w)
1119
+
1120
+ # Step 2: Handle very small spacing values
1121
+ for i in range(3):
1122
+ if img_spacing_transposed[i] < 0.1:
1123
+ img_spacing_transposed[i] = 1.0
1124
+
1125
+ # Step 3: Optional: Adjust spacing based on image dimensions
1126
+ # Note: adjust_spacing expects image in (h, w, d) format, so we need to rearrange
1127
+ # Here we pass a dummy zeros array of the raw (d, h, w) shape purely as a shape reference
1128
+ try:
1129
+ img_spacing_transposed = adjust_spacing(
1130
+ np.zeros(image_shape), # Dummy array for shape reference
1131
+ img_spacing_transposed
1132
+ ).astype(np.float32)
1133
+ except Exception:
1134
+ # If adjust_spacing fails, use spacing as-is
1135
+ pass
1136
+
1137
+ # Step 4: Convert to format expected by run_segmentation
1138
+ # This converts (d, h, w) to (h, w, d)
1139
+ img_spacing = img_spacing_transposed[[1, 2, 0]]
1140
+
1141
+ return img_spacing
1142
+
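# --- Editorial usage sketch (not part of the uploaded file); assumes convert_spacing above ---
# SimpleITK reports spacing as (x, y, z) while GetArrayFromImage returns a (d, h, w) array;
# convert_spacing permutes the values into the (h, w, d) order expected by run_segmentation.
print(convert_spacing((0.7, 0.7, 3.0), image_shape=(40, 512, 512)))   # -> [0.7 0.7 3.0]; through-plane spacing lands on the last (d) axis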
1143
+
1144
+ def run_inference_single_window(
1145
+ image_data,
1146
+ spacing_xyz,
1147
+ metadata,
1148
+ modality='CT',
1149
+ texts=None,
1150
+ label_values=None,
1151
+ inference_mode='stage2_only',
1152
+ device="cuda:0",
1153
+ checkpoints_path="./checkpoints",
1154
+ window_settings=None,
1155
+ window_type='soft_tissue',
1156
+ normalization_settings=None,
1157
+ verbose=True
1158
+ ):
1159
+ """
1160
+ Run inference for a single window type.
1161
+
1162
+ This is an internal function used by run_inference to handle single window type inference.
1163
+
1164
+ Args:
1165
+ image_data: Raw image data array (d, h, w)
1166
+ spacing_xyz: Spacing tuple (x, y, z)
1167
+ metadata: Image metadata dictionary
1168
+ modality: Imaging modality ('CT', 'MRI', 'US', 'PET', 'microscopy')
1169
+ texts: List of text prompts (one per class)
1170
+ label_values: List of label values (one per class)
1171
+ inference_mode: Inference mode ('stage2_only' or 'stage1+stage2')
1172
+ device: Device to use ('cuda:0' or 'cpu')
1173
+ checkpoints_path: Path to model checkpoints
1174
+ window_settings: Dictionary containing window settings for different window types (CT only)
1175
+ window_type: Type of window to use ('soft_tissue', 'bone', 'lung')
1176
+ normalization_settings: Dictionary containing normalization settings for non-CT modalities
1177
+ verbose: Whether to print detailed information (default: True)
1178
+
1179
+ Returns:
1180
+ pred_array: Segmentation array (d, h, w)
1181
+ """
1182
+ if texts is None:
1183
+ texts = []
1184
+ if label_values is None:
1185
+ label_values = []
1186
+
1187
+ if len(texts) != len(label_values):
1188
+ raise ValueError("Number of text prompts must match number of label values")
1189
+
1190
+ # Normalize image
1191
+ if verbose:
1192
+ print(f"Normalizing image for {window_type} window (modality: {modality})")
1193
+ if modality.upper() == 'CT':
1194
+ # Get window settings from config if available
1195
+ window_level = None
1196
+ window_width = None
1197
+ if window_settings is not None and window_type in window_settings:
1198
+ window_level = window_settings[window_type].get('window_level')
1199
+ window_width = window_settings[window_type].get('window_width')
1200
+ if verbose:
1201
+ print(f"Using {window_type} window: level={window_level}, width={window_width}")
1202
+
1203
+ img_array = normalize_image_ct(image_data, window_level=window_level,
1204
+ window_width=window_width, window_type=window_type)
1205
+ else:
1206
+ # Get normalization settings from config if available
1207
+ if normalization_settings is not None:
1208
+ if verbose:
1209
+ print(f"Using normalization settings from config: {normalization_settings}")
1210
+ img_array = normalize_image_other(image_data, normalization_settings=normalization_settings)
1211
+ else:
1212
+ # Use default normalization
1213
+ if verbose:
1214
+ print("Using default normalization settings")
1215
+ img_array = normalize_image_other(image_data)
1216
+
1217
+ if verbose:
1218
+ print(f"Normalized image range: [{img_array.min()}, {img_array.max()}]")
1219
+
1220
+ # Convert spacing
1221
+ img_spacing = convert_spacing(spacing_xyz, img_array.shape)
1222
+ if verbose:
1223
+ print(f"Converted spacing: {img_spacing}")
1224
+
1225
+ # Run inference
1226
+ if inference_mode == 'stage1+stage2':
1227
+ if verbose:
1228
+ print(f"Running two-stage inference with {window_type} window...")
1229
+ # Stage 1: Low-resolution
1230
+ if verbose:
1231
+ print("Stage 1: Low-resolution segmentation...")
1232
+ stage_1_pred, _ = run_segmentation(
1233
+ raw_image=img_array,
1234
+ raw_spacing=img_spacing,
1235
+ crop_size=[224, 224, 128],
1236
+ target_spacing=[1.5, 1.5, 3.0],
1237
+ target_spacing_model=[1.5, 1.5, 3.0],
1238
+ w_lowres_pred_prompts=False,
1239
+ scaled_roi_lowres_pred_array=None,
1240
+ disable_tta=True,
1241
+ model_step=358600,
1242
+ modality=modality.lower(),
1243
+ instance_label=0,
1244
+ texts=texts,
1245
+ label_values=label_values,
1246
+ return_max_prob=False,
1247
+ class_name_list=[],
1248
+ stage_1_flag=True,
1249
+ device=device,
1250
+ checkpoints_path=checkpoints_path,
1251
+ verbose=verbose
1252
+ )
1253
+
1254
+ # Check if Stage 1 found anything
1255
+ if stage_1_pred.sum() == 0:
1256
+ if verbose:
1257
+ print("Warning: Stage 1 found no predictions. Using Stage 1 result as final output.")
1258
+ final_pred = stage_1_pred
1259
+ else:
1260
+ if verbose:
1261
+ print("Stage 1 completed. Extracting ROI for Stage 2...")
1262
+
1263
+ # Remove small objects from Stage 1 prediction
1264
+ min_size = 10
1265
+ lowres_pred_binary = (stage_1_pred > 0).astype(np.int16)
1266
+ lowres_pred_binary = remove_small_objects_binary(lowres_pred_binary, min_size=min_size).astype(np.int16)
1267
+ stage_1_pred_cleaned = stage_1_pred * lowres_pred_binary
1268
+
1269
+ # Extract ROI from Stage 1 prediction
1270
+ # Find bounding box of non-zero regions
1271
+ non_zero_indices = np.argwhere(stage_1_pred_cleaned > 0)
1272
+ if len(non_zero_indices) == 0:
1273
+ if verbose:
1274
+ print("Warning: No non-zero regions after cleaning. Using Stage 1 result.")
1275
+ final_pred = stage_1_pred_cleaned
1276
+ else:
1277
+ z_min, y_min, x_min = non_zero_indices.min(axis=0)
1278
+ z_max, y_max, x_max = non_zero_indices.max(axis=0)
1279
+
1280
+ # Calculate ROI center and range with scaling factor
1281
+ m = 1.1 # Scaling factor for ROI expansion
1282
+ z_center = (z_min + z_max) / 2
1283
+ y_center = (y_min + y_max) / 2
1284
+ x_center = (x_min + x_max) / 2
1285
+
1286
+ z_range = (z_max - z_min + 1) * m / 2
1287
+ y_range = (y_max - y_min + 1) * m / 2
1288
+ x_range = (x_max - x_min + 1) * m / 2
1289
+
1290
+ # Calculate minimum ranges based on Stage 2 crop size and spacing
1291
+ stage_2_crop_size = [192, 192, 192]
1292
+ stage_2_target_spacing = [1.0, 1.0, 1.0]
1293
+
1294
+ img_spacing_for_roi = img_spacing.copy()
1295
+
1296
+ min_z_range = (stage_2_crop_size[2] / 2) * stage_2_target_spacing[2] / img_spacing_for_roi[2] if img_spacing_for_roi[2] > 0 else z_range
1297
+ min_y_range = (stage_2_crop_size[0] / 2) * stage_2_target_spacing[0] / img_spacing_for_roi[0] if img_spacing_for_roi[0] > 0 else y_range
1298
+ min_x_range = (stage_2_crop_size[1] / 2) * stage_2_target_spacing[1] / img_spacing_for_roi[1] if img_spacing_for_roi[1] > 0 else x_range
1299
+
1300
+ z_range = max(min_z_range - 1, z_range)
1301
+ y_range = max(min_y_range - 1, y_range)
1302
+ x_range = max(min_x_range - 1, x_range)
1303
+
1304
+ z_min_new = max(0, int(z_center - z_range))
1305
+ z_max_new = min(stage_1_pred_cleaned.shape[0] - 1, int(z_center + z_range))
1306
+ y_min_new = max(0, int(y_center - y_range))
1307
+ y_max_new = min(stage_1_pred_cleaned.shape[1] - 1, int(y_center + y_range))
1308
+ x_min_new = max(0, int(x_center - x_range))
1309
+ x_max_new = min(stage_1_pred_cleaned.shape[2] - 1, int(x_center + x_range))
1310
+
1311
+ if verbose:
1312
+ print(f"ROI bounds: z=[{z_min_new}:{z_max_new}], y=[{y_min_new}:{y_max_new}], x=[{x_min_new}:{x_max_new}]")
1313
+
1314
+ roi_array = img_array[z_min_new:z_max_new+1, y_min_new:y_max_new+1, x_min_new:x_max_new+1]
1315
+ roi_lowres_pred = stage_1_pred_cleaned[z_min_new:z_max_new+1, y_min_new:y_max_new+1, x_min_new:x_max_new+1]
1316
+
1317
+ if verbose:
1318
+ print(f"ROI image shape: {roi_array.shape}")
1319
+ print(f"ROI prediction shape: {roi_lowres_pred.shape}")
1320
+
1321
+ # Stage 2: High-resolution segmentation on ROI
1322
+ if verbose:
1323
+ print("Stage 2: High-resolution segmentation on ROI...")
1324
+ roi_pred, _ = run_segmentation(
1325
+ raw_image=roi_array,
1326
+ raw_spacing=img_spacing,
1327
+ crop_size=[192, 192, 192],
1328
+ target_spacing=[1.0, 1.0, 1.0],
1329
+ target_spacing_model=[1.0, 1.0, 1.0],
1330
+ w_lowres_pred_prompts=True,
1331
+ scaled_roi_lowres_pred_array=roi_lowres_pred,
1332
+ disable_tta=True,
1333
+ model_step=341300,
1334
+ modality=modality.lower(),
1335
+ instance_label=0,
1336
+ texts=texts,
1337
+ label_values=label_values,
1338
+ return_max_prob=False,
1339
+ class_name_list=[],
1340
+ stage_1_flag=False,
1341
+ device=device,
1342
+ checkpoints_path=checkpoints_path,
1343
+ verbose=verbose
1344
+ )
1345
+
1346
+ # Integrate ROI prediction back into full volume
1347
+ if verbose:
1348
+ print("Integrating Stage 2 results back into full volume...")
1349
+ final_pred = np.zeros_like(stage_1_pred_cleaned, dtype=np.int16)
1350
+ final_pred[z_min_new:z_max_new+1, y_min_new:y_max_new+1, x_min_new:x_max_new+1] = roi_pred
1351
+ if verbose:
1352
+ print("Stage1+Stage2 inference completed.")
1353
+ elif inference_mode == 'stage2_only':
1354
+ if verbose:
1355
+ print(f"Running Stage 2 inference with {window_type} window...")
1356
+ final_pred, _ = run_segmentation(
1357
+ raw_image=img_array,
1358
+ raw_spacing=img_spacing,
1359
+ crop_size=[192, 192, 192],
1360
+ target_spacing=[1.0, 1.0, 1.0],
1361
+ target_spacing_model=[1.0, 1.0, 1.0],
1362
+ w_lowres_pred_prompts=False,
1363
+ scaled_roi_lowres_pred_array=None,
1364
+ disable_tta=True,
1365
+ model_step=341300,
1366
+ modality=modality.lower(),
1367
+ instance_label=0,
1368
+ texts=texts,
1369
+ label_values=label_values,
1370
+ return_max_prob=False,
1371
+ class_name_list=[],
1372
+ stage_1_flag=False,
1373
+ device=device,
1374
+ checkpoints_path=checkpoints_path,
1375
+ verbose=verbose
1376
+ )
1377
+ else:
1378
+ raise ValueError(f"Unknown inference mode: {inference_mode}. Must be 'stage2_only' or 'stage1+stage2'")
1379
+
1380
+ return final_pred
1381
+
1382
+
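+ # The block below is an illustrative, standalone sketch (not called anywhere in the
+ # pipeline) of the ROI expansion used in the 'stage1+stage2' branch above: scale the
+ # Stage 1 bounding box by m, enforce a minimum half-extent derived from the Stage 2
+ # crop size and target spacing, and clamp to the volume bounds. The helper name and
+ # its uniform (z, y, x) argument ordering are assumptions made for illustration only.
+ def _sketch_expand_roi(bbox_min, bbox_max, shape_zyx, spacing_zyx,
+                        crop_size=(192, 192, 192), target_spacing=(1.0, 1.0, 1.0), m=1.1):
+     lo, hi = [], []
+     for axis in range(3):
+         center = (bbox_min[axis] + bbox_max[axis]) / 2
+         half = (bbox_max[axis] - bbox_min[axis] + 1) * m / 2
+         # Minimum half-extent so the ROI covers at least one Stage 2 crop at target spacing.
+         min_half = (crop_size[axis] / 2) * target_spacing[axis] / spacing_zyx[axis]
+         half = max(half, min_half - 1)
+         lo.append(max(0, int(center - half)))
+         hi.append(min(shape_zyx[axis] - 1, int(center + half)))
+     return lo, hi
+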
1383
+ def run_inference(
1384
+ image_path,
1385
+ output_path,
1386
+ modality='CT',
1387
+ texts=None,
1388
+ label_values=None,
1389
+ inference_mode='stage2_only',
1390
+ device="cuda:0",
1391
+ checkpoints_path="./checkpoints",
1392
+ window_settings=None,
1393
+ window_type='soft_tissue',
1394
+ normalization_settings=None,
1395
+ window_type_mapping=None,
1396
+ verbose=True
1397
+ ):
1398
+ """
1399
+ Run Medal-S inference on a raw NIfTI image.
1400
+
1401
+ Supports multi-window inference for CT images: if multiple window types are specified
1402
+ (e.g., soft_tissue, bone, lung), each window type will be processed separately with
1403
+ its corresponding window settings, and results will be merged.
1404
+
1405
+ Args:
1406
+ image_path: Path to input NIfTI image
1407
+ output_path: Path to save output segmentation (will be modified with mode suffix)
1408
+ modality: Imaging modality ('CT', 'MRI', 'US', 'PET', 'microscopy')
1409
+ texts: List of text prompts (one per class)
1410
+ label_values: List of label values (one per class)
1411
+ inference_mode: Inference mode ('stage2_only' or 'stage1+stage2')
1412
+ device: Device to use ('cuda:0' or 'cpu')
1413
+ checkpoints_path: Path to model checkpoints
1414
+ window_settings: Dictionary containing window settings for different window types (CT only).
1415
+ Format: {'soft_tissue': {'window_level': 40, 'window_width': 400}, ...}
1416
+ window_type: Type of window to use ('soft_tissue', 'bone', 'lung'). Default: 'soft_tissue' (CT only)
1417
+ Ignored if window_type_mapping indicates multiple window types
1418
+ normalization_settings: Dictionary containing normalization settings for non-CT modalities.
1419
+ Format: {'percentile_lower': 0.5, 'percentile_upper': 99.5, 'preserve_zero': True}
1420
+ window_type_mapping: Dictionary mapping each text to its window type.
1421
+ Format: {'text1': 'soft_tissue', 'text2': 'bone', ...}
1422
+ If provided and contains multiple window types, will perform separate inference for each
1423
+ verbose: Whether to print detailed information (default: True)
1424
+
1425
+ Returns:
1426
+ pred_array: Segmentation array (d, h, w)
1427
+ inference_time: Total inference time in seconds
1428
+ """
1429
+ if texts is None:
1430
+ texts = []
1431
+ if label_values is None:
1432
+ label_values = []
1433
+
1434
+ if len(texts) != len(label_values):
1435
+ raise ValueError("Number of text prompts must match number of label values")
1436
+
1437
+ # Add mode suffix to output filename
1438
+ if inference_mode == 'stage1+stage2':
1439
+ suffix = '_stage1+stage2'
1440
+ elif inference_mode == 'stage2_only':
1441
+ suffix = '_stage2_only'
1442
+ else:
1443
+ suffix = f'_{inference_mode}'
1444
+
1445
+ # Modify output path to include suffix
1446
+ base_path, ext = os.path.splitext(output_path)
1447
+ if ext == '.gz': # Handle .nii.gz
1448
+ base_path, nii_ext = os.path.splitext(base_path)
1449
+ output_path = f"{base_path}{suffix}{nii_ext}{ext}"
1450
+ else:
1451
+ output_path = f"{base_path}{suffix}{ext}"
1452
+
1453
+ if verbose:
1454
+ print(f"Output will be saved to: {output_path}")
1455
+
1456
+ # Start timing
1457
+ start_time = time.time()
1458
+
1459
+ # Load image
1460
+ if verbose:
1461
+ print(f"Loading image: {image_path}")
1462
+ image_data, spacing_xyz, metadata = load_nifti_image(image_path)
1463
+ if verbose:
1464
+ print(f"Image shape: {image_data.shape}")
1465
+ print(f"Original spacing (x, y, z): {spacing_xyz}")
1466
+
1467
+ # Determine inference strategy based on modality and window types
1468
+ if modality.upper() == 'CT':
1469
+ # CT modality: check for multiple window types
1470
+ if window_type_mapping is not None:
1471
+ window_types = list(set(window_type_mapping.values()))
1472
+ if len(window_types) > 1:
1473
+ # Multiple window types: perform separate inference for each window type
1474
+ if verbose:
1475
+ print(f"\n{'='*60}")
1476
+ print(f"CT with {len(window_types)} window types detected: {window_types}")
1477
+ print("Performing separate inference for each window type...")
1478
+ print(f"{'='*60}\n")
1479
+
1480
+ all_predictions = []
1481
+
1482
+ for wt in window_types:
1483
+ if verbose:
1484
+ print(f"\n{'='*60}")
1485
+ print(f"Processing {wt} window type...")
1486
+ print(f"{'='*60}\n")
1487
+
1488
+ # Filter texts and label_values for this window type
1489
+ wt_texts = [text for text in texts if window_type_mapping.get(text) == wt]
1490
+ wt_indices = [i for i, text in enumerate(texts) if window_type_mapping.get(text) == wt]
1491
+ wt_label_values = [label_values[i] for i in wt_indices]
1492
+
1493
+ if len(wt_texts) == 0:
1494
+ if verbose:
1495
+ print(f"No classes for {wt} window type, skipping...")
1496
+ continue
1497
+
1498
+ if verbose:
1499
+ print(f"Classes for {wt} window: {len(wt_texts)}")
1500
+ print(f" Texts: {wt_texts}")
1501
+ print(f" Labels: {wt_label_values}")
1502
+
1503
+ # Run inference for this window type with its specific window settings
1504
+ wt_pred = run_inference_single_window(
1505
+ image_data=image_data,
1506
+ spacing_xyz=spacing_xyz,
1507
+ metadata=metadata,
1508
+ modality=modality,
1509
+ texts=wt_texts,
1510
+ label_values=wt_label_values,
1511
+ inference_mode=inference_mode,
1512
+ device=device,
1513
+ checkpoints_path=checkpoints_path,
1514
+ window_settings=window_settings,
1515
+ window_type=wt, # Use the specific window type
1516
+ normalization_settings=normalization_settings,
1517
+ verbose=verbose
1518
+ )
1519
+
1520
+ all_predictions.append((wt_pred, wt_label_values))
1521
+
1522
+ # Merge predictions: use maximum label value when overlapping
1523
+ if verbose:
1524
+ print(f"\n{'='*60}")
1525
+ print("Merging predictions from all window types...")
1526
+ print(f"{'='*60}\n")
1527
+
1528
+ final_pred = np.zeros_like(all_predictions[0][0], dtype=np.int16)
1529
+ for wt_pred, wt_labels in all_predictions:
1530
+ # For each label in this window type's prediction
1531
+ for label_val in wt_labels:
1532
+ label_int = int(label_val)
1533
+ mask = (wt_pred == label_int)
1534
+ # Only update if current prediction is background (0) or smaller label
1535
+ final_pred[mask] = np.maximum(final_pred[mask], label_int)
1536
+
1537
+ if verbose:
1538
+ print("Merging completed.")
1539
+ else:
1540
+ # Single window type: use the specific window type
1541
+ if len(window_types) == 1:
1542
+ window_type = window_types[0]
1543
+ if verbose:
1544
+ print(f"CT with single window type: {window_type}")
1545
+
1546
+ final_pred = run_inference_single_window(
1547
+ image_data=image_data,
1548
+ spacing_xyz=spacing_xyz,
1549
+ metadata=metadata,
1550
+ modality=modality,
1551
+ texts=texts,
1552
+ label_values=label_values,
1553
+ inference_mode=inference_mode,
1554
+ device=device,
1555
+ checkpoints_path=checkpoints_path,
1556
+ window_settings=window_settings,
1557
+ window_type=window_type, # Use the determined window type
1558
+ normalization_settings=normalization_settings,
1559
+ verbose=verbose
1560
+ )
1561
+ else:
1562
+ # No window_type_mapping: use default window_type
1563
+ if verbose:
1564
+ print(f"CT without window_type_mapping, using window type: {window_type}")
1565
+ final_pred = run_inference_single_window(
1566
+ image_data=image_data,
1567
+ spacing_xyz=spacing_xyz,
1568
+ metadata=metadata,
1569
+ modality=modality,
1570
+ texts=texts,
1571
+ label_values=label_values,
1572
+ inference_mode=inference_mode,
1573
+ device=device,
1574
+ checkpoints_path=checkpoints_path,
1575
+ window_settings=window_settings,
1576
+ window_type=window_type,
1577
+ normalization_settings=normalization_settings,
1578
+ verbose=verbose
1579
+ )
1580
+ else:
1581
+ # Non-CT modality: use normalization_settings (other normalization)
1582
+ if verbose:
1583
+ print(f"Non-CT modality ({modality}): using normalization_settings")
1584
+ final_pred = run_inference_single_window(
1585
+ image_data=image_data,
1586
+ spacing_xyz=spacing_xyz,
1587
+ metadata=metadata,
1588
+ modality=modality,
1589
+ texts=texts,
1590
+ label_values=label_values,
1591
+ inference_mode=inference_mode,
1592
+ device=device,
1593
+ checkpoints_path=checkpoints_path,
1594
+ window_settings=window_settings, # Not used for non-CT
1595
+ window_type=window_type, # Not used for non-CT
1596
+ normalization_settings=normalization_settings, # Used for non-CT
1597
+ verbose=verbose
1598
+ )
1599
+
1600
+ # End timing
1601
+ end_time = time.time()
1602
+ inference_time = end_time - start_time
1603
+
1604
+ if verbose:
1605
+ print(f"\n{'='*60}")
1606
+ print(f"Inference Mode: {inference_mode}")
1607
+ print(f"Total Inference Time: {inference_time:.2f} seconds ({inference_time/60:.2f} minutes)")
1608
+ print(f"{'='*60}\n")
1609
+
1610
+ # Save result
1611
+ if verbose:
1612
+ print(f"Saving segmentation to: {output_path}")
1613
+ seg_sitk = sitk.GetImageFromArray(final_pred.astype(np.int16))
1614
+ seg_sitk.SetSpacing(metadata['spacing_xyz'])
1615
+ seg_sitk.SetOrigin(metadata['origin'])
1616
+ seg_sitk.SetDirection(metadata['direction'])
1617
+ sitk.WriteImage(seg_sitk, output_path)
1618
+ if verbose:
1619
+ print(f"Successfully saved segmentation to: {output_path}")
1620
+
1621
+ return final_pred, inference_time
1622
+
1623
+
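+ # Illustrative numpy sketch (not part of the pipeline) of the merge rule used in
+ # run_inference above: when predictions from different CT window types overlap, the
+ # voxel keeps the larger label value. The helper name is hypothetical.
+ def _sketch_merge_by_max(predictions):
+     import numpy as np
+     merged = np.zeros_like(predictions[0][0], dtype=np.int16)
+     for pred, labels in predictions:
+         for label in labels:
+             label = int(label)
+             mask = pred == label
+             merged[mask] = np.maximum(merged[mask], label)
+     return merged
+
+ # Example: merging [1, 1, 0] (soft tissue) with [0, 2, 2] (bone) yields [1, 2, 2];
+ # the overlapping middle voxel takes the larger label, 2.
+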
1624
+ def load_config_from_json(config_path):
1625
+ """
1626
+ Load configuration from JSON file.
1627
+
1628
+ Supports two formats:
1629
+ 1. Legacy format: single 'texts' array
1630
+ 2. New format: separate arrays for 'texts_soft_tissue', 'texts_bone', 'texts_lung'
1631
+
1632
+ If 'labels' field is missing or empty, automatically generates consecutive
1633
+ integer labels starting from 1 (i.e., [1, 2, 3, ..., n] where n is the
1634
+ number of texts).
1635
+
1636
+ Args:
1637
+ config_path: Path to JSON configuration file
1638
+
1639
+ Returns:
1640
+ config: Dictionary containing configuration parameters with processed labels
1641
+
1642
+ Example:
1643
+ # Legacy format:
1644
+ {"texts": ["Aorta", "Liver"], "labels": [1, 2]}
1645
+
1646
+ # New format with window types:
1647
+ {
1648
+ "texts_soft_tissue": ["Aorta", "Liver"],
1649
+ "texts_bone": ["Vertebrae C1"],
1650
+ "texts_lung": ["Left lung"],
1651
+ "window_settings": {
1652
+ "soft_tissue": {"window_level": 40, "window_width": 400},
1653
+ "bone": {"window_level": 400, "window_width": 1500},
1654
+ "lung": {"window_level": -600, "window_width": 1500}
1655
+ }
1656
+ }
1657
+ """
1658
+ with open(config_path, 'r', encoding='utf-8') as f:
1659
+ config = json.load(f)
1660
+
1661
+ # Check if using new format (separate window types)
1662
+ has_window_types = any(key in config for key in ['texts_soft_tissue', 'texts_bone', 'texts_lung'])
1663
+
1664
+ if has_window_types:
1665
+ # New format: combine all texts from different window types
1666
+ texts_soft_tissue = config.get('texts_soft_tissue', [])
1667
+ texts_bone = config.get('texts_bone', [])
1668
+ texts_lung = config.get('texts_lung', [])
1669
+
1670
+ # Combine all texts in order: soft_tissue, bone, lung
1671
+ texts = texts_soft_tissue + texts_bone + texts_lung
1672
+
1673
+ # Store window type mapping for each text
1674
+ window_type_mapping = {}
1675
+ for text in texts_soft_tissue:
1676
+ window_type_mapping[text] = 'soft_tissue'
1677
+ for text in texts_bone:
1678
+ window_type_mapping[text] = 'bone'
1679
+ for text in texts_lung:
1680
+ window_type_mapping[text] = 'lung'
1681
+
1682
+ config['texts'] = texts
1683
+ config['window_type_mapping'] = window_type_mapping
1684
+ else:
1685
+ # Legacy format: single texts array
1686
+ texts = config.get('texts', [])
1687
+ # Default all texts to soft_tissue window type for backward compatibility
1688
+ window_type_mapping = {text: 'soft_tissue' for text in texts}
1689
+ config['window_type_mapping'] = window_type_mapping
1690
+
1691
+ # Process labels: auto-generate if missing or empty
1692
+ texts = config.get('texts', [])
1693
+ labels = config.get('labels', None)
1694
+
1695
+ if labels is None or len(labels) == 0:
1696
+ # Auto-generate consecutive labels starting from 1
1697
+ labels = list(range(1, len(texts) + 1))
1698
+ print(f" Auto-generated consecutive labels: {labels}")
1699
+ else:
1700
+ # Convert labels to integers (handle both string and integer inputs)
1701
+ labels = [int(label) for label in labels]
1702
+
1703
+ # Validate that number of labels matches number of texts
1704
+ if len(labels) != len(texts):
1705
+ raise ValueError(
1706
+ f"Number of labels ({len(labels)}) must match number of texts ({len(texts)}). "
1707
+ f"Texts: {len(texts)}, Labels: {len(labels)}"
1708
+ )
1709
+
1710
+ config['labels'] = labels
1711
+ return config
1712
+
1713
+
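+ # Illustrative round-trip sketch (never called by the pipeline) of the new-format
+ # config handling in load_config_from_json, including auto-generated labels. The
+ # class names and temporary file used here are placeholders, not repository data.
+ def _example_config_roundtrip():
+     import json, tempfile
+     example = {
+         "modality": "CT",
+         "texts_soft_tissue": ["Liver"],
+         "texts_bone": ["Vertebrae C1"],
+     }
+     with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
+         json.dump(example, f)
+         path = f.name
+     cfg = load_config_from_json(path)
+     assert cfg["texts"] == ["Liver", "Vertebrae C1"]
+     assert cfg["labels"] == [1, 2]  # auto-generated: no 'labels' field in the file
+     assert cfg["window_type_mapping"] == {"Liver": "soft_tissue", "Vertebrae C1": "bone"}
+     return cfg
+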
1714
+ def main():
1715
+ """
1716
+ Main entry point for the inference script.
1717
+
1718
+ Parses command-line arguments and runs inference with the specified
1719
+ configuration.
1720
+ """
1721
+ parser = argparse.ArgumentParser(
1722
+ description="Medal-S inference for raw NIfTI images",
1723
+ formatter_class=argparse.RawDescriptionHelpFormatter,
1724
+ epilog="""
1725
+ Examples:
1726
+ # Using JSON configuration file:
1727
+ python inference_medals.py --input image.nii.gz --output result.nii.gz \\
1728
+ --config config.json --mode stage2_only
1729
+
1730
+ # Using command-line arguments:
1731
+ python inference_medals.py --input image.nii.gz --output result.nii.gz \\
1732
+ --modality CT --texts "Aorta in CT" --labels 1 --mode stage1+stage2
1733
+ """
1734
+ )
1735
+ parser.add_argument(
1736
+ "--input", "-i",
1737
+ type=str,
1738
+ required=True,
1739
+ help="Path to input NIfTI image"
1740
+ )
1741
+ parser.add_argument(
1742
+ "--output", "-o",
1743
+ type=str,
1744
+ required=True,
1745
+ help="Path to save output segmentation (suffix will be added automatically based on inference mode)"
1746
+ )
1747
+ parser.add_argument(
1748
+ "--config", "-c",
1749
+ type=str,
1750
+ default=None,
1751
+ help="Path to JSON configuration file (if provided, will override --texts, --labels, --modality)"
1752
+ )
1753
+ parser.add_argument(
1754
+ "--modality", "-m",
1755
+ type=str,
1756
+ default="CT",
1757
+ choices=['CT', 'MRI', 'US', 'PET', 'microscopy'],
1758
+ help="Imaging modality (default: CT, ignored if --config is provided)"
1759
+ )
1760
+ parser.add_argument(
1761
+ "--texts",
1762
+ type=str,
1763
+ nargs='+',
1764
+ default=None,
1765
+ help="Text prompts (one per class, ignored if --config is provided)"
1766
+ )
1767
+ parser.add_argument(
1768
+ "--labels",
1769
+ type=str,
1770
+ nargs='+',
1771
+ default=None,
1772
+ help="Label values (one per class, must match texts, ignored if --config is provided)"
1773
+ )
1774
+ parser.add_argument(
1775
+ "--mode",
1776
+ type=str,
1777
+ default="stage2_only",
1778
+ choices=['stage2_only', 'stage1+stage2'],
1779
+ help="Inference mode: 'stage2_only' (default) or 'stage1+stage2'"
1780
+ )
1781
+ parser.add_argument(
1782
+ "--device",
1783
+ type=str,
1784
+ default="cuda:0",
1785
+ help="Device to use (default: cuda:0)"
1786
+ )
1787
+ parser.add_argument(
1788
+ "--checkpoints",
1789
+ type=str,
1790
+ default="./checkpoints",
1791
+ help="Path to model checkpoints (default: ./checkpoints)"
1792
+ )
1793
+ parser.add_argument(
1794
+ "--verbose", "-v",
1795
+ action='store_true',
1796
+ default=False,
1797
+ help="Print detailed information during inference (default: False)"
1798
+ )
1799
+
1800
+ args = parser.parse_args()
1801
+ verbose = args.verbose
1802
+
1803
+ # Load configuration from JSON file if provided
1804
+ window_settings = None
1805
+ window_type = 'soft_tissue'
1806
+ normalization_settings = None
1807
+ window_type_mapping = None
1808
+
1809
+ if args.config:
1810
+ if not os.path.exists(args.config):
1811
+ raise FileNotFoundError(f"Configuration file not found: {args.config}")
1812
+ config = load_config_from_json(args.config)
1813
+ texts = config.get('texts', [])
1814
+ labels = config.get('labels', [])
1815
+ modality = config.get('modality', 'CT')
1816
+ window_settings = config.get('window_settings')
1817
+ normalization_settings = config.get('normalization_settings')
1818
+ window_type_mapping = config.get('window_type_mapping')
1819
+
1820
+ # Determine default window type based on texts (for CT only, used as fallback)
1821
+ if modality.upper() == 'CT':
1822
+ if window_type_mapping:
1823
+ window_types = list(set(window_type_mapping.values()))
1824
+ if len(window_types) == 1:
1825
+ window_type = window_types[0]
1826
+ else:
1827
+ # Default to soft_tissue if mixed types (will be handled by multi-window inference)
1828
+ window_type = 'soft_tissue'
1829
+
1830
+ # Convert labels to strings for compatibility with run_segmentation
1831
+ # (run_segmentation expects string labels)
1832
+ label_values = [str(label) for label in labels]
1833
+
1834
+ if verbose:
1835
+ print(f"Loaded configuration from: {args.config}")
1836
+ print(f" Modality: {modality}")
1837
+ print(f" Number of classes: {len(texts)}")
1838
+ print(f" Labels: {labels}")
1839
+ if modality.upper() == 'CT' and window_settings:
1840
+ print(f" Window settings available for: {list(window_settings.keys())}")
1841
+ if window_type_mapping:
1842
+ window_types = list(set(window_type_mapping.values()))
1843
+ if len(window_types) > 1:
1844
+ print(f" Multiple window types detected: {window_types}")
1845
+ print(f" Will perform separate inference for each window type")
1846
+ else:
1847
+ print(f" Using window type: {window_type}")
1848
+ else:
1849
+ print(f" Using window type: {window_type}")
1850
+ elif normalization_settings:
1851
+ print(f" Normalization settings: {normalization_settings}")
1852
+ else:
1853
+ # Use command line arguments
1854
+ if args.texts is None or args.labels is None:
1855
+ raise ValueError("Either --config or both --texts and --labels must be provided")
1856
+ texts = args.texts
1857
+ label_values = args.labels
1858
+ modality = args.modality
1859
+
1860
+ # Create output directory if needed
1861
+ output_dir = os.path.dirname(args.output)
1862
+ if output_dir and not os.path.exists(output_dir):
1863
+ os.makedirs(output_dir, exist_ok=True)
1864
+
1865
+ # Run inference
1866
+ run_inference(
1867
+ image_path=args.input,
1868
+ output_path=args.output,
1869
+ modality=modality,
1870
+ texts=texts,
1871
+ label_values=label_values,
1872
+ inference_mode=args.mode,
1873
+ device=args.device,
1874
+ checkpoints_path=args.checkpoints,
1875
+ window_settings=window_settings,
1876
+ window_type=window_type,
1877
+ normalization_settings=normalization_settings,
1878
+ window_type_mapping=window_type_mapping,
1879
+ verbose=verbose
1880
+ )
1881
+
1882
+
1883
+ if __name__ == '__main__':
1884
+ main()
1885
+
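+ # Illustrative programmatic usage (paths, prompts, and label values below are
+ # placeholders, not repository data): the same entry point the CLI wraps can be
+ # called directly from Python.
+ #
+ #     from inference_medals_nifti import run_inference
+ #
+ #     pred, seconds = run_inference(
+ #         image_path="case_0001.nii.gz",
+ #         output_path="case_0001_seg.nii.gz",
+ #         modality="CT",
+ #         texts=["Liver", "Spleen"],
+ #         label_values=["1", "2"],
+ #         inference_mode="stage2_only",
+ #     )
+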
model/SwinUNETR.py ADDED
@@ -0,0 +1,1116 @@
1
+ from typing import Sequence, Tuple, Type, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.checkpoint as checkpoint
8
+ from torch.nn import LayerNorm
9
+
10
+ from monai.networks.blocks import MLPBlock as Mlp
11
+ from monai.networks.blocks import PatchEmbed, UnetOutBlock, UnetrBasicBlock, UnetrUpBlock
12
+ from monai.networks.layers import DropPath, trunc_normal_
13
+ from monai.utils import ensure_tuple_rep, optional_import
14
+
15
+ rearrange, _ = optional_import("einops", name="rearrange")
16
+
17
+
18
+ class SwinUNETR_Enc(nn.Module):
19
+ """
20
+ Swin UNETR based on: "Hatamizadeh et al.,
21
+ Swin UNETR: Swin Transformers for Semantic Segmentation of Brain Tumors in MRI Images
22
+ <https://arxiv.org/abs/2201.01266>"
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ img_size: Union[Sequence[int], int],
28
+ in_channels: int,
29
+ depths: Sequence[int] = (2, 2, 2, 2),
30
+ num_heads: Sequence[int] = (3, 6, 12, 24),
31
+ feature_size: int = 24,
32
+ norm_name: Union[Tuple, str] = "instance",
33
+ drop_rate: float = 0.0,
34
+ attn_drop_rate: float = 0.0,
35
+ dropout_path_rate: float = 0.0,
36
+ normalize: bool = True,
37
+ use_checkpoint: bool = False,
38
+ spatial_dims: int = 3,
39
+ return_skips: bool = True,
40
+ ) -> None:
41
+ """
42
+ Args:
43
+ img_size: dimension of input image.
44
+ in_channels: dimension of input channels.
45
+ out_channels: dimension of output channels.
47
+ depths: number of layers in each stage.
48
+ num_heads: number of attention heads.
49
+ norm_name: feature normalization type and arguments.
50
+ drop_rate: dropout rate.
51
+ attn_drop_rate: attention dropout rate.
52
+ dropout_path_rate: drop path rate.
53
+ normalize: normalize output intermediate features in each stage.
54
+ use_checkpoint: use gradient checkpointing for reduced memory usage.
55
+ spatial_dims: number of spatial dims.
56
+ """
57
+
58
+ super().__init__()
59
+
60
+ self.return_skips = return_skips
61
+
62
+ img_size = ensure_tuple_rep(img_size, spatial_dims)
63
+ patch_size = ensure_tuple_rep(2, spatial_dims)
64
+ window_size = ensure_tuple_rep(7, spatial_dims)
65
+
66
+ if not (spatial_dims == 2 or spatial_dims == 3):
67
+ raise ValueError("spatial dimension should be 2 or 3.")
68
+
69
+ for m, p in zip(img_size, patch_size):
70
+ for i in range(5):
71
+ if m % np.power(p, i + 1) != 0:
72
+ raise ValueError("input image size (img_size) should be divisible by stage-wise image resolution.")
73
+
74
+ if not (0 <= drop_rate <= 1):
75
+ raise ValueError("dropout rate should be between 0 and 1.")
76
+
77
+ if not (0 <= attn_drop_rate <= 1):
78
+ raise ValueError("attention dropout rate should be between 0 and 1.")
79
+
80
+ if not (0 <= dropout_path_rate <= 1):
81
+ raise ValueError("drop path rate should be between 0 and 1.")
82
+
83
+ if feature_size % 12 != 0:
84
+ raise ValueError("feature_size should be divisible by 12.")
85
+
86
+ self.normalize = normalize
87
+
88
+ self.swinViT = SwinTransformer(
89
+ in_chans=in_channels,
90
+ embed_dim=feature_size,
91
+ window_size=window_size,
92
+ patch_size=patch_size,
93
+ depths=depths,
94
+ num_heads=num_heads,
95
+ mlp_ratio=4.0,
96
+ qkv_bias=True,
97
+ drop_rate=drop_rate,
98
+ attn_drop_rate=attn_drop_rate,
99
+ drop_path_rate=dropout_path_rate,
100
+ norm_layer=nn.LayerNorm,
101
+ use_checkpoint=use_checkpoint,
102
+ spatial_dims=spatial_dims,
103
+ )
104
+
105
+ self.encoder1 = UnetrBasicBlock( # 2 conv layers
106
+ spatial_dims=spatial_dims,
107
+ in_channels=in_channels,
108
+ out_channels=feature_size,
109
+ kernel_size=3,
110
+ stride=1,
111
+ norm_name=norm_name,
112
+ res_block=True,
113
+ )
114
+
115
+ self.encoder2 = UnetrBasicBlock(
116
+ spatial_dims=spatial_dims,
117
+ in_channels=feature_size,
118
+ out_channels=feature_size,
119
+ kernel_size=3,
120
+ stride=1,
121
+ norm_name=norm_name,
122
+ res_block=True,
123
+ )
124
+
125
+ self.encoder3 = UnetrBasicBlock(
126
+ spatial_dims=spatial_dims,
127
+ in_channels=2 * feature_size,
128
+ out_channels=2 * feature_size,
129
+ kernel_size=3,
130
+ stride=1,
131
+ norm_name=norm_name,
132
+ res_block=True,
133
+ )
134
+
135
+ self.encoder4 = UnetrBasicBlock(
136
+ spatial_dims=spatial_dims,
137
+ in_channels=4 * feature_size,
138
+ out_channels=4 * feature_size,
139
+ kernel_size=3,
140
+ stride=1,
141
+ norm_name=norm_name,
142
+ res_block=True,
143
+ )
144
+
145
+ self.encoder5 = UnetrBasicBlock(
146
+ spatial_dims=spatial_dims,
147
+ in_channels=8 * feature_size,
148
+ out_channels=8 * feature_size,
149
+ kernel_size=3,
150
+ stride=1,
151
+ norm_name=norm_name,
152
+ res_block=True,
153
+ )
154
+
155
+ self.encoder6 = UnetrBasicBlock(
156
+ spatial_dims=spatial_dims,
157
+ in_channels=16 * feature_size,
158
+ out_channels=16 * feature_size,
159
+ kernel_size=3,
160
+ stride=1,
161
+ norm_name=norm_name,
162
+ res_block=True,
163
+ )
164
+
165
+ def load_from(self, weights):
166
+
167
+ with torch.no_grad():
168
+ self.swinViT.patch_embed.proj.weight.copy_(weights["state_dict"]["module.patch_embed.proj.weight"])
169
+ self.swinViT.patch_embed.proj.bias.copy_(weights["state_dict"]["module.patch_embed.proj.bias"])
170
+ for bname, block in self.swinViT.layers1[0].blocks.named_children():
171
+ block.load_from(weights, n_block=bname, layer="layers1")
172
+ self.swinViT.layers1[0].downsample.reduction.weight.copy_(
173
+ weights["state_dict"]["module.layers1.0.downsample.reduction.weight"]
174
+ )
175
+ self.swinViT.layers1[0].downsample.norm.weight.copy_(
176
+ weights["state_dict"]["module.layers1.0.downsample.norm.weight"]
177
+ )
178
+ self.swinViT.layers1[0].downsample.norm.bias.copy_(
179
+ weights["state_dict"]["module.layers1.0.downsample.norm.bias"]
180
+ )
181
+ for bname, block in self.swinViT.layers2[0].blocks.named_children():
182
+ block.load_from(weights, n_block=bname, layer="layers2")
183
+ self.swinViT.layers2[0].downsample.reduction.weight.copy_(
184
+ weights["state_dict"]["module.layers2.0.downsample.reduction.weight"]
185
+ )
186
+ self.swinViT.layers2[0].downsample.norm.weight.copy_(
187
+ weights["state_dict"]["module.layers2.0.downsample.norm.weight"]
188
+ )
189
+ self.swinViT.layers2[0].downsample.norm.bias.copy_(
190
+ weights["state_dict"]["module.layers2.0.downsample.norm.bias"]
191
+ )
192
+ for bname, block in self.swinViT.layers3[0].blocks.named_children():
193
+ block.load_from(weights, n_block=bname, layer="layers3")
194
+ self.swinViT.layers3[0].downsample.reduction.weight.copy_(
195
+ weights["state_dict"]["module.layers3.0.downsample.reduction.weight"]
196
+ )
197
+ self.swinViT.layers3[0].downsample.norm.weight.copy_(
198
+ weights["state_dict"]["module.layers3.0.downsample.norm.weight"]
199
+ )
200
+ self.swinViT.layers3[0].downsample.norm.bias.copy_(
201
+ weights["state_dict"]["module.layers3.0.downsample.norm.bias"]
202
+ )
203
+ for bname, block in self.swinViT.layers4[0].blocks.named_children():
204
+ block.load_from(weights, n_block=bname, layer="layers4")
205
+ self.swinViT.layers4[0].downsample.reduction.weight.copy_(
206
+ weights["state_dict"]["module.layers4.0.downsample.reduction.weight"]
207
+ )
208
+ self.swinViT.layers4[0].downsample.norm.weight.copy_(
209
+ weights["state_dict"]["module.layers4.0.downsample.norm.weight"]
210
+ )
211
+ self.swinViT.layers4[0].downsample.norm.bias.copy_(
212
+ weights["state_dict"]["module.layers4.0.downsample.norm.bias"]
213
+ )
214
+
215
+ def forward(self, x_in):
216
+ # print(x_in.shape, task_id.shape)
217
+ hidden_states_out = self.swinViT(x_in, self.normalize)
218
+
219
+ enc0 = self.encoder1(x_in)
220
+ enc1 = self.encoder2(hidden_states_out[0])
221
+ enc2 = self.encoder3(hidden_states_out[1])
222
+ enc3 = self.encoder4(hidden_states_out[2])
223
+ enc4 = self.encoder5(hidden_states_out[3])
224
+ dec4 = self.encoder6(hidden_states_out[4])
225
+ # print(x_in.shape, enc0.shape, enc1.shape, enc2.shape, enc3.shape, dec4.shape)
226
+ # torch.Size([6, 1, 64, 64, 64]) torch.Size([6, 48, 64, 64, 64]) torch.Size([6, 48, 32, 32, 32])
227
+ # torch.Size([6, 96, 16, 16, 16]) torch.Size([6, 192, 8,8, 8]) torch.Size([6, 768, 2, 2, 2])
228
+
229
+ if self.return_skips:
230
+ return [enc0, enc1, enc2, enc3, enc4, dec4]
231
+ else:
232
+ return [dec4]
233
+
234
+ class SwinUNETR(nn.Module):
235
+ """
236
+ Swin UNETR based on: "Hatamizadeh et al.,
237
+ Swin UNETR: Swin Transformers for Semantic Segmentation of Brain Tumors in MRI Images
238
+ <https://arxiv.org/abs/2201.01266>"
239
+ """
240
+
241
+ def __init__(
242
+ self,
243
+ img_size: Union[Sequence[int], int],
244
+ in_channels: int,
245
+ depths: Sequence[int] = (2, 2, 2, 2),
246
+ num_heads: Sequence[int] = (3, 6, 12, 24),
247
+ feature_size: int = 24,
248
+ norm_name: Union[Tuple, str] = "instance",
249
+ drop_rate: float = 0.0,
250
+ attn_drop_rate: float = 0.0,
251
+ dropout_path_rate: float = 0.0,
252
+ normalize: bool = True,
253
+ use_checkpoint: bool = False,
254
+ spatial_dims: int = 3,
255
+ encoding: Union[Tuple, str] = 'rand_embedding', ## rand_embedding or word_embedding
256
+ deep_supervision: bool = True,
257
+ return_skips: bool = True,
258
+ ) -> None:
259
+ """
260
+ Args:
261
+ img_size: dimension of input image.
262
+ in_channels: dimension of input channels.
264
+ feature_size: dimension of network feature size.
265
+ depths: number of layers in each stage.
266
+ num_heads: number of attention heads.
267
+ norm_name: feature normalization type and arguments.
268
+ drop_rate: dropout rate.
269
+ attn_drop_rate: attention dropout rate.
270
+ dropout_path_rate: drop path rate.
271
+ normalize: normalize output intermediate features in each stage.
272
+ use_checkpoint: use gradient checkpointing for reduced memory usage.
273
+ spatial_dims: number of spatial dims.
274
+ Examples::
275
+ # for 3D single channel input with size (96,96,96) and feature size of 48.
276
+ >>> net = SwinUNETR(img_size=(96,96,96), in_channels=1, feature_size=48)
277
+ # for 3D 4-channel input with size (128,128,128) and (2,4,2,2) layers in each stage.
278
+ >>> net = SwinUNETR(img_size=(128,128,128), in_channels=4, depths=(2,4,2,2))
279
+ # for 2D 3-channel input with size (96,96) and gradient checkpointing.
280
+ >>> net = SwinUNETR(img_size=(96,96), in_channels=3, use_checkpoint=True, spatial_dims=2)
281
+ """
282
+
283
+ super().__init__()
284
+
285
+ self.deep_supervision = deep_supervision
286
+ self.return_skips = return_skips
287
+
288
+ self.encoding = encoding
289
+
290
+ img_size = ensure_tuple_rep(img_size, spatial_dims)
291
+ patch_size = ensure_tuple_rep(2, spatial_dims)
292
+ window_size = ensure_tuple_rep(7, spatial_dims)
293
+
294
+ if not (spatial_dims == 2 or spatial_dims == 3):
295
+ raise ValueError("spatial dimension should be 2 or 3.")
296
+
297
+ for m, p in zip(img_size, patch_size):
298
+ for i in range(5):
299
+ if m % np.power(p, i + 1) != 0:
300
+ raise ValueError("input image size (img_size) should be divisible by stage-wise image resolution.")
301
+
302
+ if not (0 <= drop_rate <= 1):
303
+ raise ValueError("dropout rate should be between 0 and 1.")
304
+
305
+ if not (0 <= attn_drop_rate <= 1):
306
+ raise ValueError("attention dropout rate should be between 0 and 1.")
307
+
308
+ if not (0 <= dropout_path_rate <= 1):
309
+ raise ValueError("drop path rate should be between 0 and 1.")
310
+
311
+ if feature_size % 12 != 0:
312
+ raise ValueError("feature_size should be divisible by 12.")
313
+
314
+ self.normalize = normalize
315
+
316
+ self.encoder = SwinUNETR_Enc(
317
+ img_size,
318
+ in_channels,
319
+ depths,
320
+ num_heads,
321
+ feature_size,
322
+ norm_name,
323
+ drop_rate,
324
+ attn_drop_rate,
325
+ dropout_path_rate,
326
+ normalize,
327
+ use_checkpoint,
328
+ spatial_dims,
329
+ return_skips=True
330
+ )
331
+
332
+ self.decoder5 = UnetrUpBlock( # a transpose conv layer and 2 conv layers
333
+ spatial_dims=spatial_dims,
334
+ in_channels=16 * feature_size,
335
+ out_channels=8 * feature_size,
336
+ kernel_size=3,
337
+ upsample_kernel_size=2,
338
+ norm_name=norm_name,
339
+ res_block=True,
340
+ )
341
+
342
+ self.decoder4 = UnetrUpBlock(
343
+ spatial_dims=spatial_dims,
344
+ in_channels=feature_size * 8,
345
+ out_channels=feature_size * 4,
346
+ kernel_size=3,
347
+ upsample_kernel_size=2,
348
+ norm_name=norm_name,
349
+ res_block=True,
350
+ )
351
+
352
+ self.decoder3 = UnetrUpBlock(
353
+ spatial_dims=spatial_dims,
354
+ in_channels=feature_size * 4,
355
+ out_channels=feature_size * 2,
356
+ kernel_size=3,
357
+ upsample_kernel_size=2,
358
+ norm_name=norm_name,
359
+ res_block=True,
360
+ )
361
+ self.decoder2 = UnetrUpBlock(
362
+ spatial_dims=spatial_dims,
363
+ in_channels=feature_size * 2,
364
+ out_channels=feature_size,
365
+ kernel_size=3,
366
+ upsample_kernel_size=2,
367
+ norm_name=norm_name,
368
+ res_block=True,
369
+ )
370
+
371
+ self.decoder1 = UnetrUpBlock(
372
+ spatial_dims=spatial_dims,
373
+ in_channels=feature_size,
374
+ out_channels=feature_size,
375
+ kernel_size=3,
376
+ upsample_kernel_size=2,
377
+ norm_name=norm_name,
378
+ res_block=True,
379
+ )
380
+
381
+ def forward(self, x_in):
382
+ enc0, enc1, enc2, enc3, enc4, dec4 = self.encoder(x_in)
383
+
384
+ dec3 = self.decoder5(dec4, enc4)
385
+ dec2 = self.decoder4(dec3, enc3)
386
+ dec1 = self.decoder3(dec2, enc2)
387
+ dec0 = self.decoder2(dec1, enc1)
388
+ out = self.decoder1(dec0, enc0)
389
+ # print(dec3.shape, dec2.shape, dec1.shape, dec0.shape, out.shape)
390
+ # torch.Size([6, 384, 4, 4, 4]) torch.Size([6, 192, 8, 8, 8]) torch.Size([6, 96, 16, 16, 16])
391
+ # torch.Size([6, 48, 32, 32, 32]) torch.Size([6, 48, 64, 64, 64])
392
+
393
+ if self.deep_supervision:
394
+ out_ls = [out, dec0, dec1, dec2, dec3]
395
+ else:
396
+ out_ls = [out]
397
+
398
+ if self.return_skips:
399
+ skips = [enc0, enc1, enc2, enc3, enc4, dec4]
400
+ else:
401
+ skips = [dec4]
402
+
403
+ return skips, out_ls
404
+
405
+
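+ # Minimal smoke-test sketch (an illustrative example, never called in this repo) for
+ # the modified SwinUNETR above, which returns encoder skips plus deep-supervision
+ # feature maps rather than a single logit map. Sizes below are examples only; call
+ # this only after the module is fully imported.
+ def _sketch_swinunetr_forward():
+     net = SwinUNETR(img_size=(64, 64, 64), in_channels=1, feature_size=24)
+     x = torch.zeros(1, 1, 64, 64, 64)
+     skips, outs = net(x)
+     # skips: 6 feature maps, from full resolution (feature_size channels) down to the
+     # bottleneck (16 * feature_size channels); outs[0] is the full-resolution decoder
+     # output with feature_size channels (no segmentation head is attached here).
+     return [s.shape for s in skips], [o.shape for o in outs]
+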
406
+ def window_partition(x, window_size):
407
+ """window partition operation based on: "Liu et al.,
408
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
409
+ <https://arxiv.org/abs/2103.14030>"
410
+ https://github.com/microsoft/Swin-Transformer
411
+ Args:
412
+ x: input tensor.
413
+ window_size: local window size.
414
+ """
415
+ x_shape = x.size()
416
+ if len(x_shape) == 5:
417
+ b, d, h, w, c = x_shape
418
+ x = x.view(
419
+ b,
420
+ d // window_size[0],
421
+ window_size[0],
422
+ h // window_size[1],
423
+ window_size[1],
424
+ w // window_size[2],
425
+ window_size[2],
426
+ c,
427
+ )
428
+ windows = (
429
+ x.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous().view(-1, window_size[0] * window_size[1] * window_size[2], c)
430
+ )
431
+ elif len(x_shape) == 4:
432
+ b, h, w, c = x.shape
433
+ x = x.view(b, h // window_size[0], window_size[0], w // window_size[1], window_size[1], c)
434
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0] * window_size[1], c)
435
+ return windows
436
+
437
+
438
+ def window_reverse(windows, window_size, dims):
439
+ """window reverse operation based on: "Liu et al.,
440
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
441
+ <https://arxiv.org/abs/2103.14030>"
442
+ https://github.com/microsoft/Swin-Transformer
443
+ Args:
444
+ windows: windows tensor.
445
+ window_size: local window size.
446
+ dims: dimension values.
447
+ """
448
+ if len(dims) == 4:
449
+ b, d, h, w = dims
450
+ x = windows.view(
451
+ b,
452
+ d // window_size[0],
453
+ h // window_size[1],
454
+ w // window_size[2],
455
+ window_size[0],
456
+ window_size[1],
457
+ window_size[2],
458
+ -1,
459
+ )
460
+ x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(b, d, h, w, -1)
461
+
462
+ elif len(dims) == 3:
463
+ b, h, w = dims
464
+ x = windows.view(b, h // window_size[0], w // window_size[0], window_size[0], window_size[1], -1)
465
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
466
+ return x
467
+
468
+
469
+ def get_window_size(x_size, window_size, shift_size=None):
470
+ """Computing window size based on: "Liu et al.,
471
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
472
+ <https://arxiv.org/abs/2103.14030>"
473
+ https://github.com/microsoft/Swin-Transformer
474
+ Args:
475
+ x_size: input size.
476
+ window_size: local window size.
477
+ shift_size: window shifting size.
478
+ """
479
+
480
+ use_window_size = list(window_size)
481
+ if shift_size is not None:
482
+ use_shift_size = list(shift_size)
483
+ for i in range(len(x_size)):
484
+ if x_size[i] <= window_size[i]:
485
+ use_window_size[i] = x_size[i]
486
+ if shift_size is not None:
487
+ use_shift_size[i] = 0
488
+
489
+ if shift_size is None:
490
+ return tuple(use_window_size)
491
+ else:
492
+ return tuple(use_window_size), tuple(use_shift_size)
493
+
494
+
495
+ class WindowAttention(nn.Module):
496
+ """
497
+ Window based multi-head self attention module with relative position bias based on: "Liu et al.,
498
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
499
+ <https://arxiv.org/abs/2103.14030>"
500
+ https://github.com/microsoft/Swin-Transformer
501
+ """
502
+
503
+ def __init__(
504
+ self,
505
+ dim: int,
506
+ num_heads: int,
507
+ window_size: Sequence[int],
508
+ qkv_bias: bool = False,
509
+ attn_drop: float = 0.0,
510
+ proj_drop: float = 0.0,
511
+ ) -> None:
512
+ """
513
+ Args:
514
+ dim: number of feature channels.
515
+ num_heads: number of attention heads.
516
+ window_size: local window size.
517
+ qkv_bias: add a learnable bias to query, key, value.
518
+ attn_drop: attention dropout rate.
519
+ proj_drop: dropout rate of output.
520
+ """
521
+
522
+ super().__init__()
523
+ self.dim = dim
524
+ self.window_size = window_size
525
+ self.num_heads = num_heads
526
+ head_dim = dim // num_heads
527
+ self.scale = head_dim**-0.5
528
+ mesh_args = torch.meshgrid.__kwdefaults__
529
+
530
+ if len(self.window_size) == 3:
531
+ self.relative_position_bias_table = nn.Parameter(
532
+ torch.zeros(
533
+ (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1),
534
+ num_heads,
535
+ )
536
+ )
537
+ coords_d = torch.arange(self.window_size[0])
538
+ coords_h = torch.arange(self.window_size[1])
539
+ coords_w = torch.arange(self.window_size[2])
540
+ if mesh_args is not None:
541
+ coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w, indexing="ij"))
542
+ else:
543
+ coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w))
544
+ coords_flatten = torch.flatten(coords, 1)
545
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
546
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
547
+ relative_coords[:, :, 0] += self.window_size[0] - 1
548
+ relative_coords[:, :, 1] += self.window_size[1] - 1
549
+ relative_coords[:, :, 2] += self.window_size[2] - 1
550
+ relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
551
+ relative_coords[:, :, 1] *= 2 * self.window_size[2] - 1
552
+ elif len(self.window_size) == 2:
553
+ self.relative_position_bias_table = nn.Parameter(
554
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
555
+ )
556
+ coords_h = torch.arange(self.window_size[0])
557
+ coords_w = torch.arange(self.window_size[1])
558
+ if mesh_args is not None:
559
+ coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"))
560
+ else:
561
+ coords = torch.stack(torch.meshgrid(coords_h, coords_w))
562
+ coords_flatten = torch.flatten(coords, 1)
563
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
564
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous()
565
+ relative_coords[:, :, 0] += self.window_size[0] - 1
566
+ relative_coords[:, :, 1] += self.window_size[1] - 1
567
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
568
+
569
+ relative_position_index = relative_coords.sum(-1)
570
+ self.register_buffer("relative_position_index", relative_position_index)
571
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
572
+ self.attn_drop = nn.Dropout(attn_drop)
573
+ self.proj = nn.Linear(dim, dim)
574
+ self.proj_drop = nn.Dropout(proj_drop)
575
+ trunc_normal_(self.relative_position_bias_table, std=0.02)
576
+ self.softmax = nn.Softmax(dim=-1)
577
+
578
+ def forward(self, x, mask):
579
+ b, n, c = x.shape
580
+ qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
581
+ q, k, v = qkv[0], qkv[1], qkv[2]
582
+ q = q * self.scale
583
+ attn = q @ k.transpose(-2, -1)
584
+ relative_position_bias = self.relative_position_bias_table[
585
+ self.relative_position_index.clone()[:n, :n].reshape(-1)
586
+ ].reshape(n, n, -1)
587
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
588
+ attn = attn + relative_position_bias.unsqueeze(0)
589
+ if mask is not None:
590
+ nw = mask.shape[0]
591
+ attn = attn.view(b // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
592
+ attn = attn.view(-1, self.num_heads, n, n)
593
+ attn = self.softmax(attn)
594
+ else:
595
+ attn = self.softmax(attn)
596
+
597
+ attn = self.attn_drop(attn)
598
+ x = (attn @ v).transpose(1, 2).reshape(b, n, c)
599
+ x = self.proj(x)
600
+ x = self.proj_drop(x)
601
+ return x
602
+
603
+
604
+ class SwinTransformerBlock(nn.Module):
605
+ """
606
+ Swin Transformer block based on: "Liu et al.,
607
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
608
+ <https://arxiv.org/abs/2103.14030>"
609
+ https://github.com/microsoft/Swin-Transformer
610
+ """
611
+
612
+ def __init__(
613
+ self,
614
+ dim: int,
615
+ num_heads: int,
616
+ window_size: Sequence[int],
617
+ shift_size: Sequence[int],
618
+ mlp_ratio: float = 4.0,
619
+ qkv_bias: bool = True,
620
+ drop: float = 0.0,
621
+ attn_drop: float = 0.0,
622
+ drop_path: float = 0.0,
623
+ act_layer: str = "GELU",
624
+ norm_layer: Type[LayerNorm] = nn.LayerNorm, # type: ignore
625
+ use_checkpoint: bool = False,
626
+ ) -> None:
627
+ """
628
+ Args:
629
+ dim: number of feature channels.
630
+ num_heads: number of attention heads.
631
+ window_size: local window size.
632
+ shift_size: window shift size.
633
+ mlp_ratio: ratio of mlp hidden dim to embedding dim.
634
+ qkv_bias: add a learnable bias to query, key, value.
635
+ drop: dropout rate.
636
+ attn_drop: attention dropout rate.
637
+ drop_path: stochastic depth rate.
638
+ act_layer: activation layer.
639
+ norm_layer: normalization layer.
640
+ use_checkpoint: use gradient checkpointing for reduced memory usage.
641
+ """
642
+
643
+ super().__init__()
644
+ self.dim = dim
645
+ self.num_heads = num_heads
646
+ self.window_size = window_size
647
+ self.shift_size = shift_size
648
+ self.mlp_ratio = mlp_ratio
649
+ self.use_checkpoint = use_checkpoint
650
+ self.norm1 = norm_layer(dim)
651
+ self.attn = WindowAttention(
652
+ dim,
653
+ window_size=self.window_size,
654
+ num_heads=num_heads,
655
+ qkv_bias=qkv_bias,
656
+ attn_drop=attn_drop,
657
+ proj_drop=drop,
658
+ )
659
+
660
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
661
+ self.norm2 = norm_layer(dim)
662
+ mlp_hidden_dim = int(dim * mlp_ratio)
663
+ self.mlp = Mlp(hidden_size=dim, mlp_dim=mlp_hidden_dim, act=act_layer, dropout_rate=drop, dropout_mode="swin")
664
+
665
+ def forward_part1(self, x, mask_matrix):
666
+ x_shape = x.size()
667
+ x = self.norm1(x)
668
+ if len(x_shape) == 5:
669
+ b, d, h, w, c = x.shape
670
+ window_size, shift_size = get_window_size((d, h, w), self.window_size, self.shift_size)
671
+ pad_l = pad_t = pad_d0 = 0
672
+ pad_d1 = (window_size[0] - d % window_size[0]) % window_size[0]
673
+ pad_b = (window_size[1] - h % window_size[1]) % window_size[1]
674
+ pad_r = (window_size[2] - w % window_size[2]) % window_size[2]
675
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
676
+ _, dp, hp, wp, _ = x.shape
677
+ dims = [b, dp, hp, wp]
678
+
679
+ elif len(x_shape) == 4:
680
+ b, h, w, c = x.shape
681
+ window_size, shift_size = get_window_size((h, w), self.window_size, self.shift_size)
682
+ pad_l = pad_t = 0
683
+ pad_r = (window_size[0] - h % window_size[0]) % window_size[0]
684
+ pad_b = (window_size[1] - w % window_size[1]) % window_size[1]
685
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
686
+ _, hp, wp, _ = x.shape
687
+ dims = [b, hp, wp]
688
+
689
+ if any(i > 0 for i in shift_size):
690
+ if len(x_shape) == 5:
691
+ shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))
692
+ elif len(x_shape) == 4:
693
+ shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))
694
+ attn_mask = mask_matrix
695
+ else:
696
+ shifted_x = x
697
+ attn_mask = None
698
+ x_windows = window_partition(shifted_x, window_size)
699
+ attn_windows = self.attn(x_windows, mask=attn_mask)
700
+ attn_windows = attn_windows.view(-1, *(window_size + (c,)))
701
+ shifted_x = window_reverse(attn_windows, window_size, dims)
702
+ if any(i > 0 for i in shift_size):
703
+ if len(x_shape) == 5:
704
+ x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))
705
+ elif len(x_shape) == 4:
706
+ x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))
707
+ else:
708
+ x = shifted_x
709
+
710
+ if len(x_shape) == 5:
711
+ if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
712
+ x = x[:, :d, :h, :w, :].contiguous()
713
+ elif len(x_shape) == 4:
714
+ if pad_r > 0 or pad_b > 0:
715
+ x = x[:, :h, :w, :].contiguous()
716
+
717
+ return x
718
+
719
+ def forward_part2(self, x):
720
+ return self.drop_path(self.mlp(self.norm2(x)))
721
+
722
+ def load_from(self, weights, n_block, layer):
723
+ root = f"module.{layer}.0.blocks.{n_block}."
724
+ block_names = [
725
+ "norm1.weight",
726
+ "norm1.bias",
727
+ "attn.relative_position_bias_table",
728
+ "attn.relative_position_index",
729
+ "attn.qkv.weight",
730
+ "attn.qkv.bias",
731
+ "attn.proj.weight",
732
+ "attn.proj.bias",
733
+ "norm2.weight",
734
+ "norm2.bias",
735
+ "mlp.fc1.weight",
736
+ "mlp.fc1.bias",
737
+ "mlp.fc2.weight",
738
+ "mlp.fc2.bias",
739
+ ]
740
+ with torch.no_grad():
741
+ self.norm1.weight.copy_(weights["state_dict"][root + block_names[0]])
742
+ self.norm1.bias.copy_(weights["state_dict"][root + block_names[1]])
743
+ self.attn.relative_position_bias_table.copy_(weights["state_dict"][root + block_names[2]])
744
+ self.attn.relative_position_index.copy_(weights["state_dict"][root + block_names[3]])
745
+ self.attn.qkv.weight.copy_(weights["state_dict"][root + block_names[4]])
746
+ self.attn.qkv.bias.copy_(weights["state_dict"][root + block_names[5]])
747
+ self.attn.proj.weight.copy_(weights["state_dict"][root + block_names[6]])
748
+ self.attn.proj.bias.copy_(weights["state_dict"][root + block_names[7]])
749
+ self.norm2.weight.copy_(weights["state_dict"][root + block_names[8]])
750
+ self.norm2.bias.copy_(weights["state_dict"][root + block_names[9]])
751
+ self.mlp.linear1.weight.copy_(weights["state_dict"][root + block_names[10]])
752
+ self.mlp.linear1.bias.copy_(weights["state_dict"][root + block_names[11]])
753
+ self.mlp.linear2.weight.copy_(weights["state_dict"][root + block_names[12]])
754
+ self.mlp.linear2.bias.copy_(weights["state_dict"][root + block_names[13]])
755
+
756
+ def forward(self, x, mask_matrix):
757
+ shortcut = x
758
+ if self.use_checkpoint:
759
+ x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
760
+ else:
761
+ x = self.forward_part1(x, mask_matrix)
762
+ x = shortcut + self.drop_path(x)
763
+ if self.use_checkpoint:
764
+ x = x + checkpoint.checkpoint(self.forward_part2, x)
765
+ else:
766
+ x = x + self.forward_part2(x)
767
+ return x
768
+
769
+
770
+ class PatchMerging(nn.Module):
771
+ """
772
+ Patch merging layer based on: "Liu et al.,
773
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
774
+ <https://arxiv.org/abs/2103.14030>"
775
+ https://github.com/microsoft/Swin-Transformer
776
+ """
777
+
778
+ def __init__(
779
+ self, dim: int, norm_layer: Type[LayerNorm] = nn.LayerNorm, spatial_dims: int = 3
780
+ ) -> None: # type: ignore
781
+ """
782
+ Args:
783
+ dim: number of feature channels.
784
+ norm_layer: normalization layer.
785
+ spatial_dims: number of spatial dims.
786
+ """
787
+
788
+ super().__init__()
789
+ self.dim = dim
790
+ if spatial_dims == 3:
791
+ self.reduction = nn.Linear(8 * dim, 2 * dim, bias=False)
792
+ self.norm = norm_layer(8 * dim)
793
+ elif spatial_dims == 2:
794
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
795
+ self.norm = norm_layer(4 * dim)
796
+
797
+ def forward(self, x):
798
+
799
+ x_shape = x.size()
800
+ if len(x_shape) == 5:
801
+ b, d, h, w, c = x_shape
802
+ pad_input = (h % 2 == 1) or (w % 2 == 1) or (d % 2 == 1)
803
+ if pad_input:
804
+ x = F.pad(x, (0, 0, 0, w % 2, 0, h % 2, 0, d % 2))  # F.pad fills trailing dims first: (c, w, h, d)
805
+ x0 = x[:, 0::2, 0::2, 0::2, :]
806
+ x1 = x[:, 1::2, 0::2, 0::2, :]
807
+ x2 = x[:, 0::2, 1::2, 0::2, :]
808
+ x3 = x[:, 0::2, 0::2, 1::2, :]
809
+ x4 = x[:, 1::2, 0::2, 1::2, :]
810
+ x5 = x[:, 0::2, 1::2, 0::2, :]
811
+ x6 = x[:, 0::2, 0::2, 1::2, :]
812
+ x7 = x[:, 1::2, 1::2, 1::2, :]
813
+ x = torch.cat([x0, x1, x2, x3, x4, x5, x6, x7], -1)
814
+
815
+ elif len(x_shape) == 4:
816
+ b, h, w, c = x_shape
817
+ pad_input = (h % 2 == 1) or (w % 2 == 1)
818
+ if pad_input:
819
+ x = F.pad(x, (0, 0, 0, w % 2, 0, h % 2))
820
+ x0 = x[:, 0::2, 0::2, :]
821
+ x1 = x[:, 1::2, 0::2, :]
822
+ x2 = x[:, 0::2, 1::2, :]
823
+ x3 = x[:, 1::2, 1::2, :]
824
+ x = torch.cat([x0, x1, x2, x3], -1)
825
+
826
+ x = self.norm(x)
827
+ x = self.reduction(x)
828
+ return x
829
+
830
+
831
+ def compute_mask(dims, window_size, shift_size, device):
832
+ """Computing region masks based on: "Liu et al.,
833
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
834
+ <https://arxiv.org/abs/2103.14030>"
835
+ https://github.com/microsoft/Swin-Transformer
836
+ Args:
837
+ dims: dimension values.
838
+ window_size: local window size.
839
+ shift_size: shift size.
840
+ device: device.
841
+ """
842
+
843
+ cnt = 0
844
+
845
+ if len(dims) == 3:
846
+ d, h, w = dims
847
+ img_mask = torch.zeros((1, d, h, w, 1), device=device)
848
+ for d in slice(-window_size[0]), slice(-window_size[0], -shift_size[0]), slice(-shift_size[0], None):
849
+ for h in slice(-window_size[1]), slice(-window_size[1], -shift_size[1]), slice(-shift_size[1], None):
850
+ for w in slice(-window_size[2]), slice(-window_size[2], -shift_size[2]), slice(-shift_size[2], None):
851
+ img_mask[:, d, h, w, :] = cnt
852
+ cnt += 1
853
+
854
+ elif len(dims) == 2:
855
+ h, w = dims
856
+ img_mask = torch.zeros((1, h, w, 1), device=device)
857
+ for h in slice(-window_size[0]), slice(-window_size[0], -shift_size[0]), slice(-shift_size[0], None):
858
+ for w in slice(-window_size[1]), slice(-window_size[1], -shift_size[1]), slice(-shift_size[1], None):
859
+ img_mask[:, h, w, :] = cnt
860
+ cnt += 1
861
+
862
+ mask_windows = window_partition(img_mask, window_size)
863
+ mask_windows = mask_windows.squeeze(-1)
864
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
865
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
866
+
867
+ return attn_mask
868
+
869
+
870
+ class BasicLayer(nn.Module):
871
+ """
872
+ Basic Swin Transformer layer in one stage based on: "Liu et al.,
873
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
874
+ <https://arxiv.org/abs/2103.14030>"
875
+ https://github.com/microsoft/Swin-Transformer
876
+ """
877
+
878
+ def __init__(
879
+ self,
880
+ dim: int,
881
+ depth: int,
882
+ num_heads: int,
883
+ window_size: Sequence[int],
884
+ drop_path: list,
885
+ mlp_ratio: float = 4.0,
886
+ qkv_bias: bool = False,
887
+ drop: float = 0.0,
888
+ attn_drop: float = 0.0,
889
+ norm_layer: Type[LayerNorm] = nn.LayerNorm, # type: ignore
890
+ downsample: isinstance = None, # type: ignore
891
+ use_checkpoint: bool = False,
892
+ ) -> None:
893
+ """
894
+ Args:
895
+ dim: number of feature channels.
896
+ depth: number of layers in this stage.
897
+ num_heads: number of attention heads.
898
+ window_size: local window size.
899
+ drop_path: stochastic depth rate.
900
+ mlp_ratio: ratio of mlp hidden dim to embedding dim.
901
+ qkv_bias: add a learnable bias to query, key, value.
902
+ drop: dropout rate.
903
+ attn_drop: attention dropout rate.
904
+ norm_layer: normalization layer.
905
+ downsample: downsample layer at the end of the layer.
906
+ use_checkpoint: use gradient checkpointing for reduced memory usage.
907
+ """
908
+
909
+ super().__init__()
910
+ self.window_size = window_size
911
+ self.shift_size = tuple(i // 2 for i in window_size)
912
+ self.no_shift = tuple(0 for i in window_size)
913
+ self.depth = depth
914
+ self.use_checkpoint = use_checkpoint
915
+ self.blocks = nn.ModuleList(
916
+ [
917
+ SwinTransformerBlock(
918
+ dim=dim,
919
+ num_heads=num_heads,
920
+ window_size=self.window_size,
921
+ shift_size=self.no_shift if (i % 2 == 0) else self.shift_size,
922
+ mlp_ratio=mlp_ratio,
923
+ qkv_bias=qkv_bias,
924
+ drop=drop,
925
+ attn_drop=attn_drop,
926
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
927
+ norm_layer=norm_layer,
928
+ use_checkpoint=use_checkpoint,
929
+ )
930
+ for i in range(depth)
931
+ ]
932
+ )
933
+ self.downsample = downsample
934
+ if self.downsample is not None:
935
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer, spatial_dims=len(self.window_size))
936
+
937
+ def forward(self, x):
938
+ x_shape = x.size()
939
+ if len(x_shape) == 5:
940
+ b, c, d, h, w = x_shape
941
+ window_size, shift_size = get_window_size((d, h, w), self.window_size, self.shift_size)
942
+ x = rearrange(x, "b c d h w -> b d h w c")
943
+ dp = int(np.ceil(d / window_size[0])) * window_size[0]
944
+ hp = int(np.ceil(h / window_size[1])) * window_size[1]
945
+ wp = int(np.ceil(w / window_size[2])) * window_size[2]
946
+ attn_mask = compute_mask([dp, hp, wp], window_size, shift_size, x.device)
947
+ for blk in self.blocks:
948
+ x = blk(x, attn_mask)
949
+ x = x.view(b, d, h, w, -1)
950
+ if self.downsample is not None:
951
+ x = self.downsample(x)
952
+ x = rearrange(x, "b d h w c -> b c d h w")
953
+
954
+ elif len(x_shape) == 4:
955
+ b, c, h, w = x_shape
956
+ window_size, shift_size = get_window_size((h, w), self.window_size, self.shift_size)
957
+ x = rearrange(x, "b c h w -> b h w c")
958
+ hp = int(np.ceil(h / window_size[0])) * window_size[0]
959
+ wp = int(np.ceil(w / window_size[1])) * window_size[1]
960
+ attn_mask = compute_mask([hp, wp], window_size, shift_size, x.device)
961
+ for blk in self.blocks:
962
+ x = blk(x, attn_mask)
963
+ x = x.view(b, h, w, -1)
964
+ if self.downsample is not None:
965
+ x = self.downsample(x)
966
+ x = rearrange(x, "b h w c -> b c h w")
967
+ return x
968
+
969
+
970
+ class SwinTransformer(nn.Module):
971
+ """
972
+ Swin Transformer based on: "Liu et al.,
973
+ Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
974
+ <https://arxiv.org/abs/2103.14030>"
975
+ https://github.com/microsoft/Swin-Transformer
976
+ """
977
+
978
+ def __init__(
979
+ self,
980
+ in_chans: int,
981
+ embed_dim: int,
982
+ window_size: Sequence[int],
983
+ patch_size: Sequence[int],
984
+ depths: Sequence[int],
985
+ num_heads: Sequence[int],
986
+ mlp_ratio: float = 4.0,
987
+ qkv_bias: bool = True,
988
+ drop_rate: float = 0.0,
989
+ attn_drop_rate: float = 0.0,
990
+ drop_path_rate: float = 0.0,
991
+ norm_layer: Type[LayerNorm] = nn.LayerNorm, # type: ignore
992
+ patch_norm: bool = False,
993
+ use_checkpoint: bool = False,
994
+ spatial_dims: int = 3,
995
+ ) -> None:
996
+ """
997
+ Args:
998
+ in_chans: dimension of input channels.
999
+ embed_dim: number of linear projection output channels.
1000
+ window_size: local window size.
1001
+ patch_size: patch size.
1002
+ depths: number of layers in each stage.
1003
+ num_heads: number of attention heads.
1004
+ mlp_ratio: ratio of mlp hidden dim to embedding dim.
1005
+ qkv_bias: add a learnable bias to query, key, value.
1006
+ drop_rate: dropout rate.
1007
+ attn_drop_rate: attention dropout rate.
1008
+ drop_path_rate: stochastic depth rate.
1009
+ norm_layer: normalization layer.
1010
+ patch_norm: add normalization after patch embedding.
1011
+ use_checkpoint: use gradient checkpointing for reduced memory usage.
1012
+ spatial_dims: spatial dimension.
1013
+ """
1014
+
1015
+ super().__init__()
1016
+ self.num_layers = len(depths)
1017
+ self.embed_dim = embed_dim
1018
+ self.patch_norm = patch_norm
1019
+ self.window_size = window_size
1020
+ self.patch_size = patch_size
1021
+ self.patch_embed = PatchEmbed(
1022
+ patch_size=self.patch_size,
1023
+ in_chans=in_chans,
1024
+ embed_dim=embed_dim,
1025
+ norm_layer=norm_layer if self.patch_norm else None, # type: ignore
1026
+ spatial_dims=spatial_dims,
1027
+ )
1028
+ self.pos_drop = nn.Dropout(p=drop_rate)
1029
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
1030
+ self.layers1 = nn.ModuleList()
1031
+ self.layers2 = nn.ModuleList()
1032
+ self.layers3 = nn.ModuleList()
1033
+ self.layers4 = nn.ModuleList()
1034
+ for i_layer in range(self.num_layers):
1035
+ layer = BasicLayer(
1036
+ dim=int(embed_dim * 2**i_layer),
1037
+ depth=depths[i_layer],
1038
+ num_heads=num_heads[i_layer],
1039
+ window_size=self.window_size,
1040
+ drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
1041
+ mlp_ratio=mlp_ratio,
1042
+ qkv_bias=qkv_bias,
1043
+ drop=drop_rate,
1044
+ attn_drop=attn_drop_rate,
1045
+ norm_layer=norm_layer,
1046
+ downsample=PatchMerging,
1047
+ use_checkpoint=use_checkpoint,
1048
+ )
1049
+ if i_layer == 0:
1050
+ self.layers1.append(layer)
1051
+ elif i_layer == 1:
1052
+ self.layers2.append(layer)
1053
+ elif i_layer == 2:
1054
+ self.layers3.append(layer)
1055
+ elif i_layer == 3:
1056
+ self.layers4.append(layer)
1057
+ self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
1058
+
1059
+ def proj_out(self, x, normalize=False):
1060
+ if normalize:
1061
+ x_shape = x.size()
1062
+ if len(x_shape) == 5:
1063
+ n, ch, d, h, w = x_shape
1064
+ x = rearrange(x, "n c d h w -> n d h w c")
1065
+ x = F.layer_norm(x, [ch])
1066
+ x = rearrange(x, "n d h w c -> n c d h w")
1067
+ elif len(x_shape) == 4:
1068
+ n, ch, h, w = x_shape
1069
+ x = rearrange(x, "n c h w -> n h w c")
1070
+ x = F.layer_norm(x, [ch])
1071
+ x = rearrange(x, "n h w c -> n c h w")
1072
+ return x
1073
+
1074
+ def forward(self, x, normalize=True):
1075
+ x0 = self.patch_embed(x)
1076
+ x0 = self.pos_drop(x0)
1077
+ x0_out = self.proj_out(x0, normalize)
1078
+ x1 = self.layers1[0](x0.contiguous())
1079
+ x1_out = self.proj_out(x1, normalize)
1080
+ x2 = self.layers2[0](x1.contiguous())
1081
+ x2_out = self.proj_out(x2, normalize)
1082
+ x3 = self.layers3[0](x2.contiguous())
1083
+ x3_out = self.proj_out(x3, normalize)
1084
+ x4 = self.layers4[0](x3.contiguous())
1085
+ x4_out = self.proj_out(x4, normalize)
1086
+ return [x0_out, x1_out, x2_out, x3_out, x4_out]
1087
+
1088
+ if __name__ == '__main__':
1089
+ import os
1090
+ def get_parameter_number(model):
1091
+ total_num = sum(p.numel() for p in model.parameters())
1092
+ trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
1093
+ return {'Total': total_num, 'Trainable': trainable_num}
1094
+
1095
+ model = SwinUNETR(
1096
+ img_size=[288, 288, 96], # each of d, h, w of the real input must be larger than 32
1097
+ in_channels=3,
1098
+ feature_size=48,
1099
+ drop_rate=0.0,
1100
+ attn_drop_rate=0.0,
1101
+ dropout_path_rate=0.0,
1102
+ use_checkpoint=False,
1103
+ deep_supervision=True,
1104
+ return_skips=True,
1105
+ ).cuda()
1106
+
1107
+ if is_master():
1108
+ print(f"** UNET ** {get_parameter_number(model)['Total']/1e6}M parameters")
1109
+
1110
+ image = torch.rand((1, 3, 288, 288, 96)).cuda()
1111
+ skips, outs = model(image)
1112
+
1113
+ for s in skips:
1114
+ print(s.shape)
1115
+ for out in outs:
1116
+ print(out.shape)
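PatchMerging (defined above) halves every spatial axis and doubles the channel width: for 3D inputs, 8 neighbouring positions are concatenated into an 8*dim vector and reduced to 2*dim by the linear layer. A minimal shape check, assuming the class is importable as model.SwinUNETR.PatchMerging and the module's dependencies are installed:

    import torch
    from model.SwinUNETR import PatchMerging   # import path assumed from this repository layout

    merge = PatchMerging(dim=48, spatial_dims=3)
    x = torch.rand(1, 8, 8, 4, 48)             # channel-last (b, d, h, w, c), as expected by forward()
    y = merge(x)
    print(y.shape)                             # torch.Size([1, 4, 4, 2, 96]): spatial dims halved, channels doubled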
model/__init__.py ADDED
File without changes
model/base_bert.py ADDED
@@ -0,0 +1,26 @@
1
+ import torch.nn as nn
2
+ import torch
3
+
4
+ from transformers import BertModel, AutoTokenizer
5
+
6
+ class BaseBERT(nn.Module):
7
+ def __init__(self, basebert_checkpoint='bert-base-uncased'):
8
+ super().__init__()
9
+ self.tokenizer = AutoTokenizer.from_pretrained(basebert_checkpoint)
10
+ self.model = BertModel.from_pretrained(basebert_checkpoint)
11
+ self.modality_embed = nn.Embedding(4, 768)
12
+
13
+ def forward(self, text, modality):
14
+ encoded = self.tokenizer(
15
+ text,
16
+ truncation=True,
17
+ padding=True,
18
+ return_tensors='pt',
19
+ max_length=64,
20
+ ).to(device=torch.cuda.current_device())
21
+
22
+ text_feature = self.model(**encoded).last_hidden_state[:, 0, :]
23
+ modality_feature = self.modality_embed(modality)
24
+ text_feature += modality_feature
25
+
26
+ return text_feature
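BaseBERT tokenises a batch of text prompts, takes the [CLS] feature from BERT and adds a learned modality embedding to it. A hedged usage sketch (a CUDA device is required, since forward() moves the tokens to the current GPU; the import path is assumed from this repository layout):

    import torch
    from model.base_bert import BaseBERT

    text_encoder = BaseBERT('bert-base-uncased').cuda()
    prompts = ['liver', 'left kidney']
    modality = torch.tensor([0, 0]).cuda()     # indices into the 4-entry modality embedding table
    with torch.no_grad():
        feats = text_encoder(prompts, modality)
    print(feats.shape)                         # (2, 768): [CLS] feature plus modality embedding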
model/build_model.py ADDED
@@ -0,0 +1,103 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import time
4
+ import os
5
+ from torch.nn.parallel import DistributedDataParallel as DDP
6
+
7
+ import numpy as np
8
+
9
+ from .maskformer import Maskformer
10
+
11
+ from train.dist import is_master
12
+
13
+
14
+ def get_parameter_number(model):
15
+ total_num = sum(p.numel() for p in model.parameters())
16
+ trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
17
+ return {'Total': total_num, 'Trainable': trainable_num}
18
+
19
+
20
+ def build_maskformer(args, device, gpu_id):
21
+ model = Maskformer(args.vision_backbone, args.input_channels, args.crop_size, args.patch_size, args.deep_supervision)
22
+
23
+ model = model.to(device)
24
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
25
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu_id], find_unused_parameters=True)
26
+
27
+ def get_parameter_number(model):
28
+ total_num = sum(p.numel() for p in model.parameters())
29
+ trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
30
+ return {'Total': total_num, 'Trainable': trainable_num}
31
+
32
+ if is_master():
33
+ print(f"** MODEL ** {get_parameter_number(model)['Total']/1e6}M parameters")
34
+
35
+ return model
36
+
37
+
38
+ def load_checkpoint(checkpoint_file,
39
+ resume,
40
+ partial_load,
41
+ model,
42
+ device,
43
+ optimizer=None,
44
+ ):
45
+
46
+ if is_master():
47
+ print('** CHECKPOINT ** : Load checkpoint from %s' % (checkpoint_file))
48
+
49
+ checkpoint = torch.load(checkpoint_file, map_location=device)
50
+
51
+ # load part of the checkpoint
52
+ if partial_load:
53
+ model_dict = model.state_dict()
54
+ # check difference
55
+ unexpected_state_dict = [k for k in checkpoint['model_state_dict'].keys() if k not in model_dict.keys()]
56
+ missing_state_dict = [k for k in model_dict.keys() if k not in checkpoint['model_state_dict'].keys()]
57
+ unmatched_state_dict = [k for k,v in checkpoint['model_state_dict'].items() if k in model_dict.keys() and v.shape != model_dict[k].shape]
58
+ # load partial parameters
59
+ state_dict = {k:v for k,v in checkpoint['model_state_dict'].items() if k in model_dict.keys() and v.shape == model_dict[k].shape}
60
+ model_dict.update(state_dict)
61
+ model.load_state_dict(model_dict)
62
+ if is_master():
63
+ print('The following parameters are unexpected in SAT checkpoint:\n', unexpected_state_dict)
64
+ print('The following parameters are missing in SAT checkpoint:\n', missing_state_dict)
65
+ print('The following parameters have different shapes in SAT checkpoint:\n', unmatched_state_dict)
66
+ print('The following parameters are loaded in SAT:\n', state_dict.keys())
67
+ else:
68
+ model.load_state_dict(checkpoint['model_state_dict'])
69
+
70
+ # if resume, load optimizer and step
71
+ if resume:
72
+ try:
73
+ optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
74
+ except Exception:
75
+ print('Optimizer state dict does not match; skipping optimizer state loading')
76
+ pass
77
+ start_step = int(checkpoint['step']) + 1
78
+ print('Resume from step %d' % (start_step))
79
+ else:
80
+ start_step = 1
81
+
82
+ return model, optimizer, start_step
83
+
84
+
85
+ def inherit_knowledge_encoder(knowledge_encoder_checkpoint,
86
+ model,
87
+ device
88
+ ):
89
+ # inherit unet encoder and multiscale feature projection layer from knowledge encoder
90
+ checkpoint = torch.load(knowledge_encoder_checkpoint, map_location=device)
91
+
92
+ model_dict = model.state_dict()
93
+ visual_encoder_state_dict = {k.replace('atlas_tower', 'backbone'):v for k,v in checkpoint['model_state_dict'].items() if 'atlas_tower.encoder' in k} # encoder part
94
+ model_dict.update(visual_encoder_state_dict)
95
+ proj_state_dict = {k.replace('atlas_tower.', ''):v for k,v in checkpoint['model_state_dict'].items() if 'atlas_tower.projection_layer' in k} # projection layer part
96
+ model_dict.update(proj_state_dict)
97
+ model.load_state_dict(model_dict)
98
+
99
+ if is_master():
100
+ print('** CHECKPOINT ** : Inherit pretrained unet encoder from %s' % (knowledge_encoder_checkpoint))
101
+ print('The following parameters are loaded in SAT:\n', list(visual_encoder_state_dict.keys())+list(proj_state_dict.keys()))
102
+
103
+ return model
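A sketch of how load_checkpoint is typically called. This assumes the repository's distributed helpers behind is_master() are importable and initialised, and that the checkpoint file holds the 'model_state_dict', 'optimizer_state_dict' and 'step' keys used above; the path and the stand-in model are placeholders.

    import torch
    import torch.nn as nn
    from model.build_model import load_checkpoint

    device = torch.device('cuda', 0)
    model = nn.Conv3d(1, 8, kernel_size=3).to(device)   # stand-in; the real model comes from build_maskformer
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    model, optimizer, start_step = load_checkpoint(
        checkpoint_file='/path/to/checkpoint.pth',       # placeholder path
        resume=True,         # also restore the optimizer state and the step counter
        partial_load=True,   # keep only parameters whose names and shapes match
        model=model,
        device=device,
        optimizer=optimizer,
    )
    print('resuming from step', start_step)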
model/dynamic-network-architectures-main/.gitignore ADDED
@@ -0,0 +1,113 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ # Usually these files are written by a python script from a template
29
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *,cover
46
+ .hypothesis/
47
+
48
+ # Translations
49
+ *.mo
50
+ *.pot
51
+
52
+ # Django stuff:
53
+ *.log
54
+ local_settings.py
55
+
56
+ # Flask stuff:
57
+ instance/
58
+ .webassets-cache
59
+
60
+ # Scrapy stuff:
61
+ .scrapy
62
+
63
+ # Sphinx documentation
64
+ docs/_build/
65
+
66
+ # PyBuilder
67
+ target/
68
+
69
+ # IPython Notebook
70
+ .ipynb_checkpoints
71
+
72
+ # pyenv
73
+ .python-version
74
+
75
+ # celery beat schedule file
76
+ celerybeat-schedule
77
+
78
+ # dotenv
79
+ .env
80
+
81
+ # virtualenv
82
+ venv/
83
+ ENV/
84
+
85
+ # Spyder project settings
86
+ .spyderproject
87
+
88
+ # Rope project settings
89
+ .ropeproject
90
+
91
+ *.memmap
92
+ *.zip
93
+ *.npz
94
+ *.npy
95
+ *.jpg
96
+ *.jpeg
97
+ .idea
98
+ *.txt
99
+ .idea/*
100
+ *.nii.gz
101
+ *.nii
102
+ *.tif
103
+ *.bmp
104
+ *.pkl
105
+ *.xml
106
+ *.pkl
107
+ *.pdf
108
+ *.jpg
109
+ *.jpeg
110
+
111
+ *.model
112
+
113
+ cifar_lightning/mlruns*
model/dynamic-network-architectures-main/LICENCE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2022] [Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
model/dynamic-network-architectures-main/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # Dynamic Network Architectures
2
+
3
+ This repository contains several ResNet, U-Net and VGG architectures in PyTorch that can be dynamically adapted to different image dimensionalities (1D, 2D or 3D) and numbers of input channels.
4
+
5
+ ## Available models
6
+ ### ResNet
7
+ We implement the standard [ResNetD](https://arxiv.org/pdf/1812.01187.pdf) 18, 34, 50 and 152. Bottleneck implementations are also available for ResNets 50 and 152. In addition, adapted versions better suited to smaller image sizes such as CIFAR are provided.
8
+
9
+ All models additionally include regularization techniques like [Stochastic Depth](https://arxiv.org/pdf/1603.09382.pdf), [Squeeze & Excitation](https://arxiv.org/pdf/1709.01507.pdf) and [Final Layer Dropout](https://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf).
10
+
11
+ ### VGG
12
+ In contrast to the original [VGG](https://arxiv.org/pdf/1409.1556.pdf) implementation, we drop the final stack of fully-connected layers and replace it with additional convolutional layers followed by a single fully-connected layer. Adapted versions better suited to smaller image sizes such as CIFAR are also provided.
13
+
14
+ ### U-Net
15
+ For the [U-Net](https://arxiv.org/pdf/1505.04597.pdf), both a plain convolutional encoder and a residual encoder are available.
16
+
17
+ # Acknowledgements
18
+
19
+ <p align="left">
20
+ <img src="imgs/Logos/HI_Logo.png" width="150"> &nbsp;&nbsp;&nbsp;&nbsp;
21
+ <img src="imgs/Logos/DKFZ_Logo.png" width="500">
22
+ </p>
23
+
24
+ This Repository is developed and maintained by the Applied Computer Vision Lab (ACVL)
25
+ of [Helmholtz Imaging](https://www.helmholtz-imaging.de/).
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/PKG-INFO ADDED
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: dynamic_network_architectures
3
+ Version: 0.2
4
+ Summary: none
5
+ Author: Fabian Isensee
6
+ Author-email: f.isensee@dkfz.de
7
+ License: private
8
+ License-File: LICENCE
9
+ Requires-Dist: torch>=1.6.0a
10
+ Requires-Dist: numpy
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: license
14
+ Dynamic: license-file
15
+ Dynamic: requires-dist
16
+ Dynamic: summary
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,24 @@
1
+ LICENCE
2
+ README.md
3
+ setup.py
4
+ dynamic_network_architectures/__init__.py
5
+ dynamic_network_architectures.egg-info/PKG-INFO
6
+ dynamic_network_architectures.egg-info/SOURCES.txt
7
+ dynamic_network_architectures.egg-info/dependency_links.txt
8
+ dynamic_network_architectures.egg-info/not-zip-safe
9
+ dynamic_network_architectures.egg-info/requires.txt
10
+ dynamic_network_architectures.egg-info/top_level.txt
11
+ dynamic_network_architectures/architectures/__init__.py
12
+ dynamic_network_architectures/architectures/resnet.py
13
+ dynamic_network_architectures/architectures/unet.py
14
+ dynamic_network_architectures/architectures/vgg.py
15
+ dynamic_network_architectures/building_blocks/__init__.py
16
+ dynamic_network_architectures/building_blocks/helper.py
17
+ dynamic_network_architectures/building_blocks/plain_conv_encoder.py
18
+ dynamic_network_architectures/building_blocks/regularization.py
19
+ dynamic_network_architectures/building_blocks/residual.py
20
+ dynamic_network_architectures/building_blocks/residual_encoders.py
21
+ dynamic_network_architectures/building_blocks/simple_conv_blocks.py
22
+ dynamic_network_architectures/building_blocks/unet_decoder.py
23
+ dynamic_network_architectures/initialization/__init__.py
24
+ dynamic_network_architectures/initialization/weight_init.py
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
1
+
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/requires.txt ADDED
@@ -0,0 +1,2 @@
1
+ torch>=1.6.0a
2
+ numpy
model/dynamic-network-architectures-main/dynamic_network_architectures.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ dynamic_network_architectures
model/dynamic-network-architectures-main/dynamic_network_architectures/__init__.py ADDED
File without changes
model/dynamic-network-architectures-main/dynamic_network_architectures/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (256 Bytes). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__init__.py ADDED
File without changes
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (270 Bytes). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/__pycache__/unet.cpython-310.pyc ADDED
Binary file (7.52 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/resnet.py ADDED
@@ -0,0 +1,236 @@
1
+ import torch
2
+ from dynamic_network_architectures.building_blocks.residual_encoders import ResidualEncoder, BottleneckD, BasicBlockD
3
+ from dynamic_network_architectures.building_blocks.helper import get_matching_pool_op, get_default_network_config
4
+ from dynamic_network_architectures.building_blocks.simple_conv_blocks import ConvDropoutNormReLU
5
+ from torch import nn
6
+
7
+ _ResNet_CONFIGS = {
8
+ '18': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (2, 2, 2, 2), 'strides': (1, 2, 2, 2),
9
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': True, 'stem_channels': None},
10
+ '34': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (3, 4, 6, 3), 'strides': (1, 2, 2, 2),
11
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': True, 'stem_channels': None},
12
+ '50': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (4, 6, 10, 5), 'strides': (1, 2, 2, 2),
13
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': True, 'stem_channels': None},
14
+ '152': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (4, 13, 55, 4), 'strides': (1, 2, 2, 2),
15
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': True, 'stem_channels': None},
16
+ '50_bn': {'features_per_stage': (256, 512, 1024, 2048), 'n_blocks_per_stage': (3, 4, 6, 3), 'strides': (1, 2, 2, 2),
17
+ 'block': BottleneckD, 'bottleneck_channels': (64, 128, 256, 512), 'disable_default_stem': True,
18
+ 'stem_channels': 64},
19
+ '152_bn': {'features_per_stage': (256, 512, 1024, 2048), 'n_blocks_per_stage': (3, 8, 36, 3),
20
+ 'strides': (1, 2, 2, 2),
21
+ 'block': BottleneckD, 'bottleneck_channels': (64, 128, 256, 512), 'disable_default_stem': True,
22
+ 'stem_channels': 64},
23
+ '18_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (2, 2, 2, 2), 'strides': (1, 2, 2, 2),
24
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': False,
25
+ 'stem_channels': None},
26
+ '34_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (3, 4, 6, 3), 'strides': (1, 2, 2, 2),
27
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': False,
28
+ 'stem_channels': None},
29
+ '50_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (4, 6, 10, 5),
30
+ 'strides': (1, 2, 2, 2),
31
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': False,
32
+ 'stem_channels': None},
33
+ '152_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_blocks_per_stage': (4, 13, 55, 4),
34
+ 'strides': (1, 2, 2, 2),
35
+ 'block': BasicBlockD, 'bottleneck_channels': None, 'disable_default_stem': False,
36
+ 'stem_channels': None},
37
+ '50_cifar_bn': {'features_per_stage': (256, 512, 1024, 2048), 'n_blocks_per_stage': (3, 4, 6, 3),
38
+ 'strides': (1, 2, 2, 2),
39
+ 'block': BottleneckD, 'bottleneck_channels': (64, 128, 256, 512), 'disable_default_stem': False,
40
+ 'stem_channels': 64},
41
+ '152_cifar_bn': {'features_per_stage': (256, 512, 1024, 2048), 'n_blocks_per_stage': (3, 8, 36, 3),
42
+ 'strides': (1, 2, 2, 2),
43
+ 'block': BottleneckD, 'bottleneck_channels': (64, 128, 256, 512), 'disable_default_stem': False,
44
+ 'stem_channels': 64},
45
+ }
46
+
47
+
48
+ class ResNetD(nn.Module):
49
+ def __init__(self, n_classes: int, n_input_channel: int = 3, config='18', input_dimension=2,
50
+ final_layer_dropout=0.0, stochastic_depth_p=0.0, squeeze_excitation=False,
51
+ squeeze_excitation_rd_ratio=1./16):
52
+ """
53
+ Implements ResNetD (https://arxiv.org/pdf/1812.01187.pdf).
54
+ Args:
55
+ n_classes: Number of classes
56
+ n_input_channel: Number of input channels (e.g. 3 for RGB)
57
+ config: Configuration of the ResNet
58
+ input_dimension: Number of dimensions of the data (1, 2 or 3)
59
+ final_layer_dropout: Probability of dropout before the final classifier
60
+ stochastic_depth_p: Stochastic Depth probability
61
+ squeeze_excitation: Whether Squeeze and Excitation should be applied
62
+ squeeze_excitation_rd_ratio: Squeeze and Excitation Reduction Ratio
63
+ Returns:
64
+ ResNet Model
65
+ """
66
+ super().__init__()
67
+ self.input_channels = n_input_channel
68
+ self.cfg = _ResNet_CONFIGS[config]
69
+ self.ops = get_default_network_config(dimension=input_dimension)
70
+ self.final_layer_dropout_p = final_layer_dropout
71
+
72
+ if self.cfg['disable_default_stem']:
73
+ stem_features = self.cfg['stem_channels'] if self.cfg['stem_channels'] is not None else \
74
+ self.cfg['features_per_stage'][0]
75
+ self.stem = self._build_imagenet_stem_D(stem_features)
76
+ encoder_input_features = stem_features
77
+ else:
78
+ encoder_input_features = n_input_channel
79
+ self.stem = None
80
+
81
+ self.encoder = ResidualEncoder(encoder_input_features, n_stages=len(self.cfg['features_per_stage']),
82
+ features_per_stage=self.cfg['features_per_stage'], conv_op=self.ops['conv_op'],
83
+ kernel_sizes=3, strides=self.cfg['strides'],
84
+ n_blocks_per_stage=self.cfg['n_blocks_per_stage'], conv_bias=False,
85
+ norm_op=self.ops['norm_op'], norm_op_kwargs=None, dropout_op=None,
86
+ dropout_op_kwargs=None, nonlin=nn.ReLU,
87
+ nonlin_kwargs={'inplace': True}, block=self.cfg['block'],
88
+ bottleneck_channels=self.cfg['bottleneck_channels'], return_skips=False,
89
+ disable_default_stem=self.cfg['disable_default_stem'],
90
+ stem_channels=self.cfg['stem_channels'],
91
+ stochastic_depth_p=stochastic_depth_p,
92
+ squeeze_excitation=squeeze_excitation,
93
+ squeeze_excitation_reduction_ratio=squeeze_excitation_rd_ratio)
94
+
95
+ self.gap = get_matching_pool_op(conv_op=self.ops['conv_op'], adaptive=True, pool_type='avg')(1)
96
+ self.classifier = nn.Linear(self.cfg['features_per_stage'][-1], n_classes, True)
97
+ self.final_layer_dropout = self.ops['dropout_op'](p=self.final_layer_dropout_p)
98
+
99
+ def forward(self, x):
100
+ if self.stem is not None:
101
+ x = self.stem(x)
102
+ x = self.encoder(x)
103
+ x = self.gap(x)
104
+ x = self.final_layer_dropout(x).squeeze()
105
+
106
+ return self.classifier(x)
107
+
108
+ def _build_imagenet_stem_D(self, stem_features):
109
+ """
110
+ https://arxiv.org/pdf/1812.01187.pdf
111
+
112
+ use 3 3x3(x3) convs instead of one 7x7. Stride is located in first conv.
113
+
114
+ Fig2 b) describes this
115
+ :return:
116
+ """
117
+ c1 = ConvDropoutNormReLU(self.ops['conv_op'], self.input_channels, stem_features, 3, 2, False,
118
+ self.ops['norm_op'], None, None, None, nn.ReLU, {'inplace': True})
119
+ c2 = ConvDropoutNormReLU(self.ops['conv_op'], stem_features, stem_features, 3, 1, False,
120
+ self.ops['norm_op'], None, None, None, nn.ReLU, {'inplace': True})
121
+ c3 = ConvDropoutNormReLU(self.ops['conv_op'], stem_features, stem_features, 3, 1, False,
122
+ self.ops['norm_op'], None, None, None, nn.ReLU, {'inplace': True})
123
+ pl = get_matching_pool_op(conv_op=self.ops['conv_op'], adaptive=False, pool_type='max')(2)
124
+ stem = nn.Sequential(c1, c2, c3, pl)
125
+ return stem
126
+
127
+
128
+ class ResNet18_CIFAR(ResNetD):
129
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
130
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
131
+ squeeze_excitation_rd_ratio: float = 1./16):
132
+ super().__init__(n_classes, n_input_channels, config='18_cifar', input_dimension=input_dimension,
133
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
134
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
135
+
136
+ class ResNet34_CIFAR(ResNetD):
137
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
138
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
139
+ squeeze_excitation_rd_ratio: float = 1./16):
140
+ super().__init__(n_classes, n_input_channels, config='34_cifar', input_dimension=input_dimension,
141
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
142
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
143
+
144
+ class ResNet50_CIFAR(ResNetD):
145
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
146
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
147
+ squeeze_excitation_rd_ratio: float = 1./16):
148
+ super().__init__(n_classes, n_input_channels, config='50_cifar', input_dimension=input_dimension,
149
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
150
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
151
+
152
+ class ResNet152_CIFAR(ResNetD):
153
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
154
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
155
+ squeeze_excitation_rd_ratio: float = 1./16):
156
+ super().__init__(n_classes, n_input_channels, config='152_cifar', input_dimension=input_dimension,
157
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
158
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
159
+
160
+ class ResNet50bn_CIFAR(ResNetD):
161
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
162
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
163
+ squeeze_excitation_rd_ratio: float = 1./16):
164
+ super().__init__(n_classes, n_input_channels, config='50_cifar_bn', input_dimension=input_dimension,
165
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
166
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
167
+
168
+ class ResNet152bn_CIFAR(ResNetD):
169
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
170
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
171
+ squeeze_excitation_rd_ratio: float = 1./16):
172
+ super().__init__(n_classes, n_input_channels, config='152_cifar_bn', input_dimension=input_dimension,
173
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
174
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
175
+
176
+ class ResNet18(ResNetD):
177
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
178
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
179
+ squeeze_excitation_rd_ratio: float = 1./16):
180
+ super().__init__(n_classes, n_input_channels, config='18', input_dimension=input_dimension,
181
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
182
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
183
+
184
+ class ResNet34(ResNetD):
185
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
186
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
187
+ squeeze_excitation_rd_ratio: float = 1./16):
188
+ super().__init__(n_classes, n_input_channels, config='34', input_dimension=input_dimension,
189
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
190
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
191
+
192
+ class ResNet50(ResNetD):
193
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
194
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
195
+ squeeze_excitation_rd_ratio: float = 1./16):
196
+ super().__init__(n_classes, n_input_channels, config='50', input_dimension=input_dimension,
197
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
198
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
199
+
200
+ class ResNet152(ResNetD):
201
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
202
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
203
+ squeeze_excitation_rd_ratio: float = 1./16):
204
+ super().__init__(n_classes, n_input_channels, config='152', input_dimension=input_dimension,
205
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
206
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
207
+
208
+ class ResNet50bn(ResNetD):
209
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
210
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
211
+ squeeze_excitation_rd_ratio: float = 1./16):
212
+ super().__init__(n_classes, n_input_channels, config='50_bn', input_dimension=input_dimension,
213
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
214
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
215
+
216
+ class ResNet152bn(ResNetD):
217
+ def __init__(self, n_classes: int, n_input_channels: int = 3, input_dimension: int = 2,
218
+ final_layer_dropout: float = 0.0, stochastic_depth_p: float = 0.0, squeeze_excitation: bool = False,
219
+ squeeze_excitation_rd_ratio: float = 1./16):
220
+ super().__init__(n_classes, n_input_channels, config='152_bn', input_dimension=input_dimension,
221
+ final_layer_dropout=final_layer_dropout, stochastic_depth_p=stochastic_depth_p,
222
+ squeeze_excitation=squeeze_excitation, squeeze_excitation_rd_ratio=squeeze_excitation_rd_ratio)
223
+
224
+
225
+ if __name__ == '__main__':
226
+ data = torch.rand((1, 3, 224, 224))
227
+
228
+ model = ResNet50bn(10, 3)
229
+ import hiddenlayer as hl
230
+
231
+ g = hl.build_graph(model, data,
232
+ transforms=None)
233
+ g.save("network_architecture.pdf")
234
+ del g
235
+
236
+ #print(model.compute_conv_feature_map_size((32, 32)))
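The classes above adapt to other dimensionalities through input_dimension. A minimal sketch for a 3D, single-channel input, assuming the bundled dynamic_network_architectures package (including the building_blocks helpers not shown here) is installed:

    import torch
    from dynamic_network_architectures.architectures.resnet import ResNet18

    model = ResNet18(n_classes=2, n_input_channels=1, input_dimension=3)
    x = torch.rand(2, 1, 64, 64, 64)   # (batch, channels, d, h, w)
    logits = model(x)
    print(logits.shape)                # (2, 2)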
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/unet.py ADDED
@@ -0,0 +1,220 @@
1
+ from typing import Union, Type, List, Tuple
2
+
3
+ import torch
4
+ from dynamic_network_architectures.building_blocks.residual_encoders import ResidualEncoder
5
+ from dynamic_network_architectures.building_blocks.residual import BasicBlockD, BottleneckD
6
+ from torch import nn
7
+ from torch.nn.modules.conv import _ConvNd
8
+ from torch.nn.modules.dropout import _DropoutNd
9
+
10
+ from dynamic_network_architectures.building_blocks.plain_conv_encoder import PlainConvEncoder
11
+ from dynamic_network_architectures.building_blocks.unet_decoder import UNetDecoder, UNetDecoder_Seg
12
+ from dynamic_network_architectures.building_blocks.helper import convert_conv_op_to_dim
13
+
14
+
15
+ class PlainConvUNet(nn.Module):
16
+ def __init__(self,
17
+ input_channels: int,
18
+ n_stages: int,
19
+ features_per_stage: Union[int, List[int], Tuple[int, ...]],
20
+ conv_op: Type[_ConvNd],
21
+ kernel_sizes: Union[int, List[int], Tuple[int, ...]],
22
+ strides: Union[int, List[int], Tuple[int, ...]],
23
+ n_conv_per_stage: Union[int, List[int], Tuple[int, ...]],
24
+ n_conv_per_stage_decoder: Union[int, Tuple[int, ...], List[int]],
25
+ conv_bias: bool = False,
26
+ norm_op: Union[None, Type[nn.Module]] = None,
27
+ norm_op_kwargs: dict = None,
28
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
29
+ dropout_op_kwargs: dict = None,
30
+ nonlin: Union[None, Type[torch.nn.Module]] = None, # activation
31
+ nonlin_kwargs: dict = None,
32
+ deep_supervision: bool = False,
33
+ nonlin_first: bool = False
34
+ ):
35
+ """
36
+ nonlin_first: if True you get conv -> nonlin -> norm. Else it's conv -> norm -> nonlin
37
+ """
38
+ super().__init__()
39
+ if isinstance(n_conv_per_stage, int):
40
+ n_conv_per_stage = [n_conv_per_stage] * n_stages
41
+ if isinstance(n_conv_per_stage_decoder, int):
42
+ n_conv_per_stage_decoder = [n_conv_per_stage_decoder] * (n_stages - 1)
43
+ assert len(n_conv_per_stage) == n_stages, "n_conv_per_stage must have as many entries as we have " \
44
+ f"resolution stages. here: {n_stages}. " \
45
+ f"n_conv_per_stage: {n_conv_per_stage}"
46
+ assert len(n_conv_per_stage_decoder) == (n_stages - 1), "n_conv_per_stage_decoder must have one entry fewer " \
47
+ f"than the number of resolution stages. here: {n_stages} " \
48
+ f"stages, so it should have {n_stages - 1} entries. " \
49
+ f"n_conv_per_stage_decoder: {n_conv_per_stage_decoder}"
50
+ self.encoder = PlainConvEncoder(input_channels, n_stages, features_per_stage, conv_op, kernel_sizes, strides,
51
+ n_conv_per_stage, conv_bias, norm_op, norm_op_kwargs, dropout_op,
52
+ dropout_op_kwargs, nonlin, nonlin_kwargs, return_skips=True,
53
+ nonlin_first=nonlin_first)
54
+
55
+ self.decoder = UNetDecoder(self.encoder, n_conv_per_stage_decoder, deep_supervision,
56
+ nonlin_first=nonlin_first)
57
+
58
+ def forward(self, x):
59
+ skips = self.encoder(x) # [2, 32, 256, 256, 96] ... [2, 768, 8, 8, 3]
60
+ outs = self.decoder(skips) # [2, 32, 256, 256, 96] ... [2, 512, 16, 16, 6]
61
+ return skips, outs # latent_embeddings(a list of multiscale features), perpixel_embeddings(a list of decoder outputs)
62
+
63
+ def compute_conv_feature_map_size(self, input_size):
64
+ assert len(input_size) == convert_conv_op_to_dim(self.encoder.conv_op), "just give the image size without color/feature channels or " \
65
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
66
+ "Give input_size=(x, y(, z))!"
67
+ return self.encoder.compute_conv_feature_map_size(input_size) + self.decoder.compute_conv_feature_map_size(input_size)
68
+
69
+
70
+ class PlainConvUNet_Seg(nn.Module):
71
+ def __init__(self,
72
+ input_channels: int,
73
+ n_stages: int,
74
+ features_per_stage: Union[int, List[int], Tuple[int, ...]],
75
+ conv_op: Type[_ConvNd],
76
+ kernel_sizes: Union[int, List[int], Tuple[int, ...]],
77
+ strides: Union[int, List[int], Tuple[int, ...]],
78
+ n_conv_per_stage: Union[int, List[int], Tuple[int, ...]],
79
+ num_classes: int,
80
+ n_conv_per_stage_decoder: Union[int, Tuple[int, ...], List[int]],
81
+ conv_bias: bool = False,
82
+ norm_op: Union[None, Type[nn.Module]] = None,
83
+ norm_op_kwargs: dict = None,
84
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
85
+ dropout_op_kwargs: dict = None,
86
+ nonlin: Union[None, Type[torch.nn.Module]] = None, # activation
87
+ nonlin_kwargs: dict = None,
88
+ deep_supervision: bool = False,
89
+ nonlin_first: bool = False
90
+ ):
91
+ """
92
+ nonlin_first: if True you get conv -> nonlin -> norm. Else it's conv -> norm -> nonlin
93
+ """
94
+ super().__init__()
95
+ if isinstance(n_conv_per_stage, int):
96
+ n_conv_per_stage = [n_conv_per_stage] * n_stages
97
+ if isinstance(n_conv_per_stage_decoder, int):
98
+ n_conv_per_stage_decoder = [n_conv_per_stage_decoder] * (n_stages - 1)
99
+ assert len(n_conv_per_stage) == n_stages, "n_conv_per_stage must have as many entries as we have " \
100
+ f"resolution stages. here: {n_stages}. " \
101
+ f"n_conv_per_stage: {n_conv_per_stage}"
102
+ assert len(n_conv_per_stage_decoder) == (n_stages - 1), "n_conv_per_stage_decoder must have one entry fewer " \
103
+ f"than the number of resolution stages. here: {n_stages} " \
104
+ f"stages, so it should have {n_stages - 1} entries. " \
105
+ f"n_conv_per_stage_decoder: {n_conv_per_stage_decoder}"
106
+ self.encoder = PlainConvEncoder(input_channels, n_stages, features_per_stage, conv_op, kernel_sizes, strides,
107
+ n_conv_per_stage, conv_bias, norm_op, norm_op_kwargs, dropout_op,
108
+ dropout_op_kwargs, nonlin, nonlin_kwargs, return_skips=True,
109
+ nonlin_first=nonlin_first)
110
+ self.decoder = UNetDecoder_Seg(self.encoder, num_classes, n_conv_per_stage_decoder, deep_supervision,
111
+ nonlin_first=nonlin_first)
112
+
113
+ def forward(self, x):
114
+ skips = self.encoder(x) # [2, 32, 256, 256, 96] ... [2, 768, 8, 8, 3]
115
+ out = self.decoder(skips) # [2, num_class, 256, 256, 96]
116
+ return out
117
+
118
+ def compute_conv_feature_map_size(self, input_size):
119
+ assert len(input_size) == convert_conv_op_to_dim(self.encoder.conv_op), "just give the image size without color/feature channels or " \
120
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
121
+ "Give input_size=(x, y(, z))!"
122
+ return self.encoder.compute_conv_feature_map_size(input_size) + self.decoder.compute_conv_feature_map_size(input_size)
123
+
124
+
125
+ class ResidualEncoderUNet(nn.Module):
126
+ def __init__(self,
127
+ input_channels: int,
128
+ n_stages: int,
129
+ features_per_stage: Union[int, List[int], Tuple[int, ...]],
130
+ conv_op: Type[_ConvNd],
131
+ kernel_sizes: Union[int, List[int], Tuple[int, ...]],
132
+ strides: Union[int, List[int], Tuple[int, ...]],
133
+ n_blocks_per_stage: Union[int, List[int], Tuple[int, ...]],
134
+ n_conv_per_stage_decoder: Union[int, Tuple[int, ...], List[int]],
135
+ conv_bias: bool = False,
136
+ norm_op: Union[None, Type[nn.Module]] = None,
137
+ norm_op_kwargs: dict = None,
138
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
139
+ dropout_op_kwargs: dict = None,
140
+ nonlin: Union[None, Type[torch.nn.Module]] = None,
141
+ nonlin_kwargs: dict = None,
142
+ deep_supervision: bool = False,
143
+ block: Union[Type[BasicBlockD], Type[BottleneckD]] = BasicBlockD,
144
+ bottleneck_channels: Union[int, List[int], Tuple[int, ...]] = None,
145
+ stem_channels: int = None
146
+ ):
147
+ super().__init__()
148
+ if isinstance(n_blocks_per_stage, int):
149
+ n_blocks_per_stage = [n_blocks_per_stage] * n_stages
150
+ if isinstance(n_conv_per_stage_decoder, int):
151
+ n_conv_per_stage_decoder = [n_conv_per_stage_decoder] * (n_stages - 1)
152
+ assert len(n_blocks_per_stage) == n_stages, "n_blocks_per_stage must have as many entries as we have " \
153
+ f"resolution stages. here: {n_stages}. " \
154
+ f"n_blocks_per_stage: {n_blocks_per_stage}"
155
+ assert len(n_conv_per_stage_decoder) == (n_stages - 1), "n_conv_per_stage_decoder must have one entry fewer " \
156
+ f"as we have resolution stages. here: {n_stages} " \
157
+ f"stages, so it should have {n_stages - 1} entries. " \
158
+ f"n_conv_per_stage_decoder: {n_conv_per_stage_decoder}"
159
+ self.encoder = ResidualEncoder(input_channels, n_stages, features_per_stage, conv_op, kernel_sizes, strides,
160
+ n_blocks_per_stage, conv_bias, norm_op, norm_op_kwargs, dropout_op,
161
+ dropout_op_kwargs, nonlin, nonlin_kwargs, block, bottleneck_channels,
162
+ return_skips=True, disable_default_stem=False, stem_channels=stem_channels)
163
+
164
+ self.decoder = UNetDecoder(self.encoder, n_conv_per_stage_decoder, deep_supervision)
165
+
166
+ def forward(self, x):
167
+ skips = self.encoder(x) # [2, 32, 256, 256, 96] ... [2, 768, 8, 8, 3]
168
+ outs = self.decoder(skips) # [2, 32, 256, 256, 96] ... [2, 512, 16, 16, 6]
169
+ return skips, outs # latent_embeddings(a list of multiscale features), perpixel_embeddings(a list of decoder outputs)
170
+
171
+ def compute_conv_feature_map_size(self, input_size):
172
+ assert len(input_size) == convert_conv_op_to_dim(self.encoder.conv_op), "just give the image size without color/feature channels or " \
173
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
174
+ "Give input_size=(x, y(, z))!"
175
+ return self.encoder.compute_conv_feature_map_size(input_size) + self.decoder.compute_conv_feature_map_size(input_size)
176
+
177
+
178
+ if __name__ == '__main__':
179
+ import sys
180
+ sys.path.append('/remote-home/zihengzhao/Knowledge-Enhanced-Medical-Segmentation/medical-universal-segmentation/model/dynamic-network-architectures-main')
181
+
182
+ data = torch.rand((2, 3, 256, 256, 96)).cuda()
183
+
184
+ model = PlainConvUNet(3, 6, (32, 64, 128, 256, 512, 768), nn.Conv3d, 3, (1, 2, 2, 2, 2, 2), (2, 2, 2, 2, 2, 2), 4,
185
+ (2, 2, 2, 2, 2), False, nn.BatchNorm3d, None, None, None, nn.ReLU, deep_supervision=True).cuda()
186
+
187
+ dec_outs, enc_outs = model(data)
188
+ print('DEC')
189
+ for i in dec_outs:
190
+ print(i.shape) # (2, 4, 256, 256, 96)
191
+ print('ENC')
192
+ for i in enc_outs:
193
+ print(i.shape) # ()
194
+ exit()
195
+
196
+
197
+ if False:
198
+ import hiddenlayer as hl
199
+
200
+ g = hl.build_graph(model, data,
201
+ transforms=None)
202
+ g.save("network_architecture.pdf")
203
+ del g
204
+
205
+ print(model.compute_conv_feature_map_size(data.shape[2:]))
206
+
207
+ data = torch.rand((1, 4, 512, 512))
208
+
209
+ model = PlainConvUNet(4, 8, (32, 64, 125, 256, 512, 512, 512, 512), nn.Conv2d, 3, (1, 2, 2, 2, 2, 2, 2, 2), (2, 2, 2, 2, 2, 2, 2, 2), 4,
210
+ (2, 2, 2, 2, 2, 2, 2), False, nn.BatchNorm2d, None, None, None, nn.ReLU, deep_supervision=True)
211
+
212
+ if False:
213
+ import hiddenlayer as hl
214
+
215
+ g = hl.build_graph(model, data,
216
+ transforms=None)
217
+ g.save("network_architecture.pdf")
218
+ del g
219
+
220
+ print(model.compute_conv_feature_map_size(data.shape[2:]))
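A minimal usage sketch for the segmentation variant added above (not part of the commit). It assumes the dynamic-network-architectures-main directory is on sys.path so that dynamic_network_architectures.architectures.unet exposes PlainConvUNet_Seg; all hyperparameter values below are illustrative, not the repository's training configuration.

import torch
from torch import nn
from dynamic_network_architectures.architectures.unet import PlainConvUNet_Seg

# 4-stage 3D U-Net producing per-voxel class logits
model = PlainConvUNet_Seg(
    input_channels=1,
    n_stages=4,
    features_per_stage=(32, 64, 128, 256),
    conv_op=nn.Conv3d,
    kernel_sizes=3,
    strides=(1, 2, 2, 2),
    n_conv_per_stage=2,
    num_classes=3,
    n_conv_per_stage_decoder=2,
    norm_op=nn.InstanceNorm3d,
    norm_op_kwargs={'affine': True},
    nonlin=nn.ReLU,
    nonlin_kwargs={'inplace': True},
)
model.eval()

x = torch.rand(1, 1, 64, 64, 32)                  # (batch, channel, x, y, z)
with torch.no_grad():
    logits = model(x)                             # (1, num_classes, 64, 64, 32), per the forward() comment above
print(logits.shape)
# spatial size only -- no batch/channel dims, as the assert in compute_conv_feature_map_size demands
print(model.compute_conv_feature_map_size((64, 64, 32)))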
model/dynamic-network-architectures-main/dynamic_network_architectures/architectures/vgg.py ADDED
@@ -0,0 +1,85 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from dynamic_network_architectures.building_blocks.plain_conv_encoder import PlainConvEncoder
5
+ from dynamic_network_architectures.building_blocks.helper import get_matching_pool_op, get_default_network_config
6
+
7
+ _VGG_CONFIGS = {
8
+ '16': {'features_per_stage': (64, 128, 256, 512, 512, 512), 'n_conv_per_stage': (2, 2, 2, 3, 3, 3),
9
+ 'strides': (1, 2, 2, 2, 2, 2)},
10
+ '19': {'features_per_stage': (64, 128, 256, 512, 512, 512), 'n_conv_per_stage': (2, 2, 3, 3, 4, 4),
11
+ 'strides': (1, 2, 2, 2, 2, 2)},
12
+ '16_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_conv_per_stage': (2, 3, 5, 5), 'strides': (1, 2, 2, 2)},
13
+ '19_cifar': {'features_per_stage': (64, 128, 256, 512), 'n_conv_per_stage': (3, 4, 5, 6), 'strides': (1, 2, 2, 2)},
14
+ }
15
+
16
+ _VGG_OPS = {
17
+ 1: {'conv_op': nn.Conv1d, 'norm_op': nn.BatchNorm1d},
18
+ 2: {'conv_op': nn.Conv2d, 'norm_op': nn.BatchNorm2d},
19
+ 3: {'conv_op': nn.Conv3d, 'norm_op': nn.BatchNorm3d},
20
+ }
21
+
22
+
23
+ class VGG(nn.Module):
24
+ def __init__(self, n_classes: int, n_input_channel: int = 3, config='16', input_dimension=2):
25
+ """
26
+ This is not 1:1 VGG because it does not have the bloated fully connected layers at the end. Since these were
27
+ counted towards the XX layers as well, we increase the number of convolutional layers so that we have the
28
+ desired number of conv layers in total
29
+
30
+ We also use batchnorm
31
+ """
32
+ super().__init__()
33
+ cfg = _VGG_CONFIGS[config]
34
+ ops = get_default_network_config(dimension=input_dimension)
35
+ self.encoder = PlainConvEncoder(
36
+ n_input_channel, n_stages=len(cfg['features_per_stage']), features_per_stage=cfg['features_per_stage'],
37
+ conv_op=ops['conv_op'],
38
+ kernel_sizes=3, strides=cfg['strides'], n_conv_per_stage=cfg['n_conv_per_stage'], conv_bias=False,
39
+ norm_op=ops['norm_op'], norm_op_kwargs=None, dropout_op=None, dropout_op_kwargs=None, nonlin=nn.ReLU,
40
+ nonlin_kwargs={'inplace': True}, return_skips=False
41
+ )
42
+ self.gap = get_matching_pool_op(conv_op=ops['conv_op'], adaptive=True, pool_type='avg')(1)
43
+ self.classifier = nn.Linear(cfg['features_per_stage'][-1], n_classes, True)
44
+
45
+ def forward(self, x):
46
+ x = self.encoder(x)
47
+ x = self.gap(x).squeeze()
48
+ return self.classifier(x)
49
+
50
+ def compute_conv_feature_map_size(self, input_size):
51
+ return self.encoder.compute_conv_feature_map_size(input_size)
52
+
53
+
54
+ class VGG16(VGG):
55
+ def __init__(self, n_classes: int, n_input_channel: int = 3, input_dimension: int = 2):
56
+ super().__init__(n_classes, n_input_channel, config='16', input_dimension=input_dimension)
57
+
58
+
59
+ class VGG19(VGG):
60
+ def __init__(self, n_classes: int, n_input_channel: int = 3, input_dimension: int = 2):
61
+ super().__init__(n_classes, n_input_channel, config='19', input_dimension=input_dimension)
62
+
63
+
64
+ class VGG16_cifar(VGG):
65
+ def __init__(self, n_classes: int, n_input_channel: int = 3, input_dimension: int = 2):
66
+ super().__init__(n_classes, n_input_channel, config='16_cifar', input_dimension=input_dimension)
67
+
68
+
69
+ class VGG19_cifar(VGG):
70
+ def __init__(self, n_classes: int, n_input_channel: int = 3, input_dimension: int = 2):
71
+ super().__init__(n_classes, n_input_channel, config='19_cifar', input_dimension=input_dimension)
72
+
73
+
74
+ if __name__ == '__main__':
75
+ data = torch.rand((1, 3, 32, 32))
76
+
77
+ model = VGG19_cifar(10, 3)
78
+ import hiddenlayer as hl
79
+
80
+ g = hl.build_graph(model, data,
81
+ transforms=None)
82
+ g.save("network_architecture.pdf")
83
+ del g
84
+
85
+ print(model.compute_conv_feature_map_size((32, 32)))
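A brief usage sketch for the CIFAR-sized classifier variants defined above (not part of the commit); the class names come straight from this file, while the batch shape and class count are illustrative.

import torch
from dynamic_network_architectures.architectures.vgg import VGG16_cifar

model = VGG16_cifar(n_classes=10, n_input_channel=3, input_dimension=2)
model.eval()

x = torch.rand(4, 3, 32, 32)                      # CIFAR-10 sized batch
with torch.no_grad():
    logits = model(x)                             # (4, 10): GAP over the last stage, then the linear head
print(logits.shape)
print(model.compute_conv_feature_map_size((32, 32)))   # conv feature-map elements per sample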
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__init__.py ADDED
File without changes
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (272 Bytes). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/helper.cpython-310.pyc ADDED
Binary file (5.93 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/plain_conv_encoder.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/regularization.cpython-310.pyc ADDED
Binary file (4.39 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/residual.cpython-310.pyc ADDED
Binary file (14.2 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/residual_encoders.cpython-310.pyc ADDED
Binary file (6.39 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/simple_conv_blocks.cpython-310.pyc ADDED
Binary file (5.85 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/__pycache__/unet_decoder.cpython-310.pyc ADDED
Binary file (6.85 kB). View file
 
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/helper.py ADDED
@@ -0,0 +1,242 @@
1
+ from typing import Type
2
+ import numpy as np
3
+ import torch.nn
4
+ from torch import nn
5
+ from torch.nn.modules.batchnorm import _BatchNorm
6
+ from torch.nn.modules.conv import _ConvNd, _ConvTransposeNd
7
+ from torch.nn.modules.dropout import _DropoutNd
8
+ from torch.nn.modules.instancenorm import _InstanceNorm
9
+
10
+
11
+ def convert_dim_to_conv_op(dimension: int) -> Type[_ConvNd]:
12
+ """
13
+ :param dimension: 1, 2 or 3
14
+ :return: conv Class of corresponding dimension
15
+ """
16
+ if dimension == 1:
17
+ return nn.Conv1d
18
+ elif dimension == 2:
19
+ return nn.Conv2d
20
+ elif dimension == 3:
21
+ return nn.Conv3d
22
+ else:
23
+ raise ValueError("Unknown dimension. Only 1, 2 and 3 are supported")
24
+
25
+
26
+ def convert_conv_op_to_dim(conv_op: Type[_ConvNd]) -> int:
27
+ """
28
+ :param conv_op: conv class
29
+ :return: dimension: 1, 2 or 3
30
+ """
31
+ if conv_op == nn.Conv1d:
32
+ return 1
33
+ elif conv_op == nn.Conv2d:
34
+ return 2
35
+ elif conv_op == nn.Conv3d:
36
+ return 3
37
+ else:
38
+ raise ValueError("Unknown dimension. Only 1d 2d and 3d conv are supported. got %s" % str(conv_op))
39
+
40
+
41
+ def get_matching_pool_op(conv_op: Type[_ConvNd] = None,
42
+ dimension: int = None,
43
+ adaptive=False,
44
+ pool_type: str = 'avg') -> Type[torch.nn.Module]:
45
+ """
46
+ You MUST set EITHER conv_op OR dimension. Do not set both!
47
+ :param conv_op:
48
+ :param dimension:
49
+ :param adaptive:
50
+ :param pool_type: either 'avg' or 'max'
51
+ :return:
52
+ """
53
+ assert not ((conv_op is not None) and (dimension is not None)), \
54
+ "You MUST set EITHER conv_op OR dimension. Do not set both!"
55
+ assert pool_type in ['avg', 'max'], 'pool_type must be either avg or max'
56
+ if conv_op is not None:
57
+ dimension = convert_conv_op_to_dim(conv_op)
58
+ assert dimension in [1, 2, 3], 'Dimension must be 1, 2 or 3'
59
+
60
+ if conv_op is not None:
61
+ dimension = convert_conv_op_to_dim(conv_op)
62
+
63
+ if dimension == 1:
64
+ if pool_type == 'avg':
65
+ if adaptive:
66
+ return nn.AdaptiveAvgPool1d
67
+ else:
68
+ return nn.AvgPool1d
69
+ elif pool_type == 'max':
70
+ if adaptive:
71
+ return nn.AdaptiveMaxPool1d
72
+ else:
73
+ return nn.MaxPool1d
74
+ elif dimension == 2:
75
+ if pool_type == 'avg':
76
+ if adaptive:
77
+ return nn.AdaptiveAvgPool2d
78
+ else:
79
+ return nn.AvgPool2d
80
+ elif pool_type == 'max':
81
+ if adaptive:
82
+ return nn.AdaptiveMaxPool2d
83
+ else:
84
+ return nn.MaxPool2d
85
+ elif dimension == 3:
86
+ if pool_type == 'avg':
87
+ if adaptive:
88
+ return nn.AdaptiveAvgPool3d
89
+ else:
90
+ return nn.AvgPool3d
91
+ elif pool_type == 'max':
92
+ if adaptive:
93
+ return nn.AdaptiveMaxPool3d
94
+ else:
95
+ return nn.MaxPool3d
96
+
97
+
98
+ def get_matching_instancenorm(conv_op: Type[_ConvNd] = None, dimension: int = None) -> Type[_InstanceNorm]:
99
+ """
100
+ You MUST set EITHER conv_op OR dimension. Do not set both!
101
+
102
+ :param conv_op:
103
+ :param dimension:
104
+ :return:
105
+ """
106
+ assert not ((conv_op is not None) and (dimension is not None)), \
107
+ "You MUST set EITHER conv_op OR dimension. Do not set both!"
108
+ if conv_op is not None:
109
+ dimension = convert_conv_op_to_dim(conv_op)
110
+ if dimension is not None:
111
+ assert dimension in [1, 2, 3], 'Dimension must be 1, 2 or 3'
112
+ if dimension == 1:
113
+ return nn.InstanceNorm1d
114
+ elif dimension == 2:
115
+ return nn.InstanceNorm2d
116
+ elif dimension == 3:
117
+ return nn.InstanceNorm3d
118
+
119
+
120
+ def get_matching_convtransp(conv_op: Type[_ConvNd] = None, dimension: int = None) -> Type[_ConvTransposeNd]:
121
+ """
122
+ You MUST set EITHER conv_op OR dimension. Do not set both!
123
+
124
+ :param conv_op:
125
+ :param dimension:
126
+ :return:
127
+ """
128
+ assert not ((conv_op is not None) and (dimension is not None)), \
129
+ "You MUST set EITHER conv_op OR dimension. Do not set both!"
130
+ if conv_op is not None:
131
+ dimension = convert_conv_op_to_dim(conv_op)
132
+ assert dimension in [1, 2, 3], 'Dimension must be 1, 2 or 3'
133
+ if dimension == 1:
134
+ return nn.ConvTranspose1d
135
+ elif dimension == 2:
136
+ return nn.ConvTranspose2d
137
+ elif dimension == 3:
138
+ return nn.ConvTranspose3d
139
+
140
+
141
+ def get_matching_batchnorm(conv_op: Type[_ConvNd] = None, dimension: int = None) -> Type[_BatchNorm]:
142
+ """
143
+ You MUST set EITHER conv_op OR dimension. Do not set both!
144
+
145
+ :param conv_op:
146
+ :param dimension:
147
+ :return:
148
+ """
149
+ assert not ((conv_op is not None) and (dimension is not None)), \
150
+ "You MUST set EITHER conv_op OR dimension. Do not set both!"
151
+ if conv_op is not None:
152
+ dimension = convert_conv_op_to_dim(conv_op)
153
+ assert dimension in [1, 2, 3], 'Dimension must be 1, 2 or 3'
154
+ if dimension == 1:
155
+ return nn.BatchNorm1d
156
+ elif dimension == 2:
157
+ return nn.BatchNorm2d
158
+ elif dimension == 3:
159
+ return nn.BatchNorm3d
160
+
161
+
162
+ def get_matching_dropout(conv_op: Type[_ConvNd] = None, dimension: int = None) -> Type[_DropoutNd]:
163
+ """
164
+ You MUST set EITHER conv_op OR dimension. Do not set both!
165
+
166
+ :param conv_op:
167
+ :param dimension:
168
+ :return:
169
+ """
170
+ assert not ((conv_op is not None) and (dimension is not None)), \
171
+ "You MUST set EITHER conv_op OR dimension. Do not set both!"
172
+ assert dimension in [1, 2, 3], 'Dimension must be 1, 2 or 3'
173
+ if dimension == 1:
174
+ return nn.Dropout
175
+ elif dimension == 2:
176
+ return nn.Dropout2d
177
+ elif dimension == 3:
178
+ return nn.Dropout3d
179
+
180
+
181
+ def maybe_convert_scalar_to_list(conv_op, scalar):
182
+ """
183
+ useful for converting, for example, kernel_size=3 to [3, 3, 3] in case of nn.Conv3d
184
+ :param conv_op:
185
+ :param scalar:
186
+ :return:
187
+ """
188
+ if not isinstance(scalar, (tuple, list, np.ndarray)):
189
+ if conv_op == nn.Conv2d:
190
+ return [scalar] * 2
191
+ elif conv_op == nn.Conv3d:
192
+ return [scalar] * 3
193
+ elif conv_op == nn.Conv1d:
194
+ return [scalar] * 1
195
+ else:
196
+ raise RuntimeError("Invalid conv op: %s" % str(conv_op))
197
+ else:
198
+ return scalar
199
+
200
+
201
+ def get_default_network_config(dimension: int = 2,
202
+ nonlin: str = "ReLU",
203
+ norm_type: str = "bn") -> dict:
204
+ """
205
+ Use this to get a standard configuration. A network configuration looks like this:
206
+
207
+ config = {'conv_op': torch.nn.modules.conv.Conv2d,
208
+ 'dropout_op': torch.nn.modules.dropout.Dropout2d,
209
+ 'norm_op': torch.nn.modules.batchnorm.BatchNorm2d,
210
+ 'norm_op_kwargs': {'eps': 1e-05, 'affine': True},
211
+ 'nonlin': torch.nn.modules.activation.ReLU,
212
+ 'nonlin_kwargs': {'inplace': True}}
213
+
214
+ There is no need to use get_default_network_config. You can create your own. Network configs are a convenient way of
215
+ setting dimensionality, normalization and nonlinearity.
216
+
217
+ :param dimension: integer denoting the dimension of the data. 1, 2 and 3 are accepted
218
+ :param nonlin: string (ReLU or LeakyReLU)
219
+ :param norm_type: string (bn=batch norm, in=instance norm)
220
+ torch.nn.Module
221
+ :return: dict
222
+ """
223
+ config = {}
224
+ config['conv_op'] = convert_dim_to_conv_op(dimension)
225
+ config['dropout_op'] = get_matching_dropout(dimension=dimension)
226
+ if norm_type == "bn":
227
+ config['norm_op'] = get_matching_batchnorm(dimension=dimension)
228
+ elif norm_type == "in":
229
+ config['norm_op'] = get_matching_instancenorm(dimension=dimension)
230
+
231
+ config['norm_op_kwargs'] = None # this will use defaults
232
+
233
+ if nonlin == "LeakyReLU":
234
+ config['nonlin'] = nn.LeakyReLU
235
+ config['nonlin_kwargs'] = {'negative_slope': 1e-2, 'inplace': True}
236
+ elif nonlin == "ReLU":
237
+ config['nonlin'] = nn.ReLU
238
+ config['nonlin_kwargs'] = {'inplace': True}
239
+ else:
240
+ raise NotImplementedError('Unknown nonlin %s. Only "LeakyReLU" and "ReLU" are supported for now' % nonlin)
241
+
242
+ return config
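A short sketch of how the helpers above are typically combined (not part of the commit); the argument values are illustrative.

from torch import nn
from dynamic_network_architectures.building_blocks.helper import (
    convert_dim_to_conv_op, get_default_network_config, maybe_convert_scalar_to_list)

# dimension-matched ops bundled into one config dict
cfg = get_default_network_config(dimension=3, nonlin="LeakyReLU", norm_type="in")
print(cfg['conv_op'])        # nn.Conv3d
print(cfg['norm_op'])        # nn.InstanceNorm3d
print(cfg['nonlin_kwargs'])  # {'negative_slope': 0.01, 'inplace': True}

# broadcast a scalar kernel size / stride to the conv op's dimensionality
conv_op = convert_dim_to_conv_op(2)              # nn.Conv2d
print(maybe_convert_scalar_to_list(conv_op, 3))  # [3, 3]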
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/plain_conv_encoder.py ADDED
@@ -0,0 +1,105 @@
1
+ import torch
2
+ from torch import nn
3
+ import numpy as np
4
+ from typing import Union, Type, List, Tuple
5
+
6
+ from torch.nn.modules.conv import _ConvNd
7
+ from torch.nn.modules.dropout import _DropoutNd
8
+ from dynamic_network_architectures.building_blocks.simple_conv_blocks import StackedConvBlocks
9
+ from dynamic_network_architectures.building_blocks.helper import maybe_convert_scalar_to_list, get_matching_pool_op
10
+
11
+
12
+ class PlainConvEncoder(nn.Module):
13
+ def __init__(self,
14
+ input_channels: int,
15
+ n_stages: int,
16
+ features_per_stage: Union[int, List[int], Tuple[int, ...]],
17
+ conv_op: Type[_ConvNd],
18
+ kernel_sizes: Union[int, List[int], Tuple[int, ...]],
19
+ strides: Union[int, List[int], Tuple[int, ...]],
20
+ n_conv_per_stage: Union[int, List[int], Tuple[int, ...]],
21
+ conv_bias: bool = False,
22
+ norm_op: Union[None, Type[nn.Module]] = None,
23
+ norm_op_kwargs: dict = None,
24
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
25
+ dropout_op_kwargs: dict = None,
26
+ nonlin: Union[None, Type[torch.nn.Module]] = None,
27
+ nonlin_kwargs: dict = None,
28
+ return_skips: bool = False,
29
+ nonlin_first: bool = False,
30
+ pool: str = 'conv'
31
+ ):
32
+
33
+ super().__init__()
34
+ if isinstance(kernel_sizes, int):
35
+ kernel_sizes = [kernel_sizes] * n_stages
36
+ if isinstance(features_per_stage, int):
37
+ features_per_stage = [features_per_stage] * n_stages
38
+ if isinstance(n_conv_per_stage, int):
39
+ n_conv_per_stage = [n_conv_per_stage] * n_stages
40
+ if isinstance(strides, int):
41
+ strides = [strides] * n_stages
42
+ assert len(kernel_sizes) == n_stages, "kernel_sizes must have as many entries as we have resolution stages (n_stages)"
43
+ assert len(n_conv_per_stage) == n_stages, "n_conv_per_stage must have as many entries as we have resolution stages (n_stages)"
44
+ assert len(features_per_stage) == n_stages, "features_per_stage must have as many entries as we have resolution stages (n_stages)"
45
+ assert len(strides) == n_stages, "strides must have as many entries as we have resolution stages (n_stages). " \
46
+ "Important: first entry is recommended to be 1, else we run strided conv drectly on the input"
47
+
48
+ stages = []
49
+ for s in range(n_stages):
50
+ stage_modules = []
51
+ if pool == 'max' or pool == 'avg':
52
+ if (isinstance(strides[s], int) and strides[s] != 1) or \
53
+ isinstance(strides[s], (tuple, list)) and any([i != 1 for i in strides[s]]):
54
+ stage_modules.append(get_matching_pool_op(conv_op, pool_type=pool)(kernel_size=strides[s], stride=strides[s]))
55
+ conv_stride = 1
56
+ elif pool == 'conv':
57
+ conv_stride = strides[s]
58
+ else:
59
+ raise RuntimeError()
60
+ stage_modules.append(StackedConvBlocks(
61
+ n_conv_per_stage[s], conv_op, input_channels, features_per_stage[s], kernel_sizes[s], conv_stride,
62
+ conv_bias, norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, nonlin, nonlin_kwargs, nonlin_first
63
+ ))
64
+ stages.append(nn.Sequential(*stage_modules))
65
+ input_channels = features_per_stage[s]
66
+
67
+ self.stages = nn.Sequential(*stages)
68
+ self.output_channels = features_per_stage
69
+ self.strides = [maybe_convert_scalar_to_list(conv_op, i) for i in strides]
70
+ self.return_skips = return_skips
71
+
72
+ # we store some things that a potential decoder needs
73
+ self.conv_op = conv_op
74
+ self.norm_op = norm_op
75
+ self.norm_op_kwargs = norm_op_kwargs
76
+ self.nonlin = nonlin
77
+ self.nonlin_kwargs = nonlin_kwargs
78
+ self.dropout_op = dropout_op
79
+ self.dropout_op_kwargs = dropout_op_kwargs
80
+ self.conv_bias = conv_bias
81
+ self.kernel_sizes = kernel_sizes
82
+
83
+ def forward(self, x):
84
+ ret = []
85
+ for s in self.stages:
86
+ x = s(x)
87
+ ret.append(x)
88
+ if self.return_skips:
89
+ return ret
90
+ else:
91
+ return ret[-1]
92
+
93
+ def compute_conv_feature_map_size(self, input_size):
94
+ output = np.int64(0)
95
+ for s in range(len(self.stages)):
96
+ if isinstance(self.stages[s], nn.Sequential):
97
+ for sq in self.stages[s]:
98
+ if hasattr(sq, 'compute_conv_feature_map_size'):
99
+ output += self.stages[s][-1].compute_conv_feature_map_size(input_size)
100
+ else:
101
+ output += self.stages[s].compute_conv_feature_map_size(input_size)
102
+ input_size = [i // j for i, j in zip(input_size, self.strides[s])]
103
+ return output
104
+
105
+
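A minimal sketch of the encoder on its own, returning skip features the way a decoder would consume them (not part of the commit); the stage layout is illustrative.

import torch
from torch import nn
from dynamic_network_architectures.building_blocks.plain_conv_encoder import PlainConvEncoder

enc = PlainConvEncoder(
    input_channels=1, n_stages=4, features_per_stage=(16, 32, 64, 128),
    conv_op=nn.Conv2d, kernel_sizes=3, strides=(1, 2, 2, 2), n_conv_per_stage=2,
    norm_op=nn.InstanceNorm2d, norm_op_kwargs={'affine': True},
    nonlin=nn.ReLU, nonlin_kwargs={'inplace': True},
    return_skips=True)

x = torch.rand(2, 1, 128, 128)
skips = enc(x)                       # one feature map per stage, coarsest last
for s in skips:
    print(s.shape)                   # (2, 16, 128, 128) ... (2, 128, 16, 16)
print(enc.compute_conv_feature_map_size((128, 128)))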
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/regularization.py ADDED
@@ -0,0 +1,86 @@
1
+ from torch import nn
2
+
3
+
4
+ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
5
+ """
6
+ This function is taken from the timm package (https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py).
7
+
8
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
9
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
10
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
11
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
12
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
13
+ 'survival rate' as the argument.
14
+ """
15
+ if drop_prob == 0. or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0 and scale_by_keep:
21
+ random_tensor.div_(keep_prob)
22
+ return x * random_tensor
23
+
24
+
25
+ class DropPath(nn.Module):
26
+ """
27
+ This class is taken from the timm package (https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py).
28
+
29
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
30
+ """
31
+ def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
32
+ super(DropPath, self).__init__()
33
+ self.drop_prob = drop_prob
34
+ self.scale_by_keep = scale_by_keep
35
+
36
+ def forward(self, x):
37
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
38
+
39
+
40
+ class SqueezeExcite(nn.Module):
41
+ """
42
+ This class is taken from the timm package (https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/squeeze_excite.py)
43
+ and slightly modified so that the convolution type can be adapted.
44
+
45
+ SE Module as defined in original SE-Nets with a few additions
46
+ Additions include:
47
+ * divisor can be specified to keep channels % div == 0 (default: 8)
48
+ * reduction channels can be specified directly by arg (if rd_channels is set)
49
+ * reduction channels can be specified by float rd_ratio (default: 1/16)
50
+ * global max pooling can be added to the squeeze aggregation
51
+ * customizable activation, normalization, and gate layer
52
+ """
53
+ def __init__(
54
+ self, channels, conv_op, rd_ratio=1. / 16, rd_channels=None, rd_divisor=8, add_maxpool=False,
55
+ act_layer=nn.ReLU, norm_layer=None, gate_layer=nn.Sigmoid):
56
+ super(SqueezeExcite, self).__init__()
57
+ self.add_maxpool = add_maxpool
58
+ if not rd_channels:
59
+ rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)
60
+ self.fc1 = conv_op(channels, rd_channels, kernel_size=1, bias=True)
61
+ self.bn = norm_layer(rd_channels) if norm_layer else nn.Identity()
62
+ self.act = act_layer(inplace=True)
63
+ self.fc2 = conv_op(rd_channels, channels, kernel_size=1, bias=True)
64
+ self.gate = gate_layer()
65
+
66
+ def forward(self, x):
67
+ x_se = x.mean((2, 3), keepdim=True)
68
+ if self.add_maxpool:
69
+ # experimental codepath, may remove or change
70
+ x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True)
71
+ x_se = self.fc1(x_se)
72
+ x_se = self.act(self.bn(x_se))
73
+ x_se = self.fc2(x_se)
74
+ return x * self.gate(x_se)
75
+
76
+
77
+ def make_divisible(v, divisor=8, min_value=None, round_limit=.9):
78
+ """
79
+ This function is taken from the timm package (https://github.com/rwightman/pytorch-image-models/blob/b7cb8d0337b3e7b50516849805ddb9be5fc11644/timm/models/layers/helpers.py#L25)
80
+ """
81
+ min_value = min_value or divisor
82
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
83
+ # Make sure that round down does not go down by more than 10%.
84
+ if new_v < round_limit * v:
85
+ new_v += divisor
86
+ return new_v
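A quick sketch exercising the two regularization blocks above on a 2D feature map (not part of the commit). Note that SqueezeExcite as written pools over dims (2, 3), so it targets 2D conv ops; the tensor shapes are illustrative.

import torch
from torch import nn
from dynamic_network_architectures.building_blocks.regularization import DropPath, SqueezeExcite

x = torch.rand(2, 32, 16, 16)

drop = DropPath(drop_prob=0.2)       # stochastic depth: zeroes whole samples at random during training
drop.train()
print(drop(x).shape)                 # (2, 32, 16, 16)

se = SqueezeExcite(channels=32, conv_op=nn.Conv2d)   # rd_ratio defaults to 1/16
print(se(x).shape)                   # (2, 32, 16, 16), channels rescaled by learned gates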
model/dynamic-network-architectures-main/dynamic_network_architectures/building_blocks/residual.py ADDED
@@ -0,0 +1,371 @@
1
+ from typing import Tuple, List, Union, Type
2
+ import torch.nn
3
+ from torch import nn
4
+ from torch.nn.modules.conv import _ConvNd
5
+ from torch.nn.modules.dropout import _DropoutNd
6
+
7
+ from dynamic_network_architectures.building_blocks.helper import maybe_convert_scalar_to_list, get_matching_pool_op
8
+ from dynamic_network_architectures.building_blocks.simple_conv_blocks import ConvDropoutNormReLU
9
+ from dynamic_network_architectures.building_blocks.regularization import DropPath, SqueezeExcite
10
+ import numpy as np
11
+
12
+
13
+ class BasicBlockD(nn.Module):
14
+ def __init__(self,
15
+ conv_op: Type[_ConvNd],
16
+ input_channels: int,
17
+ output_channels: int,
18
+ kernel_size: Union[int, List[int], Tuple[int, ...]],
19
+ stride: Union[int, List[int], Tuple[int, ...]],
20
+ conv_bias: bool = False,
21
+ norm_op: Union[None, Type[nn.Module]] = None,
22
+ norm_op_kwargs: dict = None,
23
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
24
+ dropout_op_kwargs: dict = None,
25
+ nonlin: Union[None, Type[torch.nn.Module]] = None,
26
+ nonlin_kwargs: dict = None,
27
+ stochastic_depth_p: float = 0.0,
28
+ squeeze_excitation: bool = False,
29
+ squeeze_excitation_reduction_ratio: float = 1. / 16,
30
+ # todo wideresnet?
31
+ ):
32
+ """
33
+ This implementation follows ResNet-D:
34
+
35
+ He, Tong, et al. "Bag of tricks for image classification with convolutional neural networks."
36
+ Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
37
+
38
+ The skip has an avgpool (if needed) followed by 1x1 conv instead of just a strided 1x1 conv
39
+
40
+ :param conv_op:
41
+ :param input_channels:
42
+ :param output_channels:
43
+ :param kernel_size: refers only to convs in feature extraction path, not to 1x1x1 conv in skip
44
+ :param stride: only applies to first conv (and skip). Second conv always has stride 1
45
+ :param conv_bias:
46
+ :param norm_op:
47
+ :param norm_op_kwargs:
48
+ :param dropout_op: only the first conv can have dropout. The second never has
49
+ :param dropout_op_kwargs:
50
+ :param nonlin:
51
+ :param nonlin_kwargs:
52
+ :param stochastic_depth_p:
53
+ :param squeeze_excitation:
54
+ :param squeeze_excitation_reduction_ratio:
55
+ """
56
+ super().__init__()
57
+ self.input_channels = input_channels
58
+ self.output_channels = output_channels
59
+ stride = maybe_convert_scalar_to_list(conv_op, stride)
60
+ self.stride = stride
61
+
62
+ kernel_size = maybe_convert_scalar_to_list(conv_op, kernel_size)
63
+
64
+ if norm_op_kwargs is None:
65
+ norm_op_kwargs = {}
66
+ if nonlin_kwargs is None:
67
+ nonlin_kwargs = {}
68
+
69
+ self.conv1 = ConvDropoutNormReLU(conv_op, input_channels, output_channels, kernel_size, stride, conv_bias,
70
+ norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, nonlin, nonlin_kwargs)
71
+ self.conv2 = ConvDropoutNormReLU(conv_op, output_channels, output_channels, kernel_size, 1, conv_bias, norm_op,
72
+ norm_op_kwargs, None, None, None, None)
73
+
74
+ self.nonlin2 = nonlin(**nonlin_kwargs) if nonlin is not None else lambda x: x
75
+
76
+ # Stochastic Depth
77
+ self.apply_stochastic_depth = False if stochastic_depth_p == 0.0 else True
78
+ if self.apply_stochastic_depth:
79
+ self.drop_path = DropPath(drop_prob=stochastic_depth_p)
80
+
81
+ # Squeeze Excitation
82
+ self.apply_se = squeeze_excitation
83
+ if self.apply_se:
84
+ self.squeeze_excitation = SqueezeExcite(self.output_channels, conv_op,
85
+ rd_ratio=squeeze_excitation_reduction_ratio, rd_divisor=8)
86
+
87
+ has_stride = (isinstance(stride, int) and stride != 1) or any([i != 1 for i in stride])
88
+ requires_projection = (input_channels != output_channels)
89
+
90
+ if has_stride or requires_projection:
91
+ ops = []
92
+ if has_stride:
93
+ ops.append(get_matching_pool_op(conv_op=conv_op, adaptive=False, pool_type='avg')(stride, stride))
94
+ if requires_projection:
95
+ ops.append(
96
+ ConvDropoutNormReLU(conv_op, input_channels, output_channels, 1, 1, False, norm_op,
97
+ norm_op_kwargs, None, None, None, None
98
+ )
99
+ )
100
+ self.skip = nn.Sequential(*ops)
101
+ else:
102
+ self.skip = lambda x: x
103
+
104
+ def forward(self, x):
105
+ residual = self.skip(x)
106
+ out = self.conv2(self.conv1(x))
107
+ if self.apply_stochastic_depth:
108
+ out = self.drop_path(out)
109
+ if self.apply_se:
110
+ out = self.squeeze_excitation(out)
111
+ out += residual
112
+ return self.nonlin2(out)
113
+
114
+ def compute_conv_feature_map_size(self, input_size):
115
+ assert len(input_size) == len(self.stride), "just give the image size without color/feature channels or " \
116
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
117
+ "Give input_size=(x, y(, z))!"
118
+ size_after_stride = [i // j for i, j in zip(input_size, self.stride)]
119
+ # conv1
120
+ output_size_conv1 = np.prod([self.output_channels, *size_after_stride], dtype=np.int64)
121
+ # conv2
122
+ output_size_conv2 = np.prod([self.output_channels, *size_after_stride], dtype=np.int64)
123
+ # skip conv (if applicable)
124
+ if (self.input_channels != self.output_channels) or any([i != j for i, j in zip(input_size, size_after_stride)]):
125
+ assert isinstance(self.skip, nn.Sequential)
126
+ output_size_skip = np.prod([self.output_channels, *size_after_stride], dtype=np.int64)
127
+ else:
128
+ assert not isinstance(self.skip, nn.Sequential)
129
+ output_size_skip = 0
130
+ return output_size_conv1 + output_size_conv2 + output_size_skip
131
+
132
+
133
+ class BottleneckD(nn.Module):
134
+ def __init__(self,
135
+ conv_op: Type[_ConvNd],
136
+ input_channels: int,
137
+ bottleneck_channels: int,
138
+ output_channels: int,
139
+ kernel_size: Union[int, List[int], Tuple[int, ...]],
140
+ stride: Union[int, List[int], Tuple[int, ...]],
141
+ conv_bias: bool = False,
142
+ norm_op: Union[None, Type[nn.Module]] = None,
143
+ norm_op_kwargs: dict = None,
144
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
145
+ dropout_op_kwargs: dict = None,
146
+ nonlin: Union[None, Type[torch.nn.Module]] = None,
147
+ nonlin_kwargs: dict = None,
148
+ stochastic_depth_p: float = 0.0,
149
+ squeeze_excitation: bool = False,
150
+ squeeze_excitation_reduction_ratio: float = 1. / 16
151
+ ):
152
+ """
153
+ This implementation follows ResNet-D:
154
+
155
+ He, Tong, et al. "Bag of tricks for image classification with convolutional neural networks."
156
+ Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2019.
157
+
158
+ The stride sits in the 3x3 conv instead of the 1x1 conv!
159
+ The skip has an avgpool (if needed) followed by 1x1 conv instead of just a strided 1x1 conv
160
+
161
+ :param conv_op:
162
+ :param input_channels:
163
+ :param output_channels:
164
+ :param kernel_size: only affects the conv in the middle (typically 3x3). The other convs remain 1x1
165
+ :param stride: only applies to the conv in the middle (and skip). Note that this deviates from the canonical
166
+ ResNet implementation where the stride is applied to the first 1x1 conv. (This implementation follows ResNet-D)
167
+ :param conv_bias:
168
+ :param norm_op:
169
+ :param norm_op_kwargs:
170
+ :param dropout_op: only the second (kernel_size) conv can have dropout. The first and last conv (1x1(x1)) never have it
171
+ :param dropout_op_kwargs:
172
+ :param nonlin:
173
+ :param nonlin_kwargs:
174
+ :param stochastic_depth_p:
175
+ :param squeeze_excitation:
176
+ :param squeeze_excitation_reduction_ratio:
177
+ """
178
+ super().__init__()
179
+ self.input_channels = input_channels
180
+ self.output_channels = output_channels
181
+ self.bottleneck_channels = bottleneck_channels
182
+ stride = maybe_convert_scalar_to_list(conv_op, stride)
183
+ self.stride = stride
184
+
185
+ kernel_size = maybe_convert_scalar_to_list(conv_op, kernel_size)
186
+ if norm_op_kwargs is None:
187
+ norm_op_kwargs = {}
188
+ if nonlin_kwargs is None:
189
+ nonlin_kwargs = {}
190
+
191
+ self.conv1 = ConvDropoutNormReLU(conv_op, input_channels, bottleneck_channels, 1, 1, conv_bias,
192
+ norm_op, norm_op_kwargs, None, None, nonlin, nonlin_kwargs)
193
+ self.conv2 = ConvDropoutNormReLU(conv_op, bottleneck_channels, bottleneck_channels, kernel_size, stride,
194
+ conv_bias,
195
+ norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, nonlin, nonlin_kwargs)
196
+ self.conv3 = ConvDropoutNormReLU(conv_op, bottleneck_channels, output_channels, 1, 1, conv_bias, norm_op,
197
+ norm_op_kwargs, None, None, None, None)
198
+
199
+ self.nonlin3 = nonlin(**nonlin_kwargs) if nonlin is not None else lambda x: x
200
+
201
+ # Stochastic Depth
202
+ self.apply_stochastic_depth = False if stochastic_depth_p == 0.0 else True
203
+ if self.apply_stochastic_depth:
204
+ self.drop_path = DropPath(drop_prob=stochastic_depth_p)
205
+
206
+ # Squeeze Excitation
207
+ self.apply_se = squeeze_excitation
208
+ if self.apply_se:
209
+ self.squeeze_excitation = SqueezeExcite(self.output_channels, conv_op,
210
+ rd_ratio=squeeze_excitation_reduction_ratio, rd_divisor=8)
211
+
212
+ has_stride = (isinstance(stride, int) and stride != 1) or any([i != 1 for i in stride])
213
+ requires_projection = (input_channels != output_channels)
214
+
215
+ if has_stride or requires_projection:
216
+ ops = []
217
+ if has_stride:
218
+ ops.append(get_matching_pool_op(conv_op=conv_op, adaptive=False, pool_type='avg')(stride, stride))
219
+ if requires_projection:
220
+ ops.append(
221
+ ConvDropoutNormReLU(conv_op, input_channels, output_channels, 1, 1, False,
222
+ norm_op, norm_op_kwargs, None, None, None, None
223
+ )
224
+ )
225
+ self.skip = nn.Sequential(*ops)
226
+ else:
227
+ self.skip = lambda x: x
228
+
229
+ def forward(self, x):
230
+ residual = self.skip(x)
231
+ out = self.conv3(self.conv2(self.conv1(x)))
232
+ if self.apply_stochastic_depth:
233
+ out = self.drop_path(out)
234
+ if self.apply_se:
235
+ out = self.squeeze_excitation(out)
236
+ out += residual
237
+ return self.nonlin3(out)
238
+
239
+ def compute_conv_feature_map_size(self, input_size):
240
+ assert len(input_size) == len(self.stride), "just give the image size without color/feature channels or " \
241
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
242
+ "Give input_size=(x, y(, z))!"
243
+ size_after_stride = [i // j for i, j in zip(input_size, self.stride)]
244
+ # conv1
245
+ output_size_conv1 = np.prod([self.bottleneck_channels, *input_size], dtype=np.int64)
246
+ # conv2
247
+ output_size_conv2 = np.prod([self.bottleneck_channels, *size_after_stride], dtype=np.int64)
248
+ # conv3
249
+ output_size_conv3 = np.prod([self.output_channels, *size_after_stride], dtype=np.int64)
250
+ # skip conv (if applicable)
251
+ if (self.input_channels != self.output_channels) or any([i != j for i, j in zip(input_size, size_after_stride)]):
252
+ assert isinstance(self.skip, nn.Sequential)
253
+ output_size_skip = np.prod([self.output_channels, *size_after_stride], dtype=np.int64)
254
+ else:
255
+ assert not isinstance(self.skip, nn.Sequential)
256
+ output_size_skip = 0
257
+ return output_size_conv1 + output_size_conv2 + output_size_conv3 + output_size_skip
258
+
259
+
260
+ class StackedResidualBlocks(nn.Module):
261
+ def __init__(self,
262
+ n_blocks: int,
263
+ conv_op: Type[_ConvNd],
264
+ input_channels: int,
265
+ output_channels: Union[int, List[int], Tuple[int, ...]],
266
+ kernel_size: Union[int, List[int], Tuple[int, ...]],
267
+ initial_stride: Union[int, List[int], Tuple[int, ...]],
268
+ conv_bias: bool = False,
269
+ norm_op: Union[None, Type[nn.Module]] = None,
270
+ norm_op_kwargs: dict = None,
271
+ dropout_op: Union[None, Type[_DropoutNd]] = None,
272
+ dropout_op_kwargs: dict = None,
273
+ nonlin: Union[None, Type[torch.nn.Module]] = None,
274
+ nonlin_kwargs: dict = None,
275
+ block: Union[Type[BasicBlockD], Type[BottleneckD]] = BasicBlockD,
276
+ bottleneck_channels: Union[int, List[int], Tuple[int, ...]] = None,
277
+ stochastic_depth_p: float = 0.0,
278
+ squeeze_excitation: bool = False,
279
+ squeeze_excitation_reduction_ratio: float = 1. / 16
280
+ ):
281
+ """
282
+ Stack multiple instances of block.
283
+
284
+ :param n_blocks: number of residual blocks
285
+ :param conv_op: nn.ConvNd class
286
+ :param input_channels: only relevant for the first block in the sequence. This is the input number of features.
287
+ After the first block, the number of features in the main path to which the residuals are added is output_channels
288
+ :param output_channels: number of features in the main path to which the residuals are added (and also the
289
+ number of features of the output)
290
+ :param kernel_size: kernel size for all nxn (n!=1) convolutions. Default: 3x3
291
+ :param initial_stride: only affects the first block. All subsequent blocks have stride 1
292
+ :param conv_bias: usually False
293
+ :param norm_op: nn.BatchNormNd, InstanceNormNd etc
294
+ :param norm_op_kwargs: dictionary of kwargs. Leave empty ({}) for defaults
295
+ :param dropout_op: nn.DropoutNd, can be None for no dropout
296
+ :param dropout_op_kwargs:
297
+ :param nonlin:
298
+ :param nonlin_kwargs:
299
+ :param block: BasicBlockD or BottleneckD
300
+ :param bottleneck_channels: if block is BottleneckD then we need to know the number of bottleneck features.
301
+ Bottleneck will use first 1x1 conv to reduce input to bottleneck features, then run the nxn (see kernel_size)
302
+ conv on that (bottleneck -> bottleneck). Finally the output will be projected back to output_channels
303
+ (bottleneck -> output_channels) with the final 1x1 conv
304
+ :param stochastic_depth_p: probability of applying stochastic depth in residual blocks
305
+ :param squeeze_excitation: whether to apply squeeze and excitation or not
306
+ :param squeeze_excitation_reduction_ratio: ratio by how much squeeze and excitation should reduce channels
307
+ respective to number of out channels of respective block
308
+ """
309
+ super().__init__()
310
+ assert n_blocks > 0, 'n_blocks must be > 0'
311
+ assert block in [BasicBlockD, BottleneckD], 'block must be BasicBlockD or BottleneckD'
312
+ if not isinstance(output_channels, (tuple, list)):
313
+ output_channels = [output_channels] * n_blocks
314
+ if not isinstance(bottleneck_channels, (tuple, list)):
315
+ bottleneck_channels = [bottleneck_channels] * n_blocks
316
+
317
+ if block == BasicBlockD:
318
+ blocks = nn.Sequential(
319
+ block(conv_op, input_channels, output_channels[0], kernel_size, initial_stride, conv_bias,
320
+ norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, nonlin, nonlin_kwargs, stochastic_depth_p,
321
+ squeeze_excitation, squeeze_excitation_reduction_ratio),
322
+ *[block(conv_op, output_channels[n - 1], output_channels[n], kernel_size, 1, conv_bias, norm_op,
323
+ norm_op_kwargs, dropout_op, dropout_op_kwargs, nonlin, nonlin_kwargs, stochastic_depth_p,
324
+ squeeze_excitation, squeeze_excitation_reduction_ratio) for n in range(1, n_blocks)]
325
+ )
326
+ else:
327
+ blocks = nn.Sequential(
328
+ block(conv_op, input_channels, bottleneck_channels[0], output_channels[0], kernel_size,
329
+ initial_stride, conv_bias, norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs,
330
+ nonlin, nonlin_kwargs, stochastic_depth_p, squeeze_excitation, squeeze_excitation_reduction_ratio),
331
+ *[block(conv_op, output_channels[n - 1], bottleneck_channels[n], output_channels[n], kernel_size,
332
+ 1, conv_bias, norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs,
333
+ nonlin, nonlin_kwargs, stochastic_depth_p, squeeze_excitation,
334
+ squeeze_excitation_reduction_ratio) for n in range(1, n_blocks)]
335
+ )
336
+ self.blocks = blocks
337
+ self.initial_stride = maybe_convert_scalar_to_list(conv_op, initial_stride)
338
+ self.output_channels = output_channels[-1]
339
+
340
+ def forward(self, x):
341
+ return self.blocks(x)
342
+
343
+ def compute_conv_feature_map_size(self, input_size):
344
+ assert len(input_size) == len(self.initial_stride), "just give the image size without color/feature channels or " \
345
+ "batch channel. Do not give input_size=(b, c, x, y(, z)). " \
346
+ "Give input_size=(x, y(, z))!"
347
+ output = self.blocks[0].compute_conv_feature_map_size(input_size)
348
+ size_after_stride = [i // j for i, j in zip(input_size, self.initial_stride)]
349
+ for b in self.blocks[1:]:
350
+ output += b.compute_conv_feature_map_size(size_after_stride)
351
+ return output
352
+
353
+
354
+ if __name__ == '__main__':
355
+ data = torch.rand((1, 3, 40, 32))
356
+
357
+ stx = StackedResidualBlocks(2, nn.Conv2d, 24, (16, 16), (3, 3), (1, 2),
358
+ norm_op=nn.BatchNorm2d, nonlin=nn.ReLU, nonlin_kwargs={'inplace': True},
359
+ block=BottleneckD, bottleneck_channels=3)
360
+ model = nn.Sequential(ConvDropoutNormReLU(nn.Conv2d,
361
+ 3, 24, 3, 1, True, nn.BatchNorm2d, {}, None, None, nn.LeakyReLU,
362
+ {'inplace': True}),
363
+ stx)
364
+ import hiddenlayer as hl
365
+
366
+ g = hl.build_graph(model, data,
367
+ transforms=None)
368
+ g.save("network_architecture.pdf")
369
+ del g
370
+
371
+ print(stx.compute_conv_feature_map_size((40, 32)))
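A complementary sketch (not part of the commit) that runs a BasicBlockD stack without the hiddenlayer dependency used in the __main__ block above; all values are illustrative.

import torch
from torch import nn
from dynamic_network_architectures.building_blocks.residual import StackedResidualBlocks, BasicBlockD

stage = StackedResidualBlocks(
    n_blocks=3, conv_op=nn.Conv2d, input_channels=16, output_channels=32,
    kernel_size=3, initial_stride=2, norm_op=nn.BatchNorm2d,
    nonlin=nn.ReLU, nonlin_kwargs={'inplace': True}, block=BasicBlockD)

x = torch.rand(2, 16, 64, 64)
print(stage(x).shape)                                 # (2, 32, 32, 32): stride 2 only in the first block
print(stage.compute_conv_feature_map_size((64, 64)))  # feature-map elements per sample, for memory estimates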