# AsymmetryNet / train_mae_swin3d.py
# (repository-page residue preserved as comments: uploaded by jdmayfield,
#  "Create train_mae_swin3d.py", commit 529f7c0 verified)
#!/usr/bin/env python
"""
Masked Autoencoder (MAE) pretraining with 3D Swin Transformer for OPSCC CT scans.
Asymmetry-aware reconstruction + overfitting monitoring via cosine similarity.
Run example:
python train_mae_swin3d.py --data-dir /path/to/your/nii_folder --output-dir ./checkpoints
"""
"""
Self-Supervised Learning for OPSCC CT using 3D Swin Transformer MAE
with asymmetry-aware reconstruction and overfitting monitoring
"""
import argparse
import json
import pickle
import warnings
from datetime import datetime
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from scipy import ndimage
import nibabel as nib
from tqdm import tqdm
warnings.filterwarnings("ignore", category=UserWarning)
# ==============================================================================
# Drop Path
# ==============================================================================
class DropPath(nn.Module):
    """Stochastic depth: randomly drop entire samples in a residual branch.

    Acts as the identity during evaluation or when drop_prob == 0.
    Surviving samples are rescaled by 1 / keep_prob so the expected
    output matches the input.
    """

    def __init__(self, drop_prob: float = 0.):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if not self.training or self.drop_prob == 0.:
            return x
        keep_prob = 1.0 - self.drop_prob
        # One Bernoulli draw per sample, broadcast across all other dims.
        mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = torch.rand(mask_shape, dtype=x.dtype, device=x.device).add_(keep_prob).floor_()
        return x.div(keep_prob) * mask
# ==============================================================================
# Asymmetry Detectors
# ==============================================================================
class AirwayAsymmetryDetector:
    """Per-slice airway asymmetry metrics for an axial CT volume.

    Intensities are assumed normalized to [0, 1] (air near 0). The bottom
    `exclude_inferior_fraction` and top `exclude_superior_fraction` of
    slices are zeroed out in the hybrid score.
    """

    def __init__(self, exclude_inferior_fraction=0.15, exclude_superior_fraction=0.10):
        self.exclude_inferior_fraction = exclude_inferior_fraction
        self.exclude_superior_fraction = exclude_superior_fraction

    def find_midline(self, slice_2d):
        """Return the column index that best splits the slice symmetrically."""
        h, w = slice_2d.shape
        center = w // 2
        best_col, best_score = center, float('inf')
        # Scan candidate midlines within +/- w/8 of the image center; keep
        # the first strict improvement (ties resolve to the earliest column).
        for col in range(center - w // 8, center + w // 8):
            half = min(col, w - col)
            if half < 10:  # too narrow a strip to compare meaningfully
                continue
            left_half = slice_2d[:, col - half:col]
            right_half = np.flip(slice_2d[:, col:col + half], axis=1)
            score = np.abs(left_half - right_half).mean()
            if score < best_score:
                best_score, best_col = score, col
        return best_col

    def detect_airway(self, slice_2d, air_thresh=0.1):
        """Binary mask of interior air pockets (candidate airway lumen)."""
        air = slice_2d < air_thresh
        labeled, n_comp = ndimage.label(air)
        # Any component touching the image border is outside air, not airway.
        border = set(labeled[0, :].flatten()) | set(labeled[-1, :].flatten()) | \
                 set(labeled[:, 0].flatten()) | set(labeled[:, -1].flatten())
        mask = np.zeros_like(air)
        for comp_id in range(1, n_comp + 1):
            if comp_id in border:
                continue
            comp = labeled == comp_id
            if comp.sum() > 20:  # drop tiny speckle components
                mask |= comp
        return mask

    def forward(self, volume):
        """Compute per-slice metrics for a (D, H, W) volume.

        Returns a dict of 1-D arrays: effacement, mass_effect,
        midline_shift, hybrid, midlines (one entry per slice).
        """
        d, h, w = volume.shape
        lo = int(d * self.exclude_inferior_fraction)
        hi = int(d * (1 - self.exclude_superior_fraction))
        out = {key: [] for key in ('effacement', 'mass_effect', 'midline_shift', 'hybrid', 'midlines')}
        for z in range(d):
            axial = volume[z]
            mid = self.find_midline(axial)
            out['midlines'].append(mid)
            out['midline_shift'].append(mid - w // 2)
            # Airway volume imbalance across the detected midline.
            airway = self.detect_airway(axial)
            left_air = airway[:, :mid].sum()
            right_air = airway[:, mid:].sum()
            total_air = left_air + right_air
            effacement = abs(left_air - right_air) / max(total_air, 1) if total_air > 0 else 0
            # Soft-tissue mirror difference over the widest comparable strip.
            half = min(mid, w - mid)
            mass_effect = 0
            if half > 0:
                soft = (axial > 0.2) & (axial < 0.7)
                left_soft = axial[:, mid - half:mid] * soft[:, mid - half:mid]
                right_soft = np.flip(axial[:, mid:mid + half], axis=1) * np.flip(soft[:, mid:mid + half], axis=1)
                mass_effect = np.abs(left_soft - right_soft).mean()
            hybrid = (0.5 * effacement + 0.5 * mass_effect) if lo <= z <= hi else 0
            out['effacement'].append(effacement)
            out['mass_effect'].append(mass_effect)
            out['hybrid'].append(hybrid)
        return {k: np.array(v) for k, v in out.items()}
class GlobalSoftTissueAsymmetryDetector:
    """Count small hypodense soft-tissue foci on each side of the midline.

    A crude left/right lymph-node asymmetry proxy on [0, 1]-normalized
    axial slices. The exclude_* fractions are stored for API symmetry with
    AirwayAsymmetryDetector but are not used by forward().
    """

    def __init__(self, exclude_inferior_fraction=0.15, exclude_superior_fraction=0.10):
        self.exclude_inferior_fraction = exclude_inferior_fraction
        self.exclude_superior_fraction = exclude_superior_fraction

    def forward(self, volume, midlines=None):
        """Per-slice left/right counts for a (D, H, W) volume.

        Returns a dict of 1-D arrays: left_hypo, right_hypo, hypo_asymmetry.
        """
        d, h, w = volume.shape
        if midlines is None:
            midlines = [w // 2] * d  # fall back to the geometric center
        out = {'left_hypo': [], 'right_hypo': [], 'hypo_asymmetry': []}
        for z in range(d):
            axial = volume[z]
            mid = midlines[z]
            soft = (axial > 0.2) & (axial < 0.7)
            # Hypodense foci inside soft tissue, cleaned with morphology.
            foci = (axial < 0.35) & soft
            foci = ndimage.binary_opening(foci, iterations=1)
            foci = ndimage.binary_closing(foci, iterations=2)
            labeled, n_comp = ndimage.label(foci)
            n_left = n_right = 0
            for comp_id in range(1, n_comp + 1):
                comp = labeled == comp_id
                area = comp.sum()
                if not (10 < area < 150):
                    continue  # only moderately sized foci count
                # Side is decided by the component's x-centroid.
                if np.argwhere(comp)[:, 1].mean() < mid:
                    n_left += 1
                else:
                    n_right += 1
            out['left_hypo'].append(n_left)
            out['right_hypo'].append(n_right)
            out['hypo_asymmetry'].append(abs(n_left - n_right))
        return {k: np.array(v) for k, v in out.items()}
# ==============================================================================
# 3D Swin Transformer Components
# ==============================================================================
def window_partition3d(x, window_size=(4,4,4)):
    """Split a (B, C, D, H, W) volume into non-overlapping 3-D windows.

    Spatial dims are zero-padded up to a multiple of the window size.

    Returns:
        windows: (num_windows * B, window_volume, C) token sequences.
        pads: (pad_d, pad_h, pad_w) amounts applied, needed to invert.
    """
    B, C, D, H, W = x.shape
    wd, wh, ww = window_size
    pads = ((wd - D % wd) % wd, (wh - H % wh) % wh, (ww - W % ww) % ww)
    # F.pad takes (before, after) pairs from the LAST dim backwards: W, H, D.
    x = F.pad(x, (0, pads[2], 0, pads[1], 0, pads[0]))
    Dp, Hp, Wp = D + pads[0], H + pads[1], W + pads[2]
    x = x.reshape(B, C, Dp // wd, wd, Hp // wh, wh, Wp // ww, ww)
    x = x.permute(0, 2, 4, 6, 1, 3, 5, 7).contiguous()
    windows = x.reshape(-1, C, wd * wh * ww).permute(0, 2, 1).contiguous()
    return windows, pads
def window_reverse3d(windows, window_size, B, D, H, W, pads):
    """Inverse of window_partition3d: reassemble windows into (B, C, D, H, W).

    Any padding recorded in `pads` is cropped away after reassembly.
    """
    pd, ph, pw = pads
    wd, wh, ww = window_size
    Dp, Hp, Wp = D + pd, H + ph, W + pw
    vol = windows.reshape(B, Dp // wd, Hp // wh, Wp // ww, wd, wh, ww, -1)
    vol = vol.permute(0, 7, 1, 4, 2, 5, 3, 6).contiguous()
    vol = vol.reshape(B, -1, Dp, Hp, Wp)
    # Drop the zero padding introduced before partitioning.
    return vol[:, :, :D, :H, :W]
class WindowAttention3D(nn.Module):
    """Multi-head self-attention over the tokens of a single 3-D window,
    with a learned relative-position bias (Swin style).

    Expects input of shape (num_windows * B, N, C) where
    N == prod(window_size).
    """
    def __init__(self, dim, window_size=(4,4,4), num_heads=3, qkv_bias=True, qk_scale=None,
                 attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Scale queries by 1/sqrt(head_dim) unless an explicit scale is given.
        self.scale = qk_scale or head_dim ** -0.5
        # Build an (N, N) table of flattened relative (d, h, w) offsets
        # between every pair of token positions inside one window.
        coords_d = torch.arange(window_size[0])
        coords_h = torch.arange(window_size[1])
        coords_w = torch.arange(window_size[2])
        coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w, indexing='ij'))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        # Shift offsets to be non-negative, then mix the three axes into a
        # single id (strides chosen so distinct offsets get distinct ids).
        relative_coords[:, :, 0] += window_size[0] - 1
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 2] += window_size[2] - 1
        relative_coords[:, :, 0] *= (2 * window_size[1] - 1) * (2 * window_size[2] - 1)
        relative_coords[:, :, 1] *= (2 * window_size[2] - 1)
        # NOTE(review): stored as a plain attribute, not via register_buffer,
        # so it is absent from state_dict and not moved by .to(device).
        # CPU index tensors are accepted for advanced indexing, but
        # register_buffer(..., persistent=False) would be more robust — confirm.
        self.relative_position_index = relative_coords.sum(-1)
        max_rel_pos = self.relative_position_index.max().item()
        # One learnable bias per relative offset per head.
        self.relative_position_bias_table = nn.Parameter(torch.zeros((max_rel_pos + 1, num_heads)))
        nn.init.trunc_normal_(self.relative_position_bias_table, std=.02)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, x, mask=None):
        """Attend within each window.

        Args:
            x: (num_windows * B, N, C) window token sequences.
            mask: optional (num_windows, N, N) additive attention mask.

        Returns:
            Tensor of the same shape as ``x``.
        """
        B_, N, C = x.shape
        # Slice the index table in case N is smaller than the full window volume.
        rel_index = self.relative_position_index[:N, :N]
        relative_position_bias = self.relative_position_bias_table[rel_index.view(-1)]
        relative_position_bias = relative_position_bias.view(N, N, -1).permute(2, 0, 1).contiguous()
        # (3, B_, heads, N, head_dim) then split into q, k, v.
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        attn = attn + relative_position_bias.unsqueeze(0)
        if mask is not None:
            # Broadcast the per-window mask over the batch dimension.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class SwinTransformerBlock3D(nn.Module):
    """Swin block: windowed self-attention + MLP, each behind a pre-LayerNorm
    and a stochastic-depth residual connection.

    Operates on channels-first volumes (B, C, D, H, W); LayerNorm is applied
    by temporarily permuting channels to the last dimension.

    NOTE(review): ``shift_size`` is stored but never applied — no cyclic
    shift or attention mask is performed here, so every block uses regular
    (unshifted) windows. Confirm whether shifted-window attention was intended.
    """
    def __init__(self, dim, num_heads, window_size=(4,4,4), shift_size=(0,0,0),
                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.shift_size = shift_size  # unused — see class note
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention3D(dim=dim, window_size=window_size, num_heads=num_heads,
                                      qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_hidden_dim), act_layer(), nn.Dropout(drop),
            nn.Linear(mlp_hidden_dim, dim), nn.Dropout(drop)
        )
    def forward(self, x):
        """x: (B, C, D, H, W) -> same shape."""
        shortcut = x
        # Attention branch: pre-norm in channels-last layout, back to channels-first.
        x_norm = x.permute(0, 2, 3, 4, 1)
        x_norm = self.norm1(x_norm)
        x = x_norm.permute(0, 4, 1, 2, 3)
        # Window attention (windows are padded internally if dims don't divide).
        windows, pads = window_partition3d(x, self.window_size)
        attn_windows = self.attn(windows)
        x = window_reverse3d(attn_windows, self.window_size, x.shape[0], x.shape[2], x.shape[3], x.shape[4], pads)
        x = shortcut + self.drop_path(x)
        # MLP branch with its own pre-norm and residual.
        x_norm = x.permute(0, 2, 3, 4, 1)
        x_norm = self.norm2(x_norm)
        x_norm = x_norm.permute(0, 4, 1, 2, 3)
        x_mlp = self.mlp(x_norm.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)
        x = x + self.drop_path(x_mlp)
        return x
class PatchEmbed3D(nn.Module):
    """Embed a volume into patch tokens with a strided 3-D convolution.

    Each spatial dim shrinks by its patch-size factor; channels become
    ``embed_dim``.
    """

    def __init__(self, patch_size=(4,4,4), in_chans=1, embed_dim=96):
        super().__init__()
        # kernel == stride: non-overlapping patches, one token per patch.
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        embedded = self.proj(x)
        return embedded
class PatchMerging3D(nn.Module):
    """Downsample by 2x per spatial dim: each 2x2x2 token neighborhood is
    concatenated (8*C channels) and linearly reduced to 2*C channels."""

    def __init__(self, dim):
        super().__init__()
        self.reduction = nn.Linear(8 * dim, 2 * dim, bias=False)

    def forward(self, x):
        B, C, D, H, W = x.shape
        # Zero-pad any odd spatial dim so each can be halved cleanly.
        need = (D % 2, H % 2, W % 2)
        if any(need):
            x = F.pad(x, (0, need[2], 0, need[1], 0, need[0]))
        _, _, Dp, Hp, Wp = x.shape
        # Gather each 2x2x2 neighborhood into the channel dimension.
        tokens = x.permute(0, 2, 3, 4, 1)
        tokens = tokens.view(B, Dp // 2, 2, Hp // 2, 2, Wp // 2, 2, C)
        tokens = tokens.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous()
        merged = tokens.view(B, Dp // 2, Hp // 2, Wp // 2, 8 * C)
        merged = self.reduction(merged)
        return merged.permute(0, 4, 1, 2, 3).contiguous()
class SwinTransformer3D(nn.Module):
    """Hierarchical 3-D Swin encoder.

    Pipeline: patch embedding, alternating stages of transformer blocks and
    2x patch-merging downsampling, then global average pooling and a final
    LayerNorm over the pooled feature vector.

    Args:
        in_chans: input channels.
        embed_dim: channel width of the first stage (doubles at each merge).
        depths: number of blocks per stage.
        num_heads: attention heads per stage.
        window_size: 3-D attention window.
        mlp_ratio, qkv_bias: forwarded to every block.
        drop_rate, attn_drop_rate: dropout rates inside the blocks.
        drop_path_rate: maximum stochastic-depth rate; rates increase
            linearly across ALL blocks from 0 to this value.
    """
    def __init__(self, in_chans=1, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
                 window_size=(4,4,4), mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1):
        super().__init__()
        self.patch_embed = PatchEmbed3D(in_chans=in_chans, embed_dim=embed_dim)
        # Linearly increasing stochastic-depth schedule over all blocks.
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
        self.layers = nn.ModuleList()
        dim = embed_dim
        block_idx = 0  # cumulative block position across stages
        for i_layer in range(len(depths)):
            blocks = nn.ModuleList([
                # BUGFIX: index dpr by the cumulative block position, not the
                # within-stage position — previously `dpr[i]` restarted the
                # schedule every stage, so the higher drop-path rates of the
                # linspace were never used.  Also forward mlp_ratio/qkv_bias,
                # which were accepted but silently ignored before (defaults
                # match, so behavior at defaults is unchanged).
                SwinTransformerBlock3D(dim=dim, num_heads=num_heads[i_layer], window_size=window_size,
                                       mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                                       drop=drop_rate, attn_drop=attn_drop_rate,
                                       drop_path=dpr[block_idx + i])
                for i in range(depths[i_layer])
            ])
            block_idx += depths[i_layer]
            self.layers.append(blocks)
            if i_layer < len(depths) - 1:
                self.layers.append(PatchMerging3D(dim))
                dim *= 2
        self.norm = nn.LayerNorm(dim)
        self.avgpool = nn.AdaptiveAvgPool3d(1)
        self.feature_dim = dim  # final channel width, exposed to heads

    def forward(self, x):
        """x: (B, in_chans, D, H, W) -> pooled features (B, feature_dim)."""
        x = self.patch_embed(x)
        for layer in self.layers:
            if isinstance(layer, PatchMerging3D):
                x = layer(x)
            else:
                for blk in layer:
                    x = blk(x)
        # Pool the remaining spatial grid, then normalize the feature vector.
        x = self.avgpool(x).flatten(1)
        x = self.norm(x)
        return x
# ==============================================================================
# MAE Model
# ==============================================================================
class MAE_Swin3D(nn.Module):
    """MAE-style model: Swin3D encoder, a dense voxel-reconstruction
    decoder, and two auxiliary heads regressing per-slice asymmetry metrics.

    NOTE(review): the decoder is a single MLP emitting all D*H*W voxels,
    which is extremely large for the default 60x128x128 input — confirm
    this is intended rather than a patch-wise decoder.
    """

    def __init__(self, input_shape=(60, 128, 128)):
        super().__init__()
        self.input_shape = input_shape
        self.encoder = SwinTransformer3D(in_chans=1)
        hidden = 512
        # Reconstruct the full volume from the pooled encoder feature.
        self.decoder = nn.Sequential(
            nn.Linear(self.encoder.feature_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, np.prod(input_shape))
        )
        # Per-slice regression targets: 4 airway metrics, 3 lymph metrics.
        self.airway_head = nn.Linear(self.encoder.feature_dim, 4 * input_shape[0])
        self.lymph_head = nn.Linear(self.encoder.feature_dim, 3 * input_shape[0])

    def forward(self, x):
        feat = self.encoder(x)
        recon = self.decoder(feat).view(-1, 1, *self.input_shape)
        n_slices = self.input_shape[0]
        return {
            'reconstruction': recon,
            'airway_pred': self.airway_head(feat).view(-1, n_slices, 4),
            'lymph_pred': self.lymph_head(feat).view(-1, n_slices, 3),
            'features': feat
        }
# ==============================================================================
# Augmentations
# ==============================================================================
def augment_volume(volume):
    """Randomly augment a (B, C, D, H, W) volume; returns a new tensor.

    Stochastic pipeline: brightness shift, contrast scale, Gaussian noise,
    in-plane flips and rotation, random resized crop, blur, and random
    erasing. The result is clamped to [0, 1]; the input is not modified.
    """
    out = volume.clone()
    dev = out.device
    # --- intensity perturbations ---
    if torch.rand(1) > 0.3:
        out += (torch.rand(1).to(dev) - 0.5) * 0.4          # brightness shift
    if torch.rand(1) > 0.3:
        out *= 0.7 + torch.rand(1).to(dev) * 0.6            # contrast scale
    if torch.rand(1) > 0.3:
        out += torch.randn_like(out) * 0.1                  # gaussian noise
    # --- in-plane geometric perturbations ---
    if torch.rand(1) > 0.5:
        out = torch.flip(out, dims=[-1])
    if torch.rand(1) > 0.5:
        out = torch.flip(out, dims=[-2])
    if torch.rand(1) > 0.7:
        out = torch.rot90(out, torch.randint(1, 4, (1,)).item(), dims=[-2, -1])
    if torch.rand(1) > 0.5:
        # Random crop of 80-95% per axis, resized back to the original size.
        _, _, depth, height, width = out.shape
        cd = int(depth * (0.80 + torch.rand(1).item() * 0.15))
        ch = int(height * (0.80 + torch.rand(1).item() * 0.15))
        cw = int(width * (0.80 + torch.rand(1).item() * 0.15))
        sd = torch.randint(0, depth - cd + 1, (1,)).item()
        sh = torch.randint(0, height - ch + 1, (1,)).item()
        sw = torch.randint(0, width - cw + 1, (1,)).item()
        out = out[:, :, sd:sd + cd, sh:sh + ch, sw:sw + cw]
        out = F.interpolate(out, size=(depth, height, width), mode='trilinear', align_corners=False)
    if torch.rand(1) > 0.7:
        # Mild blur via a 3x3x3 box filter (stride 1 keeps the shape).
        out = F.avg_pool3d(out, kernel_size=3, stride=1, padding=1)
    if torch.rand(1) > 0.7:
        # Random erasing: overwrite a small box with the global mean.
        _, _, depth, height, width = out.shape
        ed = int(depth * (0.05 + torch.rand(1).item() * 0.10))
        eh = int(height * (0.05 + torch.rand(1).item() * 0.10))
        ew = int(width * (0.05 + torch.rand(1).item() * 0.10))
        sd = torch.randint(0, depth - ed + 1, (1,)).item()
        sh = torch.randint(0, height - eh + 1, (1,)).item()
        sw = torch.randint(0, width - ew + 1, (1,)).item()
        out[:, :, sd:sd + ed, sh:sh + eh, sw:sw + ew] = out.mean()
    return torch.clamp(out, 0, 1)
# ==============================================================================
# Dataset
# ==============================================================================
class OPSCCDataset(Dataset):
    """Dataset of cropped OPSCC CT volumes with precomputed per-slice
    airway and soft-tissue asymmetry metrics.

    Volumes are discovered as ``cropped_volume.nii.gz`` anywhere under
    ``data_dir``. Asymmetry metrics can be cached to a pickle file in the
    data directory.

    NOTE(review): the cache maps integer dataset indices to metrics, and
    indices depend on glob order — adding, removing, or renaming files
    silently misassigns cached metrics to the wrong volume.
    """
    def __init__(self, data_dir: str, cache_asymmetry: bool = True):
        self.data_dir = Path(data_dir)
        self.volume_paths = list(self.data_dir.glob("**/cropped_volume.nii.gz"))
        print(f"Found {len(self.volume_paths)} volumes")
        self.cache_file = self.data_dir / ".asymmetry_cache.pkl"
        self.cache_asymmetry = cache_asymmetry
        self.asymmetry_cache = {}
        self.airway_detector = AirwayAsymmetryDetector()
        self.lymphnode_detector = GlobalSoftTissueAsymmetryDetector()
        if self.cache_asymmetry:
            if self.cache_file.is_file():
                try:
                    # NOTE(review): pickle.load executes arbitrary code —
                    # only load caches from a trusted data directory.
                    with open(self.cache_file, 'rb') as f:
                        self.asymmetry_cache = pickle.load(f)
                    print(f"Loaded asymmetry cache ({len(self.asymmetry_cache)} entries)")
                except Exception:
                    print("Cache load failed → recomputing")
                    self._precompute_asymmetry()
            else:
                print("Computing asymmetry metrics...")
                self._precompute_asymmetry()
                try:
                    with open(self.cache_file, 'wb') as f:
                        pickle.dump(self.asymmetry_cache, f)
                    print("Cache saved")
                except Exception as e:
                    # Best-effort: training proceeds without a saved cache.
                    print(f"Cache save failed: {e}")
    def _precompute_asymmetry(self):
        """Compute and cache metrics for every volume, keyed by index."""
        for idx, path in enumerate(tqdm(self.volume_paths, desc="Asymmetry")):
            volume = self._load_volume(path)
            metrics = self._compute_asymmetry(volume)
            self.asymmetry_cache[idx] = metrics
    def _load_volume(self, path: Path) -> np.ndarray:
        """Load a NIfTI volume as float32, slice axis first."""
        img = nib.load(str(path))
        volume = img.get_fdata().astype(np.float32)
        # Heuristic: if the last axis is shorter than the first, treat it as
        # the slice axis and move it to the front — assumes an (H, W, D)
        # on-disk layout with D < H; TODO confirm against the preprocessing.
        if volume.ndim == 3 and volume.shape[2] < volume.shape[0]:
            volume = np.transpose(volume, (2, 0, 1))
        return volume
    def _compute_asymmetry(self, volume: np.ndarray) -> dict:
        """Run both detectors; lymph detector reuses the airway midlines."""
        airway = self.airway_detector.forward(volume)
        lymphnode = self.lymphnode_detector.forward(volume, airway['midlines'].tolist())
        return {'airway': airway, 'lymphnode': lymphnode}
    def __len__(self) -> int:
        return len(self.volume_paths)
    def __getitem__(self, idx: int) -> dict:
        """Return {'volume': (1, D, H, W), 'airway_metrics': (4, D),
        'lymphnode_metrics': (3, D)} as float tensors."""
        path = self.volume_paths[idx]
        volume = self._load_volume(path)
        # Fall back to on-the-fly computation when caching is off or missed.
        if self.cache_asymmetry and idx in self.asymmetry_cache:
            metrics = self.asymmetry_cache[idx]
        else:
            metrics = self._compute_asymmetry(volume)
        airway_tensor = np.stack([
            metrics['airway']['effacement'],
            metrics['airway']['mass_effect'],
            metrics['airway']['midline_shift'],
            metrics['airway']['hybrid']
        ], axis=0)
        lymph_tensor = np.stack([
            metrics['lymphnode']['left_hypo'],
            metrics['lymphnode']['right_hypo'],
            metrics['lymphnode']['hypo_asymmetry']
        ], axis=0)
        return {
            'volume': torch.from_numpy(volume).unsqueeze(0).float(),
            'airway_metrics': torch.from_numpy(airway_tensor).float(),
            'lymphnode_metrics': torch.from_numpy(lymph_tensor).float(),
        }
# ==============================================================================
# Loss
# ==============================================================================
class MAEAsymmetryLoss(nn.Module):
def __init__(self, mask_ratio=0.75, asymmetry_boost=5.0):
super().__init__()
self.mse = nn.MSELoss(reduction='none')
self.mask_ratio = mask_ratio
self.asymmetry_boost = asymmetry_boost
def forward(self, outputs, batch):
recon = outputs['reconstruction']
target = batch['volume']
B, C, D, H, W = target.shape
num_patches = D * H * W
mask = torch.rand(B, num_patches, device=target.device) < self.mask_ratio
mask = mask.view(B, 1, D, H, W).expand_as(recon)
diff = self.mse(recon, target) * mask.float()
hybrid = batch['airway_metrics'][:, 3, :]
hybrid_norm = hybrid / (hybrid.max(dim=1, keepdim=True)[0] + 1e-6)
slice_weights = 1.0 + self.asymmetry_boost * hybrid_norm
weights = slice_weights.unsqueeze(1).unsqueeze(3).unsqueeze(4).expand_as(diff)
recon_loss = (diff * weights).sum() / (mask.sum() + 1e-6)
airway_loss = F.mse_loss(outputs['airway_pred'], batch['airway_metrics'].permute(0, 2, 1))
lymph_loss = F.mse_loss(outputs['lymph_pred'], batch['lymphnode_metrics'].permute(0, 2, 1))
return recon_loss + airway_loss + lymph_loss
# ==============================================================================
# Trainer
# ==============================================================================
class TrainerWithMonitoring:
    """Training loop for the MAE model with cosine-similarity collapse
    monitoring, periodic checkpointing, and loss-based early stopping.

    Args:
        model: module exposing ``.encoder`` and returning the output dict
            MAEAsymmetryLoss expects.
        train_loader: yields dicts with 'volume', 'airway_metrics',
            'lymphnode_metrics'.
        device: torch device to train on.
        lr: AdamW learning rate.
        output_dir: optional folder for checkpoints / history.json.
    """

    def __init__(self, model, train_loader, device, lr=1e-4, output_dir=None):
        self.model = model.to(device)
        self.device = device
        self.train_loader = train_loader
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        self.loss_fn = MAEAsymmetryLoss()
        self.output_dir = Path(output_dir) if output_dir else None
        if self.output_dir:
            self.output_dir.mkdir(parents=True, exist_ok=True)
        # Entries are appended only on monitoring epochs.
        self.history = {
            'epoch': [],
            'loss': [],
            'cosine_sim_mean': [],
            'cosine_sim_std': [],
        }

    def compute_cosine_similarity(self, n_samples=50):
        """Mean/std cosine similarity between encoder features of a volume
        and its augmented view, over up to ``n_samples`` batches.

        Values near 1.0 suggest the encoder is becoming trivially
        augmentation-invariant (feature collapse).
        """
        self.model.eval()
        similarities = []
        with torch.no_grad():
            for i, batch in enumerate(self.train_loader):
                if i >= n_samples:
                    break
                volume = batch['volume'].to(self.device)
                feat1 = self.model.encoder(volume)
                feat2 = self.model.encoder(augment_volume(volume))
                sim = (F.normalize(feat1, dim=1) * F.normalize(feat2, dim=1)).sum(dim=1)
                similarities.extend(sim.cpu().numpy().tolist())
        self.model.train()
        return np.mean(similarities), np.std(similarities)

    def save_checkpoint(self, epoch, is_best=False):
        """Write a full checkpoint; refresh best_model.pt when is_best."""
        if not self.output_dir:
            return
        path = self.output_dir / f"checkpoint_epoch_{epoch:03d}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'history': self.history,
        }, path)
        print(f"Checkpoint saved: {path.name}")
        if is_best:
            best_path = self.output_dir / "best_model.pt"
            torch.save(self.model.state_dict(), best_path)
            print(f"Best model updated: {best_path.name}")

    def train(self, n_epochs=100, monitor_every=5, save_every=10,
              early_stop_patience=20, early_stop_after=30):
        """Run training and return the history dict."""
        best_loss = float('inf')
        patience_counter = 0
        best_epoch = 0
        for epoch in range(1, n_epochs + 1):
            self.model.train()
            total_loss = 0.0
            num_batches = 0
            for batch in tqdm(self.train_loader, desc=f"Epoch {epoch}", leave=False):
                # BUGFIX: move ALL tensors the loss reads onto the model's
                # device. Previously the raw CPU batch was passed to the
                # loss while the model outputs lived on self.device, which
                # raises a device-mismatch error on CUDA.
                device_batch = {
                    'volume': batch['volume'].to(self.device),
                    'airway_metrics': batch['airway_metrics'].to(self.device),
                    'lymphnode_metrics': batch['lymphnode_metrics'].to(self.device),
                }
                self.optimizer.zero_grad()
                outputs = self.model(device_batch['volume'])
                loss = self.loss_fn(outputs, device_batch)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
                num_batches += 1
            avg_loss = total_loss / num_batches if num_batches > 0 else 0.0
            is_best = avg_loss < best_loss
            if is_best:
                best_loss = avg_loss
                best_epoch = epoch
                patience_counter = 0
            else:
                patience_counter += 1
            if epoch % monitor_every == 0 or epoch == 1:
                cos_mean, cos_std = self.compute_cosine_similarity()
                self.history['epoch'].append(epoch)
                self.history['loss'].append(avg_loss)
                self.history['cosine_sim_mean'].append(cos_mean)
                self.history['cosine_sim_std'].append(cos_std)
                msg = f"Epoch {epoch:3d} | Loss: {avg_loss:.4f} | CosSim: {cos_mean:.3f}±{cos_std:.3f}"
                if is_best:
                    msg += " ★"
                print(msg)
                if cos_mean > 0.95:
                    print(f" WARNING: Cosine similarity very high ({cos_mean:.3f}) — possible collapse")
            else:
                msg = f"Epoch {epoch:3d} | Loss: {avg_loss:.4f}"
                if is_best:
                    msg += " ★"
                print(msg)
            # Periodic checkpoint; additionally snapshot any new best epoch.
            if epoch % save_every == 0:
                self.save_checkpoint(epoch, is_best=is_best)
            elif is_best:
                self.save_checkpoint(epoch, is_best=True)
            if epoch > early_stop_after and patience_counter >= early_stop_patience:
                print(f"Early stopping at epoch {epoch}")
                break
        if self.output_dir:
            torch.save(self.model.state_dict(), self.output_dir / "final_model.pt")
            with open(self.output_dir / "history.json", 'w') as f:
                json.dump(self.history, f, indent=2)
        print(f"Best loss: {best_loss:.4f} at epoch {best_epoch}")
        return self.history
# ==============================================================================
# Main
# ==============================================================================
def main():
    """CLI entry point: parse arguments, build dataset/loader/model, train."""
    parser = argparse.ArgumentParser(description="3D Swin MAE pretraining")
    parser.add_argument("--data-dir", type=str, required=True, help="Folder containing cropped_volume.nii.gz files")
    parser.add_argument("--output-dir", type=str, default="./checkpoints", help="Folder to save models and logs")
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--monitor-every", type=int, default=5)
    parser.add_argument("--save-every", type=int, default=10)
    parser.add_argument("--patience", type=int, default=20)
    parser.add_argument("--early-after", type=int, default=30)
    parser.add_argument("--no-cache", action="store_true")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    dataset = OPSCCDataset(data_dir=args.data_dir, cache_asymmetry=not args.no_cache)
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=device.type == "cuda",
    )

    trainer = TrainerWithMonitoring(
        model=MAE_Swin3D(),
        train_loader=loader,
        device=device,
        lr=args.lr,
        output_dir=args.output_dir,
    )
    trainer.train(
        n_epochs=args.epochs,
        monitor_every=args.monitor_every,
        save_every=args.save_every,
        early_stop_patience=args.patience,
        early_stop_after=args.early_after,
    )
    print("\nNote: Volumes are expected to be cropped, resized to ~60×128×128, intensities [0,1].")


if __name__ == "__main__":
    main()