saliacoel
/

x

Model card Files Files and versions

xet

Community

saliacoel commited on Apr 21

Commit

8de5a16

verified ·

1 Parent(s): ef216e4

Upload salia_get_diff_mask.py

Browse files

Files changed (1) hide show

salia_get_diff_mask.py +1235 -0

salia_get_diff_mask.py ADDED Viewed

	@@ -0,0 +1,1235 @@

+"""
+ComfyUI Deterministic Change Mask
+=================================
+A no-AI custom node that builds a soft change mask from two before/after images.
+It is designed for cases where the after image has global quality loss but also a
+local real edit such as a newly equipped garment.
+Install:
+    ComfyUI/custom_nodes/ComfyUI_DeterministicChangeMask/__init__.py
+Notes:
+    - Standard ComfyUI IMAGE tensors are usually RGB: [B,H,W,3].
+    - If a node passes [B,H,W,4], this node extracts channel 4 as alpha.
+    - If using ComfyUI LoadImage, connect the IMAGE outputs to before/after_image
+      and optionally connect the LoadImage MASK outputs to before/after_alpha.
+      LoadImage masks are inverted alpha, so this node converts alpha = 1 - mask.
+No hard dependency on OpenCV or SciPy. If available, they are used for better
+alignment / distance transform / connected components. Otherwise pure torch/numpy
+fallbacks are used.
+"""
+from __future__ import annotations
+import math
+from collections import deque
+from typing import Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+import torch.nn.functional as F
+try:
+    import cv2  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    cv2 = None
+try:
+    from scipy import ndimage as ndi  # type: ignore
+except Exception:  # pragma: no cover - optional dependency
+    ndi = None
+_EPS = 1.0e-8
class Salia_Get_Diff_Mask:
    """Deterministic before/after change mask for RGBA-aware workflows.

    The node compares a "before" and an "after" image, scores the per-pixel
    change with up to four cues (color, alpha, structure, gradient), turns the
    combined score into a binary core and feathers that core into a soft MASK.
    """

    CATEGORY = "mask/deterministic"
    FUNCTION = "make_mask"
    RETURN_TYPES = ("MASK",)
    RETURN_NAMES = ("mask",)

    @classmethod
    def INPUT_TYPES(cls):
        """Return the ComfyUI input declaration (widget order is the dict order)."""
        return {
            "required": {
                "before_image": ("IMAGE",),
                "after_image": ("IMAGE",),
                # Method switches. Convention: -1 = off, 0 = fast/simple/auto if available,
                # 1 = recommended/default, 2+ = alternate variants.
                "align_mode": ("INT", {"default": -1, "min": -1, "max": 2, "step": 1}),
                "denoise_mode": ("INT", {"default": 1, "min": -1, "max": 3, "step": 1}),
                "color_mode": ("INT", {"default": 1, "min": -1, "max": 3, "step": 1}),
                "alpha_mode": ("INT", {"default": 1, "min": -1, "max": 2, "step": 1}),
                "structure_mode": ("INT", {"default": 1, "min": -1, "max": 2, "step": 1}),
                "gradient_mode": ("INT", {"default": 1, "min": -1, "max": 2, "step": 1}),
                "normalize_mode": ("INT", {"default": 1, "min": -1, "max": 2, "step": 1}),
                "combine_mode": ("INT", {"default": 1, "min": -1, "max": 3, "step": 1}),
                "hysteresis_mode": ("INT", {"default": 1, "min": -1, "max": 2, "step": 1}),
                "morph_mode": ("INT", {"default": 1, "min": -1, "max": 4, "step": 1}),
                "component_mode": ("INT", {"default": 1, "min": -1, "max": 1, "step": 1}),
                "feather_mode": ("INT", {"default": 2, "min": -1, "max": 3, "step": 1}),
                "resize_mode": ("INT", {"default": 1, "min": -1, "max": 1, "step": 1}),
                # Weights. They are normalized internally among enabled terms.
                "color_weight": ("FLOAT", {"default": 0.45, "min": 0.0, "max": 5.0, "step": 0.01}),
                "alpha_weight": ("FLOAT", {"default": 0.25, "min": 0.0, "max": 5.0, "step": 0.01}),
                "structure_weight": ("FLOAT", {"default": 0.20, "min": 0.0, "max": 5.0, "step": 0.01}),
                "gradient_weight": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 5.0, "step": 0.01}),
                # Robust normalization / thresholding.
                "noise_floor_k": ("FLOAT", {"default": 2.5, "min": 0.0, "max": 12.0, "step": 0.05}),
                "mad_scale": ("FLOAT", {"default": 3.0, "min": 0.25, "max": 20.0, "step": 0.05}),
                "low_threshold": ("FLOAT", {"default": 0.25, "min": 0.0, "max": 1.0, "step": 0.01}),
                "high_threshold": ("FLOAT", {"default": 0.58, "min": 0.0, "max": 1.0, "step": 0.01}),
                # Denoising / structural settings.
                "preblur_radius": ("INT", {"default": 1, "min": 0, "max": 12, "step": 1}),
                "preblur_sigma": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 12.0, "step": 0.05}),
                "ssim_window": ("INT", {"default": 11, "min": 3, "max": 31, "step": 2}),
                "ssim_sigma": ("FLOAT", {"default": 1.5, "min": 0.3, "max": 8.0, "step": 0.05}),
                # Geometry / cleanup / feather.
                "max_align_pixels": ("INT", {"default": 24, "min": 0, "max": 256, "step": 1}),
                "valid_alpha_threshold": ("FLOAT", {"default": 0.01, "min": 0.0, "max": 1.0, "step": 0.005}),
                "morph_radius": ("INT", {"default": 2, "min": 0, "max": 32, "step": 1}),
                "min_region_area": ("INT", {"default": 64, "min": 0, "max": 1000000, "step": 1}),
                "keep_largest_regions": ("INT", {"default": 0, "min": 0, "max": 128, "step": 1}),
                "feather_radius": ("INT", {"default": 8, "min": 0, "max": 256, "step": 1}),
                "logistic_steepness": ("FLOAT", {"default": 10.0, "min": 0.1, "max": 64.0, "step": 0.1}),
            },
            "optional": {
                "before_alpha": ("MASK",),
                "after_alpha": ("MASK",),
            },
        }

    def make_mask(
        self,
        before_image: torch.Tensor,
        after_image: torch.Tensor,
        align_mode: int,
        denoise_mode: int,
        color_mode: int,
        alpha_mode: int,
        structure_mode: int,
        gradient_mode: int,
        normalize_mode: int,
        combine_mode: int,
        hysteresis_mode: int,
        morph_mode: int,
        component_mode: int,
        feather_mode: int,
        resize_mode: int,
        color_weight: float,
        alpha_weight: float,
        structure_weight: float,
        gradient_weight: float,
        noise_floor_k: float,
        mad_scale: float,
        low_threshold: float,
        high_threshold: float,
        preblur_radius: int,
        preblur_sigma: float,
        ssim_window: int,
        ssim_sigma: float,
        max_align_pixels: int,
        valid_alpha_threshold: float,
        morph_radius: int,
        min_region_area: int,
        keep_largest_regions: int,
        feather_radius: int,
        logistic_steepness: float,
        before_alpha: Optional[torch.Tensor] = None,
        after_alpha: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Build one change mask per batch item.

        The after image (and its alpha) is resized to the before image when the
        spatial sizes differ and ``resize_mode >= 1``. Batches of size one are
        repeated to match the other input. Every item is then handled by
        ``_process_one``.

        Returns:
            A one-element tuple holding the stacked masks, clamped to [0, 1].

        Raises:
            ValueError: if the image sizes differ and ``resize_mode < 1``, or
                if the batch sizes cannot be broadcast.
        """
        before_image = _ensure_image(before_image)
        after_image = _ensure_image(after_image)
        b_rgb, b_a = _split_rgb_alpha(before_image, before_alpha)
        a_rgb, a_a = _split_rgb_alpha(after_image, after_alpha)

        if b_rgb.shape[1:3] != a_rgb.shape[1:3]:
            if resize_mode < 1:
                raise ValueError(
                    "before_image and after_image sizes differ. Set resize_mode=1 to resize after_image to before_image."
                )
            target_h, target_w = int(b_rgb.shape[1]), int(b_rgb.shape[2])
            a_rgb = _resize_bhwc(a_rgb, target_h, target_w)
            a_a = _resize_bhw(a_a, target_h, target_w)

        b_rgb, b_a, a_rgb, a_a = _broadcast_batch(b_rgb, b_a, a_rgb, a_a)

        # Order the thresholds in a single step. Assigning them one after the
        # other would compute the high value from the already-overwritten low
        # value and collapse an inverted pair onto the same number.
        low_threshold, high_threshold = sorted((float(low_threshold), float(high_threshold)))
        ssim_window = _make_odd(int(ssim_window), minimum=3)

        batch = int(b_rgb.shape[0])
        if batch == 0:
            # torch.stack cannot stack an empty list; return an empty mask batch.
            return (b_rgb.new_zeros((0, int(b_rgb.shape[1]), int(b_rgb.shape[2]))),)

        # Scalar settings are identical for every batch item: convert them once.
        settings = dict(
            align_mode=int(align_mode),
            denoise_mode=int(denoise_mode),
            color_mode=int(color_mode),
            alpha_mode=int(alpha_mode),
            structure_mode=int(structure_mode),
            gradient_mode=int(gradient_mode),
            normalize_mode=int(normalize_mode),
            combine_mode=int(combine_mode),
            hysteresis_mode=int(hysteresis_mode),
            morph_mode=int(morph_mode),
            component_mode=int(component_mode),
            color_weight=float(color_weight),
            alpha_weight=float(alpha_weight),
            structure_weight=float(structure_weight),
            gradient_weight=float(gradient_weight),
            noise_floor_k=float(noise_floor_k),
            mad_scale=float(mad_scale),
            low_threshold=float(low_threshold),
            high_threshold=float(high_threshold),
            preblur_radius=int(preblur_radius),
            preblur_sigma=float(preblur_sigma),
            ssim_window=int(ssim_window),
            ssim_sigma=float(ssim_sigma),
            max_align_pixels=int(max_align_pixels),
            valid_alpha_threshold=float(valid_alpha_threshold),
            morph_radius=int(morph_radius),
            min_region_area=int(min_region_area),
            keep_largest_regions=int(keep_largest_regions),
            feather_mode=int(feather_mode),
            feather_radius=int(feather_radius),
            logistic_steepness=float(logistic_steepness),
        )
        out_masks: List[torch.Tensor] = [
            self._process_one(
                before_rgb=b_rgb[i],
                before_alpha=b_a[i],
                after_rgb=a_rgb[i],
                after_alpha=a_a[i],
                **settings,
            )
            for i in range(batch)
        ]
        return (torch.stack(out_masks, dim=0).clamp(0.0, 1.0),)

    def _process_one(
        self,
        before_rgb: torch.Tensor,      # [H,W,3]
        before_alpha: torch.Tensor,    # [H,W]
        after_rgb: torch.Tensor,       # [H,W,3]
        after_alpha: torch.Tensor,     # [H,W]
        align_mode: int,
        denoise_mode: int,
        color_mode: int,
        alpha_mode: int,
        structure_mode: int,
        gradient_mode: int,
        normalize_mode: int,
        combine_mode: int,
        hysteresis_mode: int,
        morph_mode: int,
        component_mode: int,
        color_weight: float,
        alpha_weight: float,
        structure_weight: float,
        gradient_weight: float,
        noise_floor_k: float,
        mad_scale: float,
        low_threshold: float,
        high_threshold: float,
        preblur_radius: int,
        preblur_sigma: float,
        ssim_window: int,
        ssim_sigma: float,
        max_align_pixels: int,
        valid_alpha_threshold: float,
        morph_radius: int,
        min_region_area: int,
        keep_largest_regions: int,
        feather_mode: int,
        feather_radius: int,
        logistic_steepness: float,
    ) -> torch.Tensor:
        """Compute the soft change mask for a single before/after pair.

        Steps, in order: optional alignment, optional pre-blur, per-cue
        difference maps normalized and combined into one score, thresholding
        into a binary core, optional morphology and component filtering, and
        finally feathering. Returns an [H,W] tensor clamped to [0, 1] on the
        device and dtype of ``before_rgb``.
        """
        device = before_rgb.device
        dtype = before_rgb.dtype
        h, w = int(before_rgb.shape[0]), int(before_rgb.shape[1])

        if align_mode >= 1:
            after_rgb, after_alpha = _align_after_to_before(
                before_rgb,
                before_alpha,
                after_rgb,
                after_alpha,
                align_mode=align_mode,
                max_align_pixels=max_align_pixels,
            )

        # Work in channel-first tensors for filtering and SSIM.
        b_rgb_cf = before_rgb.permute(2, 0, 1).unsqueeze(0)
        a_rgb_cf = after_rgb.permute(2, 0, 1).unsqueeze(0)
        b_a_cf = before_alpha.unsqueeze(0).unsqueeze(0)
        a_a_cf = after_alpha.unsqueeze(0).unsqueeze(0)

        if denoise_mode >= 0 and preblur_radius > 0:
            def gaussian(t: torch.Tensor) -> torch.Tensor:
                return _gaussian_blur_bchw(t, preblur_radius, preblur_sigma)

            if denoise_mode == 2:
                def median(t: torch.Tensor) -> torch.Tensor:
                    return _median_blur_bchw(t, preblur_radius)

                blur_rgb = blur_alpha = median
            elif denoise_mode == 3 and cv2 is not None:
                def bilateral(t: torch.Tensor) -> torch.Tensor:
                    return _bilateral_or_gaussian_bchw(t, preblur_radius, preblur_sigma)

                # Only the color planes get the edge-preserving filter.
                blur_rgb, blur_alpha = bilateral, gaussian
            else:
                blur_rgb = blur_alpha = gaussian

            b_rgb_cf = blur_rgb(b_rgb_cf)
            a_rgb_cf = blur_rgb(a_rgb_cf)
            b_a_cf = blur_alpha(b_a_cf)
            a_a_cf = blur_alpha(a_a_cf)

        b_rgb = b_rgb_cf.squeeze(0).permute(1, 2, 0).clamp(0.0, 1.0)
        a_rgb = a_rgb_cf.squeeze(0).permute(1, 2, 0).clamp(0.0, 1.0)
        b_alpha = b_a_cf.squeeze(0).squeeze(0).clamp(0.0, 1.0)
        a_alpha = a_a_cf.squeeze(0).squeeze(0).clamp(0.0, 1.0)

        # A pixel is valid when it is visible in at least one of the images.
        valid = torch.maximum(b_alpha, a_alpha) > float(valid_alpha_threshold)
        if valid.sum().item() < 16:
            # Too few visible pixels to be useful: treat the whole frame as valid.
            valid = torch.ones((h, w), dtype=torch.bool, device=device)

        maps: List[Tuple[torch.Tensor, float, str, float]] = []

        def add_term(raw: torch.Tensor, weight: float, name: str, floor_k: float, fixed_scale: float) -> None:
            """Normalize one raw difference map and queue it for combination."""
            normalized = _normalize_map(
                raw,
                valid=valid,
                mode=normalize_mode,
                noise_floor_k=floor_k,
                mad_scale=mad_scale,
                fixed_scale=fixed_scale,
            )
            maps.append((normalized, weight, name, fixed_scale))

        if color_mode >= 0 and color_weight > 0.0:
            d_color = _color_difference_map(b_rgb, a_rgb, b_alpha, a_alpha, mode=color_mode)
            add_term(d_color, color_weight, "color", noise_floor_k, 20.0)

        if alpha_mode >= 1 and alpha_weight > 0.0:
            d_alpha = (a_alpha - b_alpha).abs()
            if alpha_mode >= 2:
                premul_b = b_rgb * b_alpha.unsqueeze(-1)
                premul_a = a_rgb * a_alpha.unsqueeze(-1)
                d_premul = torch.linalg.vector_norm(premul_a - premul_b, dim=-1) / math.sqrt(3.0)
                d_alpha = torch.maximum(d_alpha, d_premul)
            # The alpha cue uses a noise floor that is 0.5 lower than the others.
            add_term(d_alpha, alpha_weight, "alpha", max(0.0, noise_floor_k - 0.5), 1.0)

        if structure_mode >= 1 and structure_weight > 0.0:
            d_struct = _structure_difference_map(
                b_rgb,
                a_rgb,
                mode=structure_mode,
                window=ssim_window,
                sigma=ssim_sigma,
            )
            d_struct = d_struct * valid.to(dtype=d_struct.dtype)
            add_term(d_struct, structure_weight, "structure", noise_floor_k, 0.5)

        if gradient_mode >= 1 and gradient_weight > 0.0:
            d_grad = _gradient_difference_map(b_rgb, a_rgb, b_alpha, a_alpha, mode=gradient_mode)
            d_grad = d_grad * valid.to(dtype=d_grad.dtype)
            add_term(d_grad, gradient_weight, "gradient", noise_floor_k, 0.25)

        if not maps:
            # Every cue is disabled or has zero weight: nothing can be detected.
            return torch.zeros((h, w), dtype=dtype, device=device)

        score = _combine_maps(maps, mode=combine_mode).clamp(0.0, 1.0)
        score = score * valid.to(dtype=score.dtype)

        # The cleanup stages below operate on NumPy arrays on the CPU.
        core_np = _make_core_mask(
            score.detach().float().cpu().numpy(),
            low_threshold=low_threshold,
            high_threshold=high_threshold,
            hysteresis_mode=hysteresis_mode,
        )
        if morph_mode >= 1 and morph_radius > 0:
            core_np = _morph_binary(core_np, mode=morph_mode, radius=morph_radius)
        if component_mode >= 1:
            core_np = _filter_components(
                core_np,
                min_area=max(0, int(min_region_area)),
                keep_largest=max(0, int(keep_largest_regions)),
            )
        mask_np = _feather_core(
            core_np,
            mode=feather_mode,
            radius=max(0, int(feather_radius)),
            logistic_steepness=max(0.1, float(logistic_steepness)),
        )
        mask = torch.from_numpy(mask_np).to(device=device, dtype=dtype)
        return mask.clamp(0.0, 1.0)
+# -----------------------------------------------------------------------------
+# Tensor preparation
+# -----------------------------------------------------------------------------
+def _ensure_image(image: torch.Tensor) -> torch.Tensor:
+    if not isinstance(image, torch.Tensor):
+        raise TypeError("Expected ComfyUI IMAGE as torch.Tensor.")
+    if image.dim() == 3:
+        image = image.unsqueeze(0)
+    if image.dim() != 4:
+        raise ValueError(f"Expected IMAGE shape [B,H,W,C], got {tuple(image.shape)}")
+    return image.float().clamp(0.0, 1.0)
+def _split_rgb_alpha(image: torch.Tensor, optional_alpha: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+    b, h, w, c = image.shape
+    if c >= 4:
+        rgb = image[..., :3]
+        alpha = image[..., 3].float().clamp(0.0, 1.0)
+    elif c == 3:
+        rgb = image
+        alpha = torch.ones((b, h, w), dtype=image.dtype, device=image.device)
+    elif c == 1:
+        rgb = image.repeat(1, 1, 1, 3)
+        alpha = torch.ones((b, h, w), dtype=image.dtype, device=image.device)
+    else:
+        # Unusual, but keep the node from crashing on odd custom tensors.
+        first = image[..., :1]
+        rgb = first.repeat(1, 1, 1, 3)
+        alpha = torch.ones((b, h, w), dtype=image.dtype, device=image.device)
+    # For normal ComfyUI LoadImage workflows, alpha arrives as MASK and is inverted:
+    # mask = 1 - opacity. Optional mask is only used when IMAGE has no explicit alpha.
+    if optional_alpha is not None and c < 4:
+        alpha = 1.0 - _ensure_mask(optional_alpha, b, h, w, image.device, image.dtype)
+    return rgb.float().clamp(0.0, 1.0), alpha.float().clamp(0.0, 1.0)
+def _ensure_mask(
+    mask: torch.Tensor,
+    batch: int,
+    height: int,
+    width: int,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    if not isinstance(mask, torch.Tensor):
+        raise TypeError("Expected ComfyUI MASK as torch.Tensor.")
+    mask = mask.float()
+    if mask.dim() == 2:
+        mask = mask.unsqueeze(0)
+    elif mask.dim() == 3:
+        # Could be [B,H,W] or [H,W,1].
+        if mask.shape[-1] == 1 and mask.shape[0] == height and mask.shape[1] == width:
+            mask = mask[..., 0].unsqueeze(0)
+    elif mask.dim() == 4:
+        if mask.shape[-1] == 1:
+            mask = mask[..., 0]
+        elif mask.shape[1] == 1:
+            mask = mask[:, 0, :, :]
+        else:
+            mask = mask[:, 0, :, :]
+    else:
+        raise ValueError(f"Expected MASK shape [H,W], [B,H,W], [B,H,W,1], or [B,1,H,W], got {tuple(mask.shape)}")
+    if mask.dim() != 3:
+        raise ValueError(f"Could not normalize MASK shape, got {tuple(mask.shape)}")
+    mask = mask.to(device=device, dtype=dtype).clamp(0.0, 1.0)
+    if mask.shape[1] != height or mask.shape[2] != width:
+        mask = _resize_bhw(mask, height, width)
+    if mask.shape[0] == batch:
+        return mask
+    if mask.shape[0] == 1:
+        return mask.repeat(batch, 1, 1)
+    if batch == 1:
+        return mask[:1]
+    raise ValueError(f"MASK batch {mask.shape[0]} does not match IMAGE batch {batch}.")
+def _broadcast_batch(
+    b_rgb: torch.Tensor,
+    b_a: torch.Tensor,
+    a_rgb: torch.Tensor,
+    a_a: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    b_batch = int(b_rgb.shape[0])
+    a_batch = int(a_rgb.shape[0])
+    target = max(b_batch, a_batch)
+    def rep_img(x: torch.Tensor) -> torch.Tensor:
+        if x.shape[0] == target:
+            return x
+        if x.shape[0] == 1:
+            return x.repeat(target, 1, 1, 1)
+        raise ValueError(f"Incompatible IMAGE batches: {b_batch} and {a_batch}")
+    def rep_mask(x: torch.Tensor) -> torch.Tensor:
+        if x.shape[0] == target:
+            return x
+        if x.shape[0] == 1:
+            return x.repeat(target, 1, 1)
+        raise ValueError(f"Incompatible MASK batches: {b_batch} and {a_batch}")
+    return rep_img(b_rgb), rep_mask(b_a), rep_img(a_rgb), rep_mask(a_a)
+def _resize_bhwc(image: torch.Tensor, height: int, width: int) -> torch.Tensor:
+    x = image.permute(0, 3, 1, 2)
+    x = F.interpolate(x, size=(height, width), mode="bilinear", align_corners=False)
+    return x.permute(0, 2, 3, 1).clamp(0.0, 1.0)
+def _resize_bhw(mask: torch.Tensor, height: int, width: int) -> torch.Tensor:
+    x = mask.unsqueeze(1)
+    x = F.interpolate(x, size=(height, width), mode="bilinear", align_corners=False)
+    return x[:, 0, :, :].clamp(0.0, 1.0)
+def _make_odd(value: int, minimum: int = 3) -> int:
+    value = max(int(value), minimum)
+    if value % 2 == 0:
+        value += 1
+    return value
+# -----------------------------------------------------------------------------
+# Alignment
+# -----------------------------------------------------------------------------
def _align_after_to_before(
    before_rgb: torch.Tensor,
    before_alpha: torch.Tensor,
    after_rgb: torch.Tensor,
    after_alpha: torch.Tensor,
    align_mode: int,
    max_align_pixels: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Best-effort alignment of the after image onto the before image.

    With ``align_mode >= 2`` and OpenCV available, ECC alignment is tried
    first. Otherwise, or when ECC fails, a translation estimated by phase
    correlation is applied if it stays within ``max_align_pixels`` on both
    axes. Any failure leaves the inputs untouched; the color and alpha planes
    are always returned either both shifted or both unshifted.
    """
    if align_mode >= 2 and cv2 is not None:
        try:
            return _align_ecc_cv2(before_rgb, before_alpha, after_rgb, after_alpha, max_align_pixels)
        except Exception:
            # ECC is allowed to fail. Fall back to phase correlation.
            pass

    try:
        dy, dx = _phase_correlation_shift(before_rgb, before_alpha, after_rgb, after_alpha)
        if abs(dy) <= max_align_pixels and abs(dx) <= max_align_pixels:
            # Compute both planes before committing so that a failure on the
            # alpha plane cannot leave the color plane shifted on its own.
            shifted_rgb = _translate_hwc(after_rgb, dy, dx)
            shifted_alpha = _translate_hw(after_alpha, dy, dx)
            return shifted_rgb, shifted_alpha
    except Exception:
        # Alignment is optional: fall through and return the unaligned inputs.
        pass

    return after_rgb, after_alpha
def _phase_correlation_shift(
    before_rgb: torch.Tensor,
    before_alpha: torch.Tensor,
    after_rgb: torch.Tensor,
    after_alpha: torch.Tensor,
) -> Tuple[float, float]:
    """Estimate the whole-pixel (dy, dx) shift of after relative to before.

    Both images are reduced to alpha-weighted, zero-mean luma and compared by
    phase correlation. To align after to before, sample after at
    target + shift.
    """
    reference = _luma(before_rgb) * before_alpha
    moving = _luma(after_rgb) * after_alpha
    reference = reference - reference.mean()
    moving = moving - moving.mean()

    spectrum_moving = torch.fft.fft2(moving.float())
    spectrum_reference = torch.fft.fft2(reference.float())
    cross_power = spectrum_moving * torch.conj(spectrum_reference)
    # Keep only the phase so the correlation peak is sharp.
    cross_power = cross_power / (torch.abs(cross_power) + _EPS)
    correlation = torch.fft.ifft2(cross_power).real

    height, width = correlation.shape
    peak_y, peak_x = divmod(int(torch.argmax(correlation).item()), width)
    # Peaks in the upper half of each axis stand for negative shifts.
    if peak_y > height // 2:
        peak_y -= height
    if peak_x > width // 2:
        peak_x -= width
    return float(peak_y), float(peak_x)
+def _translate_hwc(image: torch.Tensor, shift_y: float, shift_x: float) -> torch.Tensor:
+    h, w, c = image.shape
+    x = image.permute(2, 0, 1).unsqueeze(0)
+    y_coords, x_coords = torch.meshgrid(
+        torch.arange(h, device=image.device, dtype=image.dtype),
+        torch.arange(w, device=image.device, dtype=image.dtype),
+        indexing="ij",
+    )
+    sample_x = x_coords + float(shift_x)
+    sample_y = y_coords + float(shift_y)
+    if w > 1:
+        sample_x = sample_x / (w - 1) * 2.0 - 1.0
+    else:
+        sample_x = torch.zeros_like(sample_x)
+    if h > 1:
+        sample_y = sample_y / (h - 1) * 2.0 - 1.0
+    else:
+        sample_y = torch.zeros_like(sample_y)
+    grid = torch.stack([sample_x, sample_y], dim=-1).unsqueeze(0)
+    y = F.grid_sample(x, grid, mode="bilinear", padding_mode="zeros", align_corners=True)
+    return y.squeeze(0).permute(1, 2, 0).clamp(0.0, 1.0)
def _translate_hw(mask: torch.Tensor, shift_y: float, shift_x: float) -> torch.Tensor:
    """Translate an [H,W] mask by treating it as a single-channel image."""
    as_image = mask.unsqueeze(-1)
    shifted = _translate_hwc(as_image, shift_y, shift_x)
    return shifted[..., 0]
def _align_ecc_cv2(
    before_rgb: torch.Tensor,
    before_alpha: torch.Tensor,
    after_rgb: torch.Tensor,
    after_alpha: torch.Tensor,
    max_align_pixels: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Align the after image to the before image with OpenCV's ECC (affine).

    The transform is estimated on alpha-weighted luma and applied to both the
    color and the alpha plane. Raises ``RuntimeError`` when OpenCV is missing
    or when the estimated translation exceeds ``max_align_pixels``.
    """
    if cv2 is None:
        raise RuntimeError("OpenCV not available")

    device, dtype = after_rgb.device, after_rgb.dtype
    h, w = before_alpha.shape

    def to_float32(tensor: torch.Tensor) -> np.ndarray:
        return tensor.detach().float().cpu().numpy().astype(np.float32)

    template = to_float32(_luma(before_rgb) * before_alpha)
    moving = to_float32(_luma(after_rgb) * after_alpha)

    criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 50, 1.0e-5)
    initial = np.eye(2, 3, dtype=np.float32)
    _, warp = cv2.findTransformECC(template, moving, initial, cv2.MOTION_AFFINE, criteria, None, 5)

    # Reject extreme translations because a garment change can confuse ECC.
    shift_x = abs(float(warp[0, 2]))
    shift_y = abs(float(warp[1, 2]))
    if shift_x > max_align_pixels or shift_y > max_align_pixels:
        raise RuntimeError("ECC alignment rejected due to large transform")

    def apply_warp(plane: np.ndarray) -> np.ndarray:
        return cv2.warpAffine(
            plane,
            warp,
            (w, h),
            flags=cv2.INTER_LINEAR + cv2.WARP_INVERSE_MAP,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=0,
        )

    color_np = to_float32(after_rgb)
    alpha_np = to_float32(after_alpha)
    warped_color = apply_warp(color_np)
    warped_alpha = apply_warp(alpha_np)

    def back_to_tensor(array: np.ndarray) -> torch.Tensor:
        return torch.from_numpy(array).to(device=device, dtype=dtype).clamp(0.0, 1.0)

    return back_to_tensor(warped_color), back_to_tensor(warped_alpha)
+# -----------------------------------------------------------------------------
+# Filtering
+# -----------------------------------------------------------------------------
def _gaussian_kernel1d(radius: int, sigma: float, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
    """Return Gaussian weights of length ``2 * radius + 1``, normalized by their sum.

    A radius of zero (or less) yields the single weight 1; ``sigma`` is floored
    at 1e-3.
    """
    half_width = max(0, int(radius))
    if not half_width:
        return torch.ones(1, device=device, dtype=dtype)
    offsets = torch.arange(-half_width, half_width + 1, device=device, dtype=dtype)
    sigma = max(float(sigma), 1.0e-3)
    weights = torch.exp(-(offsets * offsets) / (2.0 * sigma * sigma))
    return weights / (weights.sum() + _EPS)
+def _pad_mode_for(x: torch.Tensor, radius: int) -> str:
+    if radius <= 0:
+        return "constant"
+    h, w = int(x.shape[-2]), int(x.shape[-1])
+    return "reflect" if h > radius and w > radius else "replicate"
def _gaussian_blur_bchw(x: torch.Tensor, radius: int, sigma: float) -> torch.Tensor:
    """Separable Gaussian blur of a [B,C,H,W] tensor, clamped to [0, 1].

    Each channel is filtered independently, first along the width and then
    along the height. A non-positive radius returns ``x`` unchanged.
    """
    radius = int(radius)
    if radius <= 0:
        return x

    _, channels, _, _ = x.shape
    taps = _gaussian_kernel1d(radius, sigma, x.device, x.dtype)
    horizontal = taps.view(1, 1, 1, -1).repeat(channels, 1, 1, 1)
    vertical = taps.view(1, 1, -1, 1).repeat(channels, 1, 1, 1)
    pad_mode = _pad_mode_for(x, radius)

    blurred = F.conv2d(F.pad(x, (radius, radius, 0, 0), mode=pad_mode), horizontal, groups=channels)
    blurred = F.conv2d(F.pad(blurred, (0, 0, radius, radius), mode=pad_mode), vertical, groups=channels)
    return blurred.clamp(0.0, 1.0)
def _median_blur_bchw(x: torch.Tensor, radius: int) -> torch.Tensor:
    """Median filter of a [B,C,H,W] tensor over a square window, clamped to [0, 1].

    The window side is ``2 * radius + 1``. A non-positive radius returns ``x``
    unchanged.
    """
    radius = int(radius)
    if radius <= 0:
        return x

    side = 2 * radius + 1
    batch, channels, height, width = x.shape
    padded = F.pad(x, (radius,) * 4, mode=_pad_mode_for(x, radius))
    # unfold yields [B, C*side*side, H*W]; regroup into one window per pixel.
    windows = F.unfold(padded, kernel_size=side).view(batch, channels, side * side, height, width)
    return windows.median(dim=2).values.clamp(0.0, 1.0)
def _bilateral_or_gaussian_bchw(x: torch.Tensor, radius: int, sigma: float) -> torch.Tensor:
    """Bilateral-filter a [B,C,H,W] tensor with OpenCV, clamped to [0, 1].

    Falls back to the Gaussian blur when OpenCV is not available. Each batch
    item is filtered on the CPU and the result is moved back to the device
    and dtype of ``x``.
    """
    if cv2 is None:
        return _gaussian_blur_bchw(x, radius, sigma)

    _, channels, _, _ = x.shape
    device, dtype = x.device, x.dtype
    diameter = max(3, 2 * int(radius) + 1)

    def filter_item(item: torch.Tensor) -> torch.Tensor:
        pixels = item.detach().float().cpu().permute(1, 2, 0).numpy().astype(np.float32)
        if channels == 1:
            # Filter the plane as 2-D, then restore the channel axis.
            plane = cv2.bilateralFilter(pixels[..., 0], diameter, sigmaColor=0.08, sigmaSpace=max(1.0, float(sigma)))
            smoothed = plane[..., None]
        else:
            smoothed = cv2.bilateralFilter(pixels, diameter, sigmaColor=0.08, sigmaSpace=max(1.0, float(sigma)))
        return torch.from_numpy(smoothed).permute(2, 0, 1)

    filtered = [filter_item(item) for item in x]
    return torch.stack(filtered, dim=0).to(device=device, dtype=dtype).clamp(0.0, 1.0)
+# -----------------------------------------------------------------------------
+# Difference maps
+# -----------------------------------------------------------------------------
+def _luma(rgb: torch.Tensor) -> torch.Tensor:
+    return 0.2126 * rgb[..., 0] + 0.7152 * rgb[..., 1] + 0.0722 * rgb[..., 2]
+def _color_difference_map(
+    before_rgb: torch.Tensor,
+    after_rgb: torch.Tensor,
+    before_alpha: torch.Tensor,
+    after_alpha: torch.Tensor,
+    mode: int,
+) -> torch.Tensor:
+    overlap = torch.sqrt((before_alpha * after_alpha).clamp(0.0, 1.0))
+    if mode == 0:
+        # Fast RGB L2 on premultiplied color, scaled roughly like Delta-E.
+        premul_b = before_rgb * before_alpha.unsqueeze(-1)
+        premul_a = after_rgb * after_alpha.unsqueeze(-1)
+        return torch.linalg.vector_norm(premul_a - premul_b, dim=-1) * (100.0 / math.sqrt(3.0))
+    lab_b = _rgb_to_lab(before_rgb)
+    lab_a = _rgb_to_lab(after_rgb)
+    if mode == 2:
+        d = torch.linalg.vector_norm(lab_a - lab_b, dim=-1)
+    else:
+        d = _delta_e_ciede2000(lab_b, lab_a)
+    d = d * overlap
+    if mode >= 3:
+        # Hybrid: add a premultiplied color guard for transparent/antialiased boundaries.
+        premul_b = before_rgb * before_alpha.unsqueeze(-1)
+        premul_a = after_rgb * after_alpha.unsqueeze(-1)
+        premul = torch.linalg.vector_norm(premul_a - premul_b, dim=-1) * (100.0 / math.sqrt(3.0))
+        d = torch.maximum(d, premul)
+    return d.clamp_min(0.0)
def _structure_difference_map(
    before_rgb: torch.Tensor,
    after_rgb: torch.Tensor,
    mode: int,
    window: int,
    sigma: float,
) -> torch.Tensor:
    """SSIM-based structural difference between two [H,W,3] images, in [0, 1].

    mode 2 averages the per-channel SSIM differences of the three color
    channels; every other mode compares the luma of both images.
    """
    if mode != 2:
        luma_difference = _ssim_difference(_luma(before_rgb), _luma(after_rgb), window, sigma)
        return luma_difference.clamp(0.0, 1.0)

    per_channel = [
        _ssim_difference(before_rgb[..., channel], after_rgb[..., channel], window, sigma)
        for channel in range(3)
    ]
    return torch.stack(per_channel, dim=0).mean(dim=0).clamp(0.0, 1.0)
def _ssim_difference(x: torch.Tensor, y: torch.Tensor, window: int, sigma: float) -> torch.Tensor:
    """Map two single-channel [H,W] images to ``(1 - SSIM) / 2`` per pixel.

    Local statistics use a separable Gaussian window whose radius is half of
    the odd-adjusted ``window``. The result lies in [0, 1].
    """
    radius = _make_odd(window, minimum=3) // 2
    x4 = x[None, None]
    y4 = y[None, None]

    taps = _gaussian_kernel1d(radius, sigma, x.device, x.dtype)
    row_kernel = taps.view(1, 1, 1, -1)
    col_kernel = taps.view(1, 1, -1, 1)
    pad_mode = _pad_mode_for(x4, radius)

    def local_mean(z: torch.Tensor) -> torch.Tensor:
        z = F.conv2d(F.pad(z, (radius, radius, 0, 0), mode=pad_mode), row_kernel)
        return F.conv2d(F.pad(z, (0, 0, radius, radius), mode=pad_mode), col_kernel)

    mean_x = local_mean(x4)
    mean_y = local_mean(y4)
    mean_x_sq = mean_x * mean_x
    mean_y_sq = mean_y * mean_y
    mean_xy = mean_x * mean_y

    var_x = local_mean(x4 * x4) - mean_x_sq
    var_y = local_mean(y4 * y4) - mean_y_sq
    cov_xy = local_mean(x4 * y4) - mean_xy

    # Stabilizing constants of the SSIM formula.
    c1 = 0.01 ** 2
    c2 = 0.03 ** 2
    ssim = ((2.0 * mean_xy + c1) * (2.0 * cov_xy + c2)) / ((mean_x_sq + mean_y_sq + c1) * (var_x + var_y + c2) + _EPS)

    # Difference in [0, 1] for normal cases. Negative SSIM maps become strong differences.
    bounded = ssim.squeeze(0).squeeze(0).clamp(-1.0, 1.0)
    return ((1.0 - bounded) * 0.5).clamp(0.0, 1.0)
def _gradient_difference_map(
    before_rgb: torch.Tensor,
    after_rgb: torch.Tensor,
    before_alpha: torch.Tensor,
    after_alpha: torch.Tensor,
    mode: int,
) -> torch.Tensor:
    """Return a non-negative map of how much the image gradients changed.

    Gradients are Sobel responses of the alpha-weighted luma of each image.
    With ``mode == 2`` the result is the length of the gradient *vector*
    difference; any other mode compares gradient *magnitudes* only.
    """

    def weighted_luma(rgb: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
        # Lift to 4-D so the map can be fed to conv2d inside _sobel_xy.
        return (_luma(rgb) * alpha).unsqueeze(0).unsqueeze(0)

    before_gx, before_gy = _sobel_xy(weighted_luma(before_rgb, before_alpha))
    after_gx, after_gy = _sobel_xy(weighted_luma(after_rgb, after_alpha))

    if mode == 2:
        diff = torch.sqrt(
            (after_gx - before_gx).square() + (after_gy - before_gy).square() + _EPS
        )
    else:
        before_mag = torch.sqrt(before_gx.square() + before_gy.square() + _EPS)
        after_mag = torch.sqrt(after_gx.square() + after_gy.square() + _EPS)
        diff = (after_mag - before_mag).abs()

    return diff.squeeze(0).squeeze(0).clamp_min(0.0)
+def _sobel_xy(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    kx = torch.tensor(
+        [[-1.0, 0.0, 1.0], [-2.0, 0.0, 2.0], [-1.0, 0.0, 1.0]],
+        dtype=x.dtype,
+        device=x.device,
+    ).view(1, 1, 3, 3) / 8.0
+    ky = torch.tensor(
+        [[-1.0, -2.0, -1.0], [0.0, 0.0, 0.0], [1.0, 2.0, 1.0]],
+        dtype=x.dtype,
+        device=x.device,
+    ).view(1, 1, 3, 3) / 8.0
+    xp = F.pad(x, (1, 1, 1, 1), mode="reflect" if x.shape[-1] > 1 and x.shape[-2] > 1 else "replicate")
+    return F.conv2d(xp, kx), F.conv2d(xp, ky)
+# -----------------------------------------------------------------------------
+# Color science: sRGB -> Lab and CIEDE2000
+# -----------------------------------------------------------------------------
+def _srgb_to_linear(rgb: torch.Tensor) -> torch.Tensor:
+    return torch.where(rgb <= 0.04045, rgb / 12.92, torch.pow((rgb + 0.055) / 1.055, 2.4))
def _rgb_to_lab(rgb: torch.Tensor) -> torch.Tensor:
    """Convert sRGB values (channels on the last axis) to CIE Lab.

    The input is clamped to [0, 1], linearised, mapped to XYZ, normalised by
    the D65 white point and passed through the Lab companding function.
    Returns a tensor with L, a, b stacked on a new last axis.
    """
    linear = _srgb_to_linear(rgb.clamp(0.0, 1.0))
    red, green, blue = linear[..., 0], linear[..., 1], linear[..., 2]

    # Linear sRGB -> XYZ, each component divided by the D65 white point.
    x_rel = (0.4124564 * red + 0.3575761 * green + 0.1804375 * blue) / 0.95047
    y_rel = (0.2126729 * red + 0.7151522 * green + 0.0721750 * blue) / 1.00000
    z_rel = (0.0193339 * red + 0.1191920 * green + 0.9503041 * blue) / 1.08883

    knee = 6.0 / 29.0

    def compand(t: torch.Tensor) -> torch.Tensor:
        # Cube root above knee**3, linear ramp below it.
        cube_root = torch.pow(t.clamp_min(0.0), 1.0 / 3.0)
        ramp = t / (3.0 * knee * knee) + 4.0 / 29.0
        return torch.where(t > knee ** 3, cube_root, ramp)

    f_x, f_y, f_z = compand(x_rel), compand(y_rel), compand(z_rel)
    lightness = 116.0 * f_y - 16.0
    green_red = 500.0 * (f_x - f_y)
    blue_yellow = 200.0 * (f_y - f_z)
    return torch.stack([lightness, green_red, blue_yellow], dim=-1)
def _delta_e_ciede2000(lab1: torch.Tensor, lab2: torch.Tensor) -> torch.Tensor:
    """Return the element-wise CIEDE2000 colour difference of two Lab tensors.

    L, a and b are read from indices 0, 1 and 2 of the last axis. ``_EPS`` is
    added inside square roots and denominators to keep them finite, so the
    result deviates from the textbook formula by a tiny amount.
    """
    chroma_ref_7 = 25.0 ** 7

    def cos_deg(angle: torch.Tensor) -> torch.Tensor:
        return torch.cos(torch.deg2rad(angle))

    def hue_deg(b: torch.Tensor, a: torch.Tensor) -> torch.Tensor:
        # Hue angle in degrees, wrapped into [0, 360).
        return torch.rad2deg(torch.atan2(b, a)) % 360.0

    light_1, a_1, b_1 = lab1[..., 0], lab1[..., 1], lab1[..., 2]
    light_2, a_2, b_2 = lab2[..., 0], lab2[..., 1], lab2[..., 2]

    # Chroma-dependent rescaling of the a axis.
    chroma_1 = torch.sqrt(a_1 * a_1 + b_1 * b_1 + _EPS)
    chroma_2 = torch.sqrt(a_2 * a_2 + b_2 * b_2 + _EPS)
    mean_chroma = (chroma_1 + chroma_2) * 0.5
    mean_chroma_7 = mean_chroma.pow(7.0)
    gain = 0.5 * (1.0 - torch.sqrt(mean_chroma_7 / (mean_chroma_7 + chroma_ref_7 + _EPS)))
    a_1p = (1.0 + gain) * a_1
    a_2p = (1.0 + gain) * a_2
    chroma_1p = torch.sqrt(a_1p * a_1p + b_1 * b_1 + _EPS)
    chroma_2p = torch.sqrt(a_2p * a_2p + b_2 * b_2 + _EPS)
    hue_1 = hue_deg(b_1, a_1p)
    hue_2 = hue_deg(b_2, a_2p)

    # Lightness, chroma and hue differences.
    d_light = light_2 - light_1
    d_chroma = chroma_2p - chroma_1p
    raw_hue_gap = hue_2 - hue_1
    achromatic = (chroma_1p * chroma_2p) <= _EPS
    chromatic = ~achromatic
    hue_gap = torch.where(achromatic, torch.zeros_like(raw_hue_gap), raw_hue_gap)
    # Wrap the hue gap into [-180, 180].
    hue_gap = torch.where((hue_gap > 180.0) & chromatic, hue_gap - 360.0, hue_gap)
    hue_gap = torch.where((hue_gap < -180.0) & chromatic, hue_gap + 360.0, hue_gap)
    d_hue = 2.0 * torch.sqrt(chroma_1p * chroma_2p + _EPS) * torch.sin(torch.deg2rad(hue_gap * 0.5))

    # Mean lightness, chroma and hue (the hue mean respects the 360-degree wrap).
    mean_light = (light_1 + light_2) * 0.5
    mean_chroma_p = (chroma_1p + chroma_2p) * 0.5
    hue_sum = hue_1 + hue_2
    hue_spread = torch.abs(hue_1 - hue_2)
    far_apart = chromatic & (hue_spread > 180.0)
    mean_hue = torch.where(achromatic, hue_sum, hue_sum * 0.5)
    mean_hue = torch.where(far_apart & (hue_sum < 360.0), (hue_sum + 360.0) * 0.5, mean_hue)
    mean_hue = torch.where(far_apart & (hue_sum >= 360.0), (hue_sum - 360.0) * 0.5, mean_hue)

    # Weighting functions and the rotation term.
    hue_weight = (
        1.0
        - 0.17 * cos_deg(mean_hue - 30.0)
        + 0.24 * cos_deg(2.0 * mean_hue)
        + 0.32 * cos_deg(3.0 * mean_hue + 6.0)
        - 0.20 * cos_deg(4.0 * mean_hue - 63.0)
    )
    rotation_angle = 30.0 * torch.exp(-((mean_hue - 275.0) / 25.0).square())
    mean_chroma_p7 = mean_chroma_p.pow(7.0)
    chroma_rotation = 2.0 * torch.sqrt(mean_chroma_p7 / (mean_chroma_p7 + chroma_ref_7 + _EPS))
    light_offset_sq = (mean_light - 50.0).square()
    scale_light = 1.0 + (0.015 * light_offset_sq) / torch.sqrt(20.0 + light_offset_sq + _EPS)
    scale_chroma = 1.0 + 0.045 * mean_chroma_p
    scale_hue = 1.0 + 0.015 * mean_chroma_p * hue_weight
    rotation = -torch.sin(torch.deg2rad(2.0 * rotation_angle)) * chroma_rotation

    term_light = d_light / (scale_light + _EPS)
    term_chroma = d_chroma / (scale_chroma + _EPS)
    term_hue = d_hue / (scale_hue + _EPS)
    # The rotation term can push the sum slightly negative; clamp before the root.
    squared = (
        term_light * term_light
        + term_chroma * term_chroma
        + term_hue * term_hue
        + rotation * term_chroma * term_hue
    )
    return torch.sqrt(squared.clamp_min(0.0))
+# -----------------------------------------------------------------------------
+# Normalization and combination
+# -----------------------------------------------------------------------------
+def _normalize_map(
+    d: torch.Tensor,
+    valid: torch.Tensor,
+    mode: int,
+    noise_floor_k: float,
+    mad_scale: float,
+    fixed_scale: float,
+) -> torch.Tensor:
+    d = torch.nan_to_num(d.float(), nan=0.0, posinf=0.0, neginf=0.0).clamp_min(0.0)
+    if mode < 0:
+        return (d / max(float(fixed_scale), _EPS)).clamp(0.0, 1.0)
+    vals = d[valid]
+    if vals.numel() < 16:
+        vals = d.reshape(-1)
+    if vals.numel() < 1:
+        return torch.zeros_like(d)
+    vals = vals.float()
+    q50 = torch.quantile(vals, 0.50)
+    q95 = torch.quantile(vals, 0.95)
+    q99 = torch.quantile(vals, 0.99)
+    if mode == 0:
+        denom = (q95 - q50).abs().clamp_min(1.0e-6)
+        return ((d - q50) / denom).clamp(0.0, 1.0)
+    med = q50
+    mad = torch.quantile((vals - med).abs(), 0.50) * 1.4826
+    floor = med + float(noise_floor_k) * mad
+    if mode == 2:
+        # Hybrid: threshold by MAD, but stretch by high percentile for less brittle behavior.
+        denom = torch.maximum((q95 - floor).abs(), float(mad_scale) * mad).clamp_min(1.0e-6)
+    else:
+        denom = (float(mad_scale) * mad).clamp_min(1.0e-6)
+    # Fallback if the image is almost constant and MAD collapses.
+    if float(denom.detach().cpu()) <= 1.0e-5:
+        floor = q50
+        denom = (q99 - q50).abs().clamp_min(1.0e-6)
+    return ((d - floor) / denom).clamp(0.0, 1.0)
def _combine_maps(maps: Sequence[Tuple[torch.Tensor, float, str, float]], mode: int) -> torch.Tensor:
    """Fuse several cue maps into one score map in [0, 1].

    Only the first two entries of every tuple are read: the map (clamped to
    [0, 1]) and its weight (negative weights count as 0). Maps with zero
    weight are ignored; if all weights are zero the result is all zeros.

    Modes:
        ``2``: per-pixel maximum of the maps, each multiplied by its weight
            relative to the average weight.
        ``3``: noisy OR, ``1 - prod(1 - map * weight / total_weight)``.
        otherwise: weighted average.
    """
    clamped = [entry[0].clamp(0.0, 1.0) for entry in maps]
    gains = [max(0.0, float(entry[1])) for entry in maps]
    gain_sum = sum(gains)
    if gain_sum <= _EPS:
        return torch.zeros_like(clamped[0])

    active = [(tensor, gain) for tensor, gain in zip(clamped, gains) if gain > 0.0]

    if mode == 2:
        # Max catches small decisive signals; the weight acts as a multiplicative gain.
        scaled = [tensor * (gain / gain_sum * len(clamped)) for tensor, gain in active]
        return torch.stack(scaled, dim=0).max(dim=0).values.clamp(0.0, 1.0)

    if mode == 3:
        # Noisy OR: any strong cue fires, isolated weak cues do not dominate.
        miss = torch.ones_like(clamped[0])
        for tensor, gain in active:
            miss = miss * (1.0 - (tensor * (gain / gain_sum)).clamp(0.0, 1.0))
        return (1.0 - miss).clamp(0.0, 1.0)

    # Every remaining mode is a plain weighted average.
    total = torch.zeros_like(clamped[0])
    for tensor, gain in active:
        total = total + tensor * gain
    return (total / gain_sum).clamp(0.0, 1.0)
+# -----------------------------------------------------------------------------
+# Core mask, morphology, components, feathering
+# -----------------------------------------------------------------------------
+def _make_core_mask(score: np.ndarray, low_threshold: float, high_threshold: float, hysteresis_mode: int) -> np.ndarray:
+    score = np.nan_to_num(score.astype(np.float32), nan=0.0, posinf=0.0, neginf=0.0)
+    if hysteresis_mode < 0:
+        return score >= float(high_threshold)
+    if hysteresis_mode == 2:
+        return score >= float(high_threshold)
+    strong = score >= float(high_threshold)
+    weak = score >= float(low_threshold)
+    if not strong.any():
+        return strong
+    return _hysteresis_connected(weak, strong)
+def _hysteresis_connected(weak: np.ndarray, strong: np.ndarray) -> np.ndarray:
+    weak = weak.astype(bool)
+    strong = strong.astype(bool)
+    if ndi is not None:
+        labels, n = ndi.label(weak, structure=np.ones((3, 3), dtype=np.uint8))
+        if n == 0:
+            return np.zeros_like(weak, dtype=bool)
+        strong_labels = np.unique(labels[strong])
+        strong_labels = strong_labels[strong_labels != 0]
+        if len(strong_labels) == 0:
+            return np.zeros_like(weak, dtype=bool)
+        return np.isin(labels, strong_labels)
+    h, w = weak.shape
+    out = np.zeros_like(weak, dtype=bool)
+    seen = np.zeros_like(weak, dtype=bool)
+    neighbors = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
+    ys, xs = np.nonzero(weak)
+    for sy, sx in zip(ys, xs):
+        if seen[sy, sx]:
+            continue
+        q = deque([(int(sy), int(sx))])
+        seen[sy, sx] = True
+        coords = []
+        has_strong = False
+        while q:
+            y, x = q.popleft()
+            coords.append((y, x))
+            if strong[y, x]:
+                has_strong = True
+            for dy, dx in neighbors:
+                ny, nx = y + dy, x + dx
+                if 0 <= ny < h and 0 <= nx < w and weak[ny, nx] and not seen[ny, nx]:
+                    seen[ny, nx] = True
+                    q.append((ny, nx))
+        if has_strong:
+            for y, x in coords:
+                out[y, x] = True
+    return out
+def _morph_binary(mask: np.ndarray, mode: int, radius: int) -> np.ndarray:
+    radius = int(radius)
+    if radius <= 0:
+        return mask.astype(bool)
+    x = torch.from_numpy(mask.astype(np.float32)).unsqueeze(0).unsqueeze(0)
+    def dilate(t: torch.Tensor) -> torch.Tensor:
+        return F.max_pool2d(t, kernel_size=2 * radius + 1, stride=1, padding=radius)
+    def erode(t: torch.Tensor) -> torch.Tensor:
+        return 1.0 - F.max_pool2d(1.0 - t, kernel_size=2 * radius + 1, stride=1, padding=radius)
+    if mode == 2:
+        y = erode(dilate(x))  # close only
+    elif mode == 3:
+        y = dilate(x)
+    elif mode == 4:
+        y = erode(x)
+    else:
+        y = dilate(erode(x))  # open
+        y = erode(dilate(y))  # close
+    return (y.squeeze(0).squeeze(0).numpy() >= 0.5)
def _filter_components(mask: np.ndarray, min_area: int, keep_largest: int) -> np.ndarray:
    """Drop small 8-connected components and optionally keep only the largest.

    Components with fewer than ``min_area`` pixels are removed. If
    ``keep_largest`` is positive, only that many of the remaining components
    survive, largest first (ties keep discovery order). Negative arguments
    are treated as 0. Labelling uses OpenCV when available, then SciPy, then
    a pure-Python breadth-first search.
    """
    binary = mask.astype(bool)
    if not binary.any():
        return binary

    area_floor = max(0, int(min_area))
    top_k = max(0, int(keep_largest))

    def select(candidates):
        # ``candidates`` holds (payload, area) pairs in discovery order.
        kept = [item for item in candidates if item[1] >= area_floor]
        if top_k > 0:
            kept = sorted(kept, key=lambda item: item[1], reverse=True)[:top_k]
        return kept

    if cv2 is not None:
        count, labels, stats, _ = cv2.connectedComponentsWithStats(binary.astype(np.uint8), connectivity=8)
        if count <= 1:
            return binary
        # Label 0 is the background and is never a candidate.
        sized = [(label, int(stats[label, cv2.CC_STAT_AREA])) for label in range(1, count)]
        return np.isin(labels, [label for label, _ in select(sized)])

    if ndi is not None:
        labels, count = ndi.label(binary, structure=np.ones((3, 3), dtype=np.uint8))
        areas = np.bincount(labels.ravel())
        sized = [(label, int(areas[label])) for label in range(1, count + 1)]
        return np.isin(labels, [label for label, _ in select(sized)])

    # Pure numpy fallback: breadth-first search per component.
    rows, cols = binary.shape
    visited = np.zeros_like(binary, dtype=bool)
    offsets = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
    regions: List[Tuple[List[Tuple[int, int]], int]] = []
    for seed_y, seed_x in zip(*np.nonzero(binary)):
        if visited[seed_y, seed_x]:
            continue
        visited[seed_y, seed_x] = True
        frontier = deque([(int(seed_y), int(seed_x))])
        pixels: List[Tuple[int, int]] = []
        while frontier:
            y, x = frontier.popleft()
            pixels.append((y, x))
            for dy, dx in offsets:
                ny, nx = y + dy, x + dx
                inside = 0 <= ny < rows and 0 <= nx < cols
                if inside and binary[ny, nx] and not visited[ny, nx]:
                    visited[ny, nx] = True
                    frontier.append((ny, nx))
        regions.append((pixels, len(pixels)))

    result = np.zeros_like(binary, dtype=bool)
    for pixels, _ in select(regions):
        coords = np.asarray(pixels)
        result[coords[:, 0], coords[:, 1]] = True
    return result
+def _feather_core(mask: np.ndarray, mode: int, radius: int, logistic_steepness: float) -> np.ndarray:
+    mask = mask.astype(bool)
+    if mode < 0 or radius <= 0:
+        return mask.astype(np.float32)
+    if not mask.any():
+        return np.zeros_like(mask, dtype=np.float32)
+    if mode == 1:
+        x = torch.from_numpy(mask.astype(np.float32)).unsqueeze(0).unsqueeze(0)
+        y = _gaussian_blur_bchw(x, radius=max(1, radius), sigma=max(0.1, radius / 2.0))
+        return y.squeeze(0).squeeze(0).numpy().clip(0.0, 1.0).astype(np.float32)
+    dist = _outside_distance(mask, max_radius=radius).astype(np.float32)
+    inside = mask
+    d = np.clip(dist, 0.0, float(radius))
+    if mode == 3:
+        r = max(float(radius), 1.0)
+        z = logistic_steepness * (0.5 - d / r)
+        raw = 1.0 / (1.0 + np.exp(-z))
+        raw0 = 1.0 / (1.0 + math.exp(-logistic_steepness * 0.5))
+        rawr = 1.0 / (1.0 + math.exp(logistic_steepness * 0.5))
+        falloff = (raw - rawr) / max(raw0 - rawr, 1.0e-6)
+    else:
+        # Smoothstep distance falloff. Recommended default.
+        t = np.clip(1.0 - d / max(float(radius), 1.0), 0.0, 1.0)
+        falloff = t * t * (3.0 - 2.0 * t)
+    falloff[d >= float(radius)] = 0.0
+    falloff[inside] = 1.0
+    return falloff.clip(0.0, 1.0).astype(np.float32)
def _outside_distance(mask: np.ndarray, max_radius: int) -> np.ndarray:
    """Return, per pixel, the distance to the nearest mask pixel (0 inside).

    OpenCV's or SciPy's Euclidean distance transform is used when available.
    Without either, a Chebyshev (square-ring) distance is built by repeated
    3x3 dilation; it is only resolved up to ``max_radius`` and every pixel
    farther away is reported as ``max_radius + 1``.

    Args:
        mask: 2-D mask; non-zero / True marks the region to measure from.
        max_radius: Largest distance the fallback resolves (at least 1).

    Returns:
        A float32 array shaped like ``mask``.
    """
    mask = np.asarray(mask).astype(bool)

    if cv2 is not None:
        outside = (~mask).astype(np.uint8)
        return cv2.distanceTransform(outside, cv2.DIST_L2, 5)
    if ndi is not None:
        return ndi.distance_transform_edt(~mask).astype(np.float32)

    # Fallback: Chebyshev ring distance up to max_radius.
    limit = max(1, int(max_radius))
    dist = np.full(mask.shape, fill_value=limit + 1, dtype=np.float32)
    dist[mask] = 0.0

    grown = torch.from_numpy(mask.astype(np.float32)).unsqueeze(0).unsqueeze(0)
    reached = mask
    for ring_index in range(1, limit + 1):
        grown = F.max_pool2d(grown, kernel_size=3, stride=1, padding=1)
        # Index the two leading axes explicitly: a bare squeeze() would also
        # drop a height or width of 1 and break the boolean indexing below.
        covered = grown[0, 0].numpy() >= 0.5
        ring = covered & ~reached
        if not ring.any():
            # The dilation stopped growing, so later rings are empty as well.
            break
        dist[ring] = float(ring_index)
        reached = covered
    return dist
# Registration table: node identifier -> implementing class.
# NOTE(review): presumably read by ComfyUI when it imports this module — confirm.
NODE_CLASS_MAPPINGS = {
    "Salia_Get_Diff_Mask": Salia_Get_Diff_Mask,
}
# Node identifier -> display label; here the label is the same as the identifier.
NODE_DISPLAY_NAME_MAPPINGS = {
    "Salia_Get_Diff_Mask": "Salia_Get_Diff_Mask",
}