# AILab_SAM3Segment.py
# Integrated standalone nodes:
#   - SAM3Segment
#   - Salia_ezpz_gated_Duo2
#   - apply_segment_4
#   - SAM3Segment_Salia (fused)

import os
import sys
import hashlib
import shutil
import threading
import urllib.request
import heapq
from contextlib import nullcontext
from pathlib import Path
from typing import Any, Dict, Tuple, Optional, List

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image, ImageFilter, ImageOps
from torch.hub import download_url_to_file

import folder_paths
import comfy.model_management
import comfy.model_management as model_management

from AILab_ImageMaskTools import pil2tensor, tensor2pil

# ======================================================================================
# SAM3Segment (original, with syntax fix)
# ======================================================================================

CURRENT_DIR = os.path.dirname(__file__)
SAM3_LOCAL_DIR = os.path.join(CURRENT_DIR, "sam3")
if SAM3_LOCAL_DIR not in sys.path:
    sys.path.insert(0, SAM3_LOCAL_DIR)

SAM3_BPE_PATH = os.path.join(SAM3_LOCAL_DIR, "assets", "bpe_simple_vocab_16e6.txt.gz")
if not os.path.isfile(SAM3_BPE_PATH):
    raise RuntimeError("SAM3 assets missing; ensure sam3/assets/bpe_simple_vocab_16e6.txt.gz exists.")

from sam3.model_builder import build_sam3_image_model  # noqa: E402
from sam3.model.sam3_image_processor import Sam3Processor  # noqa: E402

_DEFAULT_PT_ENTRY = {
    "model_url": "https://huggingface.co/1038lab/sam3/resolve/main/sam3.pt",
    "filename": "sam3.pt",
}

SAM3_MODELS = {
    "sam3": _DEFAULT_PT_ENTRY.copy(),
}


def get_sam3_pt_models():
    entry = SAM3_MODELS.get("sam3")
    if entry and entry.get("filename", "").endswith(".pt"):
        return {"sam3": entry}
    for key, value in SAM3_MODELS.items():
        if value.get("filename", "").endswith(".pt"):
            return {"sam3": value}
        if "sam3" in key and value:
            candidate = value.copy()
            candidate["model_url"] = _DEFAULT_PT_ENTRY["model_url"]
            candidate["filename"] = _DEFAULT_PT_ENTRY["filename"]
            return {"sam3": candidate}
    return {"sam3": _DEFAULT_PT_ENTRY.copy()}


def process_mask(mask_image, invert_output=False, mask_blur=0, mask_offset=0):
    if invert_output:
        mask_np = np.array(mask_image)
        mask_image = Image.fromarray(255 - mask_np)
    if mask_blur > 0:
        mask_image = mask_image.filter(ImageFilter.GaussianBlur(radius=mask_blur))
    if mask_offset != 0:
        filt = ImageFilter.MaxFilter if mask_offset > 0 else ImageFilter.MinFilter
        size = abs(mask_offset) * 2 + 1
        for _ in range(abs(mask_offset)):
            mask_image = mask_image.filter(filt(size))
    return mask_image


def apply_background_color(image, mask_image, background="Alpha", background_color="#222222"):
    rgba_image = image.copy().convert("RGBA")
    rgba_image.putalpha(mask_image.convert("L"))
    if background == "Color":
        hex_color = background_color.lstrip("#")
        r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
        bg_image = Image.new("RGBA", image.size, (r, g, b, 255))
        composite = Image.alpha_composite(bg_image, rgba_image)
        return composite.convert("RGB")
    return rgba_image


def get_or_download_model_file(filename, url):
    local_path = None
    if hasattr(folder_paths, "get_full_path"):
        local_path = folder_paths.get_full_path("sam3", filename)
    if local_path and os.path.isfile(local_path):
        return local_path
    base_models_dir = getattr(folder_paths, "models_dir", os.path.join(CURRENT_DIR, "models"))
    models_dir = os.path.join(base_models_dir, "sam3")
    os.makedirs(models_dir, exist_ok=True)
    local_path = os.path.join(models_dir, filename)
    if not os.path.exists(local_path):
        print(f"Downloading {filename} from {url} ...")
        download_url_to_file(url, local_path)
    return local_path


def _resolve_device(user_choice):
    auto_device = comfy.model_management.get_torch_device()
    if user_choice == "CPU":
        return torch.device("cpu")
    if user_choice == "GPU":
        if auto_device.type != "cuda":
            raise RuntimeError("GPU unavailable")
        return torch.device("cuda")
    return auto_device


class SAM3Segment:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "prompt": ("STRING", {"default": "", "multiline": True, "placeholder": "Describe the concept"}),
                "sam3_model": (list(SAM3_MODELS.keys()), {"default": "sam3"}),
                "device": (["Auto", "CPU", "GPU"], {"default": "Auto"}),
                "confidence_threshold": ("FLOAT", {"default": 0.5, "min": 0.05, "max": 0.95, "step": 0.01}),
            },
            "optional": {
                "mask_blur": ("INT", {"default": 0, "min": 0, "max": 64, "step": 1}),
                "mask_offset": ("INT", {"default": 0, "min": -64, "max": 64, "step": 1}),
                "invert_output": ("BOOLEAN", {"default": False}),
                "unload_model": ("BOOLEAN", {"default": False}),
                "background": (["Alpha", "Color"], {"default": "Alpha"}),
                "background_color": ("COLORCODE", {"default": "#222222"}),
            },
        }

    RETURN_TYPES = ("IMAGE", "MASK", "IMAGE")
    RETURN_NAMES = ("IMAGE", "MASK", "MASK_IMAGE")
    FUNCTION = "segment"
    CATEGORY = "🧪AILab/🧽RMBG"

    def __init__(self):
        self.processor_cache = {}

    def _load_processor(self, model_choice, device_choice):
        torch_device = _resolve_device(device_choice)
        device_str = "cuda" if torch_device.type == "cuda" else "cpu"
        cache_key = (model_choice, device_str)
        if cache_key not in self.processor_cache:
            model_info = SAM3_MODELS[model_choice]
            ckpt_path = get_or_download_model_file(model_info["filename"], model_info["model_url"])
            model = build_sam3_image_model(
                bpe_path=SAM3_BPE_PATH,
                device=device_str,
                eval_mode=True,
                checkpoint_path=ckpt_path,
                load_from_HF=False,
                enable_segmentation=True,
                enable_inst_interactivity=False,
            )
            processor = Sam3Processor(model, device=device_str)
            self.processor_cache[cache_key] = processor
        return self.processor_cache[cache_key], torch_device

    def _empty_result(self, img_pil, background, background_color):
        w, h = img_pil.size
        mask_image = Image.new("L", (w, h), 0)
        result_image = apply_background_color(img_pil, mask_image, background, background_color)
        if background == "Alpha":
            result_image = result_image.convert("RGBA")
        else:
            result_image = result_image.convert("RGB")
        empty_mask = torch.zeros((1, h, w), dtype=torch.float32)
        mask_rgb = empty_mask.reshape((-1, 1, h, w)).movedim(1, -1).expand(-1, -1, -1, 3)
        return result_image, empty_mask, mask_rgb

    def _run_single(self, processor, img_tensor, prompt, confidence, mask_blur, mask_offset, invert, background, background_color):
        img_pil = tensor2pil(img_tensor)
        text = prompt.strip() or "object"
        state = processor.set_image(img_pil)
        processor.reset_all_prompts(state)
        processor.set_confidence_threshold(confidence, state)
        state = processor.set_text_prompt(text, state)
        masks = state.get("masks")
        if masks is None or masks.numel() == 0:
            return self._empty_result(img_pil, background, background_color)
        masks = masks.float().to("cpu")
        if masks.ndim == 4:
            masks = masks.squeeze(1)
        combined = masks.amax(dim=0)
        mask_np = (combined.clamp(0, 1).numpy() * 255).astype(np.uint8)
        mask_image = Image.fromarray(mask_np, mode="L")
        mask_image = process_mask(mask_image, invert, mask_blur, mask_offset)
        result_image = apply_background_color(img_pil, mask_image, background, background_color)
        if background == "Alpha":
            result_image = result_image.convert("RGBA")
        else:
            result_image = result_image.convert("RGB")
        mask_tensor = torch.from_numpy(np.array(mask_image).astype(np.float32) / 255.0).unsqueeze(0)
        mask_rgb = mask_tensor.reshape((-1, 1, mask_image.height, mask_image.width)).movedim(1, -1).expand(-1, -1, -1, 3)
        return result_image, mask_tensor, mask_rgb

    def segment(self, image, prompt, sam3_model, device, confidence_threshold=0.5, mask_blur=0, mask_offset=0, invert_output=False, unload_model=False, background="Alpha", background_color="#222222"):
        if image.ndim == 3:
            image = image.unsqueeze(0)

        processor, torch_device = self._load_processor(sam3_model, device)
        autocast_device = comfy.model_management.get_autocast_device(torch_device)
        autocast_enabled = torch_device.type == "cuda" and not comfy.model_management.is_device_mps(torch_device)
        ctx = torch.autocast(autocast_device, dtype=torch.bfloat16) if autocast_enabled else nullcontext()

        result_images, result_masks, result_mask_images = [], [], []

        with ctx:
            for tensor_img in image:
                img_pil, mask_tensor, mask_rgb = self._run_single(
                    processor,
                    tensor_img,
                    prompt,
                    confidence_threshold,
                    mask_blur,
                    mask_offset,
                    invert_output,
                    background,
                    background_color,
                )
                result_images.append(pil2tensor(img_pil))
                result_masks.append(mask_tensor)
                result_mask_images.append(mask_rgb)

        if unload_model:
            device_str = "cuda" if torch_device.type == "cuda" else "cpu"
            cache_key = (sam3_model, device_str)
            if cache_key in self.processor_cache:
                del self.processor_cache[cache_key]
            if torch_device.type == "cuda":
                torch.cuda.empty_cache()

        return torch.cat(result_images, dim=0), torch.cat(result_masks, dim=0), torch.cat(result_mask_images, dim=0)


# ======================================================================================
# Salia_ezpz_gated_Duo2 (standalone)
# ======================================================================================

# transformers is required for depth-estimation pipeline
try:
    from transformers import pipeline
except Exception as e:
    pipeline = None
    _TRANSFORMERS_IMPORT_ERROR = e

_CKPT_CACHE: Dict[str, Tuple[Any, Any, Any]] = {}
_CN_CACHE: Dict[str, Any] = {}
_CKPT_LOCK = threading.Lock()
_CN_LOCK = threading.Lock()


def _find_plugin_root() -> Path:
    """
    Walk upwards from this file until we find an 'assets' folder.
    If not found, fall back to this file's directory.
    """
    here = Path(__file__).resolve()
    for parent in [here.parent] + list(here.parents)[:12]:
        if (parent / "assets").is_dir():
            return parent
    return here.parent


PLUGIN_ROOT = _find_plugin_root()


def _pil_lanczos():
    if hasattr(Image, "Resampling"):
        return Image.Resampling.LANCZOS
    return Image.LANCZOS


def _image_tensor_to_pil(img: torch.Tensor) -> Image.Image:
    if img.ndim == 4:
        img = img[0]
    img = img.detach().cpu().float().clamp(0, 1)
    arr = (img.numpy() * 255.0).round().astype(np.uint8)
    if arr.shape[-1] == 4:
        return Image.fromarray(arr, mode="RGBA")
    return Image.fromarray(arr, mode="RGB")


def _pil_to_image_tensor(pil: Image.Image) -> torch.Tensor:
    if pil.mode not in ("RGB", "RGBA"):
        pil = pil.convert("RGBA") if "A" in pil.getbands() else pil.convert("RGB")
    arr = np.array(pil).astype(np.float32) / 255.0
    t = torch.from_numpy(arr)
    return t.unsqueeze(0)


def _mask_tensor_to_pil(mask: torch.Tensor) -> Image.Image:
    if mask.ndim == 3:
        mask = mask[0]
    mask = mask.detach().cpu().float().clamp(0, 1)
    arr = (mask.numpy() * 255.0).round().astype(np.uint8)
    return Image.fromarray(arr, mode="L")


def _pil_to_mask_tensor(pil_l: Image.Image) -> torch.Tensor:
    if pil_l.mode != "L":
        pil_l = pil_l.convert("L")
    arr = np.array(pil_l).astype(np.float32) / 255.0
    t = torch.from_numpy(arr)
    return t.unsqueeze(0)


def _resize_image_lanczos(img: torch.Tensor, w: int, h: int) -> torch.Tensor:
    if img.ndim != 4:
        raise ValueError("Expected IMAGE tensor with shape [B,H,W,C].")
    outs = []
    for i in range(img.shape[0]):
        pil = _image_tensor_to_pil(img[i].unsqueeze(0))
        pil = pil.resize((int(w), int(h)), resample=_pil_lanczos())
        outs.append(_pil_to_image_tensor(pil))
    return torch.cat(outs, dim=0)


def _resize_mask_lanczos(mask: torch.Tensor, w: int, h: int) -> torch.Tensor:
    if mask.ndim != 3:
        raise ValueError("Expected MASK tensor with shape [B,H,W].")
    outs = []
    for i in range(mask.shape[0]):
        pil = _mask_tensor_to_pil(mask[i].unsqueeze(0))
        pil = pil.resize((int(w), int(h)), resample=_pil_lanczos())
        outs.append(_pil_to_mask_tensor(pil))
    return torch.cat(outs, dim=0)


def _rgb_to_rgba_with_comfy_mask(rgb: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    if rgb.ndim == 3:
        rgb = rgb.unsqueeze(0)
    if mask.ndim == 2:
        mask = mask.unsqueeze(0)

    if rgb.ndim != 4 or rgb.shape[-1] != 3:
        raise ValueError(f"rgb must be [B,H,W,3], got {tuple(rgb.shape)}")
    if mask.ndim != 3:
        raise ValueError(f"mask must be [B,H,W], got {tuple(mask.shape)}")

    if mask.shape[0] != rgb.shape[0]:
        if mask.shape[0] == 1 and rgb.shape[0] > 1:
            mask = mask.expand(rgb.shape[0], -1, -1)
        else:
            raise ValueError("Batch mismatch between rgb and mask.")

    if mask.shape[1] != rgb.shape[1] or mask.shape[2] != rgb.shape[2]:
        raise ValueError(
            f"Mask size mismatch. rgb={rgb.shape[2]}x{rgb.shape[1]} mask={mask.shape[2]}x{mask.shape[1]}"
        )

    mask = mask.to(device=rgb.device, dtype=rgb.dtype).clamp(0, 1)
    alpha = (1.0 - mask).unsqueeze(-1).clamp(0, 1)
    rgba = torch.cat([rgb.clamp(0, 1), alpha], dim=-1)
    return rgba


def _load_checkpoint_cached(ckpt_name: str):
    with _CKPT_LOCK:
        if ckpt_name in _CKPT_CACHE:
            return _CKPT_CACHE[ckpt_name]
        import nodes
        loader = nodes.CheckpointLoaderSimple()
        fn = getattr(loader, loader.FUNCTION)
        model, clip, vae = fn(ckpt_name=ckpt_name)
        _CKPT_CACHE[ckpt_name] = (model, clip, vae)
        return model, clip, vae


def _load_controlnet_cached(control_net_name: str):
    with _CN_LOCK:
        if control_net_name in _CN_CACHE:
            return _CN_CACHE[control_net_name]
        import nodes
        loader = nodes.ControlNetLoader()
        fn = getattr(loader, loader.FUNCTION)
        (cn,) = fn(control_net_name=control_net_name)
        _CN_CACHE[control_net_name] = cn
        return cn


def _assets_images_dir() -> Path:
    return PLUGIN_ROOT / "assets" / "images"


def _list_asset_pngs() -> list:
    img_dir = _assets_images_dir()
    if not img_dir.is_dir():
        return []
    files = []
    for p in img_dir.rglob("*"):
        if p.is_file() and p.suffix.lower() == ".png":
            files.append(p.relative_to(img_dir).as_posix())
    files.sort()
    return files


def _safe_asset_path(asset_rel_path: str) -> Path:
    img_dir = _assets_images_dir()
    if not img_dir.is_dir():
        raise FileNotFoundError(f"assets/images folder not found: {img_dir}")

    base = img_dir.resolve()
    rel = Path(asset_rel_path)

    if rel.is_absolute():
        raise ValueError("Absolute paths are not allowed for asset_image.")

    full = (base / rel).resolve()

    if base != full and base not in full.parents:
        raise ValueError(f"Invalid asset path (path traversal blocked): {asset_rel_path}")

    if not full.is_file():
        raise FileNotFoundError(f"Asset PNG not found in assets/images: {asset_rel_path}")
    if full.suffix.lower() != ".png":
        raise ValueError(f"Asset is not a PNG: {asset_rel_path}")

    return full


def _load_asset_image_and_mask(asset_rel_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
    p = _safe_asset_path(asset_rel_path)

    im = Image.open(p)
    im = ImageOps.exif_transpose(im)

    rgba = im.convert("RGBA")
    rgb = rgba.convert("RGB")

    rgb_arr = np.array(rgb).astype(np.float32) / 255.0
    img_t = torch.from_numpy(rgb_arr)[None, ...]

    alpha = np.array(rgba.getchannel("A")).astype(np.float32) / 255.0
    mask = 1.0 - alpha

    mask_t = torch.from_numpy(mask)[None, ...]
    return img_t, mask_t


MODEL_DIR = PLUGIN_ROOT / "assets" / "depth"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

REQUIRED_FILES = {
    "config.json": "https://huggingface.co/saliacoel/depth/resolve/main/config.json",
    "model.safetensors": "https://huggingface.co/saliacoel/depth/resolve/main/model.safetensors",
    "preprocessor_config.json": "https://huggingface.co/saliacoel/depth/resolve/main/preprocessor_config.json",
}

ZOE_FALLBACK_REPO_ID = "Intel/zoedepth-nyu-kitti"

_PIPE_CACHE: Dict[Tuple[str, str], Any] = {}
_PIPE_LOCK = threading.Lock()


def _have_required_files() -> bool:
    return all((MODEL_DIR / name).exists() for name in REQUIRED_FILES.keys())


def _download_url_to_file(url: str, dst: Path, timeout: int = 180) -> None:
    dst.parent.mkdir(parents=True, exist_ok=True)
    tmp = dst.with_suffix(dst.suffix + ".tmp")

    if tmp.exists():
        try:
            tmp.unlink()
        except Exception:
            pass

    req = urllib.request.Request(url, headers={"User-Agent": "ComfyUI-SaliaDepth/1.1"})
    with urllib.request.urlopen(req, timeout=timeout) as r, open(tmp, "wb") as f:
        shutil.copyfileobj(r, f)

    tmp.replace(dst)


def ensure_local_model_files() -> bool:
    if _have_required_files():
        return True
    try:
        for fname, url in REQUIRED_FILES.items():
            fpath = MODEL_DIR / fname
            if fpath.exists():
                continue
            _download_url_to_file(url, fpath)
        return _have_required_files()
    except Exception:
        return False


def HWC3(x: np.ndarray) -> np.ndarray:
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    color = x[:, :, 0:3].astype(np.float32)
    alpha = x[:, :, 3:4].astype(np.float32) / 255.0
    y = color * alpha + 255.0 * (1.0 - alpha)
    y = y.clip(0, 255).astype(np.uint8)
    return y


def pad64(x: int) -> int:
    return int(np.ceil(float(x) / 64.0) * 64 - x)


def safer_memory(x: np.ndarray) -> np.ndarray:
    return np.ascontiguousarray(x.copy()).copy()


def resize_image_with_pad_min_side(
    input_image: np.ndarray,
    resolution: int,
    upscale_method: str = "INTER_CUBIC",
    skip_hwc3: bool = False,
    mode: str = "edge",
) -> Tuple[np.ndarray, Any]:
    cv2 = None
    try:
        import cv2 as _cv2
        cv2 = _cv2
    except Exception:
        cv2 = None

    img = input_image if skip_hwc3 else HWC3(input_image)

    H_raw, W_raw, _ = img.shape
    if resolution <= 0:
        return img, (lambda x: x)

    k = float(resolution) / float(min(H_raw, W_raw))
    H_target = int(np.round(float(H_raw) * k))
    W_target = int(np.round(float(W_raw) * k))

    if cv2 is not None:
        upscale_methods = {
            "INTER_NEAREST": cv2.INTER_NEAREST,
            "INTER_LINEAR": cv2.INTER_LINEAR,
            "INTER_AREA": cv2.INTER_AREA,
            "INTER_CUBIC": cv2.INTER_CUBIC,
            "INTER_LANCZOS4": cv2.INTER_LANCZOS4,
        }
        method = upscale_methods.get(upscale_method, cv2.INTER_CUBIC)
        img = cv2.resize(img, (W_target, H_target), interpolation=method if k > 1 else cv2.INTER_AREA)
    else:
        pil = Image.fromarray(img)
        resample = Image.BICUBIC if k > 1 else Image.LANCZOS
        pil = pil.resize((W_target, H_target), resample=resample)
        img = np.array(pil, dtype=np.uint8)

    H_pad, W_pad = pad64(H_target), pad64(W_target)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)

    def remove_pad(x: np.ndarray) -> np.ndarray:
        return safer_memory(x[:H_target, :W_target, ...])

    return safer_memory(img_padded), remove_pad


def pad_only_to_64(img_u8: np.ndarray, mode: str = "edge") -> Tuple[np.ndarray, Any]:
    img = HWC3(img_u8)
    H_raw, W_raw, _ = img.shape
    H_pad, W_pad = pad64(H_raw), pad64(W_raw)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)

    def remove_pad(x: np.ndarray) -> np.ndarray:
        return safer_memory(x[:H_raw, :W_raw, ...])

    return safer_memory(img_padded), remove_pad


def composite_rgba_over_white_keep_alpha(inp_u8: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    if inp_u8.ndim == 3 and inp_u8.shape[2] == 4:
        rgba = inp_u8.astype(np.uint8)
        rgb = rgba[:, :, 0:3].astype(np.float32)
        a = (rgba[:, :, 3:4].astype(np.float32) / 255.0)
        rgb_white = (rgb * a + 255.0 * (1.0 - a)).clip(0, 255).astype(np.uint8)
        alpha_u8 = rgba[:, :, 3].copy()
        return rgb_white, alpha_u8
    return HWC3(inp_u8), None


def apply_alpha_then_black_background(depth_rgb_u8: np.ndarray, alpha_u8: np.ndarray) -> np.ndarray:
    depth_rgb_u8 = HWC3(depth_rgb_u8)
    a = (alpha_u8.astype(np.float32) / 255.0)[:, :, None]
    out = (depth_rgb_u8.astype(np.float32) * a).clip(0, 255).astype(np.uint8)
    return out


def comfy_tensor_to_u8(img: torch.Tensor) -> np.ndarray:
    if img.ndim == 4:
        img = img[0]
    arr = img.detach().cpu().float().clamp(0, 1).numpy()
    u8 = (arr * 255.0).round().astype(np.uint8)
    return u8


def u8_to_comfy_tensor(img_u8: np.ndarray) -> torch.Tensor:
    img_u8 = HWC3(img_u8)
    t = torch.from_numpy(img_u8.astype(np.float32) / 255.0)
    return t.unsqueeze(0)


def _try_load_pipeline(model_source: str, device: torch.device):
    if pipeline is None:
        raise RuntimeError(f"transformers import failed: {_TRANSFORMERS_IMPORT_ERROR}")

    key = (model_source, str(device))
    with _PIPE_LOCK:
        if key in _PIPE_CACHE:
            return _PIPE_CACHE[key]

        p = pipeline(task="depth-estimation", model=model_source)
        try:
            p.model = p.model.to(device)
            p.device = device
        except Exception:
            pass

        _PIPE_CACHE[key] = p
        return p


def get_depth_pipeline(device: torch.device):
    if ensure_local_model_files():
        try:
            return _try_load_pipeline(str(MODEL_DIR), device)
        except Exception:
            pass
    try:
        return _try_load_pipeline(ZOE_FALLBACK_REPO_ID, device)
    except Exception:
        return None


def depth_estimate_zoe_style(
    pipe,
    input_rgb_u8: np.ndarray,
    detect_resolution: int,
    upscale_method: str = "INTER_CUBIC",
) -> np.ndarray:
    if detect_resolution == -1:
        work_img, remove_pad = pad_only_to_64(input_rgb_u8, mode="edge")
    else:
        work_img, remove_pad = resize_image_with_pad_min_side(
            input_rgb_u8,
            int(detect_resolution),
            upscale_method=upscale_method,
            skip_hwc3=False,
            mode="edge",
        )

    pil_image = Image.fromarray(work_img)

    with torch.no_grad():
        result = pipe(pil_image)
        depth = result["depth"]

        if isinstance(depth, Image.Image):
            depth_array = np.array(depth, dtype=np.float32)
        else:
            depth_array = np.array(depth, dtype=np.float32)

        vmin = float(np.percentile(depth_array, 2))
        vmax = float(np.percentile(depth_array, 85))

        depth_array = depth_array - vmin
        denom = (vmax - vmin)
        if abs(denom) < 1e-12:
            denom = 1e-6
        depth_array = depth_array / denom

        depth_array = 1.0 - depth_array
        depth_image = (depth_array * 255.0).clip(0, 255).astype(np.uint8)

    detected_map = remove_pad(HWC3(depth_image))
    return detected_map


def resize_to_original(depth_rgb_u8: np.ndarray, w0: int, h0: int) -> np.ndarray:
    try:
        import cv2
        out = cv2.resize(depth_rgb_u8, (w0, h0), interpolation=cv2.INTER_LINEAR)
        return out.astype(np.uint8)
    except Exception:
        pil = Image.fromarray(depth_rgb_u8)
        pil = pil.resize((w0, h0), resample=Image.BILINEAR)
        return np.array(pil, dtype=np.uint8)


def _salia_depth_execute(image: torch.Tensor, resolution: int = -1) -> torch.Tensor:
    try:
        device = model_management.get_torch_device()
    except Exception:
        device = torch.device("cpu")

    pipe_obj = None
    try:
        pipe_obj = get_depth_pipeline(device)
    except Exception:
        pipe_obj = None

    if pipe_obj is None:
        return image

    if image.ndim == 3:
        image = image.unsqueeze(0)

    outs = []
    for i in range(image.shape[0]):
        try:
            h0 = int(image[i].shape[0])
            w0 = int(image[i].shape[1])

            inp_u8 = comfy_tensor_to_u8(image[i])

            rgb_for_depth, alpha_u8 = composite_rgba_over_white_keep_alpha(inp_u8)
            had_rgba = alpha_u8 is not None

            depth_rgb = depth_estimate_zoe_style(
                pipe=pipe_obj,
                input_rgb_u8=rgb_for_depth,
                detect_resolution=int(resolution),
                upscale_method="INTER_CUBIC",
            )

            depth_rgb = resize_to_original(depth_rgb, w0=w0, h0=h0)

            if had_rgba:
                if alpha_u8.shape[0] != h0 or alpha_u8.shape[1] != w0:
                    try:
                        import cv2
                        alpha_u8 = cv2.resize(alpha_u8, (w0, h0), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
                    except Exception:
                        pil_a = Image.fromarray(alpha_u8)
                        pil_a = pil_a.resize((w0, h0), resample=Image.BILINEAR)
                        alpha_u8 = np.array(pil_a, dtype=np.uint8)

                depth_rgb = apply_alpha_then_black_background(depth_rgb, alpha_u8)

            outs.append(u8_to_comfy_tensor(depth_rgb))
        except Exception:
            outs.append(image[i].unsqueeze(0))

    return torch.cat(outs, dim=0)


def _salia_alpha_over_region(base: torch.Tensor, overlay_rgba: torch.Tensor, x: int, y: int) -> torch.Tensor:
    if base.ndim != 4 or overlay_rgba.ndim != 4:
        raise ValueError("base and overlay must be [B,H,W,C].")

    B, H, W, C = base.shape
    b2, sH, sW, c2 = overlay_rgba.shape
    if c2 != 4:
        raise ValueError("overlay_rgba must have 4 channels (RGBA).")
    if sH != sW:
        raise ValueError("overlay must be square.")
    s = sH

    if x < 0 or y < 0 or x + s > W or y + s > H:
        raise ValueError(f"Square paste out of bounds. base={W}x{H}, paste at ({x},{y}) size={s}")

    if b2 != B:
        if b2 == 1 and B > 1:
            overlay_rgba = overlay_rgba.expand(B, -1, -1, -1)
        else:
            raise ValueError("Batch mismatch between base and overlay.")

    out = base.clone()

    overlay_rgb = overlay_rgba[..., 0:3].clamp(0, 1)
    overlay_a = overlay_rgba[..., 3:4].clamp(0, 1)

    base_rgb = out[:, y:y + s, x:x + s, 0:3]
    comp_rgb = overlay_rgb * overlay_a + base_rgb * (1.0 - overlay_a)
    out[:, y:y + s, x:x + s, 0:3] = comp_rgb

    if C == 4:
        base_a = out[:, y:y + s, x:x + s, 3:4].clamp(0, 1)
        comp_a = overlay_a + base_a * (1.0 - overlay_a)
        out[:, y:y + s, x:x + s, 3:4] = comp_a

    return out.clamp(0, 1)


_HARDCODED_CKPT_NAME = "SaliaHighlady_Speedy.safetensors"
_HARDCODED_CONTROLNET_NAME = "diffusion_pytorch_model_promax.safetensors"
_HARDCODED_CN_START = 0.00
_HARDCODED_CN_END = 1.00

_PASS1_SAMPLER_NAME = "dpmpp_2m_sde_heun_gpu"
_PASS1_SCHEDULER = "karras"
_PASS1_STEPS = 29
_PASS1_CFG = 2.6
_PASS1_CONTROLNET_STRENGTH = 0.33

_PASS2_SAMPLER_NAME = "res_multistep_ancestral_cfg_pp"
_PASS2_SCHEDULER = "karras"
_PASS2_STEPS = 30
_PASS2_CFG = 1.7
_PASS2_CONTROLNET_STRENGTH = 0.5


class Salia_ezpz_gated_Duo2:
    CATEGORY = "image/salia"
    RETURN_TYPES = ("IMAGE", "IMAGE")
    RETURN_NAMES = ("image", "image_cropped")
    FUNCTION = "run"

    @classmethod
    def INPUT_TYPES(cls):
        assets = _list_asset_pngs() or ["<no pngs found>"]
        upscale_choices = ["1", "2", "4", "6", "8", "10", "12", "14", "16"]
        return {
            "required": {
                "image": ("IMAGE",),
                "trigger_string": ("STRING", {"default": ""}),
                "X_coord": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 1}),
                "Y_coord": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 1}),
                "positive_prompt": ("STRING", {"default": "", "multiline": True}),
                "negative_prompt": ("STRING", {"default": "", "multiline": True}),
                "asset_image": (assets, {}),
                "square_size_1": ("INT", {"default": 384, "min": 8, "max": 8192, "step": 1}),
                "upscale_factor_1": (upscale_choices, {"default": "4"}),
                "denoise_1": ("FLOAT", {"default": 0.35, "min": 0.00, "max": 1.00, "step": 0.01}),
                "square_size_2": ("INT", {"default": 384, "min": 8, "max": 8192, "step": 1}),
                "upscale_factor_2": (upscale_choices, {"default": "4"}),
                "denoise_2": ("FLOAT", {"default": 0.35, "min": 0.00, "max": 1.00, "step": 0.01}),
            }
        }

    def run(
        self,
        image: torch.Tensor,
        trigger_string: str = "",
        X_coord: int = 0,
        Y_coord: int = 0,
        positive_prompt: str = "",
        negative_prompt: str = "",
        asset_image: str = "",
        square_size_1: int = 384,
        upscale_factor_1: str = "4",
        denoise_1: float = 0.35,
        square_size_2: int = 384,
        upscale_factor_2: str = "4",
        denoise_2: float = 0.35,
    ):
        if image.ndim == 3:
            image = image.unsqueeze(0)
        if image.ndim != 4:
            raise ValueError("Input image must be [B,H,W,C].")

        B, H, W, C = image.shape
        if C not in (3, 4):
            raise ValueError("Input image must have 3 (RGB) or 4 (RGBA) channels.")

        x = int(X_coord)
        y = int(Y_coord)
        s1 = int(square_size_1)
        s2 = int(square_size_2)

        def _validate_square_bounds(s: int, label: str):
            if s <= 0:
                raise ValueError(f"{label}: square_size must be > 0")
            if x < 0 or y < 0 or x + s > W or y + s > H:
                raise ValueError(f"{label}: out of bounds. image={W}x{H}, rect at ({x},{y}) size={s}")

        def _validate_upscale(up: int, s: int, label: str):
            if up not in (1, 2, 4, 6, 8, 10, 12, 14, 16):
                raise ValueError(f"{label}: upscale_factor must be one of 1,2,4,6,8,10,12,14,16")
            if ((s * up) % 8) != 0:
                raise ValueError(f"{label}: square_size * upscale_factor must be divisible by 8 (VAE requirement).")

        def _crop_square(img: torch.Tensor, s: int) -> torch.Tensor:
            return img[:, y:y + s, x:x + s, :]

        _validate_square_bounds(s2, "final crop (square_size_2)")

        if trigger_string == "":
            out2 = image
            cropped = _crop_square(out2, s2)
            return (out2, cropped)

        _validate_square_bounds(s1, "pass1 (square_size_1)")
        _validate_square_bounds(s2, "pass2 (square_size_2)")

        up1 = int(upscale_factor_1)
        up2 = int(upscale_factor_2)
        _validate_upscale(up1, s1, "pass1")
        _validate_upscale(up2, s2, "pass2")

        d1 = float(max(0.0, min(1.0, denoise_1)))
        d2 = float(max(0.0, min(1.0, denoise_2)))

        if asset_image == "<no pngs found>":
            raise FileNotFoundError("No PNGs found in assets/images for this plugin.")
        _asset_img_unused, asset_mask = _load_asset_image_and_mask(asset_image)

        if asset_mask.ndim == 2:
            asset_mask = asset_mask.unsqueeze(0)
        if asset_mask.ndim != 3:
            raise ValueError("Asset mask must be [B,H,W].")

        if asset_mask.shape[0] != B:
            if asset_mask.shape[0] == 1 and B > 1:
                asset_mask = asset_mask.expand(B, -1, -1)
            else:
                raise ValueError("Batch mismatch for asset mask vs input image batch.")

        import nodes

        try:
            model, clip, vae = _load_checkpoint_cached(_HARDCODED_CKPT_NAME)
        except Exception as e:
            available = folder_paths.get_filename_list("checkpoints") or []
            raise FileNotFoundError(
                f"Hardcoded ckpt not found: '{_HARDCODED_CKPT_NAME}'. "
                f"Put it in models/checkpoints. Available (first 50): {available[:50]}"
            ) from e

        try:
            controlnet = _load_controlnet_cached(_HARDCODED_CONTROLNET_NAME)
        except Exception as e:
            available = folder_paths.get_filename_list("controlnet") or []
            raise FileNotFoundError(
                f"Hardcoded controlnet not found: '{_HARDCODED_CONTROLNET_NAME}'. "
                f"Put it in models/controlnet. Available (first 50): {available[:50]}"
            ) from e

        pos_enc = nodes.CLIPTextEncode()
        neg_enc = nodes.CLIPTextEncode()
        pos_fn = getattr(pos_enc, pos_enc.FUNCTION)
        neg_fn = getattr(neg_enc, neg_enc.FUNCTION)
        (pos_cond,) = pos_fn(text=str(positive_prompt), clip=clip)
        (neg_cond,) = neg_fn(text=str(negative_prompt), clip=clip)

        cn_apply = nodes.ControlNetApplyAdvanced()
        cn_fn = getattr(cn_apply, cn_apply.FUNCTION)
        vae_enc = nodes.VAEEncode()
        vae_enc_fn = getattr(vae_enc, vae_enc.FUNCTION)
        ksampler = nodes.KSampler()
        k_fn = getattr(ksampler, ksampler.FUNCTION)
        vae_dec = nodes.VAEDecode()
        vae_dec_fn = getattr(vae_dec, vae_dec.FUNCTION)

        def _run_pass(
            pass_index: int,
            in_image: torch.Tensor,
            s: int,
            up: int,
            denoise_v: float,
            steps_v: int,
            cfg_v: float,
            sampler_v: str,
            scheduler_v: str,
            controlnet_strength_v: float,
        ) -> torch.Tensor:
            up_w = s * up
            up_h = s * up

            crop = in_image[:, y:y + s, x:x + s, :]
            crop_rgb = crop[:, :, :, 0:3].contiguous()

            depth_small = _salia_depth_execute(crop_rgb, resolution=s)
            depth_up = _resize_image_lanczos(depth_small, up_w, up_h)

            crop_up = _resize_image_lanczos(crop_rgb, up_w, up_h)

            asset_mask_up = _resize_mask_lanczos(asset_mask, up_w, up_h)

            pos_cn, neg_cn = cn_fn(
                strength=float(controlnet_strength_v),
                start_percent=float(_HARDCODED_CN_START),
                end_percent=float(_HARDCODED_CN_END),
                positive=pos_cond,
                negative=neg_cond,
                control_net=controlnet,
                image=depth_up,
                vae=vae,
            )

            (latent,) = vae_enc_fn(pixels=crop_up, vae=vae)

            seed_material = (
                f"{_HARDCODED_CKPT_NAME}|{_HARDCODED_CONTROLNET_NAME}|{asset_image}|"
                f"pass={pass_index}|x={x}|y={y}|s={s}|up={up}|"
                f"steps={steps_v}|cfg={cfg_v}|sampler={sampler_v}|scheduler={scheduler_v}|denoise={denoise_v}|"
                f"cn_strength={controlnet_strength_v}|"
                f"{positive_prompt}|{negative_prompt}"
            ).encode("utf-8", errors="ignore")
            seed64 = int(hashlib.sha256(seed_material).hexdigest()[:16], 16)

            (sampled_latent,) = k_fn(
                seed=seed64,
                steps=int(steps_v),
                cfg=float(cfg_v),
                sampler_name=str(sampler_v),
                scheduler=str(scheduler_v),
                denoise=float(denoise_v),
                model=model,
                positive=pos_cn,
                negative=neg_cn,
                latent_image=latent,
            )

            (decoded_rgb,) = vae_dec_fn(samples=sampled_latent, vae=vae)

            rgba_up = _rgb_to_rgba_with_comfy_mask(decoded_rgb, asset_mask_up)
            rgba_square = _resize_image_lanczos(rgba_up, s, s)
            out = _salia_alpha_over_region(in_image, rgba_square, x=x, y=y)
            return out

        out1 = _run_pass(
            pass_index=1,
            in_image=image,
            s=s1,
            up=up1,
            denoise_v=d1,
            steps_v=_PASS1_STEPS,
            cfg_v=_PASS1_CFG,
            sampler_v=_PASS1_SAMPLER_NAME,
            scheduler_v=_PASS1_SCHEDULER,
            controlnet_strength_v=_PASS1_CONTROLNET_STRENGTH,
        )

        out2 = _run_pass(
            pass_index=2,
            in_image=out1,
            s=s2,
            up=up2,
            denoise_v=d2,
            steps_v=_PASS2_STEPS,
            cfg_v=_PASS2_CFG,
            sampler_v=_PASS2_SAMPLER_NAME,
            scheduler_v=_PASS2_SCHEDULER,
            controlnet_strength_v=_PASS2_CONTROLNET_STRENGTH,
        )

        cropped = out2[:, y:y + s2, x:x + s2, :]
        return (out2, cropped)


# ======================================================================================
# apply_segment_4 (standalone, embedded) - rename internal alpha paste helper to avoid clash
# ======================================================================================

# Expects: <this_file_dir>/assets/images/*.png
_AP4_ASSETS_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets", "images")


def ap4_list_pngs() -> List[str]:
    if not os.path.isdir(_AP4_ASSETS_DIR):
        return []
    files: List[str] = []
    for root, _, fnames in os.walk(_AP4_ASSETS_DIR):
        for f in fnames:
            if f.lower().endswith(".png"):
                full = os.path.join(root, f)
                if os.path.isfile(full):
                    rel = os.path.relpath(full, _AP4_ASSETS_DIR)
                    files.append(rel.replace("\\", "/"))
    return sorted(files)


def ap4_safe_path(filename: str) -> str:
    candidate = os.path.join(_AP4_ASSETS_DIR, filename)
    real_assets = os.path.realpath(_AP4_ASSETS_DIR)
    real_candidate = os.path.realpath(candidate)
    if not real_candidate.startswith(real_assets + os.sep) and real_candidate != real_assets:
        raise ValueError("Unsafe path (path traversal detected).")
    return real_candidate


def ap4_file_hash(filename: str) -> str:
    path = ap4_safe_path(filename)
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def ap4_load_image_from_assets(filename: str) -> Tuple[torch.Tensor, torch.Tensor]:
    path = ap4_safe_path(filename)
    i = Image.open(path)
    i = ImageOps.exif_transpose(i)

    if i.mode == "I":
        i = i.point(lambda px: px * (1 / 255))

    rgb = i.convert("RGB")
    rgb_np = np.array(rgb).astype(np.float32) / 255.0
    image = torch.from_numpy(rgb_np)[None, ...]

    bands = i.getbands()
    if "A" in bands:
        a = np.array(i.getchannel("A")).astype(np.float32) / 255.0
        alpha = torch.from_numpy(a)
    else:
        l = np.array(i.convert("L")).astype(np.float32) / 255.0
        alpha = torch.from_numpy(l)

    mask = 1.0 - alpha
    mask = mask.clamp(0.0, 1.0).unsqueeze(0)
    return image, mask


def ap4_as_image(img: torch.Tensor) -> torch.Tensor:
    if not isinstance(img, torch.Tensor):
        raise TypeError("IMAGE must be a torch.Tensor")
    if img.dim() != 4:
        raise ValueError(f"Expected IMAGE shape [B,H,W,C], got {tuple(img.shape)}")
    if img.shape[-1] not in (3, 4):
        raise ValueError(f"Expected IMAGE channels 3 (RGB) or 4 (RGBA), got C={img.shape[-1]}")
    return img


def ap4_as_mask(mask: torch.Tensor) -> torch.Tensor:
    if not isinstance(mask, torch.Tensor):
        raise TypeError("MASK must be a torch.Tensor")
    if mask.dim() == 2:
        mask = mask.unsqueeze(0)
    if mask.dim() != 3:
        raise ValueError(f"Expected MASK shape [B,H,W] or [H,W], got {tuple(mask.shape)}")
    return mask


def ap4_ensure_rgba(img: torch.Tensor) -> torch.Tensor:
    img = ap4_as_image(img)
    if img.shape[-1] == 4:
        return img
    B, H, W, _ = img.shape
    alpha = torch.ones((B, H, W, 1), device=img.device, dtype=img.dtype)
    return torch.cat([img, alpha], dim=-1)


def ap4_alpha_over_region(overlay: torch.Tensor, canvas: torch.Tensor, x: int, y: int) -> torch.Tensor:
    overlay = ap4_as_image(overlay)
    canvas = ap4_as_image(canvas)

    if overlay.shape[0] != canvas.shape[0]:
        if overlay.shape[0] == 1 and canvas.shape[0] > 1:
            overlay = overlay.expand(canvas.shape[0], *overlay.shape[1:])
        elif canvas.shape[0] == 1 and overlay.shape[0] > 1:
            canvas = canvas.expand(overlay.shape[0], *canvas.shape[1:])
        else:
            raise ValueError(f"Batch mismatch: overlay {overlay.shape[0]} vs canvas {canvas.shape[0]}")

    _, Hc, Wc, Cc = canvas.shape
    _, Ho, Wo, _ = overlay.shape

    x = int(x)
    y = int(y)

    out = canvas.clone()

    x0c = max(0, x)
    y0c = max(0, y)
    x1c = min(Wc, x + Wo)
    y1c = min(Hc, y + Ho)

    if x1c <= x0c or y1c <= y0c:
        return out

    x0o = x0c - x
    y0o = y0c - y
    x1o = x0o + (x1c - x0c)
    y1o = y0o + (y1c - y0c)

    canvas_region = out[:, y0c:y1c, x0c:x1c, :]
    overlay_region = overlay[:, y0o:y1o, x0o:x1o, :]

    canvas_rgba = ap4_ensure_rgba(canvas_region)
    overlay_rgba = ap4_ensure_rgba(overlay_region)

    over_rgb = overlay_rgba[..., :3].clamp(0.0, 1.0)
    over_a = overlay_rgba[..., 3:4].clamp(0.0, 1.0)

    under_rgb = canvas_rgba[..., :3].clamp(0.0, 1.0)
    under_a = canvas_rgba[..., 3:4].clamp(0.0, 1.0)

    over_pm = over_rgb * over_a
    under_pm = under_rgb * under_a

    out_a = over_a + under_a * (1.0 - over_a)
    out_pm = over_pm + under_pm * (1.0 - over_a)

    eps = 1e-6
    out_rgb = torch.where(out_a > eps, out_pm / (out_a + eps), torch.zeros_like(out_pm))
    out_rgb = out_rgb.clamp(0.0, 1.0)
    out_a = out_a.clamp(0.0, 1.0)

    if Cc == 3:
        out[:, y0c:y1c, x0c:x1c, :] = out_rgb
    else:
        out[:, y0c:y1c, x0c:x1c, :] = torch.cat([out_rgb, out_a], dim=-1)

    return out


class AP4_AILab_MaskCombiner_Exact:
    def combine_masks(self, mask_1, mode="combine", mask_2=None, mask_3=None, mask_4=None):
        masks = [m for m in [mask_1, mask_2, mask_3, mask_4] if m is not None]
        if len(masks) <= 1:
            return (masks[0] if masks else torch.zeros((1, 64, 64), dtype=torch.float32),)

        ref_shape = masks[0].shape
        masks = [self._resize_if_needed(m, ref_shape) for m in masks]

        if mode == "combine":
            result = torch.maximum(masks[0], masks[1])
            for mask in masks[2:]:
                result = torch.maximum(result, mask)
        elif mode == "intersection":
            result = torch.minimum(masks[0], masks[1])
        else:
            result = torch.abs(masks[0] - masks[1])

        return (torch.clamp(result, 0, 1),)

    def _resize_if_needed(self, mask, target_shape):
        if mask.shape == target_shape:
            return mask

        if len(mask.shape) == 2:
            mask = mask.unsqueeze(0)
        elif len(mask.shape) == 4:
            mask = mask.squeeze(1)

        target_height = target_shape[-2] if len(target_shape) >= 2 else target_shape[0]
        target_width = target_shape[-1] if len(target_shape) >= 2 else target_shape[1]

        resized_masks = []
        for i in range(mask.shape[0]):
            mask_np = mask[i].cpu().numpy()
            img = Image.fromarray((mask_np * 255).astype(np.uint8))
            img_resized = img.resize((target_width, target_height), Image.LANCZOS)
            mask_resized = np.array(img_resized).astype(np.float32) / 255.0
            resized_masks.append(torch.from_numpy(mask_resized))

        return torch.stack(resized_masks)


def ap4_resize_mask_comfy(alpha_mask: torch.Tensor, image_shape_hwc: Tuple[int, int, int]) -> torch.Tensor:
    H = int(image_shape_hwc[0])
    W = int(image_shape_hwc[1])
    return F.interpolate(
        alpha_mask.reshape((-1, 1, alpha_mask.shape[-2], alpha_mask.shape[-1])),
        size=(H, W),
        mode="bilinear",
    ).squeeze(1)


def ap4_join_image_with_alpha_comfy(image: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
    image = ap4_as_image(image)
    alpha = ap4_as_mask(alpha)
    alpha = alpha.to(device=image.device, dtype=image.dtype)

    batch_size = min(len(image), len(alpha))
    out_images = []

    alpha_resized = 1.0 - ap4_resize_mask_comfy(alpha, image.shape[1:])

    for i in range(batch_size):
        out_images.append(torch.cat((image[i][:, :, :3], alpha_resized[i].unsqueeze(2)), dim=2))

    return torch.stack(out_images)


def ap4_try_get_comfy_model_management():
    try:
        import comfy.model_management as mm  # type: ignore
        return mm
    except Exception:
        return None


def ap4_gaussian_kernel_1d(kernel_size: int, sigma: float, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
    center = (kernel_size - 1) / 2.0
    xs = torch.arange(kernel_size, device=device, dtype=dtype) - center
    kernel = torch.exp(-(xs * xs) / (2.0 * sigma * sigma))
    kernel = kernel / kernel.sum()
    return kernel


def ap4_mask_blur(mask: torch.Tensor, amount: int = 8, device: str = "gpu") -> torch.Tensor:
    mask = ap4_as_mask(mask).clamp(0.0, 1.0)

    if amount == 0:
        return mask

    k = int(amount)
    if k % 2 == 0:
        k += 1

    sigma = 0.3 * (((k - 1) * 0.5) - 1.0) + 0.8

    mm = ap4_try_get_comfy_model_management()

    if device == "gpu":
        if mm is not None:
            proc_device = mm.get_torch_device()
        else:
            proc_device = torch.device("cuda") if torch.cuda.is_available() else mask.device
    elif device == "cpu":
        proc_device = torch.device("cpu")
    else:
        proc_device = mask.device

    out_device = mask.device
    if device in ("gpu", "cpu") and mm is not None:
        out_device = mm.intermediate_device()

    orig_dtype = mask.dtype
    x = mask.to(device=proc_device, dtype=torch.float32)

    _, H, W = x.shape
    pad = k // 2

    pad_mode = "reflect" if (H > pad and W > pad and H > 1 and W > 1) else "replicate"

    x4 = x.unsqueeze(1)
    x4 = F.pad(x4, (pad, pad, pad, pad), mode=pad_mode)

    kern1d = ap4_gaussian_kernel_1d(k, sigma, device=proc_device, dtype=torch.float32)
    w_h = kern1d.view(1, 1, 1, k)
    w_v = kern1d.view(1, 1, k, 1)

    x4 = F.conv2d(x4, w_h)
    x4 = F.conv2d(x4, w_v)

    out = x4.squeeze(1).clamp(0.0, 1.0)
    return out.to(device=out_device, dtype=orig_dtype)


def ap4_dilate_mask(mask: torch.Tensor, dilation: int = 3) -> torch.Tensor:
    mask = ap4_as_mask(mask).clamp(0.0, 1.0)
    dilation = int(dilation)
    if dilation == 0:
        return mask

    k = abs(dilation)
    x = mask.unsqueeze(1)

    if dilation > 0:
        y = F.max_pool2d(x, kernel_size=k, stride=1, padding=k // 2)
    else:
        y = -F.max_pool2d(-x, kernel_size=k, stride=1, padding=k // 2)

    return y.squeeze(1).clamp(0.0, 1.0)


def ap4_fill_holes_grayscale_numpy_heap(f: np.ndarray, connectivity: int = 8) -> np.ndarray:
    f = np.clip(f, 0.0, 1.0).astype(np.float32, copy=False)
    H, W = f.shape
    if H == 0 or W == 0:
        return f

    cost = np.full((H, W), np.inf, dtype=np.float32)
    finalized = np.zeros((H, W), dtype=np.bool_)
    heap: List[Tuple[float, int, int]] = []

    def push(y: int, x: int):
        c = float(f[y, x])
        if c < float(cost[y, x]):
            cost[y, x] = c
            heapq.heappush(heap, (c, y, x))

    for x in range(W):
        push(0, x)
        if H > 1:
            push(H - 1, x)
    for y in range(H):
        push(y, 0)
        if W > 1:
            push(y, W - 1)

    if connectivity == 4:
        neigh = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    else:
        neigh = [(-1, -1), (-1, 0), (-1, 1),
                 (0, -1),           (0, 1),
                 (1, -1),  (1, 0),  (1, 1)]

    eps = 1e-8
    while heap:
        c, y, x = heapq.heappop(heap)

        if finalized[y, x]:
            continue
        if c > float(cost[y, x]) + eps:
            continue

        finalized[y, x] = True

        for dy, dx in neigh:
            ny = y + dy
            nx = x + dx
            if ny < 0 or ny >= H or nx < 0 or nx >= W:
                continue
            if finalized[ny, nx]:
                continue

            v = float(f[ny, nx])
            nc = c if c >= v else v
            if nc < float(cost[ny, nx]) - eps:
                cost[ny, nx] = nc
                heapq.heappush(heap, (nc, ny, nx))

    return cost


def ap4_fill_holes_mask(mask: torch.Tensor) -> torch.Tensor:
    mask = ap4_as_mask(mask).clamp(0.0, 1.0)

    B, H, W = mask.shape
    device = mask.device
    dtype = mask.dtype

    mask_np = np.ascontiguousarray(mask.detach().cpu().numpy().astype(np.float32, copy=False))
    filled_np = np.empty_like(mask_np)

    try:
        from skimage.morphology import reconstruction  # type: ignore
        footprint = np.ones((3, 3), dtype=bool)

        for b in range(B):
            f = mask_np[b]
            seed = f.copy()

            if H > 2 and W > 2:
                seed[1:-1, 1:-1] = 1.0
            else:
                seed[:, :] = 1.0
                seed[0, :] = f[0, :]
                seed[-1, :] = f[-1, :]
                seed[:, 0] = f[:, 0]
                seed[:, -1] = f[:, -1]

            filled_np[b] = reconstruction(seed, f, method="erosion", footprint=footprint).astype(np.float32)

    except Exception:
        for b in range(B):
            filled_np[b] = ap4_fill_holes_grayscale_numpy_heap(mask_np[b], connectivity=8)

    out = torch.from_numpy(filled_np).to(device=device, dtype=dtype)
    return out.clamp(0.0, 1.0)


class apply_segment_4:
    CATEGORY = "image/salia"

    @classmethod
    def INPUT_TYPES(cls):
        choices = ap4_list_pngs() or ["<no pngs found>"]
        return {
            "required": {
                "mask": ("MASK",),
                "image": (choices, {}),
                "img": ("IMAGE",),
                "canvas": ("IMAGE",),
                "x": ("INT", {"default": 0, "min": -100000, "max": 100000, "step": 1}),
                "y": ("INT", {"default": 0, "min": -100000, "max": 100000, "step": 1}),
            }
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("Final_Image",)
    FUNCTION = "run"

    def run(self, mask, image, img, canvas, x, y):
        if image == "<no pngs found>":
            raise FileNotFoundError("No PNGs found in assets/images next to this node")

        mask_in = ap4_as_mask(mask).clamp(0.0, 1.0)

        blurred = ap4_mask_blur(mask_in, amount=8, device="gpu")
        dilated = ap4_dilate_mask(blurred, dilation=3)
        filled = ap4_fill_holes_mask(dilated)

        inversed_mask = 1.0 - filled

        _asset_img, loaded_mask = ap4_load_image_from_assets(image)

        combiner = AP4_AILab_MaskCombiner_Exact()

        inv_cpu = inversed_mask.detach().cpu()
        loaded_cpu = ap4_as_mask(loaded_mask).detach().cpu()

        (alpha_mask,) = combiner.combine_masks(inv_cpu, mode="combine", mask_2=(1.0 - loaded_cpu))
        alpha_mask = torch.clamp(alpha_mask, 0.0, 1.0)

        alpha_image = ap4_join_image_with_alpha_comfy(img, alpha_mask)

        canvas = ap4_as_image(canvas)
        alpha_image = alpha_image.to(device=canvas.device, dtype=canvas.dtype)
        final = ap4_alpha_over_region(alpha_image, canvas, x, y)

        return (final,)

    @classmethod
    def IS_CHANGED(cls, mask, image, img, canvas, x, y):
        if image == "<no pngs found>":
            return image
        return ap4_file_hash(image)

    @classmethod
    def VALIDATE_INPUTS(cls, mask, image, img, canvas, x, y):
        if image == "<no pngs found>":
            return "No PNGs found in assets/images next to this node"
        try:
            path = ap4_safe_path(image)
        except Exception as e:
            return str(e)
        if not os.path.isfile(path):
            return f"File not found in assets/images: {image}"
        return True


# ======================================================================================
# Fused node: Salia_ezpz_gated_Duo2 -> SAM3Segment (hardcoded) -> apply_segment_4
# ======================================================================================

class SAM3Segment_Salia:
    CATEGORY = "image/salia"
    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("Final_Image",)
    FUNCTION = "run"

    @classmethod
    def INPUT_TYPES(cls):
        # Use the exact dropdown sources of the embedded nodes
        salia_assets = _list_asset_pngs() or ["<no pngs found>"]
        ap4_assets = ap4_list_pngs() or ["<no pngs found>"]
        upscale_choices = ["1", "2", "4", "6", "8", "10", "12", "14", "16"]
        return {
            "required": {
                "image": ("IMAGE",),
                "trigger_string": ("STRING", {"default": ""}),

                "X_coord": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 1}),
                "Y_coord": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 1}),

                "positive_prompt": ("STRING", {"default": "", "multiline": True}),
                "negative_prompt": ("STRING", {"default": "", "multiline": True}),
                "prompt": ("STRING", {"default": "", "multiline": True, "placeholder": "SAM3 prompt"}),

                "asset_image": (salia_assets, {}),
                "apply_asset_image": (ap4_assets, {}),

                "square_size_1": ("INT", {"default": 384, "min": 8, "max": 8192, "step": 1}),
                "upscale_factor_1": (upscale_choices, {"default": "4"}),
                "denoise_1": ("FLOAT", {"default": 0.35, "min": 0.00, "max": 1.00, "step": 0.01}),

                "square_size_2": ("INT", {"default": 384, "min": 8, "max": 8192, "step": 1}),
                "upscale_factor_2": (upscale_choices, {"default": "4"}),
                "denoise_2": ("FLOAT", {"default": 0.35, "min": 0.00, "max": 1.00, "step": 0.01}),
            }
        }

    def __init__(self):
        self._sam3 = SAM3Segment()
        self._salia = Salia_ezpz_gated_Duo2()
        self._ap4 = apply_segment_4()

    def run(
        self,
        image,
        trigger_string="",
        X_coord=0,
        Y_coord=0,
        positive_prompt="",
        negative_prompt="",
        prompt="",
        asset_image="",
        apply_asset_image="",
        square_size_1=384,
        upscale_factor_1="4",
        denoise_1=0.35,
        square_size_2=384,
        upscale_factor_2="4",
        denoise_2=0.35,
    ):
        # EXACT bypass: if trigger_string is empty, return input image as Final_Image
        if trigger_string == "":
            return (image,)

        # 1) Pre-node: Salia_ezpz_gated_Duo2 -> image_cropped
        _out_image, image_cropped = self._salia.run(
            image=image,
            trigger_string=trigger_string,
            X_coord=int(X_coord),
            Y_coord=int(Y_coord),
            positive_prompt=str(positive_prompt),
            negative_prompt=str(negative_prompt),
            asset_image=str(asset_image),
            square_size_1=int(square_size_1),
            upscale_factor_1=str(upscale_factor_1),
            denoise_1=float(denoise_1),
            square_size_2=int(square_size_2),
            upscale_factor_2=str(upscale_factor_2),
            denoise_2=float(denoise_2),
        )

        # 2) Center: SAM3Segment with hardcoded settings on the CROPPED image
        seg_image, seg_mask, _mask_image = self._sam3.segment(
            image=image_cropped,
            prompt=str(prompt),
            sam3_model="sam3",
            device="GPU",
            confidence_threshold=0.50,
            mask_blur=0,
            mask_offset=0,
            invert_output=False,
            unload_model=False,
            background="Alpha",
            background_color="#222222",
        )

        # 3) Post-node: apply_segment_4 onto ORIGINAL input canvas (not Duo2 output)
        (final_image,) = self._ap4.run(
            mask=seg_mask,
            image=str(apply_asset_image),
            img=seg_image,
            canvas=image,
            x=int(X_coord),
            y=int(Y_coord),
        )

        return (final_image,)


# ======================================================================================
# Node mappings (all nodes in this file)
# ======================================================================================

NODE_CLASS_MAPPINGS = {
    "SAM3Segment": SAM3Segment,
    "Salia_ezpz_gated_Duo2": Salia_ezpz_gated_Duo2,
    "apply_segment_4": apply_segment_4,
    "SAM3Segment_Salia": SAM3Segment_Salia,
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "SAM3Segment": "SAM3 Segmentation (RMBG)",
    "Salia_ezpz_gated_Duo2": "Salia_ezpz_gated_Duo2",
    "apply_segment_4": "apply_segment_4",
    "SAM3Segment_Salia": "SAM3Segment_Salia (Duo2 → SAM3 → apply_segment_4)",
}