File size: 21,766 Bytes
79a1f7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d625c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
import os
import shutil
import urllib.request
from pathlib import Path
from typing import Dict, Tuple, Any, Optional, List

import numpy as np
import torch
from PIL import Image

import comfy.model_management as model_management

# transformers is required for the depth-estimation pipeline. Defer an import
# failure to pipeline-creation time so the node module still loads and the
# error can be surfaced in the UI log instead of crashing ComfyUI startup.
try:
    from transformers import pipeline
except Exception as e:
    pipeline = None
    _TRANSFORMERS_IMPORT_ERROR = e  # re-raised by _try_load_pipeline with context


# --------------------------------------------------------------------------------------
# Paths / sources
# --------------------------------------------------------------------------------------

# This file: comfyui-salia_online/nodes/Salia_Depth.py
# Plugin root: comfyui-salia_online/
PLUGIN_ROOT = Path(__file__).resolve().parent.parent

# Local model directory: <plugin>/assets/depth.
# NOTE: the directory is created at import time (module-load side effect).
MODEL_DIR = PLUGIN_ROOT / "assets" / "depth"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Files required for transformers to load the model from MODEL_DIR, mapped to
# their download URLs. Checked/downloaded by ensure_local_model_files().
REQUIRED_FILES = {
    "config.json": "https://huggingface.co/saliacoel/depth/resolve/main/config.json",
    "model.safetensors": "https://huggingface.co/saliacoel/depth/resolve/main/model.safetensors",
    "preprocessor_config.json": "https://huggingface.co/saliacoel/depth/resolve/main/preprocessor_config.json",
}

# Hub repo used when the local files are missing or fail to load.
ZOE_FALLBACK_REPO_ID = "Intel/zoedepth-nyu-kitti"


# --------------------------------------------------------------------------------------
# Logging helpers
# --------------------------------------------------------------------------------------

def _make_logger() -> Tuple[List[str], Any]:
    lines: List[str] = []

    def log(msg: str):
        # console
        try:
            print(msg)
        except Exception:
            pass
        # UI string
        lines.append(str(msg))

    return lines, log


def _fmt_bytes(n: Optional[int]) -> str:
    if n is None:
        return "?"
    # simple readable
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.0f}{unit}"
        n /= 1024.0
    return f"{n:.1f}PB"


def _file_size(path: Path) -> Optional[int]:
    try:
        return path.stat().st_size
    except Exception:
        return None


def _hf_cache_info() -> Dict[str, str]:
    info: Dict[str, str] = {}
    info["env.HF_HOME"] = os.environ.get("HF_HOME", "")
    info["env.HF_HUB_CACHE"] = os.environ.get("HF_HUB_CACHE", "")
    info["env.TRANSFORMERS_CACHE"] = os.environ.get("TRANSFORMERS_CACHE", "")
    info["env.HUGGINGFACE_HUB_CACHE"] = os.environ.get("HUGGINGFACE_HUB_CACHE", "")

    try:
        from huggingface_hub import constants as hf_constants
        # These exist in most hub versions:
        info["huggingface_hub.constants.HF_HOME"] = str(getattr(hf_constants, "HF_HOME", ""))
        info["huggingface_hub.constants.HF_HUB_CACHE"] = str(getattr(hf_constants, "HF_HUB_CACHE", ""))
    except Exception:
        pass

    return info


# --------------------------------------------------------------------------------------
# Download helpers
# --------------------------------------------------------------------------------------

def _have_required_files() -> bool:
    """True when every file listed in REQUIRED_FILES exists under MODEL_DIR."""
    for name in REQUIRED_FILES:
        if not (MODEL_DIR / name).exists():
            return False
    return True


def _download_url_to_file(url: str, dst: Path, timeout: int = 180) -> None:
    """
    Fetch *url* into *dst*, writing through a ``.tmp`` file so the final
    rename is atomic and a crashed download never leaves a partial *dst*.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dst.with_suffix(dst.suffix + ".tmp")

    # Clear a stale partial download from a previous run, if any.
    if tmp_path.exists():
        try:
            tmp_path.unlink()
        except Exception:
            pass

    request = urllib.request.Request(url, headers={"User-Agent": "ComfyUI-SaliaDepth/1.1"})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        with open(tmp_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)

    tmp_path.replace(dst)


def ensure_local_model_files(log) -> bool:
    """
    Make sure assets/depth contains the three required model files.

    Always logs the expected local paths and remote URLs, downloads whatever
    is missing, and returns True when all files are present afterwards.
    """
    log("[SaliaDepth] ===== Local model file check =====")
    log(f"[SaliaDepth] Plugin root: {PLUGIN_ROOT}")
    log(f"[SaliaDepth] Local model dir (on drive): {MODEL_DIR}")

    # Report status of every required file up front, download or not.
    for name, url in REQUIRED_FILES.items():
        target = MODEL_DIR / name
        present = target.exists()
        byte_count = _file_size(target) if present else None
        log(f"[SaliaDepth]   - {name}")
        log(f"[SaliaDepth]       local path: {target}  exists={present}  size={_fmt_bytes(byte_count)}")
        log(f"[SaliaDepth]       remote url : {url}")

    if _have_required_files():
        log("[SaliaDepth] All required local files already exist. No download needed.")
        return True

    log("[SaliaDepth] One or more local files missing. Attempting download...")

    try:
        missing = [(n, u) for n, u in REQUIRED_FILES.items() if not (MODEL_DIR / n).exists()]
        for name, url in missing:
            target = MODEL_DIR / name
            log(f"[SaliaDepth] Downloading '{name}' -> '{target}'")
            _download_url_to_file(url, target)
            log(f"[SaliaDepth] Downloaded '{name}' size={_fmt_bytes(_file_size(target))}")

        all_present = _have_required_files()
        log(f"[SaliaDepth] Download finished. ok={all_present}")
        return all_present
    except Exception as e:
        log(f"[SaliaDepth] Download failed with error: {repr(e)}")
        return False


# --------------------------------------------------------------------------------------
# Exact Zoe-style preprocessing helpers (copied/adapted from your snippet)
# --------------------------------------------------------------------------------------

def HWC3(x: np.ndarray) -> np.ndarray:
    """Coerce a uint8 image to 3-channel HWC; RGBA is composited over white."""
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    channels = x.shape[2]
    assert channels in (1, 3, 4)

    if channels == 3:
        return x
    if channels == 1:
        # Replicate the single gray channel into R, G, B.
        return np.repeat(x, 3, axis=2)

    # channels == 4: alpha-composite over a white background.
    rgb = x[:, :, 0:3].astype(np.float32)
    alpha = x[:, :, 3:4].astype(np.float32) / 255.0
    blended = rgb * alpha + 255.0 * (1.0 - alpha)
    return blended.clip(0, 255).astype(np.uint8)


def pad64(x: int) -> int:
    """Padding needed to round *x* up to the next multiple of 64 (0 if aligned)."""
    return (64 - (x % 64)) % 64


def safer_memory(x: np.ndarray) -> np.ndarray:
    """
    Return an independent, C-contiguous copy of *x*.

    The original chained ``ascontiguousarray(x.copy()).copy()``, which
    duplicates the buffer two or three times; a single C-ordered copy gives
    the same observable result (fresh memory, C-contiguous, same dtype/values)
    with one allocation.
    """
    return np.array(x, order="C")


def resize_image_with_pad_min_side(
    input_image: np.ndarray,
    resolution: int,
    upscale_method: str = "INTER_CUBIC",
    skip_hwc3: bool = False,
    mode: str = "edge",
    log=None
) -> Tuple[np.ndarray, Any]:
    """
    EXACT behavior like your zoe.transformers.py:
      k = resolution / min(H,W)
      resize to (W_target, H_target)
      pad to multiple of 64
      return padded image and remove_pad() closure

    Args:
        input_image: uint8 image, HWC (or HW when skip_hwc3 is False).
        resolution: target size for the shorter side; <= 0 returns the image
            unchanged with an identity remove_pad (the -1 "keep size" path is
            handled separately by pad_only_to_64).
        upscale_method: cv2 interpolation name; only honored when upscaling
            (k > 1) and cv2 is importable.
        skip_hwc3: bypass the HWC3 channel normalization.
        mode: np.pad mode used for the pad-to-64 border.
        log: optional logging callback for warnings.

    Returns:
        (padded_image, remove_pad) where remove_pad crops a same-padded array
        back to the pre-pad (resized) size.
    """
    # prefer cv2 like original for matching results
    cv2 = None
    try:
        import cv2 as _cv2
        cv2 = _cv2
    except Exception:
        cv2 = None
        if log:
            log("[SaliaDepth] WARN: cv2 not available; resizing will use PIL fallback (may change results).")

    if skip_hwc3:
        img = input_image
    else:
        img = HWC3(input_image)

    H_raw, W_raw, _ = img.shape
    if resolution <= 0:
        # keep original, but still pad to 64 (we will handle padding separately for -1 path)
        return img, (lambda x: x)

    # Scale so the SHORTER side hits `resolution`; both sides scale by k.
    k = float(resolution) / float(min(H_raw, W_raw))
    H_target = int(np.round(float(H_raw) * k))
    W_target = int(np.round(float(W_raw) * k))

    if cv2 is not None:
        upscale_methods = {
            "INTER_NEAREST": cv2.INTER_NEAREST,
            "INTER_LINEAR": cv2.INTER_LINEAR,
            "INTER_AREA": cv2.INTER_AREA,
            "INTER_CUBIC": cv2.INTER_CUBIC,
            "INTER_LANCZOS4": cv2.INTER_LANCZOS4,
        }
        method = upscale_methods.get(upscale_method, cv2.INTER_CUBIC)
        # INTER_AREA for downscaling (k <= 1), requested method for upscaling.
        img = cv2.resize(img, (W_target, H_target), interpolation=method if k > 1 else cv2.INTER_AREA)
    else:
        # PIL fallback. NOTE(review): this ignores `upscale_method` and uses
        # BICUBIC/LANCZOS, so results can differ slightly from the cv2 path.
        pil = Image.fromarray(img)
        resample = Image.BICUBIC if k > 1 else Image.LANCZOS
        pil = pil.resize((W_target, H_target), resample=resample)
        img = np.array(pil, dtype=np.uint8)

    # Pad bottom/right only, so remove_pad can crop with simple [:H, :W].
    H_pad, W_pad = pad64(H_target), pad64(W_target)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)

    def remove_pad(x: np.ndarray) -> np.ndarray:
        # Crop back to the pre-pad size and return a fresh contiguous copy.
        return safer_memory(x[:H_target, :W_target, ...])

    return safer_memory(img_padded), remove_pad


def pad_only_to_64(img_u8: np.ndarray, mode: str = "edge") -> Tuple[np.ndarray, Any]:
    """
    Keep the original resolution (resolution == -1 path) but pad height and
    width up to multiples of 64; remove_pad crops back to the original size.
    """
    rgb = HWC3(img_u8)
    height, width = rgb.shape[0], rgb.shape[1]

    padded = np.pad(
        rgb,
        [[0, pad64(height)], [0, pad64(width)], [0, 0]],
        mode=mode,
    )

    def remove_pad(x: np.ndarray) -> np.ndarray:
        return safer_memory(x[:height, :width, ...])

    return safer_memory(padded), remove_pad


# --------------------------------------------------------------------------------------
# RGBA rules (as you requested)
# --------------------------------------------------------------------------------------

def composite_rgba_over_white_keep_alpha(inp_u8: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Split an image into (rgb, alpha):
      * RGBA input -> RGB composited over WHITE, plus the alpha channel (uint8).
      * anything else -> 3-channel RGB via HWC3, with alpha = None.
    """
    is_rgba = inp_u8.ndim == 3 and inp_u8.shape[2] == 4
    if not is_rgba:
        return HWC3(inp_u8), None

    rgba = inp_u8.astype(np.uint8)
    color = rgba[:, :, 0:3].astype(np.float32)
    alpha = rgba[:, :, 3:4].astype(np.float32) / 255.0
    over_white = (color * alpha + 255.0 * (1.0 - alpha)).clip(0, 255).astype(np.uint8)
    return over_white, rgba[:, :, 3].copy()


def apply_alpha_then_black_background(depth_rgb_u8: np.ndarray, alpha_u8: np.ndarray) -> np.ndarray:
    """
    Composite the depth RGB over BLACK using *alpha_u8* as coverage.

    Conceptually: attach alpha -> RGBA -> composite on black -> back to RGB,
    which reduces to depth_rgb * (alpha / 255).
    """
    rgb = HWC3(depth_rgb_u8).astype(np.float32)
    weight = (alpha_u8.astype(np.float32) / 255.0)[:, :, None]
    return (rgb * weight).clip(0, 255).astype(np.uint8)


# --------------------------------------------------------------------------------------
# ComfyUI conversion helpers
# --------------------------------------------------------------------------------------

def comfy_tensor_to_u8(img: torch.Tensor) -> np.ndarray:
    """
    Convert a ComfyUI IMAGE tensor (float [0..1], [H,W,C] or [B,H,W,C])
    to a uint8 HWC array. Batched inputs contribute only their first frame.
    """
    frame = img[0] if img.ndim == 4 else img
    arr = frame.detach().cpu().float().clamp(0, 1).numpy()
    return (arr * 255.0).round().astype(np.uint8)


def u8_to_comfy_tensor(img_u8: np.ndarray) -> torch.Tensor:
    """Convert a uint8 image to a ComfyUI IMAGE tensor [1,H,W,C] in [0..1]."""
    rgb = HWC3(img_u8)
    scaled = torch.from_numpy(rgb.astype(np.float32) / 255.0)
    return scaled.unsqueeze(0)


# --------------------------------------------------------------------------------------
# Pipeline loading (local-first, then zoe fallback)
# --------------------------------------------------------------------------------------

# Process-wide cache of constructed pipelines so repeated node executions
# reuse an already-loaded model instead of reloading weights every run.
_PIPE_CACHE: Dict[Tuple[str, str], Any] = {}  # (model_source, device_str) -> pipeline


def _try_load_pipeline(model_source: str, device: torch.device, log):
    """
    Build (or fetch from cache) a transformers depth-estimation pipeline.

    Mirrors the Zoe node: the pipeline is created WITHOUT a device argument
    and the model is moved to *device* afterwards.
    """
    if pipeline is None:
        raise RuntimeError(f"transformers import failed: {_TRANSFORMERS_IMPORT_ERROR}")

    cache_key = (model_source, str(device))
    cached = _PIPE_CACHE.get(cache_key)
    if cached is not None:
        log(f"[SaliaDepth] Using cached pipeline for source='{model_source}' device='{device}'")
        return cached

    log(f"[SaliaDepth] Creating pipeline(task='depth-estimation', model='{model_source}')")
    pipe = pipeline(task="depth-estimation", model=model_source)

    # Move the model like ZoeDetector.to(); failure is non-fatal (CPU run).
    try:
        pipe.model = pipe.model.to(device)
        # Zoe code sets this; newer transformers uses torch.device internally.
        pipe.device = device
        log(f"[SaliaDepth] Moved pipeline model to device: {device}")
    except Exception as e:
        log(f"[SaliaDepth] WARN: Could not move pipeline model to device {device}: {repr(e)}")

    # Best-effort config dump to help debugging which model actually loaded.
    try:
        cfg = pipe.model.config
        log(f"[SaliaDepth] Model class: {pipe.model.__class__.__name__}")
        log(f"[SaliaDepth] Config class: {cfg.__class__.__name__}")
        log(f"[SaliaDepth] Config model_type: {getattr(cfg, 'model_type', '')}")
        log(f"[SaliaDepth] Config _name_or_path: {getattr(cfg, '_name_or_path', '')}")
    except Exception as e:
        log(f"[SaliaDepth] WARN: Could not log model config: {repr(e)}")

    _PIPE_CACHE[cache_key] = pipe
    return pipe


def get_depth_pipeline(device: torch.device, log):
    """
    Resolve a depth pipeline, local-first:
      1) ensure assets/depth files exist (downloading if missing)
      2) load from the local directory
      3) fall back to Intel/zoedepth-nyu-kitti
      4) return None when everything fails
    """
    # Log HF cache locations first — helps find where fallback downloads land.
    log("[SaliaDepth] ===== Hugging Face cache info (fallback path) =====")
    for name, value in _hf_cache_info().items():
        if value:
            log(f"[SaliaDepth] {name} = {value}")
    log(f"[SaliaDepth] Zoe fallback repo id: {ZOE_FALLBACK_REPO_ID}")

    # Local-first attempt.
    if ensure_local_model_files(log):
        try:
            log(f"[SaliaDepth] Trying LOCAL model from directory: {MODEL_DIR}")
            return _try_load_pipeline(str(MODEL_DIR), device, log)
        except Exception as e:
            log(f"[SaliaDepth] Local model load FAILED: {repr(e)}")

    # Hub fallback attempt.
    try:
        log(f"[SaliaDepth] Trying ZOE fallback model: {ZOE_FALLBACK_REPO_ID}")
        return _try_load_pipeline(ZOE_FALLBACK_REPO_ID, device, log)
    except Exception as e:
        log(f"[SaliaDepth] Zoe fallback load FAILED: {repr(e)}")

    return None


# --------------------------------------------------------------------------------------
# Depth inference (Zoe-style)
# --------------------------------------------------------------------------------------

def depth_estimate_zoe_style(
    pipe,
    input_rgb_u8: np.ndarray,
    detect_resolution: int,
    log,
    upscale_method: str = "INTER_CUBIC"
) -> np.ndarray:
    """
    Run depth estimation following the original ZoeDetector.__call__ logic.

    Args:
        pipe: transformers depth-estimation pipeline; called with a PIL image
            and expected to return a dict with a "depth" entry.
        input_rgb_u8: uint8 HWC RGB input image.
        detect_resolution: -1 keeps the original size (pad-to-64 only);
            otherwise the min side is resized to this value, then padded.
        log: logging callback taking a single string.
        upscale_method: cv2 interpolation name used when upscaling.

    Returns:
        uint8 RGB depth map cropped back (via remove_pad) to the unpadded size.
    """
    if detect_resolution == -1:
        work_img, remove_pad = pad_only_to_64(input_rgb_u8, mode="edge")
        log(f"[SaliaDepth] Preprocess: resolution=-1 (no resize), padded to 64. work={work_img.shape}")
    else:
        work_img, remove_pad = resize_image_with_pad_min_side(
            input_rgb_u8,
            int(detect_resolution),
            upscale_method=upscale_method,
            skip_hwc3=False,
            mode="edge",
            log=log
        )
        log(f"[SaliaDepth] Preprocess: min-side resized to {detect_resolution}, padded to 64. work={work_img.shape}")

    pil_image = Image.fromarray(work_img)

    with torch.no_grad():
        result = pipe(pil_image)
        # np.array handles both PIL.Image and array-like depth outputs, so the
        # original isinstance(depth, Image.Image) branch — which executed
        # identical code in both arms — is collapsed into one line.
        depth_array = np.array(result["depth"], dtype=np.float32)

        # EXACT percentile-based normalization like the Zoe code.
        vmin = float(np.percentile(depth_array, 2))
        vmax = float(np.percentile(depth_array, 85))

        log(f"[SaliaDepth] Depth raw stats: shape={depth_array.shape} vmin(p2)={vmin:.6f} vmax(p85)={vmax:.6f} mean={float(depth_array.mean()):.6f}")

        depth_array = depth_array - vmin
        denom = (vmax - vmin)
        if abs(denom) < 1e-12:
            # Constant depth map: avoid division by zero / NaNs.
            log("[SaliaDepth] WARN: vmax==vmin; forcing denom epsilon to avoid NaNs.")
            denom = 1e-6
        depth_array = depth_array / denom

        # EXACT invert like the Zoe code (near -> bright).
        depth_array = 1.0 - depth_array

        depth_image = (depth_array * 255.0).clip(0, 255).astype(np.uint8)

    detected_map = remove_pad(HWC3(depth_image))
    log(f"[SaliaDepth] Output (post-remove_pad): {detected_map.shape} dtype={detected_map.dtype}")
    return detected_map


def resize_to_original(depth_rgb_u8: np.ndarray, w0: int, h0: int, log) -> np.ndarray:
    """
    Resize the depth output back to the original input size (w0, h0).
    Prefers cv2 bilinear; falls back to PIL when cv2 is unavailable or fails.
    """
    try:
        import cv2
        resized = cv2.resize(depth_rgb_u8, (w0, h0), interpolation=cv2.INTER_LINEAR)
        return resized.astype(np.uint8)
    except Exception as e:
        log(f"[SaliaDepth] WARN: cv2 resize failed ({repr(e)}); using PIL.")
        pil_img = Image.fromarray(depth_rgb_u8)
        pil_img = pil_img.resize((w0, h0), resample=Image.BILINEAR)
        return np.array(pil_img, dtype=np.uint8)


# --------------------------------------------------------------------------------------
# ComfyUI Node
# --------------------------------------------------------------------------------------

class Salia_Depth_Preprocessor:
    """
    ComfyUI node: depth-map preprocessor using a local assets/depth model with
    a ZoeDepth hub fallback. Returns (IMAGE, STRING) where the string is the
    full run log. On any failure the input image is passed through unchanged
    so a workflow never hard-crashes on this node.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare node inputs for the ComfyUI graph editor."""
        return {
            "required": {
                "image": ("IMAGE",),
                # note: default -1, min -1 (-1 = keep original resolution)
                "resolution": ("INT", {"default": -1, "min": -1, "max": 8192, "step": 1}),
            }
        }

    # 2 outputs: image + log string
    RETURN_TYPES = ("IMAGE", "STRING")
    FUNCTION = "execute"
    CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"

    def execute(self, image, resolution=-1):
        """
        Run depth estimation on each image in the batch.

        Args:
            image: ComfyUI IMAGE tensor, [H,W,C] or [B,H,W,C], float [0..1].
            resolution: min-side detect resolution; -1 keeps the input size.

        Returns:
            (IMAGE tensor [B,H,W,C], newline-joined log string).
        """
        lines, log = _make_logger()
        log("[SaliaDepth] ==================================================")
        log("[SaliaDepth] SaliaDepthPreprocessor starting")
        log(f"[SaliaDepth] resolution input = {resolution}")

        # Get torch device (fall back to CPU if Comfy's manager is unavailable)
        try:
            device = model_management.get_torch_device()
        except Exception as e:
            device = torch.device("cpu")
            log(f"[SaliaDepth] WARN: model_management.get_torch_device failed: {repr(e)} -> using CPU")

        log(f"[SaliaDepth] torch device = {device}")

        # Load pipeline (local-first, hub fallback); never let loading raise.
        pipe = None
        try:
            pipe = get_depth_pipeline(device, log)
        except Exception as e:
            log(f"[SaliaDepth] ERROR: get_depth_pipeline crashed: {repr(e)}")
            pipe = None

        if pipe is None:
            log("[SaliaDepth] FATAL: No pipeline available. Returning input image unchanged.")
            return (image, "\n".join(lines))

        # Batch support: normalize single image [H,W,C] to [1,H,W,C]
        if image.ndim == 3:
            image = image.unsqueeze(0)

        outs = []
        for i in range(image.shape[0]):
            try:
                # Original dimensions (used to resize the result back)
                h0 = int(image[i].shape[0])
                w0 = int(image[i].shape[1])
                c0 = int(image[i].shape[2])
                log(f"[SaliaDepth] ---- Batch index {i} input shape = ({h0},{w0},{c0}) ----")

                inp_u8 = comfy_tensor_to_u8(image[i])

                # RGBA rule (pre): composite over white, keep alpha for later
                rgb_for_depth, alpha_u8 = composite_rgba_over_white_keep_alpha(inp_u8)
                had_rgba = alpha_u8 is not None
                log(f"[SaliaDepth] had_rgba={had_rgba}")

                # Run depth (Zoe-style preprocessing + normalization)
                depth_rgb = depth_estimate_zoe_style(
                    pipe=pipe,
                    input_rgb_u8=rgb_for_depth,
                    detect_resolution=int(resolution),
                    log=log,
                    upscale_method="INTER_CUBIC"
                )

                # Resize back to original input size
                depth_rgb = resize_to_original(depth_rgb, w0=w0, h0=h0, log=log)

                # RGBA rule (post): re-apply alpha over a black background
                if had_rgba:
                    # Use original alpha at original size.
                    # If alpha size differs, resize alpha to match.
                    if alpha_u8.shape[0] != h0 or alpha_u8.shape[1] != w0:
                        log("[SaliaDepth] Alpha size mismatch; resizing alpha to original size.")
                        try:
                            import cv2
                            alpha_u8 = cv2.resize(alpha_u8, (w0, h0), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
                        except Exception:
                            # PIL fallback when cv2 is unavailable
                            pil_a = Image.fromarray(alpha_u8)
                            pil_a = pil_a.resize((w0, h0), resample=Image.BILINEAR)
                            alpha_u8 = np.array(pil_a, dtype=np.uint8)

                    # "Put alpha on RGB turning it into RGBA, then put BLACK background behind it, then back to RGB"
                    depth_rgb = apply_alpha_then_black_background(depth_rgb, alpha_u8)
                    log("[SaliaDepth] Applied RGBA post-step (alpha + black background).")

                outs.append(u8_to_comfy_tensor(depth_rgb))

            except Exception as e:
                # Per-item failure: pass the original frame through so the
                # rest of the batch still produces output.
                log(f"[SaliaDepth] ERROR: Inference failed at batch index {i}: {repr(e)}")
                log("[SaliaDepth] Passing through original input image for this batch item.")
                outs.append(image[i].unsqueeze(0))

        out = torch.cat(outs, dim=0)
        log("[SaliaDepth] Done.")
        return (out, "\n".join(lines))


# Registration tables read by ComfyUI at plugin import time.
NODE_CLASS_MAPPINGS = {
    "SaliaDepthPreprocessor": Salia_Depth_Preprocessor
}

# Human-readable node title shown in the ComfyUI node picker.
NODE_DISPLAY_NAME_MAPPINGS = {
    "SaliaDepthPreprocessor": "Salia Depth (local assets/depth + logs)"
}