import os
import shutil
import urllib.request
from pathlib import Path
from typing import Dict, Tuple, Any, Optional, List

import numpy as np
import torch
from PIL import Image

import comfy.model_management as model_management

# transformers is required for the depth-estimation pipeline. The import error is
# stashed so the node can load anyway and report a clear message at run time.
try:
    from transformers import pipeline
except Exception as e:
    pipeline = None
    _TRANSFORMERS_IMPORT_ERROR = e

# --------------------------------------------------------------------------------------
# Paths / sources
# --------------------------------------------------------------------------------------
# This file: comfyui-salia_online/nodes/Salia_Depth.py
# Plugin root: comfyui-salia_online/
PLUGIN_ROOT = Path(__file__).resolve().parent.parent

# Requested local path: assets/depth
MODEL_DIR = PLUGIN_ROOT / "assets" / "depth"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Local file name -> download URL for the bundled depth model.
REQUIRED_FILES = {
    "config.json": "https://huggingface.co/saliacoel/depth/resolve/main/config.json",
    "model.safetensors": "https://huggingface.co/saliacoel/depth/resolve/main/model.safetensors",
    "preprocessor_config.json": "https://huggingface.co/saliacoel/depth/resolve/main/preprocessor_config.json",
}

# "zoe-path" fallback
ZOE_FALLBACK_REPO_ID = "Intel/zoedepth-nyu-kitti"


# --------------------------------------------------------------------------------------
# Logging helpers
# --------------------------------------------------------------------------------------

def _make_logger() -> Tuple[List[str], Any]:
    """Return (lines, log): log(msg) prints to console and appends to lines.

    The accumulated lines are joined into the node's STRING output so the
    user can read the full trace in the UI.
    """
    lines: List[str] = []

    def log(msg: str):
        # console (print can fail on exotic encodings; best-effort only)
        try:
            print(msg)
        except Exception:
            pass
        # UI string
        lines.append(str(msg))

    return lines, log


def _fmt_bytes(n: Optional[int]) -> str:
    """Format a byte count as a short human-readable string ('?' for None)."""
    if n is None:
        return "?"
    # simple readable
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if n < 1024:
            return f"{n:.0f}{unit}"
        n /= 1024.0
    return f"{n:.1f}PB"


def _file_size(path: Path) -> Optional[int]:
    """Return the file size in bytes, or None if the file cannot be stat'ed."""
    try:
        return path.stat().st_size
    except Exception:
        return None


def _hf_cache_info() -> Dict[str, str]:
    """Collect Hugging Face cache-related env vars / constants for logging."""
    info: Dict[str, str] = {}
    info["env.HF_HOME"] = os.environ.get("HF_HOME", "")
    info["env.HF_HUB_CACHE"] = os.environ.get("HF_HUB_CACHE", "")
    info["env.TRANSFORMERS_CACHE"] = os.environ.get("TRANSFORMERS_CACHE", "")
    info["env.HUGGINGFACE_HUB_CACHE"] = os.environ.get("HUGGINGFACE_HUB_CACHE", "")
    try:
        from huggingface_hub import constants as hf_constants
        # These exist in most hub versions:
        info["huggingface_hub.constants.HF_HOME"] = str(getattr(hf_constants, "HF_HOME", ""))
        info["huggingface_hub.constants.HF_HUB_CACHE"] = str(getattr(hf_constants, "HF_HUB_CACHE", ""))
    except Exception:
        pass
    return info


# --------------------------------------------------------------------------------------
# Download helpers
# --------------------------------------------------------------------------------------

def _have_required_files() -> bool:
    """True when all three required model files exist in MODEL_DIR."""
    return all((MODEL_DIR / name).exists() for name in REQUIRED_FILES)


def _download_url_to_file(url: str, dst: Path, timeout: int = 180) -> None:
    """
    Download with atomic temp rename.

    Data is written to '<dst>.tmp' and renamed into place only on success,
    so an interrupted download never leaves a truncated file at dst.
    Raises whatever urllib raises on network/HTTP errors.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    tmp = dst.with_suffix(dst.suffix + ".tmp")
    if tmp.exists():
        try:
            tmp.unlink()
        except Exception:
            pass
    req = urllib.request.Request(url, headers={"User-Agent": "ComfyUI-SaliaDepth/1.1"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r, open(tmp, "wb") as f:
            shutil.copyfileobj(r, f)
    except Exception:
        # FIX: don't leave a stale partial .tmp file behind on failure.
        try:
            tmp.unlink()
        except Exception:
            pass
        raise
    tmp.replace(dst)


def ensure_local_model_files(log) -> bool:
    """
    Ensure assets/depth contains the 3 files.
    Returns True if present or downloaded successfully, else False.
    """
    # Always log expected locations + URLs, even if we don't download.
    log("[SaliaDepth] ===== Local model file check =====")
    log(f"[SaliaDepth] Plugin root: {PLUGIN_ROOT}")
    log(f"[SaliaDepth] Local model dir (on drive): {MODEL_DIR}")
    for fname, url in REQUIRED_FILES.items():
        fpath = MODEL_DIR / fname
        exists = fpath.exists()
        size = _file_size(fpath) if exists else None
        log(f"[SaliaDepth] - {fname}")
        log(f"[SaliaDepth] local path: {fpath} exists={exists} size={_fmt_bytes(size)}")
        log(f"[SaliaDepth] remote url : {url}")
    if _have_required_files():
        log("[SaliaDepth] All required local files already exist. No download needed.")
        return True
    log("[SaliaDepth] One or more local files missing. Attempting download...")
    try:
        for fname, url in REQUIRED_FILES.items():
            fpath = MODEL_DIR / fname
            if fpath.exists():
                continue
            log(f"[SaliaDepth] Downloading '{fname}' -> '{fpath}'")
            _download_url_to_file(url, fpath)
            log(f"[SaliaDepth] Downloaded '{fname}' size={_fmt_bytes(_file_size(fpath))}")
        ok = _have_required_files()
        log(f"[SaliaDepth] Download finished. ok={ok}")
        return ok
    except Exception as e:
        log(f"[SaliaDepth] Download failed with error: {repr(e)}")
        return False


# --------------------------------------------------------------------------------------
# Exact Zoe-style preprocessing helpers (copied/adapted from the Zoe snippet)
# --------------------------------------------------------------------------------------

def HWC3(x: np.ndarray) -> np.ndarray:
    """Force a uint8 image to 3-channel HWC.

    Gray -> replicated RGB; RGBA -> composited over a WHITE background.
    """
    assert x.dtype == np.uint8
    if x.ndim == 2:
        x = x[:, :, None]
    assert x.ndim == 3
    H, W, C = x.shape
    assert C == 1 or C == 3 or C == 4
    if C == 3:
        return x
    if C == 1:
        return np.concatenate([x, x, x], axis=2)
    # C == 4
    color = x[:, :, 0:3].astype(np.float32)
    alpha = x[:, :, 3:4].astype(np.float32) / 255.0
    y = color * alpha + 255.0 * (1.0 - alpha)  # white background
    y = y.clip(0, 255).astype(np.uint8)
    return y


def pad64(x: int) -> int:
    """Padding needed to round x up to the next multiple of 64."""
    return int(np.ceil(float(x) / 64.0) * 64 - x)


def safer_memory(x: np.ndarray) -> np.ndarray:
    """Return a contiguous, independently-owned copy of x."""
    return np.ascontiguousarray(x.copy()).copy()


def resize_image_with_pad_min_side(
    input_image: np.ndarray,
    resolution: int,
    upscale_method: str = "INTER_CUBIC",
    skip_hwc3: bool = False,
    mode: str = "edge",
    log=None,
) -> Tuple[np.ndarray, Any]:
    """
    EXACT behavior like zoe.transformers.py:
      k = resolution / min(H,W)
      resize to (W_target, H_target)
      pad to multiple of 64
      return padded image and remove_pad() closure
    """
    # prefer cv2 like original for matching results
    cv2 = None
    try:
        import cv2 as _cv2
        cv2 = _cv2
    except Exception:
        cv2 = None
        if log:
            log("[SaliaDepth] WARN: cv2 not available; resizing will use PIL fallback (may change results).")

    if skip_hwc3:
        img = input_image
    else:
        img = HWC3(input_image)
    H_raw, W_raw, _ = img.shape

    if resolution <= 0:
        # keep original, but still pad to 64 (we will handle padding separately for -1 path)
        return img, (lambda x: x)

    k = float(resolution) / float(min(H_raw, W_raw))
    H_target = int(np.round(float(H_raw) * k))
    W_target = int(np.round(float(W_raw) * k))

    if cv2 is not None:
        upscale_methods = {
            "INTER_NEAREST": cv2.INTER_NEAREST,
            "INTER_LINEAR": cv2.INTER_LINEAR,
            "INTER_AREA": cv2.INTER_AREA,
            "INTER_CUBIC": cv2.INTER_CUBIC,
            "INTER_LANCZOS4": cv2.INTER_LANCZOS4,
        }
        method = upscale_methods.get(upscale_method, cv2.INTER_CUBIC)
        # Upscale uses the requested method; downscale always uses INTER_AREA
        # (matches the original Zoe code).
        img = cv2.resize(img, (W_target, H_target), interpolation=method if k > 1 else cv2.INTER_AREA)
    else:
        # PIL fallback
        pil = Image.fromarray(img)
        resample = Image.BICUBIC if k > 1 else Image.LANCZOS
        pil = pil.resize((W_target, H_target), resample=resample)
        img = np.array(pil, dtype=np.uint8)

    H_pad, W_pad = pad64(H_target), pad64(W_target)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)

    def remove_pad(x: np.ndarray) -> np.ndarray:
        return safer_memory(x[:H_target, :W_target, ...])

    return safer_memory(img_padded), remove_pad


def pad_only_to_64(img_u8: np.ndarray, mode: str = "edge") -> Tuple[np.ndarray, Any]:
    """
    For resolution == -1: keep original resolution but still pad to multiples
    of 64, then provide remove_pad that returns original size.
    """
    img = HWC3(img_u8)
    H_raw, W_raw, _ = img.shape
    H_pad, W_pad = pad64(H_raw), pad64(W_raw)
    img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode=mode)

    def remove_pad(x: np.ndarray) -> np.ndarray:
        return safer_memory(x[:H_raw, :W_raw, ...])

    return safer_memory(img_padded), remove_pad


# --------------------------------------------------------------------------------------
# RGBA rules
# --------------------------------------------------------------------------------------

def composite_rgba_over_white_keep_alpha(inp_u8: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    If RGBA: return RGB composited over WHITE + alpha_u8 kept separately.
    If RGB: return input RGB + None alpha.
    """
    if inp_u8.ndim == 3 and inp_u8.shape[2] == 4:
        rgba = inp_u8.astype(np.uint8)
        rgb = rgba[:, :, 0:3].astype(np.float32)
        a = (rgba[:, :, 3:4].astype(np.float32) / 255.0)
        rgb_white = (rgb * a + 255.0 * (1.0 - a)).clip(0, 255).astype(np.uint8)
        alpha_u8 = rgba[:, :, 3].copy()
        return rgb_white, alpha_u8
    # force to RGB
    return HWC3(inp_u8), None


def apply_alpha_then_black_background(depth_rgb_u8: np.ndarray, alpha_u8: np.ndarray) -> np.ndarray:
    """
    Requested output rule:
      - attach alpha to depth (conceptually RGBA)
      - composite over BLACK
      - output RGB
    That is equivalent to depth_rgb * alpha.
    """
    depth_rgb_u8 = HWC3(depth_rgb_u8)
    a = (alpha_u8.astype(np.float32) / 255.0)[:, :, None]
    out = (depth_rgb_u8.astype(np.float32) * a).clip(0, 255).astype(np.uint8)
    return out


# --------------------------------------------------------------------------------------
# ComfyUI conversion helpers
# --------------------------------------------------------------------------------------

def comfy_tensor_to_u8(img: torch.Tensor) -> np.ndarray:
    """
    Comfy IMAGE: float [0..1], shape [H,W,C] or [B,H,W,C]
    Convert to uint8 HWC (first batch item only when batched).
    """
    if img.ndim == 4:
        img = img[0]
    arr = img.detach().cpu().float().clamp(0, 1).numpy()
    u8 = (arr * 255.0).round().astype(np.uint8)
    return u8


def u8_to_comfy_tensor(img_u8: np.ndarray) -> torch.Tensor:
    """uint8 HWC image -> Comfy IMAGE tensor [1,H,W,C] in float [0..1]."""
    img_u8 = HWC3(img_u8)
    t = torch.from_numpy(img_u8.astype(np.float32) / 255.0)
    return t.unsqueeze(0)  # [1,H,W,C]


# --------------------------------------------------------------------------------------
# Pipeline loading (local-first, then zoe fallback)
# --------------------------------------------------------------------------------------

# (model_source, device_str) -> pipeline; avoids re-loading weights per run
_PIPE_CACHE: Dict[Tuple[str, str], Any] = {}


def _try_load_pipeline(model_source: str, device: torch.device, log):
    """
    Use transformers.pipeline like the Zoe code does.
    We intentionally do NOT pass device=... here, and instead move the model
    like the Zoe node. Raises on load failure; caller handles fallback.
    """
    if pipeline is None:
        raise RuntimeError(f"transformers import failed: {_TRANSFORMERS_IMPORT_ERROR}")
    key = (model_source, str(device))
    if key in _PIPE_CACHE:
        log(f"[SaliaDepth] Using cached pipeline for source='{model_source}' device='{device}'")
        return _PIPE_CACHE[key]
    log(f"[SaliaDepth] Creating pipeline(task='depth-estimation', model='{model_source}')")
    p = pipeline(task="depth-estimation", model=model_source)
    # Try to move model to torch device, like ZoeDetector.to()
    try:
        p.model = p.model.to(device)
        p.device = device  # Zoe code sets this; newer transformers uses torch.device internally
        log(f"[SaliaDepth] Moved pipeline model to device: {device}")
    except Exception as e:
        log(f"[SaliaDepth] WARN: Could not move pipeline model to device {device}: {repr(e)}")
    # Log config info for debugging
    try:
        cfg = p.model.config
        log(f"[SaliaDepth] Model class: {p.model.__class__.__name__}")
        log(f"[SaliaDepth] Config class: {cfg.__class__.__name__}")
        log(f"[SaliaDepth] Config model_type: {getattr(cfg, 'model_type', '')}")
        log(f"[SaliaDepth] Config _name_or_path: {getattr(cfg, '_name_or_path', '')}")
    except Exception as e:
        log(f"[SaliaDepth] WARN: Could not log model config: {repr(e)}")
    _PIPE_CACHE[key] = p
    return p


def get_depth_pipeline(device: torch.device, log):
    """
    1) Ensure assets/depth files exist (download if missing)
    2) Try load local dir
    3) Fallback to Intel/zoedepth-nyu-kitti
    4) If both fail -> None
    """
    # Always log HF cache info (helps locate where fallback downloads go)
    log("[SaliaDepth] ===== Hugging Face cache info (fallback path) =====")
    for k, v in _hf_cache_info().items():
        if v:
            log(f"[SaliaDepth] {k} = {v}")
    log(f"[SaliaDepth] Zoe fallback repo id: {ZOE_FALLBACK_REPO_ID}")

    # Local-first
    local_ok = ensure_local_model_files(log)
    if local_ok:
        try:
            log(f"[SaliaDepth] Trying LOCAL model from directory: {MODEL_DIR}")
            return _try_load_pipeline(str(MODEL_DIR), device, log)
        except Exception as e:
            log(f"[SaliaDepth] Local model load FAILED: {repr(e)}")

    # Fallback
    try:
        log(f"[SaliaDepth] Trying ZOE fallback model: {ZOE_FALLBACK_REPO_ID}")
        return _try_load_pipeline(ZOE_FALLBACK_REPO_ID, device, log)
    except Exception as e:
        log(f"[SaliaDepth] Zoe fallback load FAILED: {repr(e)}")
        return None


# --------------------------------------------------------------------------------------
# Depth inference (Zoe-style)
# --------------------------------------------------------------------------------------

def depth_estimate_zoe_style(
    pipe,
    input_rgb_u8: np.ndarray,
    detect_resolution: int,
    log,
    upscale_method: str = "INTER_CUBIC",
) -> np.ndarray:
    """
    Matches ZoeDetector.__call__ logic very closely.
    Returns uint8 RGB depth map (near = bright, via the final inversion).
    """
    # detect_resolution:
    #   - if -1: keep original but pad-to-64
    #   - else: min-side resize to detect_resolution, then pad-to-64
    if detect_resolution == -1:
        work_img, remove_pad = pad_only_to_64(input_rgb_u8, mode="edge")
        log(f"[SaliaDepth] Preprocess: resolution=-1 (no resize), padded to 64. work={work_img.shape}")
    else:
        work_img, remove_pad = resize_image_with_pad_min_side(
            input_rgb_u8,
            int(detect_resolution),
            upscale_method=upscale_method,
            skip_hwc3=False,
            mode="edge",
            log=log,
        )
        log(f"[SaliaDepth] Preprocess: min-side resized to {detect_resolution}, padded to 64. work={work_img.shape}")

    pil_image = Image.fromarray(work_img)
    with torch.no_grad():
        result = pipe(pil_image)
    # FIX: the original had an isinstance(depth, Image.Image) branch whose two
    # arms were identical; np.array() handles both PIL images and array-likes.
    depth_array = np.array(result["depth"], dtype=np.float32)

    # EXACT normalization like the Zoe code
    vmin = float(np.percentile(depth_array, 2))
    vmax = float(np.percentile(depth_array, 85))
    log(f"[SaliaDepth] Depth raw stats: shape={depth_array.shape} vmin(p2)={vmin:.6f} vmax(p85)={vmax:.6f} mean={float(depth_array.mean()):.6f}")
    depth_array = depth_array - vmin
    denom = (vmax - vmin)
    if abs(denom) < 1e-12:
        # avoid division by zero; log it
        log("[SaliaDepth] WARN: vmax==vmin; forcing denom epsilon to avoid NaNs.")
        denom = 1e-6
    depth_array = depth_array / denom
    # EXACT invert like the Zoe code
    depth_array = 1.0 - depth_array
    depth_image = (depth_array * 255.0).clip(0, 255).astype(np.uint8)
    detected_map = remove_pad(HWC3(depth_image))
    log(f"[SaliaDepth] Output (post-remove_pad): {detected_map.shape} dtype={detected_map.dtype}")
    return detected_map


def resize_to_original(depth_rgb_u8: np.ndarray, w0: int, h0: int, log) -> np.ndarray:
    """
    Resize depth output back to original input size.
    Use cv2 if available, else PIL.
    """
    try:
        import cv2
        out = cv2.resize(depth_rgb_u8, (w0, h0), interpolation=cv2.INTER_LINEAR)
        return out.astype(np.uint8)
    except Exception as e:
        log(f"[SaliaDepth] WARN: cv2 resize failed ({repr(e)}); using PIL.")
        pil = Image.fromarray(depth_rgb_u8)
        pil = pil.resize((w0, h0), resample=Image.BILINEAR)
        return np.array(pil, dtype=np.uint8)


# --------------------------------------------------------------------------------------
# ComfyUI Node
# --------------------------------------------------------------------------------------

class Salia_Depth_Preprocessor:
    """ComfyUI node: Zoe-style depth preprocessor with local-first model files."""

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                # note: default -1, min -1 (-1 = keep original resolution)
                "resolution": ("INT", {"default": -1, "min": -1, "max": 8192, "step": 1}),
            }
        }

    # 2 outputs: image + log string
    RETURN_TYPES = ("IMAGE", "STRING")
    FUNCTION = "execute"
    CATEGORY = "ControlNet Preprocessors/Normal and Depth Estimators"

    def execute(self, image, resolution=-1):
        """Run depth estimation on a (possibly batched) Comfy IMAGE tensor.

        Returns (depth IMAGE batch, joined log string). On pipeline failure the
        input is returned unchanged; on a per-item failure that item is passed
        through so the rest of the batch still succeeds.
        """
        lines, log = _make_logger()
        log("[SaliaDepth] ==================================================")
        log("[SaliaDepth] SaliaDepthPreprocessor starting")
        log(f"[SaliaDepth] resolution input = {resolution}")

        # Get torch device
        try:
            device = model_management.get_torch_device()
        except Exception as e:
            device = torch.device("cpu")
            log(f"[SaliaDepth] WARN: model_management.get_torch_device failed: {repr(e)} -> using CPU")
        log(f"[SaliaDepth] torch device = {device}")

        # Load pipeline
        pipe = None
        try:
            pipe = get_depth_pipeline(device, log)
        except Exception as e:
            log(f"[SaliaDepth] ERROR: get_depth_pipeline crashed: {repr(e)}")
            pipe = None
        if pipe is None:
            log("[SaliaDepth] FATAL: No pipeline available. Returning input image unchanged.")
            return (image, "\n".join(lines))

        # Batch support
        if image.ndim == 3:
            image = image.unsqueeze(0)

        outs = []
        for i in range(image.shape[0]):
            try:
                # Original dimensions
                h0 = int(image[i].shape[0])
                w0 = int(image[i].shape[1])
                c0 = int(image[i].shape[2])
                log(f"[SaliaDepth] ---- Batch index {i} input shape = ({h0},{w0},{c0}) ----")
                inp_u8 = comfy_tensor_to_u8(image[i])

                # RGBA rule (pre)
                rgb_for_depth, alpha_u8 = composite_rgba_over_white_keep_alpha(inp_u8)
                had_rgba = alpha_u8 is not None
                log(f"[SaliaDepth] had_rgba={had_rgba}")

                # Run depth (Zoe-style)
                depth_rgb = depth_estimate_zoe_style(
                    pipe=pipe,
                    input_rgb_u8=rgb_for_depth,
                    detect_resolution=int(resolution),
                    log=log,
                    upscale_method="INTER_CUBIC",
                )

                # Resize back to original input size
                depth_rgb = resize_to_original(depth_rgb, w0=w0, h0=h0, log=log)

                # RGBA rule (post)
                if had_rgba:
                    # Use original alpha at original size.
                    # If alpha size differs, resize alpha to match.
                    if alpha_u8.shape[0] != h0 or alpha_u8.shape[1] != w0:
                        log("[SaliaDepth] Alpha size mismatch; resizing alpha to original size.")
                        try:
                            import cv2
                            alpha_u8 = cv2.resize(alpha_u8, (w0, h0), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
                        except Exception:
                            pil_a = Image.fromarray(alpha_u8)
                            pil_a = pil_a.resize((w0, h0), resample=Image.BILINEAR)
                            alpha_u8 = np.array(pil_a, dtype=np.uint8)
                    # "Put alpha on RGB turning it into RGBA, then put BLACK
                    #  background behind it, then back to RGB"
                    depth_rgb = apply_alpha_then_black_background(depth_rgb, alpha_u8)
                    log("[SaliaDepth] Applied RGBA post-step (alpha + black background).")

                outs.append(u8_to_comfy_tensor(depth_rgb))
            except Exception as e:
                log(f"[SaliaDepth] ERROR: Inference failed at batch index {i}: {repr(e)}")
                log("[SaliaDepth] Passing through original input image for this batch item.")
                # NOTE(review): pass-through keeps the item's original channel
                # count; a 4-channel input item could mismatch 3-channel depth
                # outputs at torch.cat — TODO confirm against real batches.
                outs.append(image[i].unsqueeze(0))

        out = torch.cat(outs, dim=0)
        log("[SaliaDepth] Done.")
        return (out, "\n".join(lines))


NODE_CLASS_MAPPINGS = {
    "SaliaDepthPreprocessor": Salia_Depth_Preprocessor
}
# Human-readable title shown for the node in the ComfyUI UI.
NODE_DISPLAY_NAME_MAPPINGS = dict(
    SaliaDepthPreprocessor="Salia Depth (local assets/depth + logs)",
)