# FaceDetailerStandalone_MIN_FIXED_FAST_EMBEDDED_SAM.py # One-node Face Detailer (image-only) with fixed settings + embedded Ultralytics bbox detector + embedded SAM loader. # - Output parity with Impact Pack Face Detailer at the same settings # - No separate bbox-detector node; detector is cached/constructed internally # - No separate SAM loader node; SAM is cached/constructed internally # - Lightweight runtime overhead (cached imports, inference_mode, fused layers, TF32, FP16 on CUDA) import os from dataclasses import dataclass from typing import List, Tuple, Optional import warnings warnings.filterwarnings("ignore") # Silence OpenCV before importing it (env var) and after (setLogLevel) os.environ["OPENCV_LOG_LEVEL"] = "ERROR" import numpy as np import torch import comfy from PIL import Image import cv2 try: if hasattr(cv2, "setLogLevel"): try: lvl = cv2.LOG_LEVEL_ERROR if hasattr(cv2, "LOG_LEVEL_ERROR") else 3 # 3 == error cv2.setLogLevel(lvl) except Exception: pass except Exception: pass # ---------------- Fixed FaceDetailer settings (do not expose in UI) ---------------- # GUIDE_SIZE = 512 # GUIDE_SIZE_FOR_BBOX = True # MAX_SIZE = 1024 # STEPS = 30 # CFG = 7.0 # SCHEDULER = "simple" # DENOISE = 0.5 # FEATHER = 5 # NOISE_MASK = True # FORCE_INPAINT = True # BBOX_THRESHOLD = 0.5 # BBOX_DILATION = 10 # BBOX_CROP_FACTOR = 3.0 # DROP_SIZE = 10 # SAM_DETECTION_HINT = "center-1" # SAM_DILATION = 0 # SAM_THRESHOLD = 0.93 # SAM_BBOX_EXPANSION = 0 # SAM_MASK_HINT_THRESHOLD = 0.7 # SAM_MASK_HINT_USE_NEGATIVE = "False" # WILDCARD = "" # CYCLE = 1 # INPAINT_MODEL = False # NOISE_MASK_FEATHER = 20 # TILED_ENCODE = False # TILED_DECODE = False # --------------------------------------------------------------------- # ---------------- Ultralytics / YOLO detector integration (embedded) ---------------- # Torch runtime perf switches torch.backends.cudnn.benchmark = True # autotune best conv algorithms if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True try: torch.set_float32_matmul_precision("high") # PyTorch 2.x except Exception: pass # Optional Impact Pack interop (SEG type) try: # If Impact Pack is installed, use its SEG to be perfectly compatible. from impact.core import SEG as _IMPACT_SEG # type: ignore _USE_IMPACT_SEG = True except Exception: _USE_IMPACT_SEG = False @dataclass class _LocalSEG: cropped_image: Optional[torch.Tensor] cropped_mask: np.ndarray # 2D float32 [0..1] confidence: float crop_region: Tuple[int, int, int, int] # (x1,y1,x2,y2) bbox: Tuple[int, int, int, int] # (x1,y1,x2,y2) label: str control_net_wrapper: Optional[object] = None SEG = _IMPACT_SEG if _USE_IMPACT_SEG else _LocalSEG # --------------------------------------------------------------------- # LOCAL ASSET PATHS (no hardcoded absolute paths) # --------------------------------------------------------------------- # Base directory of this node file (cross-platform, works on RunPod/ComfyUI) BASE_DIR = os.path.dirname(os.path.abspath(__file__)) # Local YOLO model path inside this custom node folder YOLO_MODEL_PATH = os.path.join(BASE_DIR, "assets", "face_yolov8m_salia.pt") YOLO_IMGSZ = 640 # Local SAM checkpoint path inside this custom node folder SAM_CKPT_PATH = os.path.join(BASE_DIR, "assets", "sam_vit_b_01ec64_salia.pth") # Cached instances (process-local) _CACHED_YOLO_MODEL = None _CACHED_ULTRA_DETECTOR = None def _tensor_to_pil(image: torch.Tensor) -> Image.Image: # image: [1, H, W, 3], float(0..1) img = image[0].detach().cpu().clamp(0, 1).numpy() img = (img * 255.0).round().astype(np.uint8) # (H, W, 3) RGB return Image.fromarray(img, mode="RGB") def _make_crop_region(w: int, h: int, bbox_xyxy, crop_factor: float) -> Tuple[int, int, int, int]: x1, y1, x2, y2 = map(int, bbox_xyxy) cx = (x1 + x2) * 0.5 cy = (y1 + y2) * 0.5 bw = (x2 - x1) bh = (y2 - y1) new_w = max(1, int(bw * crop_factor)) new_h = max(1, int(bh * crop_factor)) # center to image nx1 = int(max(0, round(cx - new_w * 0.5))) ny1 = int(max(0, round(cy - new_h * 0.5))) nx2 = int(min(w, nx1 + new_w)) ny2 = int(min(h, ny1 + new_h)) # clamp again nx1 = max(0, min(nx1, w - 1)) ny1 = max(0, min(ny1, h - 1)) nx2 = max(nx1 + 1, min(nx2, w)) ny2 = max(ny1 + 1, min(ny2, h)) return (nx1, ny1, nx2, ny2) def _crop_tensor_image(image: torch.Tensor, crop: Tuple[int, int, int, int]) -> torch.Tensor: # image: [1,H,W,3]; crop: (x1,y1,x2,y2) x1, y1, xb, yb = crop return image[:, y1:yb, x1:xb, :].contiguous() def _crop_ndarray(mask: np.ndarray, crop: Tuple[int, int, int, int]) -> np.ndarray: # mask: [H,W] float/bool/uint8; crop: (x1,y1,x2,y2) x1, y1, xb, yb = crop return mask[int(y1):int(yb), int(x1):int(xb)] def _dilate_masks(segmasks: List[Tuple[np.ndarray, np.ndarray, float]], factor: int): if factor == 0 or not segmasks: return segmasks k = abs(int(factor)) if k < 1: return segmasks kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k)) do_dilate = factor > 0 out = [] for (bbox, m, conf) in segmasks: u8 = (m * 255.0).astype(np.uint8) if m.dtype != np.uint8 else m d = cv2.dilate(u8, kernel, iterations=1) if do_dilate else cv2.erode(u8, kernel, iterations=1) out.append((bbox, d.astype(np.float32) / 255.0, conf)) return out def _combine_masks(segmasks: List[Tuple[np.ndarray, np.ndarray, float]]) -> Optional[torch.Tensor]: if not segmasks: return None h = segmasks[0][1].shape[0] w = segmasks[0][1].shape[1] acc = np.zeros((h, w), dtype=np.uint8) for _, m, _ in segmasks: u8 = (m * 255.0).astype(np.uint8) if m.dtype != np.uint8 else m acc = cv2.bitwise_or(acc, u8) return torch.from_numpy(acc.astype(np.float32) / 255.0) # [H,W], float32 0..1 CPU def _pick_device_str(user_device: str = "") -> str: if user_device: return user_device return "cuda" if torch.cuda.is_available() else "cpu" @torch.inference_mode() def _inference_bbox(model, image_pil: Image.Image, confidence: float = 0.3, device: str = ""): """ Returns results = [labels(str), bboxes(xyxy), segms(full-image bool masks), conf(float)] For bbox models, segm "masks" are rectangles from the boxes (Subpack parity). """ pred = model( image_pil, conf=confidence, device=_pick_device_str(device), verbose=False, imgsz=YOLO_IMGSZ, # fixed size can be faster ) p0 = pred[0] boxes = p0.boxes bboxes = boxes.xyxy.detach().cpu().numpy() # (N,4) float, xyxy W, H = image_pil.size segms = [] for x0, y0, x1, y1 in bboxes: m = np.zeros((H, W), np.uint8) cv2.rectangle(m, (int(x0), int(y0)), (int(x1), int(y1)), 255, -1) segms.append(m.astype(bool)) if bboxes.shape[0] == 0: return [[], [], [], []] results = [[], [], [], []] names = p0.names for i, (bbox, segm) in enumerate(zip(bboxes, segms)): cls_i = int(boxes.cls[i].item()) results[0].append(names[cls_i]) results[1].append(bbox) results[2].append(segm) results[3].append(float(boxes.conf[i].item())) return results def _create_segmasks(results): bboxes = results[1] segms = results[2] confs = results[3] out = [] for i in range(len(segms)): out.append((bboxes[i], segms[i].astype(np.float32), confs[i])) return out class UltraBBoxDetector: def __init__(self, yolo_model): self.bbox_model = yolo_model def detect(self, image, threshold, dilation, crop_factor, drop_size=1, detailer_hook=None): drop_size = max(int(drop_size), 1) detected = _inference_bbox(self.bbox_model, _tensor_to_pil(image), threshold) segmasks = _create_segmasks(detected) if int(dilation) != 0: segmasks = _dilate_masks(segmasks, int(dilation)) H = int(image.shape[1]) W = int(image.shape[2]) items = [] for (bbox_xyxy, full_mask, conf), label in zip(segmasks, detected[0]): x1, y1, x2, y2 = map(int, bbox_xyxy) if (x2 - x1) > drop_size and (y2 - y1) > drop_size: crop_region = _make_crop_region(W, H, (x1, y1, x2, y2), float(crop_factor)) if detailer_hook is not None and hasattr(detailer_hook, "post_crop_region"): crop_region = detailer_hook.post_crop_region(W, H, (x1, y1, x2, y2), crop_region) cropped_image = _crop_tensor_image(image, crop_region) cropped_mask = _crop_ndarray(full_mask, crop_region).astype(np.float32) items.append(SEG(cropped_image, cropped_mask, float(conf), crop_region, (x1, y1, x2, y2), str(label), None)) segs = ((H, W), items) if detailer_hook is not None and hasattr(detailer_hook, "post_detection"): segs = detailer_hook.post_detection(segs) return segs def detect_combined(self, image, threshold, dilation): detected = _inference_bbox(self.bbox_model, _tensor_to_pil(image), threshold) segmasks = _create_segmasks(detected) if int(dilation) != 0: segmasks = _dilate_masks(segmasks, int(dilation)) return _combine_masks(segmasks) def setAux(self, x): # kept for signature parity pass def _load_ultralytics_model(model_path: str): # Import here so that module import doesn't hard-fail if ultralytics is missing try: from ultralytics import YOLO except Exception as e: raise RuntimeError( "[FaceDetailerStandalone] The 'ultralytics' package is required for the embedded bbox detector.\n" "Install in your ComfyUI python: python -m pip install --upgrade ultralytics" ) from e if not os.path.isfile(model_path): raise FileNotFoundError( "[FaceDetailerStandalone] Embedded YOLO model file not found.\n" f"Expected at: {model_path}\n" "Please place 'face_yolov8m_salia.pt' in the 'assets' folder next to this node." ) yolo = YOLO(model_path) # One-time graph/model optimizations try: dev = _pick_device_str() try: yolo.to(dev) # newer Ultralytics except Exception: yolo.model.to(dev) # older versions except Exception: pass # Fuse Conv+BN where possible (small speedup) try: yolo.fuse() except Exception: pass # Use half precision weights on CUDA (big win; safe for inference) try: if torch.cuda.is_available(): yolo.model.half() except Exception: pass return yolo def _get_embedded_detector(): global _CACHED_YOLO_MODEL, _CACHED_ULTRA_DETECTOR if _CACHED_ULTRA_DETECTOR is not None: return _CACHED_ULTRA_DETECTOR if _CACHED_YOLO_MODEL is None: _CACHED_YOLO_MODEL = _load_ultralytics_model(YOLO_MODEL_PATH) _CACHED_ULTRA_DETECTOR = UltraBBoxDetector(_CACHED_YOLO_MODEL) return _CACHED_ULTRA_DETECTOR # ---------------- Embedded SAM loader (GPU-only, hardcoded path, reuse one predictor) ---------------- # Matches your SAMLoaderStandalone design, but embedded + cached. def _to_numpy_rgb(image_tensor): """ Comfy 'IMAGE' is NHWC in [0..1]. Convert to uint8 HxWx3 RGB numpy. Accepts torch.Tensor (NHWC) or numpy already in HWC. """ if isinstance(image_tensor, torch.Tensor): img = image_tensor if img.dim() == 4 and img.shape[0] == 1: img = img[0] img = (img.clamp(0, 1) * 255.0).to(torch.uint8).cpu().numpy() # HWC return img elif isinstance(image_tensor, np.ndarray): if image_tensor.dtype != np.uint8: img = np.clip(image_tensor, 0, 255).astype(np.uint8) else: img = image_tensor return img else: raise TypeError(f"Unsupported image type for SAM: {type(image_tensor)}") class _SAMWrapperGPUOnlyFast: """ FaceDetailer-compatible wrapper: - Stays on CUDA - Reuses a single SamPredictor - predict(image, points, plabs, bbox, threshold) -> list[HxW float32 CPU masks] """ def __init__(self, model): self.model = model dev = comfy.model_management.get_torch_device() if "cuda" not in str(dev).lower(): raise RuntimeError( f"[FaceDetailerStandalone] GPU-only SAM: CUDA device not available (got '{dev}')." ) self._device = dev self.model.to(self._device).eval() # Lazy import for segment_anything predictor from segment_anything import SamPredictor # type: ignore # Reuse one predictor instance (cheaper than re-creating every call) self._predictor = SamPredictor(self.model) def prepare_device(self): if "cuda" not in str(self._device).lower(): raise RuntimeError("[FaceDetailerStandalone] CUDA device lost/unavailable for SAM.") def release_device(self): # GPU-only; keep on GPU (no-op) pass @torch.inference_mode() def predict(self, image, points, plabs, bbox, threshold: float): """ image: Comfy IMAGE (NHWC, [0..1]) or numpy points: list[[x,y], ...] or None plabs: list[int] (1=fg, 0=bg) or None bbox: [x1,y1,x2,y2] or None threshold: float in [0..1] returns: list of HxW float32 CPU masks (0/1) """ self.prepare_device() np_img = _to_numpy_rgb(image) # Some builds call set_image(img, "RGB"); accept both signatures. try: self._predictor.set_image(np_img, "RGB") except TypeError: self._predictor.set_image(np_img) pc = np.array(points, dtype=np.float32) if points else None pl = np.array(plabs, dtype=np.int32) if plabs else None bx = np.array(bbox, dtype=np.float32) if bbox is not None else None # Keep provided behavior: multimask_output=False masks, scores, _ = self._predictor.predict( point_coords=pc, point_labels=pl, box=bx, multimask_output=False ) out = [] if masks is not None and scores is not None: for m, s in zip(masks, scores): if float(s) >= float(threshold): if isinstance(m, torch.Tensor): t = m.to(torch.float32).cpu() else: t = torch.from_numpy(m.astype(np.float32)).cpu() out.append(t) return out # Cache for SAM _CACHED_SAM_MODEL = None def _get_embedded_sam(): """Load SAM vit_b from SAM_CKPT_PATH and attach GPU-only fast wrapper, cached.""" global _CACHED_SAM_MODEL if _CACHED_SAM_MODEL is not None: return _CACHED_SAM_MODEL if not os.path.isfile(SAM_CKPT_PATH): raise FileNotFoundError( f"[FaceDetailerStandalone] SAM checkpoint not found:\n {SAM_CKPT_PATH}\n" f"Place 'sam_vit_b_01ec64_salia.pth' in the 'assets' folder next to this node." ) # Import here to avoid module import failure at file load time try: from segment_anything import sam_model_registry # type: ignore except Exception as e: raise RuntimeError( "[FaceDetailerStandalone] 'segment_anything' is not installed for embedded SAM. " "Install in your Comfy python, e.g.: python -m pip install " "git+https://github.com/facebookresearch/segment-anything" ) from e # Fixed to vit_b (matches 'sam_vit_b_01ec64' weights) sam = sam_model_registry['vit_b'](checkpoint=SAM_CKPT_PATH) sam.eval() # ensure eval mode # Attach GPU-only, faster wrapper wrapper = _SAMWrapperGPUOnlyFast(sam) sam.sam_wrapper = wrapper _CACHED_SAM_MODEL = sam return _CACHED_SAM_MODEL # ---------------- Impact Pack Face Detailer binding ---------------- _ENHANCE_FACE = None _IMPORT_ERR = None try: from impact.impact_pack import FaceDetailer as _FD _ENHANCE_FACE = _FD.enhance_face except Exception as _e: _IMPORT_ERR = _e _ENHANCE_FACE = None # ---------------- Single public node ---------------- class dn_04: @classmethod def INPUT_TYPES(cls): # Only essential, connectable parts remain editable. (No bbox or SAM inputs.) return { "required": { "image": ("IMAGE",), "model": ("MODEL", {"tooltip": "If `ImpactDummyInput` is connected to model, inference is skipped."}), "clip": ("CLIP",), "vae": ("VAE",), # Keep sampler selectable; all other knobs are fixed "sampler_name": (comfy.samplers.KSampler.SAMPLERS,), # Conditioning stays connectable "positive": ("CONDITIONING",), "negative": ("CONDITIONING",), # Keep seed editable but fixed after generate for reproducibility "seed": ("INT", { "default": 0, "min": 0, "max": 0xffffffffffffffff, "step": 1, "control_after_generate": "fixed", }), }, "optional": { # No external SAM input; embedded } } RETURN_TYPES = ("IMAGE",) RETURN_NAMES = ("image",) FUNCTION = "doit" CATEGORY = "ImpactPack/Standalone" DESCRIPTION = ( "Face Detailer with requested parameters hardcoded (non-editable), " "and embedded Ultralytics face bbox detector + embedded SAM (no external input nodes). " "Optimized call path (cached imports + inference_mode) for lower overhead; " "results identical to Impact Pack Face Detailer at the same settings." ) def doit( self, image, model, clip, vae, sampler_name, positive, negative, seed, ): if _ENHANCE_FACE is None: raise RuntimeError( "ComfyUI-Impact-Pack is required for Face Detailer logic. " "Please install/enable ComfyUI-Impact-Pack." ) from _IMPORT_ERR # Embedded detector & SAM (cached) bbox_detector = _get_embedded_detector() sam_model_opt = _get_embedded_sam() enhance = _ENHANCE_FACE # Determine batch size safely B = image.shape[0] if (hasattr(image, "shape") and image.ndim == 4) else 1 # No autograd, faster kernel choices, identical math for inference with torch.inference_mode(): if B == 1: # Fast-path for single image (avoid list + cat) single = image[0] if image.ndim == 4 else image # [H,W,C] enhanced_img, _, _, _, _ = enhance( single.unsqueeze(0), # -> [1,H,W,C] model, clip, vae, 512, True, 1024, # guide_size, guide_for_bbox, max_size seed, 38, 7.0, # steps, cfg sampler_name, "simple", # scheduler name positive, negative, 0.4, 5, True, True, # denoise, feather, noise_mask, force_inpaint 0.5, 10, 3.0, # bbox_threshold, bbox_dilation, bbox_crop_factor "center-1", 0, 0.93, 0, # sam_detection_hint, sam_dilation, sam_threshold, sam_bbox_expansion 0.7, "False", # sam_mask_hint_threshold, sam_mask_hint_use_negative 10, bbox_detector, # drop_size, bbox_detector # Internals not exposed (kept fixed/None) segm_detector=None, sam_model_opt=sam_model_opt, wildcard_opt="", detailer_hook=None, refiner_ratio=None, refiner_model=None, refiner_clip=None, refiner_positive=None, refiner_negative=None, cycle=1, inpaint_model=False, noise_mask_feather=20, scheduler_func_opt=None, tiled_encode=False, tiled_decode=False, ) return (enhanced_img,) # Batch of images; per-frame process with seed+i out_imgs = [] for i, single in enumerate(image.unbind(0)): enhanced_img, _, _, _, _ = enhance( single.unsqueeze(0), # [1,H,W,C] model, clip, vae, 512, True, 1024, seed + i, 30, 7.0, sampler_name, "simple", positive, negative, 0.5, 5, True, True, 0.5, 10, 3.0, "center-1", 0, 0.93, 0, 0.7, "False", 10, bbox_detector, segm_detector=None, sam_model_opt=sam_model_opt, wildcard_opt="", detailer_hook=None, refiner_ratio=None, refiner_model=None, refiner_clip=None, refiner_positive=None, refiner_negative=None, cycle=1, inpaint_model=False, noise_mask_feather=20, scheduler_func_opt=None, tiled_encode=False, tiled_decode=False, ) out_imgs.append(enhanced_img) return (torch.cat(out_imgs, dim=0),) NODE_CLASS_MAPPINGS = { "dn_04": dn_04, } NODE_DISPLAY_NAME_MAPPINGS = { "dn_04": "dn_04", }