import os
from dataclasses import dataclass
from typing import List, Tuple, Optional

import warnings
warnings.filterwarnings("ignore")

os.environ["OPENCV_LOG_LEVEL"] = "ERROR"

import numpy as np
import torch
import comfy
from PIL import Image
import cv2

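# Best-effort: also silence OpenCV's native logger on builds that expose setLogLevel.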
try:
    if hasattr(cv2, "setLogLevel"):
        try:
            lvl = cv2.LOG_LEVEL_ERROR if hasattr(cv2, "LOG_LEVEL_ERROR") else 3
            cv2.setLogLevel(lvl)
        except Exception:
            pass
except Exception:
    pass

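# Performance flags: cuDNN autotune, plus TF32 matmuls when a CUDA device is present.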
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass

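# Prefer Impact Pack's SEG dataclass; fall back to a local stand-in with the same fields.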
try:
    from impact.core import SEG as _IMPACT_SEG
    _USE_IMPACT_SEG = True
except Exception:
    _USE_IMPACT_SEG = False


@dataclass
class _LocalSEG:
    cropped_image: Optional[torch.Tensor]
    cropped_mask: np.ndarray
    confidence: float
    crop_region: Tuple[int, int, int, int]
    bbox: Tuple[int, int, int, int]
    label: str
    control_net_wrapper: Optional[object] = None


SEG = _IMPACT_SEG if _USE_IMPACT_SEG else _LocalSEG


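# Embedded model assets are expected in the 'assets' folder next to this file.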
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

YOLO_MODEL_PATH = os.path.join(BASE_DIR, "assets", "face_yolov8m_salia.pt")
YOLO_IMGSZ = 640

SAM_CKPT_PATH = os.path.join(BASE_DIR, "assets", "sam_vit_b_01ec64_salia.pth")

_CACHED_YOLO_MODEL = None
_CACHED_ULTRA_DETECTOR = None


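# Convert the first image of a Comfy IMAGE batch (NHWC, values in [0..1]) to a PIL RGB image.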
def _tensor_to_pil(image: torch.Tensor) -> Image.Image:
    img = image[0].detach().cpu().clamp(0, 1).numpy()
    img = (img * 255.0).round().astype(np.uint8)
    return Image.fromarray(img, mode="RGB")


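# Expand a detection bbox by crop_factor around its center, then clamp the crop to the image bounds.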
def _make_crop_region(w: int, h: int, bbox_xyxy, crop_factor: float) -> Tuple[int, int, int, int]:
    x1, y1, x2, y2 = map(int, bbox_xyxy)
    cx = (x1 + x2) * 0.5
    cy = (y1 + y2) * 0.5
    bw = (x2 - x1)
    bh = (y2 - y1)
    new_w = max(1, int(bw * crop_factor))
    new_h = max(1, int(bh * crop_factor))

    nx1 = int(max(0, round(cx - new_w * 0.5)))
    ny1 = int(max(0, round(cy - new_h * 0.5)))
    nx2 = int(min(w, nx1 + new_w))
    ny2 = int(min(h, ny1 + new_h))

    nx1 = max(0, min(nx1, w - 1))
    ny1 = max(0, min(ny1, h - 1))
    nx2 = max(nx1 + 1, min(nx2, w))
    ny2 = max(ny1 + 1, min(ny2, h))
    return (nx1, ny1, nx2, ny2)


def _crop_tensor_image(image: torch.Tensor, crop: Tuple[int, int, int, int]) -> torch.Tensor:
    x1, y1, xb, yb = crop
    return image[:, y1:yb, x1:xb, :].contiguous()


def _crop_ndarray(mask: np.ndarray, crop: Tuple[int, int, int, int]) -> np.ndarray:
    x1, y1, xb, yb = crop
    return mask[int(y1):int(yb), int(x1):int(xb)]


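# A positive factor dilates each mask, a negative factor erodes it; the kernel size is |factor|.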
def _dilate_masks(segmasks: List[Tuple[np.ndarray, np.ndarray, float]], factor: int):
    if factor == 0 or not segmasks:
        return segmasks
    k = abs(int(factor))
    if k < 1:
        return segmasks
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))
    do_dilate = factor > 0
    out = []
    for (bbox, m, conf) in segmasks:
        u8 = (m * 255.0).astype(np.uint8) if m.dtype != np.uint8 else m
        d = cv2.dilate(u8, kernel, iterations=1) if do_dilate else cv2.erode(u8, kernel, iterations=1)
        out.append((bbox, d.astype(np.float32) / 255.0, conf))
    return out


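# OR all detection masks together into one full-image float mask (or None when nothing was detected).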
def _combine_masks(segmasks: List[Tuple[np.ndarray, np.ndarray, float]]) -> Optional[torch.Tensor]:
    if not segmasks:
        return None
    h = segmasks[0][1].shape[0]
    w = segmasks[0][1].shape[1]
    acc = np.zeros((h, w), dtype=np.uint8)
    for _, m, _ in segmasks:
        u8 = (m * 255.0).astype(np.uint8) if m.dtype != np.uint8 else m
        acc = cv2.bitwise_or(acc, u8)
    return torch.from_numpy(acc.astype(np.float32) / 255.0)


def _pick_device_str(user_device: str = "") -> str:
    if user_device:
        return user_device
    return "cuda" if torch.cuda.is_available() else "cpu"


@torch.inference_mode()
def _inference_bbox(model, image_pil: Image.Image, confidence: float = 0.3, device: str = ""):
    """
    Returns results = [labels(str), bboxes(xyxy), segms(full-image bool masks), conf(float)].
    For bbox models, segm "masks" are rectangles from the boxes (Subpack parity).
    """
    pred = model(
        image_pil,
        conf=confidence,
        device=_pick_device_str(device),
        verbose=False,
        imgsz=YOLO_IMGSZ,
    )

    p0 = pred[0]
    boxes = p0.boxes
    bboxes = boxes.xyxy.detach().cpu().numpy()

    W, H = image_pil.size
    segms = []
    for x0, y0, x1, y1 in bboxes:
        m = np.zeros((H, W), np.uint8)
        cv2.rectangle(m, (int(x0), int(y0)), (int(x1), int(y1)), 255, -1)
        segms.append(m.astype(bool))

    if bboxes.shape[0] == 0:
        return [[], [], [], []]

    results = [[], [], [], []]
    names = p0.names
    for i, (bbox, segm) in enumerate(zip(bboxes, segms)):
        cls_i = int(boxes.cls[i].item())
        results[0].append(names[cls_i])
        results[1].append(bbox)
        results[2].append(segm)
        results[3].append(float(boxes.conf[i].item()))
    return results


def _create_segmasks(results):
    bboxes = results[1]
    segms = results[2]
    confs = results[3]
    out = []
    for i in range(len(segms)):
        out.append((bboxes[i], segms[i].astype(np.float32), confs[i]))
    return out


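# Minimal bbox detector exposing the detect / detect_combined / setAux interface FaceDetailer uses;
# detect() returns SEGS in the form ((H, W), [SEG, ...]).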
class UltraBBoxDetector:
    def __init__(self, yolo_model):
        self.bbox_model = yolo_model

    def detect(self, image, threshold, dilation, crop_factor, drop_size=1, detailer_hook=None):
        drop_size = max(int(drop_size), 1)
        detected = _inference_bbox(self.bbox_model, _tensor_to_pil(image), threshold)
        segmasks = _create_segmasks(detected)
        if int(dilation) != 0:
            segmasks = _dilate_masks(segmasks, int(dilation))

        H = int(image.shape[1])
        W = int(image.shape[2])
        items = []
        for (bbox_xyxy, full_mask, conf), label in zip(segmasks, detected[0]):
            x1, y1, x2, y2 = map(int, bbox_xyxy)
            if (x2 - x1) > drop_size and (y2 - y1) > drop_size:
                crop_region = _make_crop_region(W, H, (x1, y1, x2, y2), float(crop_factor))
                if detailer_hook is not None and hasattr(detailer_hook, "post_crop_region"):
                    crop_region = detailer_hook.post_crop_region(W, H, (x1, y1, x2, y2), crop_region)

                cropped_image = _crop_tensor_image(image, crop_region)
                cropped_mask = _crop_ndarray(full_mask, crop_region).astype(np.float32)
                items.append(SEG(cropped_image, cropped_mask, float(conf), crop_region, (x1, y1, x2, y2), str(label), None))

        segs = ((H, W), items)
        if detailer_hook is not None and hasattr(detailer_hook, "post_detection"):
            segs = detailer_hook.post_detection(segs)
        return segs

    def detect_combined(self, image, threshold, dilation):
        detected = _inference_bbox(self.bbox_model, _tensor_to_pil(image), threshold)
        segmasks = _create_segmasks(detected)
        if int(dilation) != 0:
            segmasks = _dilate_masks(segmasks, int(dilation))
        return _combine_masks(segmasks)

    def setAux(self, x):
        pass


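# Load the bundled YOLO weights once; device placement, layer fusion, and half precision are best-effort.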
def _load_ultralytics_model(model_path: str):
    try:
        from ultralytics import YOLO
    except Exception as e:
        raise RuntimeError(
            "[FaceDetailerStandalone] The 'ultralytics' package is required for the embedded bbox detector.\n"
            "Install in your ComfyUI python: python -m pip install --upgrade ultralytics"
        ) from e

    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            "[FaceDetailerStandalone] Embedded YOLO model file not found.\n"
            f"Expected at: {model_path}\n"
            "Please place 'face_yolov8m_salia.pt' in the 'assets' folder next to this node."
        )

    yolo = YOLO(model_path)

    try:
        dev = _pick_device_str()
        try:
            yolo.to(dev)
        except Exception:
            yolo.model.to(dev)
    except Exception:
        pass

    try:
        yolo.fuse()
    except Exception:
        pass

    try:
        if torch.cuda.is_available():
            yolo.model.half()
    except Exception:
        pass

    return yolo


def _get_embedded_detector():
    global _CACHED_YOLO_MODEL, _CACHED_ULTRA_DETECTOR
    if _CACHED_ULTRA_DETECTOR is not None:
        return _CACHED_ULTRA_DETECTOR
    if _CACHED_YOLO_MODEL is None:
        _CACHED_YOLO_MODEL = _load_ultralytics_model(YOLO_MODEL_PATH)
    _CACHED_ULTRA_DETECTOR = UltraBBoxDetector(_CACHED_YOLO_MODEL)
    return _CACHED_ULTRA_DETECTOR


def _to_numpy_rgb(image_tensor):
    """
    Comfy 'IMAGE' is NHWC in [0..1]. Convert to uint8 HxWx3 RGB numpy.
    Accepts torch.Tensor (NHWC) or numpy already in HWC.
    """
    if isinstance(image_tensor, torch.Tensor):
        img = image_tensor
        if img.dim() == 4 and img.shape[0] == 1:
            img = img[0]
        img = (img.clamp(0, 1) * 255.0).to(torch.uint8).cpu().numpy()
        return img
    elif isinstance(image_tensor, np.ndarray):
        if image_tensor.dtype != np.uint8:
            img = np.clip(image_tensor, 0, 255).astype(np.uint8)
        else:
            img = image_tensor
        return img
    else:
        raise TypeError(f"Unsupported image type for SAM: {type(image_tensor)}")


class _SAMWrapperGPUOnlyFast:
    """
    FaceDetailer-compatible wrapper:
    - Stays on CUDA
    - Reuses a single SamPredictor
    - predict(image, points, plabs, bbox, threshold) -> list[HxW float32 CPU masks]
    """
    def __init__(self, model):
        self.model = model
        dev = comfy.model_management.get_torch_device()
        if "cuda" not in str(dev).lower():
            raise RuntimeError(
                f"[FaceDetailerStandalone] GPU-only SAM: CUDA device not available (got '{dev}')."
            )
        self._device = dev
        self.model.to(self._device).eval()

        from segment_anything import SamPredictor

        self._predictor = SamPredictor(self.model)

    def prepare_device(self):
        if "cuda" not in str(self._device).lower():
            raise RuntimeError("[FaceDetailerStandalone] CUDA device lost/unavailable for SAM.")

    def release_device(self):
        pass

    @torch.inference_mode()
    def predict(self, image, points, plabs, bbox, threshold: float):
        """
        image: Comfy IMAGE (NHWC, [0..1]) or numpy
        points: list[[x, y], ...] or None
        plabs: list[int] (1=fg, 0=bg) or None
        bbox: [x1, y1, x2, y2] or None
        threshold: float in [0..1]
        returns: list of HxW float32 CPU masks (0/1)
        """
        self.prepare_device()

        np_img = _to_numpy_rgb(image)

        try:
            self._predictor.set_image(np_img, "RGB")
        except TypeError:
            self._predictor.set_image(np_img)

        pc = np.array(points, dtype=np.float32) if points else None
        pl = np.array(plabs, dtype=np.int32) if plabs else None
        bx = np.array(bbox, dtype=np.float32) if bbox is not None else None

        masks, scores, _ = self._predictor.predict(
            point_coords=pc,
            point_labels=pl,
            box=bx,
            multimask_output=False,
        )

        out = []
        if masks is not None and scores is not None:
            for m, s in zip(masks, scores):
                if float(s) >= float(threshold):
                    if isinstance(m, torch.Tensor):
                        t = m.to(torch.float32).cpu()
                    else:
                        t = torch.from_numpy(m.astype(np.float32)).cpu()
                    out.append(t)
        return out


_CACHED_SAM_MODEL = None


def _get_embedded_sam():
    """Load SAM vit_b from SAM_CKPT_PATH and attach GPU-only fast wrapper, cached."""
    global _CACHED_SAM_MODEL
    if _CACHED_SAM_MODEL is not None:
        return _CACHED_SAM_MODEL

    if not os.path.isfile(SAM_CKPT_PATH):
        raise FileNotFoundError(
            f"[FaceDetailerStandalone] SAM checkpoint not found:\n {SAM_CKPT_PATH}\n"
            "Place 'sam_vit_b_01ec64_salia.pth' in the 'assets' folder next to this node."
        )

    try:
        from segment_anything import sam_model_registry
    except Exception as e:
        raise RuntimeError(
            "[FaceDetailerStandalone] 'segment_anything' is not installed for embedded SAM. "
            "Install in your Comfy python, e.g.: python -m pip install "
            "git+https://github.com/facebookresearch/segment-anything"
        ) from e

    sam = sam_model_registry['vit_b'](checkpoint=SAM_CKPT_PATH)
    sam.eval()

    wrapper = _SAMWrapperGPUOnlyFast(sam)
    sam.sam_wrapper = wrapper

    _CACHED_SAM_MODEL = sam
    return _CACHED_SAM_MODEL


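# Import Impact Pack's FaceDetailer.enhance_face once at module load; a failure is surfaced when the node runs.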
_ENHANCE_FACE = None
_IMPORT_ERR = None
try:
    from impact.impact_pack import FaceDetailer as _FD
    _ENHANCE_FACE = _FD.enhance_face
except Exception as _e:
    _IMPORT_ERR = _e
    _ENHANCE_FACE = None


class dn_04:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "model": ("MODEL", {"tooltip": "If `ImpactDummyInput` is connected to model, inference is skipped."}),
                "clip": ("CLIP",),
                "vae": ("VAE",),

                "sampler_name": (comfy.samplers.KSampler.SAMPLERS,),

                "positive": ("CONDITIONING",),
                "negative": ("CONDITIONING",),

                "seed": ("INT", {
                    "default": 0,
                    "min": 0,
                    "max": 0xffffffffffffffff,
                    "step": 1,
                    "control_after_generate": "fixed",
                }),
            },
            "optional": {},
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("image",)
    FUNCTION = "doit"
    CATEGORY = "ImpactPack/Standalone"
    DESCRIPTION = (
        "Face Detailer with the requested parameters hardcoded (non-editable), plus an embedded "
        "Ultralytics face bbox detector and an embedded SAM (no external input nodes). "
        "Optimized call path (cached imports + inference_mode) for lower overhead; "
        "results are identical to the Impact Pack Face Detailer at the same settings."
    )

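    # Everything except the image, models, conditioning, sampler_name, and seed is hardcoded below.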
    def doit(
        self,
        image, model, clip, vae,
        sampler_name,
        positive, negative,
        seed,
    ):
        if _ENHANCE_FACE is None:
            raise RuntimeError(
                "ComfyUI-Impact-Pack is required for Face Detailer logic. "
                "Please install/enable ComfyUI-Impact-Pack."
            ) from _IMPORT_ERR

        bbox_detector = _get_embedded_detector()
        sam_model_opt = _get_embedded_sam()

        enhance = _ENHANCE_FACE

        B = image.shape[0] if (hasattr(image, "shape") and image.ndim == 4) else 1

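        # Single images take a one-call fast path; batches are processed image-by-image below.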
        with torch.inference_mode():
            if B == 1:
                single = image[0] if image.ndim == 4 else image
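                # Hardcoded FaceDetailer settings (guide/max size, steps, cfg, scheduler, denoise,
                # feather, bbox/SAM options, drop size); only the inputs wired above are user-controlled.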
                enhanced_img, _, _, _, _ = enhance(
                    single.unsqueeze(0),
                    model, clip, vae,
                    512, True, 1024,
                    seed, 38, 7.0,
                    sampler_name, "simple",
                    positive, negative,
                    0.4, 5, True, True,
                    0.5, 10, 3.0,
                    "center-1", 0, 0.93, 0,
                    0.7, "False",
                    10, bbox_detector,
                    segm_detector=None, sam_model_opt=sam_model_opt,
                    wildcard_opt="", detailer_hook=None,
                    refiner_ratio=None, refiner_model=None, refiner_clip=None,
                    refiner_positive=None, refiner_negative=None,
                    cycle=1, inpaint_model=False,
                    noise_mask_feather=20,
                    scheduler_func_opt=None,
                    tiled_encode=False, tiled_decode=False,
                )
                return (enhanced_img,)

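            # Batch path: process each image separately with a per-image seed (seed + i).
            # Note: a few hardcoded values here (30 and 0.5) differ slightly from the single-image path (38 and 0.4).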
            out_imgs = []
            for i, single in enumerate(image.unbind(0)):
                enhanced_img, _, _, _, _ = enhance(
                    single.unsqueeze(0),
                    model, clip, vae,
                    512, True, 1024,
                    seed + i, 30, 7.0,
                    sampler_name, "simple",
                    positive, negative,
                    0.5, 5, True, True,
                    0.5, 10, 3.0,
                    "center-1", 0, 0.93, 0,
                    0.7, "False",
                    10, bbox_detector,
                    segm_detector=None, sam_model_opt=sam_model_opt,
                    wildcard_opt="", detailer_hook=None,
                    refiner_ratio=None, refiner_model=None, refiner_clip=None,
                    refiner_positive=None, refiner_negative=None,
                    cycle=1, inpaint_model=False,
                    noise_mask_feather=20,
                    scheduler_func_opt=None,
                    tiled_encode=False, tiled_decode=False,
                )
                out_imgs.append(enhanced_img)

            return (torch.cat(out_imgs, dim=0),)


NODE_CLASS_MAPPINGS = {
    "dn_04": dn_04,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "dn_04": "dn_04",
}