# File size: 24,644 Bytes
#
# b4760b6

import cv2
import numpy as np
import torch
import logging
import os
from PIL import Image
from typing import Tuple, List, Optional

try:
    from comfy_api.v0_0_3_io import (
        ComfyNode, Schema, InputBehavior, NumberDisplay,
        IntegerInput, MaskInput, ImageInput, ImageOutput, ComboInput, CustomInput,
        IntegerOutput, NodeOutput,
    )
    COMFY_V3_AVAILABLE = True
except ImportError:
    # Mock classes for v1/v2 compatibility
    ComfyNode = object
    Schema = None
    InputBehavior = None
    NumberDisplay = None
    ImageInput = None
    ImageOutput = None
    ComboInput = None
    CustomInput = None
    IntegerInput = None
    NodeOutput = None
    COMFY_V3_AVAILABLE = False

# Verbosity comes from COMFYUI_FACE_DETECTION_LOG_LEVEL (case-insensitive);
# a name that is not an attribute of the logging module falls back to INFO.
log_level = os.environ.get('COMFYUI_FACE_DETECTION_LOG_LEVEL', 'INFO').upper()
_resolved_level = getattr(logging, log_level, logging.INFO)
logging.basicConfig(level=_resolved_level)
logger = logging.getLogger(__name__)

if COMFY_V3_AVAILABLE:
    class FaceDetectionNode(ComfyNode):
        @classmethod
        def DEFINE_SCHEMA(cls):
            """Build the v3 schema: seven inputs and one image output."""
            slider = NumberDisplay.slider
            optional = InputBehavior.optional

            image_in = ImageInput("image", display_name="Input Image")
            threshold_in = CustomInput(
                "detection_threshold",
                io_type="FLOAT",
                min=0.1,
                max=1.0,
                default=0.8,
                tooltip="Confidence threshold for face detection",
                display_mode=slider,
            )
            min_size_in = IntegerInput(
                "min_face_size",
                display_name="Min Face Size",
                min=32,
                max=512,
                default=64,
                tooltip="Minimum size for detected faces",
                display_mode=slider,
            )
            padding_in = IntegerInput(
                "padding",
                display_name="Padding",
                min=0,
                max=256,
                default=32,
                tooltip="Padding around detected faces",
                display_mode=slider,
            )
            mode_in = ComboInput(
                "output_mode",
                options=["largest_face", "all_faces"],
                tooltip="Output mode for detected faces",
            )
            # The two trailing combos are declared optional.
            format_in = ComboInput(
                "face_output_format",
                options=["strip", "individual"],
                tooltip="Format for multiple faces: strip (horizontal layout) or individual (separate batch items). Only applies when output_mode='all_faces'. Max size: 512px.",
                behavior=optional,
            )
            classifier_in = ComboInput(
                "classifier_type",
                options=["default", "alternative"],
                behavior=optional,
            )

            faces_out = ImageOutput(
                "cropped_faces",
                display_name="Cropped Faces",
                tooltip="Detected and cropped faces",
            )

            return Schema(
                node_id="FaceDetectionNode",
                display_name="Face Detection and Crop",
                description="Detect and crop faces from images using Haar cascades.",
                category="image/processing",
                inputs=[
                    image_in,
                    threshold_in,
                    min_size_in,
                    padding_in,
                    mode_in,
                    format_in,
                    classifier_in,
                ],
                outputs=[faces_out],
                is_output_node=False,
            )

        @staticmethod
        def _get_cascade_classifiers():
            """Load the default and alternative frontal-face Haar cascades.

            Returns:
                Tuple ``(default_cascade, alternative_cascade)``. An entry is
                None when its XML file is missing or the loaded classifier is
                empty; both are None if any exception is raised while loading.
            """
            default_cascade = alternative_cascade = None

            try:
                # Default cascade: problems are reported at error level.
                default_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
                if not os.path.exists(default_path):
                    logger.error(f"Default cascade file not found: {default_path}")
                else:
                    candidate = cv2.CascadeClassifier(default_path)
                    if candidate.empty():
                        logger.error(f"Failed to load cascade from {default_path}")
                    else:
                        default_cascade = candidate

                # Alternative cascade: problems are only warnings.
                alt_path = cv2.data.haarcascades + 'haarcascade_frontalface_alt.xml'
                if not os.path.exists(alt_path):
                    logger.warning(f"Alternative cascade file not found: {alt_path}")
                else:
                    candidate = cv2.CascadeClassifier(alt_path)
                    if candidate.empty():
                        logger.warning(f"Failed to load alternative cascade from {alt_path}")
                    else:
                        alternative_cascade = candidate

            except Exception as e:
                logger.error(f"Error initializing cascade classifiers: {str(e)}")
                return None, None

            return default_cascade, alternative_cascade

        @staticmethod
        def add_padding(image: np.ndarray, face_rect: Tuple[int, int, int, int], padding: int) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
            """Crop ``face_rect`` grown by ``padding`` pixels, clamped to the image.

            Returns:
                The cropped region (a slice of ``image``) and the clamped
                rectangle as ``(x, y, width, height)``.
            """
            left, top, box_w, box_h = face_rect
            img_h, img_w = image.shape[:2]

            # Grow the box on every side, never leaving the image area.
            x1, y1 = max(0, left - padding), max(0, top - padding)
            x2 = min(img_w, left + box_w + padding)
            y2 = min(img_h, top + box_h + padding)

            return image[y1:y2, x1:x2], (x1, y1, x2 - x1, y2 - y1)

        @staticmethod
        def _process_individual_faces(cropped_faces: List[np.ndarray]) -> torch.Tensor:
            """
            Resize face crops to one common size and stack them into a batch.

            Args:
                cropped_faces: Non-empty list of face images as numpy arrays,
                    either H x W or H x W x C with 1, 3 or 4 channels.

            Returns:
                Float tensor with shape [N, H, W, 3] where N is the number of
                faces; pixel values are divided by 255.

            Raises:
                ValueError: If the list is empty or the channel count cannot
                    be converted to 3.

            Note:
                - The common size is the largest crop height and the largest
                  crop width, each capped at 512px to bound memory usage.
                - Every face is stretched to that common size, so individual
                  aspect ratios are NOT preserved.
            """
            if not cropped_faces:
                raise ValueError("No faces to process")

            max_height = min(512, max(face.shape[0] for face in cropped_faces))
            max_width = min(512, max(face.shape[1] for face in cropped_faces))

            # cv2.resize takes (width, height)
            target_size = (max_width, max_height)
            resized_faces = []
            for face in cropped_faces:
                resized = cv2.resize(face, target_size)
                if resized.ndim == 2:
                    # cv2.resize returns a 2-D array for single-channel input;
                    # restore the channel axis so the batch is always 4-D.
                    resized = resized[:, :, np.newaxis]
                resized_faces.append(resized)

            # Stack faces along a new batch dimension -> [N, H, W, C]
            result_batch = np.stack(resized_faces, axis=0)

            channels = result_batch.shape[3]
            if channels == 1:
                result_batch = np.repeat(result_batch, 3, axis=3)
            elif channels == 4:
                # Drop the fourth (alpha) channel
                result_batch = result_batch[:, :, :, :3]
            elif channels != 3:
                # Explicit raise instead of assert so the check survives python -O
                raise ValueError(f"Output must have 3 channels, got {channels}")

            return torch.from_numpy(np.ascontiguousarray(result_batch)).float() / 255.0

        @classmethod
        async def execute(cls, image: torch.Tensor, detection_threshold: float, min_face_size: int,
                         padding: int, output_mode: str, face_output_format: str = "strip",
                         classifier_type: str = "default", mask: torch.Tensor = None) -> NodeOutput:
            """Detect faces in one image and return the padded crops.

            Args:
                image: A torch tensor shaped [H, W, C] or [B, H, W, C], or an
                    [H, W, 3] numpy array. For a batched tensor only the first
                    image is processed (a warning is logged).
                detection_threshold: Accepted for interface compatibility; the
                    detection below does not read it.
                min_face_size: Side length used for the detector's ``minSize``.
                padding: Margin in pixels added around each detected face,
                    clamped to the image bounds.
                output_mode: "largest_face" keeps only the crop with the
                    largest area; otherwise every detected face is kept.
                face_output_format: With output_mode "all_faces" and more than
                    one face, "individual" returns one batch item per face;
                    otherwise multiple faces are joined side by side.
                classifier_type: "alternative" selects the alternative cascade
                    (falling back to the default one if it is unavailable);
                    any other value selects the default cascade.
                mask: Accepted but not used.

            Returns:
                NodeOutput whose ``cropped_faces`` is a float tensor shaped
                [B, H, W, 3]. An all-zero [1, 512, 512, 3] tensor is returned
                when no cascade is available, detection raises, or no face
                is found.

            Raises:
                ValueError: If the input is not a 3-D/4-D tensor or 3-D array,
                    or its channel count cannot be converted to 3.
            """
            def _empty_output() -> NodeOutput:
                # Placeholder used whenever no crop can be produced.
                return NodeOutput(cropped_faces=torch.zeros((1, 512, 512, 3)))

            default_cascade, alternative_cascade = cls._get_cascade_classifiers()

            # Convert input to an [H, W, 3] uint8 numpy array for OpenCV
            if isinstance(image, torch.Tensor):
                logger.debug(f"Processing tensor - Shape: {image.shape}, Type: {image.dtype}")

                if image.dim() == 3:
                    image = image.unsqueeze(0)
                elif image.dim() != 4:
                    raise ValueError(f"Expected 3D or 4D tensor, got shape: {image.shape}")

                batch_size, _, _, channels = image.shape
                if batch_size > 1:
                    # Make the truncation visible instead of dropping images silently.
                    logger.warning(f"Received a batch of {batch_size} images; only the first one is processed")

                if channels == 1:
                    image = image.repeat(1, 1, 1, 3)  # Grayscale to RGB
                elif channels >= 4:
                    if channels > 4:
                        logger.warning(f"Input has {channels} channels, using first 3")
                    image = image[:, :, :, :3]  # Keep the first three channels
                elif channels != 3:
                    raise ValueError(f"Cannot handle {channels} channels")

                # detach()/float() keep .numpy() working for tensors that
                # track gradients or use dtypes numpy cannot represent.
                image_np = image[0].detach().cpu().float().numpy()
                # Values up to 1.0 are treated as normalized and scaled to 0..255.
                scale = 255.0 if image_np.max() <= 1.0 else 1.0
                # Clip before the cast: out-of-range values would wrap in uint8.
                image_np = np.clip(image_np * scale, 0, 255).astype(np.uint8)
            else:
                # Already numpy array
                image_np = image

            if not isinstance(image_np, np.ndarray) or image_np.ndim != 3:
                raise ValueError(f"Expected 3D numpy array, got {type(image_np)} with shape {getattr(image_np, 'shape', 'unknown')}")

            if image_np.shape[2] != 3:
                raise ValueError(f"Expected RGB image (3 channels), got {image_np.shape[2]} channels")

            # The cascade detector runs on a grayscale copy
            gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

            # Select the cascade, falling back from alternative to default
            face_cascade = default_cascade
            if classifier_type == "alternative":
                if alternative_cascade is not None:
                    face_cascade = alternative_cascade
                else:
                    logger.warning("Alternative Haar cascade not available, falling back to default")
                    if default_cascade is None:
                        logger.error("No cascade classifiers available")
                        return _empty_output()
            elif default_cascade is None:
                logger.error("Default Haar cascade not available")
                return _empty_output()

            try:
                faces = face_cascade.detectMultiScale(
                    gray,
                    scaleFactor=1.1,
                    minNeighbors=5,
                    minSize=(min_face_size, min_face_size)
                )
            except Exception as e:
                # Best effort: a detector failure yields the placeholder output.
                logger.error(f"Face detection failed: {str(e)}")
                return _empty_output()

            if len(faces) == 0:
                logger.warning("No faces detected in image")
                return _empty_output()

            cropped_faces = [
                cls.add_padding(image_np, (x, y, w, h), padding)[0]
                for x, y, w, h in faces
            ]

            if output_mode == "largest_face":
                cropped_faces = [max(cropped_faces, key=lambda crop: crop.shape[0] * crop.shape[1])]

            # face_output_format only applies to output_mode="all_faces" with multiple faces
            if output_mode == "all_faces" and len(cropped_faces) > 1 and face_output_format == "individual":
                return NodeOutput(cropped_faces=cls._process_individual_faces(cropped_faces))

            if len(cropped_faces) > 1:
                # Strip format: same height for every face, width follows the aspect ratio
                strip_height = min(512, max(face.shape[0] for face in cropped_faces))
                resized_faces = []
                for face in cropped_faces:
                    aspect_ratio = face.shape[1] / face.shape[0]
                    # Never ask cv2.resize for a zero-pixel width
                    new_width = max(1, int(strip_height * aspect_ratio))
                    resized_faces.append(cv2.resize(face, (new_width, strip_height)))
                result = np.hstack(resized_faces)
            else:
                result = cropped_faces[0]

            # Crops are slices of the validated 3-channel image_np, so no
            # channel fix-up is needed; make the slice contiguous for torch.
            result = torch.from_numpy(np.ascontiguousarray(result)).float() / 255.0

            # Add batch dimension -> [1, H, W, 3]
            return NodeOutput(cropped_faces=result.unsqueeze(0))

        @classmethod
        def IS_CHANGED(cls, **kwargs):
            """Return False for every call; the keyword arguments are ignored.

            NOTE(review): presumably the hook ComfyUI consults when deciding
            whether a cached result can be reused — confirm against its docs.
            """
            return False

# Backward compatibility wrapper for v1/v2
class FaceDetectionNodeV1:
    """Backward compatibility wrapper for ComfyUI v1/v2"""
    
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "image": ("IMAGE",),
                "detection_threshold": ("FLOAT", {
                    "default": 0.8,
                    "min": 0.1,
                    "max": 1.0,
                    "step": 0.1
                }),
                "min_face_size": ("INT", {
                    "default": 64,
                    "min": 32,
                    "max": 512,
                    "step": 8
                }),
                "padding": ("INT", {
                    "default": 32,
                    "min": 0,
                    "max": 256,
                    "step": 8
                }),
                "output_mode": (["largest_face", "all_faces"],),
            },
            "optional": {
                "face_output_format": (["strip", "individual"], {"default": "strip"}),
                "classifier_type": (["default", "alternative"], {"default": "default"}),
            }
        }

    # Node metadata for the v1/v2 interface.
    # NOTE(review): presumably read by ComfyUI when the node is registered —
    # confirm against the ComfyUI custom-node documentation.
    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("Cropped Faces",)
    # Same name as the detect_and_crop_faces method defined on this class.
    FUNCTION = "detect_and_crop_faces"
    CATEGORY = "image/processing"

    def __init__(self):
        self.default_cascade = None
        self.alternative_cascade = None
        
        try:
            # Default Haar cascade - most commonly used and well-tested
            default_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            if os.path.exists(default_path):
                self.default_cascade = cv2.CascadeClassifier(default_path)
                if self.default_cascade.empty():
                    logger.error(f"Failed to load cascade from {default_path}")
                    self.default_cascade = None
            else:
                logger.error(f"Default cascade file not found: {default_path}")
            
            # Alternative Haar cascade - different training, may detect faces missed by default
            alt_path = cv2.data.haarcascades + 'haarcascade_frontalface_alt.xml'
            if os.path.exists(alt_path):
                self.alternative_cascade = cv2.CascadeClassifier(alt_path)
                if self.alternative_cascade.empty():
                    logger.warning(f"Failed to load alternative cascade from {alt_path}")
                    self.alternative_cascade = None
            else:
                logger.warning(f"Alternative cascade file not found: {alt_path}")
                
        except Exception as e:
            logger.error(f"Error initializing cascade classifiers: {str(e)}")
            self.default_cascade = None
            self.alternative_cascade = None

    def add_padding(self, image: np.ndarray, face_rect: Tuple[int, int, int, int], padding: int) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
        """Add padding around detected face and handle boundaries"""
        x, y, w, h = face_rect
        height, width = image.shape[:2]
        
        # Calculate padded coordinates
        x1 = max(0, x - padding)
        y1 = max(0, y - padding)
        x2 = min(width, x + w + padding)
        y2 = min(height, y + h + padding)
        
        return image[y1:y2, x1:x2], (x1, y1, x2-x1, y2-y1)

    def _process_individual_faces(self, cropped_faces: List[np.ndarray]) -> torch.Tensor:
        """
        Process multiple faces into individual batch items with consistent dimensions.
        Shared logic between v1/v2 and v3 implementations.
        """
        if COMFY_V3_AVAILABLE:
            # Use the static method from the v3 class
            return FaceDetectionNode._process_individual_faces(cropped_faces)
        else:
            # Fallback implementation for v1/v2 only environments
            max_height = min(512, max(face.shape[0] for face in cropped_faces))
            max_width = min(512, max(face.shape[1] for face in cropped_faces))
            
            target_size = (max_width, max_height)
            resized_faces = []
            for face in cropped_faces:
                resized = cv2.resize(face, target_size)
                resized_faces.append(resized)
            
            result_batch = np.stack(resized_faces, axis=0)
            
            if result_batch.shape[3] == 1:
                result_batch = np.repeat(result_batch, 3, axis=3)
            elif result_batch.shape[3] == 4:
                result_batch = result_batch[:, :, :, :3]
            
            result = torch.from_numpy(result_batch).float() / 255.0
            assert result.shape[3] == 3, f"Output must have 3 channels, got {result.shape[3]}"
            
            return result

    def detect_and_crop_faces(self, image, detection_threshold, min_face_size, padding, output_mode, face_output_format="strip", classifier_type="default"):
        """Legacy method for v1/v2 compatibility"""
        
        # Convert input to numpy array for OpenCV processing
        if isinstance(image, torch.Tensor):
            logger.debug(f"Processing tensor - Shape: {image.shape}, Type: {image.dtype}")
            
            # Ensure 4D tensor [B, H, W, C] and normalize to RGB
            if len(image.shape) == 3:
                image = image.unsqueeze(0)
            elif len(image.shape) != 4:
                raise ValueError(f"Expected 3D or 4D tensor, got shape: {image.shape}")
            
            B, H, W, C = image.shape
            
            # Handle different channel configurations
            if C == 1:
                image = image.repeat(1, 1, 1, 3)  # Grayscale to RGB
            elif C == 4:
                image = image[:, :, :, :3]  # RGBA to RGB
            elif C > 4:
                logger.warning(f"Input has {C} channels, using first 3")
                image = image[:, :, :, :3]
            elif C != 3:
                raise ValueError(f"Cannot handle {C} channels")
            
            # Single conversion: tensor -> numpy (uint8)
            image_np = image[0].cpu().numpy()
            if image_np.max() <= 1.0:
                image_np = (image_np * 255).astype(np.uint8)
            else:
                image_np = np.clip(image_np, 0, 255).astype(np.uint8)
                
        else:
            # Already numpy array
            image_np = image

        # Validate and ensure RGB format
        if not isinstance(image_np, np.ndarray) or len(image_np.shape) != 3:
            raise ValueError(f"Expected 3D numpy array, got {type(image_np)} with shape {getattr(image_np, 'shape', 'unknown')}")
        
        if image_np.shape[2] != 3:
            raise ValueError(f"Expected RGB image (3 channels), got {image_np.shape[2]} channels")

        # Convert to grayscale for face detection
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        
        # Select appropriate cascade based on classifier_type
        if classifier_type == "alternative":
            if self.alternative_cascade is None:
                logger.warning("Alternative Haar cascade not available, falling back to default")
                if self.default_cascade is None:
                    logger.error("No cascade classifiers available")
                    return (torch.zeros((1, 512, 512, 3)),)
                face_cascade = self.default_cascade
            else:
                face_cascade = self.alternative_cascade
        else:  # default
            if self.default_cascade is None:
                logger.error("Default Haar cascade not available")
                return (torch.zeros((1, 512, 512, 3)),)
            face_cascade = self.default_cascade
        
        try:
            faces = face_cascade.detectMultiScale(
                gray,
                scaleFactor=1.1,
                minNeighbors=5,
                minSize=(min_face_size, min_face_size)
            )
        except Exception as e:
            logger.error(f"Face detection failed: {str(e)}")
            return (torch.zeros((1, 512, 512, 3)),)

        if len(faces) == 0:
            logger.warning("No faces detected in image")
            # Return empty image with correct dimensions [B, H, W, C]
            return (torch.zeros((1, 512, 512, 3)),)

        cropped_faces = []
        for x, y, w, h in faces:
            face_img, _ = self.add_padding(image_np, (x, y, w, h), padding)
            cropped_faces.append(face_img)

        if output_mode == "largest_face":
            largest_face = max(cropped_faces, key=lambda x: x.shape[0] * x.shape[1])
            cropped_faces = [largest_face]

        # Enhanced result handling with support for individual face outputs
        # Note: face_output_format only applies when output_mode="all_faces" with multiple faces
        if output_mode == "all_faces" and len(cropped_faces) > 1 and face_output_format == "individual":
            result = self._process_individual_faces(cropped_faces)
            return (result,)
            
        elif len(cropped_faces) > 1:
            # Original strip format - resize all faces to same height while maintaining aspect ratio
            max_height = min(512, max(face.shape[0] for face in cropped_faces))
            resized_faces = []
            for face in cropped_faces:
                aspect_ratio = face.shape[1] / face.shape[0]
                new_width = int(max_height * aspect_ratio)
                resized = cv2.resize(face, (new_width, max_height))
                resized_faces.append(resized)
            result = np.hstack(resized_faces)
        else:
            result = cropped_faces[0]
        
        # Ensure result has correct channel count
        if result.shape[2] == 1:
            result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
        elif result.shape[2] == 4:
            result = cv2.cvtColor(result, cv2.COLOR_RGBA2RGB)
        
        # Convert back to tensor with proper dimensions [B, H, W, C]
        result = torch.from_numpy(result).float() / 255.0
        result = result.unsqueeze(0)  # Add batch dimension
        
        # Validate output tensor (format: [B, H, W, C])
        assert result.shape[3] == 3, f"Output must have 3 channels, got {result.shape[3]}"
        
        return (result,)

    @classmethod
    def IS_CHANGED(s, **kwargs):
        return False

# Export appropriate node class based on ComfyUI version
# Register the v3 node when the v3 API imported successfully; otherwise fall
# back to the v1/v2 compatibility wrapper under the same node id.
NODE_CLASS_MAPPINGS = {
    "FaceDetectionNode": FaceDetectionNode if COMFY_V3_AVAILABLE else FaceDetectionNodeV1
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "FaceDetectionNode": "Face Detection and Crop"
}