Maxclon
/

Flux_Models

GGUF

Model card Files Files and versions

xet

Community

Maxclon committed on Jan 28, 2025

Commit

7480dd0

1 Parent(s): c4206ff

Update pulidflux.py

Browse files

Files changed (1) hide show

pulidflux.py +148 -15

pulidflux.py CHANGED Viewed

@@ -12,6 +12,8 @@ from insightface.app import FaceAnalysis
 from facexlib.parsing import init_parsing_model
 from facexlib.utils.face_restoration_helper import FaceRestoreHelper
 from .eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
 from .encoders_flux import IDFormer, PerceiverAttentionCA
@@ -24,6 +26,8 @@ else:
     current_paths, _ = folder_paths.folder_names_and_paths["pulid"]
 folder_paths.folder_names_and_paths["pulid"] = (current_paths, folder_paths.supported_pt_extensions)
 class PulidFluxModel(nn.Module):
     def __init__(self):
         super().__init__()
@@ -72,7 +76,12 @@ def forward_orig(
     y: Tensor,
     guidance: Tensor = None,
     control=None,
 ) -> Tensor:
     if img.ndim != 3 or txt.ndim != 3:
         raise ValueError("Input img and txt tensors must have 3 dimensions.")
@@ -91,8 +100,32 @@ def forward_orig(
     pe = self.pe_embedder(ids)
     ca_idx = 0
     for i, block in enumerate(self.double_blocks):
-        img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
         if control is not None: # Controlnet
             control_i = control.get("input")
@@ -106,14 +139,34 @@ def forward_orig(
             if i % self.pulid_double_interval == 0:
                 # Will calculate influence of all pulid nodes at once
                 for _, node_data in self.pulid_data.items():
-                    if torch.any((node_data['sigma_start'] >= timesteps) & (timesteps >= node_data['sigma_end'])):
                         img = img + node_data['weight'] * self.pulid_ca[ca_idx](node_data['embedding'], img)
                 ca_idx += 1
     img = torch.cat((txt, img), 1)
     for i, block in enumerate(self.single_blocks):
-        img = block(img, vec=vec, pe=pe)
         if control is not None: # Controlnet
             control_o = control.get("output")
@@ -122,13 +175,20 @@ def forward_orig(
                 if add is not None:
                     img[:, txt.shape[1] :, ...] += add
         # PuLID attention
         if self.pulid_data:
             real_img, txt = img[:, txt.shape[1]:, ...], img[:, :txt.shape[1], ...]
             if i % self.pulid_single_interval == 0:
                 # Will calculate influence of all nodes at once
                 for _, node_data in self.pulid_data.items():
-                    if torch.any((node_data['sigma_start'] >= timesteps) & (timesteps >= node_data['sigma_end'])):
                         real_img = real_img + node_data['weight'] * self.pulid_ca[ca_idx](node_data['embedding'], real_img)
                 ca_idx += 1
             img = torch.cat((txt, real_img), 1)
@@ -148,6 +208,29 @@ def image_to_tensor(image):
     tensor = tensor[..., [2, 1, 0]]
     return tensor
 def to_gray(img):
     x = 0.299 * img[:, 0:1] + 0.587 * img[:, 1:2] + 0.114 * img[:, 2:3]
     x = x.repeat(1, 3, 1, 1)
@@ -227,7 +310,7 @@ class PulidFluxEvaClipLoader:
 class ApplyPulidFlux:
     @classmethod
-    def INPUT_TYPES(s):
         return {
             "required": {
                 "model": ("MODEL", ),
@@ -238,9 +321,15 @@ class ApplyPulidFlux:
                 "weight": ("FLOAT", {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05 }),
                 "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
                 "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
             },
             "optional": {
                 "attn_mask": ("MASK", ),
             },
             "hidden": {
                 "unique_id": "UNIQUE_ID"
@@ -254,15 +343,13 @@ class ApplyPulidFlux:
     def __init__(self):
         self.pulid_data_dict = None
-    def apply_pulid_flux(self, model, pulid_flux, eva_clip, face_analysis, image, weight, start_at, end_at, attn_mask=None, unique_id=None):
         device = comfy.model_management.get_torch_device()
         # Why should I care what args say, when the unet model has a different dtype?!
         # Am I missing something?!
         #dtype = comfy.model_management.unet_dtype()
         dtype = model.model.diffusion_model.dtype
-        # Because of 8bit models we must check what cast type does the unet uses
-        # ZLUDA (Intel, AMD) & GPUs with compute capability < 8.0 don't support bfloat16 etc.
-        # Issue: https://github.com/balazik/ComfyUI-PuLID-Flux/issues/6
         if model.model.manual_cast_dtype is not None:
             dtype = model.model.manual_cast_dtype
@@ -277,6 +364,9 @@ class ApplyPulidFlux:
                 attn_mask = attn_mask.unsqueeze(0)
             attn_mask = attn_mask.to(device, dtype=dtype)
         image = tensor_to_image(image)
         face_helper = FaceRestoreHelper(
@@ -333,7 +423,11 @@ class ApplyPulidFlux:
             bg = sum(parsing_out == i for i in bg_label).bool()
             white_image = torch.ones_like(align_face)
             # Only keep the face features
-            face_features_image = torch.where(bg, white_image, to_gray(align_face))
             # Transform img before sending to eva_clip
             # Apparently MPS only supports NEAREST interpolation?
@@ -359,10 +453,49 @@ class ApplyPulidFlux:
             logging.warning("PuLID warning: No faces detected in any of the given images, returning unmodified model.")
             return (model,)
-        # average embeddings
-        cond = torch.cat(cond).to(device, dtype=dtype)
-        if cond.shape[0] > 1:
-            cond = torch.mean(cond, dim=0, keepdim=True)
         sigma_start = model.get_model_object("model_sampling").percent_to_sigma(start_at)
         sigma_end = model.get_model_object("model_sampling").percent_to_sigma(end_at)

 from facexlib.parsing import init_parsing_model
 from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+import torch.nn.functional as F
 from .eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
 from .encoders_flux import IDFormer, PerceiverAttentionCA
     current_paths, _ = folder_paths.folder_names_and_paths["pulid"]
 folder_paths.folder_names_and_paths["pulid"] = (current_paths, folder_paths.supported_pt_extensions)
+from .online_train2 import online_train
 class PulidFluxModel(nn.Module):
     def __init__(self):
         super().__init__()
     y: Tensor,
     guidance: Tensor = None,
     control=None,
+    transformer_options={},
+    attn_mask: Tensor = None,
+    **kwargs # so it won't break if we add more stuff in the future
 ) -> Tensor:
+    patches_replace = transformer_options.get("patches_replace", {})
     if img.ndim != 3 or txt.ndim != 3:
         raise ValueError("Input img and txt tensors must have 3 dimensions.")
     pe = self.pe_embedder(ids)
     ca_idx = 0
+    blocks_replace = patches_replace.get("dit", {})
     for i, block in enumerate(self.double_blocks):
+        if ("double_block", i) in blocks_replace:
+            def block_wrap(args):
+                out = {}
+                out["img"], out["txt"] = block(img=args["img"],
+                                               txt=args["txt"],
+                                               vec=args["vec"],
+                                               pe=args["pe"],
+                                               attn_mask=args.get("attn_mask"))
+                return out
+            out = blocks_replace[("double_block", i)]({"img": img,
+                                                       "txt": txt,
+                                                       "vec": vec,
+                                                       "pe": pe,
+                                                       "attn_mask": attn_mask},
+                                                      {"original_block": block_wrap})
+            txt = out["txt"]
+            img = out["img"]
+        else:
+            img, txt = block(img=img,
+                             txt=txt,
+                             vec=vec,
+                             pe=pe,
+                             attn_mask=attn_mask)
         if control is not None: # Controlnet
             control_i = control.get("input")
             if i % self.pulid_double_interval == 0:
                 # Will calculate influence of all pulid nodes at once
                 for _, node_data in self.pulid_data.items():
+                    condition_start = node_data['sigma_start'] >= timesteps
+                    condition_end = timesteps >= node_data['sigma_end']
+                    condition = torch.logical_and(
+                        condition_start, condition_end).all()
+                    if condition:
                         img = img + node_data['weight'] * self.pulid_ca[ca_idx](node_data['embedding'], img)
                 ca_idx += 1
     img = torch.cat((txt, img), 1)
     for i, block in enumerate(self.single_blocks):
+        if ("single_block", i) in blocks_replace:
+            def block_wrap(args):
+                out = {}
+                out["img"] = block(args["img"],
+                                   vec=args["vec"],
+                                   pe=args["pe"],
+                                   attn_mask=args.get("attn_mask"))
+                return out
+            out = blocks_replace[("single_block", i)]({"img": img,
+                                                       "vec": vec,
+                                                       "pe": pe,
+                                                       "attn_mask": attn_mask},
+                                                      {"original_block": block_wrap})
+            img = out["img"]
+        else:
+            img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
         if control is not None: # Controlnet
             control_o = control.get("output")
                 if add is not None:
                     img[:, txt.shape[1] :, ...] += add
         # PuLID attention
         if self.pulid_data:
             real_img, txt = img[:, txt.shape[1]:, ...], img[:, :txt.shape[1], ...]
             if i % self.pulid_single_interval == 0:
                 # Will calculate influence of all nodes at once
                 for _, node_data in self.pulid_data.items():
+                    condition_start = node_data['sigma_start'] >= timesteps
+                    condition_end = timesteps >= node_data['sigma_end']
+                    # Combine conditions and reduce to a single boolean
+                    condition = torch.logical_and(condition_start, condition_end).all()
+                    if condition:
                         real_img = real_img + node_data['weight'] * self.pulid_ca[ca_idx](node_data['embedding'], real_img)
                 ca_idx += 1
             img = torch.cat((txt, real_img), 1)
     tensor = tensor[..., [2, 1, 0]]
     return tensor
def resize_with_pad(img, target_size):  # image: 1, h, w, 3
    """Fit an image inside ``target_size`` and zero-pad it to exactly that size.

    The aspect ratio is preserved: the image is scaled (bicubic) so that it
    fits entirely inside the target canvas, then centred on a canvas of zeros.

    Args:
        img: Tensor laid out as (batch, height, width, channels).
        target_size: ``(H, W)`` pair giving the output height and width.

    Returns:
        Tensor of shape (batch, H, W, channels).
    """
    # F.interpolate / F.pad operate on channels-first tensors.
    img = img.permute(0, 3, 1, 2)
    H, W = target_size
    h, w = img.shape[2], img.shape[3]

    # Pick the limiting axis with exact integer arithmetic. Comparing
    # H / h <= W / w as H * w <= W * h avoids float rounding, which could
    # otherwise leave the limiting axis one pixel short of the target.
    # The other axis is floored as before but never allowed to reach 0,
    # since F.interpolate rejects empty output sizes (extreme aspect ratios).
    if H * w <= W * h:
        new_h = H
        new_w = max(1, min(w * H // h, W))
    else:
        new_w = W
        new_h = max(1, min(h * W // w, H))

    # NOTE: bicubic interpolation can overshoot the input value range; the
    # result is not clamped here.
    img = F.interpolate(img, size=(new_h, new_w), mode='bicubic', align_corners=False)

    # Centre the resized image; any odd leftover pixel goes to bottom/right.
    pad_top = (H - new_h) // 2
    pad_bottom = (H - new_h) - pad_top
    pad_left = (W - new_w) // 2
    pad_right = (W - new_w) - pad_left
    img = F.pad(img, pad=(pad_left, pad_right, pad_top, pad_bottom), mode='constant', value=0)

    # Back to channels-last.
    return img.permute(0, 2, 3, 1)
 def to_gray(img):
     x = 0.299 * img[:, 0:1] + 0.587 * img[:, 1:2] + 0.114 * img[:, 2:3]
     x = x.repeat(1, 3, 1, 1)
 class ApplyPulidFlux:
     @classmethod
+    def INPUT_TYPES(s):
         return {
             "required": {
                 "model": ("MODEL", ),
                 "weight": ("FLOAT", {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05 }),
                 "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
                 "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
+                "fusion": (["mean","concat","max","norm_id","max_token","auto_weight","train_weight"],),
+                "fusion_weight_max": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 20.0, "step": 0.1 }),
+                "fusion_weight_min": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 20.0, "step": 0.1 }),
+                "train_step": ("INT", {"default": 1000, "min": 0, "max": 20000, "step": 1 }),
+                "use_gray": ("BOOLEAN", {"default": True, "label_on": "enabled", "label_off": "disabled"}),
             },
             "optional": {
                 "attn_mask": ("MASK", ),
+                "prior_image": ("IMAGE",), # for train weight, as the target
             },
             "hidden": {
                 "unique_id": "UNIQUE_ID"
     def __init__(self):
         self.pulid_data_dict = None
+    def apply_pulid_flux(self, model, pulid_flux, eva_clip, face_analysis, image, weight, start_at, end_at, prior_image=None,fusion="mean", fusion_weight_max=1.0, fusion_weight_min=0.0, train_step=1000, use_gray=True, attn_mask=None, unique_id=None):
         device = comfy.model_management.get_torch_device()
         # Why should I care what args say, when the unet model has a different dtype?!
         # Am I missing something?!
         #dtype = comfy.model_management.unet_dtype()
         dtype = model.model.diffusion_model.dtype
+        # For 8bit use bfloat16 (because ufunc_add_CUDA is not implemented)
         if model.model.manual_cast_dtype is not None:
             dtype = model.model.manual_cast_dtype
                 attn_mask = attn_mask.unsqueeze(0)
             attn_mask = attn_mask.to(device, dtype=dtype)
+        if prior_image is not None:
+            prior_image = resize_with_pad(prior_image.to(image.device, dtype=image.dtype), target_size=(image.shape[1], image.shape[2]))
+            image=torch.cat((prior_image,image),dim=0)
         image = tensor_to_image(image)
         face_helper = FaceRestoreHelper(
             bg = sum(parsing_out == i for i in bg_label).bool()
             white_image = torch.ones_like(align_face)
             # Only keep the face features
+            if use_gray:
+                _align_face = to_gray(align_face)
+            else:
+                _align_face = align_face
+            face_features_image = torch.where(bg, white_image, _align_face)
             # Transform img before sending to eva_clip
             # Apparently MPS only supports NEAREST interpolation?
             logging.warning("PuLID warning: No faces detected in any of the given images, returning unmodified model.")
             return (model,)
+        # fusion embeddings
+        if fusion == "mean":
+            cond = torch.cat(cond).to(device, dtype=dtype) # N,32,2048
+            if cond.shape[0] > 1:
+                cond = torch.mean(cond, dim=0, keepdim=True)
+        elif fusion == "concat":
+            cond = torch.cat(cond, dim=1).to(device, dtype=dtype)
+        elif fusion == "max":
+            cond = torch.cat(cond).to(device, dtype=dtype)
+            if cond.shape[0] > 1:
+                cond = torch.max(cond, dim=0, keepdim=True)[0]
+        elif fusion == "norm_id":
+            cond = torch.cat(cond).to(device, dtype=dtype)
+            if cond.shape[0] > 1:
+                norm=torch.norm(cond,dim=(1,2))
+                norm=norm/torch.sum(norm)
+                cond=torch.einsum("wij,w->ij",cond,norm).unsqueeze(0)
+        elif fusion == "max_token":
+            cond = torch.cat(cond).to(device, dtype=dtype)
+            if cond.shape[0] > 1:
+                norm=torch.norm(cond,dim=2)
+                _,idx=torch.max(norm,dim=0)
+                cond=torch.stack([cond[j,i] for i,j in enumerate(idx)]).unsqueeze(0)
+        elif fusion == "auto_weight": # 🤔
+            cond = torch.cat(cond).to(device, dtype=dtype)
+            if cond.shape[0] > 1:
+                norm=torch.norm(cond,dim=2)
+                order=torch.argsort(norm,descending=False,dim=0)
+                regular_weight=torch.linspace(fusion_weight_min,fusion_weight_max,norm.shape[0]).to(device, dtype=dtype)
+                _cond=[]
+                for i in range(cond.shape[1]):
+                    o=order[:,i]
+                    _cond.append(torch.einsum('ij,i->j',cond[:,i,:],regular_weight[o]))
+                cond=torch.stack(_cond,dim=0).unsqueeze(0)
+        elif fusion == "train_weight":
+            cond = torch.cat(cond).to(device, dtype=dtype)
+            if cond.shape[0] > 1:
+                if train_step > 0:
+                    with torch.inference_mode(False):
+                        cond = online_train(cond, device=cond.device, step=train_step)
+                else:
+                    cond = torch.mean(cond, dim=0, keepdim=True)
         sigma_start = model.get_model_object("model_sampling").percent_to_sigma(start_at)
         sigma_end = model.get_model_object("model_sampling").percent_to_sigma(end_at)