diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..cf90875ef82db0dd15e8995ee0678cf1084e3c3a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,32 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_i2mv_lora.png filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_i2mv.png filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_t2mv_lora.png filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg filter=lfs diff=lfs merge=lfs -text +comfyui-mvadapter/assets/comfyui_t2mv.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy0.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy1.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy2.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy3.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy4.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/boy5.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl0.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl1.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl2.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl3.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl4.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/girl5.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_L_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_L_Bound.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_L_Loose.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_M_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_M_Bound.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_M_Loose.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_S_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_S_Bound.png filter=lfs diff=lfs merge=lfs -text +comfyui-salia/assets/images/hair_S_Loose.png filter=lfs diff=lfs merge=lfs -text diff --git a/comfyui-mvadapter/.github/workflows/publish.yml b/comfyui-mvadapter/.github/workflows/publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..9a3be8b53547fd894cb5f0d4d8b89cb9ca3cb453 --- /dev/null +++ b/comfyui-mvadapter/.github/workflows/publish.yml @@ -0,0 +1,25 @@ +name: Publish to Comfy registry +on: + workflow_dispatch: + push: + branches: + - main + paths: + - "pyproject.toml" + +permissions: + issues: write + +jobs: + publish-node: + name: Publish Custom Node to registry + runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'huanngzh' }} + steps: + - name: Check out code + uses: actions/checkout@v4 + - name: Publish Custom Node + uses: Comfy-Org/publish-node-action@v1 + with: + ## Add your own personal access token to your Github Repository secrets and 
reference it here. + personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} diff --git a/comfyui-mvadapter/BACKUP_nodes.py b/comfyui-mvadapter/BACKUP_nodes.py new file mode 100644 index 0000000000000000000000000000000000000000..c49513d436ab515760ac777e3367d0b9fab2fa79 --- /dev/null +++ b/comfyui-mvadapter/BACKUP_nodes.py @@ -0,0 +1,843 @@ +# Adapted from https://github.com/Limitex/ComfyUI-Diffusers/blob/main/nodes.py +import copy +import os +import torch +from safetensors.torch import load_file +from torchvision import transforms +from .utils import ( + SCHEDULERS, + PIPELINES, + MVADAPTERS, + vae_pt_to_vae_diffuser, + convert_images_to_tensors, + convert_tensors_to_images, + prepare_camera_embed, + preprocess_image, +) +from comfy.model_management import get_torch_device +import folder_paths + +from diffusers import StableDiffusionXLPipeline, AutoencoderKL, ControlNetModel +from transformers import AutoModelForImageSegmentation # <-- restored + +# ADDED: import DPMSolverMultistepScheduler for DPM++ Karras +from diffusers import DPMSolverMultistepScheduler + +from .mvadapter.pipelines.pipeline_mvadapter_t2mv_sdxl import MVAdapterT2MVSDXLPipeline +from .mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler + +# ADDED: import your new Karras-enabled shift scheduler (file sits next to scheduling_shift_snr.py) +from .mvadapter.schedulers.ShiftSNRSchedulerKarras import ShiftSNRSchedulerKarras + + + +class DiffusersMVPipelineLoader: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "ckpt_name": ( + "STRING", + {"default": "stabilityai/stable-diffusion-xl-base-1.0"}, + ), + "pipeline_name": ( + list(PIPELINES.keys()), + {"default": "MVAdapterT2MVSDXLPipeline"}, + ), + } + } + + RETURN_TYPES = ( + "PIPELINE", + "AUTOENCODER", + "SCHEDULER", + ) + + FUNCTION = "create_pipeline" + + CATEGORY = "MV-Adapter" + + def create_pipeline(self, ckpt_name, pipeline_name): + pipeline_class = PIPELINES[pipeline_name] + pipe = pipeline_class.from_pretrained( + pretrained_model_name_or_path=ckpt_name, + torch_dtype=self.dtype, + cache_dir=self.hf_dir, + ) + return (pipe, pipe.vae, pipe.scheduler) + + +class LdmPipelineLoader: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "ckpt_name": (folder_paths.get_filename_list("checkpoints"),), + "pipeline_name": ( + list(PIPELINES.keys()), + {"default": "MVAdapterT2MVSDXLPipeline"}, + ), + } + } + + RETURN_TYPES = ( + "PIPELINE", + "AUTOENCODER", + "SCHEDULER", + ) + + FUNCTION = "create_pipeline" + + CATEGORY = "MV-Adapter" + + def create_pipeline(self, ckpt_name, pipeline_name): + pipeline_class = PIPELINES[pipeline_name] + + pipe = pipeline_class.from_single_file( + pretrained_model_link_or_path=folder_paths.get_full_path( + "checkpoints", ckpt_name + ), + torch_dtype=self.dtype, + cache_dir=self.hf_dir, + ) + + return (pipe, pipe.vae, pipe.scheduler) + + +class DiffusersMVVaeLoader: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "vae_name": ( + "STRING", + {"default": "madebyollin/sdxl-vae-fp16-fix"}, + ), + } + } + + RETURN_TYPES = ("AUTOENCODER",) + + FUNCTION = "create_pipeline" + + CATEGORY = "MV-Adapter" + + def create_pipeline(self, vae_name): + vae = 
AutoencoderKL.from_pretrained( + pretrained_model_name_or_path=vae_name, + torch_dtype=self.dtype, + cache_dir=self.hf_dir, + ) + + return (vae,) + + +class LdmVaeLoader: + def __init__(self): + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "vae_name": (folder_paths.get_filename_list("vae"),), + "upcast_fp32": ("BOOLEAN", {"default": True}), + }, + } + + RETURN_TYPES = ("AUTOENCODER",) + + FUNCTION = "create_pipeline" + + CATEGORY = "MV-Adapter" + + def create_pipeline(self, vae_name, upcast_fp32): + vae = vae_pt_to_vae_diffuser( + folder_paths.get_full_path("vae", vae_name), force_upcast=upcast_fp32 + ).to(self.dtype) + + return (vae,) + + +class DiffusersMVSchedulerLoader: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "scheduler_name": (list(SCHEDULERS.keys()),), + "shift_snr": ("BOOLEAN", {"default": True}), + "shift_mode": ( + list(ShiftSNRScheduler.SHIFT_MODES), + {"default": "interpolated"}, + ), + "shift_scale": ( + "FLOAT", + {"default": 8.0, "min": 0.0, "max": 50.0, "step": 1.0}, + ), + } + } + + RETURN_TYPES = ("SCHEDULER",) + + FUNCTION = "load_scheduler" + + CATEGORY = "MV-Adapter" + + def load_scheduler( + self, pipeline, scheduler_name, shift_snr, shift_mode, shift_scale + ): + scheduler = SCHEDULERS[scheduler_name].from_config( + pipeline.scheduler.config, torch_dtype=self.dtype + ) + if shift_snr: + scheduler = ShiftSNRScheduler.from_scheduler( + scheduler, + shift_mode=shift_mode, + shift_scale=shift_scale, + scheduler_class=scheduler.__class__, + ) + return (scheduler,) + + +# ADDED: Karras version — same inputs/outputs, but always returns a DPM++ (Karras) scheduler. +class DiffusersMVSchedulerLoaderKarras: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "scheduler_name": (list(SCHEDULERS.keys()),), + "shift_snr": ("BOOLEAN", {"default": True}), + "shift_mode": ( + list(ShiftSNRSchedulerKarras.SHIFT_MODES), + {"default": "interpolated"}, + ), + "shift_scale": ( + "FLOAT", + {"default": 8.0, "min": 0.0, "max": 50.0, "step": 1.0}, + ), + } + } + + RETURN_TYPES = ("SCHEDULER",) + + FUNCTION = "load_scheduler" + + CATEGORY = "MV-Adapter" + + def load_scheduler( + self, pipeline, scheduler_name, shift_snr, shift_mode, shift_scale + ): + # Build a base scheduler from the pipeline config (kept for parity with original UI), + # then *replace* it with DPM++ (Karras). If SNR shift is requested, apply via your Karras class. + base_sched = SCHEDULERS[scheduler_name].from_config( + pipeline.scheduler.config, torch_dtype=self.dtype + ) + + # Always use DPM++ Karras: + if shift_snr: + # Apply your Karras-enabled Shift SNR on top, and force DPM++ class to guarantee Karras works. 
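# Assumption: ShiftSNRSchedulerKarras is a project-local class that is not shown in this diff;
# it is expected to mirror ShiftSNRScheduler.from_scheduler while rebuilding the result on the
# scheduler_class passed below, so a DPMSolverMultistepScheduler with Karras sigmas is what
# should ultimately be returned, matching the explicit DPM++ Karras configuration in the else branch.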
+ scheduler = ShiftSNRSchedulerKarras.from_scheduler( + base_sched, + shift_mode=shift_mode, + shift_scale=shift_scale, + scheduler_class=DPMSolverMultistepScheduler, + ) + else: + # No SNR shift requested: just return DPM++ with Karras sigmas + scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, + algorithm_type="dpmsolver++", + use_karras_sigmas=True, + torch_dtype=self.dtype, + ) + + return (scheduler,) + + +class CustomLoraModelLoader: + def __init__(self): + self.loaded_lora = None + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "lora_name": (folder_paths.get_filename_list("loras"),), + "strength_model": ( + "FLOAT", + {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01}, + ), + "enable": ( + "BOOLEAN", + {"default": True}, + ), + "last_lora_node": ( + "BOOLEAN", + {"default": True}, + ), + } + } + + RETURN_TYPES = ("PIPELINE",) + FUNCTION = "load_lora" + CATEGORY = "MV-Adapter" + + def load_lora(self, pipeline, lora_name, strength_model, enable, last_lora_node): + if not hasattr(pipeline, "loaded_loras"): + pipeline.loaded_loras = [] + + lora_path = folder_paths.get_full_path("loras", lora_name) + lora_dir = os.path.dirname(lora_path) + lora_name = os.path.basename(lora_path) + lora = None + if enable: + if self.loaded_lora is not None: + if self.loaded_lora[0] == lora_path: + lora = self.loaded_lora[1] + else: + temp = self.loaded_lora + pipeline.delete_adapters(temp[1]) + pipeline.loaded_loras = [(name, strength) for (name, strength) in pipeline.loaded_loras if name != temp[1]] + self.loaded_lora = None + + if lora is None: + adapter_name = lora_name.rsplit(".", 1)[0] + pipeline.load_lora_weights( + lora_dir, weight_name=lora_name, adapter_name=adapter_name + ) + pipeline.set_adapters(adapter_name, strength_model) + self.loaded_lora = (lora_path, adapter_name) + lora = adapter_name + + pipeline.loaded_loras.append((adapter_name, strength_model)) + else: + # Delete the loaded lora + if self.loaded_lora is not None: + temp = self.loaded_lora + pipeline.delete_adapters(temp[1]) + pipeline.loaded_loras = [(name, strength) for (name, strength) in pipeline.loaded_loras if name != temp[1]] + self.loaded_lora = None + + if last_lora_node: + adapter_names = [x[0] for x in pipeline.loaded_loras] + strengths = [x[1] for x in pipeline.loaded_loras] + pipeline.set_adapters(adapter_names, strengths) + + print(adapter_names) + + return (pipeline,) + + +class ControlNetModelLoader: + def __init__(self): + self.loaded_controlnet = None + self.dtype = torch.float16 + self.torch_device = get_torch_device() + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "controlnet_name": ( + "STRING", + {"default": "xinsir/controlnet-scribble-sdxl-1.0"}, + ), + } + } + + RETURN_TYPES = ("PIPELINE",) + FUNCTION = "load_controlnet" + CATEGORY = "MV-Adapter" + + def load_controlnet(self, pipeline, controlnet_name): + controlnet = None + if self.loaded_controlnet is not None: + if self.loaded_controlnet == controlnet_name: + controlnet = self.loaded_controlnet + else: + del pipeline.controlnet + self.loaded_controlnet = None + + if controlnet is None: + controlnet = ControlNetModel.from_pretrained( + controlnet_name, cache_dir=self.hf_dir, torch_dtype=self.dtype + ) + pipeline.controlnet = controlnet + pipeline.controlnet.to(device=self.torch_device, dtype=self.dtype) + + self.loaded_controlnet = controlnet_name + controlnet = 
controlnet_name + + return (pipeline,) + + +class DiffusersMVModelMakeup: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.torch_device = get_torch_device() + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "scheduler": ("SCHEDULER",), + "autoencoder": ("AUTOENCODER",), + "load_mvadapter": ("BOOLEAN", {"default": True}), + "adapter_path": ("STRING", {"default": "huanngzh/mv-adapter"}), + "adapter_name": ( + MVADAPTERS, + {"default": "mvadapter_t2mv_sdxl.safetensors"}, + ), + "num_views": ("INT", {"default": 6, "min": 1, "max": 12}), + }, + "optional": { + "enable_vae_slicing": ("BOOLEAN", {"default": True}), + "enable_vae_tiling": ("BOOLEAN", {"default": False}), + }, + } + + RETURN_TYPES = ("PIPELINE",) + + FUNCTION = "makeup_pipeline" + + CATEGORY = "MV-Adapter" + + def makeup_pipeline( + self, + pipeline, + scheduler, + autoencoder, + load_mvadapter, + adapter_path, + adapter_name, + num_views, + enable_vae_slicing=True, + enable_vae_tiling=False, + ): + pipeline.vae = autoencoder + pipeline.scheduler = scheduler + + if load_mvadapter: + pipeline.init_custom_adapter(num_views=num_views) + pipeline.load_custom_adapter( + adapter_path, weight_name=adapter_name, cache_dir=self.hf_dir + ) + pipeline.cond_encoder.to(device=self.torch_device, dtype=self.dtype) + + pipeline = pipeline.to(self.torch_device, self.dtype) + + if enable_vae_slicing: + pipeline.enable_vae_slicing() + if enable_vae_tiling: + pipeline.enable_vae_tiling() + + return (pipeline,) + + +class DiffusersSampler: + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "prompt": ( + "STRING", + {"multiline": True, "default": "a photo of a cat"}, + ), + "negative_prompt": ( + "STRING", + { + "multiline": True, + "default": "watermark, ugly, deformed, noisy, blurry, low contrast", + }, + ), + "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "steps": ("INT", {"default": 50, "min": 1, "max": 2000}), + "cfg": ( + "FLOAT", + { + "default": 7.0, + "min": 0.0, + "max": 100.0, + "step": 0.1, + "round": 0.01, + }, + ), + "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}), + } + } + + RETURN_TYPES = ("IMAGE",) + + FUNCTION = "sample" + + CATEGORY = "MV-Adapter" + + def sample( + self, + pipeline, + prompt, + negative_prompt, + height, + width, + steps, + cfg, + seed, + ): + images = pipeline( + prompt=prompt, + height=height, + width=width, + num_inference_steps=steps, + guidance_scale=cfg, + negative_prompt=negative_prompt, + generator=torch.Generator(self.torch_device).manual_seed(seed), + ).images + return (convert_images_to_tensors(images),) + + +class DiffusersMVSampler: + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "num_views": ("INT", {"default": 6, "min": 1, "max": 12}), + "prompt": ( + "STRING", + {"multiline": True, "default": "an astronaut riding a horse"}, + ), + "negative_prompt": ( + "STRING", + { + "multiline": True, + "default": "watermark, ugly, deformed, noisy, blurry, low contrast", + }, + ), + "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "steps": ("INT", {"default": 50, "min": 
1, "max": 2000}), + "cfg": ( + "FLOAT", + { + "default": 7.0, + "min": 0.0, + "max": 100.0, + "step": 0.1, + "round": 0.01, + }, + ), + "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}), + }, + "optional": { + "reference_image": ("IMAGE",), + "controlnet_image": ("IMAGE",), + "controlnet_conditioning_scale": ("FLOAT", {"default": 1.0}), + "azimuth_degrees": ("LIST", {"default": [0, 45, 90, 180, 270, 315]}), + }, + } + + RETURN_TYPES = ("IMAGE",) + + FUNCTION = "sample" + + CATEGORY = "MV-Adapter" + + def sample( + self, + pipeline, + num_views, + prompt, + negative_prompt, + height, + width, + steps, + cfg, + seed, + reference_image=None, + controlnet_image=None, + controlnet_conditioning_scale=1.0, + azimuth_degrees=[0, 45, 90, 180, 270, 315], + ): + num_views = len(azimuth_degrees) + control_images = prepare_camera_embed( + num_views, width, self.torch_device, azimuth_degrees + ) + + pipe_kwargs = {} + if reference_image is not None: + pipe_kwargs.update( + { + "reference_image": convert_tensors_to_images(reference_image)[0], + "reference_conditioning_scale": 1.0, + } + ) + if controlnet_image is not None: + controlnet_image = convert_tensors_to_images(controlnet_image) + pipe_kwargs.update( + { + "controlnet_image": controlnet_image, + "controlnet_conditioning_scale": controlnet_conditioning_scale, + } + ) + + images = pipeline( + prompt=prompt, + height=height, + width=width, + num_inference_steps=steps, + guidance_scale=cfg, + num_images_per_prompt=num_views, + control_image=control_images, + control_conditioning_scale=1.0, + negative_prompt=negative_prompt, + generator=torch.Generator(self.torch_device).manual_seed(seed), + cross_attention_kwargs={"num_views": num_views}, + **pipe_kwargs, + ).images + return (convert_images_to_tensors(images),) + + +class BiRefNet: + def __init__(self): + self.hf_dir = folder_paths.get_folder_paths("diffusers")[0] + self.torch_device = get_torch_device() + self.dtype = torch.float32 + + RETURN_TYPES = ("FUNCTION",) + + FUNCTION = "load_model_fn" + + CATEGORY = "MV-Adapter" + + @classmethod + def INPUT_TYPES(s): + return { + "required": {"ckpt_name": ("STRING", {"default": "briaai/RMBG-2.0"})} + } + + def remove_bg(self, image, net, transform, device): + image_size = image.size + input_images = transform(image).unsqueeze(0).to(device) + with torch.no_grad(): + preds = net(input_images)[-1].sigmoid().cpu() + pred = preds[0].squeeze() + pred_pil = transforms.ToPILImage()(pred) + mask = pred_pil.resize(image_size) + image.putalpha(mask) + return image + + def load_model_fn(self, ckpt_name): + model = AutoModelForImageSegmentation.from_pretrained( + ckpt_name, trust_remote_code=True, cache_dir=self.hf_dir + ).to(self.torch_device, self.dtype) + + transform_image = transforms.Compose( + [ + transforms.Resize((1024, 1024)), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + + remove_bg_fn = lambda x: self.remove_bg( + x, model, transform_image, self.torch_device + ) + return (remove_bg_fn,) + + +class ImagePreprocessor: + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "remove_bg_fn": ("FUNCTION",), + "image": ("IMAGE",), + "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + } + } + + RETURN_TYPES = ("IMAGE",) + + FUNCTION = "process" + + def process(self, remove_bg_fn, image, height, width): + images = 
convert_tensors_to_images(image) + images = [ + preprocess_image(remove_bg_fn(img.convert("RGB")), height, width) + for img in images + ] + + return (convert_images_to_tensors(images),) + + +class ControlImagePreprocessor: + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "front_view": ("IMAGE",), + "front_right_view": ("IMAGE",), + "right_view": ("IMAGE",), + "back_view": ("IMAGE",), + "left_view": ("IMAGE",), + "front_left_view": ("IMAGE",), + "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + } + } + + RETURN_TYPES = ("IMAGE",) + + FUNCTION = "process" + + def process( + self, + front_view, + front_right_view, + right_view, + back_view, + left_view, + front_left_view, + width, + height, + ): + images = torch.cat( + [ + front_view, + front_right_view, + right_view, + back_view, + left_view, + front_left_view, + ], + dim=0, + ) + images = convert_tensors_to_images(images) + images = [img.resize((width, height)).convert("RGB") for img in images] + return (convert_images_to_tensors(images),) + + +class ViewSelector: + def __init__(self): + pass + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "front_view": ("BOOLEAN", {"default": True}), + "front_right_view": ("BOOLEAN", {"default": True}), + "right_view": ("BOOLEAN", {"default": True}), + "back_view": ("BOOLEAN", {"default": True}), + "left_view": ("BOOLEAN", {"default": True}), + "front_left_view": ("BOOLEAN", {"default": True}), + } + } + + RETURN_TYPES = ("LIST",) + FUNCTION = "process" + CATEGORY = "MV-Adapter" + + def process( + self, + front_view, + front_right_view, + right_view, + back_view, + left_view, + front_left_view, + ): + azimuth_deg = [] + if front_view: + azimuth_deg.append(0) + if front_right_view: + azimuth_deg.append(45) + if right_view: + azimuth_deg.append(90) + if back_view: + azimuth_deg.append(180) + if left_view: + azimuth_deg.append(270) + if front_left_view: + azimuth_deg.append(315) + + return (azimuth_deg,) + + +NODE_CLASS_MAPPINGS = { + "LdmPipelineLoader": LdmPipelineLoader, + "LdmVaeLoader": LdmVaeLoader, + "DiffusersMVPipelineLoader": DiffusersMVPipelineLoader, + "DiffusersMVVaeLoader": DiffusersMVVaeLoader, + "DiffusersMVSchedulerLoader": DiffusersMVSchedulerLoader, + # ADDED: Karras version + "DiffusersMVSchedulerLoaderKarras": DiffusersMVSchedulerLoaderKarras, + "DiffusersMVModelMakeup": DiffusersMVModelMakeup, + "CustomLoraModelLoader": CustomLoraModelLoader, + "DiffusersMVSampler": DiffusersMVSampler, + "BiRefNet": BiRefNet, + "ImagePreprocessor": ImagePreprocessor, + "ControlNetModelLoader": ControlNetModelLoader, + "ControlImagePreprocessor": ControlImagePreprocessor, + "ViewSelector": ViewSelector, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "LdmPipelineLoader": "LDM Pipeline Loader", + "LdmVaeLoader": "LDM Vae Loader", + "DiffusersMVPipelineLoader": "Diffusers MV Pipeline Loader", + "DiffusersMVVaeLoader": "Diffusers MV Vae Loader", + "DiffusersMVSchedulerLoader": "Diffusers MV Scheduler Loader", + # ADDED: Karras version + "DiffusersMVSchedulerLoaderKarras": "Diffusers MV Scheduler Loader (Karras)", + "DiffusersMVModelMakeup": "Diffusers MV Model Makeup", + "CustomLoraModelLoader": "Custom Lora Model Loader", + "DiffusersMVSampler": "Diffusers MV Sampler", + "BiRefNet": "BiRefNet", + "ImagePreprocessor": "Image Preprocessor", + "ControlNetModelLoader": "ControlNet Model Loader", + "ControlImagePreprocessor": 
"Control Image Preprocessor", + "ViewSelector": "View Selector", +} diff --git a/comfyui-mvadapter/LICENSE b/comfyui-mvadapter/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..29f81d812f3e768fa89638d1f72920dbfd1413a8 --- /dev/null +++ b/comfyui-mvadapter/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. 
+ + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/comfyui-mvadapter/README.md b/comfyui-mvadapter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a7c621e63d2e615f0669f802a85994b03cbb4a4e --- /dev/null +++ b/comfyui-mvadapter/README.md @@ -0,0 +1,88 @@ +# ComfyUI-MVAdapter + +This extension integrates [MV-Adapter](https://github.com/huanngzh/MV-Adapter) into ComfyUI, allowing users to generate multi-view consistent images from text prompts or single images directly within the ComfyUI interface. + +## 🔥 Feature Updates + +* [2025-06-26] Support multiple LoRAs for multi-view synthesis [See [here](https://github.com/huanngzh/ComfyUI-MVAdapter/pull/96)] +* [2025-01-15] Support selection of generated views, such as generating only 2 views (front & back) [See [here](#view-selection)] +* [2024-12-25] Support integration with ControlNet, for applications like scribble-to-multi-view images [See [here](#with-controlnet)] +* [2024-12-09] Support integration with SDXL LoRA [See [here](#with-lora)] +* [2024-12-02] Generate multi-view consistent images from text prompts or a single image + +## Installation + +### From Source + +* Clone or download this repository into your `ComfyUI/custom_nodes/` directory. +* Install the required dependencies by running `pip install -r requirements.txt`. + +## Notes + +### Workflows + +We provide example workflows in the `workflows` directory. + +Note that our code depends on diffusers and will automatically download the model weights from Hugging Face to the HF cache path the first time it runs. The `ckpt_name` in the node corresponds to the model name on Hugging Face, such as `stabilityai/stable-diffusion-xl-base-1.0`. + +We also provide the `Ldm**Loader` nodes to support loading text-to-image models in `ldm` format. Please see the workflow files with the suffix `_ldm.json`. + +### GPU Memory + +If your GPU resources are limited, we recommend using the following configuration: + +* Use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) as the VAE. If using an ldm-format pipeline, remember to set `upcast_fp32` to `False`. + +![upcast_fp32_to_false](assets/comfyui_ldm_vae.png) + +* Set `enable_vae_slicing` in the Diffusers Model Makeup node to `True`. + +![enable_vae_slicing](assets/comfyui_model_makeup.png) + +However, since SDXL is used as the base model, it still requires about 13 GB to 14 GB of GPU memory.
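As a rough illustration of the low-memory setup described above, here is a minimal plain-diffusers sketch (an assumption-level example, not part of the extension: it assumes a CUDA device and uses only the model IDs already mentioned in this README together with standard diffusers calls):

```python
import torch
from diffusers import AutoencoderKL, StableDiffusionXLPipeline

# fp16-friendly VAE recommended above; avoids keeping the stock SDXL VAE in fp32
vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)

# Base SDXL pipeline in fp16 with the fixed VAE attached
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    vae=vae,
    torch_dtype=torch.float16,
).to("cuda")

# Decode latents in slices rather than all views at once; this is the same
# switch the `enable_vae_slicing` option in the Diffusers Model Makeup node flips
pipe.enable_vae_slicing()
```

The loader and makeup nodes in this extension perform roughly these steps on the MV-Adapter pipelines before sampling.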
+ +## Usage + +### Text to Multi-view Images + +#### With SDXL or other base models + +![comfyui_t2mv](assets/comfyui_t2mv.png) + +* `workflows/t2mv_sdxl_diffusers.json` for loading diffusers-format models +* `workflows/t2mv_sdxl_ldm.json` for loading ldm-format models + +#### With LoRA + +![comfyui_t2mv_lora](assets/comfyui_t2mv_lora.png) + +`workflows/t2mv_sdxl_ldm_lora.json` for loading ldm-format models with LoRA for text-to-multi-view generation + +#### With ControlNet + +![comfyui_t2mv_controlnet](assets/comfyui_t2mv_controlnet.png) + +`workflows/t2mv_sdxl_ldm_controlnet.json` for loading diffusers-format controlnets for text-scribble-to-multi-view generation + +### Image to Multi-view Images + +#### With SDXL or other base models + +![comfyui_i2mv](assets/comfyui_i2mv.png) + +* `workflows/i2mv_sdxl_diffusers.json` for loading diffusers-format models +* `workflows/i2mv_sdxl_ldm.json` for loading ldm-format models + +#### With LoRA + +![comfyui_i2mv_lora](assets/comfyui_i2mv_lora.png) + +`workflows/i2mv_sdxl_ldm_lora.json` for loading ldm-format models with LoRA for image-to-multi-view generation + +#### View Selection + +![comfyui_i2mv_pair_views](assets/comfyui_i2mv_view_selector.png) + +`workflows/i2mv_sdxl_ldm_view_selector.json` for loading ldm-format models and selecting specific views to generate + +The key is to replace the `adapter_name` in `Diffusers Model Makeup` with `mvadapter_i2mv_sdxl_beta.safetensors`, and add a `View Selector` node to choose which views you want to generate. After a rough test, the beta model is better at generating 2 views (front&back), 3 views (front&right&back), 4 views (front&right&back&left). Note that the attribute `num_views` is not used and can be ignored. diff --git a/comfyui-mvadapter/__init__.py b/comfyui-mvadapter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fd1531c4c4c42312e96b74fb3d2c89ab1ff31c37 --- /dev/null +++ b/comfyui-mvadapter/__init__.py @@ -0,0 +1,45 @@ +# __init__.py for comfyui-mvadapter +# Register BOTH node sets: the original nodes.py and nodes_local_mv.py + +import traceback + +# Load the original nodes (if present) +try: + from .nodes import ( + NODE_CLASS_MAPPINGS as CORE_NODE_CLASS_MAPPINGS, + NODE_DISPLAY_NAME_MAPPINGS as CORE_NODE_DISPLAY_NAME_MAPPINGS, + ) +except Exception as e: + print("[comfyui-mvadapter] WARN: Failed to import .nodes") + traceback.print_exc() + CORE_NODE_CLASS_MAPPINGS = {} + CORE_NODE_DISPLAY_NAME_MAPPINGS = {} + +# Load the local-only nodes (if present) +try: + from .nodes_local_mv import ( + NODE_CLASS_MAPPINGS as LOCAL_NODE_CLASS_MAPPINGS, + NODE_DISPLAY_NAME_MAPPINGS as LOCAL_NODE_DISPLAY_NAME_MAPPINGS, + ) +except Exception as e: + print("[comfyui-mvadapter] WARN: Failed to import .nodes_local_mv") + traceback.print_exc() + LOCAL_NODE_CLASS_MAPPINGS = {} + LOCAL_NODE_DISPLAY_NAME_MAPPINGS = {} + +# Merge into the symbols ComfyUI looks for +NODE_CLASS_MAPPINGS = {} +NODE_CLASS_MAPPINGS.update(CORE_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(LOCAL_NODE_CLASS_MAPPINGS) + +NODE_DISPLAY_NAME_MAPPINGS = {} +NODE_DISPLAY_NAME_MAPPINGS.update(CORE_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(LOCAL_NODE_DISPLAY_NAME_MAPPINGS) + +# Optional: quick summary to help debug load order +print( + "[comfyui-mvadapter] Registered nodes:", + ", ".join(sorted(NODE_CLASS_MAPPINGS.keys())) or "(none)", +) + +__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"] diff --git a/comfyui-mvadapter/__pycache__/__init__.cpython-312.pyc 
b/comfyui-mvadapter/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..097e35be2bcf83debb0b6a628faeef2e2d7b9f8c Binary files /dev/null and b/comfyui-mvadapter/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-mvadapter/__pycache__/nodes.cpython-312.pyc b/comfyui-mvadapter/__pycache__/nodes.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4b4b443a5c331cbde47e43ac94c5cd33edfa1a0 Binary files /dev/null and b/comfyui-mvadapter/__pycache__/nodes.cpython-312.pyc differ diff --git a/comfyui-mvadapter/__pycache__/nodes_local_mv.cpython-312.pyc b/comfyui-mvadapter/__pycache__/nodes_local_mv.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35910ae933787ab2ffe8c5d2470b1e3e68770f4d Binary files /dev/null and b/comfyui-mvadapter/__pycache__/nodes_local_mv.cpython-312.pyc differ diff --git a/comfyui-mvadapter/__pycache__/utils.cpython-312.pyc b/comfyui-mvadapter/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..295c6da0dbd21074a9e07550957ef85ec424a66a Binary files /dev/null and b/comfyui-mvadapter/__pycache__/utils.cpython-312.pyc differ diff --git a/comfyui-mvadapter/assets/CustomLoraModelLoader.png b/comfyui-mvadapter/assets/CustomLoraModelLoader.png new file mode 100644 index 0000000000000000000000000000000000000000..e43a05bac7060b145651360540cbabbb6949c4e4 Binary files /dev/null and b/comfyui-mvadapter/assets/CustomLoraModelLoader.png differ diff --git a/comfyui-mvadapter/assets/comfyui_i2mv.png b/comfyui-mvadapter/assets/comfyui_i2mv.png new file mode 100644 index 0000000000000000000000000000000000000000..cc6bccd83881f2a2031f50afb56549839f14418c --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_i2mv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c364ee7e709ced6c9fe32111ed8ef0f6b893410b7165d87fa12dc7ec6c61953 +size 432373 diff --git a/comfyui-mvadapter/assets/comfyui_i2mv_lora.png b/comfyui-mvadapter/assets/comfyui_i2mv_lora.png new file mode 100644 index 0000000000000000000000000000000000000000..23bfb26af02b7b3e199f2c614305e8cd38095209 --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_i2mv_lora.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d037b0b3f026f308e6dacf9261483a8e9e069507ab09cf86ad22fc5fcf2aa49 +size 853152 diff --git a/comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg b/comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f134cda151181576e6473fcc18fe58e4219cd42 --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c901ec52c76dd2e3ee49e121b52a4589ce9e9f9e67edccf297b5028470768b +size 470722 diff --git a/comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png b/comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png new file mode 100644 index 0000000000000000000000000000000000000000..d093954329967e032e5188f831a5597a8a146339 --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a48cde4ec2a44b1a9a29d4b9e1aaaf5a9ae287ef2d5ad4fe5da23e876c76c74 +size 401473 diff --git a/comfyui-mvadapter/assets/comfyui_ldm_vae.png b/comfyui-mvadapter/assets/comfyui_ldm_vae.png new file mode 100644 index 0000000000000000000000000000000000000000..c1d2deb6408928a11b1f783370280d22fad68f2d 
Binary files /dev/null and b/comfyui-mvadapter/assets/comfyui_ldm_vae.png differ diff --git a/comfyui-mvadapter/assets/comfyui_model_makeup.png b/comfyui-mvadapter/assets/comfyui_model_makeup.png new file mode 100644 index 0000000000000000000000000000000000000000..575d330d787f1be5543d56255faf2ec54259b7bd Binary files /dev/null and b/comfyui-mvadapter/assets/comfyui_model_makeup.png differ diff --git a/comfyui-mvadapter/assets/comfyui_t2mv.png b/comfyui-mvadapter/assets/comfyui_t2mv.png new file mode 100644 index 0000000000000000000000000000000000000000..4c90727d402af5d4a4f2d0b72fc7e7ee1a06f84f --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_t2mv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f807f5665dbe404be09ab27214ae3e545160c6f99005f7d309e31af15ed41f +size 311149 diff --git a/comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png b/comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png new file mode 100644 index 0000000000000000000000000000000000000000..5d298e51636db30335217da7fb058cc9a6d58aa7 --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1b1923de261e12963fc5dbdc929e3f4f832aae34cb198beab14748c24758aee +size 425670 diff --git a/comfyui-mvadapter/assets/comfyui_t2mv_lora.png b/comfyui-mvadapter/assets/comfyui_t2mv_lora.png new file mode 100644 index 0000000000000000000000000000000000000000..136803f5cef8fe916b8a5557d6eb3d21feaed341 --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_t2mv_lora.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62293e0d4897848f7b2117d5b18036c9ed82b01eaa7b9e39e55ed33f53ee0ec3 +size 1052074 diff --git a/comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg b/comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f28654d84eb86ec5f4d39c629d4d1dacb0d60e2b --- /dev/null +++ b/comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7436db15d4fb65113bc544eeda0ad9be9ee03cb589959847763eaf85fe93f65e +size 491618 diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_0.png b/comfyui-mvadapter/assets/demo/scribbles/scribble_0.png new file mode 100644 index 0000000000000000000000000000000000000000..c54eb5bc5b5ae312c8bd698faace4304ffc9dd9a Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_0.png differ diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_1.png b/comfyui-mvadapter/assets/demo/scribbles/scribble_1.png new file mode 100644 index 0000000000000000000000000000000000000000..eb1da8c96f4d6d4192b3a3d1b9ef81cce11f3270 Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_1.png differ diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_2.png b/comfyui-mvadapter/assets/demo/scribbles/scribble_2.png new file mode 100644 index 0000000000000000000000000000000000000000..d6a5e101193fcadcfd305be197a46650bd1a0ea3 Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_2.png differ diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_3.png b/comfyui-mvadapter/assets/demo/scribbles/scribble_3.png new file mode 100644 index 0000000000000000000000000000000000000000..a882e58b4b81e9d676d856a7bcbe18283d7a808d Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_3.png differ diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_4.png 
b/comfyui-mvadapter/assets/demo/scribbles/scribble_4.png new file mode 100644 index 0000000000000000000000000000000000000000..901fe4cede0f33c1c8e7f7a0432e9b040e6e44d6 Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_4.png differ diff --git a/comfyui-mvadapter/assets/demo/scribbles/scribble_5.png b/comfyui-mvadapter/assets/demo/scribbles/scribble_5.png new file mode 100644 index 0000000000000000000000000000000000000000..22bf31fb8520c97631caa602c1ff5f4107b941f4 Binary files /dev/null and b/comfyui-mvadapter/assets/demo/scribbles/scribble_5.png differ diff --git a/comfyui-mvadapter/cache/stable-diffusion-v1-inference.yaml b/comfyui-mvadapter/cache/stable-diffusion-v1-inference.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b17e56142bf79f741b34c27d5a6d2c6e0afaf13 --- /dev/null +++ b/comfyui-mvadapter/cache/stable-diffusion-v1-inference.yaml @@ -0,0 +1,70 @@ +model: + base_learning_rate: 1.0e-04 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false # Note: different from the one we trained before + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + + scheduler_config: # 10000 warmup steps + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [ 10000 ] + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1. ] + f_min: [ 1. ] + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder \ No newline at end of file diff --git a/comfyui-mvadapter/mvadapter/__init__.py b/comfyui-mvadapter/mvadapter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/comfyui-mvadapter/mvadapter/__pycache__/__init__.cpython-312.pyc b/comfyui-mvadapter/mvadapter/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b46c24665863e30a10f2f4c9a7657ab86e1a37c Binary files /dev/null and b/comfyui-mvadapter/mvadapter/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/loaders/__init__.py b/comfyui-mvadapter/mvadapter/loaders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7f97d96b94cf3e8ea6fd71c3ec6ad11914c7adc7 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/loaders/__init__.py @@ -0,0 +1 @@ +from .custom_adapter import CustomAdapterMixin diff --git a/comfyui-mvadapter/mvadapter/loaders/__pycache__/__init__.cpython-312.pyc 
b/comfyui-mvadapter/mvadapter/loaders/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e71e1122e98de8f240d522c6bbc7ccb14e047d0 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/loaders/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/loaders/__pycache__/custom_adapter.cpython-312.pyc b/comfyui-mvadapter/mvadapter/loaders/__pycache__/custom_adapter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..294b5ac463de3fef01d7cdcab1bf84e2ec3b747b Binary files /dev/null and b/comfyui-mvadapter/mvadapter/loaders/__pycache__/custom_adapter.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/loaders/custom_adapter.py b/comfyui-mvadapter/mvadapter/loaders/custom_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..d92c12483022d1b32d6670aceca29f55e64b5ece --- /dev/null +++ b/comfyui-mvadapter/mvadapter/loaders/custom_adapter.py @@ -0,0 +1,98 @@ +import os +from typing import Dict, Optional, Union + +import safetensors +import torch +from diffusers.utils import _get_model_file, logging +from safetensors import safe_open + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class CustomAdapterMixin: + def init_custom_adapter(self, *args, **kwargs): + self._init_custom_adapter(*args, **kwargs) + + def _init_custom_adapter(self, *args, **kwargs): + raise NotImplementedError + + def load_custom_adapter( + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + weight_name: str, + subfolder: Optional[str] = None, + **kwargs, + ): + # Load the main state dict first. + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + subfolder=subfolder, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + user_agent=user_agent, + ) + if weight_name.endswith(".safetensors"): + state_dict = {} + with safe_open(model_file, framework="pt", device="cpu") as f: + for key in f.keys(): + state_dict[key] = f.get_tensor(key) + else: + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + self._load_custom_adapter(state_dict) + + def _load_custom_adapter(self, state_dict): + raise NotImplementedError + + def save_custom_adapter( + self, + save_directory: Union[str, os.PathLike], + weight_name: str, + safe_serialization: bool = False, + **kwargs, + ): + if os.path.isfile(save_directory): + logger.error( + f"Provided path ({save_directory}) should be a directory, not a file" + ) + return + + if safe_serialization: + + def save_function(weights, filename): + return safetensors.torch.save_file( + weights, filename, metadata={"format": "pt"} + ) + + else: + save_function = torch.save + + # Save the model + state_dict = self._save_custom_adapter(**kwargs) + save_function(state_dict, os.path.join(save_directory, weight_name)) + logger.info( + f"Custom adapter weights saved in 
{os.path.join(save_directory, weight_name)}" + ) + + def _save_custom_adapter(self): + raise NotImplementedError diff --git a/comfyui-mvadapter/mvadapter/models/__init__.py b/comfyui-mvadapter/mvadapter/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/comfyui-mvadapter/mvadapter/models/__pycache__/__init__.cpython-312.pyc b/comfyui-mvadapter/mvadapter/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f59e6dda9aab26d18362395c4376afd146d9384 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/models/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/models/__pycache__/attention_processor.cpython-312.pyc b/comfyui-mvadapter/mvadapter/models/__pycache__/attention_processor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b060646931d4b16ad105ec8317186a3270fa790b Binary files /dev/null and b/comfyui-mvadapter/mvadapter/models/__pycache__/attention_processor.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/models/attention_processor.py b/comfyui-mvadapter/mvadapter/models/attention_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1ff1026abddbcd8073125091b06a794485625 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/models/attention_processor.py @@ -0,0 +1,377 @@ +import math +from typing import Callable, List, Optional, Union + +import torch +import torch.nn.functional as F +from diffusers.models.attention_processor import Attention +from diffusers.models.unets import UNet2DConditionModel +from diffusers.utils import deprecate, logging +from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available +from einops import rearrange +from torch import nn + + +def default_set_attn_proc_func( + name: str, + hidden_size: int, + cross_attention_dim: Optional[int], + ori_attn_proc: object, +) -> object: + return ori_attn_proc + + +def set_unet_2d_condition_attn_processor( + unet: UNet2DConditionModel, + set_self_attn_proc_func: Callable = default_set_attn_proc_func, + set_cross_attn_proc_func: Callable = default_set_attn_proc_func, + set_custom_attn_proc_func: Callable = default_set_attn_proc_func, + set_self_attn_module_names: Optional[List[str]] = None, + set_cross_attn_module_names: Optional[List[str]] = None, + set_custom_attn_module_names: Optional[List[str]] = None, +) -> None: + do_set_processor = lambda name, module_names: ( + any([name.startswith(module_name) for module_name in module_names]) + if module_names is not None + else True + ) # prefix match + + attn_procs = {} + for name, attn_processor in unet.attn_processors.items(): + # set attn_processor by default, if module_names is None + set_self_attn_processor = do_set_processor(name, set_self_attn_module_names) + set_cross_attn_processor = do_set_processor(name, set_cross_attn_module_names) + set_custom_attn_processor = do_set_processor(name, set_custom_attn_module_names) + + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + + is_custom = "attn_mid_blocks" in name or "attn_post_blocks" in name + if is_custom: + 
attn_procs[name] = ( + set_custom_attn_proc_func(name, hidden_size, None, attn_processor) + if set_custom_attn_processor + else attn_processor + ) + else: + cross_attention_dim = ( + None + if name.endswith("attn1.processor") + else unet.config.cross_attention_dim + ) + if cross_attention_dim is None or "motion_modules" in name: + # self attention + attn_procs[name] = ( + set_self_attn_proc_func( + name, hidden_size, cross_attention_dim, attn_processor + ) + if set_self_attn_processor + else attn_processor + ) + else: + # cross attention + attn_procs[name] = ( + set_cross_attn_proc_func( + name, hidden_size, cross_attention_dim, attn_processor + ) + if set_cross_attn_processor + else attn_processor + ) + + unet.set_attn_processor(attn_procs) + + +class DecoupledMVRowSelfAttnProcessor2_0(torch.nn.Module): + r""" + Attention processor for Decoupled Row-wise Self-Attention and Image Cross-Attention for PyTorch 2.0. + """ + + def __init__( + self, + query_dim: int, + inner_dim: int, + num_views: int = 1, + name: Optional[str] = None, + use_mv: bool = True, + use_ref: bool = False, + ): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "DecoupledMVRowSelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) + + super().__init__() + + self.num_views = num_views + self.name = name # NOTE: need for image cross-attention + self.use_mv = use_mv + self.use_ref = use_ref + + if self.use_mv: + self.to_q_mv = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_k_mv = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_v_mv = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_out_mv = nn.ModuleList( + [ + nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True), + nn.Dropout(0.0), + ] + ) + + if self.use_ref: + self.to_q_ref = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_k_ref = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_v_ref = nn.Linear( + in_features=query_dim, out_features=inner_dim, bias=False + ) + self.to_out_ref = nn.ModuleList( + [ + nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True), + nn.Dropout(0.0), + ] + ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + mv_scale: float = 1.0, + ref_hidden_states: Optional[torch.FloatTensor] = None, + ref_scale: float = 1.0, + cache_hidden_states: Optional[List[torch.FloatTensor]] = None, + use_mv: bool = True, + use_ref: bool = True, + num_views: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + """ + New args: + mv_scale (float): scale for multi-view self-attention. + ref_hidden_states (torch.FloatTensor): reference encoder hidden states for image cross-attention. + ref_scale (float): scale for image cross-attention. + cache_hidden_states (List[torch.FloatTensor]): cache hidden states from reference unet. + + """ + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+ deprecate("scale", "1.0.0", deprecation_message) + + if num_views is not None: + self.num_views = num_views + + # NEW: cache hidden states for reference unet + if cache_hidden_states is not None: + cache_hidden_states[self.name] = hidden_states.clone() + + # NEW: whether to use multi-view attention and image cross-attention + use_mv = self.use_mv and use_mv + use_ref = self.use_ref and use_ref + + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view( + batch_size, channel, height * width + ).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape + if encoder_hidden_states is None + else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask( + attention_mask, sequence_length, batch_size + ) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view( + batch_size, attn.heads, -1, attention_mask.shape[-1] + ) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose( + 1, 2 + ) + + query = attn.to_q(hidden_states) + + # NEW: for decoupled multi-view attention + if use_mv: + query_mv = self.to_q_mv(hidden_states) + + # NEW: for decoupled reference cross attention + if use_ref: + query_ref = self.to_q_ref(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states( + encoder_hidden_states + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape( + batch_size, -1, attn.heads * head_dim + ) + hidden_states = hidden_states.to(query.dtype) + + ####### Decoupled multi-view self-attention ######## + if use_mv: + key_mv = self.to_k_mv(encoder_hidden_states) + value_mv = self.to_v_mv(encoder_hidden_states) + + query_mv = query_mv.view(batch_size, -1, attn.heads, head_dim) + key_mv = key_mv.view(batch_size, -1, attn.heads, head_dim) + value_mv = value_mv.view(batch_size, -1, attn.heads, head_dim) + + height = width = math.isqrt(sequence_length) + + # row self-attention + query_mv = rearrange( + query_mv, + "(b nv) (ih iw) h c -> (b nv ih) iw h c", + nv=self.num_views, + ih=height, + iw=width, + ).transpose(1, 2) + key_mv = rearrange( + key_mv, + "(b nv) (ih iw) h c -> b ih (nv iw) h c", + nv=self.num_views, + ih=height, + iw=width, + ) + key_mv = ( + key_mv.repeat_interleave(self.num_views, dim=0) + .view(batch_size * height, -1, attn.heads, head_dim) + .transpose(1, 2) + ) + value_mv = rearrange( + value_mv, + "(b nv) (ih iw) h c -> b ih (nv iw) h c", + nv=self.num_views, + ih=height, + iw=width, + ) + value_mv = 
( + value_mv.repeat_interleave(self.num_views, dim=0) + .view(batch_size * height, -1, attn.heads, head_dim) + .transpose(1, 2) + ) + + hidden_states_mv = F.scaled_dot_product_attention( + query_mv, + key_mv, + value_mv, + dropout_p=0.0, + is_causal=False, + ) + hidden_states_mv = rearrange( + hidden_states_mv, + "(b nv ih) h iw c -> (b nv) (ih iw) (h c)", + nv=self.num_views, + ih=height, + ) + hidden_states_mv = hidden_states_mv.to(query.dtype) + + # linear proj + hidden_states_mv = self.to_out_mv[0](hidden_states_mv) + # dropout + hidden_states_mv = self.to_out_mv[1](hidden_states_mv) + + if use_ref: + reference_hidden_states = ref_hidden_states[self.name] + + key_ref = self.to_k_ref(reference_hidden_states) + value_ref = self.to_v_ref(reference_hidden_states) + + query_ref = query_ref.view(batch_size, -1, attn.heads, head_dim).transpose( + 1, 2 + ) + key_ref = key_ref.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value_ref = value_ref.view(batch_size, -1, attn.heads, head_dim).transpose( + 1, 2 + ) + + hidden_states_ref = F.scaled_dot_product_attention( + query_ref, key_ref, value_ref, dropout_p=0.0, is_causal=False + ) + + hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape( + batch_size, -1, attn.heads * head_dim + ) + hidden_states_ref = hidden_states_ref.to(query.dtype) + + # linear proj + hidden_states_ref = self.to_out_ref[0](hidden_states_ref) + # dropout + hidden_states_ref = self.to_out_ref[1](hidden_states_ref) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if use_mv: + hidden_states = hidden_states + hidden_states_mv * mv_scale + + if use_ref: + hidden_states = hidden_states + hidden_states_ref * ref_scale + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape( + batch_size, channel, height, width + ) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + def set_num_views(self, num_views: int) -> None: + self.num_views = num_views diff --git a/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sd.cpython-312.pyc b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea01762b0bfe29a6283d7213c5bc7b4ed26683fe Binary files /dev/null and b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sd.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sdxl.cpython-312.pyc b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sdxl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c38abec3e7bbe4082388e2d93c853d4745d6ad21 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sdxl.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sd.cpython-312.pyc b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sd.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9272125081cea9b8612b3b577add577f4aa2d688 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sd.cpython-312.pyc differ diff --git 
a/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sdxl.cpython-312.pyc b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sdxl.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a9e4e4b9584c745914a5b5f09f22dd86fe5aa6c Binary files /dev/null and b/comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sdxl.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py b/comfyui-mvadapter/mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py new file mode 100644 index 0000000000000000000000000000000000000000..ba5b09378565b359e20d8478bbcb07ab7dceef20 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py @@ -0,0 +1,903 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +import torch.nn as nn +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.models import ( + AutoencoderKL, + ImageProjection, + T2IAdapter, + UNet2DConditionModel, +) +from diffusers.pipelines.stable_diffusion_xl.pipeline_output import ( + StableDiffusionXLPipelineOutput, +) +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import ( + StableDiffusionXLPipeline, + rescale_noise_cfg, + retrieve_timesteps, +) +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import deprecate, logging +from diffusers.utils.torch_utils import randn_tensor +from einops import rearrange +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ..loaders import CustomAdapterMixin +from ..models.attention_processor import ( + DecoupledMVRowSelfAttnProcessor2_0, + set_unet_2d_condition_attn_processor, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def retrieve_latents( + encoder_output: torch.Tensor, + generator: Optional[torch.Generator] = None, + sample_mode: str = "sample", +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class MVAdapterI2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = 
None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + force_zeros_for_empty_prompt=force_zeros_for_empty_prompt, + add_watermarker=add_watermarker, + ) + + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, + do_convert_rgb=True, + do_normalize=False, + ) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.prepare_latents + def prepare_image_latents( + self, + image, + timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + generator=None, + add_noise=True, + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + latents_mean = latents_std = None + if ( + hasattr(self.vae.config, "latents_mean") + and self.vae.config.latents_mean is not None + ): + latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1) + if ( + hasattr(self.vae.config, "latents_std") + and self.vae.config.latents_std is not None + ): + latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + if image.shape[0] < batch_size and batch_size % image.shape[0] == 0: + image = torch.cat([image] * (batch_size // image.shape[0]), dim=0) + elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} " + ) + + init_latents = [ + retrieve_latents( + self.vae.encode(image[i : i + 1]), generator=generator[i] + ) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents( + self.vae.encode(image), generator=generator + ) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + if latents_mean is not None and latents_std is not None: + latents_mean = latents_mean.to(device=device, dtype=dtype) + latents_std = latents_std.to(device=device, dtype=dtype) + init_latents = ( + (init_latents - latents_mean) + * self.vae.config.scaling_factor + / latents_std + ) + else: + init_latents = self.vae.config.scaling_factor * init_latents + + if ( + batch_size > init_latents.shape[0] + and batch_size % init_latents.shape[0] == 0 + ): + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat( + [init_latents] * additional_image_per_prompt, dim=0 + ) + elif ( + batch_size > init_latents.shape[0] + and batch_size % init_latents.shape[0] != 0 + ): + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + num_empty_images=0, # for concat in batch like ImageDream + ): + """ + Accepts either: + - regular RGB-like images -> preprocess via VaeImageProcessor, or + - native 6-channel Plücker tensors (B,6,H,W) or (6,H,W) -> pass through without normalization + """ + assert hasattr( + self, "control_image_processor" + ), "control_image_processor is not initialized" + + # Fast path: native 6-channel tensor + if isinstance(image, torch.Tensor): + if image.dim() == 3 and image.shape[0] == 6: + image = image.unsqueeze(0) # (1,6,H,W) + if image.dim() == 4 and image.shape[1] == 6: + ctrl = image.to(device=device, dtype=torch.float32) + if num_empty_images > 0: + ctrl = torch.cat([ctrl, torch.zeros_like(ctrl[:num_empty_images])], dim=0) + + image_batch_size = ctrl.shape[0] + repeat_by = batch_size if image_batch_size == 1 else num_images_per_prompt # always 1 per control + ctrl = ctrl.repeat_interleave(repeat_by, dim=0) + ctrl = ctrl.to(device=device, dtype=dtype) + + if do_classifier_free_guidance: + ctrl = torch.cat([ctrl] * 2) + + return ctrl + + # Fallback: treat as regular image(s) + image = self.control_image_processor.preprocess( + image, height=height, width=width + ).to(dtype=torch.float32) + + if num_empty_images > 0: + image = torch.cat( + [image, torch.zeros_like(image[:num_empty_images])], dim=0 + ) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the 
same as prompt batch size + repeat_by = num_images_per_prompt # always 1 for control image + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance: + image = torch.cat([image] * 2) + + return image + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + # --- Task 1: NEW (reference-only prompt used only for the ref cache pass) --- + reference_prompt: Optional[Union[str, List[str]]] = None, + reference_prompt_2: Optional[Union[str, List[str]]] = None, + # ----------------------------------------------------------------------------- + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + # NEW + mv_scale: float = 1.0, + # Camera or geometry condition + control_image: Optional[PipelineImageInput] = None, + control_conditioning_scale: Optional[float] = 1.0, + control_conditioning_factor: float = 1.0, + # Image condition + reference_image: Optional[PipelineImageInput] = None, + reference_conditioning_scale: Optional[float] = 1.0, + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The main prompt(s) for generation. + prompt_2 (`str` or `List[str]`, *optional*): + Prompt(s) for the second text encoder. Falls back to `prompt` if None. + reference_prompt (`str` or `List[str]`, *optional*): + Prompt used **only** during the one-shot reference UNet pass that caches identity features + from `reference_image`. If None or empty, falls back to the positive branch of the main prompt + (original behavior). + reference_prompt_2 (`str` or `List[str]`, *optional*): + Second-encoder counterpart for `reference_prompt`. + ... (other arguments unchanged) ... 
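+ mv_scale (`float`, *optional*, defaults to 1.0):
+ Scale applied to the output of the decoupled multi-view (row-wise) self-attention.
+ control_image (`PipelineImageInput`, *optional*):
+ Camera/geometry conditioning fed to the T2I-Adapter condition encoder; a native
+ 6-channel Plücker tensor of shape (B, 6, H, W) is passed through without normalization.
+ control_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Multiplier applied to the adapter residuals.
+ control_conditioning_factor (`float`, *optional*, defaults to 1.0):
+ Fraction of the denoising steps during which the adapter residuals are injected.
+ reference_image (`PipelineImageInput`):
+ Reference image (required); it is encoded to latents and run once through the UNet
+ to cache the hidden states used by the decoupled image cross-attention.
+ reference_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale applied to the output of the decoupled image cross-attention.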
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) + if self.cross_attention_kwargs is not None + else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps + ) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat( + [negative_pooled_prompt_embeds, add_text_embeds], dim=0 + ) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat( + batch_size * num_images_per_prompt, 1 + ) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # Preprocess reference image (required) + reference_image = self.image_processor.preprocess(reference_image) + reference_latents = self.prepare_image_latents( + reference_image, + timesteps[:1].repeat(batch_size * num_images_per_prompt), # no use + batch_size, + 1, + prompt_embeds.dtype, + device, + generator, + add_noise=False, + ) + + with torch.no_grad(): + ref_timesteps = torch.zeros_like(timesteps[0]) + ref_hidden_states = {} + + # reference-only prompt support (Task 1) + def _first_or_none(x): + if x is None: + return None + if isinstance(x, list) and len(x) > 0: + return x[0] + return x + + rp = _first_or_none(reference_prompt) + rp2 = _first_or_none(reference_prompt_2) + have_ref_prompt = (rp is not None and str(rp).strip() != "") or ( + rp2 is not None and str(rp2).strip() != "" + ) + + if have_ref_prompt: + ref_prompt_embeds, _, ref_pooled_prompt_embeds, _ = self.encode_prompt( + prompt=rp or prompt, + prompt_2=rp2 or prompt_2, + device=device, + num_images_per_prompt=1, + do_classifier_free_guidance=False, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + lora_scale=( + self.cross_attention_kwargs.get("scale", None) + if self.cross_attention_kwargs is not None + else None + ), + clip_skip=self.clip_skip, + ) + else: + if self.do_classifier_free_guidance: + ref_prompt_embeds = prompt_embeds[-1:].clone() + ref_pooled_prompt_embeds = add_text_embeds[-1:].clone() + else: + ref_prompt_embeds = prompt_embeds[:1].clone() + ref_pooled_prompt_embeds = add_text_embeds[:1].clone() + + self.unet( + reference_latents, + ref_timesteps, + encoder_hidden_states=ref_prompt_embeds, + added_cond_kwargs={ + "text_embeds": ref_pooled_prompt_embeds, + "time_ids": add_time_ids[-1:], + }, + cross_attention_kwargs={ + "cache_hidden_states": ref_hidden_states, + "use_mv": False, + "use_ref": False, + }, + return_dict=False, + ) + ref_hidden_states = { + k: v.repeat_interleave(num_images_per_prompt, dim=0) + for k, v in 
ref_hidden_states.items() + } + if self.do_classifier_free_guidance: + ref_hidden_states = { + k: torch.cat([torch.zeros_like(v), v], dim=0) + for k, v in ref_hidden_states.items() + } + + cross_attention_kwargs = { + "mv_scale": mv_scale, + "ref_hidden_states": ref_hidden_states, + "ref_scale": reference_conditioning_scale, + **(self.cross_attention_kwargs or {}), + } + + # ------------- control image (Task 2 supports 6ch pass-through) ------------- + control_image_feature = self.prepare_control_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=1, # NOTE: always 1 for control images + device=device, + dtype=latents.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + ).to(device=device, dtype=latents.dtype) + + adapter_state = self.cond_encoder(control_image_feature) + for i, state in enumerate(adapter_state): + adapter_state[i] = state * control_conditioning_scale + # --------------------------------------------------------------------------- + + # 8. Denoising loop + num_warmup_steps = max( + len(timesteps) - num_inference_steps * self.scheduler.order, 0 + ) + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len( + list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)) + ) + timesteps = timesteps[:num_inference_steps] + + # 9. Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( + batch_size * num_images_per_prompt + ) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * 2) + if self.do_classifier_free_guidance + else latents + ) + + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) + + added_cond_kwargs = { + "text_embeds": add_text_embeds, + "time_ids": add_time_ids, + } + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + if i < int(num_inference_steps * control_conditioning_factor): + down_intrablock_additional_residuals = [ + state.clone() for state in adapter_state + ] + else: + down_intrablock_additional_residuals = None + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=cross_attention_kwargs, + down_intrablock_additional_residuals=down_intrablock_additional_residuals, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + if 
self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + noise_pred = rescale_noise_cfg( + noise_pred, + noise_pred_text, + guidance_rescale=self.guidance_rescale, + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs, return_dict=False + )[0] + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop( + "negative_prompt_embeds", negative_prompt_embeds + ) + add_text_embeds = callback_outputs.pop( + "add_text_embeds", add_text_embeds + ) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop( + "negative_add_time_ids", negative_add_time_ids + ) + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = ( + self.vae.dtype == torch.float16 and self.vae.config.force_upcast + ) + + if needs_upcasting: + self.upcast_vae() + latents = latents.to( + next(iter(self.vae.post_quant_conv.parameters())).dtype + ) + elif latents.dtype != self.vae.dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + self.vae = self.vae.to(latents.dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = ( + hasattr(self.vae.config, "latents_mean") + and self.vae.config.latents_mean is not None + ) + has_latents_std = ( + hasattr(self.vae.config, "latents_std") + and self.vae.config.latents_std is not None + ) + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std) + .view(1, 4, 1, 1) + .to(latents.device, latents.dtype) + ) + latents = ( + latents * latents_std / self.vae.config.scaling_factor + + latents_mean + ) + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) + + ### NEW: adapters ### + def _init_custom_adapter( + self, + # Multi-view adapter + num_views: int = 1, + self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0, + # Condition encoder + cond_in_channels: int = 6, + # For training + copy_attn_weights: bool = True, + zero_init_module_keys: List[str] = [], + ): + # Condition encoder + self.cond_encoder = T2IAdapter( + in_channels=cond_in_channels, + channels=(320, 640, 1280, 1280), + num_res_blocks=2, + downscale_factor=16, + adapter_type="full_adapter_xl", + ) + + # set custom attn processor for multi-view attention and image cross-attention + self.unet: UNet2DConditionModel + set_unet_2d_condition_attn_processor( + self.unet, + set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor( + query_dim=hs, + inner_dim=hs, + num_views=num_views, + name=name, + use_mv=True, + use_ref=True, + ), + set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor( + query_dim=hs, + inner_dim=hs, + num_views=num_views, + name=name, + use_mv=False, + use_ref=False, + ), + ) + + # copy decoupled attention weights from original unet + if copy_attn_weights: + state_dict = self.unet.state_dict() + for key in state_dict.keys(): + if "_mv" in key: + compatible_key = key.replace("_mv", "").replace("processor.", "") + elif "_ref" in key: + compatible_key = key.replace("_ref", "").replace("processor.", "") + else: + compatible_key = key + + is_zero_init_key = any([k in key for k in zero_init_module_keys]) + if is_zero_init_key: + state_dict[key] = torch.zeros_like(state_dict[compatible_key]) + else: + state_dict[key] = state_dict[compatible_key].clone() + self.unet.load_state_dict(state_dict) + + def _load_custom_adapter(self, state_dict): + self.unet.load_state_dict(state_dict, strict=False) + self.cond_encoder.load_state_dict(state_dict, strict=False) + + def _save_custom_adapter( + self, + include_keys: Optional[List[str]] = None, + exclude_keys: Optional[List[str]] = None, + ): + def include_fn(k): + is_included = False + + if include_keys is not None: + is_included = is_included or any([key 
in k for key in include_keys]) + if exclude_keys is not None: + is_included = is_included and not any( + [key in k for key in exclude_keys] + ) + + return is_included + + state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)} + state_dict.update(self.cond_encoder.state_dict()) + + return state_dict diff --git a/comfyui-mvadapter/mvadapter/schedulers/ShiftSNRSchedulerKarras.py b/comfyui-mvadapter/mvadapter/schedulers/ShiftSNRSchedulerKarras.py new file mode 100644 index 0000000000000000000000000000000000000000..80f24b63d16f6275f16fbfbaf4ebafa2d9c8e2ac --- /dev/null +++ b/comfyui-mvadapter/mvadapter/schedulers/ShiftSNRSchedulerKarras.py @@ -0,0 +1,120 @@ +from typing import Any + +import torch + +from .scheduler_utils import SNR_to_betas, compute_snr + + +class ShiftSNRSchedulerKarras: + """ + Wraps a Diffusers scheduler to apply SNR shifting to its noise schedule and + rebuilds a DPMSolverMultistepScheduler that supports Karras sigmas. + + Usage: + new_sched = ShiftSNRSchedulerKarras.from_scheduler( + noise_scheduler=base_sched, + shift_mode="interpolated", # or "default" + shift_scale=8.0, + scheduler_class=DPMSolverMultistepScheduler, # usually this + ) + """ + + # Supported modes for how the SNR shift is applied + SHIFT_MODES = ["default", "interpolated"] + + def __init__( + self, + noise_scheduler: Any, + timesteps: Any, + shift_scale: float, + scheduler_class: Any, + ): + # original scheduler (used only as a reference/config source) + self.noise_scheduler = noise_scheduler + # tensor of timesteps to compute SNR/betas on + self.timesteps = timesteps + # scale by which to divide the SNR (e.g., 8.0) + self.shift_scale = shift_scale + # the scheduler class to construct for output (e.g., DPMSolverMultistepScheduler) + self.scheduler_class = scheduler_class + + def _get_shift_scheduler(self): + """ + Apply a uniform SNR shift: snr' = snr / shift_scale + Then convert to betas and rebuild the scheduler with Karras enabled. + """ + snr = compute_snr(self.timesteps, self.noise_scheduler) + shifted_betas = SNR_to_betas(snr / self.shift_scale) + + return self.scheduler_class.from_config( + self.noise_scheduler.config, + trained_betas=shifted_betas.numpy(), + # Enable Karras sigmas in the rebuilt scheduler + algorithm_type="dpmsolver++", + use_karras_sigmas=True, + ) + + def _get_interpolated_shift_scheduler(self): + """ + Interpolate SNR in log-space between the original and the shifted SNR + as timesteps progress. This tends to preserve early behavior and + gradually apply the shift later in the schedule. + """ + snr = compute_snr(self.timesteps, self.noise_scheduler) + shifted_snr = snr / self.shift_scale + + # Interpolate in log-space from original -> shifted across timesteps + weighting = self.timesteps.float() / ( + self.noise_scheduler.config.num_train_timesteps - 1 + ) + interpolated_snr = torch.exp( + torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting + ) + + shifted_betas = SNR_to_betas(interpolated_snr) + + return self.scheduler_class.from_config( + self.noise_scheduler.config, + trained_betas=shifted_betas.numpy(), + # Enable Karras sigmas in the rebuilt scheduler + algorithm_type="dpmsolver++", + use_karras_sigmas=True, + ) + + @classmethod + def from_scheduler( + cls, + noise_scheduler: Any, + shift_mode: str = "default", + timesteps: Any = None, + shift_scale: float = 1.0, + scheduler_class: Any = None, + ): + """ + Factory that returns a NEW scheduler instance with the shifted betas applied. 
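+ The shifted betas are obtained by converting the shifted SNR back to betas via
+ SNR_to_betas, then rebuilding the scheduler with scheduler_class.from_config using
+ those trained_betas, algorithm_type="dpmsolver++", and use_karras_sigmas=True.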
+ + Args: + noise_scheduler: the original Diffusers scheduler (used for config & base betas) + shift_mode: "default" or "interpolated" + timesteps: tensor of timesteps to evaluate SNR on; if None, uses full training range + shift_scale: divide SNR by this value (e.g., 8.0) + scheduler_class: class to construct for the output scheduler (defaults to original class) + """ + if timesteps is None: + timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps) + if scheduler_class is None: + scheduler_class = noise_scheduler.__class__ + + wrapper = cls( + noise_scheduler=noise_scheduler, + timesteps=timesteps, + shift_scale=shift_scale, + scheduler_class=scheduler_class, + ) + + if shift_mode == "default": + return wrapper._get_shift_scheduler() + elif shift_mode == "interpolated": + return wrapper._get_interpolated_shift_scheduler() + else: + raise ValueError(f"Unknown shift_mode: {shift_mode}") diff --git a/comfyui-mvadapter/mvadapter/schedulers/__pycache__/ShiftSNRSchedulerKarras.cpython-312.pyc b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/ShiftSNRSchedulerKarras.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5108b54dc5cbc3f63ad8418e2ced7bd02c408b28 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/ShiftSNRSchedulerKarras.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduler_utils.cpython-312.pyc b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduler_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2a0c5f73a020142ca5487399a3f46522d7ee652 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduler_utils.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduling_shift_snr.cpython-312.pyc b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduling_shift_snr.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8532dd18f877b83a263f2a50a9c7387a6e664f10 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduling_shift_snr.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/schedulers/scheduler_utils.py b/comfyui-mvadapter/mvadapter/schedulers/scheduler_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eb65c1905a8b43814cdde721315c719b26fbdcf7 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/schedulers/scheduler_utils.py @@ -0,0 +1,70 @@ +import torch + + +def get_sigmas(noise_scheduler, timesteps, n_dim=4, dtype=torch.float32, device=None): + sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype) + schedule_timesteps = noise_scheduler.timesteps.to(device) + timesteps = timesteps.to(device) + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < n_dim: + sigma = sigma.unsqueeze(-1) + return sigma + + +def SNR_to_betas(snr): + """ + Converts SNR to betas + """ + # alphas_cumprod = pass + # snr = (alpha / ) ** 2 + # alpha_t^2 / (1 - alpha_t^2) = snr + alpha_t = (snr / (1 + snr)) ** 0.5 + alphas_cumprod = alpha_t**2 + alphas = alphas_cumprod / torch.cat( + [torch.ones(1, device=snr.device), alphas_cumprod[:-1]] + ) + betas = 1 - alphas + return betas + + +def compute_snr(timesteps, noise_scheduler): + """ + Computes SNR as per Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5 + """ + alphas_cumprod = 
noise_scheduler.alphas_cumprod + sqrt_alphas_cumprod = alphas_cumprod**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 + + # Expand the tensors. + # Adapted from Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5 + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[ + timesteps + ].float() + while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] + alpha = sqrt_alphas_cumprod.expand(timesteps.shape) + + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to( + device=timesteps.device + )[timesteps].float() + while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] + sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) + + # Compute SNR. + snr = (alpha / sigma) ** 2 + return snr + + +def compute_alpha(timesteps, noise_scheduler): + alphas_cumprod = noise_scheduler.alphas_cumprod + sqrt_alphas_cumprod = alphas_cumprod**0.5 + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[ + timesteps + ].float() + while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] + alpha = sqrt_alphas_cumprod.expand(timesteps.shape) + + return alpha diff --git a/comfyui-mvadapter/mvadapter/schedulers/scheduling_shift_snr.py b/comfyui-mvadapter/mvadapter/schedulers/scheduling_shift_snr.py new file mode 100644 index 0000000000000000000000000000000000000000..4c1a384b60093e323b013b3a4fd1f07bcf07bc86 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/schedulers/scheduling_shift_snr.py @@ -0,0 +1,140 @@ +from typing import Any + +import torch + +from .scheduler_utils import SNR_to_betas, compute_snr + + +class ShiftSNRScheduler: + SHIFT_MODES = ["default", "interpolated"] + + def __init__( + self, + noise_scheduler: Any, + timesteps: Any, + shift_scale: float, + scheduler_class: Any, + ): + self.noise_scheduler = noise_scheduler + self.timesteps = timesteps + self.shift_scale = shift_scale + self.scheduler_class = scheduler_class + + def _get_shift_scheduler(self): + """ + Prepare scheduler for shifted betas. + + :return: A scheduler object configured with shifted betas + """ + snr = compute_snr(self.timesteps, self.noise_scheduler) + shifted_betas = SNR_to_betas(snr / self.shift_scale) + + return self.scheduler_class.from_config( + self.noise_scheduler.config, trained_betas=shifted_betas.numpy() + ) + + def _get_interpolated_shift_scheduler(self): + """ + Prepare scheduler for shifted betas and interpolate with the original betas in log space. 
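+ Concretely, with w_t = t / (T - 1), the interpolated SNR is
+ exp((1 - w_t) * log(snr_t) + w_t * log(snr_t / shift_scale)), so early timesteps keep
+ the original schedule while later timesteps approach the fully shifted one.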
+ + :return: A scheduler object configured with interpolated shifted betas + """ + snr = compute_snr(self.timesteps, self.noise_scheduler) + shifted_snr = snr / self.shift_scale + + weighting = self.timesteps.float() / ( + self.noise_scheduler.config.num_train_timesteps - 1 + ) + interpolated_snr = torch.exp( + torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting + ) + + shifted_betas = SNR_to_betas(interpolated_snr) + + return self.scheduler_class.from_config( + self.noise_scheduler.config, trained_betas=shifted_betas.numpy() + ) + + @classmethod + def from_scheduler( + cls, + noise_scheduler: Any, + shift_mode: str = "default", + timesteps: Any = None, + shift_scale: float = 1.0, + scheduler_class: Any = None, + ): + # Check input + if timesteps is None: + timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps) + if scheduler_class is None: + scheduler_class = noise_scheduler.__class__ + + # Create scheduler + shift_scheduler = cls( + noise_scheduler=noise_scheduler, + timesteps=timesteps, + shift_scale=shift_scale, + scheduler_class=scheduler_class, + ) + + if shift_mode == "default": + return shift_scheduler._get_shift_scheduler() + elif shift_mode == "interpolated": + return shift_scheduler._get_interpolated_shift_scheduler() + else: + raise ValueError(f"Unknown shift_mode: {shift_mode}") + + +if __name__ == "__main__": + """ + Compare the alpha values for different noise schedulers. + """ + import matplotlib.pyplot as plt + from diffusers import DDPMScheduler + + from .scheduler_utils import compute_alpha + + # Base + timesteps = torch.arange(0, 1000) + noise_scheduler_base = DDPMScheduler.from_pretrained( + "runwayml/stable-diffusion-v1-5", subfolder="scheduler" + ) + alpha = compute_alpha(timesteps, noise_scheduler_base) + plt.plot(timesteps.numpy(), alpha.numpy(), label="Base") + + # Kolors + num_train_timesteps_ = 1100 + timesteps_ = torch.arange(0, num_train_timesteps_) + noise_kwargs = {"beta_end": 0.014, "num_train_timesteps": num_train_timesteps_} + noise_scheduler_kolors = DDPMScheduler.from_config( + noise_scheduler_base.config, **noise_kwargs + ) + alpha = compute_alpha(timesteps_, noise_scheduler_kolors) + plt.plot(timesteps_.numpy(), alpha.numpy(), label="Kolors") + + # Shift betas + shift_scale = 8.0 + noise_scheduler_shift = ShiftSNRScheduler.from_scheduler( + noise_scheduler_base, shift_mode="default", shift_scale=shift_scale + ) + alpha = compute_alpha(timesteps, noise_scheduler_shift) + plt.plot(timesteps.numpy(), alpha.numpy(), label="Shift Noise (scale 8.0)") + + # Shift betas (interpolated) + noise_scheduler_inter = ShiftSNRScheduler.from_scheduler( + noise_scheduler_base, shift_mode="interpolated", shift_scale=shift_scale + ) + alpha = compute_alpha(timesteps, noise_scheduler_inter) + plt.plot(timesteps.numpy(), alpha.numpy(), label="Interpolated (scale 8.0)") + + # ZeroSNR + noise_scheduler = DDPMScheduler.from_config( + noise_scheduler_base.config, rescale_betas_zero_snr=True + ) + alpha = compute_alpha(timesteps, noise_scheduler) + plt.plot(timesteps.numpy(), alpha.numpy(), label="ZeroSNR") + + plt.legend() + plt.grid() + plt.savefig("check_alpha.png") diff --git a/comfyui-mvadapter/mvadapter/utils/__init__.py b/comfyui-mvadapter/mvadapter/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48eaeb33d3089d497681cf019137ca7144d7be48 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/utils/__init__.py @@ -0,0 +1,3 @@ +from .camera import get_camera, get_orthogonal_camera +from .geometry import 
get_plucker_embeds_from_cameras_ortho +from .saving import make_image_grid, tensor_to_image diff --git a/comfyui-mvadapter/mvadapter/utils/__pycache__/__init__.cpython-312.pyc b/comfyui-mvadapter/mvadapter/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcb072d81771acc77f049b6a07ce51af1cda772b Binary files /dev/null and b/comfyui-mvadapter/mvadapter/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/utils/__pycache__/camera.cpython-312.pyc b/comfyui-mvadapter/mvadapter/utils/__pycache__/camera.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bb2387ce475878292ac273ab61258b26bab9371 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/utils/__pycache__/camera.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/utils/__pycache__/geometry.cpython-312.pyc b/comfyui-mvadapter/mvadapter/utils/__pycache__/geometry.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a1fad739068350a0ebc1ca6dcab8ec0c14427b2 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/utils/__pycache__/geometry.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/utils/__pycache__/saving.cpython-312.pyc b/comfyui-mvadapter/mvadapter/utils/__pycache__/saving.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b92d6b585530d4265eb6b04796e20702135719 Binary files /dev/null and b/comfyui-mvadapter/mvadapter/utils/__pycache__/saving.cpython-312.pyc differ diff --git a/comfyui-mvadapter/mvadapter/utils/camera.py b/comfyui-mvadapter/mvadapter/utils/camera.py new file mode 100644 index 0000000000000000000000000000000000000000..8e5bd144909695de47841a340933f5a56390e71f --- /dev/null +++ b/comfyui-mvadapter/mvadapter/utils/camera.py @@ -0,0 +1,211 @@ +import math +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F +import trimesh +from PIL import Image +from torch import BoolTensor, FloatTensor + +LIST_TYPE = Union[list, np.ndarray, torch.Tensor] + + +def list_to_pt( + x: LIST_TYPE, dtype: Optional[torch.dtype] = None, device: Optional[str] = None +) -> torch.Tensor: + if isinstance(x, list) or isinstance(x, np.ndarray): + return torch.tensor(x, dtype=dtype, device=device) + return x.to(dtype=dtype) + + +def get_c2w( + elevation_deg: LIST_TYPE, + distance: LIST_TYPE, + azimuth_deg: Optional[LIST_TYPE], + num_views: Optional[int] = 1, + device: Optional[str] = None, +) -> torch.FloatTensor: + if azimuth_deg is None: + assert ( + num_views is not None + ), "num_views must be provided if azimuth_deg is None." 
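+ # Default azimuths: num_views cameras evenly spaced over [0, 360) degrees; the
+ # 360-degree endpoint is dropped because it coincides with 0 degrees.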
+ azimuth_deg = torch.linspace( + 0, 360, num_views + 1, dtype=torch.float32, device=device + )[:-1] + else: + num_views = len(azimuth_deg) + azimuth_deg = list_to_pt(azimuth_deg, dtype=torch.float32, device=device) + elevation_deg = list_to_pt(elevation_deg, dtype=torch.float32, device=device) + camera_distances = list_to_pt(distance, dtype=torch.float32, device=device) + elevation = elevation_deg * math.pi / 180 + azimuth = azimuth_deg * math.pi / 180 + camera_positions = torch.stack( + [ + camera_distances * torch.cos(elevation) * torch.cos(azimuth), + camera_distances * torch.cos(elevation) * torch.sin(azimuth), + camera_distances * torch.sin(elevation), + ], + dim=-1, + ) + center = torch.zeros_like(camera_positions) + up = torch.tensor([0, 0, 1], dtype=torch.float32, device=device)[None, :].repeat( + num_views, 1 + ) + lookat = F.normalize(center - camera_positions, dim=-1) + right = F.normalize(torch.cross(lookat, up, dim=-1), dim=-1) + up = F.normalize(torch.cross(right, lookat, dim=-1), dim=-1) + c2w3x4 = torch.cat( + [torch.stack([right, up, -lookat], dim=-1), camera_positions[:, :, None]], + dim=-1, + ) + c2w = torch.cat([c2w3x4, torch.zeros_like(c2w3x4[:, :1])], dim=1) + c2w[:, 3, 3] = 1.0 + return c2w + + +def get_projection_matrix( + fovy_deg: LIST_TYPE, + aspect_wh: float = 1.0, + near: float = 0.1, + far: float = 100.0, + device: Optional[str] = None, +) -> torch.FloatTensor: + fovy_deg = list_to_pt(fovy_deg, dtype=torch.float32, device=device) + batch_size = fovy_deg.shape[0] + fovy = fovy_deg * math.pi / 180 + tan_half_fovy = torch.tan(fovy / 2) + projection_matrix = torch.zeros( + batch_size, 4, 4, dtype=torch.float32, device=device + ) + projection_matrix[:, 0, 0] = 1 / (aspect_wh * tan_half_fovy) + projection_matrix[:, 1, 1] = -1 / tan_half_fovy + projection_matrix[:, 2, 2] = -(far + near) / (far - near) + projection_matrix[:, 2, 3] = -2 * far * near / (far - near) + projection_matrix[:, 3, 2] = -1 + return projection_matrix + + +def get_orthogonal_projection_matrix( + batch_size: int, + left: float, + right: float, + bottom: float, + top: float, + near: float = 0.1, + far: float = 100.0, + device: Optional[str] = None, +) -> torch.FloatTensor: + projection_matrix = torch.zeros( + batch_size, 4, 4, dtype=torch.float32, device=device + ) + projection_matrix[:, 0, 0] = 2 / (right - left) + projection_matrix[:, 1, 1] = -2 / (top - bottom) + projection_matrix[:, 2, 2] = -2 / (far - near) + projection_matrix[:, 0, 3] = -(right + left) / (right - left) + projection_matrix[:, 1, 3] = -(top + bottom) / (top - bottom) + projection_matrix[:, 2, 3] = -(far + near) / (far - near) + projection_matrix[:, 3, 3] = 1 + return projection_matrix + + +@dataclass +class Camera: + c2w: Optional[torch.FloatTensor] + w2c: torch.FloatTensor + proj_mtx: torch.FloatTensor + mvp_mtx: torch.FloatTensor + cam_pos: Optional[torch.FloatTensor] + + def __getitem__(self, index): + if isinstance(index, int): + sl = slice(index, index + 1) + elif isinstance(index, slice): + sl = index + else: + raise NotImplementedError + + return Camera( + c2w=self.c2w[sl] if self.c2w is not None else None, + w2c=self.w2c[sl], + proj_mtx=self.proj_mtx[sl], + mvp_mtx=self.mvp_mtx[sl], + cam_pos=self.cam_pos[sl] if self.cam_pos is not None else None, + ) + + def to(self, device: Optional[str] = None): + if self.c2w is not None: + self.c2w = self.c2w.to(device) + self.w2c = self.w2c.to(device) + self.proj_mtx = self.proj_mtx.to(device) + self.mvp_mtx = self.mvp_mtx.to(device) + if self.cam_pos is not None: + self.cam_pos = 
self.cam_pos.to(device) + + def __len__(self): + return self.c2w.shape[0] + + +def get_camera( + elevation_deg: Optional[LIST_TYPE] = None, + distance: Optional[LIST_TYPE] = None, + fovy_deg: Optional[LIST_TYPE] = None, + azimuth_deg: Optional[LIST_TYPE] = None, + num_views: Optional[int] = 1, + c2w: Optional[torch.FloatTensor] = None, + w2c: Optional[torch.FloatTensor] = None, + proj_mtx: Optional[torch.FloatTensor] = None, + aspect_wh: float = 1.0, + near: float = 0.1, + far: float = 100.0, + device: Optional[str] = None, +): + if w2c is None: + if c2w is None: + c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device) + camera_positions = c2w[:, :3, 3] + w2c = torch.linalg.inv(c2w) + else: + camera_positions = None + c2w = None + if proj_mtx is None: + proj_mtx = get_projection_matrix( + fovy_deg, aspect_wh=aspect_wh, near=near, far=far, device=device + ) + mvp_mtx = proj_mtx @ w2c + return Camera( + c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions + ) + + +def get_orthogonal_camera( + elevation_deg: LIST_TYPE, + distance: LIST_TYPE, + left: float, + right: float, + bottom: float, + top: float, + azimuth_deg: Optional[LIST_TYPE] = None, + num_views: Optional[int] = 1, + near: float = 0.1, + far: float = 100.0, + device: Optional[str] = None, +): + c2w = get_c2w(elevation_deg, distance, azimuth_deg, num_views, device) + camera_positions = c2w[:, :3, 3] + w2c = torch.linalg.inv(c2w) + proj_mtx = get_orthogonal_projection_matrix( + batch_size=c2w.shape[0], + left=left, + right=right, + bottom=bottom, + top=top, + near=near, + far=far, + device=device, + ) + mvp_mtx = proj_mtx @ w2c + return Camera( + c2w=c2w, w2c=w2c, proj_mtx=proj_mtx, mvp_mtx=mvp_mtx, cam_pos=camera_positions + ) diff --git a/comfyui-mvadapter/mvadapter/utils/geometry.py b/comfyui-mvadapter/mvadapter/utils/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..4e23c796a9acaabc485a4f193f0e93d31d048c30 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/utils/geometry.py @@ -0,0 +1,253 @@ +from typing import List, Optional, Tuple + +import numpy as np +import torch +from torch.nn import functional as F + + +def get_position_map_from_depth(depth, mask, intrinsics, extrinsics, image_wh=None): + """Compute the position map from the depth map and the camera parameters for a batch of views. + + Args: + depth (torch.Tensor): The depth maps with the shape (B, H, W, 1). + mask (torch.Tensor): The masks with the shape (B, H, W, 1). + intrinsics (torch.Tensor): The camera intrinsics matrices with the shape (B, 3, 3). + extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4). + image_wh (Tuple[int, int]): The image width and height. + + Returns: + torch.Tensor: The position maps with the shape (B, H, W, 3). 
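+
+    Example (illustrative sketch; random tensors stand in for real depth/mask data):
+        depth = torch.rand(2, 256, 256, 1)
+        mask = (depth > 0.05).float()
+        intrinsics = torch.tensor(
+            [[200.0, 0.0, 128.0], [0.0, 200.0, 128.0], [0.0, 0.0, 1.0]]
+        ).expand(2, 3, 3)
+        # Note: extrinsics are applied directly to camera-frame points,
+        # i.e. they are expected to map camera to world coordinates.
+        extrinsics = torch.eye(4).expand(2, 4, 4)
+        position_map = get_position_map_from_depth(
+            depth, mask, intrinsics, extrinsics
+        )  # (2, 256, 256, 3)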
+ """ + if image_wh is None: + image_wh = depth.shape[2], depth.shape[1] + + B, H, W, _ = depth.shape + depth = depth.squeeze(-1) + + u_coord, v_coord = torch.meshgrid( + torch.arange(image_wh[0]), torch.arange(image_wh[1]), indexing="xy" + ) + u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1) + v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1) + + # Compute the position map by back-projecting depth pixels to 3D space + x = ( + (u_coord - intrinsics[:, 0, 2].unsqueeze(-1).unsqueeze(-1)) + * depth + / intrinsics[:, 0, 0].unsqueeze(-1).unsqueeze(-1) + ) + y = ( + (v_coord - intrinsics[:, 1, 2].unsqueeze(-1).unsqueeze(-1)) + * depth + / intrinsics[:, 1, 1].unsqueeze(-1).unsqueeze(-1) + ) + z = depth + + # Concatenate to form the 3D coordinates in the camera frame + camera_coords = torch.stack([x, y, z], dim=-1) + + # Apply the extrinsic matrix to get coordinates in the world frame + coords_homogeneous = torch.nn.functional.pad( + camera_coords, (0, 1), "constant", 1.0 + ) # Add a homogeneous coordinate + world_coords = torch.matmul( + coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2) + ).view(B, H, W, 4) + + # Apply the mask to the position map + position_map = world_coords[..., :3] * mask + + return position_map + + +def get_position_map_from_depth_ortho( + depth, mask, extrinsics, ortho_scale, image_wh=None +): + """Compute the position map from the depth map and the camera parameters for a batch of views + using orthographic projection with a given ortho_scale. + + Args: + depth (torch.Tensor): The depth maps with the shape (B, H, W, 1). + mask (torch.Tensor): The masks with the shape (B, H, W, 1). + extrinsics (torch.Tensor): The camera extrinsics matrices with the shape (B, 4, 4). + ortho_scale (torch.Tensor): The scaling factor for the orthographic projection with the shape (B, 1, 1, 1). + image_wh (Tuple[int, int]): Optional. The image width and height. + + Returns: + torch.Tensor: The position maps with the shape (B, H, W, 3). 
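+
+    The x and y coordinates are normalized image-plane coordinates divided by
+    ortho_scale, i.e. they span roughly [-0.5 / ortho_scale, 0.5 / ortho_scale].
+
+    Example (illustrative sketch; a single shared scale is used for simplicity):
+        depth = torch.rand(2, 256, 256, 1)
+        mask = (depth > 0.05).float()
+        extrinsics = torch.eye(4).expand(2, 4, 4)
+        position_map = get_position_map_from_depth_ortho(
+            depth, mask, extrinsics, ortho_scale=1.1
+        )  # (2, 256, 256, 3)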
+ """ + if image_wh is None: + image_wh = depth.shape[2], depth.shape[1] + + B, H, W, _ = depth.shape + depth = depth.squeeze(-1) + + # Generating grid of coordinates in the image space + u_coord, v_coord = torch.meshgrid( + torch.arange(0, image_wh[0]), torch.arange(0, image_wh[1]), indexing="xy" + ) + u_coord = u_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1) + v_coord = v_coord.type_as(depth).unsqueeze(0).expand(B, -1, -1) + + # Compute the position map using orthographic projection with ortho_scale + x = (u_coord - image_wh[0] / 2) / ortho_scale / image_wh[0] + y = (v_coord - image_wh[1] / 2) / ortho_scale / image_wh[1] + z = depth + + # Concatenate to form the 3D coordinates in the camera frame + camera_coords = torch.stack([x, y, z], dim=-1) + + # Apply the extrinsic matrix to get coordinates in the world frame + coords_homogeneous = torch.nn.functional.pad( + camera_coords, (0, 1), "constant", 1.0 + ) # Add a homogeneous coordinate + world_coords = torch.matmul( + coords_homogeneous.view(B, -1, 4), extrinsics.transpose(1, 2) + ).view(B, H, W, 4) + + # Apply the mask to the position map + position_map = world_coords[..., :3] * mask + + return position_map + + +def get_opencv_from_blender(matrix_world, fov=None, image_size=None): + # convert matrix_world to opencv format extrinsics + opencv_world_to_cam = matrix_world.inverse() + opencv_world_to_cam[1, :] *= -1 + opencv_world_to_cam[2, :] *= -1 + R, T = opencv_world_to_cam[:3, :3], opencv_world_to_cam[:3, 3] + + if fov is None: # orthographic camera + return R, T + + R, T = R.unsqueeze(0), T.unsqueeze(0) + # convert fov to opencv format intrinsics + focal = 1 / np.tan(fov / 2) + intrinsics = np.diag(np.array([focal, focal, 1])).astype(np.float32) + opencv_cam_matrix = ( + torch.from_numpy(intrinsics).unsqueeze(0).float().to(matrix_world.device) + ) + opencv_cam_matrix[:, :2, -1] += torch.tensor([image_size / 2, image_size / 2]).to( + matrix_world.device + ) + opencv_cam_matrix[:, [0, 1], [0, 1]] *= image_size / 2 + + return R, T, opencv_cam_matrix + + +def get_ray_directions( + H: int, + W: int, + focal: float, + principal: Optional[Tuple[float, float]] = None, + use_pixel_centers: bool = True, +) -> torch.Tensor: + """ + Get ray directions for all pixels in camera coordinate. 
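+
+    Directions follow a right-handed, y-up convention with the camera looking along -z,
+    and are returned normalized to unit length.
+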
+ Args: + H, W, focal, principal, use_pixel_centers: image height, width, focal length, principal point and whether use pixel centers + Outputs: + directions: (H, W, 3), the direction of the rays in camera coordinate + """ + pixel_center = 0.5 if use_pixel_centers else 0 + cx, cy = W / 2, H / 2 if principal is None else principal + i, j = torch.meshgrid( + torch.arange(W, dtype=torch.float32) + pixel_center, + torch.arange(H, dtype=torch.float32) + pixel_center, + indexing="xy", + ) + directions = torch.stack( + [(i - cx) / focal, -(j - cy) / focal, -torch.ones_like(i)], -1 + ) + return F.normalize(directions, dim=-1) + + +def get_rays( + directions: torch.Tensor, c2w: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Get ray origins and directions from camera coordinates to world coordinates + Args: + directions: (H, W, 3) ray directions in camera coordinates + c2w: (4, 4) camera-to-world transformation matrix + Outputs: + rays_o, rays_d: (H, W, 3) ray origins and directions in world coordinates + """ + # Rotate ray directions from camera coordinate to the world coordinate + rays_d = directions @ c2w[:3, :3].T + rays_o = c2w[:3, 3].expand(rays_d.shape) + return rays_o, rays_d + + +def compute_plucker_embed( + c2w: torch.Tensor, image_width: int, image_height: int, focal: float +) -> torch.Tensor: + """ + Computes Plucker coordinates for a camera. + Args: + c2w: (4, 4) camera-to-world transformation matrix + image_width: Image width + image_height: Image height + focal: Focal length of the camera + Returns: + plucker: (6, H, W) Plucker embedding + """ + directions = get_ray_directions(image_height, image_width, focal) + rays_o, rays_d = get_rays(directions, c2w) + # Cross product to get Plucker coordinates + cross = torch.cross(rays_o, rays_d, dim=-1) + plucker = torch.cat((rays_d, cross), dim=-1) + return plucker.permute(2, 0, 1) + + +def get_plucker_embeds_from_cameras( + c2w: List[torch.Tensor], fov: List[float], image_size: int +) -> torch.Tensor: + """ + Given lists of camera transformations and fov, returns the batched plucker embeddings. + Args: + c2w: list of camera-to-world transformation matrices + fov: list of field of view values + image_size: size of the image + Returns: + plucker_embeds: (B, 6, H, W) batched plucker embeddings + """ + plucker_embeds = [] + for cam_matrix, cam_fov in zip(c2w, fov): + focal = 0.5 * image_size / np.tan(0.5 * cam_fov) + plucker = compute_plucker_embed(cam_matrix, image_size, image_size, focal) + plucker_embeds.append(plucker) + return torch.stack(plucker_embeds) + + +def get_plucker_embeds_from_cameras_ortho( + c2w: List[torch.Tensor], ortho_scale: List[float], image_size: int +): + """ + Given lists of camera transformations and fov, returns the batched plucker embeddings. 
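+    (Orthographic variant: the second argument is a list of per-view ortho_scale values
+    rather than fov; each embedding is constant across the image and is built from the
+    view direction and the normalized camera position.)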
+ + Parameters: + c2w: list of camera-to-world transformation matrices + fov: list of field of view values + image_size: size of the image + + Returns: + plucker_embeds: plucker embeddings (B, 6, H, W) + """ + plucker_embeds = [] + # compute pairwise mask and plucker embeddings + for cam_matrix, scale in zip(c2w, ortho_scale): + # blender to opencv to pytorch3d + R, T = get_opencv_from_blender(cam_matrix) + cam_pos = -R.T @ T + view_dir = R.T @ torch.tensor([0, 0, 1]).float().to(cam_matrix.device) + # normalize camera position + cam_pos = F.normalize(cam_pos, dim=0) + plucker = torch.concat([view_dir, cam_pos]) + plucker = plucker.unsqueeze(-1).unsqueeze(-1).repeat(1, image_size, image_size) + plucker_embeds.append(plucker) + + plucker_embeds = torch.stack(plucker_embeds) + + return plucker_embeds diff --git a/comfyui-mvadapter/mvadapter/utils/saving.py b/comfyui-mvadapter/mvadapter/utils/saving.py new file mode 100644 index 0000000000000000000000000000000000000000..664cf08efd728ea2b1cee0490e24e4b918f37814 --- /dev/null +++ b/comfyui-mvadapter/mvadapter/utils/saving.py @@ -0,0 +1,88 @@ +import math +from typing import List, Optional, Union + +import numpy as np +import torch +from PIL import Image + + +def tensor_to_image( + data: Union[Image.Image, torch.Tensor, np.ndarray], + batched: bool = False, + format: str = "HWC", +) -> Union[Image.Image, List[Image.Image]]: + if isinstance(data, Image.Image): + return data + if isinstance(data, torch.Tensor): + data = data.detach().cpu().numpy() + if data.dtype == np.float32 or data.dtype == np.float16: + data = (data * 255).astype(np.uint8) + elif data.dtype == np.bool_: + data = data.astype(np.uint8) * 255 + assert data.dtype == np.uint8 + if format == "CHW": + if batched and data.ndim == 4: + data = data.transpose((0, 2, 3, 1)) + elif not batched and data.ndim == 3: + data = data.transpose((1, 2, 0)) + + if batched: + return [Image.fromarray(d) for d in data] + return Image.fromarray(data) + + +def largest_factor_near_sqrt(n: int) -> int: + """ + Finds the largest factor of n that is closest to the square root of n. + + Args: + n (int): The integer for which to find the largest factor near its square root. + + Returns: + int: The largest factor of n that is closest to the square root of n. + """ + sqrt_n = int(math.sqrt(n)) # Get the integer part of the square root + + # First, check if the square root itself is a factor + if sqrt_n * sqrt_n == n: + return sqrt_n + + # Otherwise, find the largest factor by iterating from sqrt_n downwards + for i in range(sqrt_n, 0, -1): + if n % i == 0: + return i + + # If n is 1, return 1 + return 1 + + +def make_image_grid( + images: List[Image.Image], + rows: Optional[int] = None, + cols: Optional[int] = None, + resize: Optional[int] = None, +) -> Image.Image: + """ + Prepares a single grid of images. Useful for visualization purposes. 
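+
+    If only one of rows/cols is given, the other is inferred from len(images); if neither
+    is given, rows is set to the largest factor of len(images) near its square root.
+
+    Example (illustrative sketch):
+        views = [Image.new("RGB", (256, 256)) for _ in range(6)]
+        grid = make_image_grid(views, rows=2)  # 2 x 3 grid, 768 x 512 pixels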
+ """ + if rows is None and cols is not None: + assert len(images) % cols == 0 + rows = len(images) // cols + elif cols is None and rows is not None: + assert len(images) % rows == 0 + cols = len(images) // rows + elif rows is None and cols is None: + rows = largest_factor_near_sqrt(len(images)) + cols = len(images) // rows + + assert len(images) == rows * cols + + if resize is not None: + images = [img.resize((resize, resize)) for img in images] + + w, h = images[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(images): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid diff --git a/comfyui-mvadapter/nodes.py b/comfyui-mvadapter/nodes.py new file mode 100644 index 0000000000000000000000000000000000000000..48710c0f80fda445a308a5da916d581804272380 --- /dev/null +++ b/comfyui-mvadapter/nodes.py @@ -0,0 +1,273 @@ +# Adapted from https://github.com/Limitex/ComfyUI-Diffusers/blob/main/nodes.py + +import torch +from .utils import ( + SCHEDULERS, + vae_pt_to_vae_diffuser, + convert_images_to_tensors, + convert_tensors_to_images, + prepare_camera_embed, +) +from comfy.model_management import get_torch_device +import folder_paths + +# DPM++ variants +from diffusers import DPMSolverMultistepScheduler + +# Karras-enabled Shift SNR wrapper +from .mvadapter.schedulers.ShiftSNRSchedulerKarras import ShiftSNRSchedulerKarras + + +class LdmVaeLoader: + def __init__(self): + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "vae_name": (folder_paths.get_filename_list("vae"),), + "upcast_fp32": ("BOOLEAN", {"default": True}), + }, + } + + RETURN_TYPES = ("AUTOENCODER",) + FUNCTION = "create_pipeline" + CATEGORY = "MV-Adapter" + + def create_pipeline(self, vae_name, upcast_fp32): + vae = vae_pt_to_vae_diffuser( + folder_paths.get_full_path("vae", vae_name), force_upcast=upcast_fp32 + ).to(self.dtype) + return (vae,) + + +# DPM++ (Karras) + Shift-SNR — supports DPM++ 2M and DPM++ 2M SDE +class DiffusersMVSchedulerLoaderKarras: + def __init__(self): + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + + # kept for UI parity; not strictly needed when forcing DPM++ + "scheduler_name": (list(SCHEDULERS.keys()),), + + # Karras SNR-shift options + "shift_snr": ("BOOLEAN", {"default": True}), + "shift_mode": ( + list(ShiftSNRSchedulerKarras.SHIFT_MODES), + {"default": "interpolated"}, + ), + "shift_scale": ( + "FLOAT", + {"default": 8.0, "min": 0.0, "max": 50.0, "step": 1.0}, + ), + + # choose DPM++ flavor + "dpmpp_variant": ( + ["DPM++ 2M", "DPM++ 2M SDE"], + {"default": "DPM++ 2M SDE"}, + ), + }, + "optional": { + "solver_order": ("INT", {"default": 2, "min": 1, "max": 3}), + "use_karras_sigmas": ("BOOLEAN", {"default": True}), + }, + } + + RETURN_TYPES = ("SCHEDULER",) + FUNCTION = "load_scheduler" + CATEGORY = "MV-Adapter" + + def load_scheduler( + self, + pipeline, + scheduler_name, + shift_snr, + shift_mode, + shift_scale, + dpmpp_variant, + solver_order=2, + use_karras_sigmas=True, + ): + # Build from current pipeline scheduler config + base_cfg = dict(pipeline.scheduler.config) + algo = "sde-dpmsolver++" if "SDE" in dpmpp_variant else "dpmsolver++" + + base_sched = DPMSolverMultistepScheduler.from_config( + base_cfg, + algorithm_type=algo, + use_karras_sigmas=bool(use_karras_sigmas), + solver_order=int(solver_order), + torch_dtype=self.dtype, + ) + + if shift_snr: + scheduler = ShiftSNRSchedulerKarras.from_scheduler( + base_sched, + 
shift_mode=shift_mode, + shift_scale=shift_scale, + scheduler_class=DPMSolverMultistepScheduler, + ) + else: + scheduler = base_sched + + return (scheduler,) + + +class DiffusersMVSampler: + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "pipeline": ("PIPELINE",), + "num_views": ("INT", {"default": 6, "min": 1, "max": 12}), + "prompt": ( + "STRING", + {"multiline": True, "default": "an astronaut riding a horse"}, + ), + "negative_prompt": ( + "STRING", + {"multiline": True, "default": "watermark, ugly, deformed, noisy, blurry, low contrast"}, + ), + "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}), + "steps": ("INT", {"default": 50, "min": 1, "max": 2000}), + "cfg": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01}), + "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}), + }, + "optional": { + "reference_image": ("IMAGE",), + "controlnet_image": ("IMAGE",), + "controlnet_conditioning_scale": ("FLOAT", {"default": 1.0}), + "azimuth_degrees": ("LIST", {"default": [0, 45, 90, 180, 270, 315]}), + # Task 1 – reference-only prompt fields + "reference_prompt": ("STRING", {"multiline": True, "default": ""}), + "reference_prompt_2": ("STRING", {"multiline": True, "default": ""}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "sample" + CATEGORY = "MV-Adapter" + + def sample( + self, + pipeline, + num_views, + prompt, + negative_prompt, + height, + width, + steps, + cfg, + seed, + reference_image=None, + controlnet_image=None, + controlnet_conditioning_scale=1.0, + azimuth_degrees=[0, 45, 90, 180, 270, 315], + reference_prompt="", + reference_prompt_2="", + ): + num_views = len(azimuth_degrees) + control_images = prepare_camera_embed(num_views, width, self.torch_device, azimuth_degrees) + + pipe_kwargs = {} + if reference_image is not None: + pipe_kwargs["reference_image"] = convert_tensors_to_images(reference_image)[0] + pipe_kwargs["reference_conditioning_scale"] = 1.0 + if controlnet_image is not None: + controlnet_image = convert_tensors_to_images(controlnet_image) + pipe_kwargs["controlnet_image"] = controlnet_image + pipe_kwargs["controlnet_conditioning_scale"] = controlnet_conditioning_scale + + rp = (reference_prompt or "").strip() + rp2 = (reference_prompt_2 or "").strip() + if rp: + pipe_kwargs["reference_prompt"] = rp + if rp2: + pipe_kwargs["reference_prompt_2"] = rp2 + + images = pipeline( + prompt=prompt, + height=height, + width=width, + num_inference_steps=steps, + guidance_scale=cfg, + num_images_per_prompt=num_views, + control_image=control_images, + control_conditioning_scale=1.0, + negative_prompt=negative_prompt, + generator=torch.Generator(self.torch_device).manual_seed(seed), + cross_attention_kwargs={"num_views": num_views}, + **pipe_kwargs, + ).images + return (convert_images_to_tensors(images),) + + +class ViewSelector: + def __init__(self): + pass + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "front_view": ("BOOLEAN", {"default": True}), + "front_right_view": ("BOOLEAN", {"default": True}), + "right_view": ("BOOLEAN", {"default": True}), + "back_view": ("BOOLEAN", {"default": True}), + "left_view": ("BOOLEAN", {"default": True}), + "front_left_view": ("BOOLEAN", {"default": True}), + } + } + + RETURN_TYPES = ("LIST",) + FUNCTION = "process" + CATEGORY = "MV-Adapter" + + def process( + self, + front_view, + front_right_view, + 
right_view, + back_view, + left_view, + front_left_view, + ): + azimuth_deg = [] + if front_view: + azimuth_deg.append(0) + if front_right_view: + azimuth_deg.append(45) + if right_view: + azimuth_deg.append(90) + if back_view: + azimuth_deg.append(180) + if left_view: + azimuth_deg.append(270) + if front_left_view: + azimuth_deg.append(315) + return (azimuth_deg,) + + +NODE_CLASS_MAPPINGS = { + "LdmVaeLoader": LdmVaeLoader, + "DiffusersMVSchedulerLoaderKarras": DiffusersMVSchedulerLoaderKarras, + "DiffusersMVSampler": DiffusersMVSampler, + "ViewSelector": ViewSelector, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "LdmVaeLoader": "LDM Vae Loader", + "DiffusersMVSchedulerLoaderKarras": "Diffusers MV Scheduler Loader (Karras)", + "DiffusersMVSampler": "Diffusers MV Sampler", + "ViewSelector": "View Selector", +} diff --git a/comfyui-mvadapter/nodes_local_mv.py b/comfyui-mvadapter/nodes_local_mv.py new file mode 100644 index 0000000000000000000000000000000000000000..b81d0bcbc81079c33a97a920cab44d224552acde --- /dev/null +++ b/comfyui-mvadapter/nodes_local_mv.py @@ -0,0 +1,248 @@ +# nodes_local_mv.py +# Local-only MV-Adapter nodes: no auto-downloads, no HF Hub. +# Pick EVERYTHING from your ComfyUI model folders. + +from PIL import Image +import os +import torch +from torchvision import transforms + +import folder_paths +from comfy.model_management import get_torch_device + +from transformers import AutoModelForImageSegmentation + +from .utils import ( + PIPELINES, # includes MVAdapterI2MVSDXLPipeline + convert_images_to_tensors, + convert_tensors_to_images, + preprocess_image, +) + +# ------------------------------------------------------------ +# LDM Pipeline Loader (Local .ckpt/.safetensors from checkpoints/) +# ------------------------------------------------------------ + +class LocalMVPipelineLoader: + """ + Load an MV-Adapter pipeline from a *single local checkpoint file* (ckpt/safetensors) + sitting in models/checkpoints/. No HuggingFace usage. + """ + def __init__(self): + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "ckpt_name": (folder_paths.get_filename_list("checkpoints"),), + "pipeline_name": (list(PIPELINES.keys()), {"default": "MVAdapterI2MVSDXLPipeline"}), + } + } + + RETURN_TYPES = ("PIPELINE", "AUTOENCODER", "SCHEDULER") + FUNCTION = "create_pipeline" + CATEGORY = "MV-Adapter (Local)" + + def create_pipeline(self, ckpt_name, pipeline_name): + pipeline_class = PIPELINES[pipeline_name] + full_path = folder_paths.get_full_path("checkpoints", ckpt_name) + pipe = pipeline_class.from_single_file( + pretrained_model_link_or_path=full_path, + torch_dtype=self.dtype, + cache_dir=None, + local_files_only=True, + ) + return (pipe, pipe.vae, pipe.scheduler) + + +# ------------------------------------------------------------ +# MV Model Makeup (Local adapter .safetensors picked from checkpoints/) +# ------------------------------------------------------------ + +class LocalMVModelMakeup: + """ + Attach MV-Adapter weights from a *local* .safetensors file in models/checkpoints/. + No huggingface repo ids; no cache downloads. 
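+
+    Typical wiring (illustrative): LocalMVPipelineLoader -> LocalMVModelMakeup -> a sampler
+    node such as DiffusersMVSampler, with adapter_file pointing at an MV-Adapter weight
+    (e.g. mvadapter_i2mv_sdxl.safetensors) placed in models/checkpoints/.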
+ """ + def __init__(self): + self.torch_device = get_torch_device() + self.dtype = torch.float16 + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "pipeline": ("PIPELINE",), + "scheduler": ("SCHEDULER",), + "autoencoder": ("AUTOENCODER",), + "adapter_file": (folder_paths.get_filename_list("checkpoints"),), + "num_views": ("INT", {"default": 6, "min": 1, "max": 12}), + }, + "optional": { + "enable_vae_slicing": ("BOOLEAN", {"default": True}), + "enable_vae_tiling": ("BOOLEAN", {"default": False}), + }, + } + + RETURN_TYPES = ("PIPELINE",) + FUNCTION = "makeup_pipeline" + CATEGORY = "MV-Adapter (Local)" + + def makeup_pipeline( + self, + pipeline, + scheduler, + autoencoder, + adapter_file, + num_views, + enable_vae_slicing=True, + enable_vae_tiling=False, + ): + pipeline.vae = autoencoder + pipeline.scheduler = scheduler + + pipeline.init_custom_adapter(num_views=num_views) + + adapter_path_full = folder_paths.get_full_path("checkpoints", adapter_file) + adapter_dir = os.path.dirname(adapter_path_full) + adapter_name = os.path.basename(adapter_path_full) + + pipeline.load_custom_adapter(adapter_dir, weight_name=adapter_name, cache_dir=None) + pipeline.cond_encoder.to(device=self.torch_device, dtype=self.dtype) + + pipeline = pipeline.to(self.torch_device, self.dtype) + + if enable_vae_slicing: + pipeline.enable_vae_slicing() + if enable_vae_tiling: + pipeline.enable_vae_tiling() + + return (pipeline,) + + +# ------------------------------------------------------------ +# BiRefNet (Local) — select from models/checkpoints/ +# ------------------------------------------------------------ + +class BiRefNetLocal: + """ + Load a background-removal (image segmentation) model by **selecting a file in models/checkpoints/**. + We try to treat the *parent directory* of the selected file as a local Transformers folder. + If that fails, we fall back to a pass-through remove_bg that simply adds an opaque alpha channel. 
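+
+    Expected layout (illustrative): a model folder under models/checkpoints/ saved in the
+    usual Transformers format (config.json plus weight files); any file inside it can be
+    selected, since only its parent directory is passed to from_pretrained.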
+ """ + def __init__(self): + self.torch_device = get_torch_device() + self.dtype = torch.float32 + + RETURN_TYPES = ("FUNCTION",) + FUNCTION = "load_model_fn" + CATEGORY = "MV-Adapter (Local)" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "model_ckpt": (folder_paths.get_filename_list("checkpoints"),), + "resize_to": ("INT", {"default": 1024, "min": 64, "max": 2048, "step": 64}), + } + } + + def _remove_bg(self, image, net, transform, device): + image_size = image.size + input_images = transform(image).unsqueeze(0).to(device) + with torch.no_grad(): + preds = net(input_images)[-1].sigmoid().cpu() + pred = preds[0].squeeze() + pred_pil = transforms.ToPILImage()(pred) + mask = pred_pil.resize(image_size) + image = image.convert("RGBA") + image.putalpha(mask) + return image + + def _remove_bg_passthrough(self, image): + image = image.convert("RGBA") + opaque = Image.new("L", image.size, 255) + image.putalpha(opaque) + return image + + def load_model_fn(self, model_ckpt, resize_to): + chosen = folder_paths.get_full_path("checkpoints", model_ckpt) + try_dir = chosen if os.path.isdir(chosen) else os.path.dirname(chosen) + + model = None + try: + model = AutoModelForImageSegmentation.from_pretrained( + try_dir, trust_remote_code=True, local_files_only=True + ).to(self.torch_device, self.dtype) + print(f"[BiRefNetLocal] Loaded segmentation model from: {try_dir}") + except Exception as e: + print(f"[BiRefNetLocal] WARN: Failed to load a Transformers model from '{try_dir}'.") + print(f"[BiRefNetLocal] Reason: {e}") + print("[BiRefNetLocal] Falling back to opaque-alpha passthrough (no background removal).") + + transform_image = transforms.Compose( + [ + transforms.Resize((resize_to, resize_to)), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + + if model is None: + remove_bg_fn = lambda x: self._remove_bg_passthrough(x) + else: + remove_bg_fn = lambda x: self._remove_bg(x, model, transform_image, self.torch_device) + + return (remove_bg_fn,) + + +# ------------------------------------------------------------ +# Image Preprocessor (Local) +# ------------------------------------------------------------ + +class ImagePreprocessorLocal: + """ + Apply background removal function (from BiRefNetLocal) and pack to target HxW. 
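+
+    Each input image is converted to RGB, passed through remove_bg_fn, cropped to its
+    alpha bounding box, resized to about 90% of the target size, centered with padding,
+    and composited over a mid-gray background (see preprocess_image in utils.py).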
+ """ + def __init__(self): + self.torch_device = get_torch_device() + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "remove_bg_fn": ("FUNCTION",), + "image": ("IMAGE",), + "height": ("INT", {"default": 768, "min": 64, "max": 2048, "step": 1}), + "width": ("INT", {"default": 768, "min": 64, "max": 2048, "step": 1}), + } + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "MV-Adapter (Local)" + + def process(self, remove_bg_fn, image, height, width): + images = convert_tensors_to_images(image) + images = [preprocess_image(remove_bg_fn(img.convert("RGB")), height, width) for img in images] + return (convert_images_to_tensors(images),) + + +# --------------------------- +# Node registration +# --------------------------- + +NODE_CLASS_MAPPINGS = { + "LocalMVPipelineLoader": LocalMVPipelineLoader, + "LocalMVModelMakeup": LocalMVModelMakeup, + "BiRefNetLocal": BiRefNetLocal, + "ImagePreprocessorLocal": ImagePreprocessorLocal, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "LocalMVPipelineLoader": "LDM Pipeline Loader (Local)", + "LocalMVModelMakeup": "MV Model Makeup (Local Adapter File)", + "BiRefNetLocal": "BiRefNet (Local)", + "ImagePreprocessorLocal": "Image Preprocessor (Local)", +} diff --git a/comfyui-mvadapter/pyproject.toml b/comfyui-mvadapter/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..102131ddbfae5d71599756e3a5d486e2b87fc59b --- /dev/null +++ b/comfyui-mvadapter/pyproject.toml @@ -0,0 +1,15 @@ +[project] +name = "comfyui-mvadapter" +description = "This extension integrates [a/MV-Adapter](https://github.com/huanngzh/MV-Adapter) into ComfyUI, allowing users to generate multi-view consistent images from text prompts or single images directly within the ComfyUI interface." +version = "1.0.2" +license = {file = "LICENSE"} +dependencies = ["torch>=2.1.1", "torchvision>=0.16.1", "diffusers>=0.31.0", "transformers>=4.46.3", "peft", "numpy>=1.26.2", "huggingface_hub>=0.24.6", "accelerate>=1.1.1", "opencv-python", "safetensors", "pillow", "omegaconf", "trimesh", "einops", "timm", "kornia", "scikit-image"] + +[project.urls] +Repository = "https://github.com/huanngzh/ComfyUI-MVAdapter" +# Used by Comfy Registry https://comfyregistry.org + +[tool.comfy] +PublisherId = "huanngzh" +DisplayName = "ComfyUI-MVAdapter" +Icon = "" diff --git a/comfyui-mvadapter/requirements.txt b/comfyui-mvadapter/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec63f1a0b90bc8ac3bd0868cf8ac1593586eeea --- /dev/null +++ b/comfyui-mvadapter/requirements.txt @@ -0,0 +1,17 @@ +torch>=2.1.1 +torchvision>=0.16.1 +diffusers==0.31.0 +transformers==4.46.3 +peft +numpy>=1.26.2 +huggingface_hub==0.24.6 +accelerate==1.1.1 +opencv-python +safetensors +pillow +omegaconf +trimesh +einops +timm +kornia +scikit-image \ No newline at end of file diff --git a/comfyui-mvadapter/utils.py b/comfyui-mvadapter/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4356a41d9eb9035ebad151c200d521aaff557897 --- /dev/null +++ b/comfyui-mvadapter/utils.py @@ -0,0 +1,279 @@ +# Adapted from https://github.com/Limitex/ComfyUI-Diffusers/blob/main/utils.py + +import os +import torch +import numpy as np +from PIL import Image +from omegaconf import OmegaConf +from torchvision.transforms import ToTensor +from diffusers.pipelines.stable_diffusion.convert_from_ckpt import ( + assign_to_checkpoint, + conv_attn_to_linear, + create_vae_diffusers_config, + renew_vae_attention_paths, + renew_vae_resnet_paths, +) +from diffusers 
import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + UniPCMultistepScheduler, + LCMScheduler, +) + +# KEEP ONLY the I2MV SDXL pipeline +from .mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline + +# Geometry / utils used by nodes +from .mvadapter.utils import ( + get_orthogonal_camera, + get_plucker_embeds_from_cameras_ortho, + make_image_grid, +) + +NODE_CACHE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache") + +# ------------------------- +# Pipelines (trimmed) +# ------------------------- +PIPELINES = { + "MVAdapterI2MVSDXLPipeline": MVAdapterI2MVSDXLPipeline, +} + +# ------------------------- +# Schedulers (unchanged) +# ------------------------- +SCHEDULERS = { + "DDIM": DDIMScheduler, + "DDPM": DDPMScheduler, + "DEISMultistep": DEISMultistepScheduler, + "DPMSolverMultistep": DPMSolverMultistepScheduler, + "DPMSolverSinglestep": DPMSolverSinglestepScheduler, + "EulerAncestralDiscrete": EulerAncestralDiscreteScheduler, + "EulerDiscrete": EulerDiscreteScheduler, + "HeunDiscrete": HeunDiscreteScheduler, + "KDPM2AncestralDiscrete": KDPM2AncestralDiscreteScheduler, + "KDPM2Discrete": KDPM2DiscreteScheduler, + "UniPCMultistep": UniPCMultistepScheduler, + "LCM": LCMScheduler, +} + +# ------------------------- +# MV-Adapter files (trimmed) +# MOST IMPORTANT: keep beta +# ------------------------- +MVADAPTERS = [ + "mvadapter_i2mv_sdxl.safetensors", + "mvadapter_i2mv_sdxl_beta.safetensors", # <-- MOST IMPORTANT +] + +# -------------------------------------------------------------------- +# VAE conversion helpers (local-only; no network downloads permitted) +# -------------------------------------------------------------------- + +# Reference from : https://github.com/huggingface/diffusers/blob/main/scripts/convert_vae_pt_to_diffusers.py +def custom_convert_ldm_vae_checkpoint(checkpoint, config): + vae_state_dict = checkpoint + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # 
Retrieves the keys for the encoder down blocks only + num_down_blocks = len( + {".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer} + ) + down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len( + {".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer} + ) + up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)} + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.weight") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(f"encoder.down.{i}.downsample.conv.bias") + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.weight"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[f"decoder.up.{block_id}.upsample.conv.bias"] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + 
assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +# Reference from : https://github.com/huggingface/diffusers/blob/main/scripts/convert_vae_pt_to_diffusers.py +def vae_pt_to_vae_diffuser(checkpoint_path: str, force_upcast: bool = True): + """ + Local-only conversion. Requires a local 'stable-diffusion-v1-inference.yaml' in: + custom_nodes/comfyui-mvadapter/cache/ + + We intentionally do NOT download anything. If the config is missing, raise and tell the user + to place it locally. + """ + config_path = os.path.join(NODE_CACHE_PATH, "stable-diffusion-v1-inference.yaml") + if not os.path.exists(config_path): + raise FileNotFoundError( + f"Missing config: {config_path}\n" + "Place 'stable-diffusion-v1-inference.yaml' in the 'cache' folder next to utils.py.\n" + "You can get it from the original Stable Diffusion repo." + ) + original_config = OmegaConf.load(config_path) + + image_size = 512 + device = "cuda" if torch.cuda.is_available() else "cpu" + + if checkpoint_path.endswith("safetensors"): + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + checkpoint = torch.load(checkpoint_path, map_location=device)["state_dict"] + + # Convert the VAE model. + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + vae_config.update({"force_upcast": force_upcast}) + converted_vae_checkpoint = custom_convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + return vae + + +def convert_images_to_tensors(images: list[Image.Image]): + return torch.stack([np.transpose(ToTensor()(image), (1, 2, 0)) for image in images]) + + +def convert_tensors_to_images(images: torch.tensor): + return [ + Image.fromarray(np.clip(255.0 * image.cpu().numpy(), 0, 255).astype(np.uint8)) + for image in images + ] + + +def resize_images(images: list[Image.Image], size: tuple[int, int]): + return [image.resize(size) for image in images] + + +def prepare_camera_embed(num_views, size, device, azimuth_degrees=None): + cameras = get_orthogonal_camera( + elevation_deg=[0] * num_views, + distance=[1.8] * num_views, + left=-0.55, + right=0.55, + bottom=-0.55, + top=0.55, + azimuth_deg=[x - 90 for x in azimuth_degrees], + device=device, + ) + + plucker_embeds = get_plucker_embeds_from_cameras_ortho( + cameras.c2w, [1.1] * num_views, size + ) + control_images = ((plucker_embeds + 1.0) / 2.0).clamp(0, 1) + + return control_images + + +def preprocess_image(image: Image.Image, height, width): + image = np.array(image) + alpha = image[..., 3] > 0 + H, W = alpha.shape + # get the bounding box of alpha + y, x = np.where(alpha) + y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H) + x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W) + image_center = image[y0:y1, x0:x1] + # resize the longer side to H * 0.9 + H, W, _ = image_center.shape + if H > W: + W = int(W * (height * 0.9) / H) + H = int(height * 0.9) + else: + H = int(H * (width * 0.9) / W) + W = int(width * 0.9) + image_center = np.array(Image.fromarray(image_center).resize((W, H))) + # pad to H, W + start_h = (height - H) // 2 + start_w = (width - W) // 2 + image = np.zeros((height, width, 4), dtype=np.uint8) + image[start_h : start_h + H, start_w : start_w + W] = image_center + image = image.astype(np.float32) / 
255.0 + image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5 + image = (image * 255).clip(0, 255).astype(np.uint8) + image = Image.fromarray(image) + + return image diff --git a/comfyui-mvadapter/workflows/i2mv_sdxl_diffusers.json b/comfyui-mvadapter/workflows/i2mv_sdxl_diffusers.json new file mode 100644 index 0000000000000000000000000000000000000000..b48ecdd739127ad6d528b8539e10655745ed0074 --- /dev/null +++ b/comfyui-mvadapter/workflows/i2mv_sdxl_diffusers.json @@ -0,0 +1,489 @@ +{ + "last_node_id": 11, + "last_link_id": 10, + "nodes": [ + { + "id": 6, + "type": "DiffusersMVModelMakeup", + "pos": [ + 944.978759765625, + 234.29940795898438 + ], + "size": [ + 315, + 170 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 5 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_i2mv_sdxl.safetensors", + 6 + ] + }, + { + "id": 5, + "type": "DiffusersMVVaeLoader", + "pos": [ + 519.3989868164062, + 334.48828125 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVVaeLoader" + }, + "widgets_values": [ + "madebyollin/sdxl-vae-fp16-fix" + ] + }, + { + "id": 3, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 515.5944213867188, + 125.65931701660156 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 2, + "type": "BiRefNet", + "pos": [ + 521.8474731445312, + -224.9335479736328 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "FUNCTION", + "type": "FUNCTION", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "BiRefNet" + }, + "widgets_values": [ + "ZhengPeng7/BiRefNet" + ] + }, + { + "id": 8, + "type": "LoadImage", + "pos": [ + 940.2247314453125, + -300.4877014160156 + ], + "size": [ + 315, + 314 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "已移除背景的image (1).jpeg", + "image" + ] + }, + { + "id": 10, + "type": "PreviewImage", + "pos": [ + 1337.1131591796875, + -263.8614501953125 + ], + "size": [ + 313.3982849121094, + 246 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 8 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 9, + "type": "ImagePreprocessor", + "pos": [ + 
944.402099609375, + 75.06153869628906 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "remove_bg_fn", + "type": "FUNCTION", + "link": 6 + }, + { + "name": "image", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 8, + 9 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 7, + "type": "DiffusersMVSampler", + "pos": [ + 1324.947265625, + 70.82652282714844 + ], + "size": [ + 400, + 314 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 5 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": 9, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 10 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "A decorative figurine of a young anime-style girl", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 3, + 490054611146870, + "randomize" + ] + }, + { + "id": 11, + "type": "PreviewImage", + "pos": [ + 1778.79638671875, + -213.63694763183594 + ], + "size": [ + 365.73077392578125, + 534.254150390625 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 10 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 1, + "type": "DiffusersMVPipelineLoader", + "pos": [ + 519.635498046875, + -73.85352325439453 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 1, + 2 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "DiffusersMVPipelineLoader" + }, + "widgets_values": [ + "stabilityai/stable-diffusion-xl-base-1.0", + "MVAdapterI2MVSDXLPipeline" + ] + } + ], + "links": [ + [ + 1, + 1, + 0, + 3, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 6, + 0, + "PIPELINE" + ], + [ + 3, + 3, + 0, + 6, + 1, + "SCHEDULER" + ], + [ + 4, + 5, + 0, + 6, + 2, + "AUTOENCODER" + ], + [ + 5, + 6, + 0, + 7, + 0, + "PIPELINE" + ], + [ + 6, + 2, + 0, + 9, + 0, + "FUNCTION" + ], + [ + 7, + 8, + 0, + 9, + 1, + "IMAGE" + ], + [ + 8, + 9, + 0, + 10, + 0, + "IMAGE" + ], + [ + 9, + 9, + 0, + 7, + 1, + "IMAGE" + ], + [ + 10, + 7, + 0, + 11, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + -46.02437931617331, + 392.2111603041893 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/i2mv_sdxl_ldm.json b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm.json new file mode 100644 index 0000000000000000000000000000000000000000..0039ceaa770e70281a94c5586f78ddc81f3f70ed --- /dev/null +++ b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm.json @@ -0,0 +1,490 @@ +{ + "last_node_id": 11, + "last_link_id": 10, + "nodes": [ + { + "id": 3, + "type": "LdmVaeLoader", + "pos": [ + 460.4966125488281, + 490.3278503417969 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 
4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 4, + "type": "DiffusersMVModelMakeup", + "pos": [ + 873.04052734375, + 378.09552001953125 + ], + "size": [ + 315, + 170 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 5 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_i2mv_sdxl.safetensors", + 6 + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 459.6553649902344, + 47.24098205566406 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 1, + 2 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterI2MVSDXLPipeline" + ] + }, + { + "id": 2, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 452.2912292480469, + 260.1961975097656 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 9, + "type": "BiRefNet", + "pos": [ + 461.276123046875, + -98.90441131591797 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "FUNCTION", + "type": "FUNCTION", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "BiRefNet" + }, + "widgets_values": [ + "ZhengPeng7/BiRefNet" + ] + }, + { + "id": 8, + "type": "ImagePreprocessor", + "pos": [ + 879.4166870117188, + 160.73989868164062 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "remove_bg_fn", + "type": "FUNCTION", + "link": 6 + }, + { + "name": "image", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 8, + 9 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 7, + "type": "LoadImage", + "pos": [ + 873.844482421875, + -214.40762329101562 + ], + "size": [ + 316.98516845703125, + 314 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "已移除背景的image (1).jpeg", + "image" + ] + }, + { + "id": 10, + "type": "PreviewImage", + "pos": [ + 1288.9661865234375, + -175.12034606933594 + ], + "size": [ 
+ 267.6073303222656, + 276.0325927734375 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 8 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 11, + "type": "PreviewImage", + "pos": [ + 1719.0726318359375, + -112.28995513916016 + ], + "size": [ + 390.2191162109375, + 614.5867919921875 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 10 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 6, + "type": "DiffusersMVSampler", + "pos": [ + 1270.0447998046875, + 187.63858032226562 + ], + "size": [ + 398.4827880859375, + 355.830078125 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 5 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": 9, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 10 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "A decorative figurine of a young anime-style girl", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 3, + 21, + "fixed" + ] + } + ], + "links": [ + [ + 1, + 1, + 0, + 2, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 4, + 0, + "PIPELINE" + ], + [ + 3, + 2, + 0, + 4, + 1, + "SCHEDULER" + ], + [ + 4, + 3, + 0, + 4, + 2, + "AUTOENCODER" + ], + [ + 5, + 4, + 0, + 6, + 0, + "PIPELINE" + ], + [ + 6, + 9, + 0, + 8, + 0, + "FUNCTION" + ], + [ + 7, + 7, + 0, + 8, + 1, + "IMAGE" + ], + [ + 8, + 8, + 0, + 10, + 0, + "IMAGE" + ], + [ + 9, + 8, + 0, + 6, + 1, + "IMAGE" + ], + [ + 10, + 6, + 0, + 11, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + 5.887456621326669, + 285.11670717918946 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_lora.json b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_lora.json new file mode 100644 index 0000000000000000000000000000000000000000..3cb18a5534e8b16e91f722e7cd57dc898332c73f --- /dev/null +++ b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_lora.json @@ -0,0 +1,540 @@ +{ + "last_node_id": 12, + "last_link_id": 12, + "nodes": [ + { + "id": 3, + "type": "LdmVaeLoader", + "pos": [ + 460.4966125488281, + 490.3278503417969 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 459.6553649902344, + 47.24098205566406 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 1, + 2 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterI2MVSDXLPipeline" + ] + }, + { + "id": 2, + "type": 
"DiffusersMVSchedulerLoader", + "pos": [ + 452.2912292480469, + 260.1961975097656 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 9, + "type": "BiRefNet", + "pos": [ + 461.276123046875, + -98.90441131591797 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "FUNCTION", + "type": "FUNCTION", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "BiRefNet" + }, + "widgets_values": [ + "ZhengPeng7/BiRefNet" + ] + }, + { + "id": 8, + "type": "ImagePreprocessor", + "pos": [ + 879.4166870117188, + 160.73989868164062 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "remove_bg_fn", + "type": "FUNCTION", + "link": 6 + }, + { + "name": "image", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 8, + 9 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 7, + "type": "LoadImage", + "pos": [ + 873.844482421875, + -214.40762329101562 + ], + "size": [ + 316.98516845703125, + 314 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "已移除背景的image (1).jpeg", + "image" + ] + }, + { + "id": 11, + "type": "PreviewImage", + "pos": [ + 1987.6944580078125, + -87.29558563232422 + ], + "size": [ + 390.2191162109375, + 614.5867919921875 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 10 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 10, + "type": "PreviewImage", + "pos": [ + 1395.756591796875, + -179.6309814453125 + ], + "size": [ + 267.6073303222656, + 276.0325927734375 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 8 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 4, + "type": "DiffusersMVModelMakeup", + "pos": [ + 873.04052734375, + 378.09552001953125 + ], + "size": [ + 315, + 194 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 11 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_i2mv_sdxl.safetensors", + 6, + true + ] + }, + { + "id": 12, + "type": "CustomLoraModelLoader", + "pos": [ + 1206.2666015625, + 348.32861328125 + ], + "size": [ + 315, + 82 + 
], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 11 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 12 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3d_render_style_xl.safetensors", + 1, + true, + true + ] + }, + { + "id": 6, + "type": "DiffusersMVSampler", + "pos": [ + 1545.4605712890625, + 165.60733032226562 + ], + "size": [ + 398.4827880859375, + 355.830078125 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 12 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": 9, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 10 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "3d style, A decorative figurine of a young anime-style girl", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 3, + 21, + "fixed" + ] + } + ], + "links": [ + [ + 1, + 1, + 0, + 2, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 4, + 0, + "PIPELINE" + ], + [ + 3, + 2, + 0, + 4, + 1, + "SCHEDULER" + ], + [ + 4, + 3, + 0, + 4, + 2, + "AUTOENCODER" + ], + [ + 6, + 9, + 0, + 8, + 0, + "FUNCTION" + ], + [ + 7, + 7, + 0, + 8, + 1, + "IMAGE" + ], + [ + 8, + 8, + 0, + 10, + 0, + "IMAGE" + ], + [ + 9, + 8, + 0, + 6, + 1, + "IMAGE" + ], + [ + 10, + 6, + 0, + 11, + 0, + "IMAGE" + ], + [ + 11, + 4, + 0, + 12, + 0, + "PIPELINE" + ], + [ + 12, + 12, + 0, + 6, + 0, + "PIPELINE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.620921323059155, + "offset": [ + -328.02932510914184, + 300.9334967924711 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_multiple_loras.json b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_multiple_loras.json new file mode 100644 index 0000000000000000000000000000000000000000..68dfa2dae5b30f62e51e2aeeff61af23f6b9ad59 --- /dev/null +++ b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_multiple_loras.json @@ -0,0 +1,655 @@ +{ + "id": "e247e559-e7e7-48b2-8b2a-935cc48735ca", + "revision": 0, + "last_node_id": 14, + "last_link_id": 15, + "nodes": [ + { + "id": 3, + "type": "LdmVaeLoader", + "pos": [ + 460.4966125488281, + 490.3278503417969 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "slot_index": 0, + "links": [ + 4 + ] + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 459.6553649902344, + 47.24098205566406 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 1, + 2 + ] + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterI2MVSDXLPipeline" + ] + }, + { + "id": 2, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 452.2912292480469, + 260.1961975097656 + ], + "size": [ + 327.5999755859375, + 130 + ], 
+ "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "slot_index": 0, + "links": [ + 3 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 9, + "type": "BiRefNet", + "pos": [ + 461.276123046875, + -98.90441131591797 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "FUNCTION", + "type": "FUNCTION", + "slot_index": 0, + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "BiRefNet" + }, + "widgets_values": [ + "ZhengPeng7/BiRefNet" + ] + }, + { + "id": 8, + "type": "ImagePreprocessor", + "pos": [ + 879.4166870117188, + 160.73989868164062 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "remove_bg_fn", + "type": "FUNCTION", + "link": 6 + }, + { + "name": "image", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 8, + 9 + ] + } + ], + "properties": { + "Node name for S&R": "ImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 10, + "type": "PreviewImage", + "pos": [ + 1395.756591796875, + -179.6309814453125 + ], + "size": [ + 267.6073303222656, + 276.0325927734375 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 8 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 4, + "type": "DiffusersMVModelMakeup", + "pos": [ + 873.04052734375, + 378.09552001953125 + ], + "size": [ + 315, + 218 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 11 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_i2mv_sdxl.safetensors", + 6, + true, + false + ] + }, + { + "id": 11, + "type": "PreviewImage", + "pos": [ + 2012.598876953125, + -84.84674072265625 + ], + "size": [ + 390.2191162109375, + 614.5867919921875 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 10 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 14, + "type": "CustomLoraModelLoader", + "pos": [ + 1213.9622802734375, + 616.24169921875 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 14 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 15 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "APose_v2.safetensors", + 1, + true, + true + ] + }, + { + "id": 7, + "type": "LoadImage", + "pos": [ + 873.844482421875, + -214.40762329101562 + ], + "size": [ + 316.98516845703125, + 314 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + 
"name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 7 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "example.png", + "image" + ] + }, + { + "id": 12, + "type": "CustomLoraModelLoader", + "pos": [ + 1209.0845947265625, + 254.66990661621094 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 11 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3d_render_style_xl.safetensors", + 1, + true, + false + ] + }, + { + "id": 13, + "type": "CustomLoraModelLoader", + "pos": [ + 1210.6497802734375, + 435.50677490234375 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 13 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 14 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3dpolygonStyle.safetensors", + 1, + true, + false + ] + }, + { + "id": 6, + "type": "DiffusersMVSampler", + "pos": [ + 1567.6197509765625, + 148.37710571289062 + ], + "size": [ + 398.4827880859375, + 394 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 15 + }, + { + "name": "reference_image", + "shape": 7, + "type": "IMAGE", + "link": 9 + }, + { + "name": "controlnet_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "azimuth_degrees", + "shape": 7, + "type": "LIST", + "link": null + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 10 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "3d style, A decorative figurine of a young anime-style girl", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 3, + 21, + "fixed", + 1.7000000000000002 + ] + } + ], + "links": [ + [ + 1, + 1, + 0, + 2, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 4, + 0, + "PIPELINE" + ], + [ + 3, + 2, + 0, + 4, + 1, + "SCHEDULER" + ], + [ + 4, + 3, + 0, + 4, + 2, + "AUTOENCODER" + ], + [ + 6, + 9, + 0, + 8, + 0, + "FUNCTION" + ], + [ + 7, + 7, + 0, + 8, + 1, + "IMAGE" + ], + [ + 8, + 8, + 0, + 10, + 0, + "IMAGE" + ], + [ + 9, + 8, + 0, + 6, + 1, + "IMAGE" + ], + [ + 10, + 6, + 0, + 11, + 0, + "IMAGE" + ], + [ + 11, + 4, + 0, + 12, + 0, + "PIPELINE" + ], + [ + 13, + 12, + 0, + 13, + 0, + "PIPELINE" + ], + [ + 14, + 13, + 0, + 14, + 0, + "PIPELINE" + ], + [ + 15, + 14, + 0, + 6, + 0, + "PIPELINE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.7513148009015777, + "offset": [ + -384.4616398829057, + 153.4666862355734 + ] + }, + "frontendVersion": "1.22.2" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_view_selector.json b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_view_selector.json new file mode 100644 index 0000000000000000000000000000000000000000..1f2fcce4a22c989917871e093611c919527f3284 --- /dev/null +++ b/comfyui-mvadapter/workflows/i2mv_sdxl_ldm_view_selector.json @@ -0,0 +1,550 @@ +{ + "last_node_id": 12, + "last_link_id": 11, + "nodes": [ + { + "id": 3, 
+ "type": "LdmVaeLoader", + "pos": [ + 460.4966125488281, + 490.3278503417969 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 2, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 452.2912292480469, + 260.1961975097656 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 9, + "type": "BiRefNet", + "pos": [ + 461.276123046875, + -98.90441131591797 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "FUNCTION", + "type": "FUNCTION", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "BiRefNet" + }, + "widgets_values": [ + "ZhengPeng7/BiRefNet" + ] + }, + { + "id": 8, + "type": "ImagePreprocessor", + "pos": [ + 879.4166870117188, + 160.73989868164062 + ], + "size": [ + 315, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "remove_bg_fn", + "type": "FUNCTION", + "link": 6 + }, + { + "name": "image", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 8, + 9 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 7, + "type": "LoadImage", + "pos": [ + 873.844482421875, + -214.40762329101562 + ], + "size": [ + 316.98516845703125, + 314 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "已移除背景的image (1).jpeg", + "image" + ] + }, + { + "id": 10, + "type": "PreviewImage", + "pos": [ + 1288.9661865234375, + -175.12034606933594 + ], + "size": [ + 267.6073303222656, + 276.0325927734375 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 8 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 459.6553649902344, + 47.24098205566406 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 1, + 2 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterI2MVSDXLPipeline" + ] + }, + { + "id": 12, + "type": "ViewSelector", + "pos": [ + 1219.32373046875, + 335.45733642578125 + ], + "size": [ + 315, + 178 + ], + "flags": {}, + "order": 4, + 
"mode": 0, + "inputs": [], + "outputs": [ + { + "name": "LIST", + "type": "LIST", + "links": [ + 11 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ViewSelector" + }, + "widgets_values": [ + true, + false, + true, + true, + false, + false + ] + }, + { + "id": 11, + "type": "PreviewImage", + "pos": [ + 2021.5838623046875, + -108.2677001953125 + ], + "size": [ + 390.2191162109375, + 614.5867919921875 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 10 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 4, + "type": "DiffusersMVModelMakeup", + "pos": [ + 844.7613525390625, + 364.16156005859375 + ], + "size": [ + 350.9596862792969, + 218 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 5 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_i2mv_sdxl_beta.safetensors", + 6, + true, + false + ] + }, + { + "id": 6, + "type": "DiffusersMVSampler", + "pos": [ + 1561.8929443359375, + 158.94821166992188 + ], + "size": [ + 398.4827880859375, + 378 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 5 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": 9, + "shape": 7 + }, + { + "name": "controlnet_image", + "type": "IMAGE", + "link": null, + "shape": 7 + }, + { + "name": "azimuth_degrees", + "type": "LIST", + "link": 11, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 10 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "A decorative figurine of a young anime-style girl", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 3, + 0, + "fixed", + 1 + ] + } + ], + "links": [ + [ + 1, + 1, + 0, + 2, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 4, + 0, + "PIPELINE" + ], + [ + 3, + 2, + 0, + 4, + 1, + "SCHEDULER" + ], + [ + 4, + 3, + 0, + 4, + 2, + "AUTOENCODER" + ], + [ + 5, + 4, + 0, + 6, + 0, + "PIPELINE" + ], + [ + 6, + 9, + 0, + 8, + 0, + "FUNCTION" + ], + [ + 7, + 7, + 0, + 8, + 1, + "IMAGE" + ], + [ + 8, + 8, + 0, + 10, + 0, + "IMAGE" + ], + [ + 9, + 8, + 0, + 6, + 1, + "IMAGE" + ], + [ + 10, + 6, + 0, + 11, + 0, + "IMAGE" + ], + [ + 11, + 12, + 0, + 6, + 3, + "LIST" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + -237.18891056617306, + 276.270472804189 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/t2mv_sdxl_diffusers.json b/comfyui-mvadapter/workflows/t2mv_sdxl_diffusers.json new file mode 100644 index 0000000000000000000000000000000000000000..4ef8c001b3c70fd01d0779af157a69888947d6bb --- /dev/null +++ b/comfyui-mvadapter/workflows/t2mv_sdxl_diffusers.json @@ -0,0 +1,314 @@ +{ + "last_node_id": 7, + "last_link_id": 6, + "nodes": [ + { + "id": 1, + "type": "DiffusersMVPipelineLoader", + "pos": [ + 324.3054504394531, + 130.34339904785156 + ], + "size": [ + 315, + 122 + ], + "flags": 
{}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 1, + 2 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "DiffusersMVPipelineLoader" + }, + "widgets_values": [ + "stabilityai/stable-diffusion-xl-base-1.0", + "MVAdapterT2MVSDXLPipeline" + ] + }, + { + "id": 3, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 320.6045227050781, + 323.5510559082031 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 1 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 3 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDPM", + true, + "interpolated", + 8 + ] + }, + { + "id": 2, + "type": "DiffusersMVVaeLoader", + "pos": [ + 328.6159362792969, + 541.3416748046875 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVVaeLoader" + }, + "widgets_values": [ + "madebyollin/sdxl-vae-fp16-fix" + ] + }, + { + "id": 4, + "type": "DiffusersMVModelMakeup", + "pos": [ + 728.7667846679688, + 261.0943298339844 + ], + "size": [ + 315, + 170 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 2 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 3 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 4 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 5 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_t2mv_sdxl.safetensors", + 6 + ] + }, + { + "id": 6, + "type": "DiffusersMVSampler", + "pos": [ + 1124.235595703125, + 195.65020751953125 + ], + "size": [ + 400, + 314 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 5 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "an astronaut riding a horse", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 7, + 26340599063291, + "randomize" + ] + }, + { + "id": 7, + "type": "PreviewImage", + "pos": [ + 1592.892822265625, + 111.47964477539062 + ], + "size": [ + 391.566162109375, + 532.7274780273438 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 6 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + } + } + ], + "links": [ + [ + 1, + 1, + 0, + 3, + 0, + "PIPELINE" + ], + [ + 2, + 1, + 0, + 4, + 0, + "PIPELINE" + ], + [ + 3, + 3, + 0, + 4, + 1, + "SCHEDULER" + ], + [ + 4, + 2, + 0, + 4, + 2, + "AUTOENCODER" + ], + [ + 5, + 4, + 0, + 6, + 0, + "PIPELINE" + ], + [ + 6, + 6, + 0, + 7, + 0, + "IMAGE" + ] + ], + "groups": [], + 
"config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + 137.93343318382662, + 114.90373842918925 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/t2mv_sdxl_ldm.json b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm.json new file mode 100644 index 0000000000000000000000000000000000000000..ec9b29d3ec32c2d8960442f494fb3729101de271 --- /dev/null +++ b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm.json @@ -0,0 +1,315 @@ +{ + "last_node_id": 10, + "last_link_id": 15, + "nodes": [ + { + "id": 4, + "type": "LdmVaeLoader", + "pos": [ + 247.52098083496094, + 558.488525390625 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 14 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors" + ] + }, + { + "id": 10, + "type": "DiffusersMVModelMakeup", + "pos": [ + 651.51123046875, + 328.2811584472656 + ], + "size": [ + 315, + 170 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 12 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 13 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 14 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 15 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_t2mv_sdxl.safetensors", + 6 + ] + }, + { + "id": 8, + "type": "PreviewImage", + "pos": [ + 1521.929443359375, + 102.87110137939453 + ], + "size": [ + 337.5841064453125, + 545.9476318359375 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 9, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 236.58033752441406, + 346.41693115234375 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 9 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 13 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDIM", + true, + "interpolated", + 8 + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 245.8949432373047, + 130.0254364013672 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 9, + 12 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterT2MVSDXLPipeline" + ] + }, + { + "id": 7, + "type": "DiffusersMVSampler", + "pos": [ + 1050.723388671875, + 218.12826538085938 + ], + "size": [ + 400, + 314 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 15 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": null, + "shape": 7 + } + ], + 
"outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "an astronaut riding a horse", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 7, + 1081631136394980, + "randomize" + ] + } + ], + "links": [ + [ + 7, + 7, + 0, + 8, + 0, + "IMAGE" + ], + [ + 9, + 1, + 0, + 9, + 0, + "PIPELINE" + ], + [ + 12, + 1, + 0, + 10, + 0, + "PIPELINE" + ], + [ + 13, + 9, + 0, + 10, + 1, + "SCHEDULER" + ], + [ + 14, + 4, + 0, + 10, + 2, + "AUTOENCODER" + ], + [ + 15, + 10, + 0, + 7, + 0, + "PIPELINE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + 227.13784724632666, + 76.3208087416891 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_controlnet.json b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_controlnet.json new file mode 100644 index 0000000000000000000000000000000000000000..8606f3f1d2a603c7ca1c8a163cb4cfb78b20b797 --- /dev/null +++ b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_controlnet.json @@ -0,0 +1,719 @@ +{ + "last_node_id": 20, + "last_link_id": 24, + "nodes": [ + { + "id": 4, + "type": "LdmVaeLoader", + "pos": [ + 247.52098083496094, + 558.488525390625 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 14 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 9, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 236.58033752441406, + 346.41693115234375 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 9 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 13 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDIM", + true, + "interpolated", + 8 + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 246.9536590576172, + 153.577880859375 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 9, + 12 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterT2MVSDXLPipeline" + ] + }, + { + "id": 10, + "type": "DiffusersMVModelMakeup", + "pos": [ + 654.0358276367188, + 338.9443054199219 + ], + "size": [ + 315, + 218 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 12 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 13 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 14 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 16 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_t2mv_sdxl.safetensors", + 6, + true, + 
false + ] + }, + { + "id": 11, + "type": "LoadImage", + "pos": [ + 220.8171844482422, + -212.83360290527344 + ], + "size": [ + 210, + 314 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 18 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_0.png", + "image" + ] + }, + { + "id": 14, + "type": "LoadImage", + "pos": [ + 451.238037109375, + -214.33116149902344 + ], + "size": [ + 214.94398498535156, + 314 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 19 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_1.png", + "image" + ] + }, + { + "id": 15, + "type": "LoadImage", + "pos": [ + 684.787841796875, + -216.00900268554688 + ], + "size": [ + 210, + 314 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 20 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_2.png", + "image" + ] + }, + { + "id": 16, + "type": "LoadImage", + "pos": [ + 911.8894653320312, + -214.99267578125 + ], + "size": [ + 210, + 314 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 21 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_3.png", + "image" + ] + }, + { + "id": 17, + "type": "LoadImage", + "pos": [ + 1140.20751953125, + -213.4659423828125 + ], + "size": [ + 210, + 314 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 22 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_4.png", + "image" + ] + }, + { + "id": 18, + "type": "LoadImage", + "pos": [ + 1370.2098388671875, + -214.5530548095703 + ], + "size": [ + 210, + 314 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 23 + ], + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "scribble_5.png", + "image" + ] + }, + { + "id": 20, + "type": "ControlImagePreprocessor", + "pos": [ + 1646.3026123046875, + -156.30767822265625 + ], + "size": [ + 327.5999755859375, + 182 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "front_view", + "type": "IMAGE", + "link": 18 + }, + { + "name": "front_right_view", + "type": "IMAGE", + "link": 19 + }, + { + "name": "right_view", + "type": "IMAGE", + "link": 20 + }, + { + "name": "back_view", + "type": "IMAGE", + "link": 21 + }, + { + "name": "left_view", + "type": "IMAGE", + "link": 22 + }, + { + "name": "front_left_view", + "type": "IMAGE", + "link": 23 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 24 + 
], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ControlImagePreprocessor" + }, + "widgets_values": [ + 768, + 768 + ] + }, + { + "id": 19, + "type": "ControlNetModelLoader", + "pos": [ + 777.6534423828125, + 168.4779052734375 + ], + "size": [ + 315, + 58 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 16 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 17 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ControlNetModelLoader" + }, + "widgets_values": [ + "xinsir/controlnet-scribble-sdxl-1.0" + ] + }, + { + "id": 8, + "type": "PreviewImage", + "pos": [ + 1707.440185546875, + 93.78192138671875 + ], + "size": [ + 337.5841064453125, + 545.9476318359375 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 7, + "type": "DiffusersMVSampler", + "pos": [ + 1211.4144287109375, + 199.29754638671875 + ], + "size": [ + 400, + 358 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 17 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": null, + "shape": 7 + }, + { + "name": "controlnet_image", + "type": "IMAGE", + "link": 24, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "A 3D model of Finn the Human from the animated television series Adventure Time. He is wearing his iconic blue shirt and green backpack and has a neutral expression on his face. He is standing in a relaxed pose with his left foot slightly forward and his right foot back. His arms are at his sides and his head is turned slightly to the right. 
The model is made up of simple shapes and has a stylized, cartoon-like appearance.", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 7, + 153327331713128, + "randomize", + 0.7000000000000001 + ] + } + ], + "links": [ + [ + 7, + 7, + 0, + 8, + 0, + "IMAGE" + ], + [ + 9, + 1, + 0, + 9, + 0, + "PIPELINE" + ], + [ + 12, + 1, + 0, + 10, + 0, + "PIPELINE" + ], + [ + 13, + 9, + 0, + 10, + 1, + "SCHEDULER" + ], + [ + 14, + 4, + 0, + 10, + 2, + "AUTOENCODER" + ], + [ + 16, + 10, + 0, + 19, + 0, + "PIPELINE" + ], + [ + 17, + 19, + 0, + 7, + 0, + "PIPELINE" + ], + [ + 18, + 11, + 0, + 20, + 0, + "IMAGE" + ], + [ + 19, + 14, + 0, + 20, + 1, + "IMAGE" + ], + [ + 20, + 15, + 0, + 20, + 2, + "IMAGE" + ], + [ + 21, + 16, + 0, + 20, + 3, + "IMAGE" + ], + [ + 22, + 17, + 0, + 20, + 4, + "IMAGE" + ], + [ + 23, + 18, + 0, + 20, + 5, + "IMAGE" + ], + [ + 24, + 20, + 0, + 7, + 2, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.8264462809917354, + "offset": [ + -77.40918400367313, + 268.5256134291891 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_lora.json b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_lora.json new file mode 100644 index 0000000000000000000000000000000000000000..17bcdda1ae4f262dbb4663fdf9a2642ce78e7913 --- /dev/null +++ b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_lora.json @@ -0,0 +1,372 @@ +{ + "last_node_id": 11, + "last_link_id": 17, + "nodes": [ + { + "id": 4, + "type": "LdmVaeLoader", + "pos": [ + 247.52098083496094, + 558.488525390625 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": [ + 14 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 9, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 236.58033752441406, + 346.41693115234375 + ], + "size": [ + 327.5999755859375, + 130 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 9 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": [ + 13 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDIM", + true, + "interpolated", + 8 + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 245.8949432373047, + 130.0254364013672 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 9, + 12 + ], + "slot_index": 0 + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterT2MVSDXLPipeline" + ] + }, + { + "id": 10, + "type": "DiffusersMVModelMakeup", + "pos": [ + 659.5178833007812, + 174.95619201660156 + ], + "size": [ + 315, + 214 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 12 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 13 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 14 + }, + { + "name": "lora", + "type": "L", + "link": null, + 
"shape": 7 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 16 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_t2mv_sdxl.safetensors", + 6, + true + ] + }, + { + "id": 7, + "type": "DiffusersMVSampler", + "pos": [ + 1050.723388671875, + 218.12826538085938 + ], + "size": [ + 400, + 314 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 17 + }, + { + "name": "reference_image", + "type": "IMAGE", + "link": null, + "shape": 7 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 7 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "3d style, a fox with flowers around it", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 7, + 66009262624567, + "randomize" + ] + }, + { + "id": 8, + "type": "PreviewImage", + "pos": [ + 1521.929443359375, + 102.87110137939453 + ], + "size": [ + 337.5841064453125, + 545.9476318359375 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 11, + "type": "CustomLoraModelLoader", + "pos": [ + 656.7574462890625, + 490.3138427734375 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 16 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "links": [ + 17 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3d_render_style_xl.safetensors", + 1, + true, + true + ] + } + ], + "links": [ + [ + 7, + 7, + 0, + 8, + 0, + "IMAGE" + ], + [ + 9, + 1, + 0, + 9, + 0, + "PIPELINE" + ], + [ + 12, + 1, + 0, + 10, + 0, + "PIPELINE" + ], + [ + 13, + 9, + 0, + 10, + 1, + "SCHEDULER" + ], + [ + 14, + 4, + 0, + 10, + 2, + "AUTOENCODER" + ], + [ + 16, + 10, + 0, + 11, + 0, + "PIPELINE" + ], + [ + 17, + 11, + 0, + 7, + 0, + "PIPELINE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.803552158862256, + "offset": [ + -210.691560897588, + -27.3938617433714 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_multiple_loras.json b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_multiple_loras.json new file mode 100644 index 0000000000000000000000000000000000000000..98611671235017d9ba0e767ec1901e34b80c84e4 --- /dev/null +++ b/comfyui-mvadapter/workflows/t2mv_sdxl_ldm_multiple_loras.json @@ -0,0 +1,487 @@ +{ + "id": "27703e03-4acd-4b13-88a8-06877ccc3682", + "revision": 0, + "last_node_id": 13, + "last_link_id": 22, + "nodes": [ + { + "id": 4, + "type": "LdmVaeLoader", + "pos": [ + 247.52098083496094, + 558.488525390625 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "slot_index": 0, + "links": [ + 14 + ] + } + ], + "properties": { + "Node name for S&R": "LdmVaeLoader" + }, + "widgets_values": [ + "sdxl_vae.safetensors", + true + ] + }, + { + "id": 9, + "type": "DiffusersMVSchedulerLoader", + "pos": [ + 236.58033752441406, + 346.41693115234375 + ], + "size": [ + 
327.5999755859375, + 130 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 9 + } + ], + "outputs": [ + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSchedulerLoader" + }, + "widgets_values": [ + "DDIM", + true, + "interpolated", + 8 + ] + }, + { + "id": 1, + "type": "LdmPipelineLoader", + "pos": [ + 245.8949432373047, + 130.0254364013672 + ], + "size": [ + 315, + 122 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 9, + 12 + ] + }, + { + "name": "AUTOENCODER", + "type": "AUTOENCODER", + "links": null + }, + { + "name": "SCHEDULER", + "type": "SCHEDULER", + "links": null + } + ], + "properties": { + "Node name for S&R": "LdmPipelineLoader" + }, + "widgets_values": [ + "sd_xl_base_1.0.safetensors", + "MVAdapterT2MVSDXLPipeline" + ] + }, + { + "id": 10, + "type": "DiffusersMVModelMakeup", + "pos": [ + 659.5178833007812, + 174.95619201660156 + ], + "size": [ + 315, + 238 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 12 + }, + { + "name": "scheduler", + "type": "SCHEDULER", + "link": 13 + }, + { + "name": "autoencoder", + "type": "AUTOENCODER", + "link": 14 + }, + { + "name": "lora", + "shape": 7, + "type": "L", + "link": null + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 16 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVModelMakeup" + }, + "widgets_values": [ + true, + "huanngzh/mv-adapter", + "mvadapter_t2mv_sdxl.safetensors", + 6, + true, + false + ] + }, + { + "id": 8, + "type": "PreviewImage", + "pos": [ + 1521.929443359375, + 102.87110137939453 + ], + "size": [ + 337.5841064453125, + 545.9476318359375 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 7 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 7, + "type": "DiffusersMVSampler", + "pos": [ + 1050.723388671875, + 218.12826538085938 + ], + "size": [ + 400, + 394 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 22 + }, + { + "name": "reference_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "controlnet_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "azimuth_degrees", + "shape": 7, + "type": "LIST", + "link": null + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 7 + ] + } + ], + "properties": { + "Node name for S&R": "DiffusersMVSampler" + }, + "widgets_values": [ + 6, + "3d style, a fox with flowers around it", + "watermark, ugly, deformed, noisy, blurry, low contrast", + 768, + 768, + 50, + 7, + 375723783265130, + "randomize", + 1 + ] + }, + { + "id": 13, + "type": "CustomLoraModelLoader", + "pos": [ + 663.2525024414062, + 843.3848876953125 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 21 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 22 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + 
"widgets_values": [ + "1980s_Fantasy_Style_SDXL.safetensors", + 0.30000000000000004, + true, + true + ] + }, + { + "id": 11, + "type": "CustomLoraModelLoader", + "pos": [ + 659.8198852539062, + 467.5633239746094 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 16 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 20 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3d_render_style_xl.safetensors", + 0.30000000000000004, + true, + false + ] + }, + { + "id": 12, + "type": "CustomLoraModelLoader", + "pos": [ + 662.6155395507812, + 655.0852661132812 + ], + "size": [ + 315, + 130 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "pipeline", + "type": "PIPELINE", + "link": 20 + } + ], + "outputs": [ + { + "name": "PIPELINE", + "type": "PIPELINE", + "slot_index": 0, + "links": [ + 21 + ] + } + ], + "properties": { + "Node name for S&R": "CustomLoraModelLoader" + }, + "widgets_values": [ + "3dpolygonStyle.safetensors", + 0.30000000000000004, + true, + false + ] + } + ], + "links": [ + [ + 7, + 7, + 0, + 8, + 0, + "IMAGE" + ], + [ + 9, + 1, + 0, + 9, + 0, + "PIPELINE" + ], + [ + 12, + 1, + 0, + 10, + 0, + "PIPELINE" + ], + [ + 13, + 9, + 0, + 10, + 1, + "SCHEDULER" + ], + [ + 14, + 4, + 0, + 10, + 2, + "AUTOENCODER" + ], + [ + 16, + 10, + 0, + 11, + 0, + "PIPELINE" + ], + [ + 20, + 11, + 0, + 12, + 0, + "PIPELINE" + ], + [ + 21, + 12, + 0, + 13, + 0, + "PIPELINE" + ], + [ + 22, + 13, + 0, + 7, + 0, + "PIPELINE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.803552158862256, + "offset": [ + -128.2888904303056, + -107.83259622641816 + ] + }, + "frontendVersion": "1.22.2" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui-salia/__init__.py b/comfyui-salia/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..adeca0d54c796456b0212065b25550f2aea15887 --- /dev/null +++ b/comfyui-salia/__init__.py @@ -0,0 +1,87 @@ +# custom_nodes/comfyui-salia/__init__.py +# --- Stable node registrar for hyphenated custom_nodes packages --- +import os +import sys +import types +import importlib.util +import traceback + +NODE_CLASS_MAPPINGS = {} +NODE_DISPLAY_NAME_MAPPINGS = {} + +BASE_DIR = os.path.dirname(__file__) +UTILS_DIR = os.path.join(BASE_DIR, "utils") +NODES_DIR = os.path.join(BASE_DIR, "nodes") + +ALIAS_PKG = "comfyui_salia" # hyphen-safe alias +ALIAS_UTILS = f"{ALIAS_PKG}.utils" +ALIAS_NODES = f"{ALIAS_PKG}.nodes" + +def _ensure_ns_package(name: str, path: str): + """Create a namespace-like package module aliased to a filesystem path.""" + if name in sys.modules: + return sys.modules[name] + m = types.ModuleType(name) + m.__file__ = os.path.join(path, "__init__.py") + m.__path__ = [path] + sys.modules[name] = m + return m + +def _load_module_as(mod_name: str, file_path: str): + """Load a module from a file under an explicit module name.""" + try: + spec = importlib.util.spec_from_file_location(mod_name, file_path) + if not spec or not spec.loader: + print(f"[comfyui-salia] Skipping {file_path}: no loader") + return None + mod = importlib.util.module_from_spec(spec) + sys.modules[mod_name] = mod + spec.loader.exec_module(mod) + return mod + except Exception as e: + print(f"[comfyui-salia] Failed to load '{file_path}': {e}") + traceback.print_exc() + return None + +def _gather_nodes(mod): + """Collect 
node mappings from a loaded module.""" + global NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS + cls_map = getattr(mod, "NODE_CLASS_MAPPINGS", {}) + dsp_map = getattr(mod, "NODE_DISPLAY_NAME_MAPPINGS", {}) + if cls_map: + NODE_CLASS_MAPPINGS.update(cls_map) + if dsp_map: + NODE_DISPLAY_NAME_MAPPINGS.update(dsp_map) + +def _load_all(): + # 1) Alias the base package so relative imports '..utils' work inside nodes + _ensure_ns_package(ALIAS_PKG, BASE_DIR) + + # 2) Preload utils as a package and all its modules + if os.path.isdir(UTILS_DIR): + _ensure_ns_package(ALIAS_UTILS, UTILS_DIR) + for fname in os.listdir(UTILS_DIR): + if fname.endswith(".py") and not fname.startswith("_"): + mod_name = os.path.splitext(fname)[0] + mod = _load_module_as(f"{ALIAS_UTILS}.{mod_name}", os.path.join(UTILS_DIR, fname)) + if mod: + # utils typically don't export node mappings; no gather + pass + + # 3) Load every node as comfyui_salia.nodes. + if os.path.isdir(NODES_DIR): + _ensure_ns_package(ALIAS_NODES, NODES_DIR) + for fname in os.listdir(NODES_DIR): + if fname.endswith(".py") and not fname.startswith("_"): + mod_name = os.path.splitext(fname)[0] + mod = _load_module_as(f"{ALIAS_NODES}.{mod_name}", os.path.join(NODES_DIR, fname)) + if mod: + _gather_nodes(mod) + # Log summary + keys = list(getattr(mod, "NODE_CLASS_MAPPINGS", {}).keys()) + pretty = ", ".join(keys) if keys else "no nodes found" + print(f"[comfyui-salia] Loaded node file '{fname}': {pretty}") + else: + print("[comfyui-salia] No 'nodes' folder found.") + +_load_all() diff --git a/comfyui-salia/__pycache__/__init__.cpython-312.pyc b/comfyui-salia/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da9e1840ac7806b01eeaad8c671c4a27ab0b6370 Binary files /dev/null and b/comfyui-salia/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-salia/assets/images/BLACK.png b/comfyui-salia/assets/images/BLACK.png new file mode 100644 index 0000000000000000000000000000000000000000..bd4db7df3db4f1a91b6f6612bda78b0825d56956 Binary files /dev/null and b/comfyui-salia/assets/images/BLACK.png differ diff --git a/comfyui-salia/assets/images/GREY.png b/comfyui-salia/assets/images/GREY.png new file mode 100644 index 0000000000000000000000000000000000000000..101d671f71f9a7a9da41cb808917a8ff52a52329 Binary files /dev/null and b/comfyui-salia/assets/images/GREY.png differ diff --git a/comfyui-salia/assets/images/TRANSPARENT.png b/comfyui-salia/assets/images/TRANSPARENT.png new file mode 100644 index 0000000000000000000000000000000000000000..5adb0d0c06ce2238a27a32369cadb6cf90ca5080 Binary files /dev/null and b/comfyui-salia/assets/images/TRANSPARENT.png differ diff --git a/comfyui-salia/assets/images/boy0.png b/comfyui-salia/assets/images/boy0.png new file mode 100644 index 0000000000000000000000000000000000000000..70ba23d419bbd5cb831e611552aab4ffbeccf6f9 --- /dev/null +++ b/comfyui-salia/assets/images/boy0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c5c86624550b983acb32870e72b39f6d49c005ec6b6706e776e34eb6dbf678 +size 152036 diff --git a/comfyui-salia/assets/images/boy1.png b/comfyui-salia/assets/images/boy1.png new file mode 100644 index 0000000000000000000000000000000000000000..b801bd7c1248e0e79112b53f04f9973b976e1a44 --- /dev/null +++ b/comfyui-salia/assets/images/boy1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6c5c9cd0be8e0e2fa2fcce62f114758f1870f997a5196294fc732d216a93da +size 155339 diff --git a/comfyui-salia/assets/images/boy2.png 
b/comfyui-salia/assets/images/boy2.png new file mode 100644 index 0000000000000000000000000000000000000000..4766f94b6ccafef8fadfc618f0130043b81a16f3 --- /dev/null +++ b/comfyui-salia/assets/images/boy2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56162ccf8859d66bcc105278d9dfd33b42b707cf337253dc2b9ec7c1b92f957c +size 158191 diff --git a/comfyui-salia/assets/images/boy3.png b/comfyui-salia/assets/images/boy3.png new file mode 100644 index 0000000000000000000000000000000000000000..5f895205004f1f9e169248a98d43a9431b3e9b69 --- /dev/null +++ b/comfyui-salia/assets/images/boy3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea28cf1c881ce54a185322868938770e22a96396964b5771eca96c1ff44eeb68 +size 180975 diff --git a/comfyui-salia/assets/images/boy4.png b/comfyui-salia/assets/images/boy4.png new file mode 100644 index 0000000000000000000000000000000000000000..07c55936717ed047ae8a984041f1ede0fd42170c --- /dev/null +++ b/comfyui-salia/assets/images/boy4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:820c4e617ffb015fce14cf1f4f6f70c92b9dee80267a284f8ce10466ee60bcba +size 174994 diff --git a/comfyui-salia/assets/images/boy5.png b/comfyui-salia/assets/images/boy5.png new file mode 100644 index 0000000000000000000000000000000000000000..2e2719214ea90e50e3e643f4da3b2768afb485f4 --- /dev/null +++ b/comfyui-salia/assets/images/boy5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a72f38d8e8afa458f8b10dbefb7271746e8d6f6ce268245b47e378e539ef6ad7 +size 192033 diff --git a/comfyui-salia/assets/images/girl0.png b/comfyui-salia/assets/images/girl0.png new file mode 100644 index 0000000000000000000000000000000000000000..38377463893e0d96777fbb33a9cd56e677aeb2c4 --- /dev/null +++ b/comfyui-salia/assets/images/girl0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3901752822694170495ea208c75931711e69a82fa38c6e50a49633cb0a463cf0 +size 141823 diff --git a/comfyui-salia/assets/images/girl1.png b/comfyui-salia/assets/images/girl1.png new file mode 100644 index 0000000000000000000000000000000000000000..483bec45df3ed91c30139a7f8330d7166502a5fe --- /dev/null +++ b/comfyui-salia/assets/images/girl1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e032fe03536b17d9975ac10964931230d04ab14dd901bd12adf3401417a5792 +size 234857 diff --git a/comfyui-salia/assets/images/girl2.png b/comfyui-salia/assets/images/girl2.png new file mode 100644 index 0000000000000000000000000000000000000000..77d3b37926ffcd784aaf886fd71b5384f5fc65dc --- /dev/null +++ b/comfyui-salia/assets/images/girl2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:666323f9e53739a5678bd164df6a84603c4f23a400f896bdd5602a2a1706ec87 +size 161202 diff --git a/comfyui-salia/assets/images/girl3.png b/comfyui-salia/assets/images/girl3.png new file mode 100644 index 0000000000000000000000000000000000000000..df31c7c1143a3b46c1bcca7d9eae865463acd261 --- /dev/null +++ b/comfyui-salia/assets/images/girl3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edc198e09e13d762579b8424e925aaef33a4485932c4d9c3e729c36eefea148c +size 155607 diff --git a/comfyui-salia/assets/images/girl4.png b/comfyui-salia/assets/images/girl4.png new file mode 100644 index 0000000000000000000000000000000000000000..bcb5252ed2a8fc268cd1135ee9334117ad7a50b8 --- /dev/null +++ b/comfyui-salia/assets/images/girl4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:542d2fed005d73931dacf3eb55a97d591a10d0e1c8d71baa20e2743225f4079c +size 183479 diff --git a/comfyui-salia/assets/images/girl5.png b/comfyui-salia/assets/images/girl5.png new file mode 100644 index 0000000000000000000000000000000000000000..7728c9dd323388ef570f4e63de59bba763470bc4 --- /dev/null +++ b/comfyui-salia/assets/images/girl5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc9a67e27b6d326f18d2bf7db2d9bbab7bf2964d447b007adde914dc963ce77 +size 132360 diff --git a/comfyui-salia/assets/images/hair_L_Bound.png b/comfyui-salia/assets/images/hair_L_Bound.png new file mode 100644 index 0000000000000000000000000000000000000000..66a948b22b528fa1c0bcce9168470baf843419ec --- /dev/null +++ b/comfyui-salia/assets/images/hair_L_Bound.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64f71381a47a1b78bda2306f1c2dcba94fb840b103733ca05bdd744cbbe1ea9d +size 207924 diff --git a/comfyui-salia/assets/images/hair_L_Bound_Braided.png b/comfyui-salia/assets/images/hair_L_Bound_Braided.png new file mode 100644 index 0000000000000000000000000000000000000000..6d8633adf38b51bca39b7b2520387fe13d76636f --- /dev/null +++ b/comfyui-salia/assets/images/hair_L_Bound_Braided.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03cdd206726a3728727b551acd9366e502d51e628acc07eb09ef44ccb98d658d +size 225929 diff --git a/comfyui-salia/assets/images/hair_L_Loose.png b/comfyui-salia/assets/images/hair_L_Loose.png new file mode 100644 index 0000000000000000000000000000000000000000..0011b80d7bdb34de3d2b623d92a97280bbf4084f --- /dev/null +++ b/comfyui-salia/assets/images/hair_L_Loose.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a694cf3990f671bc11a7bda39d51126d1bbba8f8954252e86a90661c6a6b19e2 +size 325029 diff --git a/comfyui-salia/assets/images/hair_M_Bound.png b/comfyui-salia/assets/images/hair_M_Bound.png new file mode 100644 index 0000000000000000000000000000000000000000..39a59b2818c16ec112224b0a20d91e2b3bea15be --- /dev/null +++ b/comfyui-salia/assets/images/hair_M_Bound.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06db92d4c1351051cfa887b574a161e7c78e07f5905710e3068ef33b2f55be79 +size 186899 diff --git a/comfyui-salia/assets/images/hair_M_Bound_Braided.png b/comfyui-salia/assets/images/hair_M_Bound_Braided.png new file mode 100644 index 0000000000000000000000000000000000000000..8f19ac3babf35d3bd34707906e300dd34121c800 --- /dev/null +++ b/comfyui-salia/assets/images/hair_M_Bound_Braided.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d713da8e028da8880c1ad5b21a912438c58214f4c65dc75702bea7c25ecc98e +size 192196 diff --git a/comfyui-salia/assets/images/hair_M_Loose.png b/comfyui-salia/assets/images/hair_M_Loose.png new file mode 100644 index 0000000000000000000000000000000000000000..9a0099c0400214f9d1fedb8c0177db2eb99616b5 --- /dev/null +++ b/comfyui-salia/assets/images/hair_M_Loose.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:441180532c973769af2d502bd2b002958233f2765557d4701f435e427e7ec94a +size 322639 diff --git a/comfyui-salia/assets/images/hair_S_Bound.png b/comfyui-salia/assets/images/hair_S_Bound.png new file mode 100644 index 0000000000000000000000000000000000000000..cb43bdd079f821a0ea2b4ecd73d955d645d7b214 --- /dev/null +++ b/comfyui-salia/assets/images/hair_S_Bound.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67937f727e6feb054074e815d35f0942b83056c77df52c79c67f7773ee1cfb67 +size 168369 
diff --git a/comfyui-salia/assets/images/hair_S_Bound_Braided.png b/comfyui-salia/assets/images/hair_S_Bound_Braided.png new file mode 100644 index 0000000000000000000000000000000000000000..a2ce8a80b75b24ab04ea71c7a7cc64c985bfc19d --- /dev/null +++ b/comfyui-salia/assets/images/hair_S_Bound_Braided.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ff95c4069a44f4309b52a7d2c5ecc215e4387be1d19835a2eb07c5b7520a5d +size 155515 diff --git a/comfyui-salia/assets/images/hair_S_Loose.png b/comfyui-salia/assets/images/hair_S_Loose.png new file mode 100644 index 0000000000000000000000000000000000000000..73bd15c103c8e0ec913c04f483d2c955b09a8521 --- /dev/null +++ b/comfyui-salia/assets/images/hair_S_Loose.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d8d4ca6d7ec99728cb0426520fa9df9248b8d0bc03de1b411cf5bf4efc20e6 +size 181787 diff --git a/comfyui-salia/nodes/__init__.py b/comfyui-salia/nodes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..319dd3ca994bf42216523360bcb236810623d7fb --- /dev/null +++ b/comfyui-salia/nodes/__init__.py @@ -0,0 +1 @@ +# package marker for comfyui-salia.utils (no logic here) diff --git a/comfyui-salia/nodes/__pycache__/__init__.cpython-312.pyc b/comfyui-salia/nodes/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65336414c19dae306197cb7b2c09450b27989f3d Binary files /dev/null and b/comfyui-salia/nodes/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-salia/nodes/__pycache__/boygirl_by_text.cpython-312.pyc b/comfyui-salia/nodes/__pycache__/boygirl_by_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56dbe98b1bcb9f056f5703cdee9d0859bbbd1bf5 Binary files /dev/null and b/comfyui-salia/nodes/__pycache__/boygirl_by_text.cpython-312.pyc differ diff --git a/comfyui-salia/nodes/__pycache__/hair_by_text.cpython-312.pyc b/comfyui-salia/nodes/__pycache__/hair_by_text.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ba49a7cf075e1efda8c5405c806e62f7ecf8325 Binary files /dev/null and b/comfyui-salia/nodes/__pycache__/hair_by_text.cpython-312.pyc differ diff --git a/comfyui-salia/nodes/__pycache__/haircolor_inject.cpython-312.pyc b/comfyui-salia/nodes/__pycache__/haircolor_inject.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a479d61d1073d56a9f0448689b2c4b3d772c9d1f Binary files /dev/null and b/comfyui-salia/nodes/__pycache__/haircolor_inject.cpython-312.pyc differ diff --git a/comfyui-salia/nodes/__pycache__/load_image.cpython-312.pyc b/comfyui-salia/nodes/__pycache__/load_image.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5deb2d1441086f116a2b9f9352e3a83951c70cec Binary files /dev/null and b/comfyui-salia/nodes/__pycache__/load_image.cpython-312.pyc differ diff --git a/comfyui-salia/nodes/boygirl_by_text.py b/comfyui-salia/nodes/boygirl_by_text.py new file mode 100644 index 0000000000000000000000000000000000000000..ef58a66cc1b8f56a5aff0b7c0e39f91f0297896f --- /dev/null +++ b/comfyui-salia/nodes/boygirl_by_text.py @@ -0,0 +1,121 @@ +# custom_nodes/comfyui-salia/nodes/boygirl_by_text.py + +import re +import os +from ..utils.io import load_image_from_assets, file_hash, safe_path + +PLACEHOLDER_PREFIX = "1" + +# EDIT THESE EXACT STRINGS: +CUSTOM_TOKENS = { + ("boy", 0): "preschool shota", + ("boy", 1): "child shota", + ("boy", 2): "preteen shota", + ("boy", 3): "preteen boy", + 
("boy", 4): "teenage boy", + ("boy", 5): "man", + + ("girl", 0): "preschool loli", + ("girl", 1): "child loli", + ("girl", 2): "preteen loli", + ("girl", 3): "preteen girl", + ("girl", 4): "teenage girl", + ("girl", 5): "woman", +} + +_ALLOWED = {f"boy{i}.png" for i in range(6)} | {f"girl{i}.png" for i in range(6)} + +_num2_re = re.compile(r"(\d{2,})") # first 2+ digit number +_num1_re = re.compile(r"(? int: + if n is None: + return 5 + if n <= 7: return 0 + if n <= 9: return 1 + if n <= 11: return 2 + if n <= 14: return 3 + if n <= 19: return 4 + return 5 + +def _parse_gender(text: str) -> str: + return "boy" if _boy_re.search(text or "") else "girl" + +def _parse_number(text: str) -> int | None: + if not text: return None + m = _num2_re.search(text) + if m: + try: return int(m.group(1)) + except: pass + m = _num1_re.search(text) + if m: + try: return int(m.group(1)) + except: pass + return None + +class SaliaLoadBoyGirlByText: + CATEGORY = "image/salia" + + @classmethod + def INPUT_TYPES(cls): + return {"required": {"text": ("STRING", {"default": ""})}} + + RETURN_TYPES = ("IMAGE", "MASK", "STRING", "STRING") + RETURN_NAMES = ("image", "mask", "group_token", "custom_token") + FUNCTION = "run" + + def run(self, text: str): + text = text or "" + gender = _parse_gender(text) # "boy" | "girl" + n = _parse_number(text) # int | None + bucket = _bucket_from_number(n) # 0..5 + + fname = f"{gender}{bucket}.png" + if fname not in _ALLOWED: + raise FileNotFoundError(f"Unexpected filename: {fname}") + path = safe_path(fname) + if not os.path.isfile(path): + raise FileNotFoundError(f"Missing asset in assets/images: {fname}") + + img, msk = load_image_from_assets(fname) + + group_token = f"{PLACEHOLDER_PREFIX}{gender}" # "1boy" or "1girl" + custom_token = CUSTOM_TOKENS.get((gender, bucket), fname) + + return (img, msk, group_token, custom_token) + + @classmethod + def IS_CHANGED(cls, text): + text = text or "" + gender = _parse_gender(text) + n = _parse_number(text) + bucket = _bucket_from_number(n) + fname = f"{gender}{bucket}.png" + return file_hash(fname) + + @classmethod + def VALIDATE_INPUTS(cls, text): + text = text or "" + gender = _parse_gender(text) + n = _parse_number(text) + bucket = _bucket_from_number(n) + fname = f"{gender}{bucket}.png" + try: + path = safe_path(fname) + except Exception as e: + return str(e) + if not os.path.isfile(path): + return f"Required asset not found: {fname} (place it in assets/images/)" + return True + + +# Register BOTH names so old graphs load too +NODE_CLASS_MAPPINGS = { + "SaliaLoadBoyGirlByText": SaliaLoadBoyGirlByText, + "SailaLoadBoyGirlByText": SaliaLoadBoyGirlByText, # alias +} +NODE_DISPLAY_NAME_MAPPINGS = { + "SaliaLoadBoyGirlByText": "Load Boy/Girl Bucket (Salia)", + "SailaLoadBoyGirlByText": "Load Boy/Girl Bucket (Salia – alias)", +} diff --git a/comfyui-salia/nodes/hair_by_text.py b/comfyui-salia/nodes/hair_by_text.py new file mode 100644 index 0000000000000000000000000000000000000000..ef0188bc020b79a23b61aa4c311e4b3d5eaa6701 --- /dev/null +++ b/comfyui-salia/nodes/hair_by_text.py @@ -0,0 +1,90 @@ +# Choose one of 9 hair PNGs by parsing a free-text hint (case-insensitive) + +from ..utils.io import load_image_from_assets, file_hash, safe_path +import os + +HAIR_FILES = { + "hair_L_Bound.png", + "hair_L_Bound_Braided.png", + "hair_L_Loose.png", + "hair_M_Bound.png", + "hair_M_Bound_Braided.png", + "hair_M_Loose.png", + "hair_S_Bound.png", + "hair_S_Bound_Braided.png", + "hair_S_Loose.png", +} + +def _choose_hair_filename(text: str) -> str: + t = 
(text or "").lower() + + # Length + if "short" in t: + length = "S" + elif "long" in t: + length = "L" + else: + length = "M" + + # Style + if "braid" in t: + style = "Bound_Braided" + elif ("ponytail" in t) or ("bun" in t) or ("bound" in t): + style = "Bound" + else: + style = "Loose" + + return f"hair_{length}_{style}.png" + +class SaliaLoadHairByText: + CATEGORY = "image/salia" + + @classmethod + def INPUT_TYPES(cls): + return {"required": {"text": ("STRING", {"default": ""})}} + + RETURN_TYPES = ("IMAGE", "MASK") + RETURN_NAMES = ("image", "mask") + FUNCTION = "run" + + def run(self, text): + fname = _choose_hair_filename(text) + if fname not in HAIR_FILES: + raise FileNotFoundError(f"Parsed filename invalid: {fname}") + + # ensure file exists + path = safe_path(fname) + if not os.path.isfile(path): + raise FileNotFoundError( + f"Expected file not found: {fname} in assets/images.\n" + f"Hint parsed from text='{text}'" + ) + + img, msk = load_image_from_assets(fname) + return (img, msk) + + @classmethod + def IS_CHANGED(cls, text): + fname = _choose_hair_filename(text) + return file_hash(fname) + + @classmethod + def VALIDATE_INPUTS(cls, text): + try: + fname = _choose_hair_filename(text) + if fname not in HAIR_FILES: + return f"Parsed to unexpected name: {fname}" + path = safe_path(fname) + except Exception as e: + return str(e) + if not os.path.isfile(path): + return f"File not found for parsed hint → {fname}" + return True + + +NODE_CLASS_MAPPINGS = { + "SaliaLoadHairByText": SaliaLoadHairByText, +} +NODE_DISPLAY_NAME_MAPPINGS = { + "SaliaLoadHairByText": "Load Hair (Salia by Text)", +} diff --git a/comfyui-salia/nodes/haircolor_inject.py b/comfyui-salia/nodes/haircolor_inject.py new file mode 100644 index 0000000000000000000000000000000000000000..28dd0a4dbd56faab56c1b2802261f674d0c869b7 --- /dev/null +++ b/comfyui-salia/nodes/haircolor_inject.py @@ -0,0 +1,328 @@ +# SaliaInjectHairColor – expands {{HAIRCOLOR}}, {{BRAIDS}}, {{HAIRLENGTH}}, {{HAIRSTYLE}} +# control: free text ("light blonde braided bun pixie curly ...") +# prompt : string with placeholders to replace + +import re +import hashlib + +def _word_regex(s: str) -> re.Pattern: + """ + Build a loose 'whole-word-ish' pattern that tolerates spaces/hyphens. + E.g. 
'inverted bob' -> r'\binverted[\s\-]+bob\b' + """ + s = s.strip() + if not s: + return re.compile(r"$a") # never matches + parts = re.split(r"[\s\-]+", s) + pat = r"\b" + r"[\s\-]+".join(map(re.escape, parts)) + r"\b" + return re.compile(pat, re.IGNORECASE) + +class SaliaInjectHairColor: + CATEGORY = "text/salia" + + # ---------- HAIRCOLOR ---------- + _COLORS = [ + "black", "brown", "blonde", "red", + "blue", "green", "pink", "purple", + "white", "gray", "silver", + "orange", "yellow", "cyan", "teal", + "magenta", "violet", "auburn", + ] + _ALIASES = { + "blond": "blonde", + "grey": "gray", + } + _COLOR_PATTERNS = [ + (c, re.compile(rf"\b{re.escape(c)}\b", re.IGNORECASE)) for c in _COLORS + ] + [ + (src, re.compile(rf"\b{re.escape(src)}\b", re.IGNORECASE)) for src in _ALIASES.keys() + ] + _LIGHT_RE = re.compile(r"\blight\b", re.IGNORECASE) + _DARK_RE = re.compile(r"\bdark\b", re.IGNORECASE) + + # ---------- HAIRSTYLE keyword maps ---------- + # Base haircuts + _CUTS = { + "bob": "bob cut", + "inverted bob": "inverted bob", + "bowl": "bowl cut", + "buzz": "buzz cut", + "pixie": "pixie cut", + "undercut": "undercut", + "flipped hair": "flipped hair", + "hime": "hime cut", + "hime cut": "hime cut", + } + + # Tied + _TIED = { + "bow-shaped": "bow-shaped", + "flower-shaped": "flower-shaped", + "updo": "hair updo", + "one side up": "one side up", + "two side up": "two side up", + "low-tide long": "low-tide long", + "multi-tied": "multi-tied", + "twintails": "twintails", + "low twintails": "low twintails", + "short twintails": "short twintails", + "twisted": "twisted", + } + + # Braids (also drives {{BRAIDS}}) + _BRAIDS = { + "front braid": "front braid", + "side braid": "side braid", + "french braid": "french braid", + "single braid": "single braid", + "twin braids": "twin braids", + "half up braid": "half up braid", + "low-braided long": "low-braided long", + "cornrows": "cornrows", + "dreadlocks": "dreadlocks", + # generic 'braid' is handled separately for the {{BRAIDS}} placeholder + } + + # Buns + _BUNS = { + "braided bun": "braided bun", + "single hair bun": "single hair bun", + "double bun": "double bun", + "cone hair bun": "cone hair bun", + "doughnut hair bun": "doughnut hair bun", + } + + # Rings + _RINGS = { + "hair rings": "hair rings", + "single hair ring": "single hair ring", + "double hair ring": "double hair ring", + } + + # Ponytails + _PONY = { + "ponytails": "ponytails", + "folded ponytail": "folded ponytail", + "front ponytail": "front ponytail", + "high ponytail": "high ponytail", + "short ponytail": "short ponytail", # accepts 'short pontail' via fuzzy pattern below if needed + "side ponytail": "side ponytail", + "topknot": "topknot", + } + + # Tall hair + _TALL = { + "afro": "afro", + "beehive hairdo": "beehive hairdo", + "crested": "crested", + "pompadour": "pompadour", + } + + # Texture + _TEXTURE = { + "wavy": "wavy", + "straight": "straight", + "spiked": "spiked", + "ringlets": "ringlets", + "pointy": "pointy", + "messy": "messy", + "hair flaps": "hair flaps", + "twin drills": "twin drills", + "drill": "drill", # 'drill hair' -> we accept just 'drill' + "curly": "curly", + } + + # Length/Volume descriptors + _LENGTH_VOL = { + "bald": "bald head", + "very short hair": "very short hair", + "short hair": "short hair", + "medium hair": "medium hair", + "long hair": "long hair", + "very long hair": "very long hair", + "absurdly long hair": "absurdly long hair", + "big hair": "big hair", + } + + # Precompile patterns for all keyword maps (once) + _MAP_PATTERNS = None # 
initialized on first use + + @classmethod + def _build_map_patterns(cls): + if cls._MAP_PATTERNS is not None: + return cls._MAP_PATTERNS + + def compile_map(d): + return [(kw, _word_regex(kw), out) for kw, out in d.items()] + + cls._MAP_PATTERNS = { + "cuts": compile_map(cls._CUTS), + "tied": compile_map(cls._TIED), + "braids": compile_map(cls._BRAIDS), + "buns": compile_map(cls._BUNS), + "rings": compile_map(cls._RINGS), + "pony": compile_map(cls._PONY), + "tall": compile_map(cls._TALL), + "texture": compile_map(cls._TEXTURE), + "lenvol": compile_map(cls._LENGTH_VOL), + } + return cls._MAP_PATTERNS + + @classmethod + def INPUT_TYPES(cls): + return {"required": { + "control": ("STRING", {"default": ""}), + "prompt": ("STRING", {"default": ""}), + }} + + RETURN_TYPES = ("STRING", "STRING") + RETURN_NAMES = ("replaced", "haircolor") + FUNCTION = "run" + + # ---------- helpers ---------- + @classmethod + def _normalize_color(cls, word: str) -> str: + w = word.lower() + return cls._ALIASES.get(w, w) + + @classmethod + def _find_best_color(cls, control: str): + if not control: + return None, None + best = (None, None) # (color, idx) + text = control + for key, pat in cls._COLOR_PATTERNS: + m = pat.search(text) + if not m: + continue + idx = m.start() + color = cls._normalize_color(key) + if best[1] is None or idx < best[1]: + best = (color, idx) + return best + + @classmethod + def _find_darkness_near_color(cls, control: str, color_idx): + if not control: + return "" + m_light = cls._LIGHT_RE.search(control) + m_dark = cls._DARK_RE.search(control) + if not m_light and not m_dark: + return "" + if m_light and not m_dark: + return "light-" + if m_dark and not m_light: + return "dark-" + if color_idx is not None: + d_light = abs(m_light.start() - color_idx) + d_dark = abs(m_dark.start() - color_idx) + return "light-" if d_light <= d_dark else "dark-" + return "light-" if m_light.start() <= m_dark.start() else "dark-" + + @classmethod + def _contains(cls, text: str, word: str) -> bool: + return bool(_word_regex(word).search(text)) + + @classmethod + def _collect_hairstyle_parts(cls, control: str): + """ + Scan control and collect matched phrases across categories. 
+ Return (parts:list[str], matched_any:bool, found_bald:bool, found_texture:bool) + """ + maps = cls._build_map_patterns() + text = control or "" + matches = [] # (start_idx, phrase, category) + found_bald = False + found_texture = False + + for cat, items in maps.items(): + for kw, pat, out in items: + m = pat.search(text) + if not m: + # special fuzzy accept: "short ponytail" even if user writes 'pontail' + if cat == "pony" and kw == "short ponytail": + if re.search(r"\bshort\s+pon?yt?ail\b", text, re.IGNORECASE): + # fake a position to keep relative order + matches.append((0, out, cat)) + continue + if cat == "lenvol" and out == "bald head": + found_bald = True + if cat == "texture": + found_texture = True + matches.append((m.start(), out, cat)) + + # sort by appearance order; dedupe while preserving first occurrence of each phrase + matches.sort(key=lambda x: x[0]) + seen = set() + parts = [] + for _, phrase, _cat in matches: + if phrase in seen: + continue + seen.add(phrase) + parts.append(phrase) + + matched_any = len(parts) > 0 + return parts, matched_any, found_bald, found_texture + + # ---------- node exec ---------- + def run(self, control: str, prompt: str): + control = control or "" + prompt = prompt or "" + + # 1) HAIRCOLOR + color, idx = self._find_best_color(control) + haircolor = f"{self._find_darkness_near_color(control, idx)}{color}" if color else "" + + # 2) BRAIDS placeholder + braids_placeholder = "braided " if re.search(r"\bbraid", control, re.IGNORECASE) else "" + + # 3) HAIRLENGTH placeholder + if re.search(r"\bshort\b", control, re.IGNORECASE): + hairlength_placeholder = "short" + elif re.search(r"\blong\b", control, re.IGNORECASE): + hairlength_placeholder = "long" + else: + hairlength_placeholder = "" + + # 4) Build HAIRSTYLE phrase from categories + parts, matched_any, found_bald, found_texture = self._collect_hairstyle_parts(control) + + # If nothing matched at all: + if not matched_any: + if re.search(r"\bbald\b", control, re.IGNORECASE): + hairstyle_phrase = "bald head" # explicit bald requested + else: + hairstyle_phrase = "short hair" # default fallback + else: + # If we matched only texture (e.g., "curly") and nothing contains 'hair', + # append 'hair' to make it natural: "curly hair". 
+ contains_hair_word = any(" hair" in p for p in parts) or any(p.endswith("hair") for p in parts) + if found_texture and not contains_hair_word: + # add 'hair' to the last part (usually the texture term) + parts[-1] = parts[-1] + " hair" + hairstyle_phrase = " ".join(parts).strip() + + # 5) Replace placeholders + out = prompt + out = out.replace("{{HAIRCOLOR}}", haircolor) + out = out.replace("{{BRAIDS}}", braids_placeholder) + out = out.replace("{{HAIRLENGTH}}", hairlength_placeholder) + out = out.replace("{{HAIRSTYLE}}", hairstyle_phrase) + + return (out, haircolor) + + # cache key + @classmethod + def IS_CHANGED(cls, control, prompt): + h = hashlib.sha256() + h.update((control or "").encode("utf-8")) + h.update(b"\x00") + h.update((prompt or "").encode("utf-8")) + return h.hexdigest() + +NODE_CLASS_MAPPINGS = { + "SaliaInjectHairColor": SaliaInjectHairColor, +} +NODE_DISPLAY_NAME_MAPPINGS = { + "SaliaInjectHairColor": "Inject Hair Color (Salia)", +} diff --git a/comfyui-salia/nodes/load_image.py b/comfyui-salia/nodes/load_image.py new file mode 100644 index 0000000000000000000000000000000000000000..0773c6fe176f7fff09dfb69124cfe56d34a9f8ea --- /dev/null +++ b/comfyui-salia/nodes/load_image.py @@ -0,0 +1,46 @@ +# Load any shipped PNG from assets/images via dropdown + +from ..utils.io import list_pngs, load_image_from_assets, file_hash, safe_path +import os + +class SaliaLoadImage: + CATEGORY = "image/salia" + + @classmethod + def INPUT_TYPES(cls): + choices = list_pngs() or [""] + return {"required": {"image": (choices, {})}} + + RETURN_TYPES = ("IMAGE", "MASK") + RETURN_NAMES = ("image", "mask") + FUNCTION = "run" + + def run(self, image): + if image == "": + raise FileNotFoundError("No PNGs in assets/images") + img, msk = load_image_from_assets(image) + return (img, msk) + + @classmethod + def IS_CHANGED(cls, image): + if image == "": + return image + return file_hash(image) + + @classmethod + def VALIDATE_INPUTS(cls, image): + try: + path = safe_path(image) + except Exception as e: + return str(e) + if not os.path.isfile(path): + return f"File not found in assets/images: {image}" + return True + + +NODE_CLASS_MAPPINGS = { + "SaliaLoadImage": SaliaLoadImage, +} +NODE_DISPLAY_NAME_MAPPINGS = { + "SaliaLoadImage": "Load Image (Salia Assets)", +} diff --git a/comfyui-salia/utils/__init__.py b/comfyui-salia/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..319dd3ca994bf42216523360bcb236810623d7fb --- /dev/null +++ b/comfyui-salia/utils/__init__.py @@ -0,0 +1 @@ +# package marker for comfyui-salia.utils (no logic here) diff --git a/comfyui-salia/utils/__pycache__/__init__.cpython-312.pyc b/comfyui-salia/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..612bb1f7ec2f4a25f93533ef12e4ddb748248a20 Binary files /dev/null and b/comfyui-salia/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/comfyui-salia/utils/__pycache__/io.cpython-312.pyc b/comfyui-salia/utils/__pycache__/io.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77b0483a58c94db67d238ef9a5eb68f92ecb883c Binary files /dev/null and b/comfyui-salia/utils/__pycache__/io.cpython-312.pyc differ diff --git a/comfyui-salia/utils/io.py b/comfyui-salia/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..2ede7e68f5b2cdcd92c669c78023ae698b3dd5a5 --- /dev/null +++ b/comfyui-salia/utils/io.py @@ -0,0 +1,80 @@ +import os +import hashlib +import numpy as np +import torch +from 
PIL import Image, ImageOps, ImageSequence
+import node_helpers  # ComfyUI helper utilities
+
+EXT_DIR = os.path.dirname(os.path.dirname(__file__))
+ASSETS_DIR = os.path.abspath(os.path.join(EXT_DIR, "assets", "images"))
+os.makedirs(ASSETS_DIR, exist_ok=True)
+
+def list_pngs():
+    """Recursively list all .png under assets/images as relative POSIX paths."""
+    out = []
+    for root, _, files in os.walk(ASSETS_DIR):
+        for f in files:
+            if f.lower().endswith(".png"):
+                rel = os.path.relpath(os.path.join(root, f), ASSETS_DIR)
+                out.append(rel.replace("\\", "/"))
+    out.sort()
+    return out
+
+def safe_path(rel: str) -> str:
+    """Prevent path escape outside assets/images."""
+    p = os.path.abspath(os.path.join(ASSETS_DIR, rel))
+    if not (p == ASSETS_DIR or p.startswith(ASSETS_DIR + os.sep)):
+        raise ValueError("Invalid path outside assets/images")
+    return p
+
+def pil_to_tensors(pil: Image.Image):
+    """Convert PIL (possibly multi-frame) -> (IMAGE, MASK) tensors in Comfy format."""
+    images, masks = [], []
+    w = h = None
+    for frame in ImageSequence.Iterator(pil):
+        frame = node_helpers.pillow(ImageOps.exif_transpose, frame)
+
+        if frame.mode == "I":
+            frame = frame.point(lambda x: x * (1 / 255))
+
+        rgb = frame.convert("RGB")
+        if w is None:
+            w, h = rgb.size
+        if rgb.size != (w, h):
+            continue
+
+        arr = np.array(rgb).astype(np.float32) / 255.0  # (H,W,3)
+        img_t = torch.from_numpy(arr)[None, ...]  # (1,H,W,3)
+
+        if "A" in frame.getbands():
+            a = np.array(frame.getchannel("A")).astype(np.float32) / 255.0
+            mask = 1.0 - torch.from_numpy(a)  # (H,W)
+        else:
+            mask = torch.zeros((h, w), dtype=torch.float32)
+
+        images.append(img_t)
+        masks.append(mask.unsqueeze(0))  # (1,H,W)
+
+    if len(images) > 1:
+        image_out = torch.cat(images, dim=0)
+        mask_out = torch.cat(masks, dim=0)
+    else:
+        image_out = images[0]
+        mask_out = masks[0]
+    return image_out, mask_out
+
+def load_image_from_assets(rel: str):
+    """Open a PNG from assets/images and return (IMAGE, MASK)."""
+    path = safe_path(rel)
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"Not found in assets/images: {rel}")
+    pil = node_helpers.pillow(Image.open, path)
+    return pil_to_tensors(pil)
+
+def file_hash(rel: str) -> str:
+    """Stable hash of a file (for IS_CHANGED)."""
+    path = safe_path(rel)
+    h = hashlib.sha256()
+    with open(path, "rb") as f:
+        h.update(f.read())
+    return h.hexdigest()
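
A rough, illustrative sketch (not part of the diff above) of how the hair-selection and placeholder-injection nodes are expected to behave when called directly. It assumes a ComfyUI session where the salia package is already loaded; the import lines are placeholders, since the node modules use package-relative imports and the folder name contains a hyphen, and the "portrait" prompt text is an invented example.

# Hypothetical standalone harness; inside ComfyUI these classes are registered automatically.
from hair_by_text import SaliaLoadHairByText        # placeholder import path
from haircolor_inject import SaliaInjectHairColor   # placeholder import path

# Text -> asset selection: "long" selects length L and "braid" selects the Bound_Braided
# style, so this should resolve to assets/images/hair_L_Bound_Braided.png.
image, mask = SaliaLoadHairByText().run("long braided hair")

# Placeholder expansion: "light" + "blonde" -> "light-blonde"; "curly" and "short hair"
# are collected into the {{HAIRSTYLE}} phrase; no braid keyword, so {{BRAIDS}} stays empty.
replaced, haircolor = SaliaInjectHairColor().run(
    control="light blonde curly short hair",
    prompt="portrait, {{HAIRCOLOR}} {{BRAIDS}}{{HAIRSTYLE}}",
)
# haircolor == "light-blonde"
# replaced  == "portrait, light-blonde curly short hair"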