saliacoel committed
Commit f93d68a · verified · 1 Parent(s): 5f4a806

Upload 111 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +29 -0
  2. comfyui-mvadapter/.github/workflows/publish.yml +25 -0
  3. comfyui-mvadapter/BACKUP_nodes.py +843 -0
  4. comfyui-mvadapter/LICENSE +201 -0
  5. comfyui-mvadapter/README.md +88 -0
  6. comfyui-mvadapter/__init__.py +45 -0
  7. comfyui-mvadapter/__pycache__/__init__.cpython-312.pyc +0 -0
  8. comfyui-mvadapter/__pycache__/nodes.cpython-312.pyc +0 -0
  9. comfyui-mvadapter/__pycache__/nodes_local_mv.cpython-312.pyc +0 -0
  10. comfyui-mvadapter/__pycache__/utils.cpython-312.pyc +0 -0
  11. comfyui-mvadapter/assets/CustomLoraModelLoader.png +0 -0
  12. comfyui-mvadapter/assets/comfyui_i2mv.png +3 -0
  13. comfyui-mvadapter/assets/comfyui_i2mv_lora.png +3 -0
  14. comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg +3 -0
  15. comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png +3 -0
  16. comfyui-mvadapter/assets/comfyui_ldm_vae.png +0 -0
  17. comfyui-mvadapter/assets/comfyui_model_makeup.png +0 -0
  18. comfyui-mvadapter/assets/comfyui_t2mv.png +3 -0
  19. comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png +3 -0
  20. comfyui-mvadapter/assets/comfyui_t2mv_lora.png +3 -0
  21. comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg +3 -0
  22. comfyui-mvadapter/assets/demo/scribbles/scribble_0.png +0 -0
  23. comfyui-mvadapter/assets/demo/scribbles/scribble_1.png +0 -0
  24. comfyui-mvadapter/assets/demo/scribbles/scribble_2.png +0 -0
  25. comfyui-mvadapter/assets/demo/scribbles/scribble_3.png +0 -0
  26. comfyui-mvadapter/assets/demo/scribbles/scribble_4.png +0 -0
  27. comfyui-mvadapter/assets/demo/scribbles/scribble_5.png +0 -0
  28. comfyui-mvadapter/cache/stable-diffusion-v1-inference.yaml +70 -0
  29. comfyui-mvadapter/mvadapter/__init__.py +0 -0
  30. comfyui-mvadapter/mvadapter/__pycache__/__init__.cpython-312.pyc +0 -0
  31. comfyui-mvadapter/mvadapter/loaders/__init__.py +1 -0
  32. comfyui-mvadapter/mvadapter/loaders/__pycache__/__init__.cpython-312.pyc +0 -0
  33. comfyui-mvadapter/mvadapter/loaders/__pycache__/custom_adapter.cpython-312.pyc +0 -0
  34. comfyui-mvadapter/mvadapter/loaders/custom_adapter.py +98 -0
  35. comfyui-mvadapter/mvadapter/models/__init__.py +0 -0
  36. comfyui-mvadapter/mvadapter/models/__pycache__/__init__.cpython-312.pyc +0 -0
  37. comfyui-mvadapter/mvadapter/models/__pycache__/attention_processor.cpython-312.pyc +0 -0
  38. comfyui-mvadapter/mvadapter/models/attention_processor.py +377 -0
  39. comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sd.cpython-312.pyc +0 -0
  40. comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sdxl.cpython-312.pyc +0 -0
  41. comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sd.cpython-312.pyc +0 -0
  42. comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sdxl.cpython-312.pyc +0 -0
  43. comfyui-mvadapter/mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py +903 -0
  44. comfyui-mvadapter/mvadapter/schedulers/ShiftSNRSchedulerKarras.py +120 -0
  45. comfyui-mvadapter/mvadapter/schedulers/__pycache__/ShiftSNRSchedulerKarras.cpython-312.pyc +0 -0
  46. comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduler_utils.cpython-312.pyc +0 -0
  47. comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduling_shift_snr.cpython-312.pyc +0 -0
  48. comfyui-mvadapter/mvadapter/schedulers/scheduler_utils.py +70 -0
  49. comfyui-mvadapter/mvadapter/schedulers/scheduling_shift_snr.py +140 -0
  50. comfyui-mvadapter/mvadapter/utils/__init__.py +3 -0
.gitattributes CHANGED
@@ -33,3 +33,32 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ comfyui-mvadapter/assets/comfyui_i2mv_lora.png filter=lfs diff=lfs merge=lfs -text
37
+ comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg filter=lfs diff=lfs merge=lfs -text
38
+ comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png filter=lfs diff=lfs merge=lfs -text
39
+ comfyui-mvadapter/assets/comfyui_i2mv.png filter=lfs diff=lfs merge=lfs -text
40
+ comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png filter=lfs diff=lfs merge=lfs -text
41
+ comfyui-mvadapter/assets/comfyui_t2mv_lora.png filter=lfs diff=lfs merge=lfs -text
42
+ comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg filter=lfs diff=lfs merge=lfs -text
43
+ comfyui-mvadapter/assets/comfyui_t2mv.png filter=lfs diff=lfs merge=lfs -text
44
+ comfyui-salia/assets/images/boy0.png filter=lfs diff=lfs merge=lfs -text
45
+ comfyui-salia/assets/images/boy1.png filter=lfs diff=lfs merge=lfs -text
46
+ comfyui-salia/assets/images/boy2.png filter=lfs diff=lfs merge=lfs -text
47
+ comfyui-salia/assets/images/boy3.png filter=lfs diff=lfs merge=lfs -text
48
+ comfyui-salia/assets/images/boy4.png filter=lfs diff=lfs merge=lfs -text
49
+ comfyui-salia/assets/images/boy5.png filter=lfs diff=lfs merge=lfs -text
50
+ comfyui-salia/assets/images/girl0.png filter=lfs diff=lfs merge=lfs -text
51
+ comfyui-salia/assets/images/girl1.png filter=lfs diff=lfs merge=lfs -text
52
+ comfyui-salia/assets/images/girl2.png filter=lfs diff=lfs merge=lfs -text
53
+ comfyui-salia/assets/images/girl3.png filter=lfs diff=lfs merge=lfs -text
54
+ comfyui-salia/assets/images/girl4.png filter=lfs diff=lfs merge=lfs -text
55
+ comfyui-salia/assets/images/girl5.png filter=lfs diff=lfs merge=lfs -text
56
+ comfyui-salia/assets/images/hair_L_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text
57
+ comfyui-salia/assets/images/hair_L_Bound.png filter=lfs diff=lfs merge=lfs -text
58
+ comfyui-salia/assets/images/hair_L_Loose.png filter=lfs diff=lfs merge=lfs -text
59
+ comfyui-salia/assets/images/hair_M_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text
60
+ comfyui-salia/assets/images/hair_M_Bound.png filter=lfs diff=lfs merge=lfs -text
61
+ comfyui-salia/assets/images/hair_M_Loose.png filter=lfs diff=lfs merge=lfs -text
62
+ comfyui-salia/assets/images/hair_S_Bound_Braided.png filter=lfs diff=lfs merge=lfs -text
63
+ comfyui-salia/assets/images/hair_S_Bound.png filter=lfs diff=lfs merge=lfs -text
64
+ comfyui-salia/assets/images/hair_S_Loose.png filter=lfs diff=lfs merge=lfs -text
comfyui-mvadapter/.github/workflows/publish.yml ADDED
@@ -0,0 +1,25 @@
1
+ name: Publish to Comfy registry
2
+ on:
3
+ workflow_dispatch:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "pyproject.toml"
9
+
10
+ permissions:
11
+ issues: write
12
+
13
+ jobs:
14
+ publish-node:
15
+ name: Publish Custom Node to registry
16
+ runs-on: ubuntu-latest
17
+ if: ${{ github.repository_owner == 'huanngzh' }}
18
+ steps:
19
+ - name: Check out code
20
+ uses: actions/checkout@v4
21
+ - name: Publish Custom Node
22
+ uses: Comfy-Org/publish-node-action@v1
23
+ with:
24
+ ## Add your own personal access token to your Github Repository secrets and reference it here.
25
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
comfyui-mvadapter/BACKUP_nodes.py ADDED
@@ -0,0 +1,843 @@
1
+ # Adapted from https://github.com/Limitex/ComfyUI-Diffusers/blob/main/nodes.py
2
+ import copy
3
+ import os
4
+ import torch
5
+ from safetensors.torch import load_file
6
+ from torchvision import transforms
7
+ from .utils import (
8
+ SCHEDULERS,
9
+ PIPELINES,
10
+ MVADAPTERS,
11
+ vae_pt_to_vae_diffuser,
12
+ convert_images_to_tensors,
13
+ convert_tensors_to_images,
14
+ prepare_camera_embed,
15
+ preprocess_image,
16
+ )
17
+ from comfy.model_management import get_torch_device
18
+ import folder_paths
19
+
20
+ from diffusers import StableDiffusionXLPipeline, AutoencoderKL, ControlNetModel
21
+ from transformers import AutoModelForImageSegmentation # <-- restored
22
+
23
+ # ADDED: import DPMSolverMultistepScheduler for DPM++ Karras
24
+ from diffusers import DPMSolverMultistepScheduler
25
+
26
+ from .mvadapter.pipelines.pipeline_mvadapter_t2mv_sdxl import MVAdapterT2MVSDXLPipeline
27
+ from .mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler
28
+
29
+ # ADDED: import your new Karras-enabled shift scheduler (file sits next to scheduling_shift_snr.py)
30
+ from .mvadapter.schedulers.ShiftSNRSchedulerKarras import ShiftSNRSchedulerKarras
31
+
32
+
33
+
34
+ class DiffusersMVPipelineLoader:
35
+ def __init__(self):
36
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
37
+ self.dtype = torch.float16
38
+
39
+ @classmethod
40
+ def INPUT_TYPES(s):
41
+ return {
42
+ "required": {
43
+ "ckpt_name": (
44
+ "STRING",
45
+ {"default": "stabilityai/stable-diffusion-xl-base-1.0"},
46
+ ),
47
+ "pipeline_name": (
48
+ list(PIPELINES.keys()),
49
+ {"default": "MVAdapterT2MVSDXLPipeline"},
50
+ ),
51
+ }
52
+ }
53
+
54
+ RETURN_TYPES = (
55
+ "PIPELINE",
56
+ "AUTOENCODER",
57
+ "SCHEDULER",
58
+ )
59
+
60
+ FUNCTION = "create_pipeline"
61
+
62
+ CATEGORY = "MV-Adapter"
63
+
64
+ def create_pipeline(self, ckpt_name, pipeline_name):
65
+ pipeline_class = PIPELINES[pipeline_name]
66
+ pipe = pipeline_class.from_pretrained(
67
+ pretrained_model_name_or_path=ckpt_name,
68
+ torch_dtype=self.dtype,
69
+ cache_dir=self.hf_dir,
70
+ )
71
+ return (pipe, pipe.vae, pipe.scheduler)
72
+
73
+
74
+ class LdmPipelineLoader:
75
+ def __init__(self):
76
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
77
+ self.dtype = torch.float16
78
+
79
+ @classmethod
80
+ def INPUT_TYPES(s):
81
+ return {
82
+ "required": {
83
+ "ckpt_name": (folder_paths.get_filename_list("checkpoints"),),
84
+ "pipeline_name": (
85
+ list(PIPELINES.keys()),
86
+ {"default": "MVAdapterT2MVSDXLPipeline"},
87
+ ),
88
+ }
89
+ }
90
+
91
+ RETURN_TYPES = (
92
+ "PIPELINE",
93
+ "AUTOENCODER",
94
+ "SCHEDULER",
95
+ )
96
+
97
+ FUNCTION = "create_pipeline"
98
+
99
+ CATEGORY = "MV-Adapter"
100
+
101
+ def create_pipeline(self, ckpt_name, pipeline_name):
102
+ pipeline_class = PIPELINES[pipeline_name]
103
+
104
+ pipe = pipeline_class.from_single_file(
105
+ pretrained_model_link_or_path=folder_paths.get_full_path(
106
+ "checkpoints", ckpt_name
107
+ ),
108
+ torch_dtype=self.dtype,
109
+ cache_dir=self.hf_dir,
110
+ )
111
+
112
+ return (pipe, pipe.vae, pipe.scheduler)
113
+
114
+
115
+ class DiffusersMVVaeLoader:
116
+ def __init__(self):
117
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
118
+ self.dtype = torch.float16
119
+
120
+ @classmethod
121
+ def INPUT_TYPES(s):
122
+ return {
123
+ "required": {
124
+ "vae_name": (
125
+ "STRING",
126
+ {"default": "madebyollin/sdxl-vae-fp16-fix"},
127
+ ),
128
+ }
129
+ }
130
+
131
+ RETURN_TYPES = ("AUTOENCODER",)
132
+
133
+ FUNCTION = "create_pipeline"
134
+
135
+ CATEGORY = "MV-Adapter"
136
+
137
+ def create_pipeline(self, vae_name):
138
+ vae = AutoencoderKL.from_pretrained(
139
+ pretrained_model_name_or_path=vae_name,
140
+ torch_dtype=self.dtype,
141
+ cache_dir=self.hf_dir,
142
+ )
143
+
144
+ return (vae,)
145
+
146
+
147
+ class LdmVaeLoader:
148
+ def __init__(self):
149
+ self.dtype = torch.float16
150
+
151
+ @classmethod
152
+ def INPUT_TYPES(s):
153
+ return {
154
+ "required": {
155
+ "vae_name": (folder_paths.get_filename_list("vae"),),
156
+ "upcast_fp32": ("BOOLEAN", {"default": True}),
157
+ },
158
+ }
159
+
160
+ RETURN_TYPES = ("AUTOENCODER",)
161
+
162
+ FUNCTION = "create_pipeline"
163
+
164
+ CATEGORY = "MV-Adapter"
165
+
166
+ def create_pipeline(self, vae_name, upcast_fp32):
167
+ vae = vae_pt_to_vae_diffuser(
168
+ folder_paths.get_full_path("vae", vae_name), force_upcast=upcast_fp32
169
+ ).to(self.dtype)
170
+
171
+ return (vae,)
172
+
173
+
174
+ class DiffusersMVSchedulerLoader:
175
+ def __init__(self):
176
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
177
+ self.dtype = torch.float16
178
+
179
+ @classmethod
180
+ def INPUT_TYPES(s):
181
+ return {
182
+ "required": {
183
+ "pipeline": ("PIPELINE",),
184
+ "scheduler_name": (list(SCHEDULERS.keys()),),
185
+ "shift_snr": ("BOOLEAN", {"default": True}),
186
+ "shift_mode": (
187
+ list(ShiftSNRScheduler.SHIFT_MODES),
188
+ {"default": "interpolated"},
189
+ ),
190
+ "shift_scale": (
191
+ "FLOAT",
192
+ {"default": 8.0, "min": 0.0, "max": 50.0, "step": 1.0},
193
+ ),
194
+ }
195
+ }
196
+
197
+ RETURN_TYPES = ("SCHEDULER",)
198
+
199
+ FUNCTION = "load_scheduler"
200
+
201
+ CATEGORY = "MV-Adapter"
202
+
203
+ def load_scheduler(
204
+ self, pipeline, scheduler_name, shift_snr, shift_mode, shift_scale
205
+ ):
206
+ scheduler = SCHEDULERS[scheduler_name].from_config(
207
+ pipeline.scheduler.config, torch_dtype=self.dtype
208
+ )
209
+ if shift_snr:
210
+ scheduler = ShiftSNRScheduler.from_scheduler(
211
+ scheduler,
212
+ shift_mode=shift_mode,
213
+ shift_scale=shift_scale,
214
+ scheduler_class=scheduler.__class__,
215
+ )
216
+ return (scheduler,)
217
+
218
+
219
+ # ADDED: Karras version — same inputs/outputs, but always returns a DPM++ (Karras) scheduler.
220
+ class DiffusersMVSchedulerLoaderKarras:
221
+ def __init__(self):
222
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
223
+ self.dtype = torch.float16
224
+
225
+ @classmethod
226
+ def INPUT_TYPES(s):
227
+ return {
228
+ "required": {
229
+ "pipeline": ("PIPELINE",),
230
+ "scheduler_name": (list(SCHEDULERS.keys()),),
231
+ "shift_snr": ("BOOLEAN", {"default": True}),
232
+ "shift_mode": (
233
+ list(ShiftSNRSchedulerKarras.SHIFT_MODES),
234
+ {"default": "interpolated"},
235
+ ),
236
+ "shift_scale": (
237
+ "FLOAT",
238
+ {"default": 8.0, "min": 0.0, "max": 50.0, "step": 1.0},
239
+ ),
240
+ }
241
+ }
242
+
243
+ RETURN_TYPES = ("SCHEDULER",)
244
+
245
+ FUNCTION = "load_scheduler"
246
+
247
+ CATEGORY = "MV-Adapter"
248
+
249
+ def load_scheduler(
250
+ self, pipeline, scheduler_name, shift_snr, shift_mode, shift_scale
251
+ ):
252
+ # Build a base scheduler from the pipeline config (kept for parity with original UI),
253
+ # then *replace* it with DPM++ (Karras). If SNR shift is requested, apply via your Karras class.
254
+ base_sched = SCHEDULERS[scheduler_name].from_config(
255
+ pipeline.scheduler.config, torch_dtype=self.dtype
256
+ )
257
+
258
+ # Always use DPM++ Karras:
259
+ if shift_snr:
260
+ # Apply your Karras-enabled Shift SNR on top, and force DPM++ class to guarantee Karras works.
261
+ scheduler = ShiftSNRSchedulerKarras.from_scheduler(
262
+ base_sched,
263
+ shift_mode=shift_mode,
264
+ shift_scale=shift_scale,
265
+ scheduler_class=DPMSolverMultistepScheduler,
266
+ )
267
+ else:
268
+ # No SNR shift requested: just return DPM++ with Karras sigmas
269
+ scheduler = DPMSolverMultistepScheduler.from_config(
270
+ pipeline.scheduler.config,
271
+ algorithm_type="dpmsolver++",
272
+ use_karras_sigmas=True,
273
+ torch_dtype=self.dtype,
274
+ )
275
+
276
+ return (scheduler,)
277
+
278
+
279
+ class CustomLoraModelLoader:
280
+ def __init__(self):
281
+ self.loaded_lora = None
282
+
283
+ @classmethod
284
+ def INPUT_TYPES(s):
285
+ return {
286
+ "required": {
287
+ "pipeline": ("PIPELINE",),
288
+ "lora_name": (folder_paths.get_filename_list("loras"),),
289
+ "strength_model": (
290
+ "FLOAT",
291
+ {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01},
292
+ ),
293
+ "enable": (
294
+ "BOOLEAN",
295
+ {"default": True},
296
+ ),
297
+ "last_lora_node": (
298
+ "BOOLEAN",
299
+ {"default": True},
300
+ ),
301
+ }
302
+ }
303
+
304
+ RETURN_TYPES = ("PIPELINE",)
305
+ FUNCTION = "load_lora"
306
+ CATEGORY = "MV-Adapter"
307
+
308
+ def load_lora(self, pipeline, lora_name, strength_model, enable, last_lora_node):
309
+ if not hasattr(pipeline, "loaded_loras"):
310
+ pipeline.loaded_loras = []
311
+
312
+ lora_path = folder_paths.get_full_path("loras", lora_name)
313
+ lora_dir = os.path.dirname(lora_path)
314
+ lora_name = os.path.basename(lora_path)
315
+ lora = None
316
+ if enable:
317
+ if self.loaded_lora is not None:
318
+ if self.loaded_lora[0] == lora_path:
319
+ lora = self.loaded_lora[1]
320
+ else:
321
+ temp = self.loaded_lora
322
+ pipeline.delete_adapters(temp[1])
323
+ pipeline.loaded_loras = [(name, strength) for (name, strength) in pipeline.loaded_loras if name != temp[1]]
324
+ self.loaded_lora = None
325
+
326
+ if lora is None:
327
+ adapter_name = lora_name.rsplit(".", 1)[0]
328
+ pipeline.load_lora_weights(
329
+ lora_dir, weight_name=lora_name, adapter_name=adapter_name
330
+ )
331
+ pipeline.set_adapters(adapter_name, strength_model)
332
+ self.loaded_lora = (lora_path, adapter_name)
333
+ lora = adapter_name
334
+
335
+ pipeline.loaded_loras.append((adapter_name, strength_model))
336
+ else:
337
+ # Delete the loaded lora
338
+ if self.loaded_lora is not None:
339
+ temp = self.loaded_lora
340
+ pipeline.delete_adapters(temp[1])
341
+ pipeline.loaded_loras = [(name, strength) for (name, strength) in pipeline.loaded_loras if name != temp[1]]
342
+ self.loaded_lora = None
343
+
344
+ if last_lora_node:
345
+ adapter_names = [x[0] for x in pipeline.loaded_loras]
346
+ strengths = [x[1] for x in pipeline.loaded_loras]
347
+ pipeline.set_adapters(adapter_names, strengths)
348
+
349
+ print(adapter_names)
350
+
351
+ return (pipeline,)
352
+
353
+
354
+ class ControlNetModelLoader:
355
+ def __init__(self):
356
+ self.loaded_controlnet = None
357
+ self.dtype = torch.float16
358
+ self.torch_device = get_torch_device()
359
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
360
+
361
+ @classmethod
362
+ def INPUT_TYPES(s):
363
+ return {
364
+ "required": {
365
+ "pipeline": ("PIPELINE",),
366
+ "controlnet_name": (
367
+ "STRING",
368
+ {"default": "xinsir/controlnet-scribble-sdxl-1.0"},
369
+ ),
370
+ }
371
+ }
372
+
373
+ RETURN_TYPES = ("PIPELINE",)
374
+ FUNCTION = "load_controlnet"
375
+ CATEGORY = "MV-Adapter"
376
+
377
+ def load_controlnet(self, pipeline, controlnet_name):
378
+ controlnet = None
379
+ if self.loaded_controlnet is not None:
380
+ if self.loaded_controlnet == controlnet_name:
381
+ controlnet = self.loaded_controlnet
382
+ else:
383
+ del pipeline.controlnet
384
+ self.loaded_controlnet = None
385
+
386
+ if controlnet is None:
387
+ controlnet = ControlNetModel.from_pretrained(
388
+ controlnet_name, cache_dir=self.hf_dir, torch_dtype=self.dtype
389
+ )
390
+ pipeline.controlnet = controlnet
391
+ pipeline.controlnet.to(device=self.torch_device, dtype=self.dtype)
392
+
393
+ self.loaded_controlnet = controlnet_name
394
+ controlnet = controlnet_name
395
+
396
+ return (pipeline,)
397
+
398
+
399
+ class DiffusersMVModelMakeup:
400
+ def __init__(self):
401
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
402
+ self.torch_device = get_torch_device()
403
+ self.dtype = torch.float16
404
+
405
+ @classmethod
406
+ def INPUT_TYPES(s):
407
+ return {
408
+ "required": {
409
+ "pipeline": ("PIPELINE",),
410
+ "scheduler": ("SCHEDULER",),
411
+ "autoencoder": ("AUTOENCODER",),
412
+ "load_mvadapter": ("BOOLEAN", {"default": True}),
413
+ "adapter_path": ("STRING", {"default": "huanngzh/mv-adapter"}),
414
+ "adapter_name": (
415
+ MVADAPTERS,
416
+ {"default": "mvadapter_t2mv_sdxl.safetensors"},
417
+ ),
418
+ "num_views": ("INT", {"default": 6, "min": 1, "max": 12}),
419
+ },
420
+ "optional": {
421
+ "enable_vae_slicing": ("BOOLEAN", {"default": True}),
422
+ "enable_vae_tiling": ("BOOLEAN", {"default": False}),
423
+ },
424
+ }
425
+
426
+ RETURN_TYPES = ("PIPELINE",)
427
+
428
+ FUNCTION = "makeup_pipeline"
429
+
430
+ CATEGORY = "MV-Adapter"
431
+
432
+ def makeup_pipeline(
433
+ self,
434
+ pipeline,
435
+ scheduler,
436
+ autoencoder,
437
+ load_mvadapter,
438
+ adapter_path,
439
+ adapter_name,
440
+ num_views,
441
+ enable_vae_slicing=True,
442
+ enable_vae_tiling=False,
443
+ ):
444
+ pipeline.vae = autoencoder
445
+ pipeline.scheduler = scheduler
446
+
447
+ if load_mvadapter:
448
+ pipeline.init_custom_adapter(num_views=num_views)
449
+ pipeline.load_custom_adapter(
450
+ adapter_path, weight_name=adapter_name, cache_dir=self.hf_dir
451
+ )
452
+ pipeline.cond_encoder.to(device=self.torch_device, dtype=self.dtype)
453
+
454
+ pipeline = pipeline.to(self.torch_device, self.dtype)
455
+
456
+ if enable_vae_slicing:
457
+ pipeline.enable_vae_slicing()
458
+ if enable_vae_tiling:
459
+ pipeline.enable_vae_tiling()
460
+
461
+ return (pipeline,)
462
+
463
+
464
+ class DiffusersSampler:
465
+ def __init__(self):
466
+ self.torch_device = get_torch_device()
467
+
468
+ @classmethod
469
+ def INPUT_TYPES(s):
470
+ return {
471
+ "required": {
472
+ "pipeline": ("PIPELINE",),
473
+ "prompt": (
474
+ "STRING",
475
+ {"multiline": True, "default": "a photo of a cat"},
476
+ ),
477
+ "negative_prompt": (
478
+ "STRING",
479
+ {
480
+ "multiline": True,
481
+ "default": "watermark, ugly, deformed, noisy, blurry, low contrast",
482
+ },
483
+ ),
484
+ "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
485
+ "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
486
+ "steps": ("INT", {"default": 50, "min": 1, "max": 2000}),
487
+ "cfg": (
488
+ "FLOAT",
489
+ {
490
+ "default": 7.0,
491
+ "min": 0.0,
492
+ "max": 100.0,
493
+ "step": 0.1,
494
+ "round": 0.01,
495
+ },
496
+ ),
497
+ "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}),
498
+ }
499
+ }
500
+
501
+ RETURN_TYPES = ("IMAGE",)
502
+
503
+ FUNCTION = "sample"
504
+
505
+ CATEGORY = "MV-Adapter"
506
+
507
+ def sample(
508
+ self,
509
+ pipeline,
510
+ prompt,
511
+ negative_prompt,
512
+ height,
513
+ width,
514
+ steps,
515
+ cfg,
516
+ seed,
517
+ ):
518
+ images = pipeline(
519
+ prompt=prompt,
520
+ height=height,
521
+ width=width,
522
+ num_inference_steps=steps,
523
+ guidance_scale=cfg,
524
+ negative_prompt=negative_prompt,
525
+ generator=torch.Generator(self.torch_device).manual_seed(seed),
526
+ ).images
527
+ return (convert_images_to_tensors(images),)
528
+
529
+
530
+ class DiffusersMVSampler:
531
+ def __init__(self):
532
+ self.torch_device = get_torch_device()
533
+
534
+ @classmethod
535
+ def INPUT_TYPES(s):
536
+ return {
537
+ "required": {
538
+ "pipeline": ("PIPELINE",),
539
+ "num_views": ("INT", {"default": 6, "min": 1, "max": 12}),
540
+ "prompt": (
541
+ "STRING",
542
+ {"multiline": True, "default": "an astronaut riding a horse"},
543
+ ),
544
+ "negative_prompt": (
545
+ "STRING",
546
+ {
547
+ "multiline": True,
548
+ "default": "watermark, ugly, deformed, noisy, blurry, low contrast",
549
+ },
550
+ ),
551
+ "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
552
+ "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
553
+ "steps": ("INT", {"default": 50, "min": 1, "max": 2000}),
554
+ "cfg": (
555
+ "FLOAT",
556
+ {
557
+ "default": 7.0,
558
+ "min": 0.0,
559
+ "max": 100.0,
560
+ "step": 0.1,
561
+ "round": 0.01,
562
+ },
563
+ ),
564
+ "seed": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFFFFFFFFFFFF}),
565
+ },
566
+ "optional": {
567
+ "reference_image": ("IMAGE",),
568
+ "controlnet_image": ("IMAGE",),
569
+ "controlnet_conditioning_scale": ("FLOAT", {"default": 1.0}),
570
+ "azimuth_degrees": ("LIST", {"default": [0, 45, 90, 180, 270, 315]}),
571
+ },
572
+ }
573
+
574
+ RETURN_TYPES = ("IMAGE",)
575
+
576
+ FUNCTION = "sample"
577
+
578
+ CATEGORY = "MV-Adapter"
579
+
580
+ def sample(
581
+ self,
582
+ pipeline,
583
+ num_views,
584
+ prompt,
585
+ negative_prompt,
586
+ height,
587
+ width,
588
+ steps,
589
+ cfg,
590
+ seed,
591
+ reference_image=None,
592
+ controlnet_image=None,
593
+ controlnet_conditioning_scale=1.0,
594
+ azimuth_degrees=[0, 45, 90, 180, 270, 315],
595
+ ):
596
+ num_views = len(azimuth_degrees)
597
+ control_images = prepare_camera_embed(
598
+ num_views, width, self.torch_device, azimuth_degrees
599
+ )
600
+
601
+ pipe_kwargs = {}
602
+ if reference_image is not None:
603
+ pipe_kwargs.update(
604
+ {
605
+ "reference_image": convert_tensors_to_images(reference_image)[0],
606
+ "reference_conditioning_scale": 1.0,
607
+ }
608
+ )
609
+ if controlnet_image is not None:
610
+ controlnet_image = convert_tensors_to_images(controlnet_image)
611
+ pipe_kwargs.update(
612
+ {
613
+ "controlnet_image": controlnet_image,
614
+ "controlnet_conditioning_scale": controlnet_conditioning_scale,
615
+ }
616
+ )
617
+
618
+ images = pipeline(
619
+ prompt=prompt,
620
+ height=height,
621
+ width=width,
622
+ num_inference_steps=steps,
623
+ guidance_scale=cfg,
624
+ num_images_per_prompt=num_views,
625
+ control_image=control_images,
626
+ control_conditioning_scale=1.0,
627
+ negative_prompt=negative_prompt,
628
+ generator=torch.Generator(self.torch_device).manual_seed(seed),
629
+ cross_attention_kwargs={"num_views": num_views},
630
+ **pipe_kwargs,
631
+ ).images
632
+ return (convert_images_to_tensors(images),)
633
+
634
+
635
+ class BiRefNet:
636
+ def __init__(self):
637
+ self.hf_dir = folder_paths.get_folder_paths("diffusers")[0]
638
+ self.torch_device = get_torch_device()
639
+ self.dtype = torch.float32
640
+
641
+ RETURN_TYPES = ("FUNCTION",)
642
+
643
+ FUNCTION = "load_model_fn"
644
+
645
+ CATEGORY = "MV-Adapter"
646
+
647
+ @classmethod
648
+ def INPUT_TYPES(s):
649
+ return {
650
+ "required": {"ckpt_name": ("STRING", {"default": "briaai/RMBG-2.0"})}
651
+ }
652
+
653
+ def remove_bg(self, image, net, transform, device):
654
+ image_size = image.size
655
+ input_images = transform(image).unsqueeze(0).to(device)
656
+ with torch.no_grad():
657
+ preds = net(input_images)[-1].sigmoid().cpu()
658
+ pred = preds[0].squeeze()
659
+ pred_pil = transforms.ToPILImage()(pred)
660
+ mask = pred_pil.resize(image_size)
661
+ image.putalpha(mask)
662
+ return image
663
+
664
+ def load_model_fn(self, ckpt_name):
665
+ model = AutoModelForImageSegmentation.from_pretrained(
666
+ ckpt_name, trust_remote_code=True, cache_dir=self.hf_dir
667
+ ).to(self.torch_device, self.dtype)
668
+
669
+ transform_image = transforms.Compose(
670
+ [
671
+ transforms.Resize((1024, 1024)),
672
+ transforms.ToTensor(),
673
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
674
+ ]
675
+ )
676
+
677
+ remove_bg_fn = lambda x: self.remove_bg(
678
+ x, model, transform_image, self.torch_device
679
+ )
680
+ return (remove_bg_fn,)
681
+
682
+
683
+ class ImagePreprocessor:
684
+ def __init__(self):
685
+ self.torch_device = get_torch_device()
686
+
687
+ @classmethod
688
+ def INPUT_TYPES(s):
689
+ return {
690
+ "required": {
691
+ "remove_bg_fn": ("FUNCTION",),
692
+ "image": ("IMAGE",),
693
+ "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
694
+ "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
695
+ }
696
+ }
697
+
698
+ RETURN_TYPES = ("IMAGE",)
699
+
700
+ FUNCTION = "process"
701
+
702
+ def process(self, remove_bg_fn, image, height, width):
703
+ images = convert_tensors_to_images(image)
704
+ images = [
705
+ preprocess_image(remove_bg_fn(img.convert("RGB")), height, width)
706
+ for img in images
707
+ ]
708
+
709
+ return (convert_images_to_tensors(images),)
710
+
711
+
712
+ class ControlImagePreprocessor:
713
+ def __init__(self):
714
+ self.torch_device = get_torch_device()
715
+
716
+ @classmethod
717
+ def INPUT_TYPES(s):
718
+ return {
719
+ "required": {
720
+ "front_view": ("IMAGE",),
721
+ "front_right_view": ("IMAGE",),
722
+ "right_view": ("IMAGE",),
723
+ "back_view": ("IMAGE",),
724
+ "left_view": ("IMAGE",),
725
+ "front_left_view": ("IMAGE",),
726
+ "width": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
727
+ "height": ("INT", {"default": 768, "min": 1, "max": 2048, "step": 1}),
728
+ }
729
+ }
730
+
731
+ RETURN_TYPES = ("IMAGE",)
732
+
733
+ FUNCTION = "process"
734
+
735
+ def process(
736
+ self,
737
+ front_view,
738
+ front_right_view,
739
+ right_view,
740
+ back_view,
741
+ left_view,
742
+ front_left_view,
743
+ width,
744
+ height,
745
+ ):
746
+ images = torch.cat(
747
+ [
748
+ front_view,
749
+ front_right_view,
750
+ right_view,
751
+ back_view,
752
+ left_view,
753
+ front_left_view,
754
+ ],
755
+ dim=0,
756
+ )
757
+ images = convert_tensors_to_images(images)
758
+ images = [img.resize((width, height)).convert("RGB") for img in images]
759
+ return (convert_images_to_tensors(images),)
760
+
761
+
762
+ class ViewSelector:
763
+ def __init__(self):
764
+ pass
765
+
766
+ @classmethod
767
+ def INPUT_TYPES(s):
768
+ return {
769
+ "required": {
770
+ "front_view": ("BOOLEAN", {"default": True}),
771
+ "front_right_view": ("BOOLEAN", {"default": True}),
772
+ "right_view": ("BOOLEAN", {"default": True}),
773
+ "back_view": ("BOOLEAN", {"default": True}),
774
+ "left_view": ("BOOLEAN", {"default": True}),
775
+ "front_left_view": ("BOOLEAN", {"default": True}),
776
+ }
777
+ }
778
+
779
+ RETURN_TYPES = ("LIST",)
780
+ FUNCTION = "process"
781
+ CATEGORY = "MV-Adapter"
782
+
783
+ def process(
784
+ self,
785
+ front_view,
786
+ front_right_view,
787
+ right_view,
788
+ back_view,
789
+ left_view,
790
+ front_left_view,
791
+ ):
792
+ azimuth_deg = []
793
+ if front_view:
794
+ azimuth_deg.append(0)
795
+ if front_right_view:
796
+ azimuth_deg.append(45)
797
+ if right_view:
798
+ azimuth_deg.append(90)
799
+ if back_view:
800
+ azimuth_deg.append(180)
801
+ if left_view:
802
+ azimuth_deg.append(270)
803
+ if front_left_view:
804
+ azimuth_deg.append(315)
805
+
806
+ return (azimuth_deg,)
807
+
808
+
809
+ NODE_CLASS_MAPPINGS = {
810
+ "LdmPipelineLoader": LdmPipelineLoader,
811
+ "LdmVaeLoader": LdmVaeLoader,
812
+ "DiffusersMVPipelineLoader": DiffusersMVPipelineLoader,
813
+ "DiffusersMVVaeLoader": DiffusersMVVaeLoader,
814
+ "DiffusersMVSchedulerLoader": DiffusersMVSchedulerLoader,
815
+ # ADDED: Karras version
816
+ "DiffusersMVSchedulerLoaderKarras": DiffusersMVSchedulerLoaderKarras,
817
+ "DiffusersMVModelMakeup": DiffusersMVModelMakeup,
818
+ "CustomLoraModelLoader": CustomLoraModelLoader,
819
+ "DiffusersMVSampler": DiffusersMVSampler,
820
+ "BiRefNet": BiRefNet,
821
+ "ImagePreprocessor": ImagePreprocessor,
822
+ "ControlNetModelLoader": ControlNetModelLoader,
823
+ "ControlImagePreprocessor": ControlImagePreprocessor,
824
+ "ViewSelector": ViewSelector,
825
+ }
826
+
827
+ NODE_DISPLAY_NAME_MAPPINGS = {
828
+ "LdmPipelineLoader": "LDM Pipeline Loader",
829
+ "LdmVaeLoader": "LDM Vae Loader",
830
+ "DiffusersMVPipelineLoader": "Diffusers MV Pipeline Loader",
831
+ "DiffusersMVVaeLoader": "Diffusers MV Vae Loader",
832
+ "DiffusersMVSchedulerLoader": "Diffusers MV Scheduler Loader",
833
+ # ADDED: Karras version
834
+ "DiffusersMVSchedulerLoaderKarras": "Diffusers MV Scheduler Loader (Karras)",
835
+ "DiffusersMVModelMakeup": "Diffusers MV Model Makeup",
836
+ "CustomLoraModelLoader": "Custom Lora Model Loader",
837
+ "DiffusersMVSampler": "Diffusers MV Sampler",
838
+ "BiRefNet": "BiRefNet",
839
+ "ImagePreprocessor": "Image Preprocessor",
840
+ "ControlNetModelLoader": "ControlNet Model Loader",
841
+ "ControlImagePreprocessor": "Control Image Preprocessor",
842
+ "ViewSelector": "View Selector",
843
+ }
comfyui-mvadapter/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
comfyui-mvadapter/README.md ADDED
@@ -0,0 +1,88 @@
1
+ # ComfyUI-MVAdapter
2
+
3
+ This extension integrates [MV-Adapter](https://github.com/huanngzh/MV-Adapter) into ComfyUI, allowing users to generate multi-view consistent images from text prompts or single images directly within the ComfyUI interface.
4
+
5
+ ## 🔥 Feature Updates
6
+
7
+ * [2025-06-26] Support multiple LoRAs for multi-view synthesis [See [here](https://github.com/huanngzh/ComfyUI-MVAdapter/pull/96)]
8
+ * [2025-01-15] Support selecting which views to generate, e.g. only 2 views (front & back) [See [here](#view-selection)]
9
+ * [2024-12-25] Support integration with ControlNet, for applications such as scribble-to-multi-view generation [See [here](#with-controlnet)]
10
+ * [2024-12-09] Support integration with SDXL LoRA [See [here](#with-lora)]
11
+ * [2024-12-02] Generate multi-view consistent images from text prompts or a single image
12
+
13
+ ## Installation
14
+
15
+ ### From Source
16
+
17
+ * Clone or download this repository into your `ComfyUI/custom_nodes/` directory.
18
+ * Install the required dependencies by running `pip install -r requirements.txt`.
19
+
20
+ ## Notes
21
+
22
+ ### Workflows
23
+
24
+ We provide example workflows in the `workflows` directory.
25
+
26
+ Note that our code depends on diffusers and will automatically download model weights from Hugging Face to the Hugging Face cache path on first use. The `ckpt_name` in the node corresponds to the model name on Hugging Face, such as `stabilityai/stable-diffusion-xl-base-1.0`; a minimal sketch of what the loader does is shown below.
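The sketch below is illustrative only: the import path and cache directory are assumptions, not exact values used inside ComfyUI (see `DiffusersMVPipelineLoader.create_pipeline` in `nodes.py` for the actual node code).

```python
# Sketch only: mirrors DiffusersMVPipelineLoader.create_pipeline in this commit.
import torch
from mvadapter.pipelines.pipeline_mvadapter_t2mv_sdxl import MVAdapterT2MVSDXLPipeline

pipe = MVAdapterT2MVSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # ckpt_name: a Hugging Face model id
    torch_dtype=torch.float16,
    cache_dir="models/diffusers",                # assumed cache directory
)
# The node then exposes the pipeline, its VAE, and its scheduler to downstream nodes.
```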
27
+
28
+ We also provide the `Ldm**Loader` nodes for loading text-to-image models in `ldm` (single checkpoint file) format; see the sketch below. Please refer to the workflow files with the `_ldm.json` suffix.
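A rough, non-authoritative sketch of the ldm-format path, based on `LdmPipelineLoader` in this commit (the checkpoint path is a placeholder):

```python
# Sketch only: mirrors LdmPipelineLoader.create_pipeline; the path is hypothetical.
import torch
from mvadapter.pipelines.pipeline_mvadapter_t2mv_sdxl import MVAdapterT2MVSDXLPipeline

pipe = MVAdapterT2MVSDXLPipeline.from_single_file(
    "ComfyUI/models/checkpoints/sdxl_base_1.0.safetensors",  # local ldm-format checkpoint
    torch_dtype=torch.float16,
)
```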
29
+
30
+ ### GPU Memory
31
+
32
+ If your GPU resources are limited, we recommend using the following configuration:
33
+
34
+ * Use [madebyollin/sdxl-vae-fp16-fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) as the VAE. If using an ldm-format pipeline, remember to set `upcast_fp32` to `False`.
35
+
36
+ ![upcast_fp32_to_false](assets/comfyui_ldm_vae.png)
37
+
38
+ * Set `enable_vae_slicing` in the Diffusers Model Makeup node to `True`.
39
+
40
+ ![enable_vae_slicing](assets/comfyui_model_makeup.png)
41
+
42
+ However, since SDXL is used as the base model, generation still requires about 13–14 GB of GPU memory. A sketch of this low-memory configuration is shown below.
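The following sketch mirrors the `DiffusersMVVaeLoader` and Diffusers Model Makeup nodes in this commit; it assumes `pipe` comes from one of the loader sketches above rather than the node graph itself.

```python
# Sketch only: fp16 VAE plus VAE slicing, as recommended above.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix",  # fp16-safe SDXL VAE
    torch_dtype=torch.float16,
)
pipe.vae = vae               # swap the fp16 VAE into the pipeline
pipe.enable_vae_slicing()    # decode generated views one at a time to cut peak memory
# pipe.enable_vae_tiling()   # optional, saves more memory at high resolutions
```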
43
+
44
+ ## Usage
45
+
46
+ ### Text to Multi-view Images
47
+
48
+ #### With SDXL or other base models
49
+
50
+ ![comfyui_t2mv](assets/comfyui_t2mv.png)
51
+
52
+ * `workflows/t2mv_sdxl_diffusers.json` for loading diffusers-format models
53
+ * `workflows/t2mv_sdxl_ldm.json` for loading ldm-format models
54
+
55
+ #### With LoRA
56
+
57
+ ![comfyui_t2mv_lora](assets/comfyui_t2mv_lora.png)
58
+
59
+ `workflows/t2mv_sdxl_ldm_lora.json` for loading ldm-format models with LoRA for text-to-multi-view generation
60
+
61
+ #### With ControlNet
62
+
63
+ ![comfyui_t2mv_controlnet](assets/comfyui_t2mv_controlnet.png)
64
+
65
+ `workflows/t2mv_sdxl_ldm_controlnet.json` for loading diffusers-format controlnets for text-scribble-to-multi-view generation
66
+
67
+ ### Image to Multi-view Images
68
+
69
+ #### With SDXL or other base models
70
+
71
+ ![comfyui_i2mv](assets/comfyui_i2mv.png)
72
+
73
+ * `workflows/i2mv_sdxl_diffusers.json` for loading diffusers-format models
74
+ * `workflows/i2mv_sdxl_ldm.json` for loading ldm-format models
75
+
76
+ #### With LoRA
77
+
78
+ ![comfyui_i2mv_lora](assets/comfyui_i2mv_lora.png)
79
+
80
+ `workflows/i2mv_sdxl_ldm_lora.json` for loading ldm-format models with LoRA for image-to-multi-view generation
81
+
82
+ #### View Selection
83
+
84
+ ![comfyui_i2mv_pair_views](assets/comfyui_i2mv_view_selector.png)
85
+
86
+ `workflows/i2mv_sdxl_ldm_view_selector.json` for loading ldm-format models and selecting specific views to generate
87
+
88
+ The key is to replace the `adapter_name` in `Diffusers Model Makeup` with `mvadapter_i2mv_sdxl_beta.safetensors` and add a `View Selector` node to choose which views to generate. In rough tests, the beta model works best when generating 2 views (front & back), 3 views (front, right & back), or 4 views (front, right, back & left). Note that the `num_views` attribute is not used and can be ignored (see the sketch below).
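Under the hood, the `View Selector` node maps the enabled views to azimuth angles, and the sampler recomputes the number of views from that list, which is why `num_views` is ignored. A minimal sketch following the `ViewSelector` and `DiffusersMVSampler` code in this commit:

```python
# Sketch only: view names -> azimuth degrees, as in ViewSelector.process.
VIEW_AZIMUTHS = {
    "front": 0,
    "front_right": 45,
    "right": 90,
    "back": 180,
    "left": 270,
    "front_left": 315,
}

# Example: generate only the front and back views.
azimuth_degrees = [VIEW_AZIMUTHS[v] for v in ("front", "back")]  # -> [0, 180]
num_views = len(azimuth_degrees)  # the sampler derives num_views from this list
```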
comfyui-mvadapter/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ # __init__.py for comfyui-mvadapter
2
+ # Register BOTH node sets: the original nodes.py and nodes_local_mv.py
3
+
4
+ import traceback
5
+
6
+ # Load the original nodes (if present)
7
+ try:
8
+ from .nodes import (
9
+ NODE_CLASS_MAPPINGS as CORE_NODE_CLASS_MAPPINGS,
10
+ NODE_DISPLAY_NAME_MAPPINGS as CORE_NODE_DISPLAY_NAME_MAPPINGS,
11
+ )
12
+ except Exception as e:
13
+ print("[comfyui-mvadapter] WARN: Failed to import .nodes")
14
+ traceback.print_exc()
15
+ CORE_NODE_CLASS_MAPPINGS = {}
16
+ CORE_NODE_DISPLAY_NAME_MAPPINGS = {}
17
+
18
+ # Load the local-only nodes (if present)
19
+ try:
20
+ from .nodes_local_mv import (
21
+ NODE_CLASS_MAPPINGS as LOCAL_NODE_CLASS_MAPPINGS,
22
+ NODE_DISPLAY_NAME_MAPPINGS as LOCAL_NODE_DISPLAY_NAME_MAPPINGS,
23
+ )
24
+ except Exception as e:
25
+ print("[comfyui-mvadapter] WARN: Failed to import .nodes_local_mv")
26
+ traceback.print_exc()
27
+ LOCAL_NODE_CLASS_MAPPINGS = {}
28
+ LOCAL_NODE_DISPLAY_NAME_MAPPINGS = {}
29
+
30
+ # Merge into the symbols ComfyUI looks for
31
+ NODE_CLASS_MAPPINGS = {}
32
+ NODE_CLASS_MAPPINGS.update(CORE_NODE_CLASS_MAPPINGS)
33
+ NODE_CLASS_MAPPINGS.update(LOCAL_NODE_CLASS_MAPPINGS)
34
+
35
+ NODE_DISPLAY_NAME_MAPPINGS = {}
36
+ NODE_DISPLAY_NAME_MAPPINGS.update(CORE_NODE_DISPLAY_NAME_MAPPINGS)
37
+ NODE_DISPLAY_NAME_MAPPINGS.update(LOCAL_NODE_DISPLAY_NAME_MAPPINGS)
38
+
39
+ # Optional: quick summary to help debug load order
40
+ print(
41
+ "[comfyui-mvadapter] Registered nodes:",
42
+ ", ".join(sorted(NODE_CLASS_MAPPINGS.keys())) or "(none)",
43
+ )
44
+
45
+ __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
comfyui-mvadapter/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.4 kB).

comfyui-mvadapter/__pycache__/nodes.cpython-312.pyc ADDED
Binary file (8.32 kB).

comfyui-mvadapter/__pycache__/nodes_local_mv.cpython-312.pyc ADDED
Binary file (10.7 kB).

comfyui-mvadapter/__pycache__/utils.cpython-312.pyc ADDED
Binary file (13.6 kB).
 
comfyui-mvadapter/assets/CustomLoraModelLoader.png ADDED
comfyui-mvadapter/assets/comfyui_i2mv.png ADDED

Git LFS Details

  • SHA256: 9c364ee7e709ced6c9fe32111ed8ef0f6b893410b7165d87fa12dc7ec6c61953
  • Pointer size: 131 Bytes
  • Size of remote file: 432 kB
comfyui-mvadapter/assets/comfyui_i2mv_lora.png ADDED

Git LFS Details

  • SHA256: 9d037b0b3f026f308e6dacf9261483a8e9e069507ab09cf86ad22fc5fcf2aa49
  • Pointer size: 131 Bytes
  • Size of remote file: 853 kB
comfyui-mvadapter/assets/comfyui_i2mv_multiple_loras.jpg ADDED

Git LFS Details

  • SHA256: 65c901ec52c76dd2e3ee49e121b52a4589ce9e9f9e67edccf297b5028470768b
  • Pointer size: 131 Bytes
  • Size of remote file: 471 kB
comfyui-mvadapter/assets/comfyui_i2mv_view_selector.png ADDED

Git LFS Details

  • SHA256: 6a48cde4ec2a44b1a9a29d4b9e1aaaf5a9ae287ef2d5ad4fe5da23e876c76c74
  • Pointer size: 131 Bytes
  • Size of remote file: 401 kB
comfyui-mvadapter/assets/comfyui_ldm_vae.png ADDED
comfyui-mvadapter/assets/comfyui_model_makeup.png ADDED
comfyui-mvadapter/assets/comfyui_t2mv.png ADDED

Git LFS Details

  • SHA256: 61f807f5665dbe404be09ab27214ae3e545160c6f99005f7d309e31af15ed41f
  • Pointer size: 131 Bytes
  • Size of remote file: 311 kB
comfyui-mvadapter/assets/comfyui_t2mv_controlnet.png ADDED

Git LFS Details

  • SHA256: b1b1923de261e12963fc5dbdc929e3f4f832aae34cb198beab14748c24758aee
  • Pointer size: 131 Bytes
  • Size of remote file: 426 kB
comfyui-mvadapter/assets/comfyui_t2mv_lora.png ADDED

Git LFS Details

  • SHA256: 62293e0d4897848f7b2117d5b18036c9ed82b01eaa7b9e39e55ed33f53ee0ec3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
comfyui-mvadapter/assets/comfyui_t2mv_multiple_loras.jpg ADDED

Git LFS Details

  • SHA256: 7436db15d4fb65113bc544eeda0ad9be9ee03cb589959847763eaf85fe93f65e
  • Pointer size: 131 Bytes
  • Size of remote file: 492 kB
comfyui-mvadapter/assets/demo/scribbles/scribble_0.png ADDED
comfyui-mvadapter/assets/demo/scribbles/scribble_1.png ADDED
comfyui-mvadapter/assets/demo/scribbles/scribble_2.png ADDED
comfyui-mvadapter/assets/demo/scribbles/scribble_3.png ADDED
comfyui-mvadapter/assets/demo/scribbles/scribble_4.png ADDED
comfyui-mvadapter/assets/demo/scribbles/scribble_5.png ADDED
comfyui-mvadapter/cache/stable-diffusion-v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "jpg"
11
+ cond_stage_key: "txt"
12
+ image_size: 64
13
+ channels: 4
14
+ cond_stage_trainable: false # Note: different from the one we trained before
15
+ conditioning_key: crossattn
16
+ monitor: val/loss_simple_ema
17
+ scale_factor: 0.18215
18
+ use_ema: False
19
+
20
+ scheduler_config: # 10000 warmup steps
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps: [ 10000 ]
24
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
25
+ f_start: [ 1.e-6 ]
26
+ f_max: [ 1. ]
27
+ f_min: [ 1. ]
28
+
29
+ unet_config:
30
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
31
+ params:
32
+ image_size: 32 # unused
33
+ in_channels: 4
34
+ out_channels: 4
35
+ model_channels: 320
36
+ attention_resolutions: [ 4, 2, 1 ]
37
+ num_res_blocks: 2
38
+ channel_mult: [ 1, 2, 4, 4 ]
39
+ num_heads: 8
40
+ use_spatial_transformer: True
41
+ transformer_depth: 1
42
+ context_dim: 768
43
+ use_checkpoint: True
44
+ legacy: False
45
+
46
+ first_stage_config:
47
+ target: ldm.models.autoencoder.AutoencoderKL
48
+ params:
49
+ embed_dim: 4
50
+ monitor: val/rec_loss
51
+ ddconfig:
52
+ double_z: true
53
+ z_channels: 4
54
+ resolution: 256
55
+ in_channels: 3
56
+ out_ch: 3
57
+ ch: 128
58
+ ch_mult:
59
+ - 1
60
+ - 2
61
+ - 4
62
+ - 4
63
+ num_res_blocks: 2
64
+ attn_resolutions: []
65
+ dropout: 0.0
66
+ lossconfig:
67
+ target: torch.nn.Identity
68
+
69
+ cond_stage_config:
70
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
comfyui-mvadapter/mvadapter/__init__.py ADDED
File without changes
comfyui-mvadapter/mvadapter/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes).
 
comfyui-mvadapter/mvadapter/loaders/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .custom_adapter import CustomAdapterMixin
comfyui-mvadapter/mvadapter/loaders/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (229 Bytes).

comfyui-mvadapter/mvadapter/loaders/__pycache__/custom_adapter.cpython-312.pyc ADDED
Binary file (4.44 kB).
 
comfyui-mvadapter/mvadapter/loaders/custom_adapter.py ADDED
@@ -0,0 +1,98 @@
1
+ import os
2
+ from typing import Dict, Optional, Union
3
+
4
+ import safetensors
5
+ import torch
6
+ from diffusers.utils import _get_model_file, logging
7
+ from safetensors import safe_open
8
+
9
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
10
+
11
+
12
+ class CustomAdapterMixin:
13
+ def init_custom_adapter(self, *args, **kwargs):
14
+ self._init_custom_adapter(*args, **kwargs)
15
+
16
+ def _init_custom_adapter(self, *args, **kwargs):
17
+ raise NotImplementedError
18
+
19
+ def load_custom_adapter(
20
+ self,
21
+ pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
22
+ weight_name: str,
23
+ subfolder: Optional[str] = None,
24
+ **kwargs,
25
+ ):
26
+ # Load the main state dict first.
27
+ cache_dir = kwargs.pop("cache_dir", None)
28
+ force_download = kwargs.pop("force_download", False)
29
+ proxies = kwargs.pop("proxies", None)
30
+ local_files_only = kwargs.pop("local_files_only", None)
31
+ token = kwargs.pop("token", None)
32
+ revision = kwargs.pop("revision", None)
33
+
34
+ user_agent = {
35
+ "file_type": "attn_procs_weights",
36
+ "framework": "pytorch",
37
+ }
38
+
39
+ if not isinstance(pretrained_model_name_or_path_or_dict, dict):
40
+ model_file = _get_model_file(
41
+ pretrained_model_name_or_path_or_dict,
42
+ weights_name=weight_name,
43
+ subfolder=subfolder,
44
+ cache_dir=cache_dir,
45
+ force_download=force_download,
46
+ proxies=proxies,
47
+ local_files_only=local_files_only,
48
+ token=token,
49
+ revision=revision,
50
+ user_agent=user_agent,
51
+ )
52
+ if weight_name.endswith(".safetensors"):
53
+ state_dict = {}
54
+ with safe_open(model_file, framework="pt", device="cpu") as f:
55
+ for key in f.keys():
56
+ state_dict[key] = f.get_tensor(key)
57
+ else:
58
+ state_dict = torch.load(model_file, map_location="cpu")
59
+ else:
60
+ state_dict = pretrained_model_name_or_path_or_dict
61
+
62
+ self._load_custom_adapter(state_dict)
63
+
64
+ def _load_custom_adapter(self, state_dict):
65
+ raise NotImplementedError
66
+
67
+ def save_custom_adapter(
68
+ self,
69
+ save_directory: Union[str, os.PathLike],
70
+ weight_name: str,
71
+ safe_serialization: bool = False,
72
+ **kwargs,
73
+ ):
74
+ if os.path.isfile(save_directory):
75
+ logger.error(
76
+ f"Provided path ({save_directory}) should be a directory, not a file"
77
+ )
78
+ return
79
+
80
+ if safe_serialization:
81
+
82
+ def save_function(weights, filename):
83
+ return safetensors.torch.save_file(
84
+ weights, filename, metadata={"format": "pt"}
85
+ )
86
+
87
+ else:
88
+ save_function = torch.save
89
+
90
+ # Save the model
91
+ state_dict = self._save_custom_adapter(**kwargs)
92
+ save_function(state_dict, os.path.join(save_directory, weight_name))
93
+ logger.info(
94
+ f"Custom adapter weights saved in {os.path.join(save_directory, weight_name)}"
95
+ )
96
+
97
+ def _save_custom_adapter(self):
98
+ raise NotImplementedError
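A minimal sketch of how a pipeline plugs into `CustomAdapterMixin`: subclasses implement the three underscore hooks, and callers go through `init_custom_adapter` / `save_custom_adapter` / `load_custom_adapter`. The class, module, and paths below are illustrative only, not part of the repository:

    import os

    import torch.nn as nn

    from mvadapter.loaders import CustomAdapterMixin  # assumes the mvadapter package is importable

    class ToyAdapterPipeline(CustomAdapterMixin):
        def _init_custom_adapter(self, hidden_size: int = 8):
            # create the extra modules the adapter introduces
            self.adapter_proj = nn.Linear(hidden_size, hidden_size)

        def _load_custom_adapter(self, state_dict):
            # restore only the adapter modules from the loaded weights
            self.adapter_proj.load_state_dict(state_dict, strict=False)

        def _save_custom_adapter(self, **kwargs):
            # return exactly the tensors that should be written to disk
            return self.adapter_proj.state_dict()

    pipe = ToyAdapterPipeline()
    pipe.init_custom_adapter(hidden_size=8)

    os.makedirs("adapter_dir", exist_ok=True)
    pipe.save_custom_adapter("adapter_dir", "adapter.safetensors", safe_serialization=True)
    pipe.load_custom_adapter("adapter_dir", weight_name="adapter.safetensors")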
comfyui-mvadapter/mvadapter/models/__init__.py ADDED
File without changes
comfyui-mvadapter/mvadapter/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (168 Bytes). View file
 
comfyui-mvadapter/mvadapter/models/__pycache__/attention_processor.cpython-312.pyc ADDED
Binary file (13.8 kB). View file
 
comfyui-mvadapter/mvadapter/models/attention_processor.py ADDED
@@ -0,0 +1,377 @@
1
+ import math
2
+ from typing import Callable, Dict, List, Optional, Union
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from diffusers.models.attention_processor import Attention
7
+ from diffusers.models.unets import UNet2DConditionModel
8
+ from diffusers.utils import deprecate, logging
9
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
10
+ from einops import rearrange
11
+ from torch import nn
12
+
13
+
14
+ def default_set_attn_proc_func(
15
+ name: str,
16
+ hidden_size: int,
17
+ cross_attention_dim: Optional[int],
18
+ ori_attn_proc: object,
19
+ ) -> object:
20
+ return ori_attn_proc
21
+
22
+
23
+ def set_unet_2d_condition_attn_processor(
24
+ unet: UNet2DConditionModel,
25
+ set_self_attn_proc_func: Callable = default_set_attn_proc_func,
26
+ set_cross_attn_proc_func: Callable = default_set_attn_proc_func,
27
+ set_custom_attn_proc_func: Callable = default_set_attn_proc_func,
28
+ set_self_attn_module_names: Optional[List[str]] = None,
29
+ set_cross_attn_module_names: Optional[List[str]] = None,
30
+ set_custom_attn_module_names: Optional[List[str]] = None,
31
+ ) -> None:
32
+ do_set_processor = lambda name, module_names: (
33
+ any([name.startswith(module_name) for module_name in module_names])
34
+ if module_names is not None
35
+ else True
36
+ ) # prefix match
37
+
38
+ attn_procs = {}
39
+ for name, attn_processor in unet.attn_processors.items():
40
+ # set attn_processor by default, if module_names is None
41
+ set_self_attn_processor = do_set_processor(name, set_self_attn_module_names)
42
+ set_cross_attn_processor = do_set_processor(name, set_cross_attn_module_names)
43
+ set_custom_attn_processor = do_set_processor(name, set_custom_attn_module_names)
44
+
45
+ if name.startswith("mid_block"):
46
+ hidden_size = unet.config.block_out_channels[-1]
47
+ elif name.startswith("up_blocks"):
48
+ block_id = int(name[len("up_blocks.")])
49
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
50
+ elif name.startswith("down_blocks"):
51
+ block_id = int(name[len("down_blocks.")])
52
+ hidden_size = unet.config.block_out_channels[block_id]
53
+
54
+ is_custom = "attn_mid_blocks" in name or "attn_post_blocks" in name
55
+ if is_custom:
56
+ attn_procs[name] = (
57
+ set_custom_attn_proc_func(name, hidden_size, None, attn_processor)
58
+ if set_custom_attn_processor
59
+ else attn_processor
60
+ )
61
+ else:
62
+ cross_attention_dim = (
63
+ None
64
+ if name.endswith("attn1.processor")
65
+ else unet.config.cross_attention_dim
66
+ )
67
+ if cross_attention_dim is None or "motion_modules" in name:
68
+ # self attention
69
+ attn_procs[name] = (
70
+ set_self_attn_proc_func(
71
+ name, hidden_size, cross_attention_dim, attn_processor
72
+ )
73
+ if set_self_attn_processor
74
+ else attn_processor
75
+ )
76
+ else:
77
+ # cross attention
78
+ attn_procs[name] = (
79
+ set_cross_attn_proc_func(
80
+ name, hidden_size, cross_attention_dim, attn_processor
81
+ )
82
+ if set_cross_attn_processor
83
+ else attn_processor
84
+ )
85
+
86
+ unet.set_attn_processor(attn_procs)
87
+
88
+
89
+ class DecoupledMVRowSelfAttnProcessor2_0(torch.nn.Module):
90
+ r"""
91
+ Attention processor for Decoupled Row-wise Self-Attention and Image Cross-Attention for PyTorch 2.0.
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ query_dim: int,
97
+ inner_dim: int,
98
+ num_views: int = 1,
99
+ name: Optional[str] = None,
100
+ use_mv: bool = True,
101
+ use_ref: bool = False,
102
+ ):
103
+ if not hasattr(F, "scaled_dot_product_attention"):
104
+ raise ImportError(
105
+ "DecoupledMVRowSelfAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
106
+ )
107
+
108
+ super().__init__()
109
+
110
+ self.num_views = num_views
111
+ self.name = name # NOTE: need for image cross-attention
112
+ self.use_mv = use_mv
113
+ self.use_ref = use_ref
114
+
115
+ if self.use_mv:
116
+ self.to_q_mv = nn.Linear(
117
+ in_features=query_dim, out_features=inner_dim, bias=False
118
+ )
119
+ self.to_k_mv = nn.Linear(
120
+ in_features=query_dim, out_features=inner_dim, bias=False
121
+ )
122
+ self.to_v_mv = nn.Linear(
123
+ in_features=query_dim, out_features=inner_dim, bias=False
124
+ )
125
+ self.to_out_mv = nn.ModuleList(
126
+ [
127
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
128
+ nn.Dropout(0.0),
129
+ ]
130
+ )
131
+
132
+ if self.use_ref:
133
+ self.to_q_ref = nn.Linear(
134
+ in_features=query_dim, out_features=inner_dim, bias=False
135
+ )
136
+ self.to_k_ref = nn.Linear(
137
+ in_features=query_dim, out_features=inner_dim, bias=False
138
+ )
139
+ self.to_v_ref = nn.Linear(
140
+ in_features=query_dim, out_features=inner_dim, bias=False
141
+ )
142
+ self.to_out_ref = nn.ModuleList(
143
+ [
144
+ nn.Linear(in_features=inner_dim, out_features=query_dim, bias=True),
145
+ nn.Dropout(0.0),
146
+ ]
147
+ )
148
+
149
+ def __call__(
150
+ self,
151
+ attn: Attention,
152
+ hidden_states: torch.FloatTensor,
153
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
154
+ attention_mask: Optional[torch.FloatTensor] = None,
155
+ temb: Optional[torch.FloatTensor] = None,
156
+ mv_scale: float = 1.0,
157
+ ref_hidden_states: Optional[Dict[str, torch.FloatTensor]] = None,
158
+ ref_scale: float = 1.0,
159
+ cache_hidden_states: Optional[Dict[str, torch.FloatTensor]] = None,
160
+ use_mv: bool = True,
161
+ use_ref: bool = True,
162
+ num_views: Optional[int] = None,
163
+ *args,
164
+ **kwargs,
165
+ ) -> torch.FloatTensor:
166
+ """
167
+ New args:
168
+ mv_scale (float): scale for multi-view self-attention.
169
+ ref_hidden_states (Dict[str, torch.FloatTensor]): per-layer reference hidden states for image cross-attention, keyed by processor name.
170
+ ref_scale (float): scale for image cross-attention.
171
+ cache_hidden_states (Dict[str, torch.FloatTensor]): dictionary filled with this layer's hidden states during the reference UNet pass.
172
+
173
+ """
174
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
175
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
176
+ deprecate("scale", "1.0.0", deprecation_message)
177
+
178
+ if num_views is not None:
179
+ self.num_views = num_views
180
+
181
+ # NEW: cache hidden states for reference unet
182
+ if cache_hidden_states is not None:
183
+ cache_hidden_states[self.name] = hidden_states.clone()
184
+
185
+ # NEW: whether to use multi-view attention and image cross-attention
186
+ use_mv = self.use_mv and use_mv
187
+ use_ref = self.use_ref and use_ref
188
+
189
+ residual = hidden_states
190
+ if attn.spatial_norm is not None:
191
+ hidden_states = attn.spatial_norm(hidden_states, temb)
192
+
193
+ input_ndim = hidden_states.ndim
194
+
195
+ if input_ndim == 4:
196
+ batch_size, channel, height, width = hidden_states.shape
197
+ hidden_states = hidden_states.view(
198
+ batch_size, channel, height * width
199
+ ).transpose(1, 2)
200
+
201
+ batch_size, sequence_length, _ = (
202
+ hidden_states.shape
203
+ if encoder_hidden_states is None
204
+ else encoder_hidden_states.shape
205
+ )
206
+
207
+ if attention_mask is not None:
208
+ attention_mask = attn.prepare_attention_mask(
209
+ attention_mask, sequence_length, batch_size
210
+ )
211
+ # scaled_dot_product_attention expects attention_mask shape to be
212
+ # (batch, heads, source_length, target_length)
213
+ attention_mask = attention_mask.view(
214
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
215
+ )
216
+
217
+ if attn.group_norm is not None:
218
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
219
+ 1, 2
220
+ )
221
+
222
+ query = attn.to_q(hidden_states)
223
+
224
+ # NEW: for decoupled multi-view attention
225
+ if use_mv:
226
+ query_mv = self.to_q_mv(hidden_states)
227
+
228
+ # NEW: for decoupled reference cross attention
229
+ if use_ref:
230
+ query_ref = self.to_q_ref(hidden_states)
231
+
232
+ if encoder_hidden_states is None:
233
+ encoder_hidden_states = hidden_states
234
+ elif attn.norm_cross:
235
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
236
+ encoder_hidden_states
237
+ )
238
+
239
+ key = attn.to_k(encoder_hidden_states)
240
+ value = attn.to_v(encoder_hidden_states)
241
+
242
+ inner_dim = key.shape[-1]
243
+ head_dim = inner_dim // attn.heads
244
+
245
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
246
+
247
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
248
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
249
+
250
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
251
+ # TODO: add support for attn.scale when we move to Torch 2.1
252
+ hidden_states = F.scaled_dot_product_attention(
253
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
254
+ )
255
+
256
+ hidden_states = hidden_states.transpose(1, 2).reshape(
257
+ batch_size, -1, attn.heads * head_dim
258
+ )
259
+ hidden_states = hidden_states.to(query.dtype)
260
+
261
+ ####### Decoupled multi-view self-attention ########
262
+ if use_mv:
263
+ key_mv = self.to_k_mv(encoder_hidden_states)
264
+ value_mv = self.to_v_mv(encoder_hidden_states)
265
+
266
+ query_mv = query_mv.view(batch_size, -1, attn.heads, head_dim)
267
+ key_mv = key_mv.view(batch_size, -1, attn.heads, head_dim)
268
+ value_mv = value_mv.view(batch_size, -1, attn.heads, head_dim)
269
+
270
+ height = width = math.isqrt(sequence_length)
271
+
272
+ # row self-attention
273
+ query_mv = rearrange(
274
+ query_mv,
275
+ "(b nv) (ih iw) h c -> (b nv ih) iw h c",
276
+ nv=self.num_views,
277
+ ih=height,
278
+ iw=width,
279
+ ).transpose(1, 2)
280
+ key_mv = rearrange(
281
+ key_mv,
282
+ "(b nv) (ih iw) h c -> b ih (nv iw) h c",
283
+ nv=self.num_views,
284
+ ih=height,
285
+ iw=width,
286
+ )
287
+ key_mv = (
288
+ key_mv.repeat_interleave(self.num_views, dim=0)
289
+ .view(batch_size * height, -1, attn.heads, head_dim)
290
+ .transpose(1, 2)
291
+ )
292
+ value_mv = rearrange(
293
+ value_mv,
294
+ "(b nv) (ih iw) h c -> b ih (nv iw) h c",
295
+ nv=self.num_views,
296
+ ih=height,
297
+ iw=width,
298
+ )
299
+ value_mv = (
300
+ value_mv.repeat_interleave(self.num_views, dim=0)
301
+ .view(batch_size * height, -1, attn.heads, head_dim)
302
+ .transpose(1, 2)
303
+ )
304
+
305
+ hidden_states_mv = F.scaled_dot_product_attention(
306
+ query_mv,
307
+ key_mv,
308
+ value_mv,
309
+ dropout_p=0.0,
310
+ is_causal=False,
311
+ )
312
+ hidden_states_mv = rearrange(
313
+ hidden_states_mv,
314
+ "(b nv ih) h iw c -> (b nv) (ih iw) (h c)",
315
+ nv=self.num_views,
316
+ ih=height,
317
+ )
318
+ hidden_states_mv = hidden_states_mv.to(query.dtype)
319
+
320
+ # linear proj
321
+ hidden_states_mv = self.to_out_mv[0](hidden_states_mv)
322
+ # dropout
323
+ hidden_states_mv = self.to_out_mv[1](hidden_states_mv)
324
+
325
+ if use_ref:
326
+ reference_hidden_states = ref_hidden_states[self.name]
327
+
328
+ key_ref = self.to_k_ref(reference_hidden_states)
329
+ value_ref = self.to_v_ref(reference_hidden_states)
330
+
331
+ query_ref = query_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
332
+ 1, 2
333
+ )
334
+ key_ref = key_ref.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
335
+ value_ref = value_ref.view(batch_size, -1, attn.heads, head_dim).transpose(
336
+ 1, 2
337
+ )
338
+
339
+ hidden_states_ref = F.scaled_dot_product_attention(
340
+ query_ref, key_ref, value_ref, dropout_p=0.0, is_causal=False
341
+ )
342
+
343
+ hidden_states_ref = hidden_states_ref.transpose(1, 2).reshape(
344
+ batch_size, -1, attn.heads * head_dim
345
+ )
346
+ hidden_states_ref = hidden_states_ref.to(query.dtype)
347
+
348
+ # linear proj
349
+ hidden_states_ref = self.to_out_ref[0](hidden_states_ref)
350
+ # dropout
351
+ hidden_states_ref = self.to_out_ref[1](hidden_states_ref)
352
+
353
+ # linear proj
354
+ hidden_states = attn.to_out[0](hidden_states)
355
+ # dropout
356
+ hidden_states = attn.to_out[1](hidden_states)
357
+
358
+ if use_mv:
359
+ hidden_states = hidden_states + hidden_states_mv * mv_scale
360
+
361
+ if use_ref:
362
+ hidden_states = hidden_states + hidden_states_ref * ref_scale
363
+
364
+ if input_ndim == 4:
365
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
366
+ batch_size, channel, height, width
367
+ )
368
+
369
+ if attn.residual_connection:
370
+ hidden_states = hidden_states + residual
371
+
372
+ hidden_states = hidden_states / attn.rescale_output_factor
373
+
374
+ return hidden_states
375
+
376
+ def set_num_views(self, num_views: int) -> None:
377
+ self.num_views = num_views
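A hedged sketch of wiring `DecoupledMVRowSelfAttnProcessor2_0` into an SDXL UNet with `set_unet_2d_condition_attn_processor`: self-attention layers receive the decoupled multi-view/reference processor while cross-attention layers keep their original processors. In the repository this happens inside the pipelines' `_init_custom_adapter`; the standalone version below is for illustration:

    import torch
    from diffusers import UNet2DConditionModel

    from mvadapter.models.attention_processor import (
        DecoupledMVRowSelfAttnProcessor2_0,
        set_unet_2d_condition_attn_processor,
    )

    unet = UNet2DConditionModel.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float16
    )

    set_unet_2d_condition_attn_processor(
        unet,
        # install the decoupled processor on self-attention layers
        set_self_attn_proc_func=lambda name, hidden_size, cross_attention_dim, ori_proc: (
            DecoupledMVRowSelfAttnProcessor2_0(
                query_dim=hidden_size,
                inner_dim=hidden_size,
                num_views=6,
                name=name,
                use_mv=True,
                use_ref=True,
            )
        ),
        # leave cross-attention untouched by returning the original processor
        set_cross_attn_proc_func=lambda name, hidden_size, cross_attention_dim, ori_proc: ori_proc,
    )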
comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sd.cpython-312.pyc ADDED
Binary file (30 kB). View file
 
comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_i2mv_sdxl.cpython-312.pyc ADDED
Binary file (32.6 kB). View file
 
comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sd.cpython-312.pyc ADDED
Binary file (24.9 kB). View file
 
comfyui-mvadapter/mvadapter/pipelines/__pycache__/pipeline_mvadapter_t2mv_sdxl.cpython-312.pyc ADDED
Binary file (34.8 kB). View file
 
comfyui-mvadapter/mvadapter/pipelines/pipeline_mvadapter_i2mv_sdxl.py ADDED
@@ -0,0 +1,903 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL
20
+ import torch
21
+ import torch.nn as nn
22
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
23
+ from diffusers.models import (
24
+ AutoencoderKL,
25
+ ImageProjection,
26
+ T2IAdapter,
27
+ UNet2DConditionModel,
28
+ )
29
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
30
+ StableDiffusionXLPipelineOutput,
31
+ )
32
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
33
+ StableDiffusionXLPipeline,
34
+ rescale_noise_cfg,
35
+ retrieve_timesteps,
36
+ )
37
+ from diffusers.schedulers import KarrasDiffusionSchedulers
38
+ from diffusers.utils import deprecate, logging
39
+ from diffusers.utils.torch_utils import randn_tensor
40
+ from einops import rearrange
41
+ from transformers import (
42
+ CLIPImageProcessor,
43
+ CLIPTextModel,
44
+ CLIPTextModelWithProjection,
45
+ CLIPTokenizer,
46
+ CLIPVisionModelWithProjection,
47
+ )
48
+
49
+ from ..loaders import CustomAdapterMixin
50
+ from ..models.attention_processor import (
51
+ DecoupledMVRowSelfAttnProcessor2_0,
52
+ set_unet_2d_condition_attn_processor,
53
+ )
54
+
55
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
56
+
57
+
58
+ def retrieve_latents(
59
+ encoder_output: torch.Tensor,
60
+ generator: Optional[torch.Generator] = None,
61
+ sample_mode: str = "sample",
62
+ ):
63
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
64
+ return encoder_output.latent_dist.sample(generator)
65
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
66
+ return encoder_output.latent_dist.mode()
67
+ elif hasattr(encoder_output, "latents"):
68
+ return encoder_output.latents
69
+ else:
70
+ raise AttributeError("Could not access latents of provided encoder_output")
71
+
72
+
73
+ class MVAdapterI2MVSDXLPipeline(StableDiffusionXLPipeline, CustomAdapterMixin):
74
+ def __init__(
75
+ self,
76
+ vae: AutoencoderKL,
77
+ text_encoder: CLIPTextModel,
78
+ text_encoder_2: CLIPTextModelWithProjection,
79
+ tokenizer: CLIPTokenizer,
80
+ tokenizer_2: CLIPTokenizer,
81
+ unet: UNet2DConditionModel,
82
+ scheduler: KarrasDiffusionSchedulers,
83
+ image_encoder: CLIPVisionModelWithProjection = None,
84
+ feature_extractor: CLIPImageProcessor = None,
85
+ force_zeros_for_empty_prompt: bool = True,
86
+ add_watermarker: Optional[bool] = None,
87
+ ):
88
+ super().__init__(
89
+ vae=vae,
90
+ text_encoder=text_encoder,
91
+ text_encoder_2=text_encoder_2,
92
+ tokenizer=tokenizer,
93
+ tokenizer_2=tokenizer_2,
94
+ unet=unet,
95
+ scheduler=scheduler,
96
+ image_encoder=image_encoder,
97
+ feature_extractor=feature_extractor,
98
+ force_zeros_for_empty_prompt=force_zeros_for_empty_prompt,
99
+ add_watermarker=add_watermarker,
100
+ )
101
+
102
+ self.control_image_processor = VaeImageProcessor(
103
+ vae_scale_factor=self.vae_scale_factor,
104
+ do_convert_rgb=True,
105
+ do_normalize=False,
106
+ )
107
+
108
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.prepare_latents
109
+ def prepare_image_latents(
110
+ self,
111
+ image,
112
+ timestep,
113
+ batch_size,
114
+ num_images_per_prompt,
115
+ dtype,
116
+ device,
117
+ generator=None,
118
+ add_noise=True,
119
+ ):
120
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
121
+ raise ValueError(
122
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
123
+ )
124
+
125
+ latents_mean = latents_std = None
126
+ if (
127
+ hasattr(self.vae.config, "latents_mean")
128
+ and self.vae.config.latents_mean is not None
129
+ ):
130
+ latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
131
+ if (
132
+ hasattr(self.vae.config, "latents_std")
133
+ and self.vae.config.latents_std is not None
134
+ ):
135
+ latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
136
+
137
+ # Offload text encoder if `enable_model_cpu_offload` was enabled
138
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
139
+ self.text_encoder_2.to("cpu")
140
+ torch.cuda.empty_cache()
141
+
142
+ image = image.to(device=device, dtype=dtype)
143
+
144
+ batch_size = batch_size * num_images_per_prompt
145
+
146
+ if image.shape[1] == 4:
147
+ init_latents = image
148
+
149
+ else:
150
+ # make sure the VAE is in float32 mode, as it overflows in float16
151
+ if self.vae.config.force_upcast:
152
+ image = image.float()
153
+ self.vae.to(dtype=torch.float32)
154
+
155
+ if isinstance(generator, list) and len(generator) != batch_size:
156
+ raise ValueError(
157
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
158
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
159
+ )
160
+
161
+ elif isinstance(generator, list):
162
+ if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
163
+ image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
164
+ elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
165
+ raise ValueError(
166
+ f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
167
+ )
168
+
169
+ init_latents = [
170
+ retrieve_latents(
171
+ self.vae.encode(image[i : i + 1]), generator=generator[i]
172
+ )
173
+ for i in range(batch_size)
174
+ ]
175
+ init_latents = torch.cat(init_latents, dim=0)
176
+ else:
177
+ init_latents = retrieve_latents(
178
+ self.vae.encode(image), generator=generator
179
+ )
180
+
181
+ if self.vae.config.force_upcast:
182
+ self.vae.to(dtype)
183
+
184
+ init_latents = init_latents.to(dtype)
185
+ if latents_mean is not None and latents_std is not None:
186
+ latents_mean = latents_mean.to(device=device, dtype=dtype)
187
+ latents_std = latents_std.to(device=device, dtype=dtype)
188
+ init_latents = (
189
+ (init_latents - latents_mean)
190
+ * self.vae.config.scaling_factor
191
+ / latents_std
192
+ )
193
+ else:
194
+ init_latents = self.vae.config.scaling_factor * init_latents
195
+
196
+ if (
197
+ batch_size > init_latents.shape[0]
198
+ and batch_size % init_latents.shape[0] == 0
199
+ ):
200
+ # expand init_latents for batch_size
201
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
202
+ init_latents = torch.cat(
203
+ [init_latents] * additional_image_per_prompt, dim=0
204
+ )
205
+ elif (
206
+ batch_size > init_latents.shape[0]
207
+ and batch_size % init_latents.shape[0] != 0
208
+ ):
209
+ raise ValueError(
210
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
211
+ )
212
+ else:
213
+ init_latents = torch.cat([init_latents], dim=0)
214
+
215
+ if add_noise:
216
+ shape = init_latents.shape
217
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
218
+ # get latents
219
+ init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
220
+
221
+ latents = init_latents
222
+
223
+ return latents
224
+
225
+ def prepare_control_image(
226
+ self,
227
+ image,
228
+ width,
229
+ height,
230
+ batch_size,
231
+ num_images_per_prompt,
232
+ device,
233
+ dtype,
234
+ do_classifier_free_guidance=False,
235
+ num_empty_images=0, # for concat in batch like ImageDream
236
+ ):
237
+ """
238
+ Accepts either:
239
+ - regular RGB-like images -> preprocess via VaeImageProcessor, or
240
+ - native 6-channel Plücker tensors (B,6,H,W) or (6,H,W) -> pass through without normalization
241
+ """
242
+ assert hasattr(
243
+ self, "control_image_processor"
244
+ ), "control_image_processor is not initialized"
245
+
246
+ # Fast path: native 6-channel tensor
247
+ if isinstance(image, torch.Tensor):
248
+ if image.dim() == 3 and image.shape[0] == 6:
249
+ image = image.unsqueeze(0) # (1,6,H,W)
250
+ if image.dim() == 4 and image.shape[1] == 6:
251
+ ctrl = image.to(device=device, dtype=torch.float32)
252
+ if num_empty_images > 0:
253
+ ctrl = torch.cat([ctrl, torch.zeros_like(ctrl[:num_empty_images])], dim=0)
254
+
255
+ image_batch_size = ctrl.shape[0]
256
+ repeat_by = batch_size if image_batch_size == 1 else num_images_per_prompt # always 1 per control
257
+ ctrl = ctrl.repeat_interleave(repeat_by, dim=0)
258
+ ctrl = ctrl.to(device=device, dtype=dtype)
259
+
260
+ if do_classifier_free_guidance:
261
+ ctrl = torch.cat([ctrl] * 2)
262
+
263
+ return ctrl
264
+
265
+ # Fallback: treat as regular image(s)
266
+ image = self.control_image_processor.preprocess(
267
+ image, height=height, width=width
268
+ ).to(dtype=torch.float32)
269
+
270
+ if num_empty_images > 0:
271
+ image = torch.cat(
272
+ [image, torch.zeros_like(image[:num_empty_images])], dim=0
273
+ )
274
+
275
+ image_batch_size = image.shape[0]
276
+
277
+ if image_batch_size == 1:
278
+ repeat_by = batch_size
279
+ else:
280
+ # image batch size is the same as prompt batch size
281
+ repeat_by = num_images_per_prompt # always 1 for control image
282
+
283
+ image = image.repeat_interleave(repeat_by, dim=0)
284
+
285
+ image = image.to(device=device, dtype=dtype)
286
+
287
+ if do_classifier_free_guidance:
288
+ image = torch.cat([image] * 2)
289
+
290
+ return image
291
+
292
+ @torch.no_grad()
293
+ def __call__(
294
+ self,
295
+ prompt: Union[str, List[str]] = None,
296
+ prompt_2: Optional[Union[str, List[str]]] = None,
297
+ # --- Task 1: NEW (reference-only prompt used only for the ref cache pass) ---
298
+ reference_prompt: Optional[Union[str, List[str]]] = None,
299
+ reference_prompt_2: Optional[Union[str, List[str]]] = None,
300
+ # -----------------------------------------------------------------------------
301
+ height: Optional[int] = None,
302
+ width: Optional[int] = None,
303
+ num_inference_steps: int = 50,
304
+ timesteps: List[int] = None,
305
+ denoising_end: Optional[float] = None,
306
+ guidance_scale: float = 5.0,
307
+ negative_prompt: Optional[Union[str, List[str]]] = None,
308
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
309
+ num_images_per_prompt: Optional[int] = 1,
310
+ eta: float = 0.0,
311
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
312
+ latents: Optional[torch.FloatTensor] = None,
313
+ prompt_embeds: Optional[torch.FloatTensor] = None,
314
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
315
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
316
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
317
+ ip_adapter_image: Optional[PipelineImageInput] = None,
318
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
319
+ output_type: Optional[str] = "pil",
320
+ return_dict: bool = True,
321
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
322
+ guidance_rescale: float = 0.0,
323
+ original_size: Optional[Tuple[int, int]] = None,
324
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
325
+ target_size: Optional[Tuple[int, int]] = None,
326
+ negative_original_size: Optional[Tuple[int, int]] = None,
327
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
328
+ negative_target_size: Optional[Tuple[int, int]] = None,
329
+ clip_skip: Optional[int] = None,
330
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
331
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
332
+ # NEW
333
+ mv_scale: float = 1.0,
334
+ # Camera or geometry condition
335
+ control_image: Optional[PipelineImageInput] = None,
336
+ control_conditioning_scale: Optional[float] = 1.0,
337
+ control_conditioning_factor: float = 1.0,
338
+ # Image condition
339
+ reference_image: Optional[PipelineImageInput] = None,
340
+ reference_conditioning_scale: Optional[float] = 1.0,
341
+ **kwargs,
342
+ ):
343
+ r"""
344
+ Function invoked when calling the pipeline for generation.
345
+
346
+ Args:
347
+ prompt (`str` or `List[str]`, *optional*):
348
+ The main prompt(s) for generation.
349
+ prompt_2 (`str` or `List[str]`, *optional*):
350
+ Prompt(s) for the second text encoder. Falls back to `prompt` if None.
351
+ reference_prompt (`str` or `List[str]`, *optional*):
352
+ Prompt used **only** during the one-shot reference UNet pass that caches identity features
353
+ from `reference_image`. If None or empty, falls back to the positive branch of the main prompt
354
+ (original behavior).
355
+ reference_prompt_2 (`str` or `List[str]`, *optional*):
356
+ Second-encoder counterpart for `reference_prompt`.
357
+ ... (other arguments unchanged) ...
358
+ """
359
+
360
+ callback = kwargs.pop("callback", None)
361
+ callback_steps = kwargs.pop("callback_steps", None)
362
+
363
+ if callback is not None:
364
+ deprecate(
365
+ "callback",
366
+ "1.0.0",
367
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
368
+ )
369
+ if callback_steps is not None:
370
+ deprecate(
371
+ "callback_steps",
372
+ "1.0.0",
373
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
374
+ )
375
+
376
+ # 0. Default height and width to unet
377
+ height = height or self.default_sample_size * self.vae_scale_factor
378
+ width = width or self.default_sample_size * self.vae_scale_factor
379
+
380
+ original_size = original_size or (height, width)
381
+ target_size = target_size or (height, width)
382
+
383
+ # 1. Check inputs. Raise error if not correct
384
+ self.check_inputs(
385
+ prompt,
386
+ prompt_2,
387
+ height,
388
+ width,
389
+ callback_steps,
390
+ negative_prompt,
391
+ negative_prompt_2,
392
+ prompt_embeds,
393
+ negative_prompt_embeds,
394
+ pooled_prompt_embeds,
395
+ negative_pooled_prompt_embeds,
396
+ ip_adapter_image,
397
+ ip_adapter_image_embeds,
398
+ callback_on_step_end_tensor_inputs,
399
+ )
400
+
401
+ self._guidance_scale = guidance_scale
402
+ self._guidance_rescale = guidance_rescale
403
+ self._clip_skip = clip_skip
404
+ self._cross_attention_kwargs = cross_attention_kwargs
405
+ self._denoising_end = denoising_end
406
+ self._interrupt = False
407
+
408
+ # 2. Define call parameters
409
+ if prompt is not None and isinstance(prompt, str):
410
+ batch_size = 1
411
+ elif prompt is not None and isinstance(prompt, list):
412
+ batch_size = len(prompt)
413
+ else:
414
+ batch_size = prompt_embeds.shape[0]
415
+
416
+ device = self._execution_device
417
+
418
+ # 3. Encode input prompt
419
+ lora_scale = (
420
+ self.cross_attention_kwargs.get("scale", None)
421
+ if self.cross_attention_kwargs is not None
422
+ else None
423
+ )
424
+
425
+ (
426
+ prompt_embeds,
427
+ negative_prompt_embeds,
428
+ pooled_prompt_embeds,
429
+ negative_pooled_prompt_embeds,
430
+ ) = self.encode_prompt(
431
+ prompt=prompt,
432
+ prompt_2=prompt_2,
433
+ device=device,
434
+ num_images_per_prompt=num_images_per_prompt,
435
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
436
+ negative_prompt=negative_prompt,
437
+ negative_prompt_2=negative_prompt_2,
438
+ prompt_embeds=prompt_embeds,
439
+ negative_prompt_embeds=negative_prompt_embeds,
440
+ pooled_prompt_embeds=pooled_prompt_embeds,
441
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
442
+ lora_scale=lora_scale,
443
+ clip_skip=self.clip_skip,
444
+ )
445
+
446
+ # 4. Prepare timesteps
447
+ timesteps, num_inference_steps = retrieve_timesteps(
448
+ self.scheduler, num_inference_steps, device, timesteps
449
+ )
450
+
451
+ # 5. Prepare latent variables
452
+ num_channels_latents = self.unet.config.in_channels
453
+ latents = self.prepare_latents(
454
+ batch_size * num_images_per_prompt,
455
+ num_channels_latents,
456
+ height,
457
+ width,
458
+ prompt_embeds.dtype,
459
+ device,
460
+ generator,
461
+ latents,
462
+ )
463
+
464
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
465
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
466
+
467
+ # 7. Prepare added time ids & embeddings
468
+ add_text_embeds = pooled_prompt_embeds
469
+ if self.text_encoder_2 is None:
470
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
471
+ else:
472
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
473
+
474
+ add_time_ids = self._get_add_time_ids(
475
+ original_size,
476
+ crops_coords_top_left,
477
+ target_size,
478
+ dtype=prompt_embeds.dtype,
479
+ text_encoder_projection_dim=text_encoder_projection_dim,
480
+ )
481
+ if negative_original_size is not None and negative_target_size is not None:
482
+ negative_add_time_ids = self._get_add_time_ids(
483
+ negative_original_size,
484
+ negative_crops_coords_top_left,
485
+ negative_target_size,
486
+ dtype=prompt_embeds.dtype,
487
+ text_encoder_projection_dim=text_encoder_projection_dim,
488
+ )
489
+ else:
490
+ negative_add_time_ids = add_time_ids
491
+
492
+ if self.do_classifier_free_guidance:
493
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
494
+ add_text_embeds = torch.cat(
495
+ [negative_pooled_prompt_embeds, add_text_embeds], dim=0
496
+ )
497
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
498
+
499
+ prompt_embeds = prompt_embeds.to(device)
500
+ add_text_embeds = add_text_embeds.to(device)
501
+ add_time_ids = add_time_ids.to(device).repeat(
502
+ batch_size * num_images_per_prompt, 1
503
+ )
504
+
505
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
506
+ image_embeds = self.prepare_ip_adapter_image_embeds(
507
+ ip_adapter_image,
508
+ ip_adapter_image_embeds,
509
+ device,
510
+ batch_size * num_images_per_prompt,
511
+ self.do_classifier_free_guidance,
512
+ )
513
+
514
+ # Preprocess reference image (required)
515
+ reference_image = self.image_processor.preprocess(reference_image)
516
+ reference_latents = self.prepare_image_latents(
517
+ reference_image,
518
+ timesteps[:1].repeat(batch_size * num_images_per_prompt),  # not used since add_noise=False
519
+ batch_size,
520
+ 1,
521
+ prompt_embeds.dtype,
522
+ device,
523
+ generator,
524
+ add_noise=False,
525
+ )
526
+
527
+ with torch.no_grad():
528
+ ref_timesteps = torch.zeros_like(timesteps[0])
529
+ ref_hidden_states = {}
530
+
531
+ # reference-only prompt support (Task 1)
532
+ def _first_or_none(x):
533
+ if x is None:
534
+ return None
535
+ if isinstance(x, list) and len(x) > 0:
536
+ return x[0]
537
+ return x
538
+
539
+ rp = _first_or_none(reference_prompt)
540
+ rp2 = _first_or_none(reference_prompt_2)
541
+ have_ref_prompt = (rp is not None and str(rp).strip() != "") or (
542
+ rp2 is not None and str(rp2).strip() != ""
543
+ )
544
+
545
+ if have_ref_prompt:
546
+ ref_prompt_embeds, _, ref_pooled_prompt_embeds, _ = self.encode_prompt(
547
+ prompt=rp or prompt,
548
+ prompt_2=rp2 or prompt_2,
549
+ device=device,
550
+ num_images_per_prompt=1,
551
+ do_classifier_free_guidance=False,
552
+ prompt_embeds=None,
553
+ negative_prompt_embeds=None,
554
+ pooled_prompt_embeds=None,
555
+ negative_pooled_prompt_embeds=None,
556
+ lora_scale=(
557
+ self.cross_attention_kwargs.get("scale", None)
558
+ if self.cross_attention_kwargs is not None
559
+ else None
560
+ ),
561
+ clip_skip=self.clip_skip,
562
+ )
563
+ else:
564
+ if self.do_classifier_free_guidance:
565
+ ref_prompt_embeds = prompt_embeds[-1:].clone()
566
+ ref_pooled_prompt_embeds = add_text_embeds[-1:].clone()
567
+ else:
568
+ ref_prompt_embeds = prompt_embeds[:1].clone()
569
+ ref_pooled_prompt_embeds = add_text_embeds[:1].clone()
570
+
571
+ self.unet(
572
+ reference_latents,
573
+ ref_timesteps,
574
+ encoder_hidden_states=ref_prompt_embeds,
575
+ added_cond_kwargs={
576
+ "text_embeds": ref_pooled_prompt_embeds,
577
+ "time_ids": add_time_ids[-1:],
578
+ },
579
+ cross_attention_kwargs={
580
+ "cache_hidden_states": ref_hidden_states,
581
+ "use_mv": False,
582
+ "use_ref": False,
583
+ },
584
+ return_dict=False,
585
+ )
586
+ ref_hidden_states = {
587
+ k: v.repeat_interleave(num_images_per_prompt, dim=0)
588
+ for k, v in ref_hidden_states.items()
589
+ }
590
+ if self.do_classifier_free_guidance:
591
+ ref_hidden_states = {
592
+ k: torch.cat([torch.zeros_like(v), v], dim=0)
593
+ for k, v in ref_hidden_states.items()
594
+ }
595
+
596
+ cross_attention_kwargs = {
597
+ "mv_scale": mv_scale,
598
+ "ref_hidden_states": ref_hidden_states,
599
+ "ref_scale": reference_conditioning_scale,
600
+ **(self.cross_attention_kwargs or {}),
601
+ }
602
+
603
+ # ------------- control image (Task 2 supports 6ch pass-through) -------------
604
+ control_image_feature = self.prepare_control_image(
605
+ image=control_image,
606
+ width=width,
607
+ height=height,
608
+ batch_size=batch_size * num_images_per_prompt,
609
+ num_images_per_prompt=1, # NOTE: always 1 for control images
610
+ device=device,
611
+ dtype=latents.dtype,
612
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
613
+ ).to(device=device, dtype=latents.dtype)
614
+
615
+ adapter_state = self.cond_encoder(control_image_feature)
616
+ for i, state in enumerate(adapter_state):
617
+ adapter_state[i] = state * control_conditioning_scale
618
+ # ---------------------------------------------------------------------------
619
+
620
+ # 8. Denoising loop
621
+ num_warmup_steps = max(
622
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
623
+ )
624
+
625
+ # 8.1 Apply denoising_end
626
+ if (
627
+ self.denoising_end is not None
628
+ and isinstance(self.denoising_end, float)
629
+ and self.denoising_end > 0
630
+ and self.denoising_end < 1
631
+ ):
632
+ discrete_timestep_cutoff = int(
633
+ round(
634
+ self.scheduler.config.num_train_timesteps
635
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
636
+ )
637
+ )
638
+ num_inference_steps = len(
639
+ list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))
640
+ )
641
+ timesteps = timesteps[:num_inference_steps]
642
+
643
+ # 9. Optionally get Guidance Scale Embedding
644
+ timestep_cond = None
645
+ if self.unet.config.time_cond_proj_dim is not None:
646
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
647
+ batch_size * num_images_per_prompt
648
+ )
649
+ timestep_cond = self.get_guidance_scale_embedding(
650
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
651
+ ).to(device=device, dtype=latents.dtype)
652
+
653
+ self._num_timesteps = len(timesteps)
654
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
655
+ for i, t in enumerate(timesteps):
656
+ if self.interrupt:
657
+ continue
658
+
659
+ # expand the latents if we are doing classifier free guidance
660
+ latent_model_input = (
661
+ torch.cat([latents] * 2)
662
+ if self.do_classifier_free_guidance
663
+ else latents
664
+ )
665
+
666
+ latent_model_input = self.scheduler.scale_model_input(
667
+ latent_model_input, t
668
+ )
669
+
670
+ added_cond_kwargs = {
671
+ "text_embeds": add_text_embeds,
672
+ "time_ids": add_time_ids,
673
+ }
674
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
675
+ added_cond_kwargs["image_embeds"] = image_embeds
676
+
677
+ if i < int(num_inference_steps * control_conditioning_factor):
678
+ down_intrablock_additional_residuals = [
679
+ state.clone() for state in adapter_state
680
+ ]
681
+ else:
682
+ down_intrablock_additional_residuals = None
683
+
684
+ # predict the noise residual
685
+ noise_pred = self.unet(
686
+ latent_model_input,
687
+ t,
688
+ encoder_hidden_states=prompt_embeds,
689
+ timestep_cond=timestep_cond,
690
+ cross_attention_kwargs=cross_attention_kwargs,
691
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
692
+ added_cond_kwargs=added_cond_kwargs,
693
+ return_dict=False,
694
+ )[0]
695
+
696
+ # perform guidance
697
+ if self.do_classifier_free_guidance:
698
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
699
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
700
+ noise_pred_text - noise_pred_uncond
701
+ )
702
+
703
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
704
+ noise_pred = rescale_noise_cfg(
705
+ noise_pred,
706
+ noise_pred_text,
707
+ guidance_rescale=self.guidance_rescale,
708
+ )
709
+
710
+ # compute the previous noisy sample x_t -> x_t-1
711
+ latents_dtype = latents.dtype
712
+ latents = self.scheduler.step(
713
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
714
+ )[0]
715
+ if latents.dtype != latents_dtype:
716
+ if torch.backends.mps.is_available():
717
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
718
+ latents = latents.to(latents_dtype)
719
+
720
+ if callback_on_step_end is not None:
721
+ callback_kwargs = {}
722
+ for k in callback_on_step_end_tensor_inputs:
723
+ callback_kwargs[k] = locals()[k]
724
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
725
+
726
+ latents = callback_outputs.pop("latents", latents)
727
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
728
+ negative_prompt_embeds = callback_outputs.pop(
729
+ "negative_prompt_embeds", negative_prompt_embeds
730
+ )
731
+ add_text_embeds = callback_outputs.pop(
732
+ "add_text_embeds", add_text_embeds
733
+ )
734
+ negative_pooled_prompt_embeds = callback_outputs.pop(
735
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
736
+ )
737
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
738
+ negative_add_time_ids = callback_outputs.pop(
739
+ "negative_add_time_ids", negative_add_time_ids
740
+ )
741
+
742
+ # call the callback, if provided
743
+ if i == len(timesteps) - 1 or (
744
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
745
+ ):
746
+ progress_bar.update()
747
+ if callback is not None and i % callback_steps == 0:
748
+ step_idx = i // getattr(self.scheduler, "order", 1)
749
+ callback(step_idx, t, latents)
750
+
751
+ if not output_type == "latent":
752
+ # make sure the VAE is in float32 mode, as it overflows in float16
753
+ needs_upcasting = (
754
+ self.vae.dtype == torch.float16 and self.vae.config.force_upcast
755
+ )
756
+
757
+ if needs_upcasting:
758
+ self.upcast_vae()
759
+ latents = latents.to(
760
+ next(iter(self.vae.post_quant_conv.parameters())).dtype
761
+ )
762
+ elif latents.dtype != self.vae.dtype:
763
+ if torch.backends.mps.is_available():
764
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
765
+ self.vae = self.vae.to(latents.dtype)
766
+
767
+ # unscale/denormalize the latents
768
+ # denormalize with the mean and std if available and not None
769
+ has_latents_mean = (
770
+ hasattr(self.vae.config, "latents_mean")
771
+ and self.vae.config.latents_mean is not None
772
+ )
773
+ has_latents_std = (
774
+ hasattr(self.vae.config, "latents_std")
775
+ and self.vae.config.latents_std is not None
776
+ )
777
+ if has_latents_mean and has_latents_std:
778
+ latents_mean = (
779
+ torch.tensor(self.vae.config.latents_mean)
780
+ .view(1, 4, 1, 1)
781
+ .to(latents.device, latents.dtype)
782
+ )
783
+ latents_std = (
784
+ torch.tensor(self.vae.config.latents_std)
785
+ .view(1, 4, 1, 1)
786
+ .to(latents.device, latents.dtype)
787
+ )
788
+ latents = (
789
+ latents * latents_std / self.vae.config.scaling_factor
790
+ + latents_mean
791
+ )
792
+ else:
793
+ latents = latents / self.vae.config.scaling_factor
794
+
795
+ image = self.vae.decode(latents, return_dict=False)[0]
796
+
797
+ # cast back to fp16 if needed
798
+ if needs_upcasting:
799
+ self.vae.to(dtype=torch.float16)
800
+ else:
801
+ image = latents
802
+
803
+ if not output_type == "latent":
804
+ # apply watermark if available
805
+ if self.watermark is not None:
806
+ image = self.watermark.apply_watermark(image)
807
+
808
+ image = self.image_processor.postprocess(image, output_type=output_type)
809
+
810
+ # Offload all models
811
+ self.maybe_free_model_hooks()
812
+
813
+ if not return_dict:
814
+ return (image,)
815
+
816
+ return StableDiffusionXLPipelineOutput(images=image)
817
+
818
+ ### NEW: adapters ###
819
+ def _init_custom_adapter(
820
+ self,
821
+ # Multi-view adapter
822
+ num_views: int = 1,
823
+ self_attn_processor: Any = DecoupledMVRowSelfAttnProcessor2_0,
824
+ # Condition encoder
825
+ cond_in_channels: int = 6,
826
+ # For training
827
+ copy_attn_weights: bool = True,
828
+ zero_init_module_keys: List[str] = [],
829
+ ):
830
+ # Condition encoder
831
+ self.cond_encoder = T2IAdapter(
832
+ in_channels=cond_in_channels,
833
+ channels=(320, 640, 1280, 1280),
834
+ num_res_blocks=2,
835
+ downscale_factor=16,
836
+ adapter_type="full_adapter_xl",
837
+ )
838
+
839
+ # set custom attn processor for multi-view attention and image cross-attention
840
+ self.unet: UNet2DConditionModel
841
+ set_unet_2d_condition_attn_processor(
842
+ self.unet,
843
+ set_self_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
844
+ query_dim=hs,
845
+ inner_dim=hs,
846
+ num_views=num_views,
847
+ name=name,
848
+ use_mv=True,
849
+ use_ref=True,
850
+ ),
851
+ set_cross_attn_proc_func=lambda name, hs, cad, ap: self_attn_processor(
852
+ query_dim=hs,
853
+ inner_dim=hs,
854
+ num_views=num_views,
855
+ name=name,
856
+ use_mv=False,
857
+ use_ref=False,
858
+ ),
859
+ )
860
+
861
+ # copy decoupled attention weights from original unet
862
+ if copy_attn_weights:
863
+ state_dict = self.unet.state_dict()
864
+ for key in state_dict.keys():
865
+ if "_mv" in key:
866
+ compatible_key = key.replace("_mv", "").replace("processor.", "")
867
+ elif "_ref" in key:
868
+ compatible_key = key.replace("_ref", "").replace("processor.", "")
869
+ else:
870
+ compatible_key = key
871
+
872
+ is_zero_init_key = any([k in key for k in zero_init_module_keys])
873
+ if is_zero_init_key:
874
+ state_dict[key] = torch.zeros_like(state_dict[compatible_key])
875
+ else:
876
+ state_dict[key] = state_dict[compatible_key].clone()
877
+ self.unet.load_state_dict(state_dict)
878
+
879
+ def _load_custom_adapter(self, state_dict):
880
+ self.unet.load_state_dict(state_dict, strict=False)
881
+ self.cond_encoder.load_state_dict(state_dict, strict=False)
882
+
883
+ def _save_custom_adapter(
884
+ self,
885
+ include_keys: Optional[List[str]] = None,
886
+ exclude_keys: Optional[List[str]] = None,
887
+ ):
888
+ def include_fn(k):
889
+ is_included = False
890
+
891
+ if include_keys is not None:
892
+ is_included = is_included or any([key in k for key in include_keys])
893
+ if exclude_keys is not None:
894
+ is_included = is_included and not any(
895
+ [key in k for key in exclude_keys]
896
+ )
897
+
898
+ return is_included
899
+
900
+ state_dict = {k: v for k, v in self.unet.state_dict().items() if include_fn(k)}
901
+ state_dict.update(self.cond_encoder.state_dict())
902
+
903
+ return state_dict
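An end-to-end usage sketch for `MVAdapterI2MVSDXLPipeline`. The adapter repo id and weight name are placeholders taken from the upstream MV-Adapter release and may differ, and the zero tensor stands in for the real per-view camera (Plücker) maps that the ComfyUI nodes normally supply as `control_image`:

    import torch
    from PIL import Image

    from mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline

    pipe = MVAdapterI2MVSDXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    )
    pipe.init_custom_adapter(num_views=6)
    pipe.load_custom_adapter("huanngzh/mv-adapter", weight_name="mvadapter_i2mv_sdxl.safetensors")  # placeholder ids
    pipe.to("cuda", torch.float16)
    pipe.cond_encoder.to("cuda", torch.float16)  # the T2IAdapter is not registered with the pipeline

    reference = Image.open("object.png").convert("RGB").resize((768, 768))
    plucker = torch.zeros(6, 6, 768, 768)  # stand-in for 6 views x 6-channel camera embeddings

    images = pipe(
        prompt="high quality, best quality",
        negative_prompt="watermark, blurry, worst quality",
        reference_image=reference,
        control_image=plucker,
        num_images_per_prompt=6,   # one latent per view
        num_inference_steps=30,
        guidance_scale=3.0,
        mv_scale=1.0,
        reference_conditioning_scale=1.0,
        height=768,
        width=768,
    ).images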
comfyui-mvadapter/mvadapter/schedulers/ShiftSNRSchedulerKarras.py ADDED
@@ -0,0 +1,120 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+
5
+ from .scheduler_utils import SNR_to_betas, compute_snr
6
+
7
+
8
+ class ShiftSNRSchedulerKarras:
9
+ """
10
+ Wraps a Diffusers scheduler to apply SNR shifting to its noise schedule and
11
+ rebuilds a DPMSolverMultistepScheduler that supports Karras sigmas.
12
+
13
+ Usage:
14
+ new_sched = ShiftSNRSchedulerKarras.from_scheduler(
15
+ noise_scheduler=base_sched,
16
+ shift_mode="interpolated", # or "default"
17
+ shift_scale=8.0,
18
+ scheduler_class=DPMSolverMultistepScheduler, # usually this
19
+ )
20
+ """
21
+
22
+ # Supported modes for how the SNR shift is applied
23
+ SHIFT_MODES = ["default", "interpolated"]
24
+
25
+ def __init__(
26
+ self,
27
+ noise_scheduler: Any,
28
+ timesteps: Any,
29
+ shift_scale: float,
30
+ scheduler_class: Any,
31
+ ):
32
+ # original scheduler (used only as a reference/config source)
33
+ self.noise_scheduler = noise_scheduler
34
+ # tensor of timesteps to compute SNR/betas on
35
+ self.timesteps = timesteps
36
+ # scale by which to divide the SNR (e.g., 8.0)
37
+ self.shift_scale = shift_scale
38
+ # the scheduler class to construct for output (e.g., DPMSolverMultistepScheduler)
39
+ self.scheduler_class = scheduler_class
40
+
41
+ def _get_shift_scheduler(self):
42
+ """
43
+ Apply a uniform SNR shift: snr' = snr / shift_scale
44
+ Then convert to betas and rebuild the scheduler with Karras enabled.
45
+ """
46
+ snr = compute_snr(self.timesteps, self.noise_scheduler)
47
+ shifted_betas = SNR_to_betas(snr / self.shift_scale)
48
+
49
+ return self.scheduler_class.from_config(
50
+ self.noise_scheduler.config,
51
+ trained_betas=shifted_betas.numpy(),
52
+ # Enable Karras sigmas in the rebuilt scheduler
53
+ algorithm_type="dpmsolver++",
54
+ use_karras_sigmas=True,
55
+ )
56
+
57
+ def _get_interpolated_shift_scheduler(self):
58
+ """
59
+ Interpolate SNR in log-space between the original and the shifted SNR
60
+ as timesteps progress. This tends to preserve early behavior and
61
+ gradually apply the shift later in the schedule.
62
+ """
63
+ snr = compute_snr(self.timesteps, self.noise_scheduler)
64
+ shifted_snr = snr / self.shift_scale
65
+
66
+ # Interpolate in log-space from original -> shifted across timesteps
67
+ weighting = self.timesteps.float() / (
68
+ self.noise_scheduler.config.num_train_timesteps - 1
69
+ )
70
+ interpolated_snr = torch.exp(
71
+ torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting
72
+ )
73
+
74
+ shifted_betas = SNR_to_betas(interpolated_snr)
75
+
76
+ return self.scheduler_class.from_config(
77
+ self.noise_scheduler.config,
78
+ trained_betas=shifted_betas.numpy(),
79
+ # Enable Karras sigmas in the rebuilt scheduler
80
+ algorithm_type="dpmsolver++",
81
+ use_karras_sigmas=True,
82
+ )
83
+
84
+ @classmethod
85
+ def from_scheduler(
86
+ cls,
87
+ noise_scheduler: Any,
88
+ shift_mode: str = "default",
89
+ timesteps: Any = None,
90
+ shift_scale: float = 1.0,
91
+ scheduler_class: Any = None,
92
+ ):
93
+ """
94
+ Factory that returns a NEW scheduler instance with the shifted betas applied.
95
+
96
+ Args:
97
+ noise_scheduler: the original Diffusers scheduler (used for config & base betas)
98
+ shift_mode: "default" or "interpolated"
99
+ timesteps: tensor of timesteps to evaluate SNR on; if None, uses full training range
100
+ shift_scale: divide SNR by this value (e.g., 8.0)
101
+ scheduler_class: class to construct for the output scheduler (defaults to original class)
102
+ """
103
+ if timesteps is None:
104
+ timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps)
105
+ if scheduler_class is None:
106
+ scheduler_class = noise_scheduler.__class__
107
+
108
+ wrapper = cls(
109
+ noise_scheduler=noise_scheduler,
110
+ timesteps=timesteps,
111
+ shift_scale=shift_scale,
112
+ scheduler_class=scheduler_class,
113
+ )
114
+
115
+ if shift_mode == "default":
116
+ return wrapper._get_shift_scheduler()
117
+ elif shift_mode == "interpolated":
118
+ return wrapper._get_interpolated_shift_scheduler()
119
+ else:
120
+ raise ValueError(f"Unknown shift_mode: {shift_mode}")
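A short usage sketch mirroring the docstring above, built on a stand-alone DDPM base schedule with SD-style scaled-linear betas (shift_scale=8.0 is just the value the docstring suggests):

    from diffusers import DDPMScheduler, DPMSolverMultistepScheduler

    from mvadapter.schedulers.ShiftSNRSchedulerKarras import ShiftSNRSchedulerKarras

    base = DDPMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
    )

    shifted = ShiftSNRSchedulerKarras.from_scheduler(
        noise_scheduler=base,
        shift_mode="interpolated",
        shift_scale=8.0,
        scheduler_class=DPMSolverMultistepScheduler,
    )
    # pipe.scheduler = shifted  # swap into an already-loaded pipeline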
comfyui-mvadapter/mvadapter/schedulers/__pycache__/ShiftSNRSchedulerKarras.cpython-312.pyc ADDED
Binary file (4.86 kB). View file
 
comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduler_utils.cpython-312.pyc ADDED
Binary file (3.78 kB). View file
 
comfyui-mvadapter/mvadapter/schedulers/__pycache__/scheduling_shift_snr.cpython-312.pyc ADDED
Binary file (5.9 kB). View file
 
comfyui-mvadapter/mvadapter/schedulers/scheduler_utils.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+
3
+
4
+ def get_sigmas(noise_scheduler, timesteps, n_dim=4, dtype=torch.float32, device=None):
5
+ sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
6
+ schedule_timesteps = noise_scheduler.timesteps.to(device)
7
+ timesteps = timesteps.to(device)
8
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
9
+ sigma = sigmas[step_indices].flatten()
10
+ while len(sigma.shape) < n_dim:
11
+ sigma = sigma.unsqueeze(-1)
12
+ return sigma
13
+
14
+
15
+ def SNR_to_betas(snr):
16
+ """
17
+ Converts SNR to betas
18
+ """
19
+ # alphas_cumprod = pass
20
+ # snr = (alpha / ) ** 2
21
+ # alpha_t^2 / (1 - alpha_t^2) = snr
22
+ alpha_t = (snr / (1 + snr)) ** 0.5
23
+ alphas_cumprod = alpha_t**2
24
+ alphas = alphas_cumprod / torch.cat(
25
+ [torch.ones(1, device=snr.device), alphas_cumprod[:-1]]
26
+ )
27
+ betas = 1 - alphas
28
+ return betas
29
+
30
+
+ def compute_snr(timesteps, noise_scheduler):
+     """
+     Computes SNR as per Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
+     """
+     alphas_cumprod = noise_scheduler.alphas_cumprod
+     sqrt_alphas_cumprod = alphas_cumprod**0.5
+     sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+     # Expand the tensors.
+     # Adapted from Min-SNR-Diffusion-Training/guided_diffusion/gaussian_diffusion.py at 521b624bd70c67cee4bdf49225915f5
+     sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
+         timesteps
+     ].float()
+     while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+         sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+     alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+     sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
+         device=timesteps.device
+     )[timesteps].float()
+     while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+         sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+     sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+     # Compute SNR.
+     snr = (alpha / sigma) ** 2
+     return snr
+
+
+ def compute_alpha(timesteps, noise_scheduler):
+     alphas_cumprod = noise_scheduler.alphas_cumprod
+     sqrt_alphas_cumprod = alphas_cumprod**0.5
+     sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
+         timesteps
+     ].float()
+     while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+         sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+     alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+     return alpha
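To make the SNR/beta relationship used above concrete, here is a small sanity-check sketch. It is not part of the module; the stock DDPMScheduler configuration and the tolerance are illustrative assumptions.

import torch
from diffusers import DDPMScheduler
from mvadapter.schedulers.scheduler_utils import SNR_to_betas, compute_snr

scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.arange(0, scheduler.config.num_train_timesteps)

# compute_snr gives SNR_t = alphas_cumprod_t / (1 - alphas_cumprod_t).
snr = compute_snr(timesteps, scheduler)

# Converting the unshifted SNR back should approximately recover the
# scheduler's own beta schedule.
betas = SNR_to_betas(snr)
assert torch.allclose(betas.float(), scheduler.betas, atol=1e-4)

# Dividing the SNR by a scale > 1 produces a noisier (lower-SNR) schedule.
shifted_betas = SNR_to_betas(snr / 8.0)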
comfyui-mvadapter/mvadapter/schedulers/scheduling_shift_snr.py ADDED
@@ -0,0 +1,140 @@
+ from typing import Any
+
+ import torch
+
+ from .scheduler_utils import SNR_to_betas, compute_snr
+
+
+ class ShiftSNRScheduler:
+     SHIFT_MODES = ["default", "interpolated"]
+
+     def __init__(
+         self,
+         noise_scheduler: Any,
+         timesteps: Any,
+         shift_scale: float,
+         scheduler_class: Any,
+     ):
+         self.noise_scheduler = noise_scheduler
+         self.timesteps = timesteps
+         self.shift_scale = shift_scale
+         self.scheduler_class = scheduler_class
+
+     def _get_shift_scheduler(self):
+         """
+         Prepare scheduler for shifted betas.
+
+         :return: A scheduler object configured with shifted betas
+         """
+         snr = compute_snr(self.timesteps, self.noise_scheduler)
+         shifted_betas = SNR_to_betas(snr / self.shift_scale)
+
+         return self.scheduler_class.from_config(
+             self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
+         )
+
+     def _get_interpolated_shift_scheduler(self):
+         """
+         Prepare scheduler for shifted betas and interpolate with the original betas in log space.
+
+         :return: A scheduler object configured with interpolated shifted betas
+         """
+         snr = compute_snr(self.timesteps, self.noise_scheduler)
+         shifted_snr = snr / self.shift_scale
+
+         weighting = self.timesteps.float() / (
+             self.noise_scheduler.config.num_train_timesteps - 1
+         )
+         interpolated_snr = torch.exp(
+             torch.log(snr) * (1 - weighting) + torch.log(shifted_snr) * weighting
+         )
+
+         shifted_betas = SNR_to_betas(interpolated_snr)
+
+         return self.scheduler_class.from_config(
+             self.noise_scheduler.config, trained_betas=shifted_betas.numpy()
+         )
+
+     @classmethod
+     def from_scheduler(
+         cls,
+         noise_scheduler: Any,
+         shift_mode: str = "default",
+         timesteps: Any = None,
+         shift_scale: float = 1.0,
+         scheduler_class: Any = None,
+     ):
+         # Check input
+         if timesteps is None:
+             timesteps = torch.arange(0, noise_scheduler.config.num_train_timesteps)
+         if scheduler_class is None:
+             scheduler_class = noise_scheduler.__class__
+
+         # Create scheduler
+         shift_scheduler = cls(
+             noise_scheduler=noise_scheduler,
+             timesteps=timesteps,
+             shift_scale=shift_scale,
+             scheduler_class=scheduler_class,
+         )
+
+         if shift_mode == "default":
+             return shift_scheduler._get_shift_scheduler()
+         elif shift_mode == "interpolated":
+             return shift_scheduler._get_interpolated_shift_scheduler()
+         else:
+             raise ValueError(f"Unknown shift_mode: {shift_mode}")
+
+
+ if __name__ == "__main__":
+     """
+     Compare the alpha values for different noise schedulers.
+     """
+     import matplotlib.pyplot as plt
+     from diffusers import DDPMScheduler
+
+     from .scheduler_utils import compute_alpha
+
+     # Base
+     timesteps = torch.arange(0, 1000)
+     noise_scheduler_base = DDPMScheduler.from_pretrained(
+         "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+     )
+     alpha = compute_alpha(timesteps, noise_scheduler_base)
+     plt.plot(timesteps.numpy(), alpha.numpy(), label="Base")
+
+     # Kolors
+     num_train_timesteps_ = 1100
+     timesteps_ = torch.arange(0, num_train_timesteps_)
+     noise_kwargs = {"beta_end": 0.014, "num_train_timesteps": num_train_timesteps_}
+     noise_scheduler_kolors = DDPMScheduler.from_config(
+         noise_scheduler_base.config, **noise_kwargs
+     )
+     alpha = compute_alpha(timesteps_, noise_scheduler_kolors)
+     plt.plot(timesteps_.numpy(), alpha.numpy(), label="Kolors")
+
+     # Shift betas
+     shift_scale = 8.0
+     noise_scheduler_shift = ShiftSNRScheduler.from_scheduler(
+         noise_scheduler_base, shift_mode="default", shift_scale=shift_scale
+     )
+     alpha = compute_alpha(timesteps, noise_scheduler_shift)
+     plt.plot(timesteps.numpy(), alpha.numpy(), label="Shift Noise (scale 8.0)")
+
+     # Shift betas (interpolated)
+     noise_scheduler_inter = ShiftSNRScheduler.from_scheduler(
+         noise_scheduler_base, shift_mode="interpolated", shift_scale=shift_scale
+     )
+     alpha = compute_alpha(timesteps, noise_scheduler_inter)
+     plt.plot(timesteps.numpy(), alpha.numpy(), label="Interpolated (scale 8.0)")
+
+     # ZeroSNR
+     noise_scheduler = DDPMScheduler.from_config(
+         noise_scheduler_base.config, rescale_betas_zero_snr=True
+     )
+     alpha = compute_alpha(timesteps, noise_scheduler)
+     plt.plot(timesteps.numpy(), alpha.numpy(), label="ZeroSNR")
+
+     plt.legend()
+     plt.grid()
+     plt.savefig("check_alpha.png")
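A note on the interpolated mode above: since shifted_snr = snr / shift_scale, the log-space blend is equivalent to snr / shift_scale ** w with w = t / (T - 1), so the effective shift grows smoothly from none at t = 0 to the full shift_scale at the final timestep. A short sketch checking that equivalence (illustrative only, not part of the module):

import torch
from diffusers import DDPMScheduler
from mvadapter.schedulers.scheduler_utils import compute_snr

scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.arange(0, scheduler.config.num_train_timesteps)
shift_scale = 8.0

snr = compute_snr(timesteps, scheduler)
w = timesteps.float() / (scheduler.config.num_train_timesteps - 1)

# Log-space interpolation, as in _get_interpolated_shift_scheduler.
interpolated = torch.exp(torch.log(snr) * (1 - w) + torch.log(snr / shift_scale) * w)

# Closed form: the divisor ramps from 1 up to shift_scale over the schedule.
closed_form = snr / shift_scale**w
assert torch.allclose(interpolated, closed_form, rtol=1e-4)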
comfyui-mvadapter/mvadapter/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .camera import get_camera, get_orthogonal_camera
+ from .geometry import get_plucker_embeds_from_cameras_ortho
+ from .saving import make_image_grid, tensor_to_image