Spaces:

Eugeoter
/

ControlNeXt

Runtime error

App Files Files Community

Eugeoter committed on Sep 4, 2024

Commit

76be739

1 Parent(s): 4074997

update

Browse files

Files changed (5) hide show

app.py +4 -24
models/unet.py +0 -70
pipeline/pipeline_controlnext.py +271 -1
utils/tools.py +107 -52
utils/utils.py +0 -68

app.py CHANGED Viewed

@@ -2,16 +2,12 @@ import gradio as gr
 import torch
 import numpy as np
 import spaces
-from PIL import Image
-from huggingface_hub import hf_hub_download
 from utils import utils, tools, preprocess
 BASE_MODEL_REPO_ID = "neta-art/neta-xl-2.0"
 BASE_MODEL_FILENAME = "neta-xl-v2.fp16.safetensors"
 VAE_PATH = "madebyollin/sdxl-vae-fp16-fix"
-CONTROLNEXT_REPO_ID = "Pbihao/ControlNeXt"
-UNET_FILENAME = "ControlAny-SDXL/anime_canny/unet.safetensors"
-CONTROLNET_FILENAME = "ControlAny-SDXL/anime_canny/controlnet.safetensors"
 CACHE_DIR = None
 DEFAULT_PROMPT = ""
@@ -20,26 +16,10 @@ DEFAULT_NEGATIVE_PROMPT = "worst quality, abstract, clumsy pose, deformed hand,
 def ui():
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model_file = hf_hub_download(
-        repo_id=BASE_MODEL_REPO_ID,
-        filename=BASE_MODEL_FILENAME,
-        cache_dir=CACHE_DIR,
-    )
-    unet_file = hf_hub_download(
-        repo_id=CONTROLNEXT_REPO_ID,
-        filename=UNET_FILENAME,
-        cache_dir=CACHE_DIR,
-    )
-    controlnet_file = hf_hub_download(
-        repo_id=CONTROLNEXT_REPO_ID,
-        filename=CONTROLNET_FILENAME,
-        cache_dir=CACHE_DIR,
-    )
     pipeline = tools.get_pipeline(
-        pretrained_model_name_or_path=model_file,
-        unet_model_name_or_path=unet_file,
-        controlnet_model_name_or_path=controlnet_file,
         vae_model_name_or_path=VAE_PATH,
         load_weight_increasement=True,
         device=device,

 import torch
 import numpy as np
 import spaces
 from utils import utils, tools, preprocess
 BASE_MODEL_REPO_ID = "neta-art/neta-xl-2.0"
 BASE_MODEL_FILENAME = "neta-xl-v2.fp16.safetensors"
 VAE_PATH = "madebyollin/sdxl-vae-fp16-fix"
+CONTROLNEXT_REPO_ID = "Eugeoter/controlnext-sdxl-anime-canny"
 CACHE_DIR = None
 DEFAULT_PROMPT = ""
 def ui():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     pipeline = tools.get_pipeline(
+        pretrained_model_name_or_path=BASE_MODEL_REPO_ID,
+        unet_model_name_or_path=CONTROLNEXT_REPO_ID,
+        controlnet_model_name_or_path=CONTROLNEXT_REPO_ID,
         vae_model_name_or_path=VAE_PATH,
         load_weight_increasement=True,
         device=device,

models/unet.py CHANGED Viewed

@@ -53,76 +53,6 @@ from diffusers.models.unets.unet_2d_blocks import (
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-UNET_CONFIG = {
-    "_class_name": "UNet2DConditionModel",
-    "_diffusers_version": "0.19.0.dev0",
-    "act_fn": "silu",
-    "addition_embed_type": "text_time",
-    "addition_embed_type_num_heads": 64,
-    "addition_time_embed_dim": 256,
-    "attention_head_dim": [
-        5,
-        10,
-        20
-    ],
-    "block_out_channels": [
-        320,
-        640,
-        1280
-    ],
-    "center_input_sample": False,
-    "class_embed_type": None,
-    "class_embeddings_concat": False,
-    "conv_in_kernel": 3,
-    "conv_out_kernel": 3,
-    "cross_attention_dim": 2048,
-    "cross_attention_norm": None,
-    "down_block_types": [
-        "DownBlock2D",
-        "CrossAttnDownBlock2D",
-        "CrossAttnDownBlock2D"
-    ],
-    "downsample_padding": 1,
-    "dual_cross_attention": False,
-    "encoder_hid_dim": None,
-    "encoder_hid_dim_type": None,
-    "flip_sin_to_cos": True,
-    "freq_shift": 0,
-    "in_channels": 4,
-    "layers_per_block": 2,
-    "mid_block_only_cross_attention": None,
-    "mid_block_scale_factor": 1,
-    "mid_block_type": "UNetMidBlock2DCrossAttn",
-    "norm_eps": 1e-05,
-    "norm_num_groups": 32,
-    "num_attention_heads": None,
-    "num_class_embeds": None,
-    "only_cross_attention": False,
-    "out_channels": 4,
-    "projection_class_embeddings_input_dim": 2816,
-    "resnet_out_scale_factor": 1.0,
-    "resnet_skip_time_act": False,
-    "resnet_time_scale_shift": "default",
-    "sample_size": 128,
-    "time_cond_proj_dim": None,
-    "time_embedding_act_fn": None,
-    "time_embedding_dim": None,
-    "time_embedding_type": "positional",
-    "timestep_post_act": None,
-    "transformer_layers_per_block": [
-        1,
-        2,
-        10
-    ],
-    "up_block_types": [
-        "CrossAttnUpBlock2D",
-        "CrossAttnUpBlock2D",
-        "UpBlock2D"
-    ],
-    "upcast_attention": None,
-    "use_linear_projection": True
-}
 @dataclass
 class UNet2DConditionOutput(BaseOutput):

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 @dataclass
 class UNet2DConditionOutput(BaseOutput):

pipeline/pipeline_controlnext.py CHANGED Viewed

@@ -14,7 +14,6 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from packaging import version
 import torch
 from transformers import (
     CLIPImageProcessor,
@@ -57,6 +56,7 @@ from diffusers.utils import (
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 if is_invisible_watermark_available():
     from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
@@ -87,8 +87,128 @@ EXAMPLE_DOC_STRING = """
         ```
 """
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
 def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     """
     Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
@@ -280,6 +400,156 @@ class StableDiffusionXLControlNeXtPipeline(
         else:
             self.watermark = None
     def prepare_image(
         self,
         image,

 import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from transformers import (
     CLIPImageProcessor,
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+from huggingface_hub.utils import validate_hf_hub_args
 if is_invisible_watermark_available():
     from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
         ```
 """
+CONTROLNEXT_WEIGHT_NAME = "controlnet.bin"
+CONTROLNEXT_WEIGHT_NAME_SAFE = "controlnet.safetensors"
+UNET_WEIGHT_NAME = "unet.bin"
+UNET_WEIGHT_NAME_SAFE = "unet.safetensors"
+# Copied from https://github.com/kohya-ss/sd-scripts/blob/main/library/sdxl_model_util.py
+def is_sdxl_state_dict(state_dict):
+    return any(key.startswith('input_blocks') for key in state_dict.keys())
+def convert_sdxl_unet_state_dict_to_diffusers(sd):
+    unet_conversion_map = make_unet_conversion_map()
+    conversion_dict = {sd: hf for sd, hf in unet_conversion_map}
+    return convert_unet_state_dict(sd, conversion_dict)
+def convert_unet_state_dict(src_sd, conversion_map):
+    converted_sd = {}
+    for src_key, value in src_sd.items():
+        src_key_fragments = src_key.split(".")[:-1]  # remove weight/bias
+        while len(src_key_fragments) > 0:
+            src_key_prefix = ".".join(src_key_fragments) + "."
+            if src_key_prefix in conversion_map:
+                converted_prefix = conversion_map[src_key_prefix]
+                converted_key = converted_prefix + src_key[len(src_key_prefix):]
+                converted_sd[converted_key] = value
+                break
+            src_key_fragments.pop(-1)
+        assert len(src_key_fragments) > 0, f"key {src_key} not found in conversion map"
+    return converted_sd
+def make_unet_conversion_map():
+    unet_conversion_map_layer = []
+    for i in range(3):  # num_blocks is 3 in sdxl
+        # loop over downblocks/upblocks
+        for j in range(2):
+            # loop over resnets/attentions for downblocks
+            hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+            sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
+            unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+            if i < 3:
+                # no attention layers in down_blocks.3
+                hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+                sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
+                unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+        for j in range(3):
+            # loop over resnets/attentions for upblocks
+            hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+            sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
+            unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+            # if i > 0: commented out for sdxl
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+        if i < 3:
+            # no downsample in down_blocks.3
+            hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+            sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
+            unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+            # no upsample in up_blocks.3
+            hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+            sd_upsample_prefix = f"output_blocks.{3*i + 2}.{2}."  # change for sdxl
+            unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+    hf_mid_atn_prefix = "mid_block.attentions.0."
+    sd_mid_atn_prefix = "middle_block.1."
+    unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+    for j in range(2):
+        hf_mid_res_prefix = f"mid_block.resnets.{j}."
+        sd_mid_res_prefix = f"middle_block.{2*j}."
+        unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+    unet_conversion_map_resnet = [
+        # (stable-diffusion, HF Diffusers)
+        ("in_layers.0.", "norm1."),
+        ("in_layers.2.", "conv1."),
+        ("out_layers.0.", "norm2."),
+        ("out_layers.3.", "conv2."),
+        ("emb_layers.1.", "time_emb_proj."),
+        ("skip_connection.", "conv_shortcut."),
+    ]
+    unet_conversion_map = []
+    for sd, hf in unet_conversion_map_layer:
+        if "resnets" in hf:
+            for sd_res, hf_res in unet_conversion_map_resnet:
+                unet_conversion_map.append((sd + sd_res, hf + hf_res))
+        else:
+            unet_conversion_map.append((sd, hf))
+    for j in range(2):
+        hf_time_embed_prefix = f"time_embedding.linear_{j+1}."
+        sd_time_embed_prefix = f"time_embed.{j*2}."
+        unet_conversion_map.append((sd_time_embed_prefix, hf_time_embed_prefix))
+    for j in range(2):
+        hf_label_embed_prefix = f"add_embedding.linear_{j+1}."
+        sd_label_embed_prefix = f"label_emb.0.{j*2}."
+        unet_conversion_map.append((sd_label_embed_prefix, hf_label_embed_prefix))
+    unet_conversion_map.append(("input_blocks.0.0.", "conv_in."))
+    unet_conversion_map.append(("out.0.", "conv_norm_out."))
+    unet_conversion_map.append(("out.2.", "conv_out."))
+    return unet_conversion_map
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
 def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
     """
     Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
         else:
             self.watermark = None
+    def load_controlnext_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        load_weight_increasement: bool = False,
+        **kwargs,
+    ):
+        self.load_controlnext_unet_weights(pretrained_model_name_or_path_or_dict, load_weight_increasement, **kwargs)
+        kwargs['torch_dtype'] = torch.float32
+        self.load_controlnext_controlnet_weights(pretrained_model_name_or_path_or_dict, **kwargs)
+    def load_controlnext_unet_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        load_weight_increasement: bool = False,
+        **kwargs,
+    ):
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+        state_dict = self.controlnext_unet_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+        if is_sdxl_state_dict(state_dict):
+            state_dict = convert_sdxl_unet_state_dict_to_diffusers(state_dict)
+        logger.info(f"Loading ControlNeXt UNet" + (f" with weight increasement." if load_weight_increasement else "."))
+        if load_weight_increasement:
+            unet_sd = self.unet.state_dict()
+            for k in state_dict.keys():
+                state_dict[k] = state_dict[k] + unet_sd[k]
+        self.unet.load_state_dict(state_dict, strict=False)
+    @classmethod
+    @validate_hf_hub_args
+    def controlnext_unet_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        if 'weight_name' not in kwargs:
+            kwargs['weight_name'] = UNET_WEIGHT_NAME_SAFE if kwargs.get('use_safetensors', False) else UNET_WEIGHT_NAME
+        return cls.controlnext_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+    def load_controlnext_controlnet_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        if self.controlnet is None:
+            raise ValueError("No ControlNeXt ControlNet found in the pipeline.")
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy()
+        state_dict = self.controlnext_controlnet_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+        logger.info(f"Loading ControlNeXt ControlNet")
+        self.controlnet.load_state_dict(state_dict, strict=True)
+    @classmethod
+    @validate_hf_hub_args
+    def controlnext_controlnet_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        if 'weight_name' not in kwargs:
+            kwargs['weight_name'] = CONTROLNEXT_WEIGHT_NAME_SAFE if kwargs.get('use_safetensors', False) else CONTROLNEXT_WEIGHT_NAME
+        return cls.controlnext_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
+    @classmethod
+    @validate_hf_hub_args
+    def controlnext_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for controlnext weights.
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+            weight_name (`str`, *optional*, defaults to None):
+                Name of the serialized state dict file.
+        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        unet_config = kwargs.pop("unet_config", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        state_dict = cls._fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+        return state_dict
     def prepare_image(
         self,
         image,

utils/tools.py CHANGED Viewed

@@ -1,14 +1,90 @@
 import os
-import torch
 import gc
-from torch import nn
-from diffusers import UniPCMultistepScheduler, AutoencoderKL
 from safetensors.torch import load_file
 from pipeline.pipeline_controlnext import StableDiffusionXLControlNeXtPipeline
-from models.unet import UNet2DConditionModel, UNET_CONFIG
 from models.controlnet import ControlNetModel
 from . import utils
 def get_pipeline(
     pretrained_model_name_or_path,
@@ -26,20 +102,6 @@ def get_pipeline(
 ):
     pipeline_init_kwargs = {}
-    if controlnet_model_name_or_path is not None:
-        print(f"loading controlnet from {controlnet_model_name_or_path}")
-        controlnet = ControlNetModel()
-        if controlnet_model_name_or_path is not None:
-            utils.load_safetensors(controlnet, controlnet_model_name_or_path)
-        else:
-            controlnet.scale = nn.Parameter(torch.tensor(0.), requires_grad=False)
-        controlnet.to(device, dtype=torch.float32)
-        pipeline_init_kwargs["controlnet"] = controlnet
-        utils.log_model_info(controlnet, "controlnext")
-    else:
-        print(f"no controlnet")
     print(f"loading unet from {pretrained_model_name_or_path}")
     if os.path.isfile(pretrained_model_name_or_path):
         # load unet from local checkpoint
@@ -49,42 +111,15 @@ def get_pipeline(
         unet = UNet2DConditionModel.from_config(UNET_CONFIG)
         unet.load_state_dict(unet_sd, strict=True)
     else:
-        from huggingface_hub import hf_hub_download
-        filename = "diffusion_pytorch_model"
-        if variant == "fp16":
-            filename += ".fp16"
-        if use_safetensors:
-            filename += ".safetensors"
-        else:
-            filename += ".pt"
-        unet_file = hf_hub_download(
-            repo_id=pretrained_model_name_or_path,
-            filename="unet" + '/' + filename,
             cache_dir=hf_cache_dir,
         )
-        unet_sd = load_file(unet_file) if unet_file.endswith(".safetensors") else torch.load(pretrained_model_name_or_path)
-        unet_sd = utils.extract_unet_state_dict(unet_sd)
-        unet_sd = utils.convert_sdxl_unet_state_dict_to_diffusers(unet_sd)
-        unet = UNet2DConditionModel.from_config(UNET_CONFIG)
-        unet.load_state_dict(unet_sd, strict=True)
     unet = unet.to(dtype=torch.float16)
-    utils.log_model_info(unet, "unet")
-    if unet_model_name_or_path is not None:
-        print(f"loading controlnext unet from {unet_model_name_or_path}")
-        controlnext_unet_sd = load_file(unet_model_name_or_path)
-        controlnext_unet_sd = utils.convert_to_controlnext_unet_state_dict(controlnext_unet_sd)
-        unet_sd = unet.state_dict()
-        assert all(
-            k in unet_sd for k in controlnext_unet_sd), \
-            f"controlnext unet state dict is not compatible with unet state dict, missing keys: {set(controlnext_unet_sd.keys()) - set(unet_sd.keys())}, extra keys: {set(unet_sd.keys()) - set(controlnext_unet_sd.keys())}"
-        if load_weight_increasement:
-            print("loading weight increasement")
-            for k in controlnext_unet_sd.keys():
-                controlnext_unet_sd[k] = controlnext_unet_sd[k] + unet_sd[k]
-        unet.load_state_dict(controlnext_unet_sd, strict=False)
-        utils.log_model_info(controlnext_unet_sd, "controlnext unet")
     pipeline_init_kwargs["unet"] = unet
     if vae_model_name_or_path is not None:
@@ -92,6 +127,9 @@ def get_pipeline(
         vae = AutoencoderKL.from_pretrained(vae_model_name_or_path, cache_dir=hf_cache_dir, torch_dtype=torch.float16).to(device)
         pipeline_init_kwargs["vae"] = vae
     print(f"loading pipeline from {pretrained_model_name_or_path}")
     if os.path.isfile(pretrained_model_name_or_path):
         pipeline: StableDiffusionXLControlNeXtPipeline = StableDiffusionXLControlNeXtPipeline.from_single_file(
@@ -112,6 +150,23 @@ def get_pipeline(
         )
     pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
     pipeline.set_progress_bar_config()
     pipeline = pipeline.to(device, dtype=torch.float16)
@@ -121,7 +176,7 @@ def get_pipeline(
         pipeline.enable_xformers_memory_efficient_attention()
     gc.collect()
-    if torch.cuda.is_available():
         torch.cuda.empty_cache()
     return pipeline

 import os
 import gc
+import torch
+from diffusers import UniPCMultistepScheduler, AutoencoderKL, ControlNetModel
 from safetensors.torch import load_file
 from pipeline.pipeline_controlnext import StableDiffusionXLControlNeXtPipeline
+from models.unet import UNet2DConditionModel
 from models.controlnet import ControlNetModel
 from . import utils
+UNET_CONFIG = {
+    "act_fn": "silu",
+    "addition_embed_type": "text_time",
+    "addition_embed_type_num_heads": 64,
+    "addition_time_embed_dim": 256,
+    "attention_head_dim": [
+        5,
+        10,
+        20
+    ],
+    "block_out_channels": [
+        320,
+        640,
+        1280
+    ],
+    "center_input_sample": False,
+    "class_embed_type": None,
+    "class_embeddings_concat": False,
+    "conv_in_kernel": 3,
+    "conv_out_kernel": 3,
+    "cross_attention_dim": 2048,
+    "cross_attention_norm": None,
+    "down_block_types": [
+        "DownBlock2D",
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D"
+    ],
+    "downsample_padding": 1,
+    "dual_cross_attention": False,
+    "encoder_hid_dim": None,
+    "encoder_hid_dim_type": None,
+    "flip_sin_to_cos": True,
+    "freq_shift": 0,
+    "in_channels": 4,
+    "layers_per_block": 2,
+    "mid_block_only_cross_attention": None,
+    "mid_block_scale_factor": 1,
+    "mid_block_type": "UNetMidBlock2DCrossAttn",
+    "norm_eps": 1e-05,
+    "norm_num_groups": 32,
+    "num_attention_heads": None,
+    "num_class_embeds": None,
+    "only_cross_attention": False,
+    "out_channels": 4,
+    "projection_class_embeddings_input_dim": 2816,
+    "resnet_out_scale_factor": 1.0,
+    "resnet_skip_time_act": False,
+    "resnet_time_scale_shift": "default",
+    "sample_size": 128,
+    "time_cond_proj_dim": None,
+    "time_embedding_act_fn": None,
+    "time_embedding_dim": None,
+    "time_embedding_type": "positional",
+    "timestep_post_act": None,
+    "transformer_layers_per_block": [
+        1,
+        2,
+        10
+    ],
+    "up_block_types": [
+        "CrossAttnUpBlock2D",
+        "CrossAttnUpBlock2D",
+        "UpBlock2D"
+    ],
+    "upcast_attention": None,
+    "use_linear_projection": True
+}
+CONTROLNET_CONFIG = {
+    'in_channels': [128, 128],
+    'out_channels': [128, 256],
+    'groups': [4, 8],
+    'time_embed_dim': 256,
+    'final_out_channels': 320,
+    '_use_default_values': ['time_embed_dim', 'groups', 'in_channels', 'final_out_channels', 'out_channels']
+}
 def get_pipeline(
     pretrained_model_name_or_path,
 ):
     pipeline_init_kwargs = {}
     print(f"loading unet from {pretrained_model_name_or_path}")
     if os.path.isfile(pretrained_model_name_or_path):
         # load unet from local checkpoint
         unet = UNet2DConditionModel.from_config(UNET_CONFIG)
         unet.load_state_dict(unet_sd, strict=True)
     else:
+        unet = UNet2DConditionModel.from_pretrained(
+            pretrained_model_name_or_path,
             cache_dir=hf_cache_dir,
+            variant=variant,
+            torch_dtype=torch.float16,
+            use_safetensors=use_safetensors,
+            subfolder="unet",
         )
     unet = unet.to(dtype=torch.float16)
     pipeline_init_kwargs["unet"] = unet
     if vae_model_name_or_path is not None:
         vae = AutoencoderKL.from_pretrained(vae_model_name_or_path, cache_dir=hf_cache_dir, torch_dtype=torch.float16).to(device)
         pipeline_init_kwargs["vae"] = vae
+    if controlnet_model_name_or_path is not None:
+        pipeline_init_kwargs["controlnet"] = ControlNetModel.from_config(CONTROLNET_CONFIG).to(device, dtype=torch.float32)  # init
     print(f"loading pipeline from {pretrained_model_name_or_path}")
     if os.path.isfile(pretrained_model_name_or_path):
         pipeline: StableDiffusionXLControlNeXtPipeline = StableDiffusionXLControlNeXtPipeline.from_single_file(
         )
     pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
+    if unet_model_name_or_path is not None:
+        print(f"loading controlnext unet from {unet_model_name_or_path}")
+        pipeline.load_controlnext_unet_weights(
+            unet_model_name_or_path,
+            load_weight_increasement=load_weight_increasement,
+            use_safetensors=True,
+            torch_dtype=torch.float16,
+            cache_dir=hf_cache_dir,
+        )
+    if controlnet_model_name_or_path is not None:
+        print(f"loading controlnext controlnet from {controlnet_model_name_or_path}")
+        pipeline.load_controlnext_controlnet_weights(
+            controlnet_model_name_or_path,
+            use_safetensors=True,
+            torch_dtype=torch.float32,
+            cache_dir=hf_cache_dir,
+        )
     pipeline.set_progress_bar_config()
     pipeline = pipeline.to(device, dtype=torch.float16)
         pipeline.enable_xformers_memory_efficient_attention()
     gc.collect()
+    if str(device) == 'cuda' and torch.cuda.is_available():
         torch.cuda.empty_cache()
     return pipeline

utils/utils.py CHANGED Viewed

@@ -1,52 +1,5 @@
 import math
 from typing import Tuple, Union, Optional
-from safetensors.torch import load_file
-from transformers import PretrainedConfig
-def count_num_parameters_of_safetensors_model(safetensors_path):
-    state_dict = load_file(safetensors_path)
-    return sum(p.numel() for p in state_dict.values())
-def import_model_class_from_model_name_or_path(
-    pretrained_model_name_or_path: str, revision: str, subfolder: str = None
-):
-    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path, revision=revision, subfolder=subfolder
-    )
-    model_class = text_encoder_config.architectures[0]
-    if model_class == "CLIPTextModel":
-        from transformers import CLIPTextModel
-        return CLIPTextModel
-    elif model_class == "CLIPTextModelWithProjection":
-        from transformers import CLIPTextModelWithProjection
-        return CLIPTextModelWithProjection
-    else:
-        raise ValueError(f"{model_class} is not supported.")
-def fix_clip_text_encoder_position_ids(text_encoder):
-    if hasattr(text_encoder.text_model.embeddings, "position_ids"):
-        text_encoder.text_model.embeddings.position_ids = text_encoder.text_model.embeddings.position_ids.long()
-def load_controlnext_unet_state_dict(unet_sd, controlnext_unet_sd):
-    assert all(
-        k in unet_sd for k in controlnext_unet_sd), f"controlnext unet state dict is not compatible with unet state dict, missing keys: {set(controlnext_unet_sd.keys()) - set(unet_sd.keys())}, extra keys: {set(unet_sd.keys()) - set(controlnext_unet_sd.keys())}"
-    for k in controlnext_unet_sd.keys():
-        unet_sd[k] = controlnext_unet_sd[k]
-    return unet_sd
-def convert_to_controlnext_unet_state_dict(state_dict):
-    import re
-    pattern = re.compile(r'.*attn2.*to_out.*')
-    state_dict = {k: v for k, v in state_dict.items() if pattern.match(k)}
-    # state_dict = extract_unet_state_dict(state_dict)
-    if is_sdxl_state_dict(state_dict):
-        state_dict = convert_sdxl_unet_state_dict_to_diffusers(state_dict)
-    return state_dict
 def make_unet_conversion_map():
@@ -166,27 +119,6 @@ def extract_unet_state_dict(state_dict):
     return unet_sd
-def is_sdxl_state_dict(state_dict):
-    return any(key.startswith('input_blocks') for key in state_dict.keys())
-def contains_unet_keys(state_dict):
-    UNET_KEY_PREFIX = "model.diffusion_model."
-    return any(k.startswith(UNET_KEY_PREFIX) for k in state_dict.keys())
-def load_safetensors(model, safetensors_path, strict=True, load_weight_increasement=False):
-    if not load_weight_increasement:
-        state_dict = load_file(safetensors_path)
-        model.load_state_dict(state_dict, strict=strict)
-    else:
-        state_dict = load_file(safetensors_path)
-        pretrained_state_dict = model.state_dict()
-        for k in state_dict.keys():
-            state_dict[k] = state_dict[k] + pretrained_state_dict[k]
-        model.load_state_dict(state_dict, strict=False)
 def log_model_info(model, name):
     sd = model.state_dict() if hasattr(model, "state_dict") else model
     print(

 import math
 from typing import Tuple, Union, Optional
 def make_unet_conversion_map():
     return unet_sd
 def log_model_info(model, name):
     sd = model.state_dict() if hasattr(model, "state_dict") else model
     print(