Upload 16 files

Browse files
- README.md (+13 -13)
- app.py (+118 -120)
- requirements.txt (+18 -17)
- t2i/controlnet_union/guided_filter.py (+280 -280)
- t2i/controlnet_union/mask.py (+347 -347)
- t2i/controlnet_union/models/controlnet_union.py (+957 -957)
- t2i/controlnet_union/pipeline/pipeline_controlnet_union_inpaint_sd_xl.py (+0 -0)
- t2i/controlnet_union/pipeline/pipeline_controlnet_union_sd_xl.py (+0 -0)
- t2i/controlnet_union/pipeline/pipeline_controlnet_union_sd_xl_img2img.py (+0 -0)
- t2i/pipe.py (+157 -157)
README.md CHANGED

@@ -1,13 +1,13 @@
 ---
 title: T2I test
 emoji: 🖼
 colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version:
+sdk_version: 6.1.0
 app_file: app.py
 pinned: false
 license: mit
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED

@@ -1,120 +1,118 @@
import spaces
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from t2i.infer import (infer, infer_multi, infer_simple, save_image_history, save_gallery_history,
                       update_param_mode_gr, update_ar_gr,
                       MAX_SEED, MAX_IMAGE_SIZE, ASPECT_RATIOS, FILE_FORMATS, DEFAULT_TASKS, DEFAULT_DURATION,
                       DEFAULT_I2I_STRENGTH, DEFAULT_UPSCALE_STRENGTH, DEFAULT_UPSCALE_BY, DEFAULT_CLIP_SKIP,
                       models, MODEL_TYPES, SAMPLER_NAMES, PRED_TYPES, VAE_NAMES,
                       UPSCALE_MODES, PARAM_MODES, PIPELINE_TYPES)

css = """
#col-container {
    margin: 0 auto;
    max-width: 1080px;
}
"""

with gr.Blocks(fill_height=True, fill_width=True) as demo:
    with gr.Tab("Image Generator"):
        lora_dict = gr.State({})
        with gr.Column(elem_id="col-container"):
            with gr.Tab("Normal"):
                with gr.Row():
                    prompt = gr.Text(label="Prompt", show_label=False, lines=1, placeholder="Enter your prompt", container=False)
                    run_button = gr.Button("Run", scale=0)
                    run_button_simple = gr.Button("Simple", scale=0, visible=False)  # for API
                result = gr.Image(label="Result", show_label=False, format="png", type="filepath", interactive=False, buttons=["download", "fullscreen"])

            with gr.Tab("Multi"):
                with gr.Row():
                    prompt_multi = gr.Text(label="Prompt", show_label=False, lines=1, placeholder="Enter your prompt", container=False)
                    run_button_multi = gr.Button("Run", scale=0)
                model_name_multi = gr.Dropdown(label="Model", choices=models, value=models[0], multiselect=True, allow_custom_value=True)
                num_images = gr.Slider(label="Count", minimum=1, maximum=16, step=1, value=1)
                result_multi = gr.Gallery(label="Result", columns=2, object_fit="contain", format="png", interactive=False, buttons=["download", "fullscreen"])

            with gr.Accordion("Output History", open=False):
                history_files = gr.Files(interactive=False, visible=False)
                history_gallery = gr.Gallery(label="History", columns=6, object_fit="contain", format="png", interactive=False, buttons=["download", "fullscreen"])
                history_clear_button = gr.Button(value="Clear History", variant="secondary")
                history_clear_button.click(lambda: ([], []), None, [history_gallery, history_files], queue=False, api_visibility="undocumented")

            with gr.Group():
                negative_prompt = gr.Text(label="Negative prompt", max_lines=1, placeholder="Enter a negative prompt",
                                          value="")  # nsfw, (low quality, worst quality:1.2), very displeasing, 3d, watermark, signature, ugly, poorly drawn
                with gr.Row(equal_height=True):
                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                with gr.Row(equal_height=True):
                    param_mode = gr.Radio(label="Parameter Settings", choices=PARAM_MODES, value=PARAM_MODES[0])
                    ar = gr.Dropdown(label="Aspect Ratio", choices=ASPECT_RATIOS, value=ASPECT_RATIOS[0])
                with gr.Row(equal_height=True):
                    width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, visible=False)
                    height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024, visible=False)
                    guidance_scale = gr.Slider(label="Guidance scale", minimum=0.0, maximum=20.0, step=0.1, value=7, visible=False)
                    num_inference_steps = gr.Slider(label="Number of inference steps", minimum=1, maximum=60, step=1, value=28, visible=False)
            with gr.Group():
                model_name = gr.Dropdown(label="Model", choices=models, value=models[0], allow_custom_value=True)
                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Row(equal_height=True):
                        model_type = gr.Dropdown(label="Model Type", choices=MODEL_TYPES, value=MODEL_TYPES[0])
                        vae = gr.Dropdown(label="VAE", choices=VAE_NAMES, value=VAE_NAMES[0], allow_custom_value=True)
                    with gr.Row(equal_height=True):
                        sampler = gr.Dropdown(label="Sampler", choices=SAMPLER_NAMES, value=SAMPLER_NAMES[0])
                        pred_type = gr.Dropdown(label="Sampler prediction", choices=PRED_TYPES, value=PRED_TYPES[0])
                    with gr.Row(equal_height=True):
                        pipe_type = gr.Dropdown(label="Pipeline Type", choices=PIPELINE_TYPES, value=PIPELINE_TYPES[0])
                        clip_skip = gr.Slider(label="Clip Skip", minimum=0, maximum=12, step=1, value=DEFAULT_CLIP_SKIP)
                    with gr.Row(equal_height=True):
                        task = gr.Radio(label="Task", choices=DEFAULT_TASKS, value=DEFAULT_TASKS[0])
                        strength = gr.Slider(label="Image-to-Image / Inpainting Strength", minimum=0, maximum=1., step=0.01, value=DEFAULT_I2I_STRENGTH)
                    input_image = gr.ImageEditor(label="Input Image", type="filepath", sources=["upload", "clipboard", "webcam"], image_mode='RGB', layers=False, buttons=[], canvas_size=(384, 384), width=384, height=512,
                                                 brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed", default_size=32), eraser=gr.Eraser(default_size="32"))
                    with gr.Row(equal_height=True):
                        upscale_mode = gr.Dropdown(label="Upscaling", choices=UPSCALE_MODES, value=UPSCALE_MODES[0])
                        upscale_strength = gr.Slider(label="Strength", minimum=0, maximum=1, step=0.05, value=DEFAULT_UPSCALE_STRENGTH)
                        upscale_by = gr.Slider(label="Upscale by", minimum=1, maximum=1.5, step=0.1, value=DEFAULT_UPSCALE_BY)
                    with gr.Row(equal_height=True):
                        format = gr.Dropdown(label="Output Format", choices=FILE_FORMATS, value=FILE_FORMATS[0])
                        gpu_duration = gr.Number(minimum=0, maximum=240, value=DEFAULT_DURATION, label="GPU time duration (seconds per image)")

    with gr.Tab("PNG Info"):
        def extract_exif_data(image):
            if image is None: return ""
            try:
                metadata_keys = ["parameters", "metadata", "prompt", "Comment"]
                for key in metadata_keys:
                    if key in image.info:
                        return image.info[key]
                return str(image.info)
            except Exception as e:
                return f"Error extracting metadata: {str(e)}"
        with gr.Row():
            with gr.Column():
                image_metadata = gr.Image(label="Image with metadata", type="pil", sources=["upload"])
            with gr.Column():
                result_metadata = gr.Textbox(label="Metadata", show_label=True, buttons=["copy"], interactive=False, container=True, max_lines=99)
        image_metadata.change(fn=extract_exif_data, inputs=[image_metadata], outputs=[result_metadata], api_visibility="undocumented")

    gr.on(triggers=[run_button.click, prompt.submit], fn=infer,
          inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps,
                  model_name, sampler, pred_type, vae, model_type, clip_skip, pipe_type, lora_dict, upscale_mode, upscale_strength, upscale_by,
                  input_image, strength, param_mode, ar, format, task, gpu_duration],
          outputs=[result])
    gr.on(triggers=[run_button_multi.click, prompt_multi.submit], fn=infer_multi,
          inputs=[prompt_multi, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps,
                  model_name_multi, sampler, pred_type, vae, clip_skip, pipe_type, lora_dict, upscale_mode, upscale_strength, upscale_by,
                  input_image, strength, param_mode, ar, format, num_images, task, gpu_duration],
          outputs=[result_multi])
    run_button_simple.click(fn=infer_simple, inputs=[prompt, negative_prompt, seed, randomize_seed, model_name], outputs=[result])

    result.change(save_image_history, [result, history_gallery, history_files], [history_gallery, history_files], queue=False, api_visibility="undocumented")
    result_multi.change(save_gallery_history, [result_multi, history_gallery, history_files], [history_gallery, history_files], queue=False, api_visibility="undocumented")

    ar.change(update_ar_gr, [ar], [width, height], queue=False, api_visibility="undocumented")
    param_mode.change(update_param_mode_gr, [param_mode], [guidance_scale, num_inference_steps], queue=False, api_visibility="undocumented")

demo.queue().launch(ssr_mode=False, mcp_server=True, css=css)
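The hidden "Simple" button exists only so the Space exposes a lightweight text-to-image endpoint. A minimal sketch of calling it remotely with gradio_client; the Space ID and model ID below are placeholders, and the endpoint name assumes Gradio's default naming for the wired infer_simple function:

from gradio_client import Client

client = Client("username/T2I-test")  # hypothetical Space ID
image_path = client.predict(
    "1girl, looking at viewer",       # prompt
    "",                               # negative_prompt
    0,                                # seed (ignored while randomize_seed is True)
    True,                             # randomize_seed
    "some-user/some-sdxl-model",      # model_name; must be one of the app's `models` choices
    api_name="/infer_simple",         # assumed default endpoint name
)
print(image_path)                     # local path to the downloaded PNG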
requirements.txt CHANGED

@@ -1,17 +1,18 @@
 huggingface_hub
 hf-xet
 torch==2.8.0
 #torchao
 torchvision
 accelerate
 diffusers
 transformers<=4.57.1
 peft
 invisible_watermark
 sentencepiece
 safetensors
 timm
 einops
 kernels
 gradio_huggingfacehub_search
 pydantic==2.10.6
+opencv-python-headless
t2i/controlnet_union/guided_filter.py CHANGED

@@ -1,281 +1,281 @@
# -*- coding: utf-8 -*-
## @package guided_filter.core.filters
#
#  Implementation of guided filter.
#  * GuidedFilter: Original guided filter.
#  * FastGuidedFilter: Fast version of the guided filter.
#  @author      tody
#  @date        2015/08/26

import numpy as np
import cv2

## Convert image into float32 type.
def to32F(img):
    if img.dtype == np.float32:
        return img
    return (1.0 / 255.0) * np.float32(img)

## Convert image into uint8 type.
def to8U(img):
    if img.dtype == np.uint8:
        return img
    return np.clip(np.uint8(255.0 * img), 0, 255)

## Return if the input image is gray or not.
def _isGray(I):
    return len(I.shape) == 2


## Return down sampled image.
#  @param scale (w/s, h/s) image will be created.
#  @param shape I.shape[:2]=(h, w). numpy friendly size parameter.
def _downSample(I, scale=4, shape=None):
    if shape is not None:
        h, w = shape
        return cv2.resize(I, (w, h), interpolation=cv2.INTER_NEAREST)

    h, w = I.shape[:2]
    return cv2.resize(I, (int(w / scale), int(h / scale)), interpolation=cv2.INTER_NEAREST)


## Return up sampled image.
#  @param scale (w*s, h*s) image will be created.
#  @param shape I.shape[:2]=(h, w). numpy friendly size parameter.
def _upSample(I, scale=2, shape=None):
    if shape is not None:
        h, w = shape
        return cv2.resize(I, (w, h), interpolation=cv2.INTER_LINEAR)

    h, w = I.shape[:2]
    return cv2.resize(I, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)

## Fast guide filter.
class FastGuidedFilter:
    ## Constructor.
    #  @param I Input guidance image. Color or gray.
    #  @param radius Radius of Guided Filter.
    #  @param epsilon Regularization term of Guided Filter.
    #  @param scale Down sampled scale.
    def __init__(self, I, radius=5, epsilon=0.4, scale=4):
        I_32F = to32F(I)
        self._I = I_32F
        h, w = I.shape[:2]

        I_sub = _downSample(I_32F, scale)

        self._I_sub = I_sub
        radius = int(radius / scale)

        if _isGray(I):
            self._guided_filter = GuidedFilterGray(I_sub, radius, epsilon)
        else:
            self._guided_filter = GuidedFilterColor(I_sub, radius, epsilon)

    ## Apply filter for the input image.
    #  @param p Input image for the filtering.
    def filter(self, p):
        p_32F = to32F(p)
        shape_original = p.shape[:2]

        p_sub = _downSample(p_32F, shape=self._I_sub.shape[:2])

        if _isGray(p_sub):
            return self._filterGray(p_sub, shape_original)

        cs = p.shape[2]
        q = np.array(p_32F)

        for ci in range(cs):
            q[:, :, ci] = self._filterGray(p_sub[:, :, ci], shape_original)
        return to8U(q)

    def _filterGray(self, p_sub, shape_original):
        ab_sub = self._guided_filter._computeCoefficients(p_sub)
        ab = [_upSample(abi, shape=shape_original) for abi in ab_sub]
        return self._guided_filter._computeOutput(ab, self._I)


## Guide filter.
class GuidedFilter:
    ## Constructor.
    #  @param I Input guidance image. Color or gray.
    #  @param radius Radius of Guided Filter.
    #  @param epsilon Regularization term of Guided Filter.
    def __init__(self, I, radius=5, epsilon=0.4):
        I_32F = to32F(I)

        if _isGray(I):
            self._guided_filter = GuidedFilterGray(I_32F, radius, epsilon)
        else:
            self._guided_filter = GuidedFilterColor(I_32F, radius, epsilon)

    ## Apply filter for the input image.
    #  @param p Input image for the filtering.
    def filter(self, p):
        return to8U(self._guided_filter.filter(p))


## Common parts of guided filter.
#
#  This class is used by guided_filter class. GuidedFilterGray and GuidedFilterColor.
#  Based on guided_filter._computeCoefficients, guided_filter._computeOutput,
#  GuidedFilterCommon.filter computes filtered image for color and gray.
class GuidedFilterCommon:
    def __init__(self, guided_filter):
        self._guided_filter = guided_filter

    ## Apply filter for the input image.
    #  @param p Input image for the filtering.
    def filter(self, p):
        p_32F = to32F(p)
        if _isGray(p_32F):
            return self._filterGray(p_32F)

        cs = p.shape[2]
        q = np.array(p_32F)

        for ci in range(cs):
            q[:, :, ci] = self._filterGray(p_32F[:, :, ci])
        return q

    def _filterGray(self, p):
        ab = self._guided_filter._computeCoefficients(p)
        return self._guided_filter._computeOutput(ab, self._guided_filter._I)


## Guided filter for gray guidance image.
class GuidedFilterGray:
    #  @param I Input gray guidance image.
    #  @param radius Radius of Guided Filter.
    #  @param epsilon Regularization term of Guided Filter.
    def __init__(self, I, radius=5, epsilon=0.4):
        self._radius = 2 * radius + 1
        self._epsilon = epsilon
        self._I = to32F(I)
        self._initFilter()
        self._filter_common = GuidedFilterCommon(self)

    ## Apply filter for the input image.
    #  @param p Input image for the filtering.
    def filter(self, p):
        return self._filter_common.filter(p)

    def _initFilter(self):
        I = self._I
        r = self._radius
        self._I_mean = cv2.blur(I, (r, r))
        I_mean_sq = cv2.blur(I ** 2, (r, r))
        self._I_var = I_mean_sq - self._I_mean ** 2

    def _computeCoefficients(self, p):
        r = self._radius
        p_mean = cv2.blur(p, (r, r))
        p_cov = p_mean - self._I_mean * p_mean
        a = p_cov / (self._I_var + self._epsilon)
        b = p_mean - a * self._I_mean
        a_mean = cv2.blur(a, (r, r))
        b_mean = cv2.blur(b, (r, r))
        return a_mean, b_mean

    def _computeOutput(self, ab, I):
        a_mean, b_mean = ab
        return a_mean * I + b_mean


## Guided filter for color guidance image.
class GuidedFilterColor:
    #  @param I Input color guidance image.
    #  @param radius Radius of Guided Filter.
    #  @param epsilon Regularization term of Guided Filter.
    def __init__(self, I, radius=5, epsilon=0.2):
        self._radius = 2 * radius + 1
        self._epsilon = epsilon
        self._I = to32F(I)
        self._initFilter()
        self._filter_common = GuidedFilterCommon(self)

    ## Apply filter for the input image.
    #  @param p Input image for the filtering.
    def filter(self, p):
        return self._filter_common.filter(p)

    def _initFilter(self):
        I = self._I
        r = self._radius
        eps = self._epsilon

        Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]

        self._Ir_mean = cv2.blur(Ir, (r, r))
        self._Ig_mean = cv2.blur(Ig, (r, r))
        self._Ib_mean = cv2.blur(Ib, (r, r))

        Irr_var = cv2.blur(Ir ** 2, (r, r)) - self._Ir_mean ** 2 + eps
        Irg_var = cv2.blur(Ir * Ig, (r, r)) - self._Ir_mean * self._Ig_mean
        Irb_var = cv2.blur(Ir * Ib, (r, r)) - self._Ir_mean * self._Ib_mean
        Igg_var = cv2.blur(Ig * Ig, (r, r)) - self._Ig_mean * self._Ig_mean + eps
        Igb_var = cv2.blur(Ig * Ib, (r, r)) - self._Ig_mean * self._Ib_mean
        Ibb_var = cv2.blur(Ib * Ib, (r, r)) - self._Ib_mean * self._Ib_mean + eps

        Irr_inv = Igg_var * Ibb_var - Igb_var * Igb_var
        Irg_inv = Igb_var * Irb_var - Irg_var * Ibb_var
        Irb_inv = Irg_var * Igb_var - Igg_var * Irb_var
        Igg_inv = Irr_var * Ibb_var - Irb_var * Irb_var
        Igb_inv = Irb_var * Irg_var - Irr_var * Igb_var
        Ibb_inv = Irr_var * Igg_var - Irg_var * Irg_var

        I_cov = Irr_inv * Irr_var + Irg_inv * Irg_var + Irb_inv * Irb_var
        Irr_inv /= I_cov
        Irg_inv /= I_cov
        Irb_inv /= I_cov
        Igg_inv /= I_cov
        Igb_inv /= I_cov
        Ibb_inv /= I_cov

        self._Irr_inv = Irr_inv
        self._Irg_inv = Irg_inv
        self._Irb_inv = Irb_inv
        self._Igg_inv = Igg_inv
        self._Igb_inv = Igb_inv
        self._Ibb_inv = Ibb_inv

    def _computeCoefficients(self, p):
        r = self._radius
        I = self._I
        Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]

        p_mean = cv2.blur(p, (r, r))

        Ipr_mean = cv2.blur(Ir * p, (r, r))
        Ipg_mean = cv2.blur(Ig * p, (r, r))
        Ipb_mean = cv2.blur(Ib * p, (r, r))

        Ipr_cov = Ipr_mean - self._Ir_mean * p_mean
        Ipg_cov = Ipg_mean - self._Ig_mean * p_mean
        Ipb_cov = Ipb_mean - self._Ib_mean * p_mean

        ar = self._Irr_inv * Ipr_cov + self._Irg_inv * Ipg_cov + self._Irb_inv * Ipb_cov
        ag = self._Irg_inv * Ipr_cov + self._Igg_inv * Ipg_cov + self._Igb_inv * Ipb_cov
        ab = self._Irb_inv * Ipr_cov + self._Igb_inv * Ipg_cov + self._Ibb_inv * Ipb_cov
        b = p_mean - ar * self._Ir_mean - ag * self._Ig_mean - ab * self._Ib_mean

        ar_mean = cv2.blur(ar, (r, r))
        ag_mean = cv2.blur(ag, (r, r))
        ab_mean = cv2.blur(ab, (r, r))
        b_mean = cv2.blur(b, (r, r))

        return ar_mean, ag_mean, ab_mean, b_mean

    def _computeOutput(self, ab, I):
        ar_mean, ag_mean, ab_mean, b_mean = ab

        Ir, Ig, Ib = I[:, :, 0], I[:, :, 1], I[:, :, 2]

        q = (ar_mean * Ir +
             ag_mean * Ig +
             ab_mean * Ib +
             b_mean)

        return q
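A minimal usage sketch of the two public classes above, assuming the t2i package is importable and "input.png" stands in for a real image; both take the guidance image at construction time, and filter() returns a uint8 result.

import cv2
from t2i.controlnet_union.guided_filter import GuidedFilter, FastGuidedFilter

img = cv2.imread("input.png")  # placeholder path; BGR uint8 array

# Self-guided smoothing: the image serves as its own guidance.
smoothed = GuidedFilter(img, radius=5, epsilon=0.4).filter(img)

# Fast variant: coefficients are computed on a 4x down-sampled copy, then up-sampled.
smoothed_fast = FastGuidedFilter(img, radius=5, epsilon=0.4, scale=4).filter(img)

cv2.imwrite("smoothed.png", smoothed)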
t2i/controlnet_union/mask.py CHANGED

@@ -1,347 +1,347 @@
import math
import random
import hashlib
import logging
from enum import Enum

import cv2
import numpy as np

# from saicinpainting.evaluation.masks.mask import SegmentationMask
# from saicinpainting.utils import LinearRamp

LOGGER = logging.getLogger(__name__)

class LinearRamp:
    def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0):
        self.start_value = start_value
        self.end_value = end_value
        self.start_iter = start_iter
        self.end_iter = end_iter

    def __call__(self, i):
        if i < self.start_iter:
            return self.start_value
        if i >= self.end_iter:
            return self.end_value
        part = (i - self.start_iter) / (self.end_iter - self.start_iter)
        return self.start_value * (1 - part) + self.end_value * part


class DrawMethod(Enum):
    LINE = 'line'
    CIRCLE = 'circle'
    SQUARE = 'square'


def make_random_irregular_mask(shape, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10,
                               draw_method=DrawMethod.LINE):
    draw_method = DrawMethod(draw_method)

    height, width = shape
    mask = np.zeros((height, width), np.float32)
    times = np.random.randint(min_times, max_times + 1)
    for i in range(times):
        start_x = np.random.randint(width)
        start_y = np.random.randint(height)
        for j in range(1 + np.random.randint(5)):
            angle = 0.01 + np.random.randint(max_angle)
            if i % 2 == 0:
                angle = 2 * 3.1415926 - angle
            length = 10 + np.random.randint(max_len)
            brush_w = 5 + np.random.randint(max_width)
            end_x = np.clip((start_x + length * np.sin(angle)).astype(np.int32), 0, width)
            end_y = np.clip((start_y + length * np.cos(angle)).astype(np.int32), 0, height)
            if draw_method == DrawMethod.LINE:
                cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, brush_w)
            elif draw_method == DrawMethod.CIRCLE:
                cv2.circle(mask, (start_x, start_y), radius=brush_w, color=1., thickness=-1)
            elif draw_method == DrawMethod.SQUARE:
                radius = brush_w // 2
                mask[start_y - radius:start_y + radius, start_x - radius:start_x + radius] = 1
            start_x, start_y = end_x, end_y
    return mask[None, ...]


class RandomIrregularMaskGenerator:
    def __init__(self, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10, ramp_kwargs=None,
                 draw_method=DrawMethod.LINE):
        self.max_angle = max_angle
        self.max_len = max_len
        self.max_width = max_width
        self.min_times = min_times
        self.max_times = max_times
        self.draw_method = draw_method
        self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None

    def __call__(self, img, iter_i=None, raw_image=None):
        coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
        cur_max_len = int(max(1, self.max_len * coef))
        cur_max_width = int(max(1, self.max_width * coef))
        cur_max_times = int(self.min_times + 1 + (self.max_times - self.min_times) * coef)
        return make_random_irregular_mask(img.shape[1:], max_angle=self.max_angle, max_len=cur_max_len,
                                          max_width=cur_max_width, min_times=self.min_times, max_times=cur_max_times,
                                          draw_method=self.draw_method)


def make_random_rectangle_mask(shape, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3):
    height, width = shape
    mask = np.zeros((height, width), np.float32)
    bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2)
    times = np.random.randint(min_times, max_times + 1)
    for i in range(times):
        box_width = np.random.randint(bbox_min_size, bbox_max_size)
        box_height = np.random.randint(bbox_min_size, bbox_max_size)
        start_x = np.random.randint(margin, width - margin - box_width + 1)
        start_y = np.random.randint(margin, height - margin - box_height + 1)
        mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1
    return mask[None, ...]


class RandomRectangleMaskGenerator:
    def __init__(self, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3, ramp_kwargs=None):
        self.margin = margin
        self.bbox_min_size = bbox_min_size
        self.bbox_max_size = bbox_max_size
        self.min_times = min_times
        self.max_times = max_times
        self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None

    def __call__(self, img, iter_i=None, raw_image=None):
        coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
        cur_bbox_max_size = int(self.bbox_min_size + 1 + (self.bbox_max_size - self.bbox_min_size) * coef)
        cur_max_times = int(self.min_times + (self.max_times - self.min_times) * coef)
        return make_random_rectangle_mask(img.shape[1:], margin=self.margin, bbox_min_size=self.bbox_min_size,
                                          bbox_max_size=cur_bbox_max_size, min_times=self.min_times,
                                          max_times=cur_max_times)


# class RandomSegmentationMaskGenerator:
#     def __init__(self, **kwargs):
#         self.impl = None  # will be instantiated in first call (effectively in subprocess)
#         self.kwargs = kwargs

#     def __call__(self, img, iter_i=None, raw_image=None):
#         if self.impl is None:
#             self.impl = SegmentationMask(**self.kwargs)

#         masks = self.impl.get_masks(np.transpose(img, (1, 2, 0)))
#         masks = [m for m in masks if len(np.unique(m)) > 1]
#         return np.random.choice(masks)


def make_random_superres_mask(shape, min_step=2, max_step=4, min_width=1, max_width=3):
    height, width = shape
    mask = np.zeros((height, width), np.float32)
    step_x = np.random.randint(min_step, max_step + 1)
    width_x = np.random.randint(min_width, min(step_x, max_width + 1))
    offset_x = np.random.randint(0, step_x)

    step_y = np.random.randint(min_step, max_step + 1)
    width_y = np.random.randint(min_width, min(step_y, max_width + 1))
    offset_y = np.random.randint(0, step_y)

    for dy in range(width_y):
        mask[offset_y + dy::step_y] = 1
    for dx in range(width_x):
        mask[:, offset_x + dx::step_x] = 1
    return mask[None, ...]


class RandomSuperresMaskGenerator:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def __call__(self, img, iter_i=None):
        return make_random_superres_mask(img.shape[1:], **self.kwargs)


class DumbAreaMaskGenerator:
    min_ratio = 0.1
    max_ratio = 0.35
    default_ratio = 0.225

    def __init__(self, is_training):
        # Parameters:
        #     is_training(bool): If true - random rectangular mask, if false - central square mask
        self.is_training = is_training

    def _random_vector(self, dimension):
        if self.is_training:
            lower_limit = math.sqrt(self.min_ratio)
            upper_limit = math.sqrt(self.max_ratio)
            mask_side = round((random.random() * (upper_limit - lower_limit) + lower_limit) * dimension)
            u = random.randint(0, dimension-mask_side-1)
            v = u+mask_side
        else:
            margin = (math.sqrt(self.default_ratio) / 2) * dimension
            u = round(dimension/2 - margin)
            v = round(dimension/2 + margin)
        return u, v

    def __call__(self, img, iter_i=None, raw_image=None):
        c, height, width = img.shape
        mask = np.zeros((height, width), np.float32)
        x1, x2 = self._random_vector(width)
        y1, y2 = self._random_vector(height)
        mask[x1:x2, y1:y2] = 1
        return mask[None, ...]


class OutpaintingMaskGenerator:
    def __init__(self, min_padding_percent:float=0.04, max_padding_percent:int=0.25, left_padding_prob:float=0.5, top_padding_prob:float=0.5,
                 right_padding_prob:float=0.5, bottom_padding_prob:float=0.5, is_fixed_randomness:bool=False):
        """
        is_fixed_randomness - get identical paddings for the same image if args are the same
        """
        self.min_padding_percent = min_padding_percent
        self.max_padding_percent = max_padding_percent
        self.probs = [left_padding_prob, top_padding_prob, right_padding_prob, bottom_padding_prob]
        self.is_fixed_randomness = is_fixed_randomness

        assert self.min_padding_percent <= self.max_padding_percent
        assert self.max_padding_percent > 0
        assert len([x for x in [self.min_padding_percent, self.max_padding_percent] if (x>=0 and x<=1)]) == 2, f"Padding percentage should be in [0,1]"
        assert sum(self.probs) > 0, f"At least one of the padding probs should be greater than 0 - {self.probs}"
        assert len([x for x in self.probs if (x >= 0) and (x <= 1)]) == 4, f"At least one of padding probs is not in [0,1] - {self.probs}"
        if len([x for x in self.probs if x > 0]) == 1:
            LOGGER.warning(f"Only one padding prob is greater than zero - {self.probs}. That means that the outpainting masks will be always on the same side")

    def apply_padding(self, mask, coord):
        mask[int(coord[0][0]*self.img_h):int(coord[1][0]*self.img_h),
             int(coord[0][1]*self.img_w):int(coord[1][1]*self.img_w)] = 1
        return mask

    def get_padding(self, size):
        n1 = int(self.min_padding_percent*size)
        n2 = int(self.max_padding_percent*size)
        return self.rnd.randint(n1, n2) / size

    @staticmethod
    def _img2rs(img):
        arr = np.ascontiguousarray(img.astype(np.uint8))
        str_hash = hashlib.sha1(arr).hexdigest()
        res = hash(str_hash)%(2**32)
        return res

    def __call__(self, img, iter_i=None, raw_image=None):
        c, self.img_h, self.img_w = img.shape
        mask = np.zeros((self.img_h, self.img_w), np.float32)
        at_least_one_mask_applied = False

        if self.is_fixed_randomness:
            assert raw_image is not None, f"Cant calculate hash on raw_image=None"
            rs = self._img2rs(raw_image)
            self.rnd = np.random.RandomState(rs)
        else:
            self.rnd = np.random

        coords = [[
                   (0,0),
                   (1,self.get_padding(size=self.img_h))
                  ],
                  [
                   (0,0),
                   (self.get_padding(size=self.img_w),1)
                  ],
                  [
                   (0,1-self.get_padding(size=self.img_h)),
                   (1,1)
                  ],
                  [
                   (1-self.get_padding(size=self.img_w),0),
                   (1,1)
                  ]]

        for pp, coord in zip(self.probs, coords):
            if self.rnd.random() < pp:
                at_least_one_mask_applied = True
                mask = self.apply_padding(mask=mask, coord=coord)

        if not at_least_one_mask_applied:
            idx = self.rnd.choice(range(len(coords)), p=np.array(self.probs)/sum(self.probs))
            mask = self.apply_padding(mask=mask, coord=coords[idx])
        return mask[None, ...]


class MixedMaskGenerator:
    def __init__(self, irregular_proba=1/3, irregular_kwargs=None,
                 box_proba=1/3, box_kwargs=None,
                 segm_proba=1/3, segm_kwargs=None,
                 squares_proba=0, squares_kwargs=None,
                 superres_proba=0, superres_kwargs=None,
                 outpainting_proba=0, outpainting_kwargs=None,
                 invert_proba=0):
        self.probas = []
        self.gens = []

        if irregular_proba > 0:
            self.probas.append(irregular_proba)
            if irregular_kwargs is None:
                irregular_kwargs = {}
            else:
                irregular_kwargs = dict(irregular_kwargs)
            irregular_kwargs['draw_method'] = DrawMethod.LINE
            self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs))

        if box_proba > 0:
            self.probas.append(box_proba)
            if box_kwargs is None:
                box_kwargs = {}
            self.gens.append(RandomRectangleMaskGenerator(**box_kwargs))

        # if segm_proba > 0:
        #     self.probas.append(segm_proba)
        #     if segm_kwargs is None:
        #         segm_kwargs = {}
        #     self.gens.append(RandomSegmentationMaskGenerator(**segm_kwargs))

        if squares_proba > 0:
            self.probas.append(squares_proba)
            if squares_kwargs is None:
                squares_kwargs = {}
            else:
                squares_kwargs = dict(squares_kwargs)
            squares_kwargs['draw_method'] = DrawMethod.SQUARE
            self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs))

        if superres_proba > 0:
            self.probas.append(superres_proba)
            if superres_kwargs is None:
                superres_kwargs = {}
            self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs))

        if outpainting_proba > 0:
            self.probas.append(outpainting_proba)
            if outpainting_kwargs is None:
                outpainting_kwargs = {}
            self.gens.append(OutpaintingMaskGenerator(**outpainting_kwargs))

        self.probas = np.array(self.probas, dtype='float32')
        self.probas /= self.probas.sum()
        self.invert_proba = invert_proba

    def __call__(self, img, iter_i=None, raw_image=None):
        kind = np.random.choice(len(self.probas), p=self.probas)
        gen = self.gens[kind]
        result = gen(img, iter_i=iter_i, raw_image=raw_image)
        if self.invert_proba > 0 and random.random() < self.invert_proba:
            result = 1 - result
        return result


def get_mask_generator(kind, kwargs):
    if kind is None:
        kind = "mixed"
    if kwargs is None:
        kwargs = {}

    if kind == "mixed":
        cl = MixedMaskGenerator
    elif kind == "outpainting":
        cl = OutpaintingMaskGenerator
    elif kind == "dumb":
        cl = DumbAreaMaskGenerator
    else:
        raise NotImplementedError(f"No such generator kind = {kind}")
    return cl(**kwargs)
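A minimal sketch of drawing an inpainting mask with the factory above, assuming the t2i package is importable; the generators expect a channels-first (C, H, W) array and return a (1, H, W) float32 mask.

import numpy as np
from t2i.controlnet_union.mask import get_mask_generator

# Mixed generator: irregular strokes and random rectangles (the segmentation branch is commented out above).
gen = get_mask_generator(kind="mixed", kwargs={"irregular_proba": 0.5, "box_proba": 0.5})

img = np.zeros((3, 512, 512), dtype=np.float32)  # placeholder CHW image
mask = gen(img)                                  # shape (1, 512, 512), values in {0.0, 1.0}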
|
| 180 |
+
return u, v
|
| 181 |
+
|
| 182 |
+
def __call__(self, img, iter_i=None, raw_image=None):
|
| 183 |
+
c, height, width = img.shape
|
| 184 |
+
mask = np.zeros((height, width), np.float32)
|
| 185 |
+
x1, x2 = self._random_vector(width)
|
| 186 |
+
y1, y2 = self._random_vector(height)
|
| 187 |
+
mask[x1:x2, y1:y2] = 1
|
| 188 |
+
return mask[None, ...]
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class OutpaintingMaskGenerator:
|
| 192 |
+
def __init__(self, min_padding_percent:float=0.04, max_padding_percent:float=0.25, left_padding_prob:float=0.5, top_padding_prob:float=0.5,
|
| 193 |
+
right_padding_prob:float=0.5, bottom_padding_prob:float=0.5, is_fixed_randomness:bool=False):
|
| 194 |
+
"""
|
| 195 |
+
is_fixed_randomness - get identical paddings for the same image if args are the same
|
| 196 |
+
"""
|
| 197 |
+
self.min_padding_percent = min_padding_percent
|
| 198 |
+
self.max_padding_percent = max_padding_percent
|
| 199 |
+
self.probs = [left_padding_prob, top_padding_prob, right_padding_prob, bottom_padding_prob]
|
| 200 |
+
self.is_fixed_randomness = is_fixed_randomness
|
| 201 |
+
|
| 202 |
+
assert self.min_padding_percent <= self.max_padding_percent
|
| 203 |
+
assert self.max_padding_percent > 0
|
| 204 |
+
assert len([x for x in [self.min_padding_percent, self.max_padding_percent] if (x>=0 and x<=1)]) == 2, "Padding percentage should be in [0,1]"
|
| 205 |
+
assert sum(self.probs) > 0, f"At least one of the padding probs should be greater than 0 - {self.probs}"
|
| 206 |
+
assert len([x for x in self.probs if (x >= 0) and (x <= 1)]) == 4, f"At least one of padding probs is not in [0,1] - {self.probs}"
|
| 207 |
+
if len([x for x in self.probs if x > 0]) == 1:
|
| 208 |
+
LOGGER.warning(f"Only one padding prob is greater than zero - {self.probs}. That means that the outpainting masks will always be on the same side")
|
| 209 |
+
|
| 210 |
+
def apply_padding(self, mask, coord):
|
| 211 |
+
mask[int(coord[0][0]*self.img_h):int(coord[1][0]*self.img_h),
|
| 212 |
+
int(coord[0][1]*self.img_w):int(coord[1][1]*self.img_w)] = 1
|
| 213 |
+
return mask
|
| 214 |
+
|
| 215 |
+
def get_padding(self, size):
|
| 216 |
+
n1 = int(self.min_padding_percent*size)
|
| 217 |
+
n2 = int(self.max_padding_percent*size)
|
| 218 |
+
return self.rnd.randint(n1, n2) / size
|
| 219 |
+
|
| 220 |
+
@staticmethod
|
| 221 |
+
def _img2rs(img):
|
| 222 |
+
arr = np.ascontiguousarray(img.astype(np.uint8))
|
| 223 |
+
str_hash = hashlib.sha1(arr).hexdigest()
|
| 224 |
+
res = hash(str_hash)%(2**32)
|
| 225 |
+
return res
|
| 226 |
+
|
| 227 |
+
def __call__(self, img, iter_i=None, raw_image=None):
|
| 228 |
+
c, self.img_h, self.img_w = img.shape
|
| 229 |
+
mask = np.zeros((self.img_h, self.img_w), np.float32)
|
| 230 |
+
at_least_one_mask_applied = False
|
| 231 |
+
|
| 232 |
+
if self.is_fixed_randomness:
|
| 233 |
+
assert raw_image is not None, "Can't calculate hash on raw_image=None"
|
| 234 |
+
rs = self._img2rs(raw_image)
|
| 235 |
+
self.rnd = np.random.RandomState(rs)
|
| 236 |
+
else:
|
| 237 |
+
self.rnd = np.random
|
| 238 |
+
|
| 239 |
+
coords = [[
|
| 240 |
+
(0,0),
|
| 241 |
+
(1,self.get_padding(size=self.img_h))
|
| 242 |
+
],
|
| 243 |
+
[
|
| 244 |
+
(0,0),
|
| 245 |
+
(self.get_padding(size=self.img_w),1)
|
| 246 |
+
],
|
| 247 |
+
[
|
| 248 |
+
(0,1-self.get_padding(size=self.img_h)),
|
| 249 |
+
(1,1)
|
| 250 |
+
],
|
| 251 |
+
[
|
| 252 |
+
(1-self.get_padding(size=self.img_w),0),
|
| 253 |
+
(1,1)
|
| 254 |
+
]]
|
| 255 |
+
|
| 256 |
+
for pp, coord in zip(self.probs, coords):
|
| 257 |
+
if self.rnd.random() < pp:
|
| 258 |
+
at_least_one_mask_applied = True
|
| 259 |
+
mask = self.apply_padding(mask=mask, coord=coord)
|
| 260 |
+
|
| 261 |
+
if not at_least_one_mask_applied:
|
| 262 |
+
idx = self.rnd.choice(range(len(coords)), p=np.array(self.probs)/sum(self.probs))
|
| 263 |
+
mask = self.apply_padding(mask=mask, coord=coords[idx])
|
| 264 |
+
return mask[None, ...]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
class MixedMaskGenerator:
|
| 268 |
+
def __init__(self, irregular_proba=1/3, irregular_kwargs=None,
|
| 269 |
+
box_proba=1/3, box_kwargs=None,
|
| 270 |
+
segm_proba=1/3, segm_kwargs=None,
|
| 271 |
+
squares_proba=0, squares_kwargs=None,
|
| 272 |
+
superres_proba=0, superres_kwargs=None,
|
| 273 |
+
outpainting_proba=0, outpainting_kwargs=None,
|
| 274 |
+
invert_proba=0):
|
| 275 |
+
self.probas = []
|
| 276 |
+
self.gens = []
|
| 277 |
+
|
| 278 |
+
if irregular_proba > 0:
|
| 279 |
+
self.probas.append(irregular_proba)
|
| 280 |
+
if irregular_kwargs is None:
|
| 281 |
+
irregular_kwargs = {}
|
| 282 |
+
else:
|
| 283 |
+
irregular_kwargs = dict(irregular_kwargs)
|
| 284 |
+
irregular_kwargs['draw_method'] = DrawMethod.LINE
|
| 285 |
+
self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs))
|
| 286 |
+
|
| 287 |
+
if box_proba > 0:
|
| 288 |
+
self.probas.append(box_proba)
|
| 289 |
+
if box_kwargs is None:
|
| 290 |
+
box_kwargs = {}
|
| 291 |
+
self.gens.append(RandomRectangleMaskGenerator(**box_kwargs))
|
| 292 |
+
|
| 293 |
+
# if segm_proba > 0:
|
| 294 |
+
# self.probas.append(segm_proba)
|
| 295 |
+
# if segm_kwargs is None:
|
| 296 |
+
# segm_kwargs = {}
|
| 297 |
+
# self.gens.append(RandomSegmentationMaskGenerator(**segm_kwargs))
|
| 298 |
+
|
| 299 |
+
if squares_proba > 0:
|
| 300 |
+
self.probas.append(squares_proba)
|
| 301 |
+
if squares_kwargs is None:
|
| 302 |
+
squares_kwargs = {}
|
| 303 |
+
else:
|
| 304 |
+
squares_kwargs = dict(squares_kwargs)
|
| 305 |
+
squares_kwargs['draw_method'] = DrawMethod.SQUARE
|
| 306 |
+
self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs))
|
| 307 |
+
|
| 308 |
+
if superres_proba > 0:
|
| 309 |
+
self.probas.append(superres_proba)
|
| 310 |
+
if superres_kwargs is None:
|
| 311 |
+
superres_kwargs = {}
|
| 312 |
+
self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs))
|
| 313 |
+
|
| 314 |
+
if outpainting_proba > 0:
|
| 315 |
+
self.probas.append(outpainting_proba)
|
| 316 |
+
if outpainting_kwargs is None:
|
| 317 |
+
outpainting_kwargs = {}
|
| 318 |
+
self.gens.append(OutpaintingMaskGenerator(**outpainting_kwargs))
|
| 319 |
+
|
| 320 |
+
self.probas = np.array(self.probas, dtype='float32')
|
| 321 |
+
self.probas /= self.probas.sum()
|
| 322 |
+
self.invert_proba = invert_proba
|
| 323 |
+
|
| 324 |
+
def __call__(self, img, iter_i=None, raw_image=None):
|
| 325 |
+
kind = np.random.choice(len(self.probas), p=self.probas)
|
| 326 |
+
gen = self.gens[kind]
|
| 327 |
+
result = gen(img, iter_i=iter_i, raw_image=raw_image)
|
| 328 |
+
if self.invert_proba > 0 and random.random() < self.invert_proba:
|
| 329 |
+
result = 1 - result
|
| 330 |
+
return result
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def get_mask_generator(kind, kwargs):
|
| 334 |
+
if kind is None:
|
| 335 |
+
kind = "mixed"
|
| 336 |
+
if kwargs is None:
|
| 337 |
+
kwargs = {}
|
| 338 |
+
|
| 339 |
+
if kind == "mixed":
|
| 340 |
+
cl = MixedMaskGenerator
|
| 341 |
+
elif kind == "outpainting":
|
| 342 |
+
cl = OutpaintingMaskGenerator
|
| 343 |
+
elif kind == "dumb":
|
| 344 |
+
cl = DumbAreaMaskGenerator
|
| 345 |
+
else:
|
| 346 |
+
raise NotImplementedError(f"No such generator kind = {kind}")
|
| 347 |
+
return cl(**kwargs)
|
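For reference, a minimal usage sketch of the mask generators defined in mask.py above. It is illustrative only and not taken from the repository: it assumes numpy and opencv-python are installed, that the module is importable as `t2i.controlnet_union.mask`, and `dummy_img` plus the kwargs shown are placeholder values.

import numpy as np
from t2i.controlnet_union.mask import get_mask_generator  # assumed import path for this repo layout

# CHW float image stand-in; the generators only look at img.shape[1:] (H, W).
dummy_img = np.zeros((3, 512, 512), dtype=np.float32)

# Outpainting-style mask: fills one or more borders with ones.
outpaint_gen = get_mask_generator("outpainting", {"max_padding_percent": 0.25})
outpaint_mask = outpaint_gen(dummy_img)  # shape (1, 512, 512), values in {0., 1.}

# Mixed generator: samples either irregular strokes or rectangles per call.
mixed_gen = get_mask_generator("mixed", {"irregular_proba": 0.5, "box_proba": 0.5, "segm_proba": 0})
mixed_mask = mixed_gen(dummy_img)  # also shape (1, 512, 512)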
t2i/controlnet_union/models/controlnet_union.py
CHANGED
|
@@ -1,957 +1,957 @@
|
|
| 1 |
-
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
| 2 |
-
#
|
| 3 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
-
# you may not use this file except in compliance with the License.
|
| 5 |
-
# You may obtain a copy of the License at
|
| 6 |
-
#
|
| 7 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
-
#
|
| 9 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
-
# See the License for the specific language governing permissions and
|
| 13 |
-
# limitations under the License.
|
| 14 |
-
from dataclasses import dataclass
|
| 15 |
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 16 |
-
|
| 17 |
-
import torch
|
| 18 |
-
from torch import nn
|
| 19 |
-
from torch.nn import functional as F
|
| 20 |
-
|
| 21 |
-
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 22 |
-
from diffusers.loaders.single_file_model import FromOriginalModelMixin
|
| 23 |
-
from diffusers.utils import BaseOutput, logging
|
| 24 |
-
from diffusers.models.attention_processor import (
|
| 25 |
-
ADDED_KV_ATTENTION_PROCESSORS,
|
| 26 |
-
CROSS_ATTENTION_PROCESSORS,
|
| 27 |
-
AttentionProcessor,
|
| 28 |
-
AttnAddedKVProcessor,
|
| 29 |
-
AttnProcessor,
|
| 30 |
-
)
|
| 31 |
-
from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
|
| 32 |
-
from diffusers.models.modeling_utils import ModelMixin
|
| 33 |
-
from diffusers.models.unets.unet_2d_blocks import (
|
| 34 |
-
CrossAttnDownBlock2D,
|
| 35 |
-
DownBlock2D,
|
| 36 |
-
UNetMidBlock2DCrossAttn,
|
| 37 |
-
get_down_block,
|
| 38 |
-
)
|
| 39 |
-
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
from collections import OrderedDict
|
| 46 |
-
|
| 47 |
-
# Transformer Block
|
| 48 |
-
# Used to exchange info between different conditions and input image
|
| 49 |
-
# With reference to https://github.com/TencentARC/T2I-Adapter/blob/SD/ldm/modules/encoders/adapter.py#L147
|
| 50 |
-
class QuickGELU(nn.Module):
|
| 51 |
-
|
| 52 |
-
def forward(self, x: torch.Tensor):
|
| 53 |
-
return x * torch.sigmoid(1.702 * x)
|
| 54 |
-
|
| 55 |
-
class LayerNorm(nn.LayerNorm):
|
| 56 |
-
"""Subclass torch's LayerNorm to handle fp16."""
|
| 57 |
-
|
| 58 |
-
def forward(self, x: torch.Tensor):
|
| 59 |
-
orig_type = x.dtype
|
| 60 |
-
ret = super().forward(x)
|
| 61 |
-
return ret.type(orig_type)
|
| 62 |
-
|
| 63 |
-
class ResidualAttentionBlock(nn.Module):
|
| 64 |
-
|
| 65 |
-
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
|
| 66 |
-
super().__init__()
|
| 67 |
-
|
| 68 |
-
self.attn = nn.MultiheadAttention(d_model, n_head)
|
| 69 |
-
self.ln_1 = LayerNorm(d_model)
|
| 70 |
-
self.mlp = nn.Sequential(
|
| 71 |
-
OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
|
| 72 |
-
("c_proj", nn.Linear(d_model * 4, d_model))]))
|
| 73 |
-
self.ln_2 = LayerNorm(d_model)
|
| 74 |
-
self.attn_mask = attn_mask
|
| 75 |
-
|
| 76 |
-
def attention(self, x: torch.Tensor):
|
| 77 |
-
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
|
| 78 |
-
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
|
| 79 |
-
|
| 80 |
-
def forward(self, x: torch.Tensor):
|
| 81 |
-
x = x + self.attention(self.ln_1(x))
|
| 82 |
-
x = x + self.mlp(self.ln_2(x))
|
| 83 |
-
return x
|
| 84 |
-
#-----------------------------------------------------------------------------------------------------
|
| 85 |
-
|
| 86 |
-
@dataclass
|
| 87 |
-
class ControlNetOutput(BaseOutput):
|
| 88 |
-
"""
|
| 89 |
-
The output of [`ControlNetModel`].
|
| 90 |
-
|
| 91 |
-
Args:
|
| 92 |
-
down_block_res_samples (`tuple[torch.Tensor]`):
|
| 93 |
-
A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
|
| 94 |
-
be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
|
| 95 |
-
used to condition the original UNet's downsampling activations.
|
| 96 |
-
mid_block_res_sample (`torch.Tensor`):
|
| 97 |
-
The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
|
| 98 |
-
`(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
|
| 99 |
-
Output can be used to condition the original UNet's middle block activation.
|
| 100 |
-
"""
|
| 101 |
-
|
| 102 |
-
down_block_res_samples: Tuple[torch.Tensor]
|
| 103 |
-
mid_block_res_sample: torch.Tensor
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
class ControlNetConditioningEmbedding(nn.Module):
|
| 107 |
-
"""
|
| 108 |
-
Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
| 109 |
-
[11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
|
| 110 |
-
training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
|
| 111 |
-
convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
|
| 112 |
-
(activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
|
| 113 |
-
model) to encode image-space conditions ... into feature maps ..."
|
| 114 |
-
"""
|
| 115 |
-
|
| 116 |
-
# original setting is (16, 32, 96, 256)
|
| 117 |
-
def __init__(
|
| 118 |
-
self,
|
| 119 |
-
conditioning_embedding_channels: int,
|
| 120 |
-
conditioning_channels: int = 3,
|
| 121 |
-
block_out_channels: Tuple[int] = (48, 96, 192, 384),
|
| 122 |
-
):
|
| 123 |
-
super().__init__()
|
| 124 |
-
|
| 125 |
-
self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
|
| 126 |
-
|
| 127 |
-
self.blocks = nn.ModuleList([])
|
| 128 |
-
|
| 129 |
-
for i in range(len(block_out_channels) - 1):
|
| 130 |
-
channel_in = block_out_channels[i]
|
| 131 |
-
channel_out = block_out_channels[i + 1]
|
| 132 |
-
self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
|
| 133 |
-
self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
|
| 134 |
-
|
| 135 |
-
self.conv_out = zero_module(
|
| 136 |
-
nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
def forward(self, conditioning):
|
| 140 |
-
embedding = self.conv_in(conditioning)
|
| 141 |
-
embedding = F.silu(embedding)
|
| 142 |
-
|
| 143 |
-
for block in self.blocks:
|
| 144 |
-
embedding = block(embedding)
|
| 145 |
-
embedding = F.silu(embedding)
|
| 146 |
-
|
| 147 |
-
embedding = self.conv_out(embedding)
|
| 148 |
-
|
| 149 |
-
return embedding
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
class ControlNetModel_Union(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
| 153 |
-
"""
|
| 154 |
-
A ControlNet model.
|
| 155 |
-
|
| 156 |
-
Args:
|
| 157 |
-
in_channels (`int`, defaults to 4):
|
| 158 |
-
The number of channels in the input sample.
|
| 159 |
-
flip_sin_to_cos (`bool`, defaults to `True`):
|
| 160 |
-
Whether to flip the sin to cos in the time embedding.
|
| 161 |
-
freq_shift (`int`, defaults to 0):
|
| 162 |
-
The frequency shift to apply to the time embedding.
|
| 163 |
-
down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
|
| 164 |
-
The tuple of downsample blocks to use.
|
| 165 |
-
only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
|
| 166 |
-
block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
|
| 167 |
-
The tuple of output channels for each block.
|
| 168 |
-
layers_per_block (`int`, defaults to 2):
|
| 169 |
-
The number of layers per block.
|
| 170 |
-
downsample_padding (`int`, defaults to 1):
|
| 171 |
-
The padding to use for the downsampling convolution.
|
| 172 |
-
mid_block_scale_factor (`float`, defaults to 1):
|
| 173 |
-
The scale factor to use for the mid block.
|
| 174 |
-
act_fn (`str`, defaults to "silu"):
|
| 175 |
-
The activation function to use.
|
| 176 |
-
norm_num_groups (`int`, *optional*, defaults to 32):
|
| 177 |
-
The number of groups to use for the normalization. If None, normalization and activation layers are skipped
|
| 178 |
-
in post-processing.
|
| 179 |
-
norm_eps (`float`, defaults to 1e-5):
|
| 180 |
-
The epsilon to use for the normalization.
|
| 181 |
-
cross_attention_dim (`int`, defaults to 1280):
|
| 182 |
-
The dimension of the cross attention features.
|
| 183 |
-
transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
|
| 184 |
-
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
|
| 185 |
-
[`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
|
| 186 |
-
[`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
|
| 187 |
-
encoder_hid_dim (`int`, *optional*, defaults to None):
|
| 188 |
-
If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
|
| 189 |
-
dimension to `cross_attention_dim`.
|
| 190 |
-
encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
|
| 191 |
-
If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
|
| 192 |
-
embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
|
| 193 |
-
attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
|
| 194 |
-
The dimension of the attention heads.
|
| 195 |
-
use_linear_projection (`bool`, defaults to `False`):
|
| 196 |
-
class_embed_type (`str`, *optional*, defaults to `None`):
|
| 197 |
-
The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
|
| 198 |
-
`"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
|
| 199 |
-
addition_embed_type (`str`, *optional*, defaults to `None`):
|
| 200 |
-
Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
|
| 201 |
-
"text". "text" will use the `TextTimeEmbedding` layer.
|
| 202 |
-
num_class_embeds (`int`, *optional*, defaults to 0):
|
| 203 |
-
Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
|
| 204 |
-
class conditioning with `class_embed_type` equal to `None`.
|
| 205 |
-
upcast_attention (`bool`, defaults to `False`):
|
| 206 |
-
resnet_time_scale_shift (`str`, defaults to `"default"`):
|
| 207 |
-
Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
|
| 208 |
-
projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
|
| 209 |
-
The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
|
| 210 |
-
`class_embed_type="projection"`.
|
| 211 |
-
controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
|
| 212 |
-
The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
|
| 213 |
-
conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
|
| 214 |
-
The tuple of output channel for each block in the `conditioning_embedding` layer.
|
| 215 |
-
global_pool_conditions (`bool`, defaults to `False`):
|
| 216 |
-
"""
|
| 217 |
-
|
| 218 |
-
_supports_gradient_checkpointing = True
|
| 219 |
-
|
| 220 |
-
@register_to_config
|
| 221 |
-
def __init__(
|
| 222 |
-
self,
|
| 223 |
-
in_channels: int = 4,
|
| 224 |
-
conditioning_channels: int = 3,
|
| 225 |
-
flip_sin_to_cos: bool = True,
|
| 226 |
-
freq_shift: int = 0,
|
| 227 |
-
down_block_types: Tuple[str] = (
|
| 228 |
-
"CrossAttnDownBlock2D",
|
| 229 |
-
"CrossAttnDownBlock2D",
|
| 230 |
-
"CrossAttnDownBlock2D",
|
| 231 |
-
"DownBlock2D",
|
| 232 |
-
),
|
| 233 |
-
only_cross_attention: Union[bool, Tuple[bool]] = False,
|
| 234 |
-
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
|
| 235 |
-
layers_per_block: int = 2,
|
| 236 |
-
downsample_padding: int = 1,
|
| 237 |
-
mid_block_scale_factor: float = 1,
|
| 238 |
-
act_fn: str = "silu",
|
| 239 |
-
norm_num_groups: Optional[int] = 32,
|
| 240 |
-
norm_eps: float = 1e-5,
|
| 241 |
-
cross_attention_dim: int = 1280,
|
| 242 |
-
transformer_layers_per_block: Union[int, Tuple[int]] = 1,
|
| 243 |
-
encoder_hid_dim: Optional[int] = None,
|
| 244 |
-
encoder_hid_dim_type: Optional[str] = None,
|
| 245 |
-
attention_head_dim: Union[int, Tuple[int]] = 8,
|
| 246 |
-
num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
|
| 247 |
-
use_linear_projection: bool = False,
|
| 248 |
-
class_embed_type: Optional[str] = None,
|
| 249 |
-
addition_embed_type: Optional[str] = None,
|
| 250 |
-
addition_time_embed_dim: Optional[int] = None,
|
| 251 |
-
num_class_embeds: Optional[int] = None,
|
| 252 |
-
upcast_attention: bool = False,
|
| 253 |
-
resnet_time_scale_shift: str = "default",
|
| 254 |
-
projection_class_embeddings_input_dim: Optional[int] = None,
|
| 255 |
-
controlnet_conditioning_channel_order: str = "rgb",
|
| 256 |
-
conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
|
| 257 |
-
global_pool_conditions: bool = False,
|
| 258 |
-
addition_embed_type_num_heads=64,
|
| 259 |
-
num_control_type = 6,
|
| 260 |
-
):
|
| 261 |
-
super().__init__()
|
| 262 |
-
|
| 263 |
-
# If `num_attention_heads` is not defined (which is the case for most models)
|
| 264 |
-
# it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
|
| 265 |
-
# The reason for this behavior is to correct for incorrectly named variables that were introduced
|
| 266 |
-
# when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
|
| 267 |
-
# Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
|
| 268 |
-
# which is why we correct for the naming here.
|
| 269 |
-
num_attention_heads = num_attention_heads or attention_head_dim
|
| 270 |
-
|
| 271 |
-
# Check inputs
|
| 272 |
-
if len(block_out_channels) != len(down_block_types):
|
| 273 |
-
raise ValueError(
|
| 274 |
-
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
|
| 275 |
-
)
|
| 276 |
-
|
| 277 |
-
if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
|
| 278 |
-
raise ValueError(
|
| 279 |
-
f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
|
| 280 |
-
)
|
| 281 |
-
|
| 282 |
-
if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
|
| 283 |
-
raise ValueError(
|
| 284 |
-
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
if isinstance(transformer_layers_per_block, int):
|
| 288 |
-
transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
|
| 289 |
-
|
| 290 |
-
# input
|
| 291 |
-
conv_in_kernel = 3
|
| 292 |
-
conv_in_padding = (conv_in_kernel - 1) // 2
|
| 293 |
-
self.conv_in = nn.Conv2d(
|
| 294 |
-
in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
|
| 295 |
-
)
|
| 296 |
-
|
| 297 |
-
# time
|
| 298 |
-
time_embed_dim = block_out_channels[0] * 4
|
| 299 |
-
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
|
| 300 |
-
timestep_input_dim = block_out_channels[0]
|
| 301 |
-
self.time_embedding = TimestepEmbedding(
|
| 302 |
-
timestep_input_dim,
|
| 303 |
-
time_embed_dim,
|
| 304 |
-
act_fn=act_fn,
|
| 305 |
-
)
|
| 306 |
-
|
| 307 |
-
if encoder_hid_dim_type is None and encoder_hid_dim is not None:
|
| 308 |
-
encoder_hid_dim_type = "text_proj"
|
| 309 |
-
self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
|
| 310 |
-
logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
|
| 311 |
-
|
| 312 |
-
if encoder_hid_dim is None and encoder_hid_dim_type is not None:
|
| 313 |
-
raise ValueError(
|
| 314 |
-
f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
|
| 315 |
-
)
|
| 316 |
-
|
| 317 |
-
if encoder_hid_dim_type == "text_proj":
|
| 318 |
-
self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
|
| 319 |
-
elif encoder_hid_dim_type == "text_image_proj":
|
| 320 |
-
# image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
| 321 |
-
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
| 322 |
-
# case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
|
| 323 |
-
self.encoder_hid_proj = TextImageProjection(
|
| 324 |
-
text_embed_dim=encoder_hid_dim,
|
| 325 |
-
image_embed_dim=cross_attention_dim,
|
| 326 |
-
cross_attention_dim=cross_attention_dim,
|
| 327 |
-
)
|
| 328 |
-
|
| 329 |
-
elif encoder_hid_dim_type is not None:
|
| 330 |
-
raise ValueError(
|
| 331 |
-
f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
|
| 332 |
-
)
|
| 333 |
-
else:
|
| 334 |
-
self.encoder_hid_proj = None
|
| 335 |
-
|
| 336 |
-
# class embedding
|
| 337 |
-
if class_embed_type is None and num_class_embeds is not None:
|
| 338 |
-
self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
|
| 339 |
-
elif class_embed_type == "timestep":
|
| 340 |
-
self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
|
| 341 |
-
elif class_embed_type == "identity":
|
| 342 |
-
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
|
| 343 |
-
elif class_embed_type == "projection":
|
| 344 |
-
if projection_class_embeddings_input_dim is None:
|
| 345 |
-
raise ValueError(
|
| 346 |
-
"`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
|
| 347 |
-
)
|
| 348 |
-
# The projection `class_embed_type` is the same as the timestep `class_embed_type` except
|
| 349 |
-
# 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
|
| 350 |
-
# 2. it projects from an arbitrary input dimension.
|
| 351 |
-
#
|
| 352 |
-
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
|
| 353 |
-
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
|
| 354 |
-
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
|
| 355 |
-
self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
|
| 356 |
-
else:
|
| 357 |
-
self.class_embedding = None
|
| 358 |
-
|
| 359 |
-
if addition_embed_type == "text":
|
| 360 |
-
if encoder_hid_dim is not None:
|
| 361 |
-
text_time_embedding_from_dim = encoder_hid_dim
|
| 362 |
-
else:
|
| 363 |
-
text_time_embedding_from_dim = cross_attention_dim
|
| 364 |
-
|
| 365 |
-
self.add_embedding = TextTimeEmbedding(
|
| 366 |
-
text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
|
| 367 |
-
)
|
| 368 |
-
elif addition_embed_type == "text_image":
|
| 369 |
-
# text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
| 370 |
-
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
| 371 |
-
# case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
|
| 372 |
-
self.add_embedding = TextImageTimeEmbedding(
|
| 373 |
-
text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
|
| 374 |
-
)
|
| 375 |
-
elif addition_embed_type == "text_time":
|
| 376 |
-
self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
|
| 377 |
-
self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
|
| 378 |
-
|
| 379 |
-
elif addition_embed_type is not None:
|
| 380 |
-
raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
|
| 381 |
-
|
| 382 |
-
# control net conditioning embedding
|
| 383 |
-
self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
|
| 384 |
-
conditioning_embedding_channels=block_out_channels[0],
|
| 385 |
-
block_out_channels=conditioning_embedding_out_channels,
|
| 386 |
-
conditioning_channels=conditioning_channels,
|
| 387 |
-
)
|
| 388 |
-
|
| 389 |
-
# Copyright by Qi Xin(2024/07/06)
|
| 390 |
-
# Condition Transformer(fuse single/multi conditions with input image)
|
| 391 |
-
# The Condition Transformer augments the feature representation of conditions
|
| 392 |
-
# The overall design is somewhat like a resnet. The output of the Condition Transformer is used to predict a condition bias that is added to the original condition feature.
|
| 393 |
-
# num_control_type = 6
|
| 394 |
-
num_trans_channel = 320
|
| 395 |
-
num_trans_head = 8
|
| 396 |
-
num_trans_layer = 1
|
| 397 |
-
num_proj_channel = 320
|
| 398 |
-
task_scale_factor = num_trans_channel ** 0.5
|
| 399 |
-
|
| 400 |
-
self.task_embedding = nn.Parameter(task_scale_factor * torch.randn(num_control_type, num_trans_channel))
|
| 401 |
-
self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(num_trans_channel, num_trans_head) for _ in range(num_trans_layer)])
|
| 402 |
-
self.spatial_ch_projs = zero_module(nn.Linear(num_trans_channel, num_proj_channel))
|
| 403 |
-
#-----------------------------------------------------------------------------------------------------
|
| 404 |
-
|
| 405 |
-
# Copyright by Qi Xin(2024/07/06)
|
| 406 |
-
# Control Encoder to distinguish different control conditions
|
| 407 |
-
# A simple but effective module, consisting of an embedding layer and a linear layer, that injects the control type info into the time embedding.
|
| 408 |
-
self.control_type_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
|
| 409 |
-
self.control_add_embedding = TimestepEmbedding(addition_time_embed_dim * num_control_type, time_embed_dim)
|
| 410 |
-
#-----------------------------------------------------------------------------------------------------
|
| 411 |
-
|
| 412 |
-
self.down_blocks = nn.ModuleList([])
|
| 413 |
-
self.controlnet_down_blocks = nn.ModuleList([])
|
| 414 |
-
|
| 415 |
-
if isinstance(only_cross_attention, bool):
|
| 416 |
-
only_cross_attention = [only_cross_attention] * len(down_block_types)
|
| 417 |
-
|
| 418 |
-
if isinstance(attention_head_dim, int):
|
| 419 |
-
attention_head_dim = (attention_head_dim,) * len(down_block_types)
|
| 420 |
-
|
| 421 |
-
if isinstance(num_attention_heads, int):
|
| 422 |
-
num_attention_heads = (num_attention_heads,) * len(down_block_types)
|
| 423 |
-
|
| 424 |
-
# down
|
| 425 |
-
output_channel = block_out_channels[0]
|
| 426 |
-
|
| 427 |
-
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 428 |
-
controlnet_block = zero_module(controlnet_block)
|
| 429 |
-
self.controlnet_down_blocks.append(controlnet_block)
|
| 430 |
-
|
| 431 |
-
for i, down_block_type in enumerate(down_block_types):
|
| 432 |
-
input_channel = output_channel
|
| 433 |
-
output_channel = block_out_channels[i]
|
| 434 |
-
is_final_block = i == len(block_out_channels) - 1
|
| 435 |
-
|
| 436 |
-
down_block = get_down_block(
|
| 437 |
-
down_block_type,
|
| 438 |
-
num_layers=layers_per_block,
|
| 439 |
-
transformer_layers_per_block=transformer_layers_per_block[i],
|
| 440 |
-
in_channels=input_channel,
|
| 441 |
-
out_channels=output_channel,
|
| 442 |
-
temb_channels=time_embed_dim,
|
| 443 |
-
add_downsample=not is_final_block,
|
| 444 |
-
resnet_eps=norm_eps,
|
| 445 |
-
resnet_act_fn=act_fn,
|
| 446 |
-
resnet_groups=norm_num_groups,
|
| 447 |
-
cross_attention_dim=cross_attention_dim,
|
| 448 |
-
num_attention_heads=num_attention_heads[i],
|
| 449 |
-
attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
|
| 450 |
-
downsample_padding=downsample_padding,
|
| 451 |
-
use_linear_projection=use_linear_projection,
|
| 452 |
-
only_cross_attention=only_cross_attention[i],
|
| 453 |
-
upcast_attention=upcast_attention,
|
| 454 |
-
resnet_time_scale_shift=resnet_time_scale_shift,
|
| 455 |
-
)
|
| 456 |
-
self.down_blocks.append(down_block)
|
| 457 |
-
|
| 458 |
-
for _ in range(layers_per_block):
|
| 459 |
-
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 460 |
-
controlnet_block = zero_module(controlnet_block)
|
| 461 |
-
self.controlnet_down_blocks.append(controlnet_block)
|
| 462 |
-
|
| 463 |
-
if not is_final_block:
|
| 464 |
-
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 465 |
-
controlnet_block = zero_module(controlnet_block)
|
| 466 |
-
self.controlnet_down_blocks.append(controlnet_block)
|
| 467 |
-
|
| 468 |
-
# mid
|
| 469 |
-
mid_block_channel = block_out_channels[-1]
|
| 470 |
-
|
| 471 |
-
controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
|
| 472 |
-
controlnet_block = zero_module(controlnet_block)
|
| 473 |
-
self.controlnet_mid_block = controlnet_block
|
| 474 |
-
|
| 475 |
-
self.mid_block = UNetMidBlock2DCrossAttn(
|
| 476 |
-
transformer_layers_per_block=transformer_layers_per_block[-1],
|
| 477 |
-
in_channels=mid_block_channel,
|
| 478 |
-
temb_channels=time_embed_dim,
|
| 479 |
-
resnet_eps=norm_eps,
|
| 480 |
-
resnet_act_fn=act_fn,
|
| 481 |
-
output_scale_factor=mid_block_scale_factor,
|
| 482 |
-
resnet_time_scale_shift=resnet_time_scale_shift,
|
| 483 |
-
cross_attention_dim=cross_attention_dim,
|
| 484 |
-
num_attention_heads=num_attention_heads[-1],
|
| 485 |
-
resnet_groups=norm_num_groups,
|
| 486 |
-
use_linear_projection=use_linear_projection,
|
| 487 |
-
upcast_attention=upcast_attention,
|
| 488 |
-
)
|
| 489 |
-
|
| 490 |
-
@classmethod
|
| 491 |
-
def from_unet(
|
| 492 |
-
cls,
|
| 493 |
-
unet: UNet2DConditionModel,
|
| 494 |
-
controlnet_conditioning_channel_order: str = "rgb",
|
| 495 |
-
conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
|
| 496 |
-
load_weights_from_unet: bool = True,
|
| 497 |
-
):
|
| 498 |
-
r"""
|
| 499 |
-
Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
|
| 500 |
-
|
| 501 |
-
Parameters:
|
| 502 |
-
unet (`UNet2DConditionModel`):
|
| 503 |
-
The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
|
| 504 |
-
where applicable.
|
| 505 |
-
"""
|
| 506 |
-
transformer_layers_per_block = (
|
| 507 |
-
unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
|
| 508 |
-
)
|
| 509 |
-
encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
|
| 510 |
-
encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
|
| 511 |
-
addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
|
| 512 |
-
addition_time_embed_dim = (
|
| 513 |
-
unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
|
| 514 |
-
)
|
| 515 |
-
|
| 516 |
-
controlnet = cls(
|
| 517 |
-
encoder_hid_dim=encoder_hid_dim,
|
| 518 |
-
encoder_hid_dim_type=encoder_hid_dim_type,
|
| 519 |
-
addition_embed_type=addition_embed_type,
|
| 520 |
-
addition_time_embed_dim=addition_time_embed_dim,
|
| 521 |
-
transformer_layers_per_block=transformer_layers_per_block,
|
| 522 |
-
# transformer_layers_per_block=[1, 2, 5],
|
| 523 |
-
in_channels=unet.config.in_channels,
|
| 524 |
-
flip_sin_to_cos=unet.config.flip_sin_to_cos,
|
| 525 |
-
freq_shift=unet.config.freq_shift,
|
| 526 |
-
down_block_types=unet.config.down_block_types,
|
| 527 |
-
only_cross_attention=unet.config.only_cross_attention,
|
| 528 |
-
block_out_channels=unet.config.block_out_channels,
|
| 529 |
-
layers_per_block=unet.config.layers_per_block,
|
| 530 |
-
downsample_padding=unet.config.downsample_padding,
|
| 531 |
-
mid_block_scale_factor=unet.config.mid_block_scale_factor,
|
| 532 |
-
act_fn=unet.config.act_fn,
|
| 533 |
-
norm_num_groups=unet.config.norm_num_groups,
|
| 534 |
-
norm_eps=unet.config.norm_eps,
|
| 535 |
-
cross_attention_dim=unet.config.cross_attention_dim,
|
| 536 |
-
attention_head_dim=unet.config.attention_head_dim,
|
| 537 |
-
num_attention_heads=unet.config.num_attention_heads,
|
| 538 |
-
use_linear_projection=unet.config.use_linear_projection,
|
| 539 |
-
class_embed_type=unet.config.class_embed_type,
|
| 540 |
-
num_class_embeds=unet.config.num_class_embeds,
|
| 541 |
-
upcast_attention=unet.config.upcast_attention,
|
| 542 |
-
resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
|
| 543 |
-
projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
|
| 544 |
-
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
|
| 545 |
-
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
| 546 |
-
)
|
| 547 |
-
|
| 548 |
-
if load_weights_from_unet:
|
| 549 |
-
controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
|
| 550 |
-
controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
|
| 551 |
-
controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
|
| 552 |
-
|
| 553 |
-
if controlnet.class_embedding:
|
| 554 |
-
controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
|
| 555 |
-
|
| 556 |
-
controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
|
| 557 |
-
controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
|
| 558 |
-
|
| 559 |
-
return controlnet
|
| 560 |
-
|
| 561 |
-
@property
|
| 562 |
-
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
|
| 563 |
-
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
| 564 |
-
r"""
|
| 565 |
-
Returns:
|
| 566 |
-
`dict` of attention processors: A dictionary containing all attention processors used in the model,
|
| 567 |
-
indexed by their weight names.
|
| 568 |
-
"""
|
| 569 |
-
# set recursively
|
| 570 |
-
processors = {}
|
| 571 |
-
|
| 572 |
-
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
| 573 |
-
if hasattr(module, "get_processor"):
|
| 574 |
-
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
|
| 575 |
-
|
| 576 |
-
for sub_name, child in module.named_children():
|
| 577 |
-
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
| 578 |
-
|
| 579 |
-
return processors
|
| 580 |
-
|
| 581 |
-
for name, module in self.named_children():
|
| 582 |
-
fn_recursive_add_processors(name, module, processors)
|
| 583 |
-
|
| 584 |
-
return processors
|
| 585 |
-
|
| 586 |
-
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
| 587 |
-
def set_attn_processor(
|
| 588 |
-
self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
|
| 589 |
-
):
|
| 590 |
-
r"""
|
| 591 |
-
Sets the attention processor to use to compute attention.
|
| 592 |
-
|
| 593 |
-
Parameters:
|
| 594 |
-
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
| 595 |
-
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
| 596 |
-
for **all** `Attention` layers.
|
| 597 |
-
|
| 598 |
-
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
| 599 |
-
processor. This is strongly recommended when setting trainable attention processors.
|
| 600 |
-
|
| 601 |
-
"""
|
| 602 |
-
count = len(self.attn_processors.keys())
|
| 603 |
-
|
| 604 |
-
if isinstance(processor, dict) and len(processor) != count:
|
| 605 |
-
raise ValueError(
|
| 606 |
-
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
| 607 |
-
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
| 608 |
-
)
|
| 609 |
-
|
| 610 |
-
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
| 611 |
-
if hasattr(module, "set_processor"):
|
| 612 |
-
if not isinstance(processor, dict):
|
| 613 |
-
module.set_processor(processor, _remove_lora=_remove_lora)
|
| 614 |
-
else:
|
| 615 |
-
module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
|
| 616 |
-
|
| 617 |
-
for sub_name, child in module.named_children():
|
| 618 |
-
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
| 619 |
-
|
| 620 |
-
for name, module in self.named_children():
|
| 621 |
-
fn_recursive_attn_processor(name, module, processor)
|
| 622 |
-
|
| 623 |
-
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
|
| 624 |
-
def set_default_attn_processor(self):
|
| 625 |
-
"""
|
| 626 |
-
Disables custom attention processors and sets the default attention implementation.
|
| 627 |
-
"""
|
| 628 |
-
if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
| 629 |
-
processor = AttnAddedKVProcessor()
|
| 630 |
-
elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
| 631 |
-
processor = AttnProcessor()
|
| 632 |
-
else:
|
| 633 |
-
raise ValueError(
|
| 634 |
-
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
|
| 635 |
-
)
|
| 636 |
-
|
| 637 |
-
self.set_attn_processor(processor, _remove_lora=True)
|
| 638 |
-
|
| 639 |
-
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
|
| 640 |
-
def set_attention_slice(self, slice_size):
|
| 641 |
-
r"""
|
| 642 |
-
Enable sliced attention computation.
|
| 643 |
-
|
| 644 |
-
When this option is enabled, the attention module splits the input tensor in slices to compute attention in
|
| 645 |
-
several steps. This is useful for saving some memory in exchange for a small decrease in speed.
|
| 646 |
-
|
| 647 |
-
Args:
|
| 648 |
-
slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
|
| 649 |
-
When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
|
| 650 |
-
`"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
|
| 651 |
-
provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
|
| 652 |
-
must be a multiple of `slice_size`.
|
| 653 |
-
"""
|
| 654 |
-
sliceable_head_dims = []
|
| 655 |
-
|
| 656 |
-
def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
|
| 657 |
-
if hasattr(module, "set_attention_slice"):
|
| 658 |
-
sliceable_head_dims.append(module.sliceable_head_dim)
|
| 659 |
-
|
| 660 |
-
for child in module.children():
|
| 661 |
-
fn_recursive_retrieve_sliceable_dims(child)
|
| 662 |
-
|
| 663 |
-
# retrieve number of attention layers
|
| 664 |
-
for module in self.children():
|
| 665 |
-
fn_recursive_retrieve_sliceable_dims(module)
|
| 666 |
-
|
| 667 |
-
num_sliceable_layers = len(sliceable_head_dims)
|
| 668 |
-
|
| 669 |
-
if slice_size == "auto":
|
| 670 |
-
# half the attention head size is usually a good trade-off between
|
| 671 |
-
# speed and memory
|
| 672 |
-
slice_size = [dim // 2 for dim in sliceable_head_dims]
|
| 673 |
-
elif slice_size == "max":
|
| 674 |
-
# make smallest slice possible
|
| 675 |
-
slice_size = num_sliceable_layers * [1]
|
| 676 |
-
|
| 677 |
-
slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
|
| 678 |
-
|
| 679 |
-
if len(slice_size) != len(sliceable_head_dims):
|
| 680 |
-
raise ValueError(
|
| 681 |
-
f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
|
| 682 |
-
f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
|
| 683 |
-
)
|
| 684 |
-
|
| 685 |
-
for i in range(len(slice_size)):
|
| 686 |
-
size = slice_size[i]
|
| 687 |
-
dim = sliceable_head_dims[i]
|
| 688 |
-
if size is not None and size > dim:
|
| 689 |
-
raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
|
| 690 |
-
|
| 691 |
-
# Recursively walk through all the children.
|
| 692 |
-
# Any children which exposes the set_attention_slice method
|
| 693 |
-
# gets the message
|
| 694 |
-
def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
|
| 695 |
-
if hasattr(module, "set_attention_slice"):
|
| 696 |
-
module.set_attention_slice(slice_size.pop())
|
| 697 |
-
|
| 698 |
-
for child in module.children():
|
| 699 |
-
fn_recursive_set_attention_slice(child, slice_size)
|
| 700 |
-
|
| 701 |
-
reversed_slice_size = list(reversed(slice_size))
|
| 702 |
-
for module in self.children():
|
| 703 |
-
fn_recursive_set_attention_slice(module, reversed_slice_size)
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
def _set_gradient_checkpointing(self, module, value=False):
|
| 707 |
-
if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
|
| 708 |
-
module.gradient_checkpointing = value
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
def forward(
|
| 712 |
-
self,
|
| 713 |
-
sample: torch.FloatTensor,
|
| 714 |
-
timestep: Union[torch.Tensor, float, int],
|
| 715 |
-
encoder_hidden_states: torch.Tensor,
|
| 716 |
-
controlnet_cond_list: torch.FloatTensor,
|
| 717 |
-
conditioning_scale: float = 1.0,
|
| 718 |
-
class_labels: Optional[torch.Tensor] = None,
|
| 719 |
-
timestep_cond: Optional[torch.Tensor] = None,
|
| 720 |
-
attention_mask: Optional[torch.Tensor] = None,
|
| 721 |
-
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
| 722 |
-
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
| 723 |
-
guess_mode: bool = False,
|
| 724 |
-
return_dict: bool = True,
|
| 725 |
-
) -> Union[ControlNetOutput, Tuple]:
|
| 726 |
-
"""
|
| 727 |
-
The [`ControlNetModel`] forward method.
|
| 728 |
-
|
| 729 |
-
Args:
|
| 730 |
-
sample (`torch.FloatTensor`):
|
| 731 |
-
The noisy input tensor.
|
| 732 |
-
timestep (`Union[torch.Tensor, float, int]`):
|
| 733 |
-
The number of timesteps to denoise an input.
|
| 734 |
-
encoder_hidden_states (`torch.Tensor`):
|
| 735 |
-
The encoder hidden states.
|
| 736 |
-
controlnet_cond_list (`torch.FloatTensor`):
|
| 737 |
-
The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
|
| 738 |
-
conditioning_scale (`float`, defaults to `1.0`):
|
| 739 |
-
The scale factor for ControlNet outputs.
|
| 740 |
-
class_labels (`torch.Tensor`, *optional*, defaults to `None`):
|
| 741 |
-
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
|
| 742 |
-
timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
|
| 743 |
-
Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
|
| 744 |
-
timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
|
| 745 |
-
embeddings.
|
| 746 |
-
attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
|
| 747 |
-
An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
|
| 748 |
-
is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
|
| 749 |
-
negative values to the attention scores corresponding to "discard" tokens.
|
| 750 |
-
added_cond_kwargs (`dict`):
|
| 751 |
-
Additional conditions for the Stable Diffusion XL UNet.
|
| 752 |
-
cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
|
| 753 |
-
A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
|
| 754 |
-
guess_mode (`bool`, defaults to `False`):
|
| 755 |
-
In this mode, the ControlNet encoder tries its best to recognize the content of the input even if
|
| 756 |
-
you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
|
| 757 |
-
return_dict (`bool`, defaults to `True`):
|
| 758 |
-
Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
|
| 759 |
-
|
| 760 |
-
Returns:
|
| 761 |
-
[`~models.controlnet.ControlNetOutput`] **or** `tuple`:
|
| 762 |
-
If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
|
| 763 |
-
returned where the first element is the sample tensor.
|
| 764 |
-
"""
|
| 765 |
-
# check channel order
|
| 766 |
-
channel_order = self.config.controlnet_conditioning_channel_order
|
| 767 |
-
|
| 768 |
-
if channel_order == "rgb":
|
| 769 |
-
# in rgb order by default
|
| 770 |
-
...
|
| 771 |
-
# elif channel_order == "bgr":
|
| 772 |
-
# controlnet_cond = torch.flip(controlnet_cond, dims=[1])
|
| 773 |
-
else:
|
| 774 |
-
raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
|
| 775 |
-
|
| 776 |
-
# prepare attention_mask
|
| 777 |
-
if attention_mask is not None:
|
| 778 |
-
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
|
| 779 |
-
attention_mask = attention_mask.unsqueeze(1)
|
| 780 |
-
|
| 781 |
-
# 1. time
|
| 782 |
-
timesteps = timestep
|
| 783 |
-
if not torch.is_tensor(timesteps):
|
| 784 |
-
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
|
| 785 |
-
# This would be a good case for the `match` statement (Python 3.10+)
|
| 786 |
-
is_mps = sample.device.type == "mps"
|
| 787 |
-
if isinstance(timestep, float):
|
| 788 |
-
dtype = torch.float32 if is_mps else torch.float64
|
| 789 |
-
else:
|
| 790 |
-
dtype = torch.int32 if is_mps else torch.int64
|
| 791 |
-
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
|
| 792 |
-
elif len(timesteps.shape) == 0:
|
| 793 |
-
timesteps = timesteps[None].to(sample.device)
|
| 794 |
-
|
| 795 |
-
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
| 796 |
-
timesteps = timesteps.expand(sample.shape[0])
|
| 797 |
-
|
| 798 |
-
t_emb = self.time_proj(timesteps)
|
| 799 |
-
|
| 800 |
-
# timesteps does not contain any weights and will always return f32 tensors
|
| 801 |
-
# but time_embedding might actually be running in fp16. so we need to cast here.
|
| 802 |
-
# there might be better ways to encapsulate this.
|
| 803 |
-
t_emb = t_emb.to(dtype=sample.dtype)
|
| 804 |
-
|
| 805 |
-
emb = self.time_embedding(t_emb, timestep_cond)
|
| 806 |
-
aug_emb = None
|
| 807 |
-
|
| 808 |
-
if self.class_embedding is not None:
|
| 809 |
-
if class_labels is None:
|
| 810 |
-
raise ValueError("class_labels should be provided when num_class_embeds > 0")
|
| 811 |
-
|
| 812 |
-
if self.config.class_embed_type == "timestep":
|
| 813 |
-
class_labels = self.time_proj(class_labels)
|
| 814 |
-
|
| 815 |
-
class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
|
| 816 |
-
emb = emb + class_emb
|
| 817 |
-
|
| 818 |
-
if self.config.addition_embed_type is not None:
|
| 819 |
-
if self.config.addition_embed_type == "text":
|
| 820 |
-
aug_emb = self.add_embedding(encoder_hidden_states)
|
| 821 |
-
|
| 822 |
-
elif self.config.addition_embed_type == "text_time":
|
| 823 |
-
if "text_embeds" not in added_cond_kwargs:
|
| 824 |
-
raise ValueError(
|
| 825 |
-
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
| 826 |
-
)
|
| 827 |
-
text_embeds = added_cond_kwargs.get("text_embeds")
|
| 828 |
-
if "time_ids" not in added_cond_kwargs:
|
| 829 |
-
raise ValueError(
|
| 830 |
-
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
| 831 |
-
)
|
| 832 |
-
time_ids = added_cond_kwargs.get("time_ids")
|
| 833 |
-
time_embeds = self.add_time_proj(time_ids.flatten())
|
| 834 |
-
time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
|
| 835 |
-
|
| 836 |
-
add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
|
| 837 |
-
add_embeds = add_embeds.to(emb.dtype)
|
| 838 |
-
aug_emb = self.add_embedding(add_embeds)
|
| 839 |
-
|
| 840 |
-
# Copyright by Qi Xin(2024/07/06)
|
| 841 |
-
# inject control type info to time embedding to distinguish different control conditions
|
| 842 |
-
control_type = added_cond_kwargs.get('control_type')
|
| 843 |
-
control_embeds = self.control_type_proj(control_type.flatten())
|
| 844 |
-
control_embeds = control_embeds.reshape((t_emb.shape[0], -1))
|
| 845 |
-
control_embeds = control_embeds.to(emb.dtype)
|
| 846 |
-
control_emb = self.control_add_embedding(control_embeds)
|
| 847 |
-
emb = emb + control_emb
|
| 848 |
-
#---------------------------------------------------------------------------------
|
| 849 |
-
|
| 850 |
-
emb = emb + aug_emb if aug_emb is not None else emb
|
| 851 |
-
|
| 852 |
-
# 2. pre-process
|
| 853 |
-
sample = self.conv_in(sample)
|
| 854 |
-
indices = torch.nonzero(control_type[0])
|
| 855 |
-
|
| 856 |
-
# Copyright by Qi Xin(2024/07/06)
|
| 857 |
-
# add single/multi conditions to the input image.
|
| 858 |
-
# Condition Transformer provides an easy and effective way to fuse different features naturally
|
| 859 |
-
inputs = []
|
| 860 |
-
condition_list = []
|
| 861 |
-
|
| 862 |
-
for idx in range(indices.shape[0] + 1):
|
| 863 |
-
if idx == indices.shape[0]:
|
| 864 |
-
controlnet_cond = sample
|
| 865 |
-
feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
|
| 866 |
-
else:
|
| 867 |
-
controlnet_cond = self.controlnet_cond_embedding(controlnet_cond_list[indices[idx][0]])
|
| 868 |
-
feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
|
| 869 |
-
feat_seq = feat_seq + self.task_embedding[indices[idx][0]]
|
| 870 |
-
|
| 871 |
-
inputs.append(feat_seq.unsqueeze(1))
|
| 872 |
-
condition_list.append(controlnet_cond)
|
| 873 |
-
|
| 874 |
-
x = torch.cat(inputs, dim=1) # NxLxC
|
| 875 |
-
x = self.transformer_layes(x)
|
| 876 |
-
|
| 877 |
-
controlnet_cond_fuser = sample * 0.0
|
| 878 |
-
for idx in range(indices.shape[0]):
|
| 879 |
-
alpha = self.spatial_ch_projs(x[:, idx])
|
| 880 |
-
alpha = alpha.unsqueeze(-1).unsqueeze(-1)
|
| 881 |
-
controlnet_cond_fuser += condition_list[idx] + alpha
|
| 882 |
-
|
| 883 |
-
sample = sample + controlnet_cond_fuser
|
| 884 |
-
#-------------------------------------------------------------------------------------------
|
| 885 |
-
|
| 886 |
-
# 3. down
|
| 887 |
-
down_block_res_samples = (sample,)
|
| 888 |
-
for downsample_block in self.down_blocks:
|
| 889 |
-
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
|
| 890 |
-
sample, res_samples = downsample_block(
|
| 891 |
-
hidden_states=sample,
|
| 892 |
-
temb=emb,
|
| 893 |
-
encoder_hidden_states=encoder_hidden_states,
|
| 894 |
-
attention_mask=attention_mask,
|
| 895 |
-
cross_attention_kwargs=cross_attention_kwargs,
|
| 896 |
-
)
|
| 897 |
-
else:
|
| 898 |
-
sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
|
| 899 |
-
|
| 900 |
-
down_block_res_samples += res_samples
|
| 901 |
-
|
| 902 |
-
# 4. mid
|
| 903 |
-
if self.mid_block is not None:
|
| 904 |
-
sample = self.mid_block(
|
| 905 |
-
sample,
|
| 906 |
-
emb,
|
| 907 |
-
encoder_hidden_states=encoder_hidden_states,
|
| 908 |
-
attention_mask=attention_mask,
|
| 909 |
-
cross_attention_kwargs=cross_attention_kwargs,
|
| 910 |
-
)
|
| 911 |
-
|
| 912 |
-
# 5. Control net blocks
|
| 913 |
-
|
| 914 |
-
controlnet_down_block_res_samples = ()
|
| 915 |
-
|
| 916 |
-
for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
|
| 917 |
-
down_block_res_sample = controlnet_block(down_block_res_sample)
|
| 918 |
-
controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
|
| 919 |
-
|
| 920 |
-
down_block_res_samples = controlnet_down_block_res_samples
|
| 921 |
-
|
| 922 |
-
mid_block_res_sample = self.controlnet_mid_block(sample)
|
| 923 |
-
|
| 924 |
-
# 6. scaling
|
| 925 |
-
if guess_mode and not self.config.global_pool_conditions:
|
| 926 |
-
scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
|
| 927 |
-
scales = scales * conditioning_scale
|
| 928 |
-
down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
|
| 929 |
-
mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
|
| 930 |
-
else:
|
| 931 |
-
down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
|
| 932 |
-
mid_block_res_sample = mid_block_res_sample * conditioning_scale
|
| 933 |
-
|
| 934 |
-
if self.config.global_pool_conditions:
|
| 935 |
-
down_block_res_samples = [
|
| 936 |
-
torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
|
| 937 |
-
]
|
| 938 |
-
mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
|
| 939 |
-
|
| 940 |
-
if not return_dict:
|
| 941 |
-
return (down_block_res_samples, mid_block_res_sample)
|
| 942 |
-
|
| 943 |
-
return ControlNetOutput(
|
| 944 |
-
down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
|
| 945 |
-
)
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
def zero_module(module):
|
| 950 |
-
for p in module.parameters():
|
| 951 |
-
nn.init.zeros_(p)
|
| 952 |
-
return module
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
|
|
|
| 1 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
from torch import nn
|
| 19 |
+
from torch.nn import functional as F
|
| 20 |
+
|
| 21 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 22 |
+
from diffusers.loaders.single_file_model import FromOriginalModelMixin
|
| 23 |
+
from diffusers.utils import BaseOutput, logging
|
| 24 |
+
from diffusers.models.attention_processor import (
|
| 25 |
+
ADDED_KV_ATTENTION_PROCESSORS,
|
| 26 |
+
CROSS_ATTENTION_PROCESSORS,
|
| 27 |
+
AttentionProcessor,
|
| 28 |
+
AttnAddedKVProcessor,
|
| 29 |
+
AttnProcessor,
|
| 30 |
+
)
|
| 31 |
+
from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
|
| 32 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 33 |
+
from diffusers.models.unets.unet_2d_blocks import (
|
| 34 |
+
CrossAttnDownBlock2D,
|
| 35 |
+
DownBlock2D,
|
| 36 |
+
UNetMidBlock2DCrossAttn,
|
| 37 |
+
get_down_block,
|
| 38 |
+
)
|
| 39 |
+
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
from collections import OrderedDict
|
| 46 |
+
|
| 47 |
+
# Transformer Block
|
| 48 |
+
# Used to exchange info between different conditions and input image
|
| 49 |
+
# With reference to https://github.com/TencentARC/T2I-Adapter/blob/SD/ldm/modules/encoders/adapter.py#L147
|
| 50 |
+
class QuickGELU(nn.Module):
|
| 51 |
+
|
| 52 |
+
def forward(self, x: torch.Tensor):
|
| 53 |
+
return x * torch.sigmoid(1.702 * x)
|
| 54 |
+
|
| 55 |
+
class LayerNorm(nn.LayerNorm):
|
| 56 |
+
"""Subclass torch's LayerNorm to handle fp16."""
|
| 57 |
+
|
| 58 |
+
def forward(self, x: torch.Tensor):
|
| 59 |
+
orig_type = x.dtype
|
| 60 |
+
ret = super().forward(x)
|
| 61 |
+
return ret.type(orig_type)
|
| 62 |
+
|
| 63 |
+
class ResidualAttentionBlock(nn.Module):
|
| 64 |
+
|
| 65 |
+
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
|
| 66 |
+
super().__init__()
|
| 67 |
+
|
| 68 |
+
self.attn = nn.MultiheadAttention(d_model, n_head)
|
| 69 |
+
self.ln_1 = LayerNorm(d_model)
|
| 70 |
+
self.mlp = nn.Sequential(
|
| 71 |
+
OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
|
| 72 |
+
("c_proj", nn.Linear(d_model * 4, d_model))]))
|
| 73 |
+
self.ln_2 = LayerNorm(d_model)
|
| 74 |
+
self.attn_mask = attn_mask
|
| 75 |
+
|
| 76 |
+
def attention(self, x: torch.Tensor):
|
| 77 |
+
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
|
| 78 |
+
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
|
| 79 |
+
|
| 80 |
+
def forward(self, x: torch.Tensor):
|
| 81 |
+
x = x + self.attention(self.ln_1(x))
|
| 82 |
+
x = x + self.mlp(self.ln_2(x))
|
| 83 |
+
return x
|
| 84 |
+
#-----------------------------------------------------------------------------------------------------
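A minimal sanity check of the QuickGELU approximation used above, assuming only PyTorch (illustrative, not taken from this repo):

import torch
from torch import nn

x = torch.linspace(-4.0, 4.0, steps=9)
quick = x * torch.sigmoid(1.702 * x)   # QuickGELU as defined above
exact = nn.functional.gelu(x)          # exact erf-based GELU
print((quick - exact).abs().max())     # roughly 1e-2: a cheap but close approximation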
|
| 85 |
+
|
| 86 |
+
@dataclass
|
| 87 |
+
class ControlNetOutput(BaseOutput):
|
| 88 |
+
"""
|
| 89 |
+
The output of [`ControlNetModel`].
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
down_block_res_samples (`tuple[torch.Tensor]`):
|
| 93 |
+
A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
|
| 94 |
+
be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
|
| 95 |
+
used to condition the original UNet's downsampling activations.
|
| 96 |
+
mid_block_res_sample (`torch.Tensor`):
|
| 97 |
+
The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
|
| 98 |
+
`(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
|
| 99 |
+
Output can be used to condition the original UNet's middle block activation.
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
down_block_res_samples: Tuple[torch.Tensor]
|
| 103 |
+
mid_block_res_sample: torch.Tensor
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ControlNetConditioningEmbedding(nn.Module):
|
| 107 |
+
"""
|
| 108 |
+
Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
|
| 109 |
+
[11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
|
| 110 |
+
training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
|
| 111 |
+
convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
|
| 112 |
+
(activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
|
| 113 |
+
model) to encode image-space conditions ... into feature maps ..."
|
| 114 |
+
"""
|
| 115 |
+
|
| 116 |
+
# original setting is (16, 32, 96, 256)
|
| 117 |
+
def __init__(
|
| 118 |
+
self,
|
| 119 |
+
conditioning_embedding_channels: int,
|
| 120 |
+
conditioning_channels: int = 3,
|
| 121 |
+
block_out_channels: Tuple[int] = (48, 96, 192, 384),
|
| 122 |
+
):
|
| 123 |
+
super().__init__()
|
| 124 |
+
|
| 125 |
+
self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
|
| 126 |
+
|
| 127 |
+
self.blocks = nn.ModuleList([])
|
| 128 |
+
|
| 129 |
+
for i in range(len(block_out_channels) - 1):
|
| 130 |
+
channel_in = block_out_channels[i]
|
| 131 |
+
channel_out = block_out_channels[i + 1]
|
| 132 |
+
self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
|
| 133 |
+
self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
|
| 134 |
+
|
| 135 |
+
self.conv_out = zero_module(
|
| 136 |
+
nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
def forward(self, conditioning):
|
| 140 |
+
embedding = self.conv_in(conditioning)
|
| 141 |
+
embedding = F.silu(embedding)
|
| 142 |
+
|
| 143 |
+
for block in self.blocks:
|
| 144 |
+
embedding = block(embedding)
|
| 145 |
+
embedding = F.silu(embedding)
|
| 146 |
+
|
| 147 |
+
embedding = self.conv_out(embedding)
|
| 148 |
+
|
| 149 |
+
return embedding
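A quick shape check for the conditioning embedder above, assuming the default block_out_channels=(48, 96, 192, 384) from this file and a 1024x1024 control image (numbers are illustrative):

import torch

cond_embed = ControlNetConditioningEmbedding(
    conditioning_embedding_channels=320,        # matches block_out_channels[0] of the SDXL UNet
    conditioning_channels=3,
    block_out_channels=(48, 96, 192, 384),      # three stride-2 convs -> 8x spatial downsampling
)
control_image = torch.randn(1, 3, 1024, 1024)   # pre-processed control image in pixel space
print(cond_embed(control_image).shape)          # torch.Size([1, 320, 128, 128]) -> SDXL latent resolution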
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class ControlNetModel_Union(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
| 153 |
+
"""
|
| 154 |
+
A ControlNet model.
|
| 155 |
+
|
| 156 |
+
Args:
|
| 157 |
+
in_channels (`int`, defaults to 4):
|
| 158 |
+
The number of channels in the input sample.
|
| 159 |
+
flip_sin_to_cos (`bool`, defaults to `True`):
|
| 160 |
+
Whether to flip the sin to cos in the time embedding.
|
| 161 |
+
freq_shift (`int`, defaults to 0):
|
| 162 |
+
The frequency shift to apply to the time embedding.
|
| 163 |
+
down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
|
| 164 |
+
The tuple of downsample blocks to use.
|
| 165 |
+
only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
|
| 166 |
+
block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
|
| 167 |
+
The tuple of output channels for each block.
|
| 168 |
+
layers_per_block (`int`, defaults to 2):
|
| 169 |
+
The number of layers per block.
|
| 170 |
+
downsample_padding (`int`, defaults to 1):
|
| 171 |
+
The padding to use for the downsampling convolution.
|
| 172 |
+
mid_block_scale_factor (`float`, defaults to 1):
|
| 173 |
+
The scale factor to use for the mid block.
|
| 174 |
+
act_fn (`str`, defaults to "silu"):
|
| 175 |
+
The activation function to use.
|
| 176 |
+
norm_num_groups (`int`, *optional*, defaults to 32):
|
| 177 |
+
The number of groups to use for the normalization. If None, normalization and activation layers is skipped
|
| 178 |
+
in post-processing.
|
| 179 |
+
norm_eps (`float`, defaults to 1e-5):
|
| 180 |
+
The epsilon to use for the normalization.
|
| 181 |
+
cross_attention_dim (`int`, defaults to 1280):
|
| 182 |
+
The dimension of the cross attention features.
|
| 183 |
+
transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
|
| 184 |
+
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
|
| 185 |
+
[`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
|
| 186 |
+
[`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
|
| 187 |
+
encoder_hid_dim (`int`, *optional*, defaults to None):
|
| 188 |
+
If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
|
| 189 |
+
dimension to `cross_attention_dim`.
|
| 190 |
+
encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
|
| 191 |
+
If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
|
| 192 |
+
embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
|
| 193 |
+
attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
|
| 194 |
+
The dimension of the attention heads.
|
| 195 |
+
use_linear_projection (`bool`, defaults to `False`):
|
| 196 |
+
class_embed_type (`str`, *optional*, defaults to `None`):
|
| 197 |
+
The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
|
| 198 |
+
`"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
|
| 199 |
+
addition_embed_type (`str`, *optional*, defaults to `None`):
|
| 200 |
+
Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
|
| 201 |
+
"text". "text" will use the `TextTimeEmbedding` layer.
|
| 202 |
+
num_class_embeds (`int`, *optional*, defaults to 0):
|
| 203 |
+
Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
|
| 204 |
+
class conditioning with `class_embed_type` equal to `None`.
|
| 205 |
+
upcast_attention (`bool`, defaults to `False`):
|
| 206 |
+
resnet_time_scale_shift (`str`, defaults to `"default"`):
|
| 207 |
+
Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
|
| 208 |
+
projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
|
| 209 |
+
The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
|
| 210 |
+
`class_embed_type="projection"`.
|
| 211 |
+
controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
|
| 212 |
+
The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
|
| 213 |
+
conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
|
| 214 |
+
The tuple of output channel for each block in the `conditioning_embedding` layer.
|
| 215 |
+
global_pool_conditions (`bool`, defaults to `False`):
|
| 216 |
+
"""
|
| 217 |
+
|
| 218 |
+
_supports_gradient_checkpointing = True
|
| 219 |
+
|
| 220 |
+
@register_to_config
|
| 221 |
+
def __init__(
|
| 222 |
+
self,
|
| 223 |
+
in_channels: int = 4,
|
| 224 |
+
conditioning_channels: int = 3,
|
| 225 |
+
flip_sin_to_cos: bool = True,
|
| 226 |
+
freq_shift: int = 0,
|
| 227 |
+
down_block_types: Tuple[str] = (
|
| 228 |
+
"CrossAttnDownBlock2D",
|
| 229 |
+
"CrossAttnDownBlock2D",
|
| 230 |
+
"CrossAttnDownBlock2D",
|
| 231 |
+
"DownBlock2D",
|
| 232 |
+
),
|
| 233 |
+
only_cross_attention: Union[bool, Tuple[bool]] = False,
|
| 234 |
+
block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
|
| 235 |
+
layers_per_block: int = 2,
|
| 236 |
+
downsample_padding: int = 1,
|
| 237 |
+
mid_block_scale_factor: float = 1,
|
| 238 |
+
act_fn: str = "silu",
|
| 239 |
+
norm_num_groups: Optional[int] = 32,
|
| 240 |
+
norm_eps: float = 1e-5,
|
| 241 |
+
cross_attention_dim: int = 1280,
|
| 242 |
+
transformer_layers_per_block: Union[int, Tuple[int]] = 1,
|
| 243 |
+
encoder_hid_dim: Optional[int] = None,
|
| 244 |
+
encoder_hid_dim_type: Optional[str] = None,
|
| 245 |
+
attention_head_dim: Union[int, Tuple[int]] = 8,
|
| 246 |
+
num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
|
| 247 |
+
use_linear_projection: bool = False,
|
| 248 |
+
class_embed_type: Optional[str] = None,
|
| 249 |
+
addition_embed_type: Optional[str] = None,
|
| 250 |
+
addition_time_embed_dim: Optional[int] = None,
|
| 251 |
+
num_class_embeds: Optional[int] = None,
|
| 252 |
+
upcast_attention: bool = False,
|
| 253 |
+
resnet_time_scale_shift: str = "default",
|
| 254 |
+
projection_class_embeddings_input_dim: Optional[int] = None,
|
| 255 |
+
controlnet_conditioning_channel_order: str = "rgb",
|
| 256 |
+
conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
|
| 257 |
+
global_pool_conditions: bool = False,
|
| 258 |
+
addition_embed_type_num_heads=64,
|
| 259 |
+
num_control_type = 6,
|
| 260 |
+
):
|
| 261 |
+
super().__init__()
|
| 262 |
+
|
| 263 |
+
# If `num_attention_heads` is not defined (which is the case for most models)
|
| 264 |
+
# it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
|
| 265 |
+
# The reason for this behavior is to correct for incorrectly named variables that were introduced
|
| 266 |
+
# when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
|
| 267 |
+
# Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
|
| 268 |
+
# which is why we correct for the naming here.
|
| 269 |
+
num_attention_heads = num_attention_heads or attention_head_dim
|
| 270 |
+
|
| 271 |
+
# Check inputs
|
| 272 |
+
if len(block_out_channels) != len(down_block_types):
|
| 273 |
+
raise ValueError(
|
| 274 |
+
f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
|
| 278 |
+
raise ValueError(
|
| 279 |
+
f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
|
| 283 |
+
raise ValueError(
|
| 284 |
+
f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
if isinstance(transformer_layers_per_block, int):
|
| 288 |
+
transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
|
| 289 |
+
|
| 290 |
+
# input
|
| 291 |
+
conv_in_kernel = 3
|
| 292 |
+
conv_in_padding = (conv_in_kernel - 1) // 2
|
| 293 |
+
self.conv_in = nn.Conv2d(
|
| 294 |
+
in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
# time
|
| 298 |
+
time_embed_dim = block_out_channels[0] * 4
|
| 299 |
+
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
|
| 300 |
+
timestep_input_dim = block_out_channels[0]
|
| 301 |
+
self.time_embedding = TimestepEmbedding(
|
| 302 |
+
timestep_input_dim,
|
| 303 |
+
time_embed_dim,
|
| 304 |
+
act_fn=act_fn,
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
if encoder_hid_dim_type is None and encoder_hid_dim is not None:
|
| 308 |
+
encoder_hid_dim_type = "text_proj"
|
| 309 |
+
self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
|
| 310 |
+
logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
|
| 311 |
+
|
| 312 |
+
if encoder_hid_dim is None and encoder_hid_dim_type is not None:
|
| 313 |
+
raise ValueError(
|
| 314 |
+
f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
if encoder_hid_dim_type == "text_proj":
|
| 318 |
+
self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
|
| 319 |
+
elif encoder_hid_dim_type == "text_image_proj":
|
| 320 |
+
# image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
| 321 |
+
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
| 322 |
+
# case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
|
| 323 |
+
self.encoder_hid_proj = TextImageProjection(
|
| 324 |
+
text_embed_dim=encoder_hid_dim,
|
| 325 |
+
image_embed_dim=cross_attention_dim,
|
| 326 |
+
cross_attention_dim=cross_attention_dim,
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
elif encoder_hid_dim_type is not None:
|
| 330 |
+
raise ValueError(
|
| 331 |
+
f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
|
| 332 |
+
)
|
| 333 |
+
else:
|
| 334 |
+
self.encoder_hid_proj = None
|
| 335 |
+
|
| 336 |
+
# class embedding
|
| 337 |
+
if class_embed_type is None and num_class_embeds is not None:
|
| 338 |
+
self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
|
| 339 |
+
elif class_embed_type == "timestep":
|
| 340 |
+
self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
|
| 341 |
+
elif class_embed_type == "identity":
|
| 342 |
+
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
|
| 343 |
+
elif class_embed_type == "projection":
|
| 344 |
+
if projection_class_embeddings_input_dim is None:
|
| 345 |
+
raise ValueError(
|
| 346 |
+
"`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
|
| 347 |
+
)
|
| 348 |
+
# The projection `class_embed_type` is the same as the timestep `class_embed_type` except
|
| 349 |
+
# 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
|
| 350 |
+
# 2. it projects from an arbitrary input dimension.
|
| 351 |
+
#
|
| 352 |
+
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
|
| 353 |
+
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
|
| 354 |
+
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
|
| 355 |
+
self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
|
| 356 |
+
else:
|
| 357 |
+
self.class_embedding = None
|
| 358 |
+
|
| 359 |
+
if addition_embed_type == "text":
|
| 360 |
+
if encoder_hid_dim is not None:
|
| 361 |
+
text_time_embedding_from_dim = encoder_hid_dim
|
| 362 |
+
else:
|
| 363 |
+
text_time_embedding_from_dim = cross_attention_dim
|
| 364 |
+
|
| 365 |
+
self.add_embedding = TextTimeEmbedding(
|
| 366 |
+
text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
|
| 367 |
+
)
|
| 368 |
+
elif addition_embed_type == "text_image":
|
| 369 |
+
# text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
|
| 370 |
+
# they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
|
| 371 |
+
# case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
|
| 372 |
+
self.add_embedding = TextImageTimeEmbedding(
|
| 373 |
+
text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
|
| 374 |
+
)
|
| 375 |
+
elif addition_embed_type == "text_time":
|
| 376 |
+
self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
|
| 377 |
+
self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
|
| 378 |
+
|
| 379 |
+
elif addition_embed_type is not None:
|
| 380 |
+
raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
|
| 381 |
+
|
| 382 |
+
# control net conditioning embedding
|
| 383 |
+
self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
|
| 384 |
+
conditioning_embedding_channels=block_out_channels[0],
|
| 385 |
+
block_out_channels=conditioning_embedding_out_channels,
|
| 386 |
+
conditioning_channels=conditioning_channels,
|
| 387 |
+
)
|
| 388 |
+
|
| 389 |
+
# Copyright by Qi Xin(2024/07/06)
|
| 390 |
+
# Condition Transformer (fuses single/multi conditions with the input image)
|
| 391 |
+
# The Condition Transformer augments the feature representation of the conditions
|
| 392 |
+
# The overall design is somewhat like a ResNet: the output of the Condition Transformer is used to predict a condition bias that is added to the original condition feature.
|
| 393 |
+
# num_control_type = 6
|
| 394 |
+
num_trans_channel = 320
|
| 395 |
+
num_trans_head = 8
|
| 396 |
+
num_trans_layer = 1
|
| 397 |
+
num_proj_channel = 320
|
| 398 |
+
task_scale_factor = num_trans_channel ** 0.5
|
| 399 |
+
|
| 400 |
+
self.task_embedding = nn.Parameter(task_scale_factor * torch.randn(num_control_type, num_trans_channel))
|
| 401 |
+
self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(num_trans_channel, num_trans_head) for _ in range(num_trans_layer)])
|
| 402 |
+
self.spatial_ch_projs = zero_module(nn.Linear(num_trans_channel, num_proj_channel))
|
| 403 |
+
#-----------------------------------------------------------------------------------------------------
|
| 404 |
+
|
| 405 |
+
# Copyright by Qi Xin(2024/07/06)
|
| 406 |
+
# Control Encoder to distinguish different control conditions
|
| 407 |
+
# A simple but effective module, consisting of an embedding layer and a linear layer, that injects the control info into the time embedding.
|
| 408 |
+
self.control_type_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
|
| 409 |
+
self.control_add_embedding = TimestepEmbedding(addition_time_embed_dim * num_control_type, time_embed_dim)
|
| 410 |
+
#-----------------------------------------------------------------------------------------------------
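A rough sketch of what the Control Encoder above computes, assuming SDXL defaults (addition_time_embed_dim=256, num_control_type=6, time_embed_dim=1280). Which index corresponds to which condition is checkpoint-specific; the one-hot vector below is only illustrative:

import torch
from diffusers.models.embeddings import TimestepEmbedding, Timesteps

control_type_proj = Timesteps(256, flip_sin_to_cos=True, downscale_freq_shift=0)
control_add_embedding = TimestepEmbedding(256 * 6, 1280)

control_type = torch.tensor([[0, 1, 0, 0, 0, 0]], dtype=torch.float32)  # one flag per control type
control_embeds = control_type_proj(control_type.flatten())              # (6, 256) sinusoidal codes
control_embeds = control_embeds.reshape(1, -1)                          # (1, 1536)
control_emb = control_add_embedding(control_embeds)                     # (1, 1280), added to the time embedding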
|
| 411 |
+
|
| 412 |
+
self.down_blocks = nn.ModuleList([])
|
| 413 |
+
self.controlnet_down_blocks = nn.ModuleList([])
|
| 414 |
+
|
| 415 |
+
if isinstance(only_cross_attention, bool):
|
| 416 |
+
only_cross_attention = [only_cross_attention] * len(down_block_types)
|
| 417 |
+
|
| 418 |
+
if isinstance(attention_head_dim, int):
|
| 419 |
+
attention_head_dim = (attention_head_dim,) * len(down_block_types)
|
| 420 |
+
|
| 421 |
+
if isinstance(num_attention_heads, int):
|
| 422 |
+
num_attention_heads = (num_attention_heads,) * len(down_block_types)
|
| 423 |
+
|
| 424 |
+
# down
|
| 425 |
+
output_channel = block_out_channels[0]
|
| 426 |
+
|
| 427 |
+
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 428 |
+
controlnet_block = zero_module(controlnet_block)
|
| 429 |
+
self.controlnet_down_blocks.append(controlnet_block)
|
| 430 |
+
|
| 431 |
+
for i, down_block_type in enumerate(down_block_types):
|
| 432 |
+
input_channel = output_channel
|
| 433 |
+
output_channel = block_out_channels[i]
|
| 434 |
+
is_final_block = i == len(block_out_channels) - 1
|
| 435 |
+
|
| 436 |
+
down_block = get_down_block(
|
| 437 |
+
down_block_type,
|
| 438 |
+
num_layers=layers_per_block,
|
| 439 |
+
transformer_layers_per_block=transformer_layers_per_block[i],
|
| 440 |
+
in_channels=input_channel,
|
| 441 |
+
out_channels=output_channel,
|
| 442 |
+
temb_channels=time_embed_dim,
|
| 443 |
+
add_downsample=not is_final_block,
|
| 444 |
+
resnet_eps=norm_eps,
|
| 445 |
+
resnet_act_fn=act_fn,
|
| 446 |
+
resnet_groups=norm_num_groups,
|
| 447 |
+
cross_attention_dim=cross_attention_dim,
|
| 448 |
+
num_attention_heads=num_attention_heads[i],
|
| 449 |
+
attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
|
| 450 |
+
downsample_padding=downsample_padding,
|
| 451 |
+
use_linear_projection=use_linear_projection,
|
| 452 |
+
only_cross_attention=only_cross_attention[i],
|
| 453 |
+
upcast_attention=upcast_attention,
|
| 454 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
| 455 |
+
)
|
| 456 |
+
self.down_blocks.append(down_block)
|
| 457 |
+
|
| 458 |
+
for _ in range(layers_per_block):
|
| 459 |
+
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 460 |
+
controlnet_block = zero_module(controlnet_block)
|
| 461 |
+
self.controlnet_down_blocks.append(controlnet_block)
|
| 462 |
+
|
| 463 |
+
if not is_final_block:
|
| 464 |
+
controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
|
| 465 |
+
controlnet_block = zero_module(controlnet_block)
|
| 466 |
+
self.controlnet_down_blocks.append(controlnet_block)
|
| 467 |
+
|
| 468 |
+
# mid
|
| 469 |
+
mid_block_channel = block_out_channels[-1]
|
| 470 |
+
|
| 471 |
+
controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
|
| 472 |
+
controlnet_block = zero_module(controlnet_block)
|
| 473 |
+
self.controlnet_mid_block = controlnet_block
|
| 474 |
+
|
| 475 |
+
self.mid_block = UNetMidBlock2DCrossAttn(
|
| 476 |
+
transformer_layers_per_block=transformer_layers_per_block[-1],
|
| 477 |
+
in_channels=mid_block_channel,
|
| 478 |
+
temb_channels=time_embed_dim,
|
| 479 |
+
resnet_eps=norm_eps,
|
| 480 |
+
resnet_act_fn=act_fn,
|
| 481 |
+
output_scale_factor=mid_block_scale_factor,
|
| 482 |
+
resnet_time_scale_shift=resnet_time_scale_shift,
|
| 483 |
+
cross_attention_dim=cross_attention_dim,
|
| 484 |
+
num_attention_heads=num_attention_heads[-1],
|
| 485 |
+
resnet_groups=norm_num_groups,
|
| 486 |
+
use_linear_projection=use_linear_projection,
|
| 487 |
+
upcast_attention=upcast_attention,
|
| 488 |
+
)
|
| 489 |
+
|
| 490 |
+
@classmethod
|
| 491 |
+
def from_unet(
|
| 492 |
+
cls,
|
| 493 |
+
unet: UNet2DConditionModel,
|
| 494 |
+
controlnet_conditioning_channel_order: str = "rgb",
|
| 495 |
+
conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
|
| 496 |
+
load_weights_from_unet: bool = True,
|
| 497 |
+
):
|
| 498 |
+
r"""
|
| 499 |
+
Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
|
| 500 |
+
|
| 501 |
+
Parameters:
|
| 502 |
+
unet (`UNet2DConditionModel`):
|
| 503 |
+
The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
|
| 504 |
+
where applicable.
|
| 505 |
+
"""
|
| 506 |
+
transformer_layers_per_block = (
|
| 507 |
+
unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
|
| 508 |
+
)
|
| 509 |
+
encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
|
| 510 |
+
encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
|
| 511 |
+
addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
|
| 512 |
+
addition_time_embed_dim = (
|
| 513 |
+
unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
controlnet = cls(
|
| 517 |
+
encoder_hid_dim=encoder_hid_dim,
|
| 518 |
+
encoder_hid_dim_type=encoder_hid_dim_type,
|
| 519 |
+
addition_embed_type=addition_embed_type,
|
| 520 |
+
addition_time_embed_dim=addition_time_embed_dim,
|
| 521 |
+
transformer_layers_per_block=transformer_layers_per_block,
|
| 522 |
+
# transformer_layers_per_block=[1, 2, 5],
|
| 523 |
+
in_channels=unet.config.in_channels,
|
| 524 |
+
flip_sin_to_cos=unet.config.flip_sin_to_cos,
|
| 525 |
+
freq_shift=unet.config.freq_shift,
|
| 526 |
+
down_block_types=unet.config.down_block_types,
|
| 527 |
+
only_cross_attention=unet.config.only_cross_attention,
|
| 528 |
+
block_out_channels=unet.config.block_out_channels,
|
| 529 |
+
layers_per_block=unet.config.layers_per_block,
|
| 530 |
+
downsample_padding=unet.config.downsample_padding,
|
| 531 |
+
mid_block_scale_factor=unet.config.mid_block_scale_factor,
|
| 532 |
+
act_fn=unet.config.act_fn,
|
| 533 |
+
norm_num_groups=unet.config.norm_num_groups,
|
| 534 |
+
norm_eps=unet.config.norm_eps,
|
| 535 |
+
cross_attention_dim=unet.config.cross_attention_dim,
|
| 536 |
+
attention_head_dim=unet.config.attention_head_dim,
|
| 537 |
+
num_attention_heads=unet.config.num_attention_heads,
|
| 538 |
+
use_linear_projection=unet.config.use_linear_projection,
|
| 539 |
+
class_embed_type=unet.config.class_embed_type,
|
| 540 |
+
num_class_embeds=unet.config.num_class_embeds,
|
| 541 |
+
upcast_attention=unet.config.upcast_attention,
|
| 542 |
+
resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
|
| 543 |
+
projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
|
| 544 |
+
controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
|
| 545 |
+
conditioning_embedding_out_channels=conditioning_embedding_out_channels,
|
| 546 |
+
)
|
| 547 |
+
|
| 548 |
+
if load_weights_from_unet:
|
| 549 |
+
controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
|
| 550 |
+
controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
|
| 551 |
+
controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
|
| 552 |
+
|
| 553 |
+
if controlnet.class_embedding:
|
| 554 |
+
controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
|
| 555 |
+
|
| 556 |
+
controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
|
| 557 |
+
controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
|
| 558 |
+
|
| 559 |
+
return controlnet
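A hedged usage sketch for from_unet (the model id is illustrative, not taken from this repo):

from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
)
controlnet = ControlNetModel_Union.from_unet(unet, load_weights_from_unet=True)
# Shared blocks start from the UNet weights; the ControlNet-specific zero convolutions,
# Condition Transformer and Control Encoder keep their fresh (zero or random) initialization.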
|
| 560 |
+
|
| 561 |
+
@property
|
| 562 |
+
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
|
| 563 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
| 564 |
+
r"""
|
| 565 |
+
Returns:
|
| 566 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model,
|
| 567 |
+
indexed by their weight names.
|
| 568 |
+
"""
|
| 569 |
+
# set recursively
|
| 570 |
+
processors = {}
|
| 571 |
+
|
| 572 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
| 573 |
+
if hasattr(module, "get_processor"):
|
| 574 |
+
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
|
| 575 |
+
|
| 576 |
+
for sub_name, child in module.named_children():
|
| 577 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
| 578 |
+
|
| 579 |
+
return processors
|
| 580 |
+
|
| 581 |
+
for name, module in self.named_children():
|
| 582 |
+
fn_recursive_add_processors(name, module, processors)
|
| 583 |
+
|
| 584 |
+
return processors
|
| 585 |
+
|
| 586 |
+
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
| 587 |
+
def set_attn_processor(
|
| 588 |
+
self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
|
| 589 |
+
):
|
| 590 |
+
r"""
|
| 591 |
+
Sets the attention processor to use to compute attention.
|
| 592 |
+
|
| 593 |
+
Parameters:
|
| 594 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
| 595 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
| 596 |
+
for **all** `Attention` layers.
|
| 597 |
+
|
| 598 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
| 599 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
| 600 |
+
|
| 601 |
+
"""
|
| 602 |
+
count = len(self.attn_processors.keys())
|
| 603 |
+
|
| 604 |
+
if isinstance(processor, dict) and len(processor) != count:
|
| 605 |
+
raise ValueError(
|
| 606 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
| 607 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
| 611 |
+
if hasattr(module, "set_processor"):
|
| 612 |
+
if not isinstance(processor, dict):
|
| 613 |
+
module.set_processor(processor, _remove_lora=_remove_lora)
|
| 614 |
+
else:
|
| 615 |
+
module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
|
| 616 |
+
|
| 617 |
+
for sub_name, child in module.named_children():
|
| 618 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
| 619 |
+
|
| 620 |
+
for name, module in self.named_children():
|
| 621 |
+
fn_recursive_attn_processor(name, module, processor)
|
| 622 |
+
|
| 623 |
+
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
|
| 624 |
+
def set_default_attn_processor(self):
|
| 625 |
+
"""
|
| 626 |
+
Disables custom attention processors and sets the default attention implementation.
|
| 627 |
+
"""
|
| 628 |
+
if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
| 629 |
+
processor = AttnAddedKVProcessor()
|
| 630 |
+
elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
|
| 631 |
+
processor = AttnProcessor()
|
| 632 |
+
else:
|
| 633 |
+
raise ValueError(
|
| 634 |
+
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
+
self.set_attn_processor(processor, _remove_lora=True)
|
| 638 |
+
|
| 639 |
+
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
|
| 640 |
+
def set_attention_slice(self, slice_size):
|
| 641 |
+
r"""
|
| 642 |
+
Enable sliced attention computation.
|
| 643 |
+
|
| 644 |
+
When this option is enabled, the attention module splits the input tensor in slices to compute attention in
|
| 645 |
+
several steps. This is useful for saving some memory in exchange for a small decrease in speed.
|
| 646 |
+
|
| 647 |
+
Args:
|
| 648 |
+
slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
|
| 649 |
+
When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
|
| 650 |
+
`"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
|
| 651 |
+
provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
|
| 652 |
+
must be a multiple of `slice_size`.
|
| 653 |
+
"""
|
| 654 |
+
sliceable_head_dims = []
|
| 655 |
+
|
| 656 |
+
def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
|
| 657 |
+
if hasattr(module, "set_attention_slice"):
|
| 658 |
+
sliceable_head_dims.append(module.sliceable_head_dim)
|
| 659 |
+
|
| 660 |
+
for child in module.children():
|
| 661 |
+
fn_recursive_retrieve_sliceable_dims(child)
|
| 662 |
+
|
| 663 |
+
# retrieve number of attention layers
|
| 664 |
+
for module in self.children():
|
| 665 |
+
fn_recursive_retrieve_sliceable_dims(module)
|
| 666 |
+
|
| 667 |
+
num_sliceable_layers = len(sliceable_head_dims)
|
| 668 |
+
|
| 669 |
+
if slice_size == "auto":
|
| 670 |
+
# half the attention head size is usually a good trade-off between
|
| 671 |
+
# speed and memory
|
| 672 |
+
slice_size = [dim // 2 for dim in sliceable_head_dims]
|
| 673 |
+
elif slice_size == "max":
|
| 674 |
+
# make smallest slice possible
|
| 675 |
+
slice_size = num_sliceable_layers * [1]
|
| 676 |
+
|
| 677 |
+
slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
|
| 678 |
+
|
| 679 |
+
if len(slice_size) != len(sliceable_head_dims):
|
| 680 |
+
raise ValueError(
|
| 681 |
+
f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
|
| 682 |
+
f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
|
| 683 |
+
)
|
| 684 |
+
|
| 685 |
+
for i in range(len(slice_size)):
|
| 686 |
+
size = slice_size[i]
|
| 687 |
+
dim = sliceable_head_dims[i]
|
| 688 |
+
if size is not None and size > dim:
|
| 689 |
+
raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
|
| 690 |
+
|
| 691 |
+
# Recursively walk through all the children.
|
| 692 |
+
# Any children which exposes the set_attention_slice method
|
| 693 |
+
# gets the message
|
| 694 |
+
def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
|
| 695 |
+
if hasattr(module, "set_attention_slice"):
|
| 696 |
+
module.set_attention_slice(slice_size.pop())
|
| 697 |
+
|
| 698 |
+
for child in module.children():
|
| 699 |
+
fn_recursive_set_attention_slice(child, slice_size)
|
| 700 |
+
|
| 701 |
+
reversed_slice_size = list(reversed(slice_size))
|
| 702 |
+
for module in self.children():
|
| 703 |
+
fn_recursive_set_attention_slice(module, reversed_slice_size)
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
| 707 |
+
if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
|
| 708 |
+
module.gradient_checkpointing = value
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def forward(
|
| 712 |
+
self,
|
| 713 |
+
sample: torch.FloatTensor,
|
| 714 |
+
timestep: Union[torch.Tensor, float, int],
|
| 715 |
+
encoder_hidden_states: torch.Tensor,
|
| 716 |
+
controlnet_cond_list: torch.FloatTensor,
|
| 717 |
+
conditioning_scale: float = 1.0,
|
| 718 |
+
class_labels: Optional[torch.Tensor] = None,
|
| 719 |
+
timestep_cond: Optional[torch.Tensor] = None,
|
| 720 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 721 |
+
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
|
| 722 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
| 723 |
+
guess_mode: bool = False,
|
| 724 |
+
return_dict: bool = True,
|
| 725 |
+
) -> Union[ControlNetOutput, Tuple]:
|
| 726 |
+
"""
|
| 727 |
+
The [`ControlNetModel`] forward method.
|
| 728 |
+
|
| 729 |
+
Args:
|
| 730 |
+
sample (`torch.FloatTensor`):
|
| 731 |
+
The noisy input tensor.
|
| 732 |
+
timestep (`Union[torch.Tensor, float, int]`):
|
| 733 |
+
The number of timesteps to denoise an input.
|
| 734 |
+
encoder_hidden_states (`torch.Tensor`):
|
| 735 |
+
The encoder hidden states.
|
| 736 |
+
controlnet_cond_list (`List[torch.FloatTensor]`):
|
| 737 |
+
The list of conditional input tensors, one slot per supported control type.
|
| 738 |
+
conditioning_scale (`float`, defaults to `1.0`):
|
| 739 |
+
The scale factor for ControlNet outputs.
|
| 740 |
+
class_labels (`torch.Tensor`, *optional*, defaults to `None`):
|
| 741 |
+
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
|
| 742 |
+
timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
|
| 743 |
+
Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
|
| 744 |
+
timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
|
| 745 |
+
embeddings.
|
| 746 |
+
attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
|
| 747 |
+
An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
|
| 748 |
+
is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
|
| 749 |
+
negative values to the attention scores corresponding to "discard" tokens.
|
| 750 |
+
added_cond_kwargs (`dict`):
|
| 751 |
+
Additional conditions for the Stable Diffusion XL UNet.
|
| 752 |
+
cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
|
| 753 |
+
A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
|
| 754 |
+
guess_mode (`bool`, defaults to `False`):
|
| 755 |
+
In this mode, the ControlNet encoder tries its best to recognize the content of the input image even if
|
| 756 |
+
you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
|
| 757 |
+
return_dict (`bool`, defaults to `True`):
|
| 758 |
+
Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
|
| 759 |
+
|
| 760 |
+
Returns:
|
| 761 |
+
[`~models.controlnet.ControlNetOutput`] **or** `tuple`:
|
| 762 |
+
If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
|
| 763 |
+
returned where the first element is the sample tensor.
|
| 764 |
+
"""
|
| 765 |
+
# check channel order
|
| 766 |
+
channel_order = self.config.controlnet_conditioning_channel_order
|
| 767 |
+
|
| 768 |
+
if channel_order == "rgb":
|
| 769 |
+
# in rgb order by default
|
| 770 |
+
...
|
| 771 |
+
# elif channel_order == "bgr":
|
| 772 |
+
# controlnet_cond = torch.flip(controlnet_cond, dims=[1])
|
| 773 |
+
else:
|
| 774 |
+
raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
|
| 775 |
+
|
| 776 |
+
# prepare attention_mask
|
| 777 |
+
if attention_mask is not None:
|
| 778 |
+
attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
|
| 779 |
+
attention_mask = attention_mask.unsqueeze(1)
|
| 780 |
+
|
| 781 |
+
# 1. time
|
| 782 |
+
timesteps = timestep
|
| 783 |
+
if not torch.is_tensor(timesteps):
|
| 784 |
+
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
|
| 785 |
+
# This would be a good case for the `match` statement (Python 3.10+)
|
| 786 |
+
is_mps = sample.device.type == "mps"
|
| 787 |
+
if isinstance(timestep, float):
|
| 788 |
+
dtype = torch.float32 if is_mps else torch.float64
|
| 789 |
+
else:
|
| 790 |
+
dtype = torch.int32 if is_mps else torch.int64
|
| 791 |
+
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
|
| 792 |
+
elif len(timesteps.shape) == 0:
|
| 793 |
+
timesteps = timesteps[None].to(sample.device)
|
| 794 |
+
|
| 795 |
+
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
| 796 |
+
timesteps = timesteps.expand(sample.shape[0])
|
| 797 |
+
|
| 798 |
+
t_emb = self.time_proj(timesteps)
|
| 799 |
+
|
| 800 |
+
# timesteps does not contain any weights and will always return f32 tensors
|
| 801 |
+
# but time_embedding might actually be running in fp16. so we need to cast here.
|
| 802 |
+
# there might be better ways to encapsulate this.
|
| 803 |
+
t_emb = t_emb.to(dtype=sample.dtype)
|
| 804 |
+
|
| 805 |
+
emb = self.time_embedding(t_emb, timestep_cond)
|
| 806 |
+
aug_emb = None
|
| 807 |
+
|
| 808 |
+
if self.class_embedding is not None:
|
| 809 |
+
if class_labels is None:
|
| 810 |
+
raise ValueError("class_labels should be provided when num_class_embeds > 0")
|
| 811 |
+
|
| 812 |
+
if self.config.class_embed_type == "timestep":
|
| 813 |
+
class_labels = self.time_proj(class_labels)
|
| 814 |
+
|
| 815 |
+
class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
|
| 816 |
+
emb = emb + class_emb
|
| 817 |
+
|
| 818 |
+
if self.config.addition_embed_type is not None:
|
| 819 |
+
if self.config.addition_embed_type == "text":
|
| 820 |
+
aug_emb = self.add_embedding(encoder_hidden_states)
|
| 821 |
+
|
| 822 |
+
elif self.config.addition_embed_type == "text_time":
|
| 823 |
+
if "text_embeds" not in added_cond_kwargs:
|
| 824 |
+
raise ValueError(
|
| 825 |
+
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
|
| 826 |
+
)
|
| 827 |
+
text_embeds = added_cond_kwargs.get("text_embeds")
|
| 828 |
+
if "time_ids" not in added_cond_kwargs:
|
| 829 |
+
raise ValueError(
|
| 830 |
+
f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
|
| 831 |
+
)
|
| 832 |
+
time_ids = added_cond_kwargs.get("time_ids")
|
| 833 |
+
time_embeds = self.add_time_proj(time_ids.flatten())
|
| 834 |
+
time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
|
| 835 |
+
|
| 836 |
+
add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
|
| 837 |
+
add_embeds = add_embeds.to(emb.dtype)
|
| 838 |
+
aug_emb = self.add_embedding(add_embeds)
|
| 839 |
+
|
| 840 |
+
# Copyright by Qi Xin(2024/07/06)
|
| 841 |
+
# inject control type info to time embedding to distinguish different control conditions
|
| 842 |
+
control_type = added_cond_kwargs.get('control_type')
|
| 843 |
+
control_embeds = self.control_type_proj(control_type.flatten())
|
| 844 |
+
control_embeds = control_embeds.reshape((t_emb.shape[0], -1))
|
| 845 |
+
control_embeds = control_embeds.to(emb.dtype)
|
| 846 |
+
control_emb = self.control_add_embedding(control_embeds)
|
| 847 |
+
emb = emb + control_emb
|
| 848 |
+
#---------------------------------------------------------------------------------
|
| 849 |
+
|
| 850 |
+
emb = emb + aug_emb if aug_emb is not None else emb
|
| 851 |
+
|
| 852 |
+
# 2. pre-process
|
| 853 |
+
sample = self.conv_in(sample)
|
| 854 |
+
indices = torch.nonzero(control_type[0])
|
| 855 |
+
|
| 856 |
+
# Copyright by Qi Xin(2024/07/06)
|
| 857 |
+
# add single/multi conditions to the input image.
|
| 858 |
+
# The Condition Transformer provides an easy and effective way to fuse the different features naturally.
|
| 859 |
+
inputs = []
|
| 860 |
+
condition_list = []
|
| 861 |
+
|
| 862 |
+
for idx in range(indices.shape[0] + 1):
|
| 863 |
+
if idx == indices.shape[0]:
|
| 864 |
+
controlnet_cond = sample
|
| 865 |
+
feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
|
| 866 |
+
else:
|
| 867 |
+
controlnet_cond = self.controlnet_cond_embedding(controlnet_cond_list[indices[idx][0]])
|
| 868 |
+
feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) # N * C
|
| 869 |
+
feat_seq = feat_seq + self.task_embedding[indices[idx][0]]
|
| 870 |
+
|
| 871 |
+
inputs.append(feat_seq.unsqueeze(1))
|
| 872 |
+
condition_list.append(controlnet_cond)
|
| 873 |
+
|
| 874 |
+
x = torch.cat(inputs, dim=1) # NxLxC
|
| 875 |
+
x = self.transformer_layes(x)
|
| 876 |
+
|
| 877 |
+
controlnet_cond_fuser = sample * 0.0
|
| 878 |
+
for idx in range(indices.shape[0]):
|
| 879 |
+
alpha = self.spatial_ch_projs(x[:, idx])
|
| 880 |
+
alpha = alpha.unsqueeze(-1).unsqueeze(-1)
|
| 881 |
+
controlnet_cond_fuser += condition_list[idx] + alpha
|
| 882 |
+
|
| 883 |
+
sample = sample + controlnet_cond_fuser
|
| 884 |
+
#-------------------------------------------------------------------------------------------
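# Shapes in the fusion above (with the defaults in this file): each active condition contributes one
# token and the latent `sample` itself contributes one more, so x is (N, L, C) with
# L = num_active_conditions + 1 and C = 320. `spatial_ch_projs` maps each refined condition token to a
# per-channel bias `alpha`; since it is zero-initialized, the fusion starts out as a plain sum of the
# condition feature maps, and the fused result is added residually to the latent features.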
|
| 885 |
+
|
| 886 |
+
# 3. down
|
| 887 |
+
down_block_res_samples = (sample,)
|
| 888 |
+
for downsample_block in self.down_blocks:
|
| 889 |
+
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
|
| 890 |
+
sample, res_samples = downsample_block(
|
| 891 |
+
hidden_states=sample,
|
| 892 |
+
temb=emb,
|
| 893 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 894 |
+
attention_mask=attention_mask,
|
| 895 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
| 896 |
+
)
|
| 897 |
+
else:
|
| 898 |
+
sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
|
| 899 |
+
|
| 900 |
+
down_block_res_samples += res_samples
|
| 901 |
+
|
| 902 |
+
# 4. mid
|
| 903 |
+
if self.mid_block is not None:
|
| 904 |
+
sample = self.mid_block(
|
| 905 |
+
sample,
|
| 906 |
+
emb,
|
| 907 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 908 |
+
attention_mask=attention_mask,
|
| 909 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
| 910 |
+
)
|
| 911 |
+
|
| 912 |
+
# 5. Control net blocks
|
| 913 |
+
|
| 914 |
+
controlnet_down_block_res_samples = ()
|
| 915 |
+
|
| 916 |
+
for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
|
| 917 |
+
down_block_res_sample = controlnet_block(down_block_res_sample)
|
| 918 |
+
controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
|
| 919 |
+
|
| 920 |
+
down_block_res_samples = controlnet_down_block_res_samples
|
| 921 |
+
|
| 922 |
+
mid_block_res_sample = self.controlnet_mid_block(sample)
|
| 923 |
+
|
| 924 |
+
# 6. scaling
|
| 925 |
+
if guess_mode and not self.config.global_pool_conditions:
|
| 926 |
+
scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
|
| 927 |
+
scales = scales * conditioning_scale
|
| 928 |
+
down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
|
| 929 |
+
mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
|
| 930 |
+
else:
|
| 931 |
+
down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
|
| 932 |
+
mid_block_res_sample = mid_block_res_sample * conditioning_scale
|
| 933 |
+
|
| 934 |
+
if self.config.global_pool_conditions:
|
| 935 |
+
down_block_res_samples = [
|
| 936 |
+
torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
|
| 937 |
+
]
|
| 938 |
+
mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
|
| 939 |
+
|
| 940 |
+
if not return_dict:
|
| 941 |
+
return (down_block_res_samples, mid_block_res_sample)
|
| 942 |
+
|
| 943 |
+
return ControlNetOutput(
|
| 944 |
+
down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
|
| 945 |
+
)
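A rough sketch of how the union pipelines drive this forward method, with illustrative SDXL shapes. All tensors below are placeholders (not values produced by this repo), and which slot corresponds to which condition (e.g. depth) depends on the union checkpoint:

import torch

batch = 2
sample = torch.randn(batch, 4, 128, 128)                    # noisy SDXL latents for a 1024x1024 image
encoder_hidden_states = torch.randn(batch, 77, 2048)        # concatenated CLIP text hidden states
depth_cond = torch.randn(batch, 3, 1024, 1024)              # pre-processed depth control image
controlnet_cond_list = [0, depth_cond, 0, 0, 0, 0]          # one slot per control type; unused slots are placeholders
added_cond_kwargs = {
    "text_embeds": torch.randn(batch, 1280),                # pooled text embedding
    "time_ids": torch.randn(batch, 6),                      # SDXL micro-conditioning ids
    "control_type": torch.tensor([[0, 1, 0, 0, 0, 0]] * batch, dtype=torch.float32),
}
down_samples, mid_sample = controlnet(
    sample,
    timestep=999,
    encoder_hidden_states=encoder_hidden_states,
    controlnet_cond_list=controlnet_cond_list,
    added_cond_kwargs=added_cond_kwargs,
    return_dict=False,
)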
|
| 946 |
+
|
| 947 |
+
|
| 948 |
+
|
| 949 |
+
def zero_module(module):
|
| 950 |
+
for p in module.parameters():
|
| 951 |
+
nn.init.zeros_(p)
|
| 952 |
+
return module
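zero_module is what makes the extra ControlNet branches start as a no-op: a zero-initialized projection outputs zeros, so the base UNet's behaviour is untouched at the start of training. A minimal illustration:

import torch
from torch import nn

proj = zero_module(nn.Conv2d(320, 320, kernel_size=1))  # like the controlnet_down_blocks above
features = torch.randn(1, 320, 64, 64)
print(proj(features).abs().max())                        # tensor(0.) -- the injected residual starts at zero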
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
|
| 957 |
+
|
t2i/controlnet_union/pipeline/pipeline_controlnet_union_inpaint_sd_xl.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
t2i/controlnet_union/pipeline/pipeline_controlnet_union_sd_xl.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
t2i/controlnet_union/pipeline/pipeline_controlnet_union_sd_xl_img2img.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
t2i/pipe.py
CHANGED

@@ -1,157 +1,157 @@
import os, subprocess, time, datetime, inspect
from typing import Any, Tuple, Dict, List, Optional
from dataclasses import dataclass, field
import torch
from diffusers import DiffusionPipeline, AutoencoderKL
from diffusers.models.attention_processor import AttnProcessor2_0
from t2i_config import models, sdxl_vaes, sd15_vaes, PIPELINE_MAX_GIB
from t2i.utils import (logger, get_token, free_memory, calc_pipe_size, is_weight_url, get_file,
    get_model_type, get_model_type_from_pipe, get_task_class, DEFAULT_TASKS, IS_ZEROGPU, DEVICE, DTYPE, IS_QUANT,
    MAX_SEED, MAX_IMAGE_SIZE, DEFAULT_MODEL_TYPE, DEFAULT_STR, ASPECT_RATIOS, PIPELINE_TYPES, DEFAULT_VAE, PARAM_MODES)


if IS_ZEROGPU:
    logger.info("Running on Zero GPU.")
    os.environ["ZEROGPU_SIZE"] = "auto" # https://huggingface.co/posts/cbensimon/356529804559377
-    subprocess.run("rm -rf /data-nvme/zerogpu-offload/*",
+    subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", shell=True)
torch.set_float32_matmul_precision("high") # https://pytorch.org/blog/accelerating-generative-ai-3/
logger.info(f"Using device: {DEVICE}")
logger.info(f"Using dtype: {DTYPE}")


#from torchao.quantization.quant_api import Int8WeightOnlyConfig, quantize_
@dataclass(order=True)
class Pipeline:
    name: str = ""
    pipe: Any = field(default_factory=Any)
    lastmod: float = 0.
    size: int = 0
    type: str = DEFAULT_MODEL_TYPE
    pipe_type: str = PIPELINE_TYPES[0]

    def __str__(self):
        return f"{self.name} ({type(self.pipe).__name__} {self.type} {self.pipe_type}) Size:{float(self.size) / (1024.**3):.2f}GiB LastMod.:{datetime.datetime.fromtimestamp(self.lastmod).strftime('%Y/%m/%d %H:%M:%S')}"

    def __del__(self):
        if not self.pipe: return
        self.pipe.to("cpu")
        del self.pipe
        free_memory()
        logger.debug(f"Unloaded pipeline {self.name}.")

    def onload(self, device: str, model_type: str) -> Any:
        self.lastmod = time.time()
        if device != "cpu" and not IS_QUANT:
            if self.pipe.device != device: self.pipe.to(device)
            # https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0
            #if model_type in ["SD 1.5", "SDXL"]: self.pipe.unet.set_attn_processor(AttnProcessor2_0())
            #elif model_type in ["FLUX"]: self.pipe.transformer.set_attn_processor(AttnProcessor2_0())
            #self.pipe.vae.set_attn_processor(AttnProcessor2_0())
            #logger.debug(f"SDPA enabled {type(self.pipe).__name__} ({model_type}) on {device}.") # by default in PyTorch 2.x
        return self.pipe

    def quantize(self):
        if not IS_QUANT: return self
        #if self.type in ["SD 1.5", "SDXL"]: quantize_(self.pipe.unet, Int8WeightOnlyConfig())
        #elif self.type in ["FLUX"]: quantize_(self.pipe.transformer, Int8WeightOnlyConfig())
        self.size=calc_pipe_size(self.pipe)
        logger.debug(f"Quantized pipeline {self.name}.")
        return self


class Pipelines:
    def __init__(self):
        self.pipes: Dict[str, Pipeline] = {}
        self.max_gib = PIPELINE_MAX_GIB

    def __call__(self, name: str, device: str="cpu", model_type: str=DEFAULT_MODEL_TYPE, pipe_type: str=PIPELINE_TYPES[0]) -> Any:
        try:
            if name in self.pipes.keys():
                pipe = self.pipes[name].onload(device, model_type)
                free_memory()
                return pipe
            if model_type == DEFAULT_MODEL_TYPE: model_type = get_model_type(name)
            pipe_class = get_task_class(model_type, DEFAULT_TASKS[0])
            if is_weight_url(name):
                path = get_file(name)
                if model_type == "SDXL": pipe = pipe_class.from_single_file(path, add_watermarker=False, torch_dtype=DTYPE)
                elif model_type == "SD 1.5": pipe = pipe_class.from_single_file(path, torch_dtype=DTYPE)
                elif model_type == "FLUX": pipe = pipe_class.from_single_file(path, torch_dtype=DTYPE) #
                else: raise Exception(f"Invalid architecture {name}")
            else:
                if model_type == "SDXL": pipe = pipe_class.from_pretrained(name, add_watermarker=False, torch_dtype=DTYPE)
                elif model_type == "SD 1.5": pipe = pipe_class.from_pretrained(name, torch_dtype=DTYPE)
                elif model_type == "FLUX": pipe = pipe_class.from_pretrained(name, torch_dtype=DTYPE) #
                else:
                    pipe = pipe_class.from_pretrained(name, torch_dtype=DTYPE)
                    model_type = get_model_type_from_pipe(pipe)
            if pipe_type == "Long Prompt Weighting" and model_type in ["SD 1.5", "SDXL"]:
                if model_type == "SD 1.5": pipe = DiffusionPipeline.from_pipe(pipe, custom_pipeline="lpw_stable_diffusion", torch_dtype=DTYPE)
                elif model_type == "SDXL": pipe = DiffusionPipeline.from_pipe(pipe, custom_pipeline="lpw_stable_diffusion_xl", add_watermarker=False, torch_dtype=DTYPE)
            self.pipes[name] = Pipeline(name=name, pipe=pipe, lastmod=time.time(), size=calc_pipe_size(pipe), type=model_type, pipe_type=pipe_type)#.quantize()
            logger.info(f"Loaded {self.pipes[name]}.")
            self.clean()
            pipe = self.pipes[name].onload(device, model_type)
            free_memory()
            return pipe
        except Exception as e:
            logger.info(f"Failed to load pipeline for {name} {e}")
            return None

    def get_model_type(self, name: str) -> str:
        if name in self.pipes.keys(): return self.pipes[name].type
        else: return DEFAULT_MODEL_TYPE

    def __str__(self):
        return "\n".join([str(x) for x in self.pipes.values()])

    def clean(self):
        items = sorted(list(self.pipes.values()), key=lambda x:x.lastmod, reverse=True)
        sum_bytes = 0
        max_bytes = self.max_gib * (1024 ** 3)
        del_items = []
        for i, item in enumerate(items):
            sum_bytes += item.size
            if sum_bytes > max_bytes and i > 0: del_items.append(item.name)
        for item in del_items:
            self.pipes.pop(item)
            logger.debug(f"Unloaded {item}.")


pipes = Pipelines()


def get_current_model_type(name: str) -> str:
    return pipes.get_model_type(name)


VAE_NAMES = [DEFAULT_VAE] + sdxl_vaes + sd15_vaes


def get_vae(pipe: Any, name: str, device: str, model_type: str=DEFAULT_MODEL_TYPE):
    if name == DEFAULT_VAE or not pipe: return pipe
    try:
        model_type = get_current_model_type(name)
        if (model_type == "SDXL" and name in sd15_vaes) or (model_type == "SD 1.5" and name in sdxl_vaes): return pipe
        if is_weight_url(name): vae = AutoencoderKL.from_single_file(get_file(name), torch_dtype=DTYPE)
        else: vae = AutoencoderKL.from_pretrained(name, torch_dtype=DTYPE)
        if vae:
            if device != "cpu" and vae.device != device: vae.to(device)
            pipe.vae = vae
            logger.info(f"VAE loaded {name}.")
        return pipe
    except Exception as e:
        logger.info(f"{inspect.currentframe().f_code.co_name}: {e}")
        return pipe


def get_pipe(name: str, device: str="cpu", model_type: str=DEFAULT_MODEL_TYPE, pipe_type: str=PIPELINE_TYPES[0]):
    global pipes
    try:
        pipe = pipes(name, device, model_type, pipe_type)
        return pipe
    except Exception as e:
        logger.info(f"{inspect.currentframe().f_code.co_name}: {e}")
        return None
    finally:
        logger.debug(f"Current pipes: {pipes}")
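Below is a minimal usage sketch of the loader API defined in this file, assuming the t2i package and its t2i_config module are importable as in this repository. The model and VAE identifiers are placeholders, not entries taken from t2i_config, and a CUDA device is assumed to be available.

from t2i.pipe import get_pipe, get_vae, get_current_model_type

model_name = "some-user/some-sdxl-checkpoint"  # placeholder; use an entry from `models`
vae_name = "some-user/some-sdxl-vae"           # placeholder; use an entry from `sdxl_vaes`

# Load (or fetch from the in-memory cache) an SDXL pipeline with long prompt weighting.
pipe = get_pipe(model_name, device="cuda", model_type="SDXL", pipe_type="Long Prompt Weighting")
if pipe is not None:
    # Optionally swap in an external VAE; on failure the original pipeline is returned unchanged.
    pipe = get_vae(pipe, vae_name, device="cuda", model_type="SDXL")
    print(get_current_model_type(model_name))  # "SDXL" once the pipeline has been cached
    image = pipe(prompt="a watercolor landscape", num_inference_steps=28).images[0]
    image.save("out.png")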