diff --git "a/src/utils.py" "b/src/utils.py"
new file mode 100644
--- /dev/null
+++ "b/src/utils.py"
@@ -0,0 +1,3209 @@
+# This code uses diffusers library from https://github.com/huggingface/diffusers
+
+
+import numpy as np
+import cv2
+import torch
+import random
+from PIL import Image, ImageDraw, ImageFont
+import copy
+from typing import Optional, Union, Tuple, List, Callable, Dict, Any
+from tqdm.notebook import tqdm
+from diffusers.utils import BaseOutput, logging
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    DownBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UpBlock2D,
+    get_down_block,
+    get_up_block,
+)
+from diffusers.models.unet_2d_condition import UNet2DConditionOutput
+from copy import deepcopy
+import json
+
+import inspect
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils.torch_utils import is_compiled_module
+
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from tqdm import tqdm
+from controlnet_aux import HEDdetector, OpenposeDetector
+import time
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+
+# from ..configuration_utils import ConfigMixin, register_to_config
+# from ..loaders import UNet2DConditionLoadersMixin
+# from ..utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
+# from .activations import get_activation
+# from .attention_processor import (
+#     ADDED_KV_ATTENTION_PROCESSORS,
+#     CROSS_ATTENTION_PROCESSORS,
+#     Attention,
+#     AttentionProcessor,
+#     AttnAddedKVProcessor,
+#     AttnProcessor,
+# )
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    ImageHintTimeEmbedding,
+    ImageProjection,
+    ImageTimeEmbedding,
+    PositionNet,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.unet_2d_blocks import (
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
+    get_down_block,
+    get_up_block,
+)
+# from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from diffusers.loaders import (
+    FromSingleFileMixin,
+    IPAdapterMixin,
+    StableDiffusionXLLoraLoaderMixin,
+    TextualInversionLoaderMixin,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.models.attention_processor import (
+    AttnProcessor2_0,
+    FusedAttnProcessor2_0,
+    XFormersAttnProcessor,
+)
+from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    is_invisible_watermark_available,
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from diffusers.utils.torch_utils import randn_tensor
+# from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
+from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
+
+from typing import Any, Dict, List
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import CONFIG_NAME
+
+
+class PipelineCallback(ConfigMixin):
+    """
+    Base class for callbacks invoked with `(pipeline, step_index, timestep, callback_kwargs)`.
+
+    `tensor_inputs` and `callback_fn` both raise `NotImplementedError` here and are meant to be
+    overridden. Exactly one of `cutoff_step_ratio` / `cutoff_step_index` has to be non-`None`.
+
+    Args:
+        cutoff_step_ratio (*optional*, defaults to 1.0):
+            When given, must be a `float` within `[0.0, 1.0]`.
+        cutoff_step_index (*optional*):
+            Alternative to `cutoff_step_ratio`; only checked for presence here.
+
+    Raises:
+        ValueError: If both or neither cutoff arguments are given, or the ratio is not a float in range.
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None):
+        super().__init__()
+
+        # "Both missing" and "both present" are the two rejected combinations.
+        ratio_missing = cutoff_step_ratio is None
+        index_missing = cutoff_step_index is None
+        if ratio_missing == index_missing:
+            raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.")
+
+        if not ratio_missing:
+            ratio_ok = isinstance(cutoff_step_ratio, float) and 0.0 <= cutoff_step_ratio <= 1.0
+            if not ratio_ok:
+                raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.")
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        """List of strings that subclasses must provide; the base implementation always raises."""
+        raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}")
+
+    def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]:
+        """Hook that subclasses must implement; the base implementation always raises."""
+        raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}")
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        """Forward all arguments unchanged to `callback_fn` and return whatever it returns."""
+        result = self.callback_fn(pipeline, step_index, timestep, callback_kwargs)
+        return result
+
+
+class MultiPipelineCallbacks:
+    """
+    Groups several callbacks so they can be invoked as if they were a single one.
+    """
+
+    def __init__(self, callbacks: List[PipelineCallback]):
+        self.callbacks = callbacks
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        """Every callback's `tensor_inputs` concatenated in callback order (duplicates are kept)."""
+        names = []
+        for cb in self.callbacks:
+            names.extend(cb.tensor_inputs)
+        return names
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        """
+        Run the callbacks one after another, passing the kwargs returned by each into the next, and return the
+        kwargs produced by the last one (the input kwargs when there are no callbacks).
+        """
+        current_kwargs = callback_kwargs
+        for cb in self.callbacks:
+            current_kwargs = cb(pipeline, step_index, timestep, current_kwargs)
+        return current_kwargs
+
+def seed_everything(seed):
+    """
+    Seed the PyTorch (`torch.manual_seed`, `torch.cuda.manual_seed`), Python `random` and NumPy
+    generators with the same `seed`.
+    """
+    # The seeders are applied in the order listed.
+    seeders = (torch.manual_seed, torch.cuda.manual_seed, random.seed, np.random.seed)
+    for apply_seed in seeders:
+        apply_seed(seed)
+
+def get_promptls(prompt_path):
+    """
+    Load the captions stored in a JSON prompt file.
+
+    The file has to contain a JSON array of objects that each carry a `caption` string. Every `/` in a
+    caption is replaced by `_` (presumably so captions can double as file names - confirm with callers).
+
+    Args:
+        prompt_path: Path of the JSON file to read.
+
+    Returns:
+        `List[str]`: The processed captions, in file order.
+
+    Raises:
+        KeyError: If an entry has no `caption` key.
+    """
+    # JSON text is UTF-8 by specification; reading with the platform default encoding breaks
+    # non-ASCII captions on systems whose locale is not UTF-8.
+    with open(prompt_path, encoding="utf-8") as f:
+        entries = json.load(f)
+    return [entry['caption'].replace('/', '_') for entry in entries]
+
+def warpped_feature(sample, step):
+    """
+    Tile each half of a 4-D batch `step` times along the batch axis.
+
+    sample: batch_size*dim*h*w; by the convention used here the first half of the batch is the
+        unconditional part and the second half the conditional part.
+    step: number of repetitions (timestep span)
+
+    Returns the tiled unconditional half followed by the tiled conditional half.
+    """
+    # Unpacking four values makes a non 4-D input fail right here.
+    _batch, _channels, _height, _width = sample.shape
+    negative_half, positive_half = sample.chunk(2)
+    tiled_halves = [half.repeat(step, 1, 1, 1) for half in (negative_half, positive_half)]
+    return torch.cat(tiled_halves)
+
+def warpped_skip_feature(block_samples, step):
+    """
+    Apply `warpped_feature(sample, step)` to every entry of `block_samples` and return the results
+    as a tuple in the same order.
+    """
+    return tuple(warpped_feature(block_sample, step) for block_sample in block_samples)
+
+def warpped_text_emb(text_emb, step):
+    """
+    Tile each half of a 3-D text-embedding batch `step` times along the batch axis.
+
+    text_emb: batch_size*token_len*dim (e.g. batch_size*77*768); by the convention used here the
+        first half of the batch is the unconditional part and the second half the conditional part.
+    step: number of repetitions (timestep span)
+
+    Returns the tiled unconditional half followed by the tiled conditional half.
+    """
+    # Unpacking three values makes a non 3-D input fail right here.
+    _batch, _tokens, _width = text_emb.shape
+    negative_half, positive_half = text_emb.chunk(2)
+    tiled_halves = [half.repeat(step, 1, 1) for half in (negative_half, positive_half)]
+    return torch.cat(tiled_halves)
+
+def warpped_timestep(timesteps, bs):
+    """
+    Build a flat timestep vector for a doubled batch.
+
+    timesteps: iterable of scalar tensors, such as the elements of tensor([981, 961, 941])
+    bs: full batch size; each timestep is broadcast to bs//2 entries
+
+    The per-timestep runs are concatenated in input order and the whole sequence is then emitted
+    twice, giving 2 * len(timesteps) * (bs//2) values.
+    """
+    half_batch = bs // 2
+    # t[None] turns a scalar tensor into shape (1,), which expand() then broadcasts to (half_batch,).
+    per_step = [t[None].expand(half_batch) for t in timesteps]
+    flat = torch.cat(per_step)
+    return flat.repeat(2, 1).reshape(-1)
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Blend `noise_cfg` with a copy of itself whose per-sample standard deviation is matched to that of
+    `noise_pred_text`. Follows Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
+    Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+
+    `guidance_rescale` is the blend weight: 0.0 keeps `noise_cfg`, 1.0 uses the rescaled version only.
+    """
+    # Standard deviations are taken over every axis except the leading (batch) one.
+    text_axes = list(range(1, noise_pred_text.ndim))
+    cfg_axes = list(range(1, noise_cfg.ndim))
+    text_std = noise_pred_text.std(dim=text_axes, keepdim=True)
+    cfg_std = noise_cfg.std(dim=cfg_axes, keepdim=True)
+
+    # Rescaling the guided prediction counters overexposure ...
+    rescaled = noise_cfg * (text_std / cfg_std)
+    # ... and mixing the original back in avoids "plain looking" images.
+    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
+
+def register_normal_pipeline(pipe):
+    """
+    Attach a custom text-to-image generation routine to `pipe` as the attribute `pipe.call`.
+
+    The routine is a closure over `pipe`; it is stored as a plain attribute named `call`, so
+    `pipe.__call__` itself is not replaced. Its body appears to be adapted from diffusers'
+    `StableDiffusionPipeline.__call__` (TODO confirm against the pinned diffusers version), with two
+    visible additions inside the denoising loop:
+
+    * whenever `t / 1000 < 0.5`, `0.003 * init_latents` (a copy of the starting latents) is added to
+      the current latents before the UNet is evaluated;
+    * the loop index is written to `pipe.unet.order` on every step (presumably consumed by a patched
+      UNet forward defined elsewhere - verify).
+
+    Args:
+        pipe: Pipeline object providing everything the routine uses (`unet`, `vae`, `scheduler`,
+            `encode_prompt`, `check_inputs`, `prepare_latents`, `image_processor`, ...).
+    """
+    def new_call(self):
+        @torch.no_grad()
+        def call(
+            prompt: Union[str, List[str]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 50,
+            guidance_scale: float = 7.5,
+            negative_prompt: Optional[Union[str, List[str]]] = None,
+            num_images_per_prompt: Optional[int] = 1,
+            eta: float = 0.0,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.FloatTensor] = None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            guidance_rescale: float = 0.0,
+            clip_skip: Optional[int] = None,
+            callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            **kwargs,
+        ):
+            """
+            Generate images from `prompt` (or from precomputed `prompt_embeds`).
+
+            `height` / `width` default to `unet.config.sample_size * vae_scale_factor`. The legacy
+            `callback` / `callback_steps` options are read from `**kwargs`; when `callback` is given
+            without `callback_steps` it is invoked on every progress update.
+
+            Returns:
+                `StableDiffusionPipelineOutput(images=..., nsfw_content_detected=...)`, or the tuple
+                `(image, has_nsfw_concept)` when `return_dict` is False. With `output_type="latent"`
+                the VAE decode and safety check are skipped and the latents are post-processed directly.
+            """
+
+            callback = kwargs.pop("callback", None)
+            callback_steps = kwargs.pop("callback_steps", None)
+
+
+            # 0. Default height and width to unet
+            height = height or self.unet.config.sample_size * self.vae_scale_factor
+            width = width or self.unet.config.sample_size * self.vae_scale_factor
+            # to deal with lora scaling and other possible forward hooks
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                prompt_embeds,
+                negative_prompt_embeds,
+                callback_on_step_end_tensor_inputs,
+            )
+
+            # Stored on the pipeline; the non-underscore names read below (`self.guidance_scale`,
+            # `self.clip_skip`, ...) are presumably properties over these - verify on the pipeline class.
+            self._guidance_scale = guidance_scale
+            self._guidance_rescale = guidance_rescale
+            self._clip_skip = clip_skip
+            self._cross_attention_kwargs = cross_attention_kwargs
+
+            # 2. Define call parameters
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            device = self._execution_device
+
+            # 3. Encode input prompt
+            lora_scale = (
+                self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+            )
+
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_images_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+            )
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+
+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 6.5 Optionally get Guidance Scale Embedding
+            timestep_cond = None
+            if self.unet.config.time_cond_proj_dim is not None:
+                guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+                timestep_cond = self.get_guidance_scale_embedding(
+                    guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+                ).to(device=device, dtype=latents.dtype)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            self._num_timesteps = len(timesteps)
+            # Snapshot of the starting latents, re-injected in the later part of the schedule below.
+            init_latents = latents.detach().clone()
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    # Once t drops below 500, add a small fraction of the initial latents each step.
+                    if t/1000 < 0.5:
+                        latents = latents + 0.003*init_latents
+                    # Expose the current loop index on the UNet.
+                    setattr(self.unet, 'order', i)
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        return_dict=False,
+                    )[0]
+
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                    if callback_on_step_end is not None:
+                        # Tensors are looked up by local-variable name, so the locals of this
+                        # function must keep their current names.
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        # `callback_steps` may be None when only `callback` was passed; treat that
+                        # as "every step" instead of failing on `i % None`.
+                        if callback is not None and (callback_steps is None or i % callback_steps == 0):
+                            step_idx = i // getattr(self.scheduler, "order", 1)
+                            callback(step_idx, t, latents)
+
+            if not output_type == "latent":
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                    0
+                ]
+                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+            else:
+                image = latents
+                has_nsfw_concept = None
+
+            # Flagged images are excluded from denormalisation.
+            if has_nsfw_concept is None:
+                do_denormalize = [True] * image.shape[0]
+            else:
+                do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+            # Offload all models
+            self.maybe_free_model_hooks()
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        return call
+    pipe.call = new_call(pipe)
+
+
+
+def register_very_orig_pipeline(pipe):
+    def new_call(self):
+        @torch.no_grad()
+        def call(
+            prompt: Union[str, List[str]] = None,
+            prompt_2: Optional[Union[str, List[str]]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 50,
+            timesteps: List[int] = None,
+            sigmas: List[float] = None,
+            denoising_end: Optional[float] = None,
+            guidance_scale: float = 5.0,
+            negative_prompt: Optional[Union[str, List[str]]] = None,
+            negative_prompt_2: Optional[Union[str, List[str]]] = None,
+            num_images_per_prompt: Optional[int] = 1,
+            eta: float = 0.0,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.Tensor] = None,
+            prompt_embeds: Optional[torch.Tensor] = None,
+            negative_prompt_embeds: Optional[torch.Tensor] = None,
+            pooled_prompt_embeds: Optional[torch.Tensor] = None,
+            negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+            ip_adapter_image: Optional[PipelineImageInput] = None,
+            ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            guidance_rescale: float = 0.0,
+            original_size: Optional[Tuple[int, int]] = None,
+            crops_coords_top_left: Tuple[int, int] = (0, 0),
+            target_size: Optional[Tuple[int, int]] = None,
+            negative_original_size: Optional[Tuple[int, int]] = None,
+            negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+            negative_target_size: Optional[Tuple[int, int]] = None,
+            clip_skip: Optional[int] = None,
+            callback_on_step_end: Optional[
+                Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+            ] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            **kwargs,
+        ):
+            r"""
+            Function invoked when calling the pipeline for generation.
+    
+            Args:
+                prompt (`str` or `List[str]`, *optional*):
+                    The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                    instead.
+                prompt_2 (`str` or `List[str]`, *optional*):
+                    The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                    used in both text-encoders
+                height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                    The height in pixels of the generated image. This is set to 1024 by default for the best results.
+                    Anything below 512 pixels won't work well for
+                    [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                    and checkpoints that are not specifically fine-tuned on low resolutions.
+                width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                    The width in pixels of the generated image. This is set to 1024 by default for the best results.
+                    Anything below 512 pixels won't work well for
+                    [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
+                    and checkpoints that are not specifically fine-tuned on low resolutions.
+                num_inference_steps (`int`, *optional*, defaults to 50):
+                    The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                    expense of slower inference.
+                timesteps (`List[int]`, *optional*):
+                    Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                    in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                    passed will be used. Must be in descending order.
+                sigmas (`List[float]`, *optional*):
+                    Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                    their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                    will be used.
+                denoising_end (`float`, *optional*):
+                    When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                    completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                    still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                    scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                    "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                    Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+                guidance_scale (`float`, *optional*, defaults to 5.0):
+                    Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                    `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                    Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                    1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                    usually at the expense of lower image quality.
+                negative_prompt (`str` or `List[str]`, *optional*):
+                    The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                    `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                    less than `1`).
+                negative_prompt_2 (`str` or `List[str]`, *optional*):
+                    The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                    `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+                num_images_per_prompt (`int`, *optional*, defaults to 1):
+                    The number of images to generate per prompt.
+                eta (`float`, *optional*, defaults to 0.0):
+                    Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                    [`schedulers.DDIMScheduler`], will be ignored for others.
+                generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                    One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                    to make generation deterministic.
+                latents (`torch.Tensor`, *optional*):
+                    Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                    generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                    tensor will ge generated by sampling using the supplied random `generator`.
+                prompt_embeds (`torch.Tensor`, *optional*):
+                    Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                    provided, text embeddings will be generated from `prompt` input argument.
+                negative_prompt_embeds (`torch.Tensor`, *optional*):
+                    Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                    weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                    argument.
+                pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                    Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                    If not provided, pooled text embeddings will be generated from `prompt` input argument.
+                negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
+                    Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                    weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                    input argument.
+                ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+                ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
+                    Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                    IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                    contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                    provided, embeddings are computed from the `ip_adapter_image` input argument.
+                output_type (`str`, *optional*, defaults to `"pil"`):
+                    The output format of the generate image. Choose between
+                    [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+                return_dict (`bool`, *optional*, defaults to `True`):
+                    Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
+                    of a plain tuple.
+                cross_attention_kwargs (`dict`, *optional*):
+                    A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                    `self.processor` in
+                    [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+                guidance_rescale (`float`, *optional*, defaults to 0.0):
+                    Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                    Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
+                    [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                    Guidance rescale factor should fix overexposure when using zero terminal SNR.
+                original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                    If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                    `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                    explained in section 2.2 of
+                    [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                    `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                    `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                    `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                    [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                    For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                    not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                    section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+                negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                    To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                    micro-conditioning as explained in section 2.2 of
+                    [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                    information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+                negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                    To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
+                    micro-conditioning as explained in section 2.2 of
+                    [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                    information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+                negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                    To negatively condition the generation process based on a target image resolution. It should be as same
+                    as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                    [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                    information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+                callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                    A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                    each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
+                    DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                    list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+                callback_on_step_end_tensor_inputs (`List`, *optional*):
+                    The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                    will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                    `._callback_tensor_inputs` attribute of your pipeline class.
+    
+            Examples:
+    
+            Returns:
+                [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
+                [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
+                `tuple`. When returning a tuple, the first element is a list with the generated images.
+            """
+    
+            callback = kwargs.pop("callback", None)
+            callback_steps = kwargs.pop("callback_steps", None)
+    
+            if callback is not None:
+                deprecate(
+                    "callback",
+                    "1.0.0",
+                    "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+                )
+            if callback_steps is not None:
+                deprecate(
+                    "callback_steps",
+                    "1.0.0",
+                    "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+                )
+    
+            if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+                callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+    
+            # 0. Default height and width to unet
+            height = height or self.default_sample_size * self.vae_scale_factor
+            width = width or self.default_sample_size * self.vae_scale_factor
+    
+            original_size = original_size or (height, width)
+            target_size = target_size or (height, width)
+    
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                prompt_2,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                negative_prompt_2,
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                callback_on_step_end_tensor_inputs,
+            )
+    
+            self._guidance_scale = guidance_scale
+            self._guidance_rescale = guidance_rescale
+            self._clip_skip = clip_skip
+            self._cross_attention_kwargs = cross_attention_kwargs
+            self._denoising_end = denoising_end
+            self._interrupt = False
+    
+            # 2. Define call parameters
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+    
+            device = self._execution_device
+    
+            # 3. Encode input prompt
+            lora_scale = (
+                self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+            )
+    
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                prompt_2=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                negative_prompt_2=negative_prompt_2,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=pooled_prompt_embeds,
+                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+            )
+    
+            # 4. Prepare timesteps
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler, num_inference_steps, device, timesteps, sigmas
+            )
+    
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+    
+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+    
+            # 7. Prepare added time ids & embeddings
+            add_text_embeds = pooled_prompt_embeds
+            if self.text_encoder_2 is None:
+                text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
+            else:
+                text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
+    
+            add_time_ids = self._get_add_time_ids(
+                original_size,
+                crops_coords_top_left,
+                target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=text_encoder_projection_dim,
+            )
+            if negative_original_size is not None and negative_target_size is not None:
+                negative_add_time_ids = self._get_add_time_ids(
+                    negative_original_size,
+                    negative_crops_coords_top_left,
+                    negative_target_size,
+                    dtype=prompt_embeds.dtype,
+                    text_encoder_projection_dim=text_encoder_projection_dim,
+                )
+            else:
+                negative_add_time_ids = add_time_ids
+    
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+                add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+                add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+    
+            prompt_embeds = prompt_embeds.to(device)
+            add_text_embeds = add_text_embeds.to(device)
+            add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+    
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                image_embeds = self.prepare_ip_adapter_image_embeds(
+                    ip_adapter_image,
+                    ip_adapter_image_embeds,
+                    device,
+                    batch_size * num_images_per_prompt,
+                    self.do_classifier_free_guidance,
+                )
+    
+            # 8. Denoising loop
+            num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+    
+            # 8.1 Apply denoising_end
+            if (
+                self.denoising_end is not None
+                and isinstance(self.denoising_end, float)
+                and self.denoising_end > 0
+                and self.denoising_end < 1
+            ):
+                discrete_timestep_cutoff = int(
+                    round(
+                        self.scheduler.config.num_train_timesteps
+                        - (self.denoising_end * self.scheduler.config.num_train_timesteps)
+                    )
+                )
+                num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
+                timesteps = timesteps[:num_inference_steps]
+    
+            # 9. Optionally get Guidance Scale Embedding
+            timestep_cond = None
+            if self.unet.config.time_cond_proj_dim is not None:
+                guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+                timestep_cond = self.get_guidance_scale_embedding(
+                    guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+                ).to(device=device, dtype=latents.dtype)
+    
+            self._num_timesteps = len(timesteps)
+            with self.progress_bar(total=num_inference_steps) as progress_bar:
+                for i, t in enumerate(timesteps):
+                    if self.interrupt:
+                        continue
+    
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+    
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+    
+                    # predict the noise residual
+                    added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                    if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                        added_cond_kwargs["image_embeds"] = image_embeds
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=prompt_embeds,
+                        timestep_cond=timestep_cond,
+                        cross_attention_kwargs=self.cross_attention_kwargs,
+                        added_cond_kwargs=added_cond_kwargs,
+                        return_dict=False,
+                    )[0]
+    
+                    # perform guidance
+                    if self.do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+    
+                    if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                        # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+    
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents_dtype = latents.dtype
+                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                    if latents.dtype != latents_dtype:
+                        if torch.backends.mps.is_available():
+                            # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                            latents = latents.to(latents_dtype)
+    
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs:
+                            callback_kwargs[k] = locals()[k]
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+    
+                        latents = callback_outputs.pop("latents", latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                        add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                        negative_pooled_prompt_embeds = callback_outputs.pop(
+                            "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                        )
+                        add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                        negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)
+    
+                    # call the callback, if provided
+                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                        progress_bar.update()
+                        if callback is not None and i % callback_steps == 0:
+                            step_idx = i // getattr(self.scheduler, "order", 1)
+                            callback(step_idx, t, latents)
+    
+                    if XLA_AVAILABLE:
+                        xm.mark_step()
+    
+            if not output_type == "latent":
+                # make sure the VAE is in float32 mode, as it overflows in float16
+                needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+    
+                if needs_upcasting:
+                    self.upcast_vae()
+                    latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+                elif latents.dtype != self.vae.dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        self.vae = self.vae.to(latents.dtype)
+    
+                # unscale/denormalize the latents
+                # denormalize with the mean and std if available and not None
+                has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+                has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+                if has_latents_mean and has_latents_std:
+                    latents_mean = (
+                        torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                    )
+                    latents_std = (
+                        torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                    )
+                    latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+                else:
+                    latents = latents / self.vae.config.scaling_factor
+    
+                image = self.vae.decode(latents, return_dict=False)[0]
+    
+                # cast back to fp16 if needed
+                if needs_upcasting:
+                    self.vae.to(dtype=torch.float16)
+            else:
+                image = latents
+    
+            if not output_type == "latent":
+                # apply watermark if available
+                if self.watermark is not None:
+                    image = self.watermark.apply_watermark(image)
+    
+                image = self.image_processor.postprocess(image, output_type=output_type)
+    
+            # Offload all models
+            self.maybe_free_model_hooks()
+    
+            if not return_dict:
+                return (image,)
+    
+            return StableDiffusionXLPipelineOutput(images=image)
+        return call
+    pipe.call = new_call(pipe)
+
+
+def register_parallel_pipeline_orig(pipe, mod='50ls'):
+    def new_call(self):
+        @torch.no_grad()
+        def call(
+            prompt: Union[str, List[str]] = None,
+            prompt_2: Optional[Union[str, List[str]]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 50,
+            timesteps: List[int] = None,
+            sigmas: List[float] = None,
+            denoising_end: Optional[float] = None,
+            guidance_scale: float = 5.0,
+            negative_prompt: Optional[Union[str, List[str]]] = None,
+            negative_prompt_2: Optional[Union[str, List[str]]] = None,
+            num_images_per_prompt: Optional[int] = 1,
+            eta: float = 0.0,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.Tensor] = None,
+            prompt_embeds: Optional[torch.Tensor] = None,
+            negative_prompt_embeds: Optional[torch.Tensor] = None,
+            pooled_prompt_embeds: Optional[torch.Tensor] = None,
+            negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+            ip_adapter_image: Optional[PipelineImageInput] = None,
+            ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            guidance_rescale: float = 0.0,
+            original_size: Optional[Tuple[int, int]] = None,
+            crops_coords_top_left: Tuple[int, int] = (0, 0),
+            target_size: Optional[Tuple[int, int]] = None,
+            negative_original_size: Optional[Tuple[int, int]] = None,
+            negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
+            negative_target_size: Optional[Tuple[int, int]] = None,
+            clip_skip: Optional[int] = None,
+            callback_on_step_end: Optional[
+                Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+            ] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            **kwargs,
+        ):
+
+            callback = kwargs.pop("callback", None)
+            callback_steps = kwargs.pop("callback_steps", None)
+
+            if callback is not None:
+                deprecate(
+                    "callback",
+                    "1.0.0",
+                    "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+                )
+            if callback_steps is not None:
+                deprecate(
+                    "callback_steps",
+                    "1.0.0",
+                    "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
+                )
+
+            if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+                callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+            # 0. Default height and width to unet
+            height = height or self.default_sample_size * self.vae_scale_factor
+            width = width or self.default_sample_size * self.vae_scale_factor
+            original_size = original_size or (height, width)
+            target_size = target_size or (height, width)
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                prompt_2,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                negative_prompt_2,
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+                ip_adapter_image,
+                ip_adapter_image_embeds,
+                callback_on_step_end_tensor_inputs,
+            )
+
+            self._guidance_scale = guidance_scale
+            self._guidance_rescale = guidance_rescale
+            self._clip_skip = clip_skip
+            self._cross_attention_kwargs = cross_attention_kwargs
+            self._denoising_end = denoising_end
+            self._interrupt = False
+
+            # 2. Define call parameters
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            device = self._execution_device
+
+            # 3. Encode input prompt (mostly unchanged from original)
+            lora_scale = (
+                self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+            )
+
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+                pooled_prompt_embeds,
+                negative_pooled_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                prompt_2=prompt_2,
+                device=device,
+                num_images_per_prompt=num_images_per_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                negative_prompt=negative_prompt,
+                negative_prompt_2=negative_prompt_2,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=pooled_prompt_embeds,
+                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+            )
+
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+                pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+
+            # 4. Prepare timesteps (modified to handle parallel processing)
+            self.scheduler.set_timesteps(num_inference_steps, device=device) #Simplified timestep setting
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables (unchanged)
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+
+            # 6. Prepare extra step kwargs (unchanged)
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 6.5 Optionally get Guidance Scale Embedding (unchanged)
+            timestep_cond = None
+            if self.unet.config.time_cond_proj_dim is not None:
+                guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+                timestep_cond = self.get_guidance_scale_embedding(
+                    guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+                ).to(device=device, dtype=latents.dtype)
+
+            # 7. Denoising loop (modified for parallel processing)
+            init_latents = latents.detach().clone()
+            all_steps = len(self.scheduler.timesteps)
+            curr_step = 0
+            if mod == '50ls':
+                cond = lambda timestep: timestep in [0, 1, 2, 3, 5, 10, 15, 25, 35] #Example condition, adjust as needed
+            elif isinstance(mod, int):
+                cond = lambda timestep: timestep % mod == 0
+            else:
+                raise Exception("Currently not supported, But you can modify the code to customize the keytime")
+
+            #added time_ids and add_text_embeds for original pipeline compatibility
+            add_time_ids = self._get_add_time_ids(
+                original_size,
+                crops_coords_top_left,
+                target_size,
+                dtype=prompt_embeds.dtype,
+                text_encoder_projection_dim=pooled_prompt_embeds.shape[-1] if self.text_encoder_2 is None else self.text_encoder_2.config.projection_dim,
+            )
+            if negative_original_size is not None and negative_target_size is not None:
+                negative_add_time_ids = self._get_add_time_ids(
+                    negative_original_size,
+                    negative_crops_coords_top_left,
+                    negative_target_size,
+                    dtype=prompt_embeds.dtype,
+                    text_encoder_projection_dim=pooled_prompt_embeds.shape[-1] if self.text_encoder_2 is None else self.text_encoder_2.config.projection_dim,
+                )
+            else:
+                negative_add_time_ids = add_time_ids
+            
+            if self.do_classifier_free_guidance:
+                add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+
+            add_text_embeds = pooled_prompt_embeds.to(device)
+            add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+
+
+            while curr_step < all_steps:
+                register_time(self.unet, curr_step) # Assuming register_time is defined elsewhere
+                time_ls = [timesteps[curr_step]]
+                curr_step += 1
+                while not cond(curr_step) and curr_step < all_steps:
+                    time_ls.append(timesteps[curr_step])
+                    curr_step += 1
+
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                #added kwargs for original pipeline compatibility
+                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
+                    image_embeds = self.prepare_ip_adapter_image_embeds(
+                        ip_adapter_image,
+                        ip_adapter_image_embeds,
+                        device,
+                        batch_size * num_images_per_prompt,
+                        self.do_classifier_free_guidance,
+                    )
+                    added_cond_kwargs["image_embeds"] = image_embeds
+
+                noise_pred = self.unet(
+                    latent_model_input,
+                    time_ls,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                bs = noise_pred.shape[0]
+                bs_perstep = bs // len(time_ls)
+                denoised_latent = latents
+                for i, timestep in enumerate(time_ls):
+                    if timestep / 1000 < 0.5:
+                        denoised_latent = denoised_latent + 0.003 * init_latents
+                    curr_noise = noise_pred[i * bs_perstep : (i + 1) * bs_perstep]
+                    denoised_latent = self.scheduler.step(
+                        curr_noise, timestep, denoised_latent, **extra_step_kwargs, return_dict=False
+                    )[0]
+                latents = denoised_latent
+
+            #rest of the code remains largely the same as the original pipeline
+            if not output_type == "latent":
+                needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+                if needs_upcasting:
+                    self.upcast_vae()
+                    latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+                elif latents.dtype != self.vae.dtype:
+                    if torch.backends.mps.is_available():
+                        self.vae = self.vae.to(latents.dtype)
+
+                has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
+                has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
+                if has_latents_mean and has_latents_std:
+                    latents_mean = (
+                        torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                    )
+                    latents_std = (
+                        torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
+                    )
+                    latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
+                else:
+                    latents = latents / self.vae.config.scaling_factor
+
+                image = self.vae.decode(latents, return_dict=False)[0]
+
+                if needs_upcasting:
+                    self.vae.to(dtype=torch.float16)
+            else:
+                image = latents
+
+            if not output_type == "latent":
+                if self.watermark is not None:
+                    image = self.watermark.apply_watermark(image)
+
+                image = self.image_processor.postprocess(image, output_type=output_type)
+
+            self.maybe_free_model_hooks()
+
+            if not return_dict:
+                return (image,)
+
+            return StableDiffusionXLPipelineOutput(images=image)  # Assuming this output type exists
+
+        return call
+
+    pipe.call = new_call(pipe)
+
+def register_parallel_pipeline(pipe, mod = '50ls'):
+    def new_call(self):
+        @torch.no_grad()
+        def call(
+            prompt: Union[str, List[str]] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 50,
+            guidance_scale: float = 7.5,
+            negative_prompt: Optional[Union[str, List[str]]] = None,
+            num_images_per_prompt: Optional[int] = 1,
+            eta: float = 0.0,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.FloatTensor] = None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            guidance_rescale: float = 0.0,
+            clip_skip: Optional[int] = None,
+            callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+            callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+            **kwargs,
+        ):
+
+            callback = kwargs.pop("callback", None)
+            callback_steps = kwargs.pop("callback_steps", None)
+
+
+            # 0. Default height and width to unet
+            height = height or self.unet.config.sample_size * self.vae_scale_factor
+            width = width or self.unet.config.sample_size * self.vae_scale_factor
+            # to deal with lora scaling and other possible forward hooks
+
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                height,
+                width,
+                callback_steps,
+                negative_prompt,
+                prompt_embeds,
+                negative_prompt_embeds,
+                callback_on_step_end_tensor_inputs,
+            )
+
+            self._guidance_scale = guidance_scale
+            self._guidance_rescale = guidance_rescale
+            self._clip_skip = clip_skip
+            self._cross_attention_kwargs = cross_attention_kwargs
+
+            # 2. Define call parameters
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            device = self._execution_device
+
+            # 3. Encode input prompt
+            lora_scale = (
+                self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
+            )
+
+            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+                prompt,
+                device,
+                num_images_per_prompt,
+                self.do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=lora_scale,
+                clip_skip=self.clip_skip,
+            )
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            if self.do_classifier_free_guidance:
+                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+            # 4. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+            # 5. Prepare latent variables
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+
+            # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 6.5 Optionally get Guidance Scale Embedding
+            timestep_cond = None
+            if self.unet.config.time_cond_proj_dim is not None:
+                guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
+                timestep_cond = self.get_guidance_scale_embedding(
+                    guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+                ).to(device=device, dtype=latents.dtype)
+
+            # 7. Denoising loop
+            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+            self._num_timesteps = len(timesteps)
+            init_latents = latents.detach().clone()
+            #-------------------------------------------------------
+            all_steps = len(self.scheduler.timesteps)
+            curr_step = 0
+            if mod == '50ls':
+                cond = lambda timestep: timestep in [0,1,2,3,5,10,15,25,35]
+            elif isinstance(mod, int):
+                cond = lambda timestep: timestep % mod ==0
+            else:
+                raise Exception("Currently not supported, But you can modify the code to customize the keytime")
+            while curr_step<all_steps:
+                register_time(self.unet, curr_step)
+                time_ls = [self.scheduler.timesteps[curr_step]]
+                curr_step += 1
+                while not cond(curr_step):
+                    if curr_step<all_steps:
+                        time_ls.append(self.scheduler.timesteps[curr_step])
+                        curr_step += 1
+                    else:
+                        break
+
+                ##--------------------------------
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    time_ls,
+                    encoder_hidden_states=prompt_embeds,
+                    timestep_cond=timestep_cond,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                bs = noise_pred.shape[0]
+                bs_perstep = bs//len(time_ls)
+
+                denoised_latent = latents
+                for i, timestep in enumerate(time_ls):
+                    if timestep/1000 < 0.5:
+                        denoised_latent = denoised_latent + 0.003*init_latents
+                    curr_noise = noise_pred[i*bs_perstep:(i+1)*bs_perstep]
+                    denoised_latent = self.scheduler.step(curr_noise, timestep, denoised_latent, **extra_step_kwargs, return_dict=False)[0]
+                
+                latents = denoised_latent
+                ##----------------------------------------
+
+           
+            if not output_type == "latent":
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                    0
+                ]
+                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+            else:
+                image = latents
+                has_nsfw_concept = None
+
+            if has_nsfw_concept is None:
+                do_denormalize = [True] * image.shape[0]
+            else:
+                do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+            # Offload all models
+            self.maybe_free_model_hooks()
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        return call
+    pipe.call = new_call(pipe)
+
+def register_faster_forward(model, mod = '50ls'):
+    def faster_forward(self):
+        def forward(
+                sample: torch.FloatTensor,
+                timestep: Union[torch.Tensor, float, int],
+                encoder_hidden_states: torch.Tensor,
+                class_labels: Optional[torch.Tensor] = None,
+                timestep_cond: Optional[torch.Tensor] = None,
+                attention_mask: Optional[torch.Tensor] = None,
+                cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+                added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, # ADDED
+                down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+                mid_block_additional_residual: Optional[torch.Tensor] = None,
+                down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, # ADDED
+                return_dict: bool = True,
+            ) -> Union[UNet2DConditionOutput, Tuple]:
+                r"""
+                Args:
+                    sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+                    timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+                    encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+                    return_dict (`bool`, *optional*, defaults to `True`):
+                        Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+                    cross_attention_kwargs (`dict`, *optional*):
+                        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                        `self.processor` in
+                        [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+                Returns:
+                    [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                    [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+                    returning a tuple, the first element is the sample tensor.
+                """
+                # By default samples have to be AT least a multiple of the overall upsampling factor.
+                # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+                # However, the upsampling interpolation output size can be forced to fit any upsampling size
+                # on the fly if necessary.
+                default_overall_up_factor = 2**self.num_upsamplers
+
+                # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+                forward_upsample_size = False
+                upsample_size = None
+
+                if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+                    print("Forward upsample size to force interpolation output size.")
+                    forward_upsample_size = True
+
+                # prepare attention_mask
+                if attention_mask is not None:
+                    attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+                    attention_mask = attention_mask.unsqueeze(1)
+
+                # 0. center input if necessary
+                if self.config.center_input_sample:
+                    sample = 2 * sample - 1.0
+
+                # 1. time
+                if isinstance(timestep, list):
+                    timesteps = timestep[0]
+                    step = len(timestep)
+                else:
+                    timesteps = timestep
+                    step = 1
+                if not torch.is_tensor(timesteps) and (not isinstance(timesteps,list)):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = sample.device.type == "mps"
+                    if isinstance(timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+                elif (not isinstance(timesteps,list)) and len(timesteps.shape) == 0:
+                    timesteps = timesteps[None].to(sample.device)
+                
+                if (not isinstance(timesteps,list)) and len(timesteps.shape) == 1:
+                    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                    timesteps = timesteps.expand(sample.shape[0])
+                elif isinstance(timesteps, list):
+                    #timesteps list, such as [981,961,941]
+                    timesteps = warpped_timestep(timesteps, sample.shape[0]).to(sample.device)
+                t_emb = self.time_proj(timesteps)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # but time_embedding might actually be running in fp16. so we need to cast here.
+                # there might be better ways to encapsulate this.
+                t_emb = t_emb.to(dtype=self.dtype)
+
+                emb = self.time_embedding(t_emb, timestep_cond)
+
+                # class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
+                # if class_emb is not None:
+                #     if self.config.class_embeddings_concat:
+                #         emb = torch.cat([emb, class_emb], dim=-1)
+                #     else:
+                #         emb = emb + class_emb
+        
+                # aug_emb = self.get_aug_embed(
+                #     emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+                # )
+        
+        
+                # encoder_hidden_states = self.process_encoder_hidden_states(
+                #     encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+                # )
+
+                if self.class_embedding is not None:
+                    if class_labels is None:
+                        raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+                    if self.config.class_embed_type == "timestep":
+                        class_labels = self.time_proj(class_labels)
+
+                        # `Timesteps` does not contain any weights and will always return f32 tensors
+                        # there might be better ways to encapsulate this.
+                        class_labels = class_labels.to(dtype=sample.dtype)
+
+                    class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+
+                    if self.config.class_embeddings_concat:
+                        emb = torch.cat([emb, class_emb], dim=-1)
+                    else:
+                        emb = emb + class_emb
+
+                if self.config.addition_embed_type == "text":
+                    aug_emb = self.add_embedding(encoder_hidden_states)
+                    emb = emb + aug_emb
+
+                    # emb = emb + aug_emb if aug_emb is not None else emb
+
+                if self.time_embed_act is not None:
+                    emb = self.time_embed_act(emb)
+
+                if self.encoder_hid_proj is not None:
+                    print("Encoder hid projections are not none")
+                    encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+
+                #===============
+                order = self.order #timestep, start by 0
+                #===============
+                ipow = int(np.sqrt(9 + 8*order))
+                cond = order in [0, 1, 2, 3, 5, 10, 15, 25, 35]
+                if isinstance(mod, int):
+                    cond = order % mod == 0
+                elif mod == "pro":
+                    cond = ipow * ipow == (9 + 8 * order)
+                elif mod == "50ls":
+                    cond = order in [0, 1, 2, 3, 5, 10, 15, 25, 35] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls2":
+                    cond = order in [0, 10, 11, 12, 15, 20, 25, 30,35,45] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls3":
+                    cond = order in [0, 20, 25, 30,35,45,46,47,48,49] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls4":
+                    cond = order in [0, 9, 13, 14, 15, 28, 29, 32, 36,45] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "100ls":
+                    cond = order > 85 or order < 10 or order % 5 == 0
+                elif mod == "75ls":
+                    cond = order > 65 or order < 10 or order % 5 == 0
+                elif mod == "s2":
+                    cond = order < 20 or order > 40 or order % 2 == 0
+
+                if cond:
+                    # print('current timestep:', order)
+                    # 2. pre-process
+                    sample = self.conv_in(sample)
+
+                    # 3. down
+                    down_block_res_samples = (sample,)
+                    print(f"Shape of sample at the start of downsample: {sample.shape}")
+                    for i, downsample_block in enumerate(self.down_blocks):
+                        print(f"\n--- Downsample Block {i+1} ---")
+                        print(f"  Input shape: {sample.shape}")
+                        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                            print("has cross attention")
+                            additional_residuals = {}
+                            if down_intrablock_additional_residuals is not None and len(down_intrablock_additional_residuals) > 0:
+                                additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+                                print(f"  Additional Residuals shape: {additional_residuals['additional_residuals'].shape}")
+                    
+                            sample, res_samples = downsample_block(
+                                hidden_states=sample,
+                                temb=emb,
+                                encoder_hidden_states=encoder_hidden_states,
+                                attention_mask=attention_mask,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                **additional_residuals
+                            )
+                        else:
+                            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+                            # if down_intrablock_additional_residuals is not None and len(down_intrablock_additional_residuals) > 0:
+                            #     sample += down_intrablock_additional_residuals.pop(0)
+                        print(f"  Output shape: {sample.shape}")
+                        for j, res_sample in enumerate(res_samples):
+                            print(f"  Res sample {j+1} shape: {res_sample.shape}")
+                    down_block_res_samples += res_samples
+
+                    
+
+                    if down_block_additional_residuals is not None:
+                        new_down_block_res_samples = ()
+
+                        for down_block_res_sample, down_block_additional_residual in zip(
+                            down_block_res_samples, down_block_additional_residuals
+                        ):
+                            down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                            new_down_block_res_samples += (down_block_res_sample,)
+
+                        down_block_res_samples = new_down_block_res_samples
+
+                    # Handle ControlNet additional residuals
+                    if down_block_additional_residuals is not None:
+                        new_down_block_res_samples = ()
+                        for down_block_res_sample, down_block_additional_residual in zip(
+                                down_block_res_samples, down_block_additional_residuals
+                        ):
+                            down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                            new_down_block_res_samples += (down_block_res_sample,)
+                        down_block_res_samples = new_down_block_res_samples
+                        
+                    # 4. mid
+                    if self.mid_block is not None:
+                        if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                            sample = self.mid_block(
+                                sample,
+                                emb,
+                                encoder_hidden_states=encoder_hidden_states,
+                                attention_mask=attention_mask,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                            )
+                        else:
+                            sample = self.mid_block(sample, emb)
+                        #Handle T2I-Adapter-XL
+                        if down_intrablock_additional_residuals is not None and len(down_intrablock_additional_residuals) > 0 and sample.shape == down_intrablock_additional_residuals[0].shape:
+                            sample += down_intrablock_additional_residuals.pop(0)
+
+                    if mid_block_additional_residual is not None:
+                        sample = sample + mid_block_additional_residual
+
+                    #----------------------save feature-------------------------
+                    # setattr(self, 'skip_feature', (tmp_sample.clone() for tmp_sample in down_block_res_samples))
+                    setattr(self, 'skip_feature', deepcopy(down_block_res_samples))
+                    setattr(self, 'toup_feature', sample.detach().clone())
+                    #-----------------------save feature------------------------
+
+
+
+                    #-------------------expand feature for parallel---------------
+                    if isinstance(timestep, list):
+                        #timesteps list, such as [981,961,941]
+                        timesteps = warpped_timestep(timestep, sample.shape[0]).to(sample.device)
+                        t_emb = self.time_proj(timesteps)
+
+                        # `Timesteps` does not contain any weights and will always return f32 tensors
+                        # but time_embedding might actually be running in fp16. so we need to cast here.
+                        # there might be better ways to encapsulate this.
+                        t_emb = t_emb.to(dtype=self.dtype)
+
+                        emb = self.time_embedding(t_emb, timestep_cond)
+
+                    down_block_res_samples = warpped_skip_feature(down_block_res_samples, step)
+                    sample = warpped_feature(sample, step)
+                    encoder_hidden_states = warpped_text_emb(encoder_hidden_states, step)
+                    #-------------------expand feature for parallel---------------
+                    
+                else:
+                    down_block_res_samples = self.skip_feature
+                    sample = self.toup_feature
+
+                    #-------------------expand feature for parallel---------------
+                    down_block_res_samples = warpped_skip_feature(down_block_res_samples, step)
+                    sample = warpped_feature(sample, step)
+                    encoder_hidden_states = warpped_text_emb(encoder_hidden_states, step)
+                    #-------------------expand feature for parallel---------------
+
+                # 5. up
+                for i, upsample_block in enumerate(self.up_blocks):
+                    is_final_block = i == len(self.up_blocks) - 1
+
+                    res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+                    down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+                    # if we have not reached the final block and need to forward the
+                    # upsample size, we do it here
+                    if not is_final_block and forward_upsample_size:
+                        upsample_size = down_block_res_samples[-1].shape[2:]
+
+                    if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                        sample = upsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            res_hidden_states_tuple=res_samples,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            upsample_size=upsample_size,
+                            attention_mask=attention_mask,
+                        )
+                    else:
+                        sample = upsample_block(
+                            hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                        )
+
+                # 6. post-process
+                if self.conv_norm_out:
+                    sample = self.conv_norm_out(sample)
+                    sample = self.conv_act(sample)
+                sample = self.conv_out(sample)
+
+                if not return_dict:
+                    return (sample,)
+
+                return UNet2DConditionOutput(sample=sample)
+        return forward
+    if model.__class__.__name__ == 'UNet2DConditionModel':
+        model.forward = faster_forward(model)
+
+def register_orig_forward(model, mod = '50ls'):
+    def orig_forward(self):
+        def forward(
+                sample: torch.FloatTensor,
+                timestep: Union[torch.Tensor, float, int],
+                encoder_hidden_states: torch.Tensor,
+                class_labels: Optional[torch.Tensor] = None,
+                timestep_cond: Optional[torch.Tensor] = None,
+                attention_mask: Optional[torch.Tensor] = None,
+                cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+                added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+                down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+                mid_block_additional_residual: Optional[torch.Tensor] = None,
+                down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+                encoder_attention_mask: Optional[torch.Tensor] = None,
+                return_dict: bool = True,
+            ) -> Union[UNet2DConditionOutput, Tuple]:
+            
+                default_overall_up_factor = 2**self.num_upsamplers
+        
+                # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+                forward_upsample_size = False
+                upsample_size = None
+        
+                for dim in sample.shape[-2:]:
+                    if dim % default_overall_up_factor != 0:
+                        # Forward upsample size to force interpolation output size.
+                        forward_upsample_size = True
+                        break
+        
+                if attention_mask is not None:
+                    attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+                    attention_mask = attention_mask.unsqueeze(1)
+        
+                # convert encoder_attention_mask to a bias the same way we do for attention_mask
+                if encoder_attention_mask is not None:
+                    encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+                    encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        
+                # 0. center input if necessary
+                if self.config.center_input_sample:
+                    sample = 2 * sample - 1.0
+        
+                # 1. time
+                timesteps = timestep
+                if not torch.is_tensor(timesteps):
+                    is_mps = sample.device.type == "mps"
+                    if isinstance(timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+                elif len(timesteps.shape) == 0:
+                    timesteps = timesteps[None].to(sample.device)
+        
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timesteps = timesteps.expand(sample.shape[0])
+        
+                t_emb = self.time_proj(timesteps)
+        
+                t_emb = t_emb.to(dtype=sample.dtype)
+        
+                emb = self.time_embedding(t_emb, timestep_cond)
+                aug_emb = None
+        
+                if self.class_embedding is not None:
+                    if class_labels is None:
+                        raise ValueError("class_labels should be provided when num_class_embeds > 0")
+        
+                    if self.config.class_embed_type == "timestep":
+                        class_labels = self.time_proj(class_labels)
+                        class_labels = class_labels.to(dtype=sample.dtype)
+        
+                    class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+        
+                    if self.config.class_embeddings_concat:
+                        emb = torch.cat([emb, class_emb], dim=-1)
+                    else:
+                        emb = emb + class_emb
+        
+                if self.config.addition_embed_type == "text":
+                    aug_emb = self.add_embedding(encoder_hidden_states)
+                elif self.config.addition_embed_type == "text_image":
+                    # Kandinsky 2.1 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                        )
+        
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+                    aug_emb = self.add_embedding(text_embs, image_embs)
+                elif self.config.addition_embed_type == "text_time":
+                    # SDXL - style
+                    if "text_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                        )
+                    text_embeds = added_cond_kwargs.get("text_embeds")
+                    if "time_ids" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                        )
+                    time_ids = added_cond_kwargs.get("time_ids")
+                    time_embeds = self.add_time_proj(time_ids.flatten())
+                    time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+                    add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+                    add_embeds = add_embeds.to(emb.dtype)
+                    aug_emb = self.add_embedding(add_embeds)
+                elif self.config.addition_embed_type == "image":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                        )
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    aug_emb = self.add_embedding(image_embs)
+                elif self.config.addition_embed_type == "image_hint":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                        )
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    hint = added_cond_kwargs.get("hint")
+                    aug_emb, hint = self.add_embedding(image_embs, hint)
+                    sample = torch.cat([sample, hint], dim=1)
+        
+                emb = emb + aug_emb if aug_emb is not None else emb
+        
+                if self.time_embed_act is not None:
+                    emb = self.time_embed_act(emb)
+        
+                if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+                    encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+                    # Kadinsky 2.1 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+        
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
+                    encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
+        
+                # 2. pre-process
+                sample = self.conv_in(sample)
+        
+                # 2.5 GLIGEN position net
+                if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+                    cross_attention_kwargs = cross_attention_kwargs.copy()
+                    gligen_args = cross_attention_kwargs.pop("gligen")
+                    cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+        
+                # 3. down
+                lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+                if USE_PEFT_BACKEND:
+                    # weight the lora layers by setting `lora_scale` for each PEFT layer
+                    scale_lora_layers(self, lora_scale)
+        
+                is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+                # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+                is_adapter = down_intrablock_additional_residuals is not None
+                # maintain backward compatibility for legacy usage, where
+                #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+                #       but can only use one or the other
+                if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+                    deprecate(
+                        "T2I should not use down_block_additional_residuals",
+                        "1.3.0",
+                        "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                               and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                               for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                        standard_warn=False,
+                    )
+                    down_intrablock_additional_residuals = down_block_additional_residuals
+                    is_adapter = True
+        
+                down_block_res_samples = (sample,)
+                for downsample_block in self.down_blocks:
+                    if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                        # For t2i-adapter CrossAttnDownBlock2D
+                        additional_residuals = {}
+                        if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                            additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+        
+                        sample, res_samples = downsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            encoder_hidden_states=encoder_hidden_states,
+                            attention_mask=attention_mask,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            encoder_attention_mask=encoder_attention_mask,
+                            **additional_residuals,
+                        )
+                    else:
+                        sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                        if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                            sample += down_intrablock_additional_residuals.pop(0)
+        
+                    down_block_res_samples += res_samples
+        
+                if is_controlnet:
+                    new_down_block_res_samples = ()
+        
+                    for down_block_res_sample, down_block_additional_residual in zip(
+                        down_block_res_samples, down_block_additional_residuals
+                    ):
+                        down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                        new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+        
+                    down_block_res_samples = new_down_block_res_samples
+        
+                # 4. mid
+                if self.mid_block is not None:
+                    if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                        sample = self.mid_block(
+                            sample,
+                            emb,
+                            encoder_hidden_states=encoder_hidden_states,
+                            attention_mask=attention_mask,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            encoder_attention_mask=encoder_attention_mask,
+                        )
+                    else:
+                        sample = self.mid_block(sample, emb)
+        
+                    # To support T2I-Adapter-XL
+                    if (
+                        is_adapter
+                        and len(down_intrablock_additional_residuals) > 0
+                        and sample.shape == down_intrablock_additional_residuals[0].shape
+                    ):
+                        sample += down_intrablock_additional_residuals.pop(0)
+        
+                if is_controlnet:
+                    sample = sample + mid_block_additional_residual
+        
+                # 5. up
+                for i, upsample_block in enumerate(self.up_blocks):
+                    is_final_block = i == len(self.up_blocks) - 1
+        
+                    res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+                    down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+        
+                    # if we have not reached the final block and need to forward the
+                    # upsample size, we do it here
+                    if not is_final_block and forward_upsample_size:
+                        upsample_size = down_block_res_samples[-1].shape[2:]
+        
+                    if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                        sample = upsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            res_hidden_states_tuple=res_samples,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            upsample_size=upsample_size,
+                            attention_mask=attention_mask,
+                            encoder_attention_mask=encoder_attention_mask,
+                        )
+                    else:
+                        sample = upsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            res_hidden_states_tuple=res_samples,
+                            upsample_size=upsample_size,
+                            scale=lora_scale,
+                        )
+        
+                # 6. post-process
+                if self.conv_norm_out:
+                    sample = self.conv_norm_out(sample)
+                    sample = self.conv_act(sample)
+                sample = self.conv_out(sample)
+        
+                if USE_PEFT_BACKEND:
+                    # remove `lora_scale` from each PEFT layer
+                    unscale_lora_layers(self, lora_scale)
+        
+                if not return_dict:
+                    return (sample,)
+        
+                return UNet2DConditionOutput(sample=sample)
+        return forward
+    if model.__class__.__name__ == 'UNet2DConditionModel':
+        model.forward = orig_forward(model)
+
+def register_faster_orig_forward(model, mod = '50ls4'):
+    def faster_orig_forward(self):
+        def forward(
+                sample: torch.FloatTensor,
+                timestep: Union[torch.Tensor, float, int],
+                encoder_hidden_states: torch.Tensor,
+                class_labels: Optional[torch.Tensor] = None,
+                timestep_cond: Optional[torch.Tensor] = None,
+                attention_mask: Optional[torch.Tensor] = None,
+                cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+                added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+                down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+                mid_block_additional_residual: Optional[torch.Tensor] = None,
+                down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+                encoder_attention_mask: Optional[torch.Tensor] = None,
+                return_dict: bool = True,
+            ) -> Union[UNet2DConditionOutput, Tuple]:
+            
+                default_overall_up_factor = 2**self.num_upsamplers
+        
+                # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+                forward_upsample_size = False
+                upsample_size = None
+        
+                for dim in sample.shape[-2:]:
+                    if dim % default_overall_up_factor != 0:
+                        # Forward upsample size to force interpolation output size.
+                        forward_upsample_size = True
+                        break
+        
+                if attention_mask is not None:
+                    attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+                    attention_mask = attention_mask.unsqueeze(1)
+        
+                # convert encoder_attention_mask to a bias the same way we do for attention_mask
+                if encoder_attention_mask is not None:
+                    encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+                    encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        
+                # 0. center input if necessary
+                if self.config.center_input_sample:
+                    sample = 2 * sample - 1.0
+        
+                # 1. time
+                timesteps = timestep
+                if not torch.is_tensor(timesteps):
+                    is_mps = sample.device.type == "mps"
+                    if isinstance(timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+                elif len(timesteps.shape) == 0:
+                    timesteps = timesteps[None].to(sample.device)
+        
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timesteps = timesteps.expand(sample.shape[0])
+        
+                t_emb = self.time_proj(timesteps)
+        
+                t_emb = t_emb.to(dtype=sample.dtype)
+        
+                emb = self.time_embedding(t_emb, timestep_cond)
+                aug_emb = None
+        
+                if self.class_embedding is not None:
+                    if class_labels is None:
+                        raise ValueError("class_labels should be provided when num_class_embeds > 0")
+        
+                    if self.config.class_embed_type == "timestep":
+                        class_labels = self.time_proj(class_labels)
+                        class_labels = class_labels.to(dtype=sample.dtype)
+        
+                    class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+        
+                    if self.config.class_embeddings_concat:
+                        emb = torch.cat([emb, class_emb], dim=-1)
+                    else:
+                        emb = emb + class_emb
+        
+                if self.config.addition_embed_type == "text":
+                    aug_emb = self.add_embedding(encoder_hidden_states)
+                elif self.config.addition_embed_type == "text_image":
+                    # Kandinsky 2.1 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                        )
+        
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+                    aug_emb = self.add_embedding(text_embs, image_embs)
+                elif self.config.addition_embed_type == "text_time":
+                    # SDXL - style
+                    if "text_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                        )
+                    text_embeds = added_cond_kwargs.get("text_embeds")
+                    if "time_ids" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                        )
+                    time_ids = added_cond_kwargs.get("time_ids")
+                    time_embeds = self.add_time_proj(time_ids.flatten())
+                    time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+                    add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+                    add_embeds = add_embeds.to(emb.dtype)
+                    aug_emb = self.add_embedding(add_embeds)
+                elif self.config.addition_embed_type == "image":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                        )
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    aug_emb = self.add_embedding(image_embs)
+                elif self.config.addition_embed_type == "image_hint":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                        )
+                    image_embs = added_cond_kwargs.get("image_embeds")
+                    hint = added_cond_kwargs.get("hint")
+                    aug_emb, hint = self.add_embedding(image_embs, hint)
+                    sample = torch.cat([sample, hint], dim=1)
+        
+                emb = emb + aug_emb if aug_emb is not None else emb
+        
+                if self.time_embed_act is not None:
+                    emb = self.time_embed_act(emb)
+        
+                if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+                    encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+                    # Kadinsky 2.1 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+        
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+                    # Kandinsky 2.2 - style
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+                elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+                    if "image_embeds" not in added_cond_kwargs:
+                        raise ValueError(
+                            f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                        )
+                    image_embeds = added_cond_kwargs.get("image_embeds")
+                    image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
+                    encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
+
+                #===============
+                order = model.order #timestep, start by 0
+                #===============
+                ipow = int(np.sqrt(9 + 8*order))
+                cond = order in [0, 1, 2, 3, 5, 10, 15, 25, 35]
+                if isinstance(mod, int):
+                    cond = order % mod == 0
+                elif mod == "pro":
+                    cond = ipow * ipow == (9 + 8 * order)
+                elif mod == "50ls":
+                    cond = order in [0, 1, 2, 3, 5, 10, 15, 25, 35] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls2":
+                    cond = order in [0, 10, 11, 12, 15, 20, 25, 30,35,45] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls3":
+                    cond = order in [0, 20, 25, 30,35,45,46,47,48,49] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "50ls4":
+                    cond = order in [0, 9, 13, 14, 15, 28, 29, 32, 36,45] #40 #[0,1,2,3, 5, 10, 15] #[0, 1, 2, 3, 5, 10, 15, 25, 35, 40]
+                elif mod == "100ls":
+                    cond = order > 85 or order < 10 or order % 5 == 0
+                elif mod == "75ls":
+                    cond = order > 65 or order < 10 or order % 5 == 0
+                elif mod == "s2":
+                    cond = order < 20 or order > 40 or order % 2 == 0
+
+                if cond: 
+                    # 2. pre-process
+                    sample = self.conv_in(sample)
+            
+                    # 2.5 GLIGEN position net
+                    if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+                        cross_attention_kwargs = cross_attention_kwargs.copy()
+                        gligen_args = cross_attention_kwargs.pop("gligen")
+                        cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+            
+                    # 3. down
+                    lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
+                    if USE_PEFT_BACKEND:
+                        # weight the lora layers by setting `lora_scale` for each PEFT layer
+                        scale_lora_layers(self, lora_scale)
+            
+                    is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+                    # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+                    is_adapter = down_intrablock_additional_residuals is not None
+                    # maintain backward compatibility for legacy usage, where
+                    #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+                    #       but can only use one or the other
+                    if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+                        deprecate(
+                            "T2I should not use down_block_additional_residuals",
+                            "1.3.0",
+                            "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                                   and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                                   for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                            standard_warn=False,
+                        )
+                        down_intrablock_additional_residuals = down_block_additional_residuals
+                        is_adapter = True
+            
+                    down_block_res_samples = (sample,)
+                    for downsample_block in self.down_blocks:
+                        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                            # For t2i-adapter CrossAttnDownBlock2D
+                            additional_residuals = {}
+                            if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                                additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+            
+                            sample, res_samples = downsample_block(
+                                hidden_states=sample,
+                                temb=emb,
+                                encoder_hidden_states=encoder_hidden_states,
+                                attention_mask=attention_mask,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                encoder_attention_mask=encoder_attention_mask,
+                                **additional_residuals,
+                            )
+                        else:
+                            sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
+                            if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                                sample += down_intrablock_additional_residuals.pop(0)
+            
+                        down_block_res_samples += res_samples
+            
+                    if is_controlnet:
+                        new_down_block_res_samples = ()
+            
+                        for down_block_res_sample, down_block_additional_residual in zip(
+                            down_block_res_samples, down_block_additional_residuals
+                        ):
+                            down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                            new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+            
+                        down_block_res_samples = new_down_block_res_samples
+            
+                    # 4. mid
+                    if self.mid_block is not None:
+                        if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                            sample = self.mid_block(
+                                sample,
+                                emb,
+                                encoder_hidden_states=encoder_hidden_states,
+                                attention_mask=attention_mask,
+                                cross_attention_kwargs=cross_attention_kwargs,
+                                encoder_attention_mask=encoder_attention_mask,
+                            )
+                        else:
+                            sample = self.mid_block(sample, emb)
+            
+                        # To support T2I-Adapter-XL
+                        if (
+                            is_adapter
+                            and len(down_intrablock_additional_residuals) > 0
+                            and sample.shape == down_intrablock_additional_residuals[0].shape
+                        ):
+                            sample += down_intrablock_additional_residuals.pop(0)
+            
+                    if is_controlnet:
+                        sample = sample + mid_block_additional_residual
+
+                    #----------------------save feature-------------------------
+                    # setattr(self, 'skip_feature', (tmp_sample.clone() for tmp_sample in down_block_res_samples))
+                    setattr(self, 'skip_feature', deepcopy(down_block_res_samples))
+                    setattr(self, 'toup_feature', sample.detach().clone())
+                    #-----------------------save feature------------------------
+
+
+
+                    #-------------------expand feature for parallel---------------
+                    if isinstance(timestep, list):
+                        step = len(timestep)
+                        #timesteps list, such as [981,961,941]
+                        timesteps = warpped_timestep(timestep, sample.shape[0]).to(sample.device)
+                        t_emb = self.time_proj(timesteps)
+
+                        # `Timesteps` does not contain any weights and will always return f32 tensors
+                        # but time_embedding might actually be running in fp16. so we need to cast here.
+                        # there might be better ways to encapsulate this.
+                        t_emb = t_emb.to(dtype=self.dtype)
+
+                        emb = self.time_embedding(t_emb, timestep_cond)
+                    else:
+                        step = 1 
+                    down_block_res_samples = warpped_skip_feature(down_block_res_samples, step)
+                    sample = warpped_feature(sample, step)
+                    encoder_hidden_states = warpped_text_emb(encoder_hidden_states, step)
+                    #-------------------expand feature for parallel---------------
+
+                else:
+                    step = 1
+                    down_block_res_samples = self.skip_feature
+                    sample = self.toup_feature
+
+                    #-------------------expand feature for parallel---------------
+                    down_block_res_samples = warpped_skip_feature(down_block_res_samples, step)
+                    sample = warpped_feature(sample, step)
+                    encoder_hidden_states = warpped_text_emb(encoder_hidden_states, step)
+                    #-------------------expand feature for parallel---------------
+        
+                # 5. up
+                for i, upsample_block in enumerate(self.up_blocks):
+                    is_final_block = i == len(self.up_blocks) - 1
+        
+                    res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+                    down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+        
+                    # if we have not reached the final block and need to forward the
+                    # upsample size, we do it here
+                    if not is_final_block and forward_upsample_size:
+                        upsample_size = down_block_res_samples[-1].shape[2:]
+        
+                    if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                        sample = upsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            res_hidden_states_tuple=res_samples,
+                            encoder_hidden_states=encoder_hidden_states,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                            upsample_size=upsample_size,
+                            attention_mask=attention_mask,
+                            encoder_attention_mask=encoder_attention_mask,
+                        )
+                    else:
+                        sample = upsample_block(
+                            hidden_states=sample,
+                            temb=emb,
+                            res_hidden_states_tuple=res_samples,
+                            upsample_size=upsample_size,
+                            scale=lora_scale,
+                        )
+        
+                # 6. post-process
+                if self.conv_norm_out:
+                    sample = self.conv_norm_out(sample)
+                    sample = self.conv_act(sample)
+                sample = self.conv_out(sample)
+        
+                if USE_PEFT_BACKEND:
+                    # remove `lora_scale` from each PEFT layer
+                    unscale_lora_layers(self, lora_scale)
+        
+                if not return_dict:
+                    return (sample,)
+        
+                return UNet2DConditionOutput(sample=sample)
+        return forward
+    if model.__class__.__name__ == 'UNet2DConditionModel':
+        model.forward = faster_orig_forward(model)
+
+
+# sample: torch.FloatTensor,
+# timestep: Union[torch.Tensor, float, int],
+# encoder_hidden_states: torch.Tensor,
+# class_labels: Optional[torch.Tensor] = None,
+# timestep_cond: Optional[torch.Tensor] = None,
+# attention_mask: Optional[torch.Tensor] = None,
+# cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+# added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, # ADDED
+# down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+# mid_block_additional_residual: Optional[torch.Tensor] = None,
+# down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, # ADDED
+# return_dict: bool = True,
+
+# encoder_attention_mask: Optional[torch.Tensor] = None,
+
+
+def register_original_forward(model, mod = '50ls'):
+    """Install a plain (non-accelerated) UNet forward pass on ``model``.
+
+    ``model.forward`` is replaced by a closure that runs the complete
+    down / mid / up path on every call. The patch is applied only when the
+    class name of ``model`` is exactly ``'UNet2DConditionModel'``; any other
+    object is left untouched.
+
+    Args:
+        model: Object whose ``forward`` attribute is overwritten.
+        mod: Not read anywhere in this function; kept so existing call sites
+            that pass it keep working.
+
+    Returns:
+        None. ``model`` is modified in place.
+    """
+    def original_forward(self):
+        def forward(
+            sample: torch.FloatTensor,
+            timestep: Union[torch.Tensor, float, int],
+            encoder_hidden_states: torch.Tensor,
+            class_labels: Optional[torch.Tensor] = None,
+            timestep_cond: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, # ADDED
+            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+            mid_block_additional_residual: Optional[torch.Tensor] = None,
+            down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, # ADDED
+            return_dict: bool = True,
+            # Appended last (rather than before `return_dict`) so that calls written
+            # against the previous positional order keep binding the same arguments.
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+        ) -> Union[UNet2DConditionOutput, Tuple]:
+            r"""
+            Args:
+                sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+                timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+                encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+                class_labels (`torch.Tensor`, *optional*):
+                    Forwarded to `self.get_class_embed`.
+                timestep_cond (`torch.Tensor`, *optional*):
+                    Forwarded to `self.time_embedding` together with the time embedding.
+                attention_mask (`torch.Tensor`, *optional*):
+                    Mask with 1 = keep, 0 = discard; converted below into an additive bias.
+                cross_attention_kwargs (`dict`, *optional*):
+                    A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                    `self.processor` in
+                    [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+                added_cond_kwargs (`dict`, *optional*):
+                    Forwarded to `self.get_aug_embed` and `self.process_encoder_hidden_states`.
+                down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                    Added element-wise to the collected skip tensors when a mid-block residual is also given.
+                mid_block_additional_residual (`torch.Tensor`, *optional*):
+                    Added to the mid-block output when down-block residuals are also given.
+                down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                    Consumed front-to-back with `pop(0)` inside the down blocks (and possibly the mid block).
+                return_dict (`bool`, *optional*, defaults to `True`):
+                    Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+                encoder_attention_mask (`torch.Tensor`, *optional*):
+                    Mask for `encoder_hidden_states`, converted into an additive bias the same way as `attention_mask`.
+
+            Returns:
+                [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+                returning a tuple, the first element is the sample tensor.
+            """
+            # By default samples have to be AT least a multiple of the overall upsampling factor.
+            # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+            # However, the upsampling interpolation output size can be forced to fit any upsampling size
+            # on the fly if necessary.
+            default_overall_up_factor = 2**self.num_upsamplers
+
+            # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+            forward_upsample_size = False
+            upsample_size = None
+
+            for dim in sample.shape[-2:]:
+                if dim % default_overall_up_factor != 0:
+                    # Forward upsample size to force interpolation output size.
+                    forward_upsample_size = True
+                    break
+
+            # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+            # expects mask of shape:
+            #   [batch, key_tokens]
+            # adds singleton query_tokens dimension:
+            #   [batch,                    1, key_tokens]
+            # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+            #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+            #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+            if attention_mask is not None:
+                # assume that mask is expressed as:
+                #   (1 = keep,      0 = discard)
+                # convert mask into a bias that can be added to attention scores:
+                #       (keep = +0,     discard = -10000.0)
+                attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+                attention_mask = attention_mask.unsqueeze(1)
+
+            # convert encoder_attention_mask to a bias the same way we do for attention_mask
+            # (the name is now a real parameter; previously this read raised UnboundLocalError)
+            if encoder_attention_mask is not None:
+                encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+                encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+            # 0. center input if necessary
+            if self.config.center_input_sample:
+                sample = 2 * sample - 1.0
+
+            # 1. time
+            t_emb = self.get_time_embed(sample=sample, timestep=timestep)
+            emb = self.time_embedding(t_emb, timestep_cond)
+
+            class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
+            if class_emb is not None:
+                if self.config.class_embeddings_concat:
+                    emb = torch.cat([emb, class_emb], dim=-1)
+                else:
+                    emb = emb + class_emb
+
+            aug_emb = self.get_aug_embed(
+                emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+            )
+            if self.config.addition_embed_type == "image_hint":
+                # in this mode get_aug_embed returns a pair; the hint is concatenated on the channel axis
+                aug_emb, hint = aug_emb
+                sample = torch.cat([sample, hint], dim=1)
+
+            emb = emb + aug_emb if aug_emb is not None else emb
+
+            if self.time_embed_act is not None:
+                emb = self.time_embed_act(emb)
+
+            encoder_hidden_states = self.process_encoder_hidden_states(
+                encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
+            )
+
+            # 2. pre-process
+            sample = self.conv_in(sample)
+
+            # 2.5 GLIGEN position net
+            if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
+                # copy first so the caller's dict is not modified
+                cross_attention_kwargs = cross_attention_kwargs.copy()
+                gligen_args = cross_attention_kwargs.pop("gligen")
+                cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
+
+            # 3. down
+            # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
+            # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
+            if cross_attention_kwargs is not None:
+                cross_attention_kwargs = cross_attention_kwargs.copy()
+                lora_scale = cross_attention_kwargs.pop("scale", 1.0)
+            else:
+                lora_scale = 1.0
+
+            if USE_PEFT_BACKEND:
+                # weight the lora layers by setting `lora_scale` for each PEFT layer
+                scale_lora_layers(self, lora_scale)
+
+            is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
+            # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
+            is_adapter = down_intrablock_additional_residuals is not None
+            # maintain backward compatibility for legacy usage, where
+            #       T2I-Adapter and ControlNet both use down_block_additional_residuals arg
+            #       but can only use one or the other
+            if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
+                deprecate(
+                    "T2I should not use down_block_additional_residuals",
+                    "1.3.0",
+                    "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
+                               and will be removed in diffusers 1.3.0.  `down_block_additional_residuals` should only be used \
+                               for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
+                    standard_warn=False,
+                )
+                down_intrablock_additional_residuals = down_block_additional_residuals
+                is_adapter = True
+
+            # skip connections, starting with the conv_in output
+            down_block_res_samples = (sample,)
+            for downsample_block in self.down_blocks:
+                if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                    # For t2i-adapter CrossAttnDownBlock2D
+                    additional_residuals = {}
+                    if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                        additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+
+                    sample, res_samples = downsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        encoder_hidden_states=encoder_hidden_states,
+                        attention_mask=attention_mask,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        encoder_attention_mask=encoder_attention_mask,
+                        **additional_residuals,
+                    )
+                else:
+                    sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+                    if is_adapter and len(down_intrablock_additional_residuals) > 0:
+                        sample += down_intrablock_additional_residuals.pop(0)
+
+                down_block_res_samples += res_samples
+
+            if is_controlnet:
+                new_down_block_res_samples = ()
+
+                for down_block_res_sample, down_block_additional_residual in zip(
+                    down_block_res_samples, down_block_additional_residuals
+                ):
+                    down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                    new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+
+                down_block_res_samples = new_down_block_res_samples
+
+            # 4. mid
+            if self.mid_block is not None:
+                if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
+                    sample = self.mid_block(
+                        sample,
+                        emb,
+                        encoder_hidden_states=encoder_hidden_states,
+                        attention_mask=attention_mask,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        encoder_attention_mask=encoder_attention_mask,
+                    )
+                else:
+                    sample = self.mid_block(sample, emb)
+
+                # To support T2I-Adapter-XL
+                if (
+                    is_adapter
+                    and len(down_intrablock_additional_residuals) > 0
+                    and sample.shape == down_intrablock_additional_residuals[0].shape
+                ):
+                    sample += down_intrablock_additional_residuals.pop(0)
+
+            if is_controlnet:
+                sample = sample + mid_block_additional_residual
+
+            # 5. up
+            for i, upsample_block in enumerate(self.up_blocks):
+                is_final_block = i == len(self.up_blocks) - 1
+
+                # each up block consumes as many skip tensors (from the end) as it has resnets
+                res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+                down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+                # if we have not reached the final block and need to forward the
+                # upsample size, we do it here
+                if not is_final_block and forward_upsample_size:
+                    upsample_size = down_block_res_samples[-1].shape[2:]
+
+                if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        encoder_hidden_states=encoder_hidden_states,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        upsample_size=upsample_size,
+                        attention_mask=attention_mask,
+                        encoder_attention_mask=encoder_attention_mask,
+                    )
+                else:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        upsample_size=upsample_size,
+                    )
+
+            # 6. post-process
+            if self.conv_norm_out:
+                sample = self.conv_norm_out(sample)
+                sample = self.conv_act(sample)
+            sample = self.conv_out(sample)
+
+            if USE_PEFT_BACKEND:
+                # remove `lora_scale` from each PEFT layer
+                unscale_lora_layers(self, lora_scale)
+
+            if not return_dict:
+                return (sample,)
+
+            return UNet2DConditionOutput(sample=sample)
+        return forward
+    # only patch the exact diffusers UNet class; anything else keeps its own forward
+    if model.__class__.__name__ == 'UNet2DConditionModel':
+        model.forward = original_forward(model)
+        
+
+
+def register_normal_forward(model):
+    """Swap ``model.forward`` for a straightforward full UNet forward pass.
+
+    The new ``forward`` is a closure over ``model`` that runs the time
+    embedding, every down block, the mid block and every up block on each
+    call. Nothing happens unless the class name of ``model`` is exactly
+    ``'UNet2DConditionModel'``.
+
+    Args:
+        model: Object whose ``forward`` attribute is overwritten.
+
+    Returns:
+        None. ``model`` is modified in place.
+    """
+    def normal_forward(self):
+        def forward(
+            sample: torch.FloatTensor,
+            timestep: Union[torch.Tensor, float, int],
+            encoder_hidden_states: torch.Tensor,
+            class_labels: Optional[torch.Tensor] = None,
+            timestep_cond: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+            mid_block_additional_residual: Optional[torch.Tensor] = None,
+            return_dict: bool = True,
+        ) -> Union[UNet2DConditionOutput, Tuple]:
+            """Run the full UNet and return the predicted sample.
+
+            Returns a ``UNet2DConditionOutput`` when ``return_dict`` is true,
+            otherwise a one-element tuple holding the output tensor.
+
+            Raises:
+                ValueError: if the model has a class embedding but
+                    ``class_labels`` is ``None``.
+            """
+            # The spatial size normally has to be a multiple of 2 ** (number of
+            # upsamplers); when it is not, the target size is handed to the up
+            # blocks explicitly.
+            up_factor = 2**self.num_upsamplers
+            upsample_size = None
+            needs_size_hint = not all(extent % up_factor == 0 for extent in sample.shape[-2:])
+            if needs_size_hint:
+                print("Forward upsample size to force interpolation output size.")
+
+            # Turn the keep/discard mask into an additive bias with a singleton
+            # dimension inserted at position 1.
+            if attention_mask is not None:
+                attention_mask = ((1 - attention_mask.to(sample.dtype)) * -10000.0).unsqueeze(1)
+
+            # 0. optional input centering
+            if self.config.center_input_sample:
+                sample = 2 * sample - 1.0
+
+            # 1. time embedding
+            if torch.is_tensor(timestep):
+                timesteps = timestep
+                if timesteps.ndim == 0:
+                    # 0-d tensor: add a leading axis and move it next to the sample
+                    timesteps = timesteps[None].to(sample.device)
+            else:
+                # Python scalar: wrap it in a 1-element tensor (mps gets the
+                # 32-bit dtypes, every other device the 64-bit ones).
+                on_mps = sample.device.type == "mps"
+                if isinstance(timestep, float):
+                    scalar_dtype = torch.float32 if on_mps else torch.float64
+                else:
+                    scalar_dtype = torch.int32 if on_mps else torch.int64
+                timesteps = torch.tensor([timestep], dtype=scalar_dtype, device=sample.device)
+
+            # broadcast over the batch dimension
+            timesteps = timesteps.expand(sample.shape[0])
+
+            # time_proj output is cast to the model dtype before the embedding layer
+            projected_t = self.time_proj(timesteps).to(dtype=self.dtype)
+            emb = self.time_embedding(projected_t, timestep_cond)
+
+            if self.class_embedding is not None:
+                if class_labels is None:
+                    raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+                if self.config.class_embed_type == "timestep":
+                    class_labels = self.time_proj(class_labels).to(dtype=sample.dtype)
+
+                class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+                if self.config.class_embeddings_concat:
+                    emb = torch.cat([emb, class_emb], dim=-1)
+                else:
+                    emb = emb + class_emb
+
+            if self.config.addition_embed_type == "text":
+                emb = emb + self.add_embedding(encoder_hidden_states)
+
+            if self.time_embed_act is not None:
+                emb = self.time_embed_act(emb)
+
+            if self.encoder_hid_proj is not None:
+                encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+
+            # 2. input convolution
+            sample = self.conv_in(sample)
+
+            # 3. down path; `skips` collects the tensors handed to the up path
+            skips = (sample,)
+            for down_block in self.down_blocks:
+                if getattr(down_block, "has_cross_attention", False):
+                    sample, block_skips = down_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        encoder_hidden_states=encoder_hidden_states,
+                        attention_mask=attention_mask,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                    )
+                else:
+                    sample, block_skips = down_block(hidden_states=sample, temb=emb)
+                skips += block_skips
+
+            if down_block_additional_residuals is not None:
+                # pairwise add the extra residuals onto the skip tensors
+                skips = tuple(
+                    skip + extra for skip, extra in zip(skips, down_block_additional_residuals)
+                )
+
+            # 4. mid block
+            if self.mid_block is not None:
+                sample = self.mid_block(
+                    sample,
+                    emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                )
+            if mid_block_additional_residual is not None:
+                sample = sample + mid_block_additional_residual
+
+            # 5. up path
+            final_index = len(self.up_blocks) - 1
+            for index, up_block in enumerate(self.up_blocks):
+                # each up block takes as many skip tensors (from the end) as it has resnets
+                taken = len(up_block.resnets)
+                block_skips = skips[-taken:]
+                skips = skips[:-taken]
+
+                # every block but the last gets the explicit target size when needed
+                if index != final_index and needs_size_hint:
+                    upsample_size = skips[-1].shape[2:]
+
+                block_kwargs = dict(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=block_skips,
+                    upsample_size=upsample_size,
+                )
+                if getattr(up_block, "has_cross_attention", False):
+                    block_kwargs.update(
+                        encoder_hidden_states=encoder_hidden_states,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        attention_mask=attention_mask,
+                    )
+                sample = up_block(**block_kwargs)
+
+            # 6. output head
+            if self.conv_norm_out:
+                sample = self.conv_act(self.conv_norm_out(sample))
+            sample = self.conv_out(sample)
+
+            if return_dict:
+                return UNet2DConditionOutput(sample=sample)
+            return (sample,)
+        return forward
+    # only the exact diffusers UNet class is patched
+    if model.__class__.__name__ == 'UNet2DConditionModel':
+        model.forward = normal_forward(model)
+
+def register_time(unet, t):
+    """Store ``t`` on ``unet`` under the attribute name ``order``.
+
+    The denoising loop in this file calls this with its current step index
+    before running the UNet.
+    """
+    unet.order = t
+
+def register_controlnet_pipeline(pipe):
+    def new_call(self):
+        @torch.no_grad()
+        def call(
+            prompt: Union[str, List[str]] = None,
+            image: Union[
+                torch.FloatTensor,
+                PIL.Image.Image,
+                np.ndarray,
+                List[torch.FloatTensor],
+                List[PIL.Image.Image],
+                List[np.ndarray],
+            ] = None,
+            height: Optional[int] = None,
+            width: Optional[int] = None,
+            num_inference_steps: int = 50,
+            guidance_scale: float = 7.5,
+            negative_prompt: Optional[Union[str, List[str]]] = None,
+            num_images_per_prompt: Optional[int] = 1,
+            eta: float = 0.0,
+            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+            latents: Optional[torch.FloatTensor] = None,
+            prompt_embeds: Optional[torch.FloatTensor] = None,
+            negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+            output_type: Optional[str] = "pil",
+            return_dict: bool = True,
+            callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+            callback_steps: int = 1,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+            guess_mode: bool = False,
+        ):
+            # 1. Check inputs. Raise error if not correct
+            self.check_inputs(
+                prompt,
+                image,
+                callback_steps,
+                negative_prompt,
+                prompt_embeds,
+                negative_prompt_embeds,
+                controlnet_conditioning_scale,
+            )
+
+            # 2. Define call parameters
+            if prompt is not None and isinstance(prompt, str):
+                batch_size = 1
+            elif prompt is not None and isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                batch_size = prompt_embeds.shape[0]
+
+            device = self._execution_device
+            # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+            # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+            # corresponds to doing no classifier free guidance.
+            do_classifier_free_guidance = guidance_scale > 1.0
+
+            controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+            if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
+                controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)
+
+            global_pool_conditions = (
+                controlnet.config.global_pool_conditions
+                if isinstance(controlnet, ControlNetModel)
+                else controlnet.nets[0].config.global_pool_conditions
+            )
+            guess_mode = guess_mode or global_pool_conditions
+
+            # 3. Encode input prompt
+            text_encoder_lora_scale = (
+                cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
+            )
+            prompt_embeds = self._encode_prompt(
+                prompt,
+                device,
+                num_images_per_prompt,
+                do_classifier_free_guidance,
+                negative_prompt,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                lora_scale=text_encoder_lora_scale,
+            )
+
+            # 4. Prepare image
+            if isinstance(controlnet, ControlNetModel):
+                image = self.prepare_image(
+                    image=image,
+                    width=width,
+                    height=height,
+                    batch_size=batch_size * num_images_per_prompt,
+                    num_images_per_prompt=num_images_per_prompt,
+                    device=device,
+                    dtype=controlnet.dtype,
+                    do_classifier_free_guidance=do_classifier_free_guidance,
+                    guess_mode=guess_mode,
+                )
+                height, width = image.shape[-2:]
+            elif isinstance(controlnet, MultiControlNetModel):
+                images = []
+
+                for image_ in image:
+                    image_ = self.prepare_image(
+                        image=image_,
+                        width=width,
+                        height=height,
+                        batch_size=batch_size * num_images_per_prompt,
+                        num_images_per_prompt=num_images_per_prompt,
+                        device=device,
+                        dtype=controlnet.dtype,
+                        do_classifier_free_guidance=do_classifier_free_guidance,
+                        guess_mode=guess_mode,
+                    )
+
+                    images.append(image_)
+
+                image = images
+                height, width = image[0].shape[-2:]
+            else:
+                assert False
+
+            # 5. Prepare timesteps
+            self.scheduler.set_timesteps(num_inference_steps, device=device)
+            timesteps = self.scheduler.timesteps
+
+            # 6. Prepare latent variables
+            num_channels_latents = self.unet.config.in_channels
+            latents = self.prepare_latents(
+                batch_size * num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                device,
+                generator,
+                latents,
+            )
+            self.init_latent = latents.detach().clone()
+            # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+            # 8. Denoising loop
+            #-------------------------------------------------------------
+            all_steps = len(self.scheduler.timesteps)
+            curr_span = 1
+            curr_step = 0
+
+            idx = 1
+            keytime = [0,1,2,3,5,10,15,25,35]
+            keytime.append(all_steps)
+            while curr_step<all_steps:
+                register_time(self.unet, curr_step)
+
+                if curr_span>0:
+                    time_ls = []
+                    for i in range(curr_step, curr_step+curr_span):
+                        if i<all_steps:
+                            time_ls.append(self.scheduler.timesteps[i])
+                        else:
+                            break
+
+                    ##--------------------------------
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, time_ls[0])
+                    
+                    if curr_step in [0,1,2,3,5,10,15,25,35]:
+                        # controlnet(s) inference
+                        control_model_input = latent_model_input
+                        controlnet_prompt_embeds = prompt_embeds
+
+                        down_block_res_samples, mid_block_res_sample = self.controlnet(
+                            control_model_input,
+                            time_ls[0],
+                            encoder_hidden_states=controlnet_prompt_embeds,
+                            controlnet_cond=image,
+                            conditioning_scale=controlnet_conditioning_scale,
+                            guess_mode=guess_mode,
+                            return_dict=False,
+                        )
+
+
+                        #----------------------save controlnet feature-------------------------
+                        #useless, should delete
+                        # setattr(self, 'downres_samples', deepcopy(down_block_res_samples))
+                        # setattr(self, 'midres_sample', mid_block_res_sample.detach().clone())
+                        #-----------------------save controlnet feature------------------------
+                    else:
+                        down_block_res_samples = None #self.downres_samples
+                        mid_block_res_sample = None #self.midres_sample
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        time_ls,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                        down_block_additional_residuals=down_block_res_samples,
+                        mid_block_additional_residual=mid_block_res_sample,
+                        return_dict=False,
+                    )[0]
+
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+
+                    if isinstance(time_ls, list):
+                        step_span = len(time_ls)
+                        bs = noise_pred.shape[0]
+                        bs_perstep = bs//step_span
+
+                        denoised_latent = latents
+                        for i, timestep in enumerate(time_ls):
+                            if timestep/1000 < 0.5:
+                                denoised_latent = denoised_latent + 0.003*self.init_latent                       
+                            curr_noise = noise_pred[i*bs_perstep:(i+1)*bs_perstep]
+                            denoised_latent = self.scheduler.step(curr_noise, timestep, denoised_latent, **extra_step_kwargs, return_dict=False)[0]
+                        
+                        latents = denoised_latent
+                    ##----------------------------------------
+                curr_step += curr_span
+                idx += 1
+                if curr_step<all_steps:
+                    curr_span = keytime[idx] - keytime[idx-1]
+
+            #-------------------------------------------------------------
+            # If we do sequential model offloading, let's offload unet and controlnet
+            # manually for max memory savings
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.unet.to("cpu")
+                self.controlnet.to("cpu")
+                torch.cuda.empty_cache()
+
+            if not output_type == "latent":
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+            else:
+                image = latents
+                has_nsfw_concept = None
+
+            if has_nsfw_concept is None:
+                do_denormalize = [True] * image.shape[0]
+            else:
+                do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+
+            # Offload last model to CPU
+            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+                self.final_offload_hook.offload()
+
+            if not return_dict:
+                return (image, has_nsfw_concept)
+
+            return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        return call
+    pipe.call = new_call(pipe)
+
+@torch.no_grad()
+def multistep_pre(self, noise_pred, t, x):
+    step_span = len(t)
+    bs = noise_pred.shape[0]
+    bs_perstep = bs//step_span
+
+    denoised_latent = x
+    for i, timestep in enumerate(t):
+        curr_noise = noise_pred[i*bs_perstep:(i+1)*bs_perstep]
+        denoised_latent = self.scheduler.step(curr_noise, timestep, denoised_latent)['prev_sample']
+    return denoised_latent
+
+def register_t2v(model):
+    def new_back(self):
+        def backward_loop(
+        latents,
+        timesteps,
+        prompt_embeds,
+        guidance_scale,
+        callback,
+        callback_steps,
+        num_warmup_steps,
+        extra_step_kwargs,
+        cross_attention_kwargs=None,):
+            do_classifier_free_guidance = guidance_scale > 1.0
+            num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order
+            import time
+            if num_steps<10:
+                with self.progress_bar(total=num_steps) as progress_bar:
+                    for i, t in enumerate(timesteps):
+                        setattr(self.unet, 'order', i)
+                        # expand the latents if we are doing classifier free guidance
+                        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                        # predict the noise residual
+                        noise_pred = self.unet(
+                            latent_model_input,
+                            t,
+                            encoder_hidden_states=prompt_embeds,
+                            cross_attention_kwargs=cross_attention_kwargs,
+                        ).sample
+
+                        # perform guidance
+                        if do_classifier_free_guidance:
+                            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                        # compute the previous noisy sample x_t -> x_t-1
+                        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                        # call the callback, if provided
+                        if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                            progress_bar.update()
+                            if callback is not None and i % callback_steps == 0:
+                                step_idx = i // getattr(self.scheduler, "order", 1)
+                                callback(step_idx, t, latents)
+
+            else:
+                all_timesteps = len(timesteps)
+                curr_step = 0
+   
+                while curr_step<all_timesteps:
+                    register_time(self.unet, curr_step)
+
+                    time_ls = []
+                    time_ls.append(timesteps[curr_step])
+                    curr_step += 1
+                    cond = curr_step in [0,1,2,3,5,10,15,25,35]
+                    
+                    while (not cond) and (curr_step<all_timesteps):
+                        time_ls.append(timesteps[curr_step])
+                        curr_step += 1
+                        cond = curr_step in [0,1,2,3,5,10,15,25,35]
+                    
+                    latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                    # predict the noise residual
+                    noise_pred = self.unet(
+                        latent_model_input,
+                        time_ls,
+                        encoder_hidden_states=prompt_embeds,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                    ).sample
+
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = multistep_pre(self, noise_pred, time_ls, latents)
+             
+            return latents.clone().detach()
+        return backward_loop
+    model.backward_loop = new_back(model)
+