xiaoanyu123 committed on
Commit
8f3b606
·
verified ·
1 Parent(s): f01352f

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__init__.py +41 -0
  2. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/__init__.cpython-310.pyc +0 -0
  3. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/adaptive_projected_guidance.cpython-310.pyc +0 -0
  4. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/auto_guidance.cpython-310.pyc +0 -0
  5. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/classifier_free_guidance.cpython-310.pyc +0 -0
  6. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/classifier_free_zero_star_guidance.cpython-310.pyc +0 -0
  7. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/frequency_decoupled_guidance.cpython-310.pyc +0 -0
  8. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/guider_utils.cpython-310.pyc +0 -0
  9. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/perturbed_attention_guidance.cpython-310.pyc +0 -0
  10. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/skip_layer_guidance.cpython-310.pyc +0 -0
  11. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/smoothed_energy_guidance.cpython-310.pyc +0 -0
  12. pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/tangential_classifier_free_guidance.cpython-310.pyc +0 -0
  13. pythonProject/.venv/Lib/site-packages/diffusers/guiders/guider_utils.py +315 -0
  14. pythonProject/.venv/Lib/site-packages/diffusers/guiders/perturbed_attention_guidance.py +271 -0
  15. pythonProject/.venv/Lib/site-packages/diffusers/guiders/skip_layer_guidance.py +262 -0
  16. pythonProject/.venv/Lib/site-packages/diffusers/guiders/smoothed_energy_guidance.py +251 -0
  17. pythonProject/.venv/Lib/site-packages/diffusers/guiders/tangential_classifier_free_guidance.py +143 -0
  18. pythonProject/.venv/Lib/site-packages/diffusers/hooks/faster_cache.py +654 -0
  19. pythonProject/.venv/Lib/site-packages/diffusers/hooks/first_block_cache.py +259 -0
  20. pythonProject/.venv/Lib/site-packages/diffusers/hooks/group_offloading.py +898 -0
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__init__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union

from ..utils import is_torch_available


# The guider implementations depend on torch, so they (and the `GuiderType`
# union alias) are only imported when torch is available.
if is_torch_available():
    from .adaptive_projected_guidance import AdaptiveProjectedGuidance
    from .auto_guidance import AutoGuidance
    from .classifier_free_guidance import ClassifierFreeGuidance
    from .classifier_free_zero_star_guidance import ClassifierFreeZeroStarGuidance
    from .frequency_decoupled_guidance import FrequencyDecoupledGuidance
    from .perturbed_attention_guidance import PerturbedAttentionGuidance
    from .skip_layer_guidance import SkipLayerGuidance
    from .smoothed_energy_guidance import SmoothedEnergyGuidance
    from .tangential_classifier_free_guidance import TangentialClassifierFreeGuidance

    # Union of all concrete guider classes, for use in type annotations.
    GuiderType = Union[
        AdaptiveProjectedGuidance,
        AutoGuidance,
        ClassifierFreeGuidance,
        ClassifierFreeZeroStarGuidance,
        FrequencyDecoupledGuidance,
        PerturbedAttentionGuidance,
        SkipLayerGuidance,
        SmoothedEnergyGuidance,
        TangentialClassifierFreeGuidance,
    ]
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/adaptive_projected_guidance.cpython-310.pyc ADDED
Binary file (6.65 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/auto_guidance.cpython-310.pyc ADDED
Binary file (7.43 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/classifier_free_guidance.cpython-310.pyc ADDED
Binary file (5.93 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/classifier_free_zero_star_guidance.cpython-310.pyc ADDED
Binary file (5.81 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/frequency_decoupled_guidance.cpython-310.pyc ADDED
Binary file (12.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/guider_utils.cpython-310.pyc ADDED
Binary file (14.9 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/perturbed_attention_guidance.cpython-310.pyc ADDED
Binary file (9.77 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/skip_layer_guidance.cpython-310.pyc ADDED
Binary file (10.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/smoothed_energy_guidance.cpython-310.pyc ADDED
Binary file (9.95 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/__pycache__/tangential_classifier_free_guidance.cpython-310.pyc ADDED
Binary file (5.16 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/guiders/guider_utils.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ from huggingface_hub.utils import validate_hf_hub_args
20
+ from typing_extensions import Self
21
+
22
+ from ..configuration_utils import ConfigMixin
23
+ from ..utils import BaseOutput, PushToHubMixin, get_logger
24
+
25
+
26
+ if TYPE_CHECKING:
27
+ from ..modular_pipelines.modular_pipeline import BlockState
28
+
29
+
30
+ GUIDER_CONFIG_NAME = "guider_config.json"
31
+
32
+
33
+ logger = get_logger(__name__) # pylint: disable=invalid-name
34
+
35
+
36
class BaseGuidance(ConfigMixin, PushToHubMixin):
    r"""
    Base class providing the skeleton for implementing guidance techniques.

    Subclasses must set `_input_predictions` to the list of prediction names the technique
    consumes (e.g. `["pred_cond", "pred_uncond"]`) and implement `prepare_inputs`, `forward`,
    `is_conditional`, and `num_conditions`.

    Args:
        start (`float`, defaults to `0.0`):
            Fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            Fraction of the total number of denoising steps after which guidance stops.
    """

    # File name used by ConfigMixin for (de)serializing the guider configuration.
    config_name = GUIDER_CONFIG_NAME
    # Must be overridden by subclasses with a list of required prediction names.
    _input_predictions = None
    # Attribute name used to tag each prepared batch with its prediction identifier.
    _identifier_key = "__guidance_identifier__"

    def __init__(self, start: float = 0.0, stop: float = 1.0):
        self._start = start
        self._stop = stop
        # Per-step denoising state; refreshed via `set_state` on every step.
        self._step: Optional[int] = None
        self._num_inference_steps: Optional[int] = None
        self._timestep: Optional[torch.LongTensor] = None
        # Number of `prepare_models` calls made for the current step.
        self._count_prepared = 0
        # Mapping of output field name -> source attribute name(s); see `set_input_fields`.
        self._input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
        self._enabled = True

        if not (0.0 <= start < 1.0):
            raise ValueError(f"Expected `start` to be between 0.0 and 1.0, but got {start}.")
        if not (start <= stop <= 1.0):
            raise ValueError(f"Expected `stop` to be between {start} and 1.0, but got {stop}.")

        if self._input_predictions is None or not isinstance(self._input_predictions, list):
            raise ValueError(
                "`_input_predictions` must be a list of required prediction names for the guidance technique."
            )

    def disable(self):
        # Turn guidance off; subclasses are expected to consult `self._enabled`
        # in their enablement checks.
        self._enabled = False

    def enable(self):
        self._enabled = True

    def set_state(self, step: int, num_inference_steps: int, timestep: torch.LongTensor) -> None:
        # Refresh per-step state at the beginning of each denoising step and reset the
        # prepare counter so conditional/unconditional tracking starts over.
        self._step = step
        self._num_inference_steps = num_inference_steps
        self._timestep = timestep
        self._count_prepared = 0

    def set_input_fields(self, **kwargs: Dict[str, Union[str, Tuple[str, str]]]) -> None:
        """
        Set the input fields for the guidance technique. The input fields are used to specify the names of the returned
        attributes containing the prepared data after `prepare_inputs` is called. The prepared data is obtained from
        the values of the provided keyword arguments to this method.

        Args:
            **kwargs (`Dict[str, Union[str, Tuple[str, str]]]`):
                A dictionary where the keys are the names of the fields that will be used to store the data once it is
                prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2, which is used
                to look up the required data provided for preparation.

                If a string is provided, it will be used as the conditional data (or unconditional if used with a
                guidance method that requires it). If a tuple of length 2 is provided, the first element must be the
                conditional data identifier and the second element must be the unconditional data identifier or None.

        Example:
        ```
        data = {"prompt_embeds": <some tensor>, "negative_prompt_embeds": <some tensor>, "latents": <some tensor>}

        BaseGuidance.set_input_fields(
            latents="latents",
            prompt_embeds=("prompt_embeds", "negative_prompt_embeds"),
        )
        ```
        """
        # Validate each mapping before storing; only strings and (str, str) tuples are accepted.
        for key, value in kwargs.items():
            is_string = isinstance(value, str)
            is_tuple_of_str_with_len_2 = (
                isinstance(value, tuple) and len(value) == 2 and all(isinstance(v, str) for v in value)
            )
            if not (is_string or is_tuple_of_str_with_len_2):
                raise ValueError(
                    f"Expected `set_input_fields` to be called with a string or a tuple of string with length 2, but got {type(value)} for key {key}."
                )
        self._input_fields = kwargs

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        """
        Prepares the models for the guidance technique on a given batch of data. This method should be overridden in
        subclasses to implement specific model preparation logic.
        """
        self._count_prepared += 1

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        """
        Cleans up the models for the guidance technique after a given batch of data. This method should be overridden
        in subclasses to implement specific model cleanup logic. It is useful for removing any hooks or other stateful
        modifications made during `prepare_models`.
        """
        pass

    def prepare_inputs(self, data: "BlockState") -> List["BlockState"]:
        raise NotImplementedError("BaseGuidance::prepare_inputs must be implemented in subclasses.")

    def __call__(self, data: List["BlockState"]) -> Any:
        # Each prepared batch must carry a `noise_pred` plus the identifier assigned by
        # `_prepare_batch`; predictions are routed to `forward` keyed by that identifier.
        if not all(hasattr(d, "noise_pred") for d in data):
            raise ValueError("Expected all data to have `noise_pred` attribute.")
        if len(data) != self.num_conditions:
            raise ValueError(
                f"Expected {self.num_conditions} data items, but got {len(data)}. Please check the input data."
            )
        forward_inputs = {getattr(d, self._identifier_key): d.noise_pred for d in data}
        return self.forward(**forward_inputs)

    def forward(self, *args, **kwargs) -> Any:
        raise NotImplementedError("BaseGuidance::forward must be implemented in subclasses.")

    @property
    def is_conditional(self) -> bool:
        raise NotImplementedError("BaseGuidance::is_conditional must be implemented in subclasses.")

    @property
    def is_unconditional(self) -> bool:
        # Convenience inverse of `is_conditional`.
        return not self.is_conditional

    @property
    def num_conditions(self) -> int:
        raise NotImplementedError("BaseGuidance::num_conditions must be implemented in subclasses.")

    @classmethod
    def _prepare_batch(
        cls,
        input_fields: Dict[str, Union[str, Tuple[str, str]]],
        data: "BlockState",
        tuple_index: int,
        identifier: str,
    ) -> "BlockState":
        """
        Prepares a batch of data for the guidance technique. This method is used in the `prepare_inputs` method of the
        `BaseGuidance` class. It prepares the batch based on the provided tuple index.

        Args:
            input_fields (`Dict[str, Union[str, Tuple[str, str]]]`):
                A dictionary where the keys are the names of the fields that will be used to store the data once it is
                prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2, which is used
                to look up the required data provided for preparation. If a string is provided, it will be used as the
                conditional data (or unconditional if used with a guidance method that requires it). If a tuple of
                length 2 is provided, the first element must be the conditional data identifier and the second element
                must be the unconditional data identifier or None.
            data (`BlockState`):
                The input data to be prepared.
            tuple_index (`int`):
                The index to use when accessing input fields that are tuples.
            identifier (`str`):
                The prediction name stored under `cls._identifier_key` on the returned batch, used by
                `__call__` to route predictions to `forward`.

        Returns:
            `BlockState`: The prepared batch of data.
        """
        # Imported lazily to avoid an import cycle with modular_pipeline.
        from ..modular_pipelines.modular_pipeline import BlockState

        if input_fields is None:
            raise ValueError(
                "Input fields cannot be None. Please pass `input_fields` to `prepare_inputs` or call `set_input_fields` before preparing inputs."
            )
        data_batch = {}
        for key, value in input_fields.items():
            try:
                if isinstance(value, str):
                    data_batch[key] = getattr(data, value)
                elif isinstance(value, tuple):
                    data_batch[key] = getattr(data, value[tuple_index])
                else:
                    # We've already checked that value is a string or a tuple of strings with length 2
                    pass
            except AttributeError:
                # Missing attributes are skipped on purpose: not every batch provides
                # every field (e.g. no unconditional embeddings when CFG is disabled).
                logger.debug(f"`data` does not have attribute(s) {value}, skipping.")
        data_batch[cls._identifier_key] = identifier
        return BlockState(**data_batch)

    @classmethod
    @validate_hf_hub_args
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
        subfolder: Optional[str] = None,
        return_unused_kwargs: bool = False,
        **kwargs,
    ) -> Self:
        r"""
        Instantiate a guider from a pre-defined JSON configuration file in a local directory or Hub repository.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                      the Hub.
                    - A path to a *directory* (for example `./my_model_directory`) containing the guider configuration
                      saved with [`~BaseGuidance.save_pretrained`].
            subfolder (`str`, *optional*):
                The subfolder location of a model file within a larger model repository on the Hub or locally.
            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                Whether kwargs that are not consumed by the Python class should be returned or not.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.

        <Tip>

        To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with `hf
        auth login`. You can also activate the special
        ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
        firewalled environment.

        </Tip>

        """
        # NOTE(review): `commit_hash` is requested but not used afterwards; only the parsed
        # config and leftover kwargs feed `from_config`.
        config, kwargs, commit_hash = cls.load_config(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            subfolder=subfolder,
            return_unused_kwargs=True,
            return_commit_hash=True,
            **kwargs,
        )
        return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save a guider configuration object to a directory so that it can be reloaded using the
        [`~BaseGuidance.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
285
+
286
+
287
class GuiderOutput(BaseOutput):
    r"""Output of a guider's `forward` call."""

    # Final guided noise prediction.
    pred: torch.Tensor
    # Raw conditional / unconditional predictions (may be None when not applicable).
    pred_cond: Optional[torch.Tensor]
    pred_uncond: Optional[torch.Tensor]
291
+
292
+
293
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescale `noise_cfg` toward the per-sample standard deviation of `noise_pred_text` to fix the
    overexposure introduced by classifier-free guidance. Implements Section 3.4 of [Common Diffusion
    Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Interpolation factor between the raw and the std-rescaled prediction.

    Returns:
        `torch.Tensor`: The rescaled noise prediction tensor.
    """
    # Per-sample standard deviation, computed over every non-batch dimension.
    text_std = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    cfg_std = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # Bring the guided prediction back to the text prediction's scale (fixes overexposure).
    rescaled = noise_cfg * (text_std / cfg_std)
    # Blend with the unrescaled result to avoid "plain looking" images.
    return guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_cfg
pythonProject/.venv/Lib/site-packages/diffusers/guiders/perturbed_attention_guidance.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from ..configuration_utils import register_to_config
21
+ from ..hooks import HookRegistry, LayerSkipConfig
22
+ from ..hooks.layer_skip import _apply_layer_skip_hook
23
+ from ..utils import get_logger
24
+ from .guider_utils import BaseGuidance, GuiderOutput, rescale_noise_cfg
25
+
26
+
27
+ if TYPE_CHECKING:
28
+ from ..modular_pipelines.modular_pipeline import BlockState
29
+
30
+
31
+ logger = get_logger(__name__) # pylint: disable=invalid-name
32
+
33
+
34
class PerturbedAttentionGuidance(BaseGuidance):
    """
    Perturbed Attention Guidance (PAG): https://huggingface.co/papers/2403.17377

    The intuition behind PAG can be thought of as moving the CFG predicted distribution estimates further away from
    worse versions of the conditional distribution estimates. PAG was one of the first techniques to introduce the idea
    of using a worse version of the trained model for better guiding itself in the denoising process. It perturbs the
    attention scores of the latent stream by replacing the score matrix with an identity matrix for selectively chosen
    layers.

    Additional reading:
    - [Guiding a Diffusion Model with a Bad Version of Itself](https://huggingface.co/papers/2406.02507)

    PAG is implemented with similar implementation to SkipLayerGuidance due to overlap in the configuration parameters
    and implementation details.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        perturbed_guidance_scale (`float`, defaults to `2.8`):
            The scale parameter for perturbed attention guidance.
        perturbed_guidance_start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which perturbed attention guidance starts.
        perturbed_guidance_stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which perturbed attention guidance stops.
        perturbed_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply perturbed attention guidance to. Can be a single integer or a list of integers.
            If not provided, `perturbed_guidance_config` must be provided.
        perturbed_guidance_config (`LayerSkipConfig` or `List[LayerSkipConfig]`, *optional*):
            The configuration for the perturbed attention guidance. Can be a single `LayerSkipConfig` or a list of
            `LayerSkipConfig`. If not provided, `perturbed_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    # NOTE: The current implementation does not account for joint latent conditioning (text + image/video tokens in
    # the same latent stream). It assumes the entire latent is a single stream of visual tokens. It would be very
    # complex to support joint latent conditioning in a model-agnostic manner without specializing the implementation
    # for each model architecture.

    # Prediction names; their order also defines the prepare order: cond, uncond, cond_skip.
    _input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]

    @register_to_config
    def __init__(
        self,
        guidance_scale: float = 7.5,
        perturbed_guidance_scale: float = 2.8,
        perturbed_guidance_start: float = 0.01,
        perturbed_guidance_stop: float = 0.2,
        perturbed_guidance_layers: Optional[Union[int, List[int]]] = None,
        perturbed_guidance_config: Union[LayerSkipConfig, List[LayerSkipConfig], Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        # Internal attribute names mirror SkipLayerGuidance, from which several
        # methods below are copied (see "Copied from" markers).
        self.skip_layer_guidance_scale = perturbed_guidance_scale
        self.skip_layer_guidance_start = perturbed_guidance_start
        self.skip_layer_guidance_stop = perturbed_guidance_stop
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if perturbed_guidance_config is None:
            if perturbed_guidance_layers is None:
                raise ValueError(
                    "`perturbed_guidance_layers` must be provided if `perturbed_guidance_config` is not specified."
                )
            # Build the canonical PAG config: keep attention but replace its score matrix.
            perturbed_guidance_config = LayerSkipConfig(
                indices=perturbed_guidance_layers,
                fqn="auto",
                skip_attention=False,
                skip_attention_scores=True,
                skip_ff=False,
            )
        else:
            # `layers` and `config` are mutually exclusive ways of specifying the target layers.
            if perturbed_guidance_layers is not None:
                raise ValueError(
                    "`perturbed_guidance_layers` should not be provided if `perturbed_guidance_config` is specified."
                )

        # Normalize the accepted input forms (dict, single config, list of configs/dicts)
        # into a list of LayerSkipConfig instances.
        if isinstance(perturbed_guidance_config, dict):
            perturbed_guidance_config = LayerSkipConfig.from_dict(perturbed_guidance_config)

        if isinstance(perturbed_guidance_config, LayerSkipConfig):
            perturbed_guidance_config = [perturbed_guidance_config]

        if not isinstance(perturbed_guidance_config, list):
            raise ValueError(
                "`perturbed_guidance_config` must be a `LayerSkipConfig`, a list of `LayerSkipConfig`, or a dict that can be converted to a `LayerSkipConfig`."
            )
        elif isinstance(next(iter(perturbed_guidance_config), None), dict):
            # List of dicts: convert each entry. (Only the first element is inspected to
            # decide; a mixed list is assumed homogeneous.)
            perturbed_guidance_config = [LayerSkipConfig.from_dict(config) for config in perturbed_guidance_config]

        # PAG requires exactly skip_attention_scores=True; coerce any deviating config.
        for config in perturbed_guidance_config:
            if config.skip_attention or not config.skip_attention_scores or config.skip_ff:
                logger.warning(
                    "Perturbed Attention Guidance is designed to perturb attention scores, so `skip_attention` should be False, `skip_attention_scores` should be True, and `skip_ff` should be False. "
                    "Please check your configuration. Modifying the config to match the expected values."
                )
                config.skip_attention = False
                config.skip_attention_scores = True
                config.skip_ff = False

        self.skip_layer_config = perturbed_guidance_config
        self._skip_layer_hook_names = [f"SkipLayerGuidance_{i}" for i in range(len(self.skip_layer_config))]

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.prepare_models
    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        # The layer-skip hooks are only installed for the *second* conditional pass
        # (`_count_prepared > 1`), i.e. the perturbed forward pass.
        self._count_prepared += 1
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            for name, config in zip(self._skip_layer_hook_names, self.skip_layer_config):
                _apply_layer_skip_hook(denoiser, config, name=name)

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.cleanup_models
    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            registry = HookRegistry.check_if_exists_or_initialize(denoiser)
            # Remove the hooks after inference
            for hook_name in self._skip_layer_hook_names:
                registry.remove_hook(hook_name, recurse=True)

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.prepare_inputs
    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        # Decide which predictions (and hence which conditional/unconditional tuple slots)
        # are needed for the current step, depending on which guidance modes are active.
        if self.num_conditions == 1:
            tuple_indices = [0]
            input_predictions = ["pred_cond"]
        elif self.num_conditions == 2:
            tuple_indices = [0, 1]
            input_predictions = (
                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_skip"]
            )
        else:
            # Both CFG and SLG active: cond, uncond, and a second cond batch for the skip pass.
            tuple_indices = [0, 1, 0]
            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.forward
    def forward(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor] = None,
        pred_cond_skip: Optional[torch.Tensor] = None,
    ) -> GuiderOutput:
        pred = None

        if not self._is_cfg_enabled() and not self._is_slg_enabled():
            # Neither guidance active: pass the conditional prediction through.
            pred = pred_cond
        elif not self._is_cfg_enabled():
            # SLG only: shift away from the perturbed (skip) prediction.
            shift = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_cond_skip
            pred = pred + self.skip_layer_guidance_scale * shift
        elif not self._is_slg_enabled():
            # CFG only: standard classifier-free guidance.
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift
        else:
            # Both: combine the CFG shift and the perturbed-attention shift.
            shift = pred_cond - pred_uncond
            shift_skip = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift + self.skip_layer_guidance_scale * shift_skip

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)

    @property
    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.is_conditional
    def is_conditional(self) -> bool:
        # Prepare order is cond (1), uncond (2), cond_skip (3); counts 1 and 3 are conditional.
        return self._count_prepared == 1 or self._count_prepared == 3

    @property
    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance.num_conditions
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        if self._is_slg_enabled():
            num_conditions += 1
        return num_conditions

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance._is_cfg_enabled
    def _is_cfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        # CFG degenerates to a no-op at scale 0.0 (original formulation) or 1.0 (diffusers).
        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close

    # Copied from diffusers.guiders.skip_layer_guidance.SkipLayerGuidance._is_slg_enabled
    def _is_slg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self.skip_layer_guidance_start * self._num_inference_steps)
            skip_stop_step = int(self.skip_layer_guidance_stop * self._num_inference_steps)
            # NOTE(review): the start bound is exclusive here, unlike `_is_cfg_enabled`
            # which uses `<=` — kept as-is from the copied SkipLayerGuidance source.
            is_within_range = skip_start_step < self._step < skip_stop_step

        is_zero = math.isclose(self.skip_layer_guidance_scale, 0.0)

        return is_within_range and not is_zero
pythonProject/.venv/Lib/site-packages/diffusers/guiders/skip_layer_guidance.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from ..configuration_utils import register_to_config
21
+ from ..hooks import HookRegistry, LayerSkipConfig
22
+ from ..hooks.layer_skip import _apply_layer_skip_hook
23
+ from .guider_utils import BaseGuidance, GuiderOutput, rescale_noise_cfg
24
+
25
+
26
+ if TYPE_CHECKING:
27
+ from ..modular_pipelines.modular_pipeline import BlockState
28
+
29
+
30
class SkipLayerGuidance(BaseGuidance):
    """
    Skip Layer Guidance (SLG): https://github.com/Stability-AI/sd3.5

    Spatio-Temporal Guidance (STG): https://huggingface.co/papers/2411.18664

    SLG was introduced by StabilityAI for improving structure and anatomy coherence in generated images. It works by
    skipping the forward pass of specified transformer blocks during the denoising process on an additional conditional
    batch of data, apart from the conditional and unconditional batches already used in CFG
    ([~guiders.classifier_free_guidance.ClassifierFreeGuidance]), and then scaling and shifting the CFG predictions
    based on the difference between conditional without skipping and conditional with skipping predictions.

    The intuition behind SLG can be thought of as moving the CFG predicted distribution estimates further away from
    worse versions of the conditional distribution estimates (because skipping layers is equivalent to using a worse
    version of the model for the conditional prediction).

    STG is an improvement and follow-up work combining ideas from SLG, PAG and similar techniques for improving
    generation quality in video diffusion models.

    Additional reading:
    - [Guiding a Diffusion Model with a Bad Version of Itself](https://huggingface.co/papers/2406.02507)

    The values for `skip_layer_guidance_scale`, `skip_layer_guidance_start`, and `skip_layer_guidance_stop` are
    defaulted to the recommendations by StabilityAI for Stable Diffusion 3.5 Medium.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        skip_layer_guidance_scale (`float`, defaults to `2.8`):
            The scale parameter for skip layer guidance. Anatomy and structure coherence may improve with higher
            values, but it may also lead to overexposure and saturation.
        skip_layer_guidance_start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which skip layer guidance starts.
        skip_layer_guidance_stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which skip layer guidance stops.
        skip_layer_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply skip layer guidance to. Can be a single integer or a list of integers. If not
            provided, `skip_layer_config` must be provided. The recommended values are `[7, 8, 9]` for Stable Diffusion
            3.5 Medium.
        skip_layer_config (`LayerSkipConfig` or `List[LayerSkipConfig]`, *optional*):
            The configuration for the skip layer guidance. Can be a single `LayerSkipConfig` or a list of
            `LayerSkipConfig`. If not provided, `skip_layer_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    # Fixed ordering of the model outputs this guider consumes; prepare_inputs
    # selects a subset of these names depending on which branches are enabled.
    _input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]

    @register_to_config
    def __init__(
        self,
        guidance_scale: float = 7.5,
        skip_layer_guidance_scale: float = 2.8,
        skip_layer_guidance_start: float = 0.01,
        skip_layer_guidance_stop: float = 0.2,
        skip_layer_guidance_layers: Optional[Union[int, List[int]]] = None,
        skip_layer_config: Union[LayerSkipConfig, List[LayerSkipConfig], Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.skip_layer_guidance_scale = skip_layer_guidance_scale
        self.skip_layer_guidance_start = skip_layer_guidance_start
        self.skip_layer_guidance_stop = skip_layer_guidance_stop
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if not (0.0 <= skip_layer_guidance_start < 1.0):
            raise ValueError(
                f"Expected `skip_layer_guidance_start` to be between 0.0 and 1.0, but got {skip_layer_guidance_start}."
            )
        if not (skip_layer_guidance_start <= skip_layer_guidance_stop <= 1.0):
            raise ValueError(
                f"Expected `skip_layer_guidance_stop` to be between 0.0 and 1.0, but got {skip_layer_guidance_stop}."
            )

        # Exactly one of `skip_layer_guidance_layers` / `skip_layer_config` must be given;
        # layer indices are converted into LayerSkipConfig objects below.
        if skip_layer_guidance_layers is None and skip_layer_config is None:
            raise ValueError(
                "Either `skip_layer_guidance_layers` or `skip_layer_config` must be provided to enable Skip Layer Guidance."
            )
        if skip_layer_guidance_layers is not None and skip_layer_config is not None:
            raise ValueError("Only one of `skip_layer_guidance_layers` or `skip_layer_config` can be provided.")

        if skip_layer_guidance_layers is not None:
            if isinstance(skip_layer_guidance_layers, int):
                skip_layer_guidance_layers = [skip_layer_guidance_layers]
            if not isinstance(skip_layer_guidance_layers, list):
                raise ValueError(
                    f"Expected `skip_layer_guidance_layers` to be an int or a list of ints, but got {type(skip_layer_guidance_layers)}."
                )
            skip_layer_config = [LayerSkipConfig(layer, fqn="auto") for layer in skip_layer_guidance_layers]

        # Normalize `skip_layer_config` into a list of LayerSkipConfig objects,
        # accepting a dict, a single config, or a list of either.
        if isinstance(skip_layer_config, dict):
            skip_layer_config = LayerSkipConfig.from_dict(skip_layer_config)

        if isinstance(skip_layer_config, LayerSkipConfig):
            skip_layer_config = [skip_layer_config]

        if not isinstance(skip_layer_config, list):
            raise ValueError(
                f"Expected `skip_layer_config` to be a LayerSkipConfig or a list of LayerSkipConfig, but got {type(skip_layer_config)}."
            )
        elif isinstance(next(iter(skip_layer_config), None), dict):
            skip_layer_config = [LayerSkipConfig.from_dict(config) for config in skip_layer_config]

        self.skip_layer_config = skip_layer_config
        # One uniquely named hook per config so they can be removed individually.
        self._skip_layer_hook_names = [f"SkipLayerGuidance_{i}" for i in range(len(self.skip_layer_config))]

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        """Attach layer-skip hooks to `denoiser` before the skip-branch forward pass.

        Called once per prepared batch; `_count_prepared > 1` ensures the hooks are
        only applied for the later (skip) conditional pass, not the first plain
        conditional pass.
        """
        self._count_prepared += 1
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            for name, config in zip(self._skip_layer_hook_names, self.skip_layer_config):
                _apply_layer_skip_hook(denoiser, config, name=name)

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        """Remove any layer-skip hooks that `prepare_models` attached to `denoiser`."""
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            registry = HookRegistry.check_if_exists_or_initialize(denoiser)
            # Remove the hooks after inference
            for hook_name in self._skip_layer_hook_names:
                registry.remove_hook(hook_name, recurse=True)

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        """Split `data` into one batch per enabled condition (cond / uncond / cond_skip)."""
        if input_fields is None:
            input_fields = self._input_fields

        # tuple_indices selects the cond (0) or uncond (1) element of each input
        # field tuple; the skip branch reuses the conditional inputs (index 0).
        if self.num_conditions == 1:
            tuple_indices = [0]
            input_predictions = ["pred_cond"]
        elif self.num_conditions == 2:
            tuple_indices = [0, 1]
            input_predictions = (
                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_skip"]
            )
        else:
            tuple_indices = [0, 1, 0]
            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor] = None,
        pred_cond_skip: Optional[torch.Tensor] = None,
    ) -> GuiderOutput:
        """Combine the branch predictions into the final guided prediction.

        The four branches cover every combination of CFG/SLG being enabled; when
        both are active, the CFG and skip-layer shifts are applied additively.
        """
        pred = None

        if not self._is_cfg_enabled() and not self._is_slg_enabled():
            pred = pred_cond
        elif not self._is_cfg_enabled():
            shift = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_cond_skip
            pred = pred + self.skip_layer_guidance_scale * shift
        elif not self._is_slg_enabled():
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift
        else:
            shift = pred_cond - pred_uncond
            shift_skip = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift + self.skip_layer_guidance_scale * shift_skip

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)

    @property
    def is_conditional(self) -> bool:
        # Batches are prepared in the order cond (1), uncond (2), cond_skip (3),
        # so the 1st and 3rd prepared passes are conditional.
        return self._count_prepared == 1 or self._count_prepared == 3

    @property
    def num_conditions(self) -> int:
        # Always at least the plain conditional pass; add one each for CFG and SLG.
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        if self._is_slg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        """Whether classifier-free guidance is active at the current step."""
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        # A scale of 1.0 (diffusers formulation) or 0.0 (original formulation)
        # makes CFG a no-op, so treat it as disabled.
        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close

    def _is_slg_enabled(self) -> bool:
        """Whether skip-layer guidance is active at the current step."""
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self.skip_layer_guidance_start * self._num_inference_steps)
            skip_stop_step = int(self.skip_layer_guidance_stop * self._num_inference_steps)
            is_within_range = skip_start_step < self._step < skip_stop_step

        is_zero = math.isclose(self.skip_layer_guidance_scale, 0.0)

        return is_within_range and not is_zero
pythonProject/.venv/Lib/site-packages/diffusers/guiders/smoothed_energy_guidance.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from ..configuration_utils import register_to_config
21
+ from ..hooks import HookRegistry
22
+ from ..hooks.smoothed_energy_guidance_utils import SmoothedEnergyGuidanceConfig, _apply_smoothed_energy_guidance_hook
23
+ from .guider_utils import BaseGuidance, GuiderOutput, rescale_noise_cfg
24
+
25
+
26
+ if TYPE_CHECKING:
27
+ from ..modular_pipelines.modular_pipeline import BlockState
28
+
29
+
30
class SmoothedEnergyGuidance(BaseGuidance):
    """
    Smoothed Energy Guidance (SEG): https://huggingface.co/papers/2408.00760

    SEG is only supported as an experimental prototype feature for now, so the implementation may be modified in the
    future without warning or guarantee of reproducibility. This implementation assumes:
    - Generated images are square (height == width)
    - The model does not combine different modalities together (e.g., text and image latent streams are not combined
    together such as Flux)

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        seg_guidance_scale (`float`, defaults to `2.8`):
            The scale parameter for smoothed energy guidance. Anatomy and structure coherence may improve with higher
            values, but it may also lead to overexposure and saturation.
        seg_blur_sigma (`float`, defaults to `9999999.0`):
            The amount by which we blur the attention weights. Setting this value greater than 9999.0 results in
            infinite blur, which means uniform queries. Controlling it exponentially is empirically effective.
        seg_blur_threshold_inf (`float`, defaults to `9999.0`):
            The threshold above which the blur is considered infinite.
        seg_guidance_start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which smoothed energy guidance starts.
        seg_guidance_stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which smoothed energy guidance stops.
        seg_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply smoothed energy guidance to. Can be a single integer or a list of integers. If
            not provided, `seg_guidance_config` must be provided. The recommended values are `[7, 8, 9]` for Stable
            Diffusion 3.5 Medium.
        seg_guidance_config (`SmoothedEnergyGuidanceConfig` or `List[SmoothedEnergyGuidanceConfig]`, *optional*):
            The configuration for the smoothed energy layer guidance. Can be a single `SmoothedEnergyGuidanceConfig` or
            a list of `SmoothedEnergyGuidanceConfig`. If not provided, `seg_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    # Fixed ordering of the model outputs this guider consumes; prepare_inputs
    # selects a subset of these names depending on which branches are enabled.
    _input_predictions = ["pred_cond", "pred_uncond", "pred_cond_seg"]

    @register_to_config
    def __init__(
        self,
        guidance_scale: float = 7.5,
        seg_guidance_scale: float = 2.8,
        seg_blur_sigma: float = 9999999.0,
        seg_blur_threshold_inf: float = 9999.0,
        seg_guidance_start: float = 0.0,
        seg_guidance_stop: float = 1.0,
        seg_guidance_layers: Optional[Union[int, List[int]]] = None,
        seg_guidance_config: Union[SmoothedEnergyGuidanceConfig, List[SmoothedEnergyGuidanceConfig]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.seg_guidance_scale = seg_guidance_scale
        self.seg_blur_sigma = seg_blur_sigma
        self.seg_blur_threshold_inf = seg_blur_threshold_inf
        self.seg_guidance_start = seg_guidance_start
        self.seg_guidance_stop = seg_guidance_stop
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if not (0.0 <= seg_guidance_start < 1.0):
            raise ValueError(f"Expected `seg_guidance_start` to be between 0.0 and 1.0, but got {seg_guidance_start}.")
        if not (seg_guidance_start <= seg_guidance_stop <= 1.0):
            raise ValueError(f"Expected `seg_guidance_stop` to be between 0.0 and 1.0, but got {seg_guidance_stop}.")

        # Exactly one of `seg_guidance_layers` / `seg_guidance_config` must be given;
        # layer indices are converted into SmoothedEnergyGuidanceConfig objects below.
        if seg_guidance_layers is None and seg_guidance_config is None:
            raise ValueError(
                "Either `seg_guidance_layers` or `seg_guidance_config` must be provided to enable Smoothed Energy Guidance."
            )
        if seg_guidance_layers is not None and seg_guidance_config is not None:
            raise ValueError("Only one of `seg_guidance_layers` or `seg_guidance_config` can be provided.")

        if seg_guidance_layers is not None:
            if isinstance(seg_guidance_layers, int):
                seg_guidance_layers = [seg_guidance_layers]
            if not isinstance(seg_guidance_layers, list):
                raise ValueError(
                    f"Expected `seg_guidance_layers` to be an int or a list of ints, but got {type(seg_guidance_layers)}."
                )
            seg_guidance_config = [SmoothedEnergyGuidanceConfig(layer, fqn="auto") for layer in seg_guidance_layers]

        # Normalize `seg_guidance_config` into a list of SmoothedEnergyGuidanceConfig
        # objects, accepting a dict, a single config, or a list of either.
        if isinstance(seg_guidance_config, dict):
            seg_guidance_config = SmoothedEnergyGuidanceConfig.from_dict(seg_guidance_config)

        if isinstance(seg_guidance_config, SmoothedEnergyGuidanceConfig):
            seg_guidance_config = [seg_guidance_config]

        if not isinstance(seg_guidance_config, list):
            raise ValueError(
                f"Expected `seg_guidance_config` to be a SmoothedEnergyGuidanceConfig or a list of SmoothedEnergyGuidanceConfig, but got {type(seg_guidance_config)}."
            )
        elif isinstance(next(iter(seg_guidance_config), None), dict):
            seg_guidance_config = [SmoothedEnergyGuidanceConfig.from_dict(config) for config in seg_guidance_config]

        self.seg_guidance_config = seg_guidance_config
        # One uniquely named hook per config so they can be removed individually.
        self._seg_layer_hook_names = [f"SmoothedEnergyGuidance_{i}" for i in range(len(self.seg_guidance_config))]

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        """Attach smoothed-energy hooks to `denoiser` before the SEG-branch forward pass.

        Called once per prepared batch; `_count_prepared > 1` ensures the hooks are
        only applied for the later (SEG) conditional pass, not the first plain
        conditional pass.
        """
        # Bugfix: track how many batches have been prepared. Without this
        # increment, `is_conditional` (`_count_prepared == 1 or == 3`) and the
        # `_count_prepared > 1` gate below can never fire, unlike the otherwise
        # identical SkipLayerGuidance.prepare_models which does increment.
        self._count_prepared += 1
        if self._is_seg_enabled() and self.is_conditional and self._count_prepared > 1:
            for name, config in zip(self._seg_layer_hook_names, self.seg_guidance_config):
                _apply_smoothed_energy_guidance_hook(denoiser, config, self.seg_blur_sigma, name=name)

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        """Remove any smoothed-energy hooks that `prepare_models` attached to `denoiser`."""
        if self._is_seg_enabled() and self.is_conditional and self._count_prepared > 1:
            registry = HookRegistry.check_if_exists_or_initialize(denoiser)
            # Remove the hooks after inference
            for hook_name in self._seg_layer_hook_names:
                registry.remove_hook(hook_name, recurse=True)

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        """Split `data` into one batch per enabled condition (cond / uncond / cond_seg)."""
        if input_fields is None:
            input_fields = self._input_fields

        # tuple_indices selects the cond (0) or uncond (1) element of each input
        # field tuple; the SEG branch reuses the conditional inputs (index 0).
        if self.num_conditions == 1:
            tuple_indices = [0]
            input_predictions = ["pred_cond"]
        elif self.num_conditions == 2:
            tuple_indices = [0, 1]
            input_predictions = (
                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_seg"]
            )
        else:
            tuple_indices = [0, 1, 0]
            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_seg"]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor] = None,
        pred_cond_seg: Optional[torch.Tensor] = None,
    ) -> GuiderOutput:
        """Combine the branch predictions into the final guided prediction.

        The four branches cover every combination of CFG/SEG being enabled; when
        both are active, the CFG and smoothed-energy shifts are applied additively.
        """
        pred = None

        if not self._is_cfg_enabled() and not self._is_seg_enabled():
            pred = pred_cond
        elif not self._is_cfg_enabled():
            shift = pred_cond - pred_cond_seg
            pred = pred_cond if self.use_original_formulation else pred_cond_seg
            pred = pred + self.seg_guidance_scale * shift
        elif not self._is_seg_enabled():
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift
        else:
            shift = pred_cond - pred_uncond
            shift_seg = pred_cond - pred_cond_seg
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift + self.seg_guidance_scale * shift_seg

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)

    @property
    def is_conditional(self) -> bool:
        # Batches are prepared in the order cond (1), uncond (2), cond_seg (3),
        # so the 1st and 3rd prepared passes are conditional.
        return self._count_prepared == 1 or self._count_prepared == 3

    @property
    def num_conditions(self) -> int:
        # Always at least the plain conditional pass; add one each for CFG and SEG.
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        if self._is_seg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        """Whether classifier-free guidance is active at the current step."""
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        # A scale of 1.0 (diffusers formulation) or 0.0 (original formulation)
        # makes CFG a no-op, so treat it as disabled.
        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close

    def _is_seg_enabled(self) -> bool:
        """Whether smoothed energy guidance is active at the current step."""
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self.seg_guidance_start * self._num_inference_steps)
            skip_stop_step = int(self.seg_guidance_stop * self._num_inference_steps)
            is_within_range = skip_start_step < self._step < skip_stop_step

        is_zero = math.isclose(self.seg_guidance_scale, 0.0)

        return is_within_range and not is_zero
pythonProject/.venv/Lib/site-packages/diffusers/guiders/tangential_classifier_free_guidance.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from ..configuration_utils import register_to_config
21
+ from .guider_utils import BaseGuidance, GuiderOutput, rescale_noise_cfg
22
+
23
+
24
+ if TYPE_CHECKING:
25
+ from ..modular_pipelines.modular_pipeline import BlockState
26
+
27
+
28
class TangentialClassifierFreeGuidance(BaseGuidance):
    """
    Tangential Classifier Free Guidance (TCFG): https://huggingface.co/papers/2503.18137

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    # Fixed ordering of the model outputs this guider consumes.
    _input_predictions = ["pred_cond", "pred_uncond"]

    @register_to_config
    def __init__(
        self,
        guidance_scale: float = 7.5,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        """Split `data` into one batch per enabled condition (cond, and uncond when TCFG is active)."""
        if input_fields is None:
            input_fields = self._input_fields

        # Index 0 selects the conditional element of each input field tuple,
        # index 1 the unconditional one.
        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput:
        """Combine conditional/unconditional predictions via tangential-projection CFG."""
        pred = None

        if not self._is_tcfg_enabled():
            pred = pred_cond
        else:
            pred = normalized_guidance(pred_cond, pred_uncond, self.guidance_scale, self.use_original_formulation)

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)

    @property
    def is_conditional(self) -> bool:
        # Consistency fix: every other guider in this package tracks prepared
        # batches via `_count_prepared`; `_num_outputs_prepared` is not set
        # anywhere and would raise AttributeError here.
        return self._count_prepared == 1

    @property
    def num_conditions(self) -> int:
        # The plain conditional pass, plus the unconditional pass when TCFG is active.
        num_conditions = 1
        if self._is_tcfg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_tcfg_enabled(self) -> bool:
        """Whether tangential CFG is active at the current step."""
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        # A scale of 1.0 (diffusers formulation) or 0.0 (original formulation)
        # makes guidance a no-op, so treat it as disabled.
        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close
122
+
123
+
124
def normalized_guidance(
    pred_cond: torch.Tensor, pred_uncond: torch.Tensor, guidance_scale: float, use_original_formulation: bool = False
) -> torch.Tensor:
    """Apply tangential classifier-free guidance (TCFG).

    The unconditional prediction is projected onto the principal singular
    direction shared by the conditional/unconditional pair (the second
    singular direction is discarded), and standard CFG extrapolation is then
    performed against that projected unconditional prediction.

    Args:
        pred_cond: Conditional model prediction.
        pred_uncond: Unconditional model prediction (same shape as `pred_cond`).
        guidance_scale: CFG extrapolation strength.
        use_original_formulation: If True, extrapolate from the conditional
            prediction instead of the projected unconditional one.

    Returns:
        The guided prediction, in `pred_cond`'s original dtype.
    """
    original_dtype = pred_cond.dtype

    # SVD of the (cond, uncond) pair, flattened to (batch, 2, features).
    stacked = torch.stack([pred_cond, pred_uncond], dim=1).float().flatten(2)
    _, _, right_vectors = torch.linalg.svd(stacked, full_matrices=False)

    # Keep only the first right-singular direction for the reconstruction.
    truncated_basis = right_vectors.clone()
    truncated_basis[:, 1] = 0

    # Project the unconditional prediction: coords in the full basis,
    # reconstructed through the truncated basis.
    flat_uncond = pred_uncond.reshape(pred_uncond.size(0), 1, -1).float()
    coords = torch.matmul(flat_uncond, right_vectors.transpose(-2, -1))
    reconstructed = torch.matmul(coords, truncated_basis)
    pred_uncond = reconstructed.reshape(pred_uncond.shape).to(original_dtype)

    base = pred_cond if use_original_formulation else pred_uncond
    return base + guidance_scale * (pred_cond - pred_uncond)
pythonProject/.venv/Lib/site-packages/diffusers/hooks/faster_cache.py ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import re
16
+ from dataclasses import dataclass
17
+ from typing import Any, Callable, List, Optional, Tuple
18
+
19
+ import torch
20
+
21
+ from ..models.attention import AttentionModuleMixin
22
+ from ..models.modeling_outputs import Transformer2DModelOutput
23
+ from ..utils import logging
24
+ from ._common import _ATTENTION_CLASSES
25
+ from .hooks import HookRegistry, ModelHook
26
+
27
+
28
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Names under which the FasterCache hooks are registered on each module's HookRegistry.
_FASTER_CACHE_DENOISER_HOOK = "faster_cache_denoiser"
_FASTER_CACHE_BLOCK_HOOK = "faster_cache_block"
# Regex patterns (matched with re.search against submodule names) used to locate
# spatial / temporal attention blocks in supported transformer architectures.
_SPATIAL_ATTENTION_BLOCK_IDENTIFIERS = (
    "^blocks.*attn",
    "^transformer_blocks.*attn",
    "^single_transformer_blocks.*attn",
)
_TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS = ("^temporal_transformer_blocks.*attn",)
_TRANSFORMER_BLOCK_IDENTIFIERS = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS + _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS
# Kwarg names whose values may carry batchwise-concatenated [uncond; cond] inputs;
# these are split when the unconditional branch is skipped by the denoiser hook.
_UNCOND_COND_INPUT_KWARGS_IDENTIFIERS = (
    "hidden_states",
    "encoder_hidden_states",
    "timestep",
    "attention_mask",
    "encoder_attention_mask",
)
+
48
+
49
@dataclass
class FasterCacheConfig:
    r"""
    Configuration for [FasterCache](https://huggingface.co/papers/2410.19355).

    Attributes:
        spatial_attention_block_skip_range (`int`, defaults to `2`):
            Compute spatial attention states only every `N` iterations; cached states are reused for the `N - 1`
            iterations in between.
        temporal_attention_block_skip_range (`int`, *optional*, defaults to `None`):
            Same as above, for temporal attention blocks. `None` disables temporal-attention skipping.
        spatial_attention_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 681)`):
            Open interval `(lower, upper)` of diffusion timesteps within which spatial attention computation may
            be skipped without significant quality loss. Diffusion timesteps typically run in reverse from 1000
            down to 0, so the default means skipping applies from timestep 681 to the end of denoising.
        temporal_attention_timestep_skip_range (`Tuple[float, float]`, *optional*, defaults to `None`):
            Timestep interval within which temporal attention computation may be skipped.
        low_frequency_weight_update_timestep_range (`Tuple[int, int]`, defaults to `(99, 901)`):
            Timestep interval within which the low-frequency weight scaling update is applied.
        high_frequency_weight_update_timestep_range (`Tuple[int, int]`, defaults to `(-1, 301)`):
            Timestep interval within which the high-frequency weight scaling update is applied.
        alpha_low_frequency (`float`, defaults to `1.1`):
            Scale applied to low-frequency updates when approximating the unconditional branch from the
            conditional branch outputs.
        alpha_high_frequency (`float`, defaults to `1.1`):
            Scale applied to high-frequency updates when approximating the unconditional branch from the
            conditional branch outputs.
        unconditional_batch_skip_range (`int`, defaults to `5`):
            Compute the unconditional branch only every `N` iterations; the approximation from cached state is
            used for the `N - 1` iterations in between.
        unconditional_batch_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 641)`):
            Timestep interval within which the unconditional branch may be skipped.
        spatial_attention_block_identifiers (`Tuple[str, ...]`):
            Regex patterns matched (full or partial, via regex search) against layer names to select spatial
            attention blocks.
        temporal_attention_block_identifiers (`Tuple[str, ...]`):
            Regex patterns matched against layer names to select temporal attention blocks.
        attention_weight_callback (`Callable[[torch.nn.Module], float]`, defaults to `None`):
            Given the attention module, returns the weight used to scale cached attention outputs. Defaults to a
            constant 0.5 for all timesteps; the paper recommends a schedule that gradually increases from 0 to 1
            over inference.
        low_frequency_weight_callback (`Callable[[torch.nn.Module], float]`, defaults to `None`):
            Returns the scale for low-frequency updates. Defaults to the paper's behaviour: 1.1 inside the
            configured timestep range, 1.0 outside.
        high_frequency_weight_callback (`Callable[[torch.nn.Module], float]`, defaults to `None`):
            Returns the scale for high-frequency updates, defaulting analogously to the low-frequency callback.
        tensor_format (`str`, defaults to `"BCFHW"`):
            Layout of the denoiser output tensor: one of `"BCFHW"`, `"BFCHW"`, or `"BCHW"`. Used to split latent
            frames so low/high frequency components can be computed per frame.
        is_guidance_distilled (`bool`, defaults to `False`):
            Guidance-distilled models have no unconditional branch, so denoiser-level skipping is disabled.
        current_timestep_callback (`Callable[[], int]`, defaults to `None`):
            Returns the current diffusion timestep; the hooks call it to decide when skipping is allowed.
        _unconditional_conditional_input_kwargs_identifiers (`List[str]`):
            Exact kwarg names whose values contain batchwise-concatenated unconditional and conditional inputs.
    """

    # Hardcoded to 2 in the paper/codebase; kept configurable here.
    spatial_attention_block_skip_range: int = 2
    temporal_attention_block_skip_range: Optional[int] = None

    spatial_attention_timestep_skip_range: Tuple[int, int] = (-1, 681)
    temporal_attention_timestep_skip_range: Tuple[int, int] = (-1, 681)

    # Indicator functions for low/high frequency as in Equation 11 of the paper.
    low_frequency_weight_update_timestep_range: Tuple[int, int] = (99, 901)
    high_frequency_weight_update_timestep_range: Tuple[int, int] = (-1, 301)

    # ⍺1 and ⍺2 from Equation 11 of the paper.
    alpha_low_frequency: float = 1.1
    alpha_high_frequency: float = 1.1

    # n as described in the CFG-Cache section of the paper — model dependent.
    unconditional_batch_skip_range: int = 5
    unconditional_batch_timestep_skip_range: Tuple[int, int] = (-1, 641)

    spatial_attention_block_identifiers: Tuple[str, ...] = _SPATIAL_ATTENTION_BLOCK_IDENTIFIERS
    temporal_attention_block_identifiers: Tuple[str, ...] = _TEMPORAL_ATTENTION_BLOCK_IDENTIFIERS

    attention_weight_callback: Callable[[torch.nn.Module], float] = None
    low_frequency_weight_callback: Callable[[torch.nn.Module], float] = None
    high_frequency_weight_callback: Callable[[torch.nn.Module], float] = None

    tensor_format: str = "BCFHW"
    is_guidance_distilled: bool = False

    current_timestep_callback: Callable[[], int] = None

    _unconditional_conditional_input_kwargs_identifiers: List[str] = _UNCOND_COND_INPUT_KWARGS_IDENTIFIERS

    def __repr__(self) -> str:
        # Only the user-facing tuning knobs are shown, in a fixed order.
        shown_fields = (
            "spatial_attention_block_skip_range",
            "temporal_attention_block_skip_range",
            "spatial_attention_timestep_skip_range",
            "temporal_attention_timestep_skip_range",
            "low_frequency_weight_update_timestep_range",
            "high_frequency_weight_update_timestep_range",
            "alpha_low_frequency",
            "alpha_high_frequency",
            "unconditional_batch_skip_range",
            "unconditional_batch_timestep_skip_range",
            "spatial_attention_block_identifiers",
            "temporal_attention_block_identifiers",
            "tensor_format",
        )
        body = "".join(f"  {name}={getattr(self, name)},\n" for name in shown_fields)
        return f"FasterCacheConfig(\n{body})"
185
+
186
+
187
class FasterCacheDenoiserState:
    r"""
    Mutable per-run state for the top-level [FasterCache](https://huggingface.co/papers/2410.19355)
    denoiser hook.
    """

    def __init__(self) -> None:
        # Initialization and reset are the same operation: a clean slate.
        self.reset()

    def reset(self):
        # iteration: number of denoiser forward passes seen so far.
        # low/high_frequency_delta: cached spectral differences between the
        # unconditional and conditional branch outputs.
        self.iteration: int = 0
        self.low_frequency_delta: torch.Tensor = None
        self.high_frequency_delta: torch.Tensor = None
+
202
+
203
class FasterCacheBlockState:
    r"""
    Per-block state for [FasterCache](https://huggingface.co/papers/2410.19355). Each underlying block
    that FasterCache is applied to owns one instance of this state.
    """

    def __init__(self) -> None:
        # Initialization and reset are the same operation: a clean slate.
        self.reset()

    def reset(self):
        # iteration: forward passes seen so far for this block.
        # batch_size: true (uncond + cond) batch size, captured on the first pass.
        # cache: block outputs from the two most recent computed passes.
        self.iteration: int = 0
        self.batch_size: int = None
        self.cache: Tuple[torch.Tensor, torch.Tensor] = None
+
219
+
220
class FasterCacheDenoiserHook(ModelHook):
    """Top-level denoiser hook implementing FasterCache's CFG-Cache: skips the
    unconditional branch on most iterations and approximates its output in the
    frequency domain from the conditional branch output plus cached deltas."""

    _is_stateful = True

    def __init__(
        self,
        unconditional_batch_skip_range: int,
        unconditional_batch_timestep_skip_range: Tuple[int, int],
        tensor_format: str,
        is_guidance_distilled: bool,
        uncond_cond_input_kwargs_identifiers: List[str],
        current_timestep_callback: Callable[[], int],
        low_frequency_weight_callback: Callable[[torch.nn.Module], torch.Tensor],
        high_frequency_weight_callback: Callable[[torch.nn.Module], torch.Tensor],
    ) -> None:
        super().__init__()

        self.unconditional_batch_skip_range = unconditional_batch_skip_range
        self.unconditional_batch_timestep_skip_range = unconditional_batch_timestep_skip_range
        # We can't easily detect what args are to be split in unconditional and conditional branches. We
        # can only do it for kwargs, hence they are the only ones we split. The args are passed as-is.
        # If a model is to be made compatible with FasterCache, the user must ensure that the inputs that
        # contain batchwise-concatenated unconditional and conditional inputs are passed as kwargs.
        self.uncond_cond_input_kwargs_identifiers = uncond_cond_input_kwargs_identifiers
        self.tensor_format = tensor_format
        self.is_guidance_distilled = is_guidance_distilled

        self.current_timestep_callback = current_timestep_callback
        self.low_frequency_weight_callback = low_frequency_weight_callback
        self.high_frequency_weight_callback = high_frequency_weight_callback

    def initialize_hook(self, module):
        # Fresh per-run state; reset again via reset_state() between pipeline runs.
        self.state = FasterCacheDenoiserState()
        return module

    @staticmethod
    def _get_cond_input(input: torch.Tensor) -> torch.Tensor:
        # Note: this method assumes that the input tensor is batchwise-concatenated with unconditional inputs
        # followed by conditional inputs.
        _, cond = input.chunk(2, dim=0)
        return cond

    def new_forward(self, module: torch.nn.Module, *args, **kwargs) -> Any:
        """Run the wrapped denoiser forward, skipping the unconditional half of the
        batch when allowed and reconstructing it from cached frequency deltas."""
        # Split the unconditional and conditional inputs. We only want to infer the conditional branch if the
        # requirements for skipping the unconditional branch are met as described in the paper.
        # We skip the unconditional branch only if the following conditions are met:
        # 1. We have completed at least one iteration of the denoiser
        # 2. The current timestep is within the range specified by the user. This is the optimal timestep range
        # where approximating the unconditional branch from the computation of the conditional branch is possible
        # without a significant loss in quality.
        # 3. The current iteration is not a multiple of the unconditional batch skip range. This is done so that
        # we compute the unconditional branch at least once every few iterations to ensure minimal quality loss.
        is_within_timestep_range = (
            self.unconditional_batch_timestep_skip_range[0]
            < self.current_timestep_callback()
            < self.unconditional_batch_timestep_skip_range[1]
        )
        should_skip_uncond = (
            self.state.iteration > 0
            and is_within_timestep_range
            and self.state.iteration % self.unconditional_batch_skip_range != 0
            and not self.is_guidance_distilled
        )

        if should_skip_uncond:
            is_any_kwarg_uncond = any(k in self.uncond_cond_input_kwargs_identifiers for k in kwargs.keys())
            if is_any_kwarg_uncond:
                logger.debug("FasterCache - Skipping unconditional branch computation")
                # Positional tensors are split too; non-tensors pass through unchanged.
                args = tuple([self._get_cond_input(arg) if torch.is_tensor(arg) else arg for arg in args])
                kwargs = {
                    k: v if k not in self.uncond_cond_input_kwargs_identifiers else self._get_cond_input(v)
                    for k, v in kwargs.items()
                }

        output = self.fn_ref.original_forward(*args, **kwargs)

        if self.is_guidance_distilled:
            # No unconditional branch exists — nothing to cache or approximate.
            self.state.iteration += 1
            return output

        # NOTE(review): if `output` is neither a tensor, tuple, nor
        # Transformer2DModelOutput, `hidden_states` is unbound and the next line
        # raises — assumes all supported denoisers return one of these forms.
        if torch.is_tensor(output):
            hidden_states = output
        elif isinstance(output, (tuple, Transformer2DModelOutput)):
            hidden_states = output[0]

        batch_size = hidden_states.size(0)

        if should_skip_uncond:
            self.state.low_frequency_delta = self.state.low_frequency_delta * self.low_frequency_weight_callback(
                module
            )
            self.state.high_frequency_delta = self.state.high_frequency_delta * self.high_frequency_weight_callback(
                module
            )

            # Reshape so frames are flattened into the batch dim before the 2D FFT split.
            if self.tensor_format == "BCFHW":
                hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
            if self.tensor_format == "BCFHW" or self.tensor_format == "BFCHW":
                hidden_states = hidden_states.flatten(0, 1)

            low_freq_cond, high_freq_cond = _split_low_high_freq(hidden_states.float())

            # Approximate/compute the unconditional branch outputs as described in Equation 9 and 10 of the paper
            low_freq_uncond = self.state.low_frequency_delta + low_freq_cond
            high_freq_uncond = self.state.high_frequency_delta + high_freq_cond
            uncond_freq = low_freq_uncond + high_freq_uncond

            uncond_states = torch.fft.ifftshift(uncond_freq)
            uncond_states = torch.fft.ifft2(uncond_states).real

            # Undo the frame flattening/permutation applied above.
            if self.tensor_format == "BCFHW" or self.tensor_format == "BFCHW":
                uncond_states = uncond_states.unflatten(0, (batch_size, -1))
                hidden_states = hidden_states.unflatten(0, (batch_size, -1))
            if self.tensor_format == "BCFHW":
                uncond_states = uncond_states.permute(0, 2, 1, 3, 4)
                hidden_states = hidden_states.permute(0, 2, 1, 3, 4)

            # Concatenate the approximated unconditional and predicted conditional branches
            uncond_states = uncond_states.to(hidden_states.dtype)
            hidden_states = torch.cat([uncond_states, hidden_states], dim=0)
        else:
            # Both branches were computed: refresh the cached frequency deltas.
            uncond_states, cond_states = hidden_states.chunk(2, dim=0)
            if self.tensor_format == "BCFHW":
                uncond_states = uncond_states.permute(0, 2, 1, 3, 4)
                cond_states = cond_states.permute(0, 2, 1, 3, 4)
            if self.tensor_format == "BCFHW" or self.tensor_format == "BFCHW":
                uncond_states = uncond_states.flatten(0, 1)
                cond_states = cond_states.flatten(0, 1)

            low_freq_uncond, high_freq_uncond = _split_low_high_freq(uncond_states.float())
            low_freq_cond, high_freq_cond = _split_low_high_freq(cond_states.float())
            self.state.low_frequency_delta = low_freq_uncond - low_freq_cond
            self.state.high_frequency_delta = high_freq_uncond - high_freq_cond

        self.state.iteration += 1
        # Re-wrap the (possibly reconstructed) hidden states in the original output form.
        if torch.is_tensor(output):
            output = hidden_states
        elif isinstance(output, tuple):
            output = (hidden_states, *output[1:])
        else:
            output.sample = hidden_states

        return output

    def reset_state(self, module: torch.nn.Module) -> torch.nn.Module:
        self.state.reset()
        return module
+
367
+
368
class FasterCacheBlockHook(ModelHook):
    """Attention-block-level FasterCache hook: skips the block's computation on most
    iterations and extrapolates its output from the two most recently cached results."""

    _is_stateful = True

    def __init__(
        self,
        block_skip_range: int,
        timestep_skip_range: Tuple[int, int],
        is_guidance_distilled: bool,
        weight_callback: Callable[[torch.nn.Module], float],
        current_timestep_callback: Callable[[], int],
    ) -> None:
        super().__init__()

        self.block_skip_range = block_skip_range
        self.timestep_skip_range = timestep_skip_range
        self.is_guidance_distilled = is_guidance_distilled

        self.weight_callback = weight_callback
        self.current_timestep_callback = current_timestep_callback

    def initialize_hook(self, module):
        # Fresh per-run state; reset again via reset_state() between pipeline runs.
        self.state = FasterCacheBlockState()
        return module

    def _compute_approximated_attention_output(
        self, t_2_output: torch.Tensor, t_output: torch.Tensor, weight: float, batch_size: int
    ) -> torch.Tensor:
        """Linear extrapolation from the two most recent cached outputs:
        out ≈ t_output + weight * (t_output - t_2_output)."""
        if t_2_output.size(0) != batch_size:
            # The cache t_2_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
            # take the conditional branch outputs.
            assert t_2_output.size(0) == 2 * batch_size
            t_2_output = t_2_output[batch_size:]
        if t_output.size(0) != batch_size:
            # The cache t_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
            # take the conditional branch outputs.
            assert t_output.size(0) == 2 * batch_size
            t_output = t_output[batch_size:]
        return t_output + (t_output - t_2_output) * weight

    def new_forward(self, module: torch.nn.Module, *args, **kwargs) -> Any:
        """Either run the wrapped block or return an extrapolation from the cache."""
        # Current batch size: taken from the first tensor among args/kwargs.
        batch_size = [
            *[arg.size(0) for arg in args if torch.is_tensor(arg)],
            *[v.size(0) for v in kwargs.values() if torch.is_tensor(v)],
        ][0]
        if self.state.batch_size is None:
            # Will be updated on first forward pass through the denoiser
            self.state.batch_size = batch_size

        # If we have to skip due to the skip conditions, then let's skip as expected.
        # But, we can't skip if the denoiser wants to infer both unconditional and conditional branches. This
        # is because the expected output shapes of attention layer will not match if we only return values from
        # the cache (which only caches conditional branch outputs). So, if state.batch_size (which is the true
        # unconditional-conditional batch size) is same as the current batch size, we don't perform the layer
        # skip. Otherwise, we conditionally skip the layer based on what state.skip_callback returns.
        is_within_timestep_range = (
            self.timestep_skip_range[0] < self.current_timestep_callback() < self.timestep_skip_range[1]
        )
        if not is_within_timestep_range:
            should_skip_attention = False
        else:
            should_compute_attention = self.state.iteration > 0 and self.state.iteration % self.block_skip_range == 0
            should_skip_attention = not should_compute_attention
            if should_skip_attention:
                should_skip_attention = self.is_guidance_distilled or self.state.batch_size != batch_size

        if should_skip_attention:
            logger.debug("FasterCache - Skipping attention and using approximation")
            if torch.is_tensor(self.state.cache[-1]):
                t_2_output, t_output = self.state.cache
                weight = self.weight_callback(module)
                output = self._compute_approximated_attention_output(t_2_output, t_output, weight, batch_size)
            else:
                # The cache contains multiple tensors from past N iterations (N=2 for FasterCache). We need to handle all of them.
                # Diffusers blocks can return multiple tensors - let's call them [A, B, C, ...] for simplicity.
                # In our cache, we would have [[A_1, B_1, C_1, ...], [A_2, B_2, C_2, ...], ...] where each list is the output from
                # a forward pass of the block. We need to compute the approximated output for each of these tensors.
                # The zip(*state.cache) operation will give us [(A_1, A_2, ...), (B_1, B_2, ...), (C_1, C_2, ...), ...] which
                # allows us to compute the approximated attention output for each tensor in the cache.
                output = ()
                for t_2_output, t_output in zip(*self.state.cache):
                    result = self._compute_approximated_attention_output(
                        t_2_output, t_output, self.weight_callback(module), batch_size
                    )
                    output += (result,)
        else:
            logger.debug("FasterCache - Computing attention")
            output = self.fn_ref.original_forward(*args, **kwargs)

        # Note that the following condition for getting hidden_states should suffice since Diffusers blocks either return
        # a single hidden_states tensor, or a tuple of (hidden_states, encoder_hidden_states) tensors. We need to handle
        # both cases.
        if torch.is_tensor(output):
            cache_output = output
            if not self.is_guidance_distilled and cache_output.size(0) == self.state.batch_size:
                # The output here can be both unconditional-conditional branch outputs or just conditional branch outputs.
                # This is determined at the higher-level denoiser module. We only want to cache the conditional branch outputs.
                cache_output = cache_output.chunk(2, dim=0)[1]
        else:
            # Cache all return values and perform the same operation as above
            cache_output = ()
            for out in output:
                if not self.is_guidance_distilled and out.size(0) == self.state.batch_size:
                    out = out.chunk(2, dim=0)[1]
                cache_output += (out,)

        # Keep a rolling window of the two most recent outputs (t-2 and t).
        if self.state.cache is None:
            self.state.cache = [cache_output, cache_output]
        else:
            self.state.cache = [self.state.cache[-1], cache_output]

        self.state.iteration += 1
        return output

    def reset_state(self, module: torch.nn.Module) -> torch.nn.Module:
        self.state.reset()
        return module
+
485
+
486
def apply_faster_cache(module: torch.nn.Module, config: FasterCacheConfig) -> None:
    r"""
    Applies [FasterCache](https://huggingface.co/papers/2410.19355) to a given pipeline.

    Args:
        module (`torch.nn.Module`):
            The pytorch module to apply FasterCache to. Typically, this should be a transformer architecture supported
            in Diffusers, such as `CogVideoXTransformer3DModel`, but external implementations may also work.
        config (`FasterCacheConfig`):
            The configuration to use for FasterCache. Note that the hooks call
            `config.current_timestep_callback()` on every forward pass, so it must be set to a callable
            returning the current diffusion timestep.

    Example:
    ```python
    >>> import torch
    >>> from diffusers import CogVideoXPipeline, FasterCacheConfig, apply_faster_cache

    >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
    >>> pipe.to("cuda")

    >>> config = FasterCacheConfig(
    ...     spatial_attention_block_skip_range=2,
    ...     spatial_attention_timestep_skip_range=(-1, 681),
    ...     low_frequency_weight_update_timestep_range=(99, 641),
    ...     high_frequency_weight_update_timestep_range=(-1, 301),
    ...     spatial_attention_block_identifiers=["transformer_blocks"],
    ...     attention_weight_callback=lambda _: 0.3,
    ...     tensor_format="BFCHW",
    ... )
    >>> apply_faster_cache(pipe.transformer, config)
    ```
    """

    logger.warning(
        "FasterCache is a purely experimental feature and may not work as expected. Not all models support FasterCache. "
        "The API is subject to change in future releases, with no guarantee of backward compatibility. Please report any issues at "
        "https://github.com/huggingface/diffusers/issues."
    )

    # NOTE: missing callbacks are filled in on `config` itself, so the passed-in
    # config object is mutated by this function.
    if config.attention_weight_callback is None:
        # If the user has not provided a weight callback, we default to 0.5 for all timesteps.
        # In the paper, they recommend using a gradually increasing weight from 0 to 1 as the inference progresses, but
        # this depends from model-to-model. It is required by the user to provide a weight callback if they want to
        # use a different weight function. Defaulting to 0.5 works well in practice for most cases.
        logger.warning(
            "No `attention_weight_callback` provided when enabling FasterCache. Defaulting to using a weight of 0.5 for all timesteps."
        )
        config.attention_weight_callback = lambda _: 0.5

    if config.low_frequency_weight_callback is None:
        logger.debug(
            "Low frequency weight callback not provided when enabling FasterCache. Defaulting to behaviour described in the paper."
        )

        def low_frequency_weight_callback(module: torch.nn.Module) -> float:
            # Equation 11 of the paper: alpha applies only inside the configured range.
            is_within_range = (
                config.low_frequency_weight_update_timestep_range[0]
                < config.current_timestep_callback()
                < config.low_frequency_weight_update_timestep_range[1]
            )
            return config.alpha_low_frequency if is_within_range else 1.0

        config.low_frequency_weight_callback = low_frequency_weight_callback

    if config.high_frequency_weight_callback is None:
        logger.debug(
            "High frequency weight callback not provided when enabling FasterCache. Defaulting to behaviour described in the paper."
        )

        def high_frequency_weight_callback(module: torch.nn.Module) -> float:
            # Equation 11 of the paper: alpha applies only inside the configured range.
            is_within_range = (
                config.high_frequency_weight_update_timestep_range[0]
                < config.current_timestep_callback()
                < config.high_frequency_weight_update_timestep_range[1]
            )
            return config.alpha_high_frequency if is_within_range else 1.0

        config.high_frequency_weight_callback = high_frequency_weight_callback

    supported_tensor_formats = ["BCFHW", "BFCHW", "BCHW"]  # TODO(aryan): Support BSC for LTX Video
    if config.tensor_format not in supported_tensor_formats:
        raise ValueError(f"`tensor_format` must be one of {supported_tensor_formats}, but got {config.tensor_format}.")

    _apply_faster_cache_on_denoiser(module, config)

    # Hook every attention module whose name matches the known transformer-block patterns.
    for name, submodule in module.named_modules():
        if not isinstance(submodule, _ATTENTION_CLASSES):
            continue
        if any(re.search(identifier, name) is not None for identifier in _TRANSFORMER_BLOCK_IDENTIFIERS):
            _apply_faster_cache_on_attention_class(name, submodule, config)
+
576
+
577
def _apply_faster_cache_on_denoiser(module: torch.nn.Module, config: FasterCacheConfig) -> None:
    """Register the denoiser-level FasterCache (CFG-Cache) hook on the top-level module."""
    hook = FasterCacheDenoiserHook(
        unconditional_batch_skip_range=config.unconditional_batch_skip_range,
        unconditional_batch_timestep_skip_range=config.unconditional_batch_timestep_skip_range,
        tensor_format=config.tensor_format,
        is_guidance_distilled=config.is_guidance_distilled,
        uncond_cond_input_kwargs_identifiers=config._unconditional_conditional_input_kwargs_identifiers,
        current_timestep_callback=config.current_timestep_callback,
        low_frequency_weight_callback=config.low_frequency_weight_callback,
        high_frequency_weight_callback=config.high_frequency_weight_callback,
    )
    HookRegistry.check_if_exists_or_initialize(module).register_hook(hook, _FASTER_CACHE_DENOISER_HOOK)
+
591
+
592
def _apply_faster_cache_on_attention_class(name: str, module: AttentionModuleMixin, config: FasterCacheConfig) -> None:
    """Register a `FasterCacheBlockHook` on `module` if its name matches the configured
    spatial or temporal self-attention identifiers; otherwise log and do nothing.

    Args:
        name: Fully-qualified submodule name, used for regex matching against the identifiers.
        module: The attention module to hook.
        config: The FasterCache configuration providing identifiers, skip ranges and callbacks.
    """
    is_spatial_self_attention = (
        any(re.search(identifier, name) is not None for identifier in config.spatial_attention_block_identifiers)
        and config.spatial_attention_block_skip_range is not None
        and not getattr(module, "is_cross_attention", False)
    )
    # Fix: guard with getattr(..., False) here as well (the spatial branch above already
    # does), so attention classes that do not expose `is_cross_attention` no longer raise
    # AttributeError when matched by a temporal identifier.
    is_temporal_self_attention = (
        any(re.search(identifier, name) is not None for identifier in config.temporal_attention_block_identifiers)
        and config.temporal_attention_block_skip_range is not None
        and not getattr(module, "is_cross_attention", False)
    )

    block_skip_range, timestep_skip_range, block_type = None, None, None
    if is_spatial_self_attention:
        block_skip_range = config.spatial_attention_block_skip_range
        timestep_skip_range = config.spatial_attention_timestep_skip_range
        block_type = "spatial"
    elif is_temporal_self_attention:
        block_skip_range = config.temporal_attention_block_skip_range
        timestep_skip_range = config.temporal_attention_timestep_skip_range
        block_type = "temporal"

    if block_skip_range is None or timestep_skip_range is None:
        # Not a recognized self-attention layer for FasterCache — leave it untouched.
        logger.debug(
            f'Unable to apply FasterCache to the selected layer: "{name}" because it does '
            f"not match any of the required criteria for spatial or temporal attention layers. Note, "
            f"however, that this layer may still be valid for applying PAB. Please specify the correct "
            f"block identifiers in the configuration or use the specialized `apply_faster_cache_on_module` "
            f"function to apply FasterCache to this layer."
        )
        return

    logger.debug(f"Enabling FasterCache ({block_type}) for layer: {name}")
    hook = FasterCacheBlockHook(
        block_skip_range,
        timestep_skip_range,
        config.is_guidance_distilled,
        config.attention_weight_callback,
        config.current_timestep_callback,
    )
    registry = HookRegistry.check_if_exists_or_initialize(module)
    registry.register_hook(hook, _FASTER_CACHE_BLOCK_HOOK)
+
635
+
636
# Reference: https://github.com/Vchitect/FasterCache/blob/fab32c15014636dc854948319c0a9a8d92c7acb4/scripts/latte/faster_cache_sample_latte.py#L127C1-L143C39
@torch.no_grad()
def _split_low_high_freq(x):
    """Split `x` (..., H, W) into low- and high-frequency spectral components.

    Returns `(low_freq_fft, high_freq_fft)`: the centered (fftshift-ed) 2D FFT of `x`,
    masked by a circular low-pass filter of radius `min(H, W) // 5` and its complement.
    The two components sum to the full shifted spectrum.
    """
    fft = torch.fft.fft2(x)
    fft_shifted = torch.fft.fftshift(fft)
    height, width = x.shape[-2:]
    radius = min(height, width) // 5

    # Fix: pass `indexing="ij"` explicitly. It is the historical default (rows vary
    # along dim 0), so behavior is unchanged, but omitting it emits a deprecation
    # warning on recent torch versions.
    y_grid, x_grid = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
    center_x, center_y = width // 2, height // 2
    mask = (x_grid - center_x) ** 2 + (y_grid - center_y) ** 2 <= radius**2

    # Broadcast the (H, W) mask over the two leading dims, on the input's device.
    low_freq_mask = mask.unsqueeze(0).unsqueeze(0).to(x.device)
    high_freq_mask = ~low_freq_mask

    low_freq_fft = fft_shifted * low_freq_mask
    high_freq_fft = fft_shifted * high_freq_mask

    return low_freq_fft, high_freq_fft
+ return low_freq_fft, high_freq_fft
pythonProject/.venv/Lib/site-packages/diffusers/hooks/first_block_cache.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Tuple, Union
17
+
18
+ import torch
19
+
20
+ from ..utils import get_logger
21
+ from ..utils.torch_utils import unwrap_module
22
+ from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS
23
+ from ._helpers import TransformerBlockRegistry
24
+ from .hooks import BaseState, HookRegistry, ModelHook, StateManager
25
+
26
+
27
+ logger = get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+ _FBC_LEADER_BLOCK_HOOK = "fbc_leader_block_hook"
30
+ _FBC_BLOCK_HOOK = "fbc_block_hook"
31
+
32
+
33
@dataclass
class FirstBlockCacheConfig:
    r"""
    Configuration for [First Block
    Cache](https://github.com/chengzeyi/ParaAttention/blob/7a266123671b55e7e5a2fe9af3121f07a36afc78/README.md#first-block-cache-our-dynamic-caching).

    Args:
        threshold (`float`, defaults to `0.05`):
            Cutoff used to decide whether the full stack of transformer blocks must be run. The residual produced by
            the first transformer block is compared (via relative absmean difference) against the residual cached from
            the previous step; when the difference stays below this value the remaining blocks are skipped and cached
            residuals are reused instead. Larger values skip more often (faster inference, potentially lower quality);
            very small values rarely skip and give little speedup.
    """

    threshold: float = 0.05
50
+
51
+
52
class FBCSharedBlockState(BaseState):
    """State shared between the head-block hook and all subsequent block hooks of one model."""

    def __init__(self) -> None:
        super().__init__()

        # Output of the first transformer block captured on the last full pass
        # (hidden states first, encoder hidden states second when present).
        self.head_block_output: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
        # Residual (output - input) of the first block from the last full pass,
        # used as the comparison baseline for the skip decision.
        self.head_block_residual: torch.Tensor = None
        # Residuals of the final outputs w.r.t. the first block's outputs, cached by the
        # tail block and added back when the remaining blocks are skipped.
        self.tail_block_residuals: Union[torch.Tensor, Tuple[torch.Tensor, ...]] = None
        # Decision made by the head hook for the current step; read by every FBCBlockHook.
        self.should_compute: bool = True

    def reset(self):
        # Note: head_block_output/head_block_residual are intentionally kept so the very
        # first step after a reset still has a baseline only if recomputed; the skip
        # decision is forced back to "compute" here.
        self.tail_block_residuals = None
        self.should_compute = True
65
+
66
class FBCHeadBlockHook(ModelHook):
    """Hook attached to the first transformer block.

    On each forward it computes the residual of the block's output w.r.t. its input and compares
    it to the residual cached from the previous step. If the relative change falls below the
    threshold, the remaining blocks are skipped and the cached tail residuals are applied to this
    block's output instead of running the rest of the model.
    """

    _is_stateful = True

    def __init__(self, state_manager: StateManager, threshold: float):
        # state_manager: yields the FBCSharedBlockState shared with the other block hooks.
        # threshold: relative absmean-difference below which the remaining blocks are skipped.
        self.state_manager = state_manager
        self.threshold = threshold
        self._metadata = None

    def initialize_hook(self, module):
        # Resolve output-layout metadata for the wrapped transformer block class; done lazily
        # because the module class is only known once the hook is attached.
        unwrapped_module = unwrap_module(module)
        self._metadata = TransformerBlockRegistry.get(unwrapped_module.__class__)
        return module

    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
        original_hidden_states = self._metadata._get_parameter_from_args_kwargs("hidden_states", args, kwargs)

        output = self.fn_ref.original_forward(*args, **kwargs)
        is_output_tuple = isinstance(output, tuple)

        # Residual of the first block: its output minus its input hidden states.
        if is_output_tuple:
            hidden_states_residual = output[self._metadata.return_hidden_states_index] - original_hidden_states
        else:
            hidden_states_residual = output - original_hidden_states

        shared_state: FBCSharedBlockState = self.state_manager.get_state()
        hidden_states = encoder_hidden_states = None
        should_compute = self._should_compute_remaining_blocks(hidden_states_residual)
        shared_state.should_compute = should_compute

        if not should_compute:
            # Apply caching: add the tail residuals captured on the last full pass to this
            # block's output, approximating a full forward through all remaining blocks.
            if is_output_tuple:
                hidden_states = (
                    shared_state.tail_block_residuals[0] + output[self._metadata.return_hidden_states_index]
                )
            else:
                hidden_states = shared_state.tail_block_residuals[0] + output

            if self._metadata.return_encoder_hidden_states_index is not None:
                # Blocks that return encoder hidden states always return a tuple.
                assert is_output_tuple
                encoder_hidden_states = (
                    shared_state.tail_block_residuals[1] + output[self._metadata.return_encoder_hidden_states_index]
                )

            # Re-pack the outputs in the same layout the wrapped block uses.
            if is_output_tuple:
                return_output = [None] * len(output)
                return_output[self._metadata.return_hidden_states_index] = hidden_states
                return_output[self._metadata.return_encoder_hidden_states_index] = encoder_hidden_states
                return_output = tuple(return_output)
            else:
                return_output = hidden_states
            output = return_output
        else:
            # Full pass: cache this block's output (hidden states at index 0, encoder hidden
            # states at index 1) and its residual as the baseline for the next step's decision.
            if is_output_tuple:
                head_block_output = [None] * len(output)
                head_block_output[0] = output[self._metadata.return_hidden_states_index]
                head_block_output[1] = output[self._metadata.return_encoder_hidden_states_index]
            else:
                head_block_output = output
            shared_state.head_block_output = head_block_output
            shared_state.head_block_residual = hidden_states_residual

        return output

    def reset_state(self, module):
        # Clear the shared caching state (e.g. between generations).
        self.state_manager.reset()
        return module

    @torch.compiler.disable
    def _should_compute_remaining_blocks(self, hidden_states_residual: torch.Tensor) -> bool:
        """Return True when the remaining transformer blocks must be executed."""
        shared_state = self.state_manager.get_state()
        # First step (nothing cached yet) always requires a full forward pass.
        if shared_state.head_block_residual is None:
            return True
        # Relative absmean difference between the current and previous first-block residuals.
        prev_hidden_states_residual = shared_state.head_block_residual
        absmean = (hidden_states_residual - prev_hidden_states_residual).abs().mean()
        prev_hidden_states_absmean = prev_hidden_states_residual.abs().mean()
        diff = (absmean / prev_hidden_states_absmean).item()
        return diff > self.threshold
144
+
145
+
146
class FBCBlockHook(ModelHook):
    """Hook attached to every transformer block after the first.

    When the shared state requests a full pass, the wrapped block runs normally and, on the tail
    block, the residuals of the final outputs w.r.t. the first block's cached outputs are stored.
    When the head hook decided to skip, the block's inputs are returned unchanged.
    """

    def __init__(self, state_manager: StateManager, is_tail: bool = False):
        super().__init__()
        self.state_manager = state_manager
        # True only for the last transformer block, which records the tail residuals.
        self.is_tail = is_tail
        self._metadata = None

    def initialize_hook(self, module):
        # Resolve output-layout metadata for the wrapped transformer block class.
        unwrapped_module = unwrap_module(module)
        self._metadata = TransformerBlockRegistry.get(unwrapped_module.__class__)
        return module

    def new_forward(self, module: torch.nn.Module, *args, **kwargs):
        original_hidden_states = self._metadata._get_parameter_from_args_kwargs("hidden_states", args, kwargs)
        original_encoder_hidden_states = None
        if self._metadata.return_encoder_hidden_states_index is not None:
            original_encoder_hidden_states = self._metadata._get_parameter_from_args_kwargs(
                "encoder_hidden_states", args, kwargs
            )

        shared_state = self.state_manager.get_state()

        if shared_state.should_compute:
            output = self.fn_ref.original_forward(*args, **kwargs)
            if self.is_tail:
                # Cache residuals of the final outputs relative to the first block's outputs;
                # the head hook adds these back whenever it decides to skip the remaining blocks.
                hidden_states_residual = encoder_hidden_states_residual = None
                if isinstance(output, tuple):
                    hidden_states_residual = (
                        output[self._metadata.return_hidden_states_index] - shared_state.head_block_output[0]
                    )
                    encoder_hidden_states_residual = (
                        output[self._metadata.return_encoder_hidden_states_index] - shared_state.head_block_output[1]
                    )
                else:
                    hidden_states_residual = output - shared_state.head_block_output
                shared_state.tail_block_residuals = (hidden_states_residual, encoder_hidden_states_residual)
            return output

        # Skipped step: pass the inputs straight through, matching the block's output layout.
        if original_encoder_hidden_states is None:
            return_output = original_hidden_states
        else:
            return_output = [None, None]
            return_output[self._metadata.return_hidden_states_index] = original_hidden_states
            return_output[self._metadata.return_encoder_hidden_states_index] = original_encoder_hidden_states
            return_output = tuple(return_output)
        return return_output
192
+
193
+
194
def apply_first_block_cache(module: torch.nn.Module, config: FirstBlockCacheConfig) -> None:
    """
    Applies [First Block
    Cache](https://github.com/chengzeyi/ParaAttention/blob/4de137c5b96416489f06e43e19f2c14a772e28fd/README.md#first-block-cache-our-dynamic-caching)
    to a given module.

    First Block Cache builds on the ideas of [TeaCache](https://huggingface.co/papers/2411.19108). It is much simpler
    to implement generically for a wide range of models and has been integrated first for experimental purposes.

    Args:
        module (`torch.nn.Module`):
            The pytorch module to apply FBCache to. Typically, this should be a transformer architecture supported in
            Diffusers, such as `CogVideoXTransformer3DModel`, but external implementations may also work.
        config (`FirstBlockCacheConfig`):
            The configuration to use for applying the FBCache method.

    Raises:
        ValueError: If fewer than two transformer blocks are discovered on the module, since FBCache
            needs a distinct head block (decision maker) and tail block (residual recorder).

    Example:
    ```python
    >>> import torch
    >>> from diffusers import CogView4Pipeline
    >>> from diffusers.hooks import apply_first_block_cache, FirstBlockCacheConfig

    >>> pipe = CogView4Pipeline.from_pretrained("THUDM/CogView4-6B", torch_dtype=torch.bfloat16)
    >>> pipe.to("cuda")

    >>> apply_first_block_cache(pipe.transformer, FirstBlockCacheConfig(threshold=0.2))

    >>> prompt = "A photo of an astronaut riding a horse on mars"
    >>> image = pipe(prompt, generator=torch.Generator().manual_seed(42)).images[0]
    >>> image.save("output.png")
    ```
    """

    state_manager = StateManager(FBCSharedBlockState, (), {})
    remaining_blocks = []

    # Collect all transformer blocks from recognized ModuleList attributes, in order.
    for name, submodule in module.named_children():
        if name not in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS or not isinstance(submodule, torch.nn.ModuleList):
            continue
        for index, block in enumerate(submodule):
            remaining_blocks.append((f"{name}.{index}", block))

    # FBCache needs at least a distinct head and tail block. Without this guard, the pops
    # below would fail with an opaque IndexError on unsupported modules.
    if len(remaining_blocks) < 2:
        raise ValueError(
            "First Block Cache requires the module to expose at least two transformer blocks via a "
            f"recognized `torch.nn.ModuleList` attribute, but found {len(remaining_blocks)}. Please "
            "verify that the module is a supported transformer architecture."
        )

    head_block_name, head_block = remaining_blocks.pop(0)
    tail_block_name, tail_block = remaining_blocks.pop(-1)

    logger.debug(f"Applying FBCHeadBlockHook to '{head_block_name}'")
    _apply_fbc_head_block_hook(head_block, state_manager, config.threshold)

    for name, block in remaining_blocks:
        logger.debug(f"Applying FBCBlockHook to '{name}'")
        _apply_fbc_block_hook(block, state_manager)

    logger.debug(f"Applying FBCBlockHook to tail block '{tail_block_name}'")
    _apply_fbc_block_hook(tail_block, state_manager, is_tail=True)
248
+
249
+
250
def _apply_fbc_head_block_hook(block: torch.nn.Module, state_manager: StateManager, threshold: float) -> None:
    # Register the hook that decides, per step, whether the remaining blocks must run.
    hook = FBCHeadBlockHook(state_manager, threshold)
    HookRegistry.check_if_exists_or_initialize(block).register_hook(hook, _FBC_LEADER_BLOCK_HOOK)
254
+
255
+
256
def _apply_fbc_block_hook(block: torch.nn.Module, state_manager: StateManager, is_tail: bool = False) -> None:
    # Register the pass-through/skip hook on a non-head block (tail blocks also record residuals).
    hook = FBCBlockHook(state_manager, is_tail)
    HookRegistry.check_if_exists_or_initialize(block).register_hook(hook, _FBC_BLOCK_HOOK)
pythonProject/.venv/Lib/site-packages/diffusers/hooks/group_offloading.py ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import hashlib
16
+ import os
17
+ from contextlib import contextmanager, nullcontext
18
+ from dataclasses import dataclass
19
+ from enum import Enum
20
+ from typing import Dict, List, Optional, Set, Tuple, Union
21
+
22
+ import safetensors.torch
23
+ import torch
24
+
25
+ from ..utils import get_logger, is_accelerate_available
26
+ from ._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
27
+ from .hooks import HookRegistry, ModelHook
28
+
29
+
30
+ if is_accelerate_available():
31
+ from accelerate.hooks import AlignDevicesHook, CpuOffload
32
+ from accelerate.utils import send_to_device
33
+
34
+
35
+ logger = get_logger(__name__) # pylint: disable=invalid-name
36
+
37
+
38
+ # fmt: off
39
+ _GROUP_OFFLOADING = "group_offloading"
40
+ _LAYER_EXECUTION_TRACKER = "layer_execution_tracker"
41
+ _LAZY_PREFETCH_GROUP_OFFLOADING = "lazy_prefetch_group_offloading"
42
+ _GROUP_ID_LAZY_LEAF = "lazy_leafs"
43
+ # fmt: on
44
+
45
+
46
class GroupOffloadingType(str, Enum):
    """Granularity at which module groups are formed for offloading.

    Inherits from `str` so plain strings ("block_level"/"leaf_level") compare equal to members.
    """

    BLOCK_LEVEL = "block_level"
    LEAF_LEVEL = "leaf_level"
49
+
50
+
51
@dataclass
class GroupOffloadingConfig:
    """Options controlling how module groups are offloaded/onloaded."""

    # Device the groups are moved to for computation (typically the accelerator).
    onload_device: torch.device
    # Device (typically CPU) the groups are stored on between uses.
    offload_device: torch.device
    # Whether groups are formed per block list ("block_level") or per leaf module ("leaf_level").
    offload_type: GroupOffloadingType
    # Use non-blocking device transfers (implied when a stream is used).
    non_blocking: bool
    # Use Tensor.record_stream instead of stream synchronization for async transfers.
    record_stream: bool
    # Skip pinning CPU copies to reduce host memory usage (slower transfers).
    low_cpu_mem_usage: bool
    # Number of consecutive blocks per group; only meaningful for block_level offloading.
    num_blocks_per_group: Optional[int] = None
    # When set, group tensors are serialized to safetensors files under this path
    # instead of being kept in CPU RAM.
    offload_to_disk_path: Optional[str] = None
    # Side stream used to prefetch the next group while the current one computes.
    stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None
62
+
63
+
64
class ModuleGroup:
    """A set of modules/parameters/buffers that are onloaded and offloaded as one unit.

    Between uses, the group's tensors live either in CPU memory (optionally pinned) or, when
    `offload_to_disk_path` is set, in a safetensors file on disk. For computation they are moved
    to `onload_device`, optionally on a side `stream` so the transfer overlaps with compute.
    """

    def __init__(
        self,
        modules: List[torch.nn.Module],
        offload_device: torch.device,
        onload_device: torch.device,
        offload_leader: torch.nn.Module,
        onload_leader: Optional[torch.nn.Module] = None,
        parameters: Optional[List[torch.nn.Parameter]] = None,
        buffers: Optional[List[torch.Tensor]] = None,
        non_blocking: bool = False,
        stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
        record_stream: Optional[bool] = False,
        low_cpu_mem_usage: bool = False,
        onload_self: bool = True,
        offload_to_disk_path: Optional[str] = None,
        group_id: Optional[int] = None,
    ) -> None:
        self.modules = modules
        self.offload_device = offload_device
        self.onload_device = onload_device
        # Module whose post_forward triggers offloading of the whole group.
        self.offload_leader = offload_leader
        # Module whose pre_forward triggers onloading; may be discovered lazily (first caller).
        self.onload_leader = onload_leader
        # Standalone parameters/buffers belonging to the group but not owned by `modules`.
        self.parameters = parameters or []
        self.buffers = buffers or []
        # Streamed transfers are always non-blocking, regardless of the flag passed in.
        self.non_blocking = non_blocking or stream is not None
        self.stream = stream
        self.record_stream = record_stream
        self.onload_self = onload_self
        self.low_cpu_mem_usage = low_cpu_mem_usage

        self.offload_to_disk_path = offload_to_disk_path
        self._is_offloaded_to_disk = False

        if self.offload_to_disk_path is not None:
            # Instead of `group_id or str(id(self))` we do this because `group_id` can be "" as well.
            self.group_id = group_id if group_id is not None else str(id(self))
            # Hash the group id into a short, filesystem-safe safetensors filename.
            short_hash = _compute_group_hash(self.group_id)
            self.safetensors_file_path = os.path.join(self.offload_to_disk_path, f"group_{short_hash}.safetensors")

            # Build a stable tensor <-> key mapping for (de)serialization.
            all_tensors = []
            for module in self.modules:
                all_tensors.extend(list(module.parameters()))
                all_tensors.extend(list(module.buffers()))
            all_tensors.extend(self.parameters)
            all_tensors.extend(self.buffers)
            all_tensors = list(dict.fromkeys(all_tensors))  # Remove duplicates

            self.tensor_to_key = {tensor: f"tensor_{i}" for i, tensor in enumerate(all_tensors)}
            self.key_to_tensor = {v: k for k, v in self.tensor_to_key.items()}
            self.cpu_param_dict = {}
        else:
            self.cpu_param_dict = self._init_cpu_param_dict()

        # Module namespace (torch.cuda, torch.xpu, ...) used for stream operations; fall back
        # to torch.cuda on torch versions without the accelerator API.
        self._torch_accelerator_module = (
            getattr(torch, torch.accelerator.current_accelerator().type)
            if hasattr(torch, "accelerator")
            else torch.cuda
        )

    def _init_cpu_param_dict(self):
        """Snapshot every tensor's data on CPU (pinned unless low_cpu_mem_usage) for reuse on offload."""
        cpu_param_dict = {}
        # Without a stream there is no async transfer, so no CPU copies are needed up front.
        if self.stream is None:
            return cpu_param_dict

        for module in self.modules:
            for param in module.parameters():
                cpu_param_dict[param] = param.data.cpu() if self.low_cpu_mem_usage else param.data.cpu().pin_memory()
            for buffer in module.buffers():
                cpu_param_dict[buffer] = (
                    buffer.data.cpu() if self.low_cpu_mem_usage else buffer.data.cpu().pin_memory()
                )

        for param in self.parameters:
            cpu_param_dict[param] = param.data.cpu() if self.low_cpu_mem_usage else param.data.cpu().pin_memory()

        for buffer in self.buffers:
            cpu_param_dict[buffer] = buffer.data.cpu() if self.low_cpu_mem_usage else buffer.data.cpu().pin_memory()

        return cpu_param_dict

    @contextmanager
    def _pinned_memory_tensors(self):
        """Yield a map of CPU tensors pinned on demand (used in the low_cpu_mem_usage path)."""
        try:
            pinned_dict = {
                param: tensor.pin_memory() if not tensor.is_pinned() else tensor
                for param, tensor in self.cpu_param_dict.items()
            }
            yield pinned_dict
        finally:
            # Drop the reference so on-demand pinned copies can be freed promptly.
            pinned_dict = None

    def _transfer_tensor_to_device(self, tensor, source_tensor):
        """Repoint `tensor`'s storage to a copy of `source_tensor` on the onload device."""
        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
        if self.record_stream:
            # Tell the allocator this tensor is used on the current stream so its memory is
            # not reclaimed while the async copy is still in flight.
            tensor.data.record_stream(self._torch_accelerator_module.current_stream())

    def _process_tensors_from_modules(self, pinned_memory=None):
        """Move every tensor in the group to the onload device, optionally from pinned copies."""
        for group_module in self.modules:
            for param in group_module.parameters():
                source = pinned_memory[param] if pinned_memory else param.data
                self._transfer_tensor_to_device(param, source)
            for buffer in group_module.buffers():
                source = pinned_memory[buffer] if pinned_memory else buffer.data
                self._transfer_tensor_to_device(buffer, source)

        for param in self.parameters:
            source = pinned_memory[param] if pinned_memory else param.data
            self._transfer_tensor_to_device(param, source)

        for buffer in self.buffers:
            source = pinned_memory[buffer] if pinned_memory else buffer.data
            self._transfer_tensor_to_device(buffer, source)

    def _onload_from_disk(self):
        """Load the group's tensors from its safetensors file onto the onload device."""
        if self.stream is not None:
            # Wait for previous Host->Device transfer to complete
            self.stream.synchronize()

        context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
        current_stream = self._torch_accelerator_module.current_stream() if self.record_stream else None

        with context:
            # Load to CPU (if using streams) or directly to target device, pin, and async copy to device
            device = str(self.onload_device) if self.stream is None else "cpu"
            loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=device)

            if self.stream is not None:
                for key, tensor_obj in self.key_to_tensor.items():
                    pinned_tensor = loaded_tensors[key].pin_memory()
                    tensor_obj.data = pinned_tensor.to(self.onload_device, non_blocking=self.non_blocking)
                    if self.record_stream:
                        tensor_obj.data.record_stream(current_stream)
            else:
                onload_device = (
                    self.onload_device.type if isinstance(self.onload_device, torch.device) else self.onload_device
                )
                loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=onload_device)
                for key, tensor_obj in self.key_to_tensor.items():
                    tensor_obj.data = loaded_tensors[key]

    def _onload_from_memory(self):
        """Move the group's tensors from CPU memory onto the onload device."""
        if self.stream is not None:
            # Wait for previous Host->Device transfer to complete
            self.stream.synchronize()

        context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream)
        with context:
            if self.stream is not None:
                with self._pinned_memory_tensors() as pinned_memory:
                    self._process_tensors_from_modules(pinned_memory)
            else:
                self._process_tensors_from_modules(None)

    def _offload_to_disk(self):
        """Serialize the group's tensors to disk (once) and free their RAM copies."""
        # TODO: we can potentially optimize this code path by checking if the _all_ the desired
        # safetensor files exist on the disk and if so, skip this step entirely, reducing IO
        # overhead. Currently, we just check if the given `safetensors_file_path` exists and if not
        # we perform a write.
        # Check if the file has been saved in this session or if it already exists on disk.
        if not self._is_offloaded_to_disk and not os.path.exists(self.safetensors_file_path):
            os.makedirs(os.path.dirname(self.safetensors_file_path), exist_ok=True)
            tensors_to_save = {key: tensor.data.to(self.offload_device) for tensor, key in self.tensor_to_key.items()}
            safetensors.torch.save_file(tensors_to_save, self.safetensors_file_path)

        # The group is now considered offloaded to disk for the rest of the session.
        self._is_offloaded_to_disk = True

        # We do this to free up the RAM which is still holding on to the tensor data.
        for tensor_obj in self.tensor_to_key.keys():
            tensor_obj.data = torch.empty_like(tensor_obj.data, device=self.offload_device)

    def _offload_to_memory(self):
        """Move the group's tensors back to the offload device (reusing CPU copies if streamed)."""
        if self.stream is not None:
            if not self.record_stream:
                # Ensure in-flight compute on the current stream finished before we repoint
                # tensor storage back to the CPU copies.
                self._torch_accelerator_module.current_stream().synchronize()

            for group_module in self.modules:
                for param in group_module.parameters():
                    param.data = self.cpu_param_dict[param]
            for param in self.parameters:
                param.data = self.cpu_param_dict[param]
            for buffer in self.buffers:
                buffer.data = self.cpu_param_dict[buffer]
        else:
            for group_module in self.modules:
                group_module.to(self.offload_device, non_blocking=False)
            for param in self.parameters:
                param.data = param.data.to(self.offload_device, non_blocking=False)
            for buffer in self.buffers:
                buffer.data = buffer.data.to(self.offload_device, non_blocking=False)

    @torch.compiler.disable()
    def onload_(self):
        r"""Onloads the group of parameters to the onload_device."""
        if self.offload_to_disk_path is not None:
            self._onload_from_disk()
        else:
            self._onload_from_memory()

    @torch.compiler.disable()
    def offload_(self):
        r"""Offloads the group of parameters to the offload_device."""
        if self.offload_to_disk_path:
            self._offload_to_disk()
        else:
            self._offload_to_memory()
271
+
272
+
273
class GroupOffloadingHook(ModelHook):
    r"""
    A hook that offloads groups of torch.nn.Module to the CPU for storage and onloads to accelerator device for
    computation. Each group has one "onload leader" module that is responsible for onloading, and an "offload leader"
    module that is responsible for offloading. If prefetching is enabled, the onload leader of the previous module
    group is responsible for onloading the current module group.
    """

    _is_stateful = False

    def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None:
        self.group = group
        # Group to prefetch while this one computes; assigned later by the lazy-prefetch hook.
        self.next_group: Optional[ModuleGroup] = None
        self.config = config

    def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
        # Start with the group offloaded so the model fits on the offload device at rest.
        if self.group.offload_leader == module:
            self.group.offload_()
        return module

    def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
        # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
        # method is the onload_leader of the group.
        if self.group.onload_leader is None:
            self.group.onload_leader = module

        # If the current module is the onload_leader of the group, we onload the group if it is supposed
        # to onload itself. In the case of using prefetching with streams, we onload the next group if
        # it is not supposed to onload itself.
        if self.group.onload_leader == module:
            if self.group.onload_self:
                self.group.onload_()

            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
            if should_onload_next_group:
                self.next_group.onload_()

            should_synchronize = (
                not self.group.onload_self and self.group.stream is not None and not should_onload_next_group
            )
            if should_synchronize:
                # If this group didn't onload itself, it means it was asynchronously onloaded by the
                # previous group. We need to synchronize the side stream to ensure parameters
                # are completely loaded to proceed with forward pass. Without this, uninitialized
                # weights will be used in the computation, leading to incorrect results
                # Also, we should only do this synchronization if we don't already do it from the sync call in
                # self.next_group.onload_, hence the `not should_onload_next_group` check.
                self.group.stream.synchronize()

        # Move the inputs alongside the parameters so the forward runs entirely on onload_device.
        args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)
        kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)
        return args, kwargs

    def post_forward(self, module: torch.nn.Module, output):
        # Offload as soon as the group's designated offload leader finishes its forward.
        if self.group.offload_leader == module:
            self.group.offload_()
        return output
330
+
331
+
332
class LazyPrefetchGroupOffloadingHook(ModelHook):
    r"""
    A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
    This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer
    invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows
    prefetching groups in the correct order.
    """

    _is_stateful = False

    def __init__(self):
        # (name, submodule) pairs in the order their forwards ran during the tracing pass.
        self.execution_order: List[Tuple[str, torch.nn.Module]] = []
        # Names of all submodules that were given a tracker hook, for completeness checking.
        self._layer_execution_tracker_module_names = set()

    def initialize_hook(self, module):
        def make_execution_order_update_callback(current_name, current_submodule):
            # Bind name/submodule per layer; the callback appends to the shared execution order.
            def callback():
                if not torch.compiler.is_compiling():
                    logger.debug(f"Adding {current_name} to the execution order")
                self.execution_order.append((current_name, current_submodule))

            return callback

        # To every submodule that contains a group offloading hook (at this point, no prefetching is enabled for any
        # of the groups), we add a layer execution tracker hook that will be used to determine the order in which the
        # layers are executed during the forward pass.
        for name, submodule in module.named_modules():
            if name == "" or not hasattr(submodule, "_diffusers_hook"):
                continue

            registry = HookRegistry.check_if_exists_or_initialize(submodule)
            group_offloading_hook = registry.get_hook(_GROUP_OFFLOADING)

            if group_offloading_hook is not None:
                # For the first forward pass, we have to load in a blocking manner
                group_offloading_hook.group.non_blocking = False
                layer_tracker_hook = LayerExecutionTrackerHook(make_execution_order_update_callback(name, submodule))
                registry.register_hook(layer_tracker_hook, _LAYER_EXECUTION_TRACKER)
                self._layer_execution_tracker_module_names.add(name)

        return module

    def post_forward(self, module, output):
        # At this point, for the current modules' submodules, we know the execution order of the layers. We can now
        # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each
        # group offloading hook.
        num_executed = len(self.execution_order)
        execution_order_module_names = {name for name, _ in self.execution_order}

        # It may be possible that some layers were not executed during the forward pass. This can happen if the layer
        # is not used in the forward pass, or if the layer is not executed due to some other reason. In such cases, we
        # may not be able to apply prefetching in the correct order, which can lead to device-mismatch related errors
        # if the missing layers end up being executed in the future.
        if execution_order_module_names != self._layer_execution_tracker_module_names:
            unexecuted_layers = list(self._layer_execution_tracker_module_names - execution_order_module_names)
            if not torch.compiler.is_compiling():
                logger.warning(
                    "It seems like some layers were not executed during the forward pass. This may lead to problems when "
                    "applying lazy prefetching with automatic tracing and lead to device-mismatch related errors. Please "
                    "make sure that all layers are executed during the forward pass. The following layers were not executed:\n"
                    f"{unexecuted_layers=}"
                )

        # Remove the layer execution tracker hooks from the submodules
        base_module_registry = module._diffusers_hook
        registries = [submodule._diffusers_hook for _, submodule in self.execution_order]
        group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries]

        for i in range(num_executed):
            registries[i].remove_hook(_LAYER_EXECUTION_TRACKER, recurse=False)

        # Remove the current lazy prefetch group offloading hook so that it doesn't interfere with the next forward pass
        base_module_registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=False)

        # LazyPrefetchGroupOffloadingHook is only used with streams, so we know that non_blocking should be True.
        # We disable non_blocking for the first forward pass, but need to enable it for the subsequent passes to
        # see the benefits of prefetching.
        for hook in group_offloading_hooks:
            hook.group.non_blocking = True

        # Set required attributes for prefetching
        if num_executed > 0:
            # The base module's own group prefetches the first-executed layer's group.
            base_module_group_offloading_hook = base_module_registry.get_hook(_GROUP_OFFLOADING)
            base_module_group_offloading_hook.next_group = group_offloading_hooks[0].group
            base_module_group_offloading_hook.next_group.onload_self = False

        # Chain each executed layer's group to the next one in execution order.
        for i in range(num_executed - 1):
            name1, _ = self.execution_order[i]
            name2, _ = self.execution_order[i + 1]
            if not torch.compiler.is_compiling():
                logger.debug(f"Applying lazy prefetch group offloading from {name1} to {name2}")
            group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group
            group_offloading_hooks[i].next_group.onload_self = False

        return output
427
+
428
+
429
class LayerExecutionTrackerHook(ModelHook):
    r"""
    Hook that records the order in which layers run. On every pre-forward invocation it fires a callback supplied
    by the LazyPrefetchGroupOffloadingHook, which uses these notifications to reconstruct the execution order of
    the layers.
    """

    _is_stateful = False

    def __init__(self, execution_order_update_callback):
        # Callback provided by LazyPrefetchGroupOffloadingHook; invoked once each time this layer is entered.
        self.execution_order_update_callback = execution_order_update_callback

    def pre_forward(self, module, *args, **kwargs):
        # Notify the tracker that this module is about to execute, then pass the inputs through untouched.
        notify = self.execution_order_update_callback
        notify()
        return args, kwargs
+
444
+
445
def apply_group_offloading(
    module: torch.nn.Module,
    onload_device: Union[str, torch.device],
    offload_device: Union[str, torch.device] = torch.device("cpu"),
    offload_type: Union[str, GroupOffloadingType] = "block_level",
    num_blocks_per_group: Optional[int] = None,
    non_blocking: bool = False,
    use_stream: bool = False,
    record_stream: bool = False,
    low_cpu_mem_usage: bool = False,
    offload_to_disk_path: Optional[str] = None,
) -> None:
    r"""
    Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
    where it is beneficial, we need to first provide some context on how other supported offloading methods work.

    Typically, offloading is done at two levels:
    - Module-level: In Diffusers, this can be enabled using the `ModelMixin::enable_model_cpu_offload()` method. It
      works by offloading each component of a pipeline to the CPU for storage, and onloading to the accelerator device
      when needed for computation. This method is more memory-efficient than keeping all components on the accelerator,
      but the memory requirements are still quite high. For this method to work, one needs memory equivalent to size of
      the model in runtime dtype + size of largest intermediate activation tensors to be able to complete the forward
      pass.
    - Leaf-level: In Diffusers, this can be enabled using the `ModelMixin::enable_sequential_cpu_offload()` method. It
      works by offloading the lowest leaf-level parameters of the computation graph to the CPU for storage, and
      onloading only the leafs to the accelerator device for computation. This uses the lowest amount of accelerator
      memory, but can be slower due to the excessive number of device synchronizations.

    Group offloading is a middle ground between the two methods. It works by offloading groups of internal layers,
    (either `torch.nn.ModuleList` or `torch.nn.Sequential`). This method uses lower memory than module-level
    offloading. It is also faster than leaf-level/sequential offloading, as the number of device synchronizations is
    reduced.

    Another supported feature (for CUDA devices with support for asynchronous data transfer streams) is the ability to
    overlap data transfer and computation to reduce the overall execution time compared to sequential offloading. This
    is enabled using layer prefetching with streams, i.e., the layer that is to be executed next starts onloading to
    the accelerator device while the current layer is being executed - this increases the memory requirements slightly.
    Note that this implementation also supports leaf-level offloading but can be made much faster when using streams.

    Args:
        module (`torch.nn.Module`):
            The module to which group offloading is applied.
        onload_device (`torch.device`):
            The device to which the group of modules are onloaded.
        offload_device (`torch.device`, defaults to `torch.device("cpu")`):
            The device to which the group of modules are offloaded. This should typically be the CPU. Default is CPU.
        offload_type (`str` or `GroupOffloadingType`, defaults to "block_level"):
            The type of offloading to be applied. Can be one of "block_level" or "leaf_level". Default is
            "block_level".
        offload_to_disk_path (`str`, *optional*, defaults to `None`):
            The path to the directory where parameters will be offloaded. Setting this option can be useful in limited
            RAM environment settings where a reasonable speed-memory trade-off is desired.
        num_blocks_per_group (`int`, *optional*):
            The number of blocks per group when using offload_type="block_level". This is required when using
            offload_type="block_level".
        non_blocking (`bool`, defaults to `False`):
            If True, offloading and onloading is done with non-blocking data transfer.
        use_stream (`bool`, defaults to `False`):
            If True, offloading and onloading is done asynchronously using a CUDA stream. This can be useful for
            overlapping computation and data transfer.
        record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
            as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to the
            [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) more
            details.
        low_cpu_mem_usage (`bool`, defaults to `False`):
            If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This
            option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when
            the CPU memory is a bottleneck but may counteract the benefits of using streams.

    Raises:
        ValueError: if `use_stream=True` but no CUDA/XPU device is available, if `record_stream=True` without
            `use_stream=True`, or if `offload_type="block_level"` without `num_blocks_per_group`.

    Example:
        ```python
        >>> from diffusers import CogVideoXTransformer3DModel
        >>> from diffusers.hooks import apply_group_offloading

        >>> transformer = CogVideoXTransformer3DModel.from_pretrained(
        ...     "THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16
        ... )

        >>> apply_group_offloading(
        ...     transformer,
        ...     onload_device=torch.device("cuda"),
        ...     offload_device=torch.device("cpu"),
        ...     offload_type="block_level",
        ...     num_blocks_per_group=2,
        ...     use_stream=True,
        ... )
        ```
    """

    # Normalize string device specs / string offload types to their canonical objects.
    onload_device = torch.device(onload_device) if isinstance(onload_device, str) else onload_device
    offload_device = torch.device(offload_device) if isinstance(offload_device, str) else offload_device
    offload_type = GroupOffloadingType(offload_type)

    stream = None
    if use_stream:
        if torch.cuda.is_available():
            stream = torch.cuda.Stream()
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            stream = torch.Stream()
        else:
            raise ValueError("Using streams for data transfer requires a CUDA device, or an Intel XPU device.")

    if not use_stream and record_stream:
        raise ValueError("`record_stream` cannot be True when `use_stream=False`.")
    if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None:
        # Fixed: the error message previously had an unbalanced backtick around `offload_type='block_level'`.
        raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'`.")

    # Group offloading is incompatible with Accelerate's own offloading hooks; fail early if present.
    _raise_error_if_accelerate_model_or_sequential_hook_present(module)

    config = GroupOffloadingConfig(
        onload_device=onload_device,
        offload_device=offload_device,
        offload_type=offload_type,
        num_blocks_per_group=num_blocks_per_group,
        non_blocking=non_blocking,
        stream=stream,
        record_stream=record_stream,
        low_cpu_mem_usage=low_cpu_mem_usage,
        offload_to_disk_path=offload_to_disk_path,
    )
    _apply_group_offloading(module, config)
+
567
+
568
def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
    """Dispatch to the offloading strategy selected by ``config.offload_type``.

    Raises:
        ValueError: if ``config.offload_type`` is not a known ``GroupOffloadingType`` member.
    """
    if config.offload_type == GroupOffloadingType.BLOCK_LEVEL:
        _apply_group_offloading_block_level(module, config)
    elif config.offload_type == GroupOffloadingType.LEAF_LEVEL:
        _apply_group_offloading_leaf_level(module, config)
    else:
        # Was `assert False`, which is silently stripped under `python -O`; raise explicitly so an
        # unexpected enum value can never fall through without error.
        raise ValueError(f"Unsupported offload_type: {config.offload_type!r}")
+
576
+
577
def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
    r"""
    This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks. In comparison to
    the "leaf_level" offloading, which is more fine-grained, this offloading is done at the top-level blocks.
    """

    # Stream-based prefetching only supports one block per group; clamp the config with a warning.
    if config.stream is not None and config.num_blocks_per_group != 1:
        logger.warning(
            f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1."
        )
        config.num_blocks_per_group = 1

    # Create module groups for ModuleList and Sequential blocks
    modules_with_group_offloading = set()
    unmatched_modules = []
    matched_module_groups = []
    for name, submodule in module.named_children():
        if not isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
            # Non-container children are handled by the fallback "unmatched" group created below.
            unmatched_modules.append((name, submodule))
            modules_with_group_offloading.add(name)
            continue

        # Chunk the container into groups of `num_blocks_per_group` consecutive blocks.
        for i in range(0, len(submodule), config.num_blocks_per_group):
            current_modules = submodule[i : i + config.num_blocks_per_group]
            group_id = f"{name}_{i}_{i + len(current_modules) - 1}"
            group = ModuleGroup(
                modules=current_modules,
                offload_device=config.offload_device,
                onload_device=config.onload_device,
                offload_to_disk_path=config.offload_to_disk_path,
                offload_leader=current_modules[-1],
                onload_leader=current_modules[0],
                non_blocking=config.non_blocking,
                stream=config.stream,
                record_stream=config.record_stream,
                low_cpu_mem_usage=config.low_cpu_mem_usage,
                onload_self=True,
                group_id=group_id,
            )
            matched_module_groups.append(group)
            for j in range(i, i + len(current_modules)):
                modules_with_group_offloading.add(f"{name}.{j}")

    # Apply group offloading hooks to the module groups.
    # (Previously `for i, group in enumerate(...)` with the index `i` unused.)
    for group in matched_module_groups:
        for group_module in group.modules:
            _apply_group_offloading_hook(group_module, group, config=config)

    # Parameters and Buffers of the top-level module need to be offloaded/onloaded separately
    # when the forward pass of this module is called. This is because the top-level module is not
    # part of any group (as doing so would lead to no VRAM savings).
    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)
    parameters = [param for _, param in parameters]
    buffers = [buffer for _, buffer in buffers]

    # Create a group for the unmatched submodules of the top-level module so that they are on the correct
    # device when the forward pass is called.
    unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules]
    unmatched_group = ModuleGroup(
        modules=unmatched_modules,
        offload_device=config.offload_device,
        onload_device=config.onload_device,
        offload_to_disk_path=config.offload_to_disk_path,
        offload_leader=module,
        onload_leader=module,
        parameters=parameters,
        buffers=buffers,
        non_blocking=False,
        stream=None,
        record_stream=False,
        onload_self=True,
        group_id=f"{module.__class__.__name__}_unmatched_group",
    )
    if config.stream is None:
        _apply_group_offloading_hook(module, unmatched_group, config=config)
    else:
        # With streams, attach the unmatched group lazily so the layer execution order can be traced first.
        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+
656
+
657
def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
    r"""
    This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory
    requirements. However, it can be slower compared to other offloading methods due to the excessive number of device
    synchronizations. When using devices that support streams to overlap data transfer and computation, this method can
    reduce memory usage without any performance degradation.
    """
    # Create module groups for leaf modules and apply group offloading hooks
    modules_with_group_offloading = set()
    for name, submodule in module.named_modules():
        # Only module types listed in _GO_LC_SUPPORTED_PYTORCH_LAYERS get their own single-module group.
        if not isinstance(submodule, _GO_LC_SUPPORTED_PYTORCH_LAYERS):
            continue
        group = ModuleGroup(
            modules=[submodule],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=submodule,
            onload_leader=submodule,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(submodule, group, config=config)
        modules_with_group_offloading.add(name)

    # Parameters and Buffers at all non-leaf levels need to be offloaded/onloaded separately when the forward pass
    # of the module is called
    module_dict = dict(module.named_modules())
    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)

    # Find closest module parent for each parameter and buffer, and attach group hooks
    parent_to_parameters = {}
    for name, param in parameters:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_parameters:
            parent_to_parameters[parent_name].append(param)
        else:
            parent_to_parameters[parent_name] = [param]

    parent_to_buffers = {}
    for name, buffer in buffers:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_buffers:
            parent_to_buffers[parent_name].append(buffer)
        else:
            parent_to_buffers[parent_name] = [buffer]

    # One module-less group per parent: it moves only the stray parameters/buffers owned by that parent.
    # NOTE: `parameters` and `buffers` are deliberately rebound inside this loop; the gathered lists above
    # have already been fully consumed into parent_to_parameters / parent_to_buffers.
    parent_names = set(parent_to_parameters.keys()) | set(parent_to_buffers.keys())
    for name in parent_names:
        parameters = parent_to_parameters.get(name, [])
        buffers = parent_to_buffers.get(name, [])
        parent_module = module_dict[name]
        group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_leader=parent_module,
            onload_leader=parent_module,
            offload_to_disk_path=config.offload_to_disk_path,
            parameters=parameters,
            buffers=buffers,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(parent_module, group, config=config)

    if config.stream is not None:
        # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer
        # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the
        # execution order and apply prefetching in the correct order.
        unmatched_group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=module,
            onload_leader=module,
            parameters=None,
            buffers=None,
            non_blocking=False,
            stream=None,
            record_stream=False,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=_GROUP_ID_LAZY_LEAF,
        )
        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)
+
754
+
755
def _apply_group_offloading_hook(
    module: torch.nn.Module,
    group: ModuleGroup,
    *,
    config: GroupOffloadingConfig,
) -> None:
    """Attach a GroupOffloadingHook for ``group`` to ``module``, unless one is already registered."""
    registry = HookRegistry.check_if_exists_or_initialize(module)

    # A group offloading hook may already be registered if the module was previously picked as the parent
    # of a stray torch.nn.Parameter; in that case keep the existing hook rather than overwriting it.
    if registry.get_hook(_GROUP_OFFLOADING) is not None:
        return
    registry.register_hook(GroupOffloadingHook(group, config=config), _GROUP_OFFLOADING)
+
769
+
770
def _apply_lazy_group_offloading_hook(
    module: torch.nn.Module,
    group: ModuleGroup,
    *,
    config: GroupOffloadingConfig,
) -> None:
    """Attach a GroupOffloadingHook (if not already present) plus a LazyPrefetchGroupOffloadingHook to ``module``."""
    registry = HookRegistry.check_if_exists_or_initialize(module)

    # A group offloading hook may already be registered if the module was previously picked as the parent
    # of a stray torch.nn.Parameter; in that case keep the existing hook rather than overwriting it.
    if registry.get_hook(_GROUP_OFFLOADING) is None:
        registry.register_hook(GroupOffloadingHook(group, config=config), _GROUP_OFFLOADING)

    # The lazy prefetch hook traces execution order on the first forward pass and wires up prefetching.
    registry.register_hook(LazyPrefetchGroupOffloadingHook(), _LAZY_PREFETCH_GROUP_OFFLOADING)
+
787
+
788
+ def _gather_parameters_with_no_group_offloading_parent(
789
+ module: torch.nn.Module, modules_with_group_offloading: Set[str]
790
+ ) -> List[torch.nn.Parameter]:
791
+ parameters = []
792
+ for name, parameter in module.named_parameters():
793
+ has_parent_with_group_offloading = False
794
+ atoms = name.split(".")
795
+ while len(atoms) > 0:
796
+ parent_name = ".".join(atoms)
797
+ if parent_name in modules_with_group_offloading:
798
+ has_parent_with_group_offloading = True
799
+ break
800
+ atoms.pop()
801
+ if not has_parent_with_group_offloading:
802
+ parameters.append((name, parameter))
803
+ return parameters
804
+
805
+
806
+ def _gather_buffers_with_no_group_offloading_parent(
807
+ module: torch.nn.Module, modules_with_group_offloading: Set[str]
808
+ ) -> List[torch.Tensor]:
809
+ buffers = []
810
+ for name, buffer in module.named_buffers():
811
+ has_parent_with_group_offloading = False
812
+ atoms = name.split(".")
813
+ while len(atoms) > 0:
814
+ parent_name = ".".join(atoms)
815
+ if parent_name in modules_with_group_offloading:
816
+ has_parent_with_group_offloading = True
817
+ break
818
+ atoms.pop()
819
+ if not has_parent_with_group_offloading:
820
+ buffers.append((name, buffer))
821
+ return buffers
822
+
823
+
824
+ def _find_parent_module_in_module_dict(name: str, module_dict: Dict[str, torch.nn.Module]) -> str:
825
+ atoms = name.split(".")
826
+ while len(atoms) > 0:
827
+ parent_name = ".".join(atoms)
828
+ if parent_name in module_dict:
829
+ return parent_name
830
+ atoms.pop()
831
+ return ""
832
+
833
+
834
def _raise_error_if_accelerate_model_or_sequential_hook_present(module: torch.nn.Module) -> None:
    """Raise if any submodule already carries an Accelerate AlignDevicesHook or CpuOffload hook.

    Group offloading cannot coexist with Accelerate's own offloading strategies, so we refuse to proceed.
    """
    if not is_accelerate_available():
        return
    for name, submodule in module.named_modules():
        # Accelerate stores its hook under the `_hf_hook` attribute; absence means no conflict here.
        hf_hook = getattr(submodule, "_hf_hook", None)
        if isinstance(hf_hook, (AlignDevicesHook, CpuOffload)):
            raise ValueError(
                f"Cannot apply group offloading to a module that is already applying an alternative "
                f"offloading strategy from Accelerate. If you want to apply group offloading, please "
                f"disable the existing offloading strategy first. Offending module: {name} ({type(submodule)})"
            )
+
847
+
848
def _get_top_level_group_offload_hook(module: torch.nn.Module) -> Optional[GroupOffloadingHook]:
    """Return the first group-offloading hook found while walking ``module`` and its submodules, else None."""
    for submodule in module.modules():
        registry = getattr(submodule, "_diffusers_hook", None)
        if registry is None:
            continue
        hook = registry.get_hook(_GROUP_OFFLOADING)
        if hook is not None:
            return hook
    return None
+
856
+
857
def _is_group_offload_enabled(module: torch.nn.Module) -> bool:
    """True if any (sub)module of ``module`` carries a group-offloading hook."""
    return _get_top_level_group_offload_hook(module) is not None
+
861
+
862
def _get_group_onload_device(module: torch.nn.Module) -> torch.device:
    """Return the onload device configured for ``module``'s group offloading.

    Raises:
        ValueError: if group offloading has not been applied to ``module``.
    """
    hook = _get_top_level_group_offload_hook(module)
    if hook is None:
        raise ValueError("Group offloading is not enabled for the provided module.")
    return hook.config.onload_device
+
868
+
869
+ def _compute_group_hash(group_id):
870
+ hashed_id = hashlib.sha256(group_id.encode("utf-8")).hexdigest()
871
+ # first 16 characters for a reasonably short but unique name
872
+ return hashed_id[:16]
873
+
874
+
875
def _maybe_remove_and_reapply_group_offloading(module: torch.nn.Module) -> None:
    r"""
    Strips all group-offloading hooks from ``module`` and re-applies group offloading with the same
    configuration. Useful after in-place modifications (e.g. fusing QKV, loading/unloading LoRAs) that
    invalidate the tensor references held by the existing hooks. No-op if group offloading was never applied.

    Assumes group offloading was applied only at the top-level module, so every submodule shares the same
    onload/offload devices; with multi-level application this function will not behave as expected.

    When non-default streams are in use there is a performance penalty, because the layer execution order
    must be retraced by `LazyPrefetchGroupOffloadingHook`.
    """
    top_level_group_offload_hook = _get_top_level_group_offload_hook(module)
    if top_level_group_offload_hook is None:
        # Group offloading was never enabled; nothing to rebuild.
        return

    # Remove every offloading-related hook from the whole module tree...
    registry = HookRegistry.check_if_exists_or_initialize(module)
    for hook_name in (_GROUP_OFFLOADING, _LAYER_EXECUTION_TRACKER, _LAZY_PREFETCH_GROUP_OFFLOADING):
        registry.remove_hook(hook_name, recurse=True)

    # ...then re-apply with the previously-used configuration so all tensor references are fresh.
    _apply_group_offloading(module, top_level_group_offload_hook.config)