xiaoanyu123 commited on
Commit
1daf802
·
verified ·
1 Parent(s): b4e634b

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__init__.py +48 -0
  2. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/__init__.cpython-310.pyc +0 -0
  3. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/pipeline_pag_sd_xl_img2img.cpython-310.pyc +0 -0
  4. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/pipeline_pag_sd_xl_inpaint.cpython-310.pyc +0 -0
  5. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__init__.py +55 -0
  6. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/__init__.cpython-310.pyc +0 -0
  7. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/image_encoder.cpython-310.pyc +0 -0
  8. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/pipeline_paint_by_example.cpython-310.pyc +0 -0
  9. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/image_encoder.py +67 -0
  10. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +637 -0
  11. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__init__.py +46 -0
  12. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__pycache__/__init__.cpython-310.pyc +0 -0
  13. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__pycache__/pipeline_pia.cpython-310.pyc +0 -0
  14. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/pipeline_pia.py +958 -0
  15. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__init__.py +55 -0
  16. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_alpha.cpython-310.pyc +0 -0
  17. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_sigma.cpython-310.pyc +0 -0
  18. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +976 -0
  19. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +906 -0
  20. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__init__.py +195 -0
  21. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/clip_image_project_model.cpython-310.pyc +0 -0
  22. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/convert_from_ckpt.cpython-310.pyc +0 -0
  23. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion.cpython-310.pyc +0 -0
  24. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion_img2img.cpython-310.pyc +0 -0
  25. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion_inpaint.cpython-310.pyc +0 -0
  26. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +586 -0
  27. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_output.py +45 -0
  28. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1104 -0
  29. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +897 -0
  30. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +439 -0
  31. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1161 -0
  32. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1359 -0
  33. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +917 -0
  34. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +665 -0
  35. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +826 -0
  36. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +952 -0
  37. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +858 -0
  38. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/safety_checker.py +126 -0
  39. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/safety_checker_flax.py +112 -0
  40. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +57 -0
  41. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion_xl/__pycache__/__init__.cpython-310.pyc +0 -0
  42. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__init__.py +58 -0
  43. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__pycache__/__init__.cpython-310.pyc +0 -0
  44. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_diffusion.cpython-310.pyc +0 -0
  45. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +737 -0
  46. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__init__.py +47 -0
  47. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__pycache__/pipeline_stable_diffusion_adapter.cpython-310.pyc +0 -0
  48. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__pycache__/pipeline_stable_diffusion_xl_adapter.cpython-310.pyc +0 -0
  49. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py +956 -0
  50. pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py +1311 -0
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/hunyuandit/__init__.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lazy-import shim for the `diffusers.pipelines.hunyuandit` subpackage.
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


# Placeholder objects exported when the optional dependencies are missing.
_dummy_objects = {}
# Maps submodule name -> list of public names; consumed by _LazyModule below.
_import_structure = {}


try:
    # HunyuanDiT requires both `transformers` and `torch`.
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    # Export import-error-raising dummies instead of the real pipeline class.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["pipeline_hunyuandit"] = ["HunyuanDiTPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager-import path: taken by static type checkers or when DIFFUSERS_SLOW_IMPORT is set.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_hunyuandit import HunyuanDiTPipeline

else:
    import sys

    # Replace this module with a lazy proxy; submodules are imported on first attribute access.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.7 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/pipeline_pag_sd_xl_img2img.cpython-310.pyc ADDED
Binary file (51.6 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pag/__pycache__/pipeline_pag_sd_xl_inpaint.cpython-310.pyc ADDED
Binary file (57.9 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Lazy-import shim for the `diffusers.pipelines.paint_by_example` subpackage.
# NOTE(review): `dataclass`, `np`, `PIL` and `Image` appear unused in this file —
# presumably kept for backward compatibility of re-exports; confirm before removing.
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np
import PIL
from PIL import Image

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


# Placeholder objects exported when the optional dependencies are missing.
_dummy_objects = {}
# Maps submodule name -> list of public names; consumed by _LazyModule below.
_import_structure = {}

try:
    # Paint-by-Example requires both `transformers` and `torch`.
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    # Export import-error-raising dummies instead of the real classes.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["image_encoder"] = ["PaintByExampleImageEncoder"]
    _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"]


if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager-import path: taken by static type checkers or when DIFFUSERS_SLOW_IMPORT is set.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .image_encoder import PaintByExampleImageEncoder
        from .pipeline_paint_by_example import PaintByExamplePipeline

else:
    import sys

    # Replace this module with a lazy proxy; submodules are imported on first attribute access.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/image_encoder.cpython-310.pyc ADDED
Binary file (2.31 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/__pycache__/pipeline_paint_by_example.cpython-310.pyc ADDED
Binary file (20.2 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/image_encoder.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ from torch import nn
16
+ from transformers import CLIPPreTrainedModel, CLIPVisionModel
17
+
18
+ from ...models.attention import BasicTransformerBlock
19
+ from ...utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
23
+
24
+
25
class PaintByExampleImageEncoder(CLIPPreTrainedModel):
    """CLIP vision encoder that maps an example image to conditioning embeddings.

    Wraps a `CLIPVisionModel` and post-processes its pooled output through a small
    transformer mapper, a layer norm, and a linear projection to `proj_size`.
    """

    def __init__(self, config, proj_size=None):
        super().__init__(config)
        # Output embedding width; falls back to the CLIP projection dim (default 768).
        self.proj_size = proj_size or getattr(config, "projection_dim", 768)

        self.model = CLIPVisionModel(config)
        self.mapper = PaintByExampleMapper(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size)
        self.proj_out = nn.Linear(config.hidden_size, self.proj_size)

        # uncondition for scaling: learned embedding used as the unconditional
        # input for classifier-free guidance.
        self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size)))

    def forward(self, pixel_values, return_uncond_vector=False):
        """Encode `pixel_values` to `(batch, 1, proj_size)` embeddings.

        When `return_uncond_vector` is True, also returns the learned
        unconditional vector as a second element.
        """
        clip_output = self.model(pixel_values=pixel_values)
        # Pooled CLS representation; add a length-1 sequence dim for the mapper.
        latent_states = clip_output.pooler_output
        latent_states = self.mapper(latent_states[:, None])
        latent_states = self.final_layer_norm(latent_states)
        latent_states = self.proj_out(latent_states)
        if return_uncond_vector:
            return latent_states, self.uncond_vector

        return latent_states
48
+
49
+
50
class PaintByExampleMapper(nn.Module):
    """Small stack of transformer blocks applied to the pooled CLIP embedding."""

    def __init__(self, config):
        super().__init__()
        # One block per five CLIP hidden layers (the +1 offset rounds the ratio).
        depth = (config.num_hidden_layers + 1) // 5
        width = config.hidden_size
        self.blocks = nn.ModuleList(
            BasicTransformerBlock(width, 1, width, activation_fn="gelu", attention_bias=True)
            for _ in range(depth)
        )

    def forward(self, hidden_states):
        """Pass the input sequentially through every transformer block."""
        for layer in self.blocks:
            hidden_states = layer(hidden_states)

        return hidden_states
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Callable, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import CLIPImageProcessor
22
+
23
+ from ...image_processor import VaeImageProcessor
24
+ from ...models import AutoencoderKL, UNet2DConditionModel
25
+ from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
26
+ from ...utils import deprecate, is_torch_xla_available, logging
27
+ from ...utils.torch_utils import randn_tensor
28
+ from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin
29
+ from ..stable_diffusion import StableDiffusionPipelineOutput
30
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
31
+ from .image_encoder import PaintByExampleImageEncoder
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
45
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Pull latents out of a VAE encode result.

    Supports outputs carrying a `latent_dist` (sampled, or taken at the mode when
    `sample_mode == "argmax"`) as well as outputs exposing precomputed `latents`.

    Raises:
        AttributeError: if neither access path is available on `encoder_output`.
    """
    has_dist = hasattr(encoder_output, "latent_dist")
    if has_dist and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_dist and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
56
+
57
+
58
def prepare_mask_and_masked_image(image, mask):
    """Convert an (image, mask) pair into the 4-D float tensors Paint-by-Example consumes.

    Accepts ``PIL.Image`` or ``np.ndarray``/``torch.Tensor`` inputs; `image` and `mask`
    must be the same family (either both tensors, or neither). The image is returned as
    float32 in ``[-1, 1]`` with shape ``batch x 3 x height x width``. The mask is
    inverted (Paint-by-Example conditions on the *kept* region), binarized at 0.5, and
    returned as ``batch x 1 x height x width``. The second return value is
    ``image * mask``, the masked image.

    Raises:
        TypeError: if exactly one of `image`/`mask` is a ``torch.Tensor``.
        ValueError: if a tensor image lies outside ``[-1, 1]`` or a tensor mask
            outside ``[0, 1]``.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: the pair ``(mask, masked_image)``.
    """
    if isinstance(image, torch.Tensor):
        if not isinstance(mask, torch.Tensor):
            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")

        # Promote a single (3, H, W) image to a batch of one.
        if image.ndim == 3:
            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
            image = image[None]

        # Promote a bare (H, W) mask to (1, 1, H, W).
        if mask.ndim == 2:
            mask = mask[None, None]

        # A 3-D mask is either a batch missing its channel dim, or one (1, H, W) mask.
        if mask.ndim == 3:
            mask = mask[:, None] if mask.shape[0] == image.shape[0] else mask[None]

        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
        assert mask.shape[1] == 1, "Mask image must have a single channel"

        if image.min() < -1 or image.max() > 1:
            raise ValueError("Image should be in [-1, 1] range")

        if mask.min() < 0 or mask.max() > 1:
            raise ValueError("Mask should be in [0, 1] range")

        # paint-by-example inverses the mask, then binarizes it at 0.5.
        mask = 1 - mask
        mask = (mask >= 0.5).to(mask.dtype)

        image = image.to(dtype=torch.float32)
    elif isinstance(mask, torch.Tensor):
        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
    else:
        if isinstance(image, PIL.Image.Image):
            image = [image]

        # PIL/np path: stack to (B, H, W, 3), move channels first, rescale to [-1, 1].
        image = np.stack([np.array(i.convert("RGB")) for i in image], axis=0)
        image = torch.from_numpy(image.transpose(0, 3, 1, 2)).to(dtype=torch.float32) / 127.5 - 1.0

        # preprocess mask: grayscale, stacked to (B, 1, H, W), scaled to [0, 1]
        if isinstance(mask, PIL.Image.Image):
            mask = [mask]

        mask = np.stack([np.array(m.convert("L"))[None] for m in mask], axis=0)
        mask = mask.astype(np.float32) / 255.0

        # paint-by-example inverses the mask, then binarizes it at 0.5.
        mask = 1 - mask
        mask = (mask >= 0.5).astype(np.float32)
        mask = torch.from_numpy(mask)

    masked_image = image * mask

    return mask, masked_image
156
+
157
+
158
+ class PaintByExamplePipeline(DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin):
159
+ _last_supported_version = "0.33.1"
160
+ r"""
161
+ <Tip warning={true}>
162
+
163
+ 🧪 This is an experimental feature!
164
+
165
+ </Tip>
166
+
167
+ Pipeline for image-guided image inpainting using Stable Diffusion.
168
+
169
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
170
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
171
+
172
+ Args:
173
+ vae ([`AutoencoderKL`]):
174
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
175
+ image_encoder ([`PaintByExampleImageEncoder`]):
176
+ Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt.
177
+ tokenizer ([`~transformers.CLIPTokenizer`]):
178
+ A `CLIPTokenizer` to tokenize text.
179
+ unet ([`UNet2DConditionModel`]):
180
+ A `UNet2DConditionModel` to denoise the encoded image latents.
181
+ scheduler ([`SchedulerMixin`]):
182
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
183
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
184
+ safety_checker ([`StableDiffusionSafetyChecker`]):
185
+ Classification module that estimates whether generated images could be considered offensive or harmful.
186
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
187
+ about a model's potential harms.
188
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
189
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
190
+
191
+ """
192
+
193
+ # TODO: feature_extractor is required to encode initial images (if they are in PIL format),
194
+ # we should give a descriptive message if the pipeline doesn't have one.
195
+
196
+ model_cpu_offload_seq = "unet->vae"
197
+ _exclude_from_cpu_offload = ["image_encoder"]
198
+ _optional_components = ["safety_checker"]
199
+
200
    def __init__(
        self,
        vae: AutoencoderKL,
        image_encoder: PaintByExampleImageEncoder,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = False,
    ):
        """Register pipeline components and derive image-processing defaults."""
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        # Spatial downscale factor of the VAE; 8 is the conventional fallback when `vae` is absent.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(requires_safety_checker=requires_safety_checker)
223
+
224
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        """Run the optional NSFW safety checker; returns `(image, has_nsfw_concept)`."""
        if self.safety_checker is None:
            # No checker configured: pass the image through unchanged.
            has_nsfw_concept = None
        else:
            # The checker's feature extractor needs PIL images.
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept
238
+
239
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        """Build the kwargs dict for `self.scheduler.step`, forwarding only supported args."""
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs
256
+
257
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
    def decode_latents(self, latents):
        """Deprecated: decode VAE latents to a float32 numpy image batch in `[0, 1]`."""
        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

        # Undo the scaling applied when the latents were encoded.
        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents, return_dict=False)[0]
        # Map from [-1, 1] to [0, 1] and clamp, then go channels-last for numpy.
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
        return image
268
+
269
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs
    def check_inputs(self, image, height, width, callback_steps):
        """Validate user-facing call arguments, raising `ValueError` on bad input."""
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        # Spatial dims must be divisible by 8 (the default VAE downscale factor — see __init__).
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )
291
+
292
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        """Create (or move) the initial latent noise, scaled for the scheduler."""
        # Latents live at the VAE-downsampled resolution.
        shape = (
            batch_size,
            num_channels_latents,
            int(height) // self.vae_scale_factor,
            int(width) // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            # Caller-provided latents are used as-is, only moved to the target device.
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents
314
+
315
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents
    def prepare_mask_latents(
        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
    ):
        """Downsample the mask and encode the masked image, duplicated to `batch_size`.

        Returns `(mask, masked_image_latents)`; both are doubled along the batch dim
        when classifier-free guidance is enabled.
        """
        # resize the mask to latents shape as we concatenate the mask to the latents
        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
        # and half precision
        mask = torch.nn.functional.interpolate(
            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
        )
        mask = mask.to(device=device, dtype=dtype)

        masked_image = masked_image.to(device=device, dtype=dtype)

        # A 4-channel input is assumed to already be a latent; otherwise encode through the VAE.
        if masked_image.shape[1] == 4:
            masked_image_latents = masked_image
        else:
            masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if mask.shape[0] < batch_size:
            if not batch_size % mask.shape[0] == 0:
                raise ValueError(
                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                    " of masks that you pass is divisible by the total requested batch size."
                )
            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
        if masked_image_latents.shape[0] < batch_size:
            if not batch_size % masked_image_latents.shape[0] == 0:
                raise ValueError(
                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                    " Make sure the number of images that you pass is divisible by the total requested batch size."
                )
            masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)

        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
        masked_image_latents = (
            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
        )

        # aligning device to prevent device errors when concating it with the latent model input
        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
        return mask, masked_image_latents
360
+
361
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image
    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
        """Encode an image batch to scaled VAE latents, honoring per-sample generators."""
        if isinstance(generator, list):
            # One generator per sample: encode each image separately for reproducibility.
            image_latents = [
                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        # Apply the VAE's configured scaling so latents match the diffusion model's space.
        image_latents = self.vae.config.scaling_factor * image_latents

        return image_latents
375
+
376
+ def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
377
+ dtype = next(self.image_encoder.parameters()).dtype
378
+
379
+ if not isinstance(image, torch.Tensor):
380
+ image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
381
+
382
+ image = image.to(device=device, dtype=dtype)
383
+ image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True)
384
+
385
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
386
+ bs_embed, seq_len, _ = image_embeddings.shape
387
+ image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
388
+ image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
389
+
390
+ if do_classifier_free_guidance:
391
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, image_embeddings.shape[0], 1)
392
+ negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, 1, -1)
393
+
394
+ # For classifier free guidance, we need to do two forward passes.
395
+ # Here we concatenate the unconditional and text embeddings into a single batch
396
+ # to avoid doing two forward passes
397
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
398
+
399
+ return image_embeddings
400
+
401
+ @torch.no_grad()
402
+ def __call__(
403
+ self,
404
+ example_image: Union[torch.Tensor, PIL.Image.Image],
405
+ image: Union[torch.Tensor, PIL.Image.Image],
406
+ mask_image: Union[torch.Tensor, PIL.Image.Image],
407
+ height: Optional[int] = None,
408
+ width: Optional[int] = None,
409
+ num_inference_steps: int = 50,
410
+ guidance_scale: float = 5.0,
411
+ negative_prompt: Optional[Union[str, List[str]]] = None,
412
+ num_images_per_prompt: Optional[int] = 1,
413
+ eta: float = 0.0,
414
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
415
+ latents: Optional[torch.Tensor] = None,
416
+ output_type: Optional[str] = "pil",
417
+ return_dict: bool = True,
418
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
419
+ callback_steps: int = 1,
420
+ ):
421
+ r"""
422
+ The call function to the pipeline for generation.
423
+
424
+ Args:
425
+ example_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
426
+ An example image to guide image generation.
427
+ image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
428
+ `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with
429
+ `mask_image` and repainted according to `prompt`).
430
+ mask_image (`torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`):
431
+ `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted,
432
+ while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel
433
+ (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the
434
+ expected shape would be `(B, H, W, 1)`.
435
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
436
+ The height in pixels of the generated image.
437
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
438
+ The width in pixels of the generated image.
439
+ num_inference_steps (`int`, *optional*, defaults to 50):
440
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
441
+ expense of slower inference.
442
+ guidance_scale (`float`, *optional*, defaults to 7.5):
443
+ A higher guidance scale value encourages the model to generate images closely linked to the text
444
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
445
+ negative_prompt (`str` or `List[str]`, *optional*):
446
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
447
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
448
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
449
+ The number of images to generate per prompt.
450
+ eta (`float`, *optional*, defaults to 0.0):
451
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
452
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
453
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
454
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
455
+ generation deterministic.
456
+ latents (`torch.Tensor`, *optional*):
457
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
458
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
459
+ tensor is generated by sampling using the supplied random `generator`.
460
+ output_type (`str`, *optional*, defaults to `"pil"`):
461
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
462
+ return_dict (`bool`, *optional*, defaults to `True`):
463
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
464
+ plain tuple.
465
+ callback (`Callable`, *optional*):
466
+ A function that calls every `callback_steps` steps during inference. The function is called with the
467
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
468
+ callback_steps (`int`, *optional*, defaults to 1):
469
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
470
+ every step.
471
+
472
+ Example:
473
+
474
+ ```py
475
+ >>> import PIL
476
+ >>> import requests
477
+ >>> import torch
478
+ >>> from io import BytesIO
479
+ >>> from diffusers import PaintByExamplePipeline
480
+
481
+
482
+ >>> def download_image(url):
483
+ ... response = requests.get(url)
484
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
485
+
486
+
487
+ >>> img_url = (
488
+ ... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png"
489
+ ... )
490
+ >>> mask_url = (
491
+ ... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png"
492
+ ... )
493
+ >>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg"
494
+
495
+ >>> init_image = download_image(img_url).resize((512, 512))
496
+ >>> mask_image = download_image(mask_url).resize((512, 512))
497
+ >>> example_image = download_image(example_url).resize((512, 512))
498
+
499
+ >>> pipe = PaintByExamplePipeline.from_pretrained(
500
+ ... "Fantasy-Studio/Paint-by-Example",
501
+ ... torch_dtype=torch.float16,
502
+ ... )
503
+ >>> pipe = pipe.to("cuda")
504
+
505
+ >>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0]
506
+ >>> image
507
+ ```
508
+
509
+ Returns:
510
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
511
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
512
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
513
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
514
+ "not-safe-for-work" (nsfw) content.
515
+ """
516
+ # 1. Define call parameters
517
+ if isinstance(image, PIL.Image.Image):
518
+ batch_size = 1
519
+ elif isinstance(image, list):
520
+ batch_size = len(image)
521
+ else:
522
+ batch_size = image.shape[0]
523
+ device = self._execution_device
524
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
525
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
526
+ # corresponds to doing no classifier free guidance.
527
+ do_classifier_free_guidance = guidance_scale > 1.0
528
+
529
+ # 2. Preprocess mask and image
530
+ mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
531
+ height, width = masked_image.shape[-2:]
532
+
533
+ # 3. Check inputs
534
+ self.check_inputs(example_image, height, width, callback_steps)
535
+
536
+ # 4. Encode input image
537
+ image_embeddings = self._encode_image(
538
+ example_image, device, num_images_per_prompt, do_classifier_free_guidance
539
+ )
540
+
541
+ # 5. set timesteps
542
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
543
+ timesteps = self.scheduler.timesteps
544
+
545
+ # 6. Prepare latent variables
546
+ num_channels_latents = self.vae.config.latent_channels
547
+ latents = self.prepare_latents(
548
+ batch_size * num_images_per_prompt,
549
+ num_channels_latents,
550
+ height,
551
+ width,
552
+ image_embeddings.dtype,
553
+ device,
554
+ generator,
555
+ latents,
556
+ )
557
+
558
+ # 7. Prepare mask latent variables
559
+ mask, masked_image_latents = self.prepare_mask_latents(
560
+ mask,
561
+ masked_image,
562
+ batch_size * num_images_per_prompt,
563
+ height,
564
+ width,
565
+ image_embeddings.dtype,
566
+ device,
567
+ generator,
568
+ do_classifier_free_guidance,
569
+ )
570
+
571
+ # 8. Check that sizes of mask, masked image and latents match
572
+ num_channels_mask = mask.shape[1]
573
+ num_channels_masked_image = masked_image_latents.shape[1]
574
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
575
+ raise ValueError(
576
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
577
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
578
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
579
+ f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
580
+ " `pipeline.unet` or your `mask_image` or `image` input."
581
+ )
582
+
583
+ # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
584
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
585
+
586
+ # 10. Denoising loop
587
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
588
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
589
+ for i, t in enumerate(timesteps):
590
+ # expand the latents if we are doing classifier free guidance
591
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
592
+
593
+ # concat latents, mask, masked_image_latents in the channel dimension
594
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
595
+ latent_model_input = torch.cat([latent_model_input, masked_image_latents, mask], dim=1)
596
+
597
+ # predict the noise residual
598
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
599
+
600
+ # perform guidance
601
+ if do_classifier_free_guidance:
602
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
603
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
604
+
605
+ # compute the previous noisy sample x_t -> x_t-1
606
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
607
+
608
+ # call the callback, if provided
609
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
610
+ progress_bar.update()
611
+ if callback is not None and i % callback_steps == 0:
612
+ step_idx = i // getattr(self.scheduler, "order", 1)
613
+ callback(step_idx, t, latents)
614
+
615
+ if XLA_AVAILABLE:
616
+ xm.mark_step()
617
+
618
+ self.maybe_free_model_hooks()
619
+
620
+ if not output_type == "latent":
621
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
622
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
623
+ else:
624
+ image = latents
625
+ has_nsfw_concept = None
626
+
627
+ if has_nsfw_concept is None:
628
+ do_denormalize = [True] * image.shape[0]
629
+ else:
630
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
631
+
632
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
633
+
634
+ if not return_dict:
635
+ return (image, has_nsfw_concept)
636
+
637
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__init__.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ DIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_torch_available,
9
+ is_transformers_available,
10
+ )
11
+
12
+
13
+ _dummy_objects = {}
14
+ _import_structure = {}
15
+
16
+ try:
17
+ if not (is_transformers_available() and is_torch_available()):
18
+ raise OptionalDependencyNotAvailable()
19
+ except OptionalDependencyNotAvailable:
20
+ from ...utils import dummy_torch_and_transformers_objects
21
+
22
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
23
+ else:
24
+ _import_structure["pipeline_pia"] = ["PIAPipeline", "PIAPipelineOutput"]
25
+
26
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
27
+ try:
28
+ if not (is_transformers_available() and is_torch_available()):
29
+ raise OptionalDependencyNotAvailable()
30
+ except OptionalDependencyNotAvailable:
31
+ from ...utils.dummy_torch_and_transformers_objects import *
32
+
33
+ else:
34
+ from .pipeline_pia import PIAPipeline, PIAPipelineOutput
35
+
36
+ else:
37
+ import sys
38
+
39
+ sys.modules[__name__] = _LazyModule(
40
+ __name__,
41
+ globals()["__file__"],
42
+ _import_structure,
43
+ module_spec=__spec__,
44
+ )
45
+ for name, value in _dummy_objects.items():
46
+ setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.05 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/__pycache__/pipeline_pia.cpython-310.pyc ADDED
Binary file (28.2 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pia/pipeline_pia.py ADDED
@@ -0,0 +1,958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import PIL
21
+ import torch
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
23
+
24
+ from ...image_processor import PipelineImageInput
25
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
26
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
27
+ from ...models.lora import adjust_lora_scale_text_encoder
28
+ from ...models.unets.unet_motion_model import MotionAdapter
29
+ from ...schedulers import (
30
+ DDIMScheduler,
31
+ DPMSolverMultistepScheduler,
32
+ EulerAncestralDiscreteScheduler,
33
+ EulerDiscreteScheduler,
34
+ LMSDiscreteScheduler,
35
+ PNDMScheduler,
36
+ )
37
+ from ...utils import (
38
+ USE_PEFT_BACKEND,
39
+ BaseOutput,
40
+ is_torch_xla_available,
41
+ logging,
42
+ replace_example_docstring,
43
+ scale_lora_layers,
44
+ unscale_lora_layers,
45
+ )
46
+ from ...utils.torch_utils import randn_tensor
47
+ from ...video_processor import VideoProcessor
48
+ from ..free_init_utils import FreeInitMixin
49
+ from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin
50
+
51
+
52
+ if is_torch_xla_available():
53
+ import torch_xla.core.xla_model as xm
54
+
55
+ XLA_AVAILABLE = True
56
+ else:
57
+ XLA_AVAILABLE = False
58
+
59
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
60
+
61
+
62
+ EXAMPLE_DOC_STRING = """
63
+ Examples:
64
+ ```py
65
+ >>> import torch
66
+ >>> from diffusers import EulerDiscreteScheduler, MotionAdapter, PIAPipeline
67
+ >>> from diffusers.utils import export_to_gif, load_image
68
+
69
+ >>> adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
70
+ >>> pipe = PIAPipeline.from_pretrained(
71
+ ... "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
72
+ ... )
73
+
74
+ >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
75
+ >>> image = load_image(
76
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
77
+ ... )
78
+ >>> image = image.resize((512, 512))
79
+ >>> prompt = "cat in a hat"
80
+ >>> negative_prompt = "wrong white balance, dark, sketches, worst quality, low quality, deformed, distorted"
81
+ >>> generator = torch.Generator("cpu").manual_seed(0)
82
+ >>> output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
83
+ >>> frames = output.frames[0]
84
+ >>> export_to_gif(frames, "pia-animation.gif")
85
+ ```
86
+ """
87
+
88
+ RANGE_LIST = [
89
+ [1.0, 0.9, 0.85, 0.85, 0.85, 0.8], # 0 Small Motion
90
+ [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75], # Moderate Motion
91
+ [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5], # Large Motion
92
+ [1.0, 0.9, 0.85, 0.85, 0.85, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.9, 1.0], # Loop
93
+ [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75, 0.75, 0.75, 0.75, 0.75, 0.78, 0.79, 0.8, 0.8, 1.0], # Loop
94
+ [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 1.0], # Loop
95
+ [0.5, 0.4, 0.4, 0.4, 0.35, 0.3], # Style Transfer Candidate Small Motion
96
+ [0.5, 0.4, 0.4, 0.4, 0.35, 0.35, 0.3, 0.25, 0.2], # Style Transfer Moderate Motion
97
+ [0.5, 0.2], # Style Transfer Large Motion
98
+ ]
99
+
100
+
101
+ def prepare_mask_coef_by_statistics(num_frames: int, cond_frame: int, motion_scale: int):
102
+ assert num_frames > 0, "video_length should be greater than 0"
103
+
104
+ assert num_frames > cond_frame, "video_length should be greater than cond_frame"
105
+
106
+ range_list = RANGE_LIST
107
+
108
+ assert motion_scale < len(range_list), f"motion_scale type{motion_scale} not implemented"
109
+
110
+ coef = range_list[motion_scale]
111
+ coef = coef + ([coef[-1]] * (num_frames - len(coef)))
112
+
113
+ order = [abs(i - cond_frame) for i in range(num_frames)]
114
+ coef = [coef[order[i]] for i in range(num_frames)]
115
+
116
+ return coef
117
+
118
+
119
+ @dataclass
120
+ class PIAPipelineOutput(BaseOutput):
121
+ r"""
122
+ Output class for PIAPipeline.
123
+
124
+ Args:
125
+ frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
126
+ Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, NumPy array of
127
+ shape `(batch_size, num_frames, channels, height, width, Torch tensor of shape `(batch_size, num_frames,
128
+ channels, height, width)`.
129
+ """
130
+
131
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
132
+
133
+
134
+ class PIAPipeline(
135
+ DeprecatedPipelineMixin,
136
+ DiffusionPipeline,
137
+ StableDiffusionMixin,
138
+ TextualInversionLoaderMixin,
139
+ IPAdapterMixin,
140
+ StableDiffusionLoraLoaderMixin,
141
+ FromSingleFileMixin,
142
+ FreeInitMixin,
143
+ ):
144
+ _last_supported_version = "0.33.1"
145
+ r"""
146
+ Pipeline for text-to-video generation.
147
+
148
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
149
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
150
+
151
+ The pipeline also inherits the following loading methods:
152
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
153
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
154
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
155
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
156
+
157
+ Args:
158
+ vae ([`AutoencoderKL`]):
159
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
160
+ text_encoder ([`CLIPTextModel`]):
161
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
162
+ tokenizer (`CLIPTokenizer`):
163
+ A [`~transformers.CLIPTokenizer`] to tokenize text.
164
+ unet ([`UNet2DConditionModel`]):
165
+ A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
166
+ motion_adapter ([`MotionAdapter`]):
167
+ A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
168
+ scheduler ([`SchedulerMixin`]):
169
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
170
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
171
+ """
172
+
173
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
174
+ _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
175
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
176
+
177
+ def __init__(
178
+ self,
179
+ vae: AutoencoderKL,
180
+ text_encoder: CLIPTextModel,
181
+ tokenizer: CLIPTokenizer,
182
+ unet: Union[UNet2DConditionModel, UNetMotionModel],
183
+ scheduler: Union[
184
+ DDIMScheduler,
185
+ PNDMScheduler,
186
+ LMSDiscreteScheduler,
187
+ EulerDiscreteScheduler,
188
+ EulerAncestralDiscreteScheduler,
189
+ DPMSolverMultistepScheduler,
190
+ ],
191
+ motion_adapter: Optional[MotionAdapter] = None,
192
+ feature_extractor: CLIPImageProcessor = None,
193
+ image_encoder: CLIPVisionModelWithProjection = None,
194
+ ):
195
+ super().__init__()
196
+ if isinstance(unet, UNet2DConditionModel):
197
+ unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
198
+
199
+ self.register_modules(
200
+ vae=vae,
201
+ text_encoder=text_encoder,
202
+ tokenizer=tokenizer,
203
+ unet=unet,
204
+ motion_adapter=motion_adapter,
205
+ scheduler=scheduler,
206
+ feature_extractor=feature_extractor,
207
+ image_encoder=image_encoder,
208
+ )
209
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
210
+ self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
211
+
212
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
213
+ def encode_prompt(
214
+ self,
215
+ prompt,
216
+ device,
217
+ num_images_per_prompt,
218
+ do_classifier_free_guidance,
219
+ negative_prompt=None,
220
+ prompt_embeds: Optional[torch.Tensor] = None,
221
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
222
+ lora_scale: Optional[float] = None,
223
+ clip_skip: Optional[int] = None,
224
+ ):
225
+ r"""
226
+ Encodes the prompt into text encoder hidden states.
227
+
228
+ Args:
229
+ prompt (`str` or `List[str]`, *optional*):
230
+ prompt to be encoded
231
+ device: (`torch.device`):
232
+ torch device
233
+ num_images_per_prompt (`int`):
234
+ number of images that should be generated per prompt
235
+ do_classifier_free_guidance (`bool`):
236
+ whether to use classifier free guidance or not
237
+ negative_prompt (`str` or `List[str]`, *optional*):
238
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
239
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
240
+ less than `1`).
241
+ prompt_embeds (`torch.Tensor`, *optional*):
242
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
243
+ provided, text embeddings will be generated from `prompt` input argument.
244
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
245
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
246
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
247
+ argument.
248
+ lora_scale (`float`, *optional*):
249
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
250
+ clip_skip (`int`, *optional*):
251
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
252
+ the output of the pre-final layer will be used for computing the prompt embeddings.
253
+ """
254
+ # set lora scale so that monkey patched LoRA
255
+ # function of text encoder can correctly access it
256
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
257
+ self._lora_scale = lora_scale
258
+
259
+ # dynamically adjust the LoRA scale
260
+ if not USE_PEFT_BACKEND:
261
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
262
+ else:
263
+ scale_lora_layers(self.text_encoder, lora_scale)
264
+
265
+ if prompt is not None and isinstance(prompt, str):
266
+ batch_size = 1
267
+ elif prompt is not None and isinstance(prompt, list):
268
+ batch_size = len(prompt)
269
+ else:
270
+ batch_size = prompt_embeds.shape[0]
271
+
272
+ if prompt_embeds is None:
273
+ # textual inversion: process multi-vector tokens if necessary
274
+ if isinstance(self, TextualInversionLoaderMixin):
275
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
276
+
277
+ text_inputs = self.tokenizer(
278
+ prompt,
279
+ padding="max_length",
280
+ max_length=self.tokenizer.model_max_length,
281
+ truncation=True,
282
+ return_tensors="pt",
283
+ )
284
+ text_input_ids = text_inputs.input_ids
285
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
286
+
287
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
288
+ text_input_ids, untruncated_ids
289
+ ):
290
+ removed_text = self.tokenizer.batch_decode(
291
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
292
+ )
293
+ logger.warning(
294
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
295
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
296
+ )
297
+
298
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
299
+ attention_mask = text_inputs.attention_mask.to(device)
300
+ else:
301
+ attention_mask = None
302
+
303
+ if clip_skip is None:
304
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
305
+ prompt_embeds = prompt_embeds[0]
306
+ else:
307
+ prompt_embeds = self.text_encoder(
308
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
309
+ )
310
+ # Access the `hidden_states` first, that contains a tuple of
311
+ # all the hidden states from the encoder layers. Then index into
312
+ # the tuple to access the hidden states from the desired layer.
313
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
314
+ # We also need to apply the final LayerNorm here to not mess with the
315
+ # representations. The `last_hidden_states` that we typically use for
316
+ # obtaining the final prompt representations passes through the LayerNorm
317
+ # layer.
318
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
319
+
320
+ if self.text_encoder is not None:
321
+ prompt_embeds_dtype = self.text_encoder.dtype
322
+ elif self.unet is not None:
323
+ prompt_embeds_dtype = self.unet.dtype
324
+ else:
325
+ prompt_embeds_dtype = prompt_embeds.dtype
326
+
327
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
328
+
329
+ bs_embed, seq_len, _ = prompt_embeds.shape
330
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
331
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
332
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
333
+
334
+ # get unconditional embeddings for classifier free guidance
335
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
336
+ uncond_tokens: List[str]
337
+ if negative_prompt is None:
338
+ uncond_tokens = [""] * batch_size
339
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
340
+ raise TypeError(
341
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
342
+ f" {type(prompt)}."
343
+ )
344
+ elif isinstance(negative_prompt, str):
345
+ uncond_tokens = [negative_prompt]
346
+ elif batch_size != len(negative_prompt):
347
+ raise ValueError(
348
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
349
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
350
+ " the batch size of `prompt`."
351
+ )
352
+ else:
353
+ uncond_tokens = negative_prompt
354
+
355
+ # textual inversion: process multi-vector tokens if necessary
356
+ if isinstance(self, TextualInversionLoaderMixin):
357
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
358
+
359
+ max_length = prompt_embeds.shape[1]
360
+ uncond_input = self.tokenizer(
361
+ uncond_tokens,
362
+ padding="max_length",
363
+ max_length=max_length,
364
+ truncation=True,
365
+ return_tensors="pt",
366
+ )
367
+
368
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
369
+ attention_mask = uncond_input.attention_mask.to(device)
370
+ else:
371
+ attention_mask = None
372
+
373
+ negative_prompt_embeds = self.text_encoder(
374
+ uncond_input.input_ids.to(device),
375
+ attention_mask=attention_mask,
376
+ )
377
+ negative_prompt_embeds = negative_prompt_embeds[0]
378
+
379
+ if do_classifier_free_guidance:
380
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
381
+ seq_len = negative_prompt_embeds.shape[1]
382
+
383
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
384
+
385
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
386
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
387
+
388
+ if self.text_encoder is not None:
389
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
390
+ # Retrieve the original scale by scaling back the LoRA layers
391
+ unscale_lora_layers(self.text_encoder, lora_scale)
392
+
393
+ return prompt_embeds, negative_prompt_embeds
394
+
395
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
396
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
397
+ dtype = next(self.image_encoder.parameters()).dtype
398
+
399
+ if not isinstance(image, torch.Tensor):
400
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
401
+
402
+ image = image.to(device=device, dtype=dtype)
403
+ if output_hidden_states:
404
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
405
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
406
+ uncond_image_enc_hidden_states = self.image_encoder(
407
+ torch.zeros_like(image), output_hidden_states=True
408
+ ).hidden_states[-2]
409
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
410
+ num_images_per_prompt, dim=0
411
+ )
412
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
413
+ else:
414
+ image_embeds = self.image_encoder(image).image_embeds
415
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
416
+ uncond_image_embeds = torch.zeros_like(image_embeds)
417
+
418
+ return image_embeds, uncond_image_embeds
419
+
420
+ # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
421
+ def decode_latents(self, latents):
422
+ latents = 1 / self.vae.config.scaling_factor * latents
423
+
424
+ batch_size, channels, num_frames, height, width = latents.shape
425
+ latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
426
+
427
+ image = self.vae.decode(latents).sample
428
+ video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
429
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
430
+ video = video.float()
431
+ return video
432
+
433
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
434
+ def prepare_extra_step_kwargs(self, generator, eta):
435
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
436
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
437
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
438
+ # and should be between [0, 1]
439
+
440
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
441
+ extra_step_kwargs = {}
442
+ if accepts_eta:
443
+ extra_step_kwargs["eta"] = eta
444
+
445
+ # check if the scheduler accepts generator
446
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
447
+ if accepts_generator:
448
+ extra_step_kwargs["generator"] = generator
449
+ return extra_step_kwargs
450
+
451
+ def check_inputs(
452
+ self,
453
+ prompt,
454
+ height,
455
+ width,
456
+ negative_prompt=None,
457
+ prompt_embeds=None,
458
+ negative_prompt_embeds=None,
459
+ ip_adapter_image=None,
460
+ ip_adapter_image_embeds=None,
461
+ callback_on_step_end_tensor_inputs=None,
462
+ ):
463
+ if height % 8 != 0 or width % 8 != 0:
464
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
465
+
466
+ if callback_on_step_end_tensor_inputs is not None and not all(
467
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
468
+ ):
469
+ raise ValueError(
470
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
471
+ )
472
+
473
+ if prompt is not None and prompt_embeds is not None:
474
+ raise ValueError(
475
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
476
+ " only forward one of the two."
477
+ )
478
+ elif prompt is None and prompt_embeds is None:
479
+ raise ValueError(
480
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
481
+ )
482
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
483
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
484
+
485
+ if negative_prompt is not None and negative_prompt_embeds is not None:
486
+ raise ValueError(
487
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
488
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
489
+ )
490
+
491
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
492
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
493
+ raise ValueError(
494
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
495
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
496
+ f" {negative_prompt_embeds.shape}."
497
+ )
498
+
499
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
500
+ raise ValueError(
501
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
502
+ )
503
+
504
+ if ip_adapter_image_embeds is not None:
505
+ if not isinstance(ip_adapter_image_embeds, list):
506
+ raise ValueError(
507
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
508
+ )
509
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
510
+ raise ValueError(
511
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
512
+ )
513
+
514
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
515
+ def prepare_ip_adapter_image_embeds(
516
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
517
+ ):
518
+ image_embeds = []
519
+ if do_classifier_free_guidance:
520
+ negative_image_embeds = []
521
+ if ip_adapter_image_embeds is None:
522
+ if not isinstance(ip_adapter_image, list):
523
+ ip_adapter_image = [ip_adapter_image]
524
+
525
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
526
+ raise ValueError(
527
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
528
+ )
529
+
530
+ for single_ip_adapter_image, image_proj_layer in zip(
531
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
532
+ ):
533
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
534
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
535
+ single_ip_adapter_image, device, 1, output_hidden_state
536
+ )
537
+
538
+ image_embeds.append(single_image_embeds[None, :])
539
+ if do_classifier_free_guidance:
540
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
541
+ else:
542
+ for single_image_embeds in ip_adapter_image_embeds:
543
+ if do_classifier_free_guidance:
544
+ single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
545
+ negative_image_embeds.append(single_negative_image_embeds)
546
+ image_embeds.append(single_image_embeds)
547
+
548
+ ip_adapter_image_embeds = []
549
+ for i, single_image_embeds in enumerate(image_embeds):
550
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
551
+ if do_classifier_free_guidance:
552
+ single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
553
+ single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
554
+
555
+ single_image_embeds = single_image_embeds.to(device=device)
556
+ ip_adapter_image_embeds.append(single_image_embeds)
557
+
558
+ return ip_adapter_image_embeds
559
+
560
+ # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents
561
+ def prepare_latents(
562
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
563
+ ):
564
+ shape = (
565
+ batch_size,
566
+ num_channels_latents,
567
+ num_frames,
568
+ height // self.vae_scale_factor,
569
+ width // self.vae_scale_factor,
570
+ )
571
+ if isinstance(generator, list) and len(generator) != batch_size:
572
+ raise ValueError(
573
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
574
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
575
+ )
576
+
577
+ if latents is None:
578
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
579
+ else:
580
+ latents = latents.to(device)
581
+
582
+ # scale the initial noise by the standard deviation required by the scheduler
583
+ latents = latents * self.scheduler.init_noise_sigma
584
+ return latents
585
+
586
+ def prepare_masked_condition(
587
+ self,
588
+ image,
589
+ batch_size,
590
+ num_channels_latents,
591
+ num_frames,
592
+ height,
593
+ width,
594
+ dtype,
595
+ device,
596
+ generator,
597
+ motion_scale=0,
598
+ ):
599
+ shape = (
600
+ batch_size,
601
+ num_channels_latents,
602
+ num_frames,
603
+ height // self.vae_scale_factor,
604
+ width // self.vae_scale_factor,
605
+ )
606
+ _, _, _, scaled_height, scaled_width = shape
607
+
608
+ image = self.video_processor.preprocess(image)
609
+ image = image.to(device, dtype)
610
+
611
+ if isinstance(generator, list):
612
+ image_latent = [
613
+ self.vae.encode(image[k : k + 1]).latent_dist.sample(generator[k]) for k in range(batch_size)
614
+ ]
615
+ image_latent = torch.cat(image_latent, dim=0)
616
+ else:
617
+ image_latent = self.vae.encode(image).latent_dist.sample(generator)
618
+
619
+ image_latent = image_latent.to(device=device, dtype=dtype)
620
+ image_latent = torch.nn.functional.interpolate(image_latent, size=[scaled_height, scaled_width])
621
+ image_latent_padding = image_latent.clone() * self.vae.config.scaling_factor
622
+
623
+ mask = torch.zeros((batch_size, 1, num_frames, scaled_height, scaled_width)).to(device=device, dtype=dtype)
624
+ mask_coef = prepare_mask_coef_by_statistics(num_frames, 0, motion_scale)
625
+ masked_image = torch.zeros(batch_size, 4, num_frames, scaled_height, scaled_width).to(
626
+ device=device, dtype=self.unet.dtype
627
+ )
628
+ for f in range(num_frames):
629
+ mask[:, :, f, :, :] = mask_coef[f]
630
+ masked_image[:, :, f, :, :] = image_latent_padding.clone()
631
+
632
+ mask = torch.cat([mask] * 2) if self.do_classifier_free_guidance else mask
633
+ masked_image = torch.cat([masked_image] * 2) if self.do_classifier_free_guidance else masked_image
634
+
635
+ return mask, masked_image
636
+
637
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
638
+ def get_timesteps(self, num_inference_steps, strength, device):
639
+ # get the original timestep using init_timestep
640
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
641
+
642
+ t_start = max(num_inference_steps - init_timestep, 0)
643
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
644
+ if hasattr(self.scheduler, "set_begin_index"):
645
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
646
+
647
+ return timesteps, num_inference_steps - t_start
648
+
649
    @property
    def guidance_scale(self):
        # Classifier-free guidance weight, set at the start of `__call__`.
        return self._guidance_scale
652
+
653
    @property
    def clip_skip(self):
        # Number of CLIP layers to skip when encoding the prompt, set in `__call__`.
        return self._clip_skip
656
+
657
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        # CFG is active whenever the guidance weight exceeds 1.
        return self._guidance_scale > 1
663
+
664
    @property
    def cross_attention_kwargs(self):
        # Extra kwargs forwarded to the attention processors, set in `__call__`.
        return self._cross_attention_kwargs
667
+
668
    @property
    def num_timesteps(self):
        # Number of denoising timesteps of the current/most recent run.
        return self._num_timesteps
671
+
672
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: PipelineImageInput,
        prompt: Union[str, List[str]] = None,
        strength: float = 1.0,
        num_frames: Optional[int] = 16,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_videos_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        motion_scale: int = 0,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`PipelineImageInput`):
                The input image to be used for video generation.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            strength (`float`, *optional*, defaults to 1.0):
                Indicates extent to transform the reference `image`. Must be between 0 and 1.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            motion_scale: (`int`, *optional*, defaults to 0):
                Parameter that controls the amount and type of motion that is added to the image. Increasing the value
                increases the amount of motion, while specific ranges of values control the type of motion that is
                added. Must be between 0 and 8. Set between 0-2 to only increase the amount of motion. Set between 3-5
                to create looping motion. Set between 6-8 to perform motion with image style transfer.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is returned, otherwise a
                `tuple` is returned where the first element is a list with the generated frames.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # NOTE(review): the `num_videos_per_prompt` argument is unconditionally
        # overridden to 1 here; the parameter is effectively ignored.
        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            ip_adapter_image,
            ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=self.clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # Each frame receives a copy of the prompt embedding.
        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)

        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                ip_adapter_image_embeds,
                device,
                batch_size * num_videos_per_prompt,
                self.do_classifier_free_guidance,
            )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
        self._num_timesteps = len(timesteps)

        # 5. Prepare latent variables
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            4,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents=latents,
        )
        mask, masked_image = self.prepare_masked_condition(
            image,
            batch_size * num_videos_per_prompt,
            4,
            num_frames=num_frames,
            height=height,
            width=width,
            dtype=self.unet.dtype,
            device=device,
            generator=generator,
            motion_scale=motion_scale,
        )
        if strength < 1.0:
            # NOTE(review): `masked_image[0]` takes only the first batch entry
            # (broadcast over the noise batch) — presumably intentional since all
            # entries carry the same image latents; confirm for multi-image batches.
            noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
            latents = self.scheduler.add_noise(masked_image[0], noise, latent_timestep)

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Add image embeds for IP-Adapter
        added_cond_kwargs = (
            {"image_embeds": image_embeds}
            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
            else None
        )

        # 8. Denoising loop
        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
        for free_init_iter in range(num_free_init_iters):
            if self.free_init_enabled:
                latents, timesteps = self._apply_free_init(
                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
                )

            self._num_timesteps = len(timesteps)
            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

            with self.progress_bar(total=self._num_timesteps) as progress_bar:
                for i, t in enumerate(timesteps):
                    # expand the latents if we are doing classifier free guidance
                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                    # PIA conditions the UNet by concatenating mask + masked-image latents on the channel dim.
                    latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)

                    # predict the noise residual
                    noise_pred = self.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        cross_attention_kwargs=cross_attention_kwargs,
                        added_cond_kwargs=added_cond_kwargs,
                    ).sample

                    # perform guidance
                    if self.do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                    # compute the previous noisy sample x_t -> x_t-1
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                    if callback_on_step_end is not None:
                        callback_kwargs = {}
                        for k in callback_on_step_end_tensor_inputs:
                            callback_kwargs[k] = locals()[k]
                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                        latents = callback_outputs.pop("latents", latents)
                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                    # call the callback, if provided
                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                        progress_bar.update()

                    if XLA_AVAILABLE:
                        xm.mark_step()

        # 9. Post processing
        if output_type == "latent":
            video = latents
        else:
            video_tensor = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 10. Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return PIAPipelineOutput(frames=video)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


# Placeholders substituted when optional dependencies are missing, and the
# lazy-import map consumed by `_LazyModule` below.
_dummy_objects = {}
_import_structure = {}


try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # torch/transformers unavailable: expose dummy objects that raise on use.
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["pipeline_pixart_alpha"] = ["PixArtAlphaPipeline"]
    _import_structure["pipeline_pixart_sigma"] = ["PixArtSigmaPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager imports: used by type checkers or when slow imports are requested.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_pixart_alpha import (
            ASPECT_RATIO_256_BIN,
            ASPECT_RATIO_512_BIN,
            ASPECT_RATIO_1024_BIN,
            PixArtAlphaPipeline,
        )
        from .pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN, PixArtSigmaPipeline

else:
    # Default path: replace this module with a lazy loader so heavy pipeline
    # modules are only imported on first attribute access.
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_alpha.cpython-310.pyc ADDED
Binary file (28.5 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/__pycache__/pipeline_pixart_sigma.cpython-310.pyc ADDED
Binary file (27.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py ADDED
@@ -0,0 +1,976 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ import inspect
17
+ import re
18
+ import urllib.parse as ul
19
+ from typing import Callable, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from transformers import T5EncoderModel, T5Tokenizer
23
+
24
+ from ...image_processor import PixArtImageProcessor
25
+ from ...models import AutoencoderKL, PixArtTransformer2DModel
26
+ from ...schedulers import DPMSolverMultistepScheduler
27
+ from ...utils import (
28
+ BACKENDS_MAPPING,
29
+ deprecate,
30
+ is_bs4_available,
31
+ is_ftfy_available,
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ )
36
+ from ...utils.torch_utils import randn_tensor
37
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
38
+
39
+
40
+ if is_torch_xla_available():
41
+ import torch_xla.core.xla_model as xm
42
+
43
+ XLA_AVAILABLE = True
44
+ else:
45
+ XLA_AVAILABLE = False
46
+
47
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
48
+
49
+
50
+ if is_bs4_available():
51
+ from bs4 import BeautifulSoup
52
+
53
+ if is_ftfy_available():
54
+ import ftfy
55
+
56
+ EXAMPLE_DOC_STRING = """
57
+ Examples:
58
+ ```py
59
+ >>> import torch
60
+ >>> from diffusers import PixArtAlphaPipeline
61
+
62
+ >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
63
+ >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
64
+ >>> # Enable memory optimizations.
65
+ >>> pipe.enable_model_cpu_offload()
66
+
67
+ >>> prompt = "A small cactus with a happy face in the Sahara desert."
68
+ >>> image = pipe(prompt).images[0]
69
+ ```
70
+ """
71
+
72
+ ASPECT_RATIO_1024_BIN = {
73
+ "0.25": [512.0, 2048.0],
74
+ "0.28": [512.0, 1856.0],
75
+ "0.32": [576.0, 1792.0],
76
+ "0.33": [576.0, 1728.0],
77
+ "0.35": [576.0, 1664.0],
78
+ "0.4": [640.0, 1600.0],
79
+ "0.42": [640.0, 1536.0],
80
+ "0.48": [704.0, 1472.0],
81
+ "0.5": [704.0, 1408.0],
82
+ "0.52": [704.0, 1344.0],
83
+ "0.57": [768.0, 1344.0],
84
+ "0.6": [768.0, 1280.0],
85
+ "0.68": [832.0, 1216.0],
86
+ "0.72": [832.0, 1152.0],
87
+ "0.78": [896.0, 1152.0],
88
+ "0.82": [896.0, 1088.0],
89
+ "0.88": [960.0, 1088.0],
90
+ "0.94": [960.0, 1024.0],
91
+ "1.0": [1024.0, 1024.0],
92
+ "1.07": [1024.0, 960.0],
93
+ "1.13": [1088.0, 960.0],
94
+ "1.21": [1088.0, 896.0],
95
+ "1.29": [1152.0, 896.0],
96
+ "1.38": [1152.0, 832.0],
97
+ "1.46": [1216.0, 832.0],
98
+ "1.67": [1280.0, 768.0],
99
+ "1.75": [1344.0, 768.0],
100
+ "2.0": [1408.0, 704.0],
101
+ "2.09": [1472.0, 704.0],
102
+ "2.4": [1536.0, 640.0],
103
+ "2.5": [1600.0, 640.0],
104
+ "3.0": [1728.0, 576.0],
105
+ "4.0": [2048.0, 512.0],
106
+ }
107
+
108
+ ASPECT_RATIO_512_BIN = {
109
+ "0.25": [256.0, 1024.0],
110
+ "0.28": [256.0, 928.0],
111
+ "0.32": [288.0, 896.0],
112
+ "0.33": [288.0, 864.0],
113
+ "0.35": [288.0, 832.0],
114
+ "0.4": [320.0, 800.0],
115
+ "0.42": [320.0, 768.0],
116
+ "0.48": [352.0, 736.0],
117
+ "0.5": [352.0, 704.0],
118
+ "0.52": [352.0, 672.0],
119
+ "0.57": [384.0, 672.0],
120
+ "0.6": [384.0, 640.0],
121
+ "0.68": [416.0, 608.0],
122
+ "0.72": [416.0, 576.0],
123
+ "0.78": [448.0, 576.0],
124
+ "0.82": [448.0, 544.0],
125
+ "0.88": [480.0, 544.0],
126
+ "0.94": [480.0, 512.0],
127
+ "1.0": [512.0, 512.0],
128
+ "1.07": [512.0, 480.0],
129
+ "1.13": [544.0, 480.0],
130
+ "1.21": [544.0, 448.0],
131
+ "1.29": [576.0, 448.0],
132
+ "1.38": [576.0, 416.0],
133
+ "1.46": [608.0, 416.0],
134
+ "1.67": [640.0, 384.0],
135
+ "1.75": [672.0, 384.0],
136
+ "2.0": [704.0, 352.0],
137
+ "2.09": [736.0, 352.0],
138
+ "2.4": [768.0, 320.0],
139
+ "2.5": [800.0, 320.0],
140
+ "3.0": [864.0, 288.0],
141
+ "4.0": [1024.0, 256.0],
142
+ }
143
+
144
+ ASPECT_RATIO_256_BIN = {
145
+ "0.25": [128.0, 512.0],
146
+ "0.28": [128.0, 464.0],
147
+ "0.32": [144.0, 448.0],
148
+ "0.33": [144.0, 432.0],
149
+ "0.35": [144.0, 416.0],
150
+ "0.4": [160.0, 400.0],
151
+ "0.42": [160.0, 384.0],
152
+ "0.48": [176.0, 368.0],
153
+ "0.5": [176.0, 352.0],
154
+ "0.52": [176.0, 336.0],
155
+ "0.57": [192.0, 336.0],
156
+ "0.6": [192.0, 320.0],
157
+ "0.68": [208.0, 304.0],
158
+ "0.72": [208.0, 288.0],
159
+ "0.78": [224.0, 288.0],
160
+ "0.82": [224.0, 272.0],
161
+ "0.88": [240.0, 272.0],
162
+ "0.94": [240.0, 256.0],
163
+ "1.0": [256.0, 256.0],
164
+ "1.07": [256.0, 240.0],
165
+ "1.13": [272.0, 240.0],
166
+ "1.21": [272.0, 224.0],
167
+ "1.29": [288.0, 224.0],
168
+ "1.38": [288.0, 208.0],
169
+ "1.46": [304.0, 208.0],
170
+ "1.67": [320.0, 192.0],
171
+ "1.75": [336.0, 192.0],
172
+ "2.0": [352.0, 176.0],
173
+ "2.09": [368.0, 176.0],
174
+ "2.4": [384.0, 160.0],
175
+ "2.5": [400.0, 160.0],
176
+ "3.0": [432.0, 144.0],
177
+ "4.0": [512.0, 128.0],
178
+ }
179
+
180
+
181
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
182
+ def retrieve_timesteps(
183
+ scheduler,
184
+ num_inference_steps: Optional[int] = None,
185
+ device: Optional[Union[str, torch.device]] = None,
186
+ timesteps: Optional[List[int]] = None,
187
+ sigmas: Optional[List[float]] = None,
188
+ **kwargs,
189
+ ):
190
+ r"""
191
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
192
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
193
+
194
+ Args:
195
+ scheduler (`SchedulerMixin`):
196
+ The scheduler to get timesteps from.
197
+ num_inference_steps (`int`):
198
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
199
+ must be `None`.
200
+ device (`str` or `torch.device`, *optional*):
201
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
202
+ timesteps (`List[int]`, *optional*):
203
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
204
+ `num_inference_steps` and `sigmas` must be `None`.
205
+ sigmas (`List[float]`, *optional*):
206
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
207
+ `num_inference_steps` and `timesteps` must be `None`.
208
+
209
+ Returns:
210
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
211
+ second element is the number of inference steps.
212
+ """
213
+ if timesteps is not None and sigmas is not None:
214
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
215
+ if timesteps is not None:
216
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
217
+ if not accepts_timesteps:
218
+ raise ValueError(
219
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
220
+ f" timestep schedules. Please check whether you are using the correct scheduler."
221
+ )
222
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
223
+ timesteps = scheduler.timesteps
224
+ num_inference_steps = len(timesteps)
225
+ elif sigmas is not None:
226
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
227
+ if not accept_sigmas:
228
+ raise ValueError(
229
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
230
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
231
+ )
232
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
233
+ timesteps = scheduler.timesteps
234
+ num_inference_steps = len(timesteps)
235
+ else:
236
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
237
+ timesteps = scheduler.timesteps
238
+ return timesteps, num_inference_steps
239
+
240
+
241
+ class PixArtAlphaPipeline(DiffusionPipeline):
242
+ r"""
243
+ Pipeline for text-to-image generation using PixArt-Alpha.
244
+
245
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
246
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
247
+
248
+ Args:
249
+ vae ([`AutoencoderKL`]):
250
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
251
+ text_encoder ([`T5EncoderModel`]):
252
+ Frozen text-encoder. PixArt-Alpha uses
253
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
254
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
255
+ tokenizer (`T5Tokenizer`):
256
+ Tokenizer of class
257
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
258
+ transformer ([`PixArtTransformer2DModel`]):
259
+ A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents. Initially published as
260
+ [`Transformer2DModel`](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS/blob/main/transformer/config.json#L2)
261
+ in the config, but the mismatch can be ignored.
262
+ scheduler ([`SchedulerMixin`]):
263
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
264
+ """
265
+
266
+ bad_punct_regex = re.compile(
267
+ r"["
268
+ + "#®•©™&@·º½¾¿¡§~"
269
+ + r"\)"
270
+ + r"\("
271
+ + r"\]"
272
+ + r"\["
273
+ + r"\}"
274
+ + r"\{"
275
+ + r"\|"
276
+ + "\\"
277
+ + r"\/"
278
+ + r"\*"
279
+ + r"]{1,}"
280
+ ) # noqa
281
+
282
+ _optional_components = ["tokenizer", "text_encoder"]
283
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
284
+
285
+ def __init__(
286
+ self,
287
+ tokenizer: T5Tokenizer,
288
+ text_encoder: T5EncoderModel,
289
+ vae: AutoencoderKL,
290
+ transformer: PixArtTransformer2DModel,
291
+ scheduler: DPMSolverMultistepScheduler,
292
+ ):
293
+ super().__init__()
294
+
295
+ self.register_modules(
296
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
297
+ )
298
+
299
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
300
+ self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
301
+
302
+ # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
303
+ def encode_prompt(
304
+ self,
305
+ prompt: Union[str, List[str]],
306
+ do_classifier_free_guidance: bool = True,
307
+ negative_prompt: str = "",
308
+ num_images_per_prompt: int = 1,
309
+ device: Optional[torch.device] = None,
310
+ prompt_embeds: Optional[torch.Tensor] = None,
311
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
312
+ prompt_attention_mask: Optional[torch.Tensor] = None,
313
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
314
+ clean_caption: bool = False,
315
+ max_sequence_length: int = 120,
316
+ **kwargs,
317
+ ):
318
+ r"""
319
+ Encodes the prompt into text encoder hidden states.
320
+
321
+ Args:
322
+ prompt (`str` or `List[str]`, *optional*):
323
+ prompt to be encoded
324
+ negative_prompt (`str` or `List[str]`, *optional*):
325
+ The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
326
+ instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
327
+ PixArt-Alpha, this should be "".
328
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
329
+ whether to use classifier free guidance or not
330
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
331
+ number of images that should be generated per prompt
332
+ device: (`torch.device`, *optional*):
333
+ torch device to place the resulting embeddings on
334
+ prompt_embeds (`torch.Tensor`, *optional*):
335
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
336
+ provided, text embeddings will be generated from `prompt` input argument.
337
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
338
+ Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
339
+ string.
340
+ clean_caption (`bool`, defaults to `False`):
341
+ If `True`, the function will preprocess and clean the provided caption before encoding.
342
+ max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
343
+ """
344
+
345
+ if "mask_feature" in kwargs:
346
+ deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
347
+ deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
348
+
349
+ if device is None:
350
+ device = self._execution_device
351
+
352
+ # See Section 3.1. of the paper.
353
+ max_length = max_sequence_length
354
+
355
+ if prompt_embeds is None:
356
+ prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
357
+ text_inputs = self.tokenizer(
358
+ prompt,
359
+ padding="max_length",
360
+ max_length=max_length,
361
+ truncation=True,
362
+ add_special_tokens=True,
363
+ return_tensors="pt",
364
+ )
365
+ text_input_ids = text_inputs.input_ids
366
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
367
+
368
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
369
+ text_input_ids, untruncated_ids
370
+ ):
371
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
372
+ logger.warning(
373
+ "The following part of your input was truncated because T5 can only handle sequences up to"
374
+ f" {max_length} tokens: {removed_text}"
375
+ )
376
+
377
+ prompt_attention_mask = text_inputs.attention_mask
378
+ prompt_attention_mask = prompt_attention_mask.to(device)
379
+
380
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
381
+ prompt_embeds = prompt_embeds[0]
382
+
383
+ if self.text_encoder is not None:
384
+ dtype = self.text_encoder.dtype
385
+ elif self.transformer is not None:
386
+ dtype = self.transformer.dtype
387
+ else:
388
+ dtype = None
389
+
390
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
391
+
392
+ bs_embed, seq_len, _ = prompt_embeds.shape
393
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
394
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
395
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
396
+ prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
397
+ prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
398
+
399
+ # get unconditional embeddings for classifier free guidance
400
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
401
+ uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt
402
+ uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
403
+ max_length = prompt_embeds.shape[1]
404
+ uncond_input = self.tokenizer(
405
+ uncond_tokens,
406
+ padding="max_length",
407
+ max_length=max_length,
408
+ truncation=True,
409
+ return_attention_mask=True,
410
+ add_special_tokens=True,
411
+ return_tensors="pt",
412
+ )
413
+ negative_prompt_attention_mask = uncond_input.attention_mask
414
+ negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
415
+
416
+ negative_prompt_embeds = self.text_encoder(
417
+ uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
418
+ )
419
+ negative_prompt_embeds = negative_prompt_embeds[0]
420
+
421
+ if do_classifier_free_guidance:
422
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
423
+ seq_len = negative_prompt_embeds.shape[1]
424
+
425
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
426
+
427
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
428
+ negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
429
+
430
+ negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt)
431
+ negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
432
+ else:
433
+ negative_prompt_embeds = None
434
+ negative_prompt_attention_mask = None
435
+
436
+ return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
437
+
438
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
439
+ def prepare_extra_step_kwargs(self, generator, eta):
440
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
441
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
442
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
443
+ # and should be between [0, 1]
444
+
445
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
446
+ extra_step_kwargs = {}
447
+ if accepts_eta:
448
+ extra_step_kwargs["eta"] = eta
449
+
450
+ # check if the scheduler accepts generator
451
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
452
+ if accepts_generator:
453
+ extra_step_kwargs["generator"] = generator
454
+ return extra_step_kwargs
455
+
456
+ def check_inputs(
457
+ self,
458
+ prompt,
459
+ height,
460
+ width,
461
+ negative_prompt,
462
+ callback_steps,
463
+ prompt_embeds=None,
464
+ negative_prompt_embeds=None,
465
+ prompt_attention_mask=None,
466
+ negative_prompt_attention_mask=None,
467
+ ):
468
+ if height % 8 != 0 or width % 8 != 0:
469
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
470
+
471
+ if (callback_steps is None) or (
472
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
473
+ ):
474
+ raise ValueError(
475
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
476
+ f" {type(callback_steps)}."
477
+ )
478
+
479
+ if prompt is not None and prompt_embeds is not None:
480
+ raise ValueError(
481
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
482
+ " only forward one of the two."
483
+ )
484
+ elif prompt is None and prompt_embeds is None:
485
+ raise ValueError(
486
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
487
+ )
488
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
489
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
490
+
491
+ if prompt is not None and negative_prompt_embeds is not None:
492
+ raise ValueError(
493
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
494
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
495
+ )
496
+
497
+ if negative_prompt is not None and negative_prompt_embeds is not None:
498
+ raise ValueError(
499
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
500
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
501
+ )
502
+
503
+ if prompt_embeds is not None and prompt_attention_mask is None:
504
+ raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
505
+
506
+ if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
507
+ raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
508
+
509
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
510
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
511
+ raise ValueError(
512
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
513
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
514
+ f" {negative_prompt_embeds.shape}."
515
+ )
516
+ if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
517
+ raise ValueError(
518
+ "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
519
+ f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
520
+ f" {negative_prompt_attention_mask.shape}."
521
+ )
522
+
523
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
524
+ def _text_preprocessing(self, text, clean_caption=False):
525
+ if clean_caption and not is_bs4_available():
526
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
527
+ logger.warning("Setting `clean_caption` to False...")
528
+ clean_caption = False
529
+
530
+ if clean_caption and not is_ftfy_available():
531
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
532
+ logger.warning("Setting `clean_caption` to False...")
533
+ clean_caption = False
534
+
535
+ if not isinstance(text, (tuple, list)):
536
+ text = [text]
537
+
538
+ def process(text: str):
539
+ if clean_caption:
540
+ text = self._clean_caption(text)
541
+ text = self._clean_caption(text)
542
+ else:
543
+ text = text.lower().strip()
544
+ return text
545
+
546
+ return [process(t) for t in text]
547
+
548
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
549
+ def _clean_caption(self, caption):
550
+ caption = str(caption)
551
+ caption = ul.unquote_plus(caption)
552
+ caption = caption.strip().lower()
553
+ caption = re.sub("<person>", "person", caption)
554
+ # urls:
555
+ caption = re.sub(
556
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
557
+ "",
558
+ caption,
559
+ ) # regex for urls
560
+ caption = re.sub(
561
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
562
+ "",
563
+ caption,
564
+ ) # regex for urls
565
+ # html:
566
+ caption = BeautifulSoup(caption, features="html.parser").text
567
+
568
+ # @<nickname>
569
+ caption = re.sub(r"@[\w\d]+\b", "", caption)
570
+
571
+ # 31C0—31EF CJK Strokes
572
+ # 31F0—31FF Katakana Phonetic Extensions
573
+ # 3200—32FF Enclosed CJK Letters and Months
574
+ # 3300—33FF CJK Compatibility
575
+ # 3400—4DBF CJK Unified Ideographs Extension A
576
+ # 4DC0—4DFF Yijing Hexagram Symbols
577
+ # 4E00—9FFF CJK Unified Ideographs
578
+ caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
579
+ caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
580
+ caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
581
+ caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
582
+ caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
583
+ caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
584
+ caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
585
+ #######################################################
586
+
587
+ # все виды тире / all types of dash --> "-"
588
+ caption = re.sub(
589
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
590
+ "-",
591
+ caption,
592
+ )
593
+
594
+ # кавычки к одному стандарту
595
+ caption = re.sub(r"[`´«»“”¨]", '"', caption)
596
+ caption = re.sub(r"[‘’]", "'", caption)
597
+
598
+ # &quot;
599
+ caption = re.sub(r"&quot;?", "", caption)
600
+ # &amp
601
+ caption = re.sub(r"&amp", "", caption)
602
+
603
+ # ip addresses:
604
+ caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
605
+
606
+ # article ids:
607
+ caption = re.sub(r"\d:\d\d\s+$", "", caption)
608
+
609
+ # \n
610
+ caption = re.sub(r"\\n", " ", caption)
611
+
612
+ # "#123"
613
+ caption = re.sub(r"#\d{1,3}\b", "", caption)
614
+ # "#12345.."
615
+ caption = re.sub(r"#\d{5,}\b", "", caption)
616
+ # "123456.."
617
+ caption = re.sub(r"\b\d{6,}\b", "", caption)
618
+ # filenames:
619
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
620
+
621
+ #
622
+ caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
623
+ caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
624
+
625
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
626
+ caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
627
+
628
+ # this-is-my-cute-cat / this_is_my_cute_cat
629
+ regex2 = re.compile(r"(?:\-|\_)")
630
+ if len(re.findall(regex2, caption)) > 3:
631
+ caption = re.sub(regex2, " ", caption)
632
+
633
+ caption = ftfy.fix_text(caption)
634
+ caption = html.unescape(html.unescape(caption))
635
+
636
+ caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
637
+ caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
638
+ caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
639
+
640
+ caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
641
+ caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
642
+ caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
643
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
644
+ caption = re.sub(r"\bpage\s+\d+\b", "", caption)
645
+
646
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
647
+
648
+ caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
649
+
650
+ caption = re.sub(r"\b\s+\:\s+", r": ", caption)
651
+ caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
652
+ caption = re.sub(r"\s+", " ", caption)
653
+
654
+ caption.strip()
655
+
656
+ caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
657
+ caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
658
+ caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
659
+ caption = re.sub(r"^\.\S+$", "", caption)
660
+
661
+ return caption.strip()
662
+
663
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
664
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
665
+ shape = (
666
+ batch_size,
667
+ num_channels_latents,
668
+ int(height) // self.vae_scale_factor,
669
+ int(width) // self.vae_scale_factor,
670
+ )
671
+ if isinstance(generator, list) and len(generator) != batch_size:
672
+ raise ValueError(
673
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
674
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
675
+ )
676
+
677
+ if latents is None:
678
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
679
+ else:
680
+ latents = latents.to(device)
681
+
682
+ # scale the initial noise by the standard deviation required by the scheduler
683
+ latents = latents * self.scheduler.init_noise_sigma
684
+ return latents
685
+
686
+ @torch.no_grad()
687
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
688
+ def __call__(
689
+ self,
690
+ prompt: Union[str, List[str]] = None,
691
+ negative_prompt: str = "",
692
+ num_inference_steps: int = 20,
693
+ timesteps: List[int] = None,
694
+ sigmas: List[float] = None,
695
+ guidance_scale: float = 4.5,
696
+ num_images_per_prompt: Optional[int] = 1,
697
+ height: Optional[int] = None,
698
+ width: Optional[int] = None,
699
+ eta: float = 0.0,
700
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
701
+ latents: Optional[torch.Tensor] = None,
702
+ prompt_embeds: Optional[torch.Tensor] = None,
703
+ prompt_attention_mask: Optional[torch.Tensor] = None,
704
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
705
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
706
+ output_type: Optional[str] = "pil",
707
+ return_dict: bool = True,
708
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
709
+ callback_steps: int = 1,
710
+ clean_caption: bool = True,
711
+ use_resolution_binning: bool = True,
712
+ max_sequence_length: int = 120,
713
+ **kwargs,
714
+ ) -> Union[ImagePipelineOutput, Tuple]:
715
+ """
716
+ Function invoked when calling the pipeline for generation.
717
+
718
+ Args:
719
+ prompt (`str` or `List[str]`, *optional*):
720
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
721
+ instead.
722
+ negative_prompt (`str` or `List[str]`, *optional*):
723
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
724
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
725
+ less than `1`).
726
+ num_inference_steps (`int`, *optional*, defaults to 100):
727
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
728
+ expense of slower inference.
729
+ timesteps (`List[int]`, *optional*):
730
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
731
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
732
+ passed will be used. Must be in descending order.
733
+ sigmas (`List[float]`, *optional*):
734
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
735
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
736
+ will be used.
737
+ guidance_scale (`float`, *optional*, defaults to 4.5):
738
+ Guidance scale as defined in [Classifier-Free Diffusion
739
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
740
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
741
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
742
+ the text `prompt`, usually at the expense of lower image quality.
743
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
744
+ The number of images to generate per prompt.
745
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
746
+ The height in pixels of the generated image.
747
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
748
+ The width in pixels of the generated image.
749
+ eta (`float`, *optional*, defaults to 0.0):
750
+ Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
751
+ applies to [`schedulers.DDIMScheduler`], will be ignored for others.
752
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
753
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
754
+ to make generation deterministic.
755
+ latents (`torch.Tensor`, *optional*):
756
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
757
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
758
+ tensor will be generated by sampling using the supplied random `generator`.
759
+ prompt_embeds (`torch.Tensor`, *optional*):
760
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
761
+ provided, text embeddings will be generated from `prompt` input argument.
762
+ prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
763
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
764
+ Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not
765
+ provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
766
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*):
767
+ Pre-generated attention mask for negative text embeddings.
768
+ output_type (`str`, *optional*, defaults to `"pil"`):
769
+ The output format of the generate image. Choose between
770
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
771
+ return_dict (`bool`, *optional*, defaults to `True`):
772
+ Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
773
+ callback (`Callable`, *optional*):
774
+ A function that will be called every `callback_steps` steps during inference. The function will be
775
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
776
+ callback_steps (`int`, *optional*, defaults to 1):
777
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
778
+ called at every step.
779
+ clean_caption (`bool`, *optional*, defaults to `True`):
780
+ Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
781
+ be installed. If the dependencies are not installed, the embeddings will be created from the raw
782
+ prompt.
783
+ use_resolution_binning (`bool` defaults to `True`):
784
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
785
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
786
+ the requested resolution. Useful for generating non-square images.
787
+ max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
788
+
789
+ Examples:
790
+
791
+ Returns:
792
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
793
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
794
+ returned where the first element is a list with the generated images
795
+ """
796
+ if "mask_feature" in kwargs:
797
+ deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
798
+ deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
799
+ # 1. Check inputs. Raise error if not correct
800
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor
801
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor
802
+ if use_resolution_binning:
803
+ if self.transformer.config.sample_size == 128:
804
+ aspect_ratio_bin = ASPECT_RATIO_1024_BIN
805
+ elif self.transformer.config.sample_size == 64:
806
+ aspect_ratio_bin = ASPECT_RATIO_512_BIN
807
+ elif self.transformer.config.sample_size == 32:
808
+ aspect_ratio_bin = ASPECT_RATIO_256_BIN
809
+ else:
810
+ raise ValueError("Invalid sample size")
811
+ orig_height, orig_width = height, width
812
+ height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
813
+
814
+ self.check_inputs(
815
+ prompt,
816
+ height,
817
+ width,
818
+ negative_prompt,
819
+ callback_steps,
820
+ prompt_embeds,
821
+ negative_prompt_embeds,
822
+ prompt_attention_mask,
823
+ negative_prompt_attention_mask,
824
+ )
825
+
826
+ # 2. Default height and width to transformer
827
+ if prompt is not None and isinstance(prompt, str):
828
+ batch_size = 1
829
+ elif prompt is not None and isinstance(prompt, list):
830
+ batch_size = len(prompt)
831
+ else:
832
+ batch_size = prompt_embeds.shape[0]
833
+
834
+ device = self._execution_device
835
+
836
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
837
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
838
+ # corresponds to doing no classifier free guidance.
839
+ do_classifier_free_guidance = guidance_scale > 1.0
840
+
841
+ # 3. Encode input prompt
842
+ (
843
+ prompt_embeds,
844
+ prompt_attention_mask,
845
+ negative_prompt_embeds,
846
+ negative_prompt_attention_mask,
847
+ ) = self.encode_prompt(
848
+ prompt,
849
+ do_classifier_free_guidance,
850
+ negative_prompt=negative_prompt,
851
+ num_images_per_prompt=num_images_per_prompt,
852
+ device=device,
853
+ prompt_embeds=prompt_embeds,
854
+ negative_prompt_embeds=negative_prompt_embeds,
855
+ prompt_attention_mask=prompt_attention_mask,
856
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
857
+ clean_caption=clean_caption,
858
+ max_sequence_length=max_sequence_length,
859
+ )
860
+ if do_classifier_free_guidance:
861
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
862
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
863
+
864
+ # 4. Prepare timesteps
865
+ timesteps, num_inference_steps = retrieve_timesteps(
866
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
867
+ )
868
+
869
+ # 5. Prepare latents.
870
+ latent_channels = self.transformer.config.in_channels
871
+ latents = self.prepare_latents(
872
+ batch_size * num_images_per_prompt,
873
+ latent_channels,
874
+ height,
875
+ width,
876
+ prompt_embeds.dtype,
877
+ device,
878
+ generator,
879
+ latents,
880
+ )
881
+
882
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
883
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
884
+
885
+ # 6.1 Prepare micro-conditions.
886
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
887
+ if self.transformer.config.sample_size == 128:
888
+ resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
889
+ aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
890
+ resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
891
+ aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
892
+
893
+ if do_classifier_free_guidance:
894
+ resolution = torch.cat([resolution, resolution], dim=0)
895
+ aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], dim=0)
896
+
897
+ added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
898
+
899
+ # 7. Denoising loop
900
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
901
+
902
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
903
+ for i, t in enumerate(timesteps):
904
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
905
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
906
+
907
+ current_timestep = t
908
+ if not torch.is_tensor(current_timestep):
909
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
910
+ # This would be a good case for the `match` statement (Python 3.10+)
911
+ is_mps = latent_model_input.device.type == "mps"
912
+ is_npu = latent_model_input.device.type == "npu"
913
+ if isinstance(current_timestep, float):
914
+ dtype = torch.float32 if (is_mps or is_npu) else torch.float64
915
+ else:
916
+ dtype = torch.int32 if (is_mps or is_npu) else torch.int64
917
+ current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
918
+ elif len(current_timestep.shape) == 0:
919
+ current_timestep = current_timestep[None].to(latent_model_input.device)
920
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
921
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
922
+
923
+ # predict noise model_output
924
+ noise_pred = self.transformer(
925
+ latent_model_input,
926
+ encoder_hidden_states=prompt_embeds,
927
+ encoder_attention_mask=prompt_attention_mask,
928
+ timestep=current_timestep,
929
+ added_cond_kwargs=added_cond_kwargs,
930
+ return_dict=False,
931
+ )[0]
932
+
933
+ # perform guidance
934
+ if do_classifier_free_guidance:
935
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
936
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
937
+
938
+ # learned sigma
939
+ if self.transformer.config.out_channels // 2 == latent_channels:
940
+ noise_pred = noise_pred.chunk(2, dim=1)[0]
941
+ else:
942
+ noise_pred = noise_pred
943
+
944
+ # compute previous image: x_t -> x_t-1
945
+ if num_inference_steps == 1:
946
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[1]
947
+ else:
948
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
949
+
950
+ # call the callback, if provided
951
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
952
+ progress_bar.update()
953
+ if callback is not None and i % callback_steps == 0:
954
+ step_idx = i // getattr(self.scheduler, "order", 1)
955
+ callback(step_idx, t, latents)
956
+
957
+ if XLA_AVAILABLE:
958
+ xm.mark_step()
959
+
960
+ if not output_type == "latent":
961
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
962
+ if use_resolution_binning:
963
+ image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
964
+ else:
965
+ image = latents
966
+
967
+ if not output_type == "latent":
968
+ image = self.image_processor.postprocess(image, output_type=output_type)
969
+
970
+ # Offload all models
971
+ self.maybe_free_model_hooks()
972
+
973
+ if not return_dict:
974
+ return (image,)
975
+
976
+ return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import html
16
+ import inspect
17
+ import re
18
+ import urllib.parse as ul
19
+ from typing import Callable, List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ from transformers import T5EncoderModel, T5Tokenizer
23
+
24
+ from ...image_processor import PixArtImageProcessor
25
+ from ...models import AutoencoderKL, PixArtTransformer2DModel
26
+ from ...schedulers import KarrasDiffusionSchedulers
27
+ from ...utils import (
28
+ BACKENDS_MAPPING,
29
+ deprecate,
30
+ is_bs4_available,
31
+ is_ftfy_available,
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ )
36
+ from ...utils.torch_utils import randn_tensor
37
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
38
+ from .pipeline_pixart_alpha import (
39
+ ASPECT_RATIO_256_BIN,
40
+ ASPECT_RATIO_512_BIN,
41
+ ASPECT_RATIO_1024_BIN,
42
+ )
43
+
44
+
45
+ if is_torch_xla_available():
46
+ import torch_xla.core.xla_model as xm
47
+
48
+ XLA_AVAILABLE = True
49
+ else:
50
+ XLA_AVAILABLE = False
51
+
52
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
53
+
54
+
55
+ if is_bs4_available():
56
+ from bs4 import BeautifulSoup
57
+
58
+ if is_ftfy_available():
59
+ import ftfy
60
+
61
+
62
# Resolution bins used when `use_resolution_binning` is enabled on the 2048px
# PixArt-Sigma checkpoints. Keys are aspect-ratio strings; each value is a
# [height, width] pair whose ratio equals the key (e.g. 1024.0 / 4096.0 == 0.25).
# Presumably consumed by `PixArtImageProcessor.classify_height_width_bin` like the
# 256/512/1024 bins imported above — confirm the [height, width] ordering there.
ASPECT_RATIO_2048_BIN = {
    "0.25": [1024.0, 4096.0],
    "0.26": [1024.0, 3968.0],
    "0.27": [1024.0, 3840.0],
    "0.28": [1024.0, 3712.0],
    "0.32": [1152.0, 3584.0],
    "0.33": [1152.0, 3456.0],
    "0.35": [1152.0, 3328.0],
    "0.4": [1280.0, 3200.0],
    "0.42": [1280.0, 3072.0],
    "0.48": [1408.0, 2944.0],
    "0.5": [1408.0, 2816.0],
    "0.52": [1408.0, 2688.0],
    "0.57": [1536.0, 2688.0],
    "0.6": [1536.0, 2560.0],
    "0.68": [1664.0, 2432.0],
    "0.72": [1664.0, 2304.0],
    "0.78": [1792.0, 2304.0],
    "0.82": [1792.0, 2176.0],
    "0.88": [1920.0, 2176.0],
    "0.94": [1920.0, 2048.0],
    "1.0": [2048.0, 2048.0],
    "1.07": [2048.0, 1920.0],
    "1.13": [2176.0, 1920.0],
    "1.21": [2176.0, 1792.0],
    "1.29": [2304.0, 1792.0],
    "1.38": [2304.0, 1664.0],
    "1.46": [2432.0, 1664.0],
    "1.67": [2560.0, 1536.0],
    "1.75": [2688.0, 1536.0],
    "2.0": [2816.0, 1408.0],
    "2.09": [2944.0, 1408.0],
    "2.4": [3072.0, 1280.0],
    "2.5": [3200.0, 1280.0],
    "2.89": [3328.0, 1152.0],
    "3.0": [3456.0, 1152.0],
    "3.11": [3584.0, 1152.0],
    "3.62": [3712.0, 1024.0],
    "3.75": [3840.0, 1024.0],
    "3.88": [3968.0, 1024.0],
    "4.0": [4096.0, 1024.0],
}
104
+
105
+
106
# Usage example — presumably injected into the pipeline `__call__` docstring via the
# `replace_example_docstring` decorator imported above (decoration site is outside this chunk).
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import PixArtSigmaPipeline

        >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-Sigma-XL-2-512-MS" too.
        >>> pipe = PixArtSigmaPipeline.from_pretrained(
        ...     "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", torch_dtype=torch.float16
        ... )
        >>> # Enable memory optimizations.
        >>> # pipe.enable_model_cpu_offload()

        >>> prompt = "A small cactus with a happy face in the Sahara desert."
        >>> image = pipe(prompt).images[0]
        ```
"""
123
+
124
+
125
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Call the scheduler's `set_timesteps` method and return the resulting timestep schedule.

    The scheduler's default spacing can be overridden with either an explicit `timesteps` list or an
    explicit `sigmas` list (mutually exclusive). Any extra keyword arguments are forwarded to
    `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used,
            `timesteps` must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps overriding the scheduler's spacing strategy; mutually exclusive with `sigmas`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas overriding the scheduler's spacing strategy; mutually exclusive with `timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the
        scheduler and the second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    if timesteps is None and sigmas is None:
        # Default path: let the scheduler derive its own schedule from the step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom-schedule path: verify the scheduler's `set_timesteps` accepts the override kwarg.
    schedule_kwarg = "timesteps" if timesteps is not None else "sigmas"
    accepted_params = inspect.signature(scheduler.set_timesteps).parameters.keys()
    if schedule_kwarg not in accepted_params:
        schedule_word = "timestep" if schedule_kwarg == "timesteps" else "sigmas"
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" {schedule_word} schedules. Please check whether you are using the correct scheduler."
        )

    schedule_value = timesteps if timesteps is not None else sigmas
    scheduler.set_timesteps(device=device, **{schedule_kwarg: schedule_value}, **kwargs)
    retrieved_timesteps = scheduler.timesteps
    # With a custom schedule, the effective step count is however many timesteps came back.
    return retrieved_timesteps, len(retrieved_timesteps)
183
+
184
+
185
+ class PixArtSigmaPipeline(DiffusionPipeline):
186
+ r"""
187
+ Pipeline for text-to-image generation using PixArt-Sigma.
188
+
189
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
190
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
191
+
192
+ Args:
193
+ vae ([`AutoencoderKL`]):
194
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
195
+ text_encoder ([`T5EncoderModel`]):
196
+ Frozen text-encoder. PixArt-Alpha uses
197
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
198
+ [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
199
+ tokenizer (`T5Tokenizer`):
200
+ Tokenizer of class
201
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
202
+ transformer ([`PixArtTransformer2DModel`]):
203
+ A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents. Initially published as
204
+ [`Transformer2DModel`](https://huggingface.co/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS/blob/main/transformer/config.json#L2)
205
+ in the config, but the mismatch can be ignored.
206
+ scheduler ([`SchedulerMixin`]):
207
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
208
+ """
209
+
210
    # Runs of symbol/punctuation characters (brackets, slashes, ®©™, etc.) — presumably
    # consumed by `_clean_caption` for caption scrubbing; the usage site is outside this
    # chunk, so confirm before changing. Class-level so the compiled regex is shared.
    bad_punct_regex = re.compile(
        r"["
        + "#®•©™&@·º½¾¿¡§~"
        + r"\)"
        + r"\("
        + r"\]"
        + r"\["
        + r"\}"
        + r"\{"
        + r"\|"
        + "\\"
        + r"\/"
        + r"\*"
        + r"]{1,}"
    )  # noqa

    # Components the pipeline can be loaded without (e.g. when text embeddings are precomputed).
    _optional_components = ["tokenizer", "text_encoder"]
    # Order in which components are shuttled on/off the accelerator during model CPU offload.
    model_cpu_offload_seq = "text_encoder->transformer->vae"
228
+
229
+ def __init__(
230
+ self,
231
+ tokenizer: T5Tokenizer,
232
+ text_encoder: T5EncoderModel,
233
+ vae: AutoencoderKL,
234
+ transformer: PixArtTransformer2DModel,
235
+ scheduler: KarrasDiffusionSchedulers,
236
+ ):
237
+ super().__init__()
238
+
239
+ self.register_modules(
240
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
241
+ )
242
+
243
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
244
+ self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
245
+
246
    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        do_classifier_free_guidance: bool = True,
        negative_prompt: str = "",
        num_images_per_prompt: int = 1,
        device: Optional[torch.device] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        prompt_attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        clean_caption: bool = False,
        max_sequence_length: int = 300,
        **kwargs,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
                PixArt-Alpha, this should be "".
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
                string.
            clean_caption (`bool`, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.
            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.

        Returns:
            `tuple`: `(prompt_embeds, prompt_attention_mask, negative_prompt_embeds,
            negative_prompt_attention_mask)`. The negative pair is `None` when
            `do_classifier_free_guidance` is `False`.
        """

        # `mask_feature` used to be accepted here; warn (once) and ignore it.
        if "mask_feature" in kwargs:
            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)

        if device is None:
            device = self._execution_device

        # See Section 3.1. of the paper.
        max_length = max_sequence_length

        if prompt_embeds is None:
            # Tokenize with fixed-length padding; a second "longest" tokenization is used only to
            # detect (and warn about) truncation against the `max_length` budget.
            prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                logger.warning(
                    "The following part of your input was truncated because T5 can only handle sequences up to"
                    f" {max_length} tokens: {removed_text}"
                )

            prompt_attention_mask = text_inputs.attention_mask
            prompt_attention_mask = prompt_attention_mask.to(device)

            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
            prompt_embeds = prompt_embeds[0]

        # Cast embeddings to the dtype of whichever model will consume them (text encoder may
        # have been dropped as an optional component, hence the fallback chain).
        if self.text_encoder is not None:
            dtype = self.text_encoder.dtype
        elif self.transformer is not None:
            dtype = self.transformer.dtype
        else:
            dtype = None

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
        prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
        prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            # A single negative prompt string is broadcast across the batch; the uncond sequence
            # is padded/truncated to the *positive* embedding length so the two can be concatenated.
            uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt
            uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors="pt",
            )
            negative_prompt_attention_mask = uncond_input.attention_mask
            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt)
            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1)
        else:
            negative_prompt_embeds = None
            negative_prompt_attention_mask = None

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
381
+
382
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
383
+ def prepare_extra_step_kwargs(self, generator, eta):
384
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
385
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
386
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
387
+ # and should be between [0, 1]
388
+
389
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
390
+ extra_step_kwargs = {}
391
+ if accepts_eta:
392
+ extra_step_kwargs["eta"] = eta
393
+
394
+ # check if the scheduler accepts generator
395
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
396
+ if accepts_generator:
397
+ extra_step_kwargs["generator"] = generator
398
+ return extra_step_kwargs
399
+
400
    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs
    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt,
        callback_steps,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
    ):
        """Validate `__call__` arguments, raising `ValueError` on the first inconsistency found.

        Checks, in order: spatial dims divisible by 8; `callback_steps` is a positive int;
        exactly one of `prompt`/`prompt_embeds` is provided; pre-computed embeddings come with
        their attention masks; and positive/negative embedding (and mask) shapes agree.
        """
        # Latents are downsampled 8x by the VAE, so pixel dims must be multiples of 8.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        # `prompt` and `prompt_embeds` are mutually exclusive, and at least one is required.
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        # Pre-computed embeddings must be accompanied by the matching attention masks.
        if prompt_embeds is not None and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        # CFG concatenates the two along the batch dim, so shapes must match exactly.
        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )
            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
                raise ValueError(
                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
                    f" {negative_prompt_attention_mask.shape}."
                )
467
+
468
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
469
+ def _text_preprocessing(self, text, clean_caption=False):
470
+ if clean_caption and not is_bs4_available():
471
+ logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
472
+ logger.warning("Setting `clean_caption` to False...")
473
+ clean_caption = False
474
+
475
+ if clean_caption and not is_ftfy_available():
476
+ logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
477
+ logger.warning("Setting `clean_caption` to False...")
478
+ clean_caption = False
479
+
480
+ if not isinstance(text, (tuple, list)):
481
+ text = [text]
482
+
483
+ def process(text: str):
484
+ if clean_caption:
485
+ text = self._clean_caption(text)
486
+ text = self._clean_caption(text)
487
+ else:
488
+ text = text.lower().strip()
489
+ return text
490
+
491
+ return [process(t) for t in text]
492
+
493
+ # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
494
+ def _clean_caption(self, caption):
495
+ caption = str(caption)
496
+ caption = ul.unquote_plus(caption)
497
+ caption = caption.strip().lower()
498
+ caption = re.sub("<person>", "person", caption)
499
+ # urls:
500
+ caption = re.sub(
501
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
502
+ "",
503
+ caption,
504
+ ) # regex for urls
505
+ caption = re.sub(
506
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
507
+ "",
508
+ caption,
509
+ ) # regex for urls
510
+ # html:
511
+ caption = BeautifulSoup(caption, features="html.parser").text
512
+
513
+ # @<nickname>
514
+ caption = re.sub(r"@[\w\d]+\b", "", caption)
515
+
516
+ # 31C0—31EF CJK Strokes
517
+ # 31F0—31FF Katakana Phonetic Extensions
518
+ # 3200—32FF Enclosed CJK Letters and Months
519
+ # 3300—33FF CJK Compatibility
520
+ # 3400—4DBF CJK Unified Ideographs Extension A
521
+ # 4DC0—4DFF Yijing Hexagram Symbols
522
+ # 4E00—9FFF CJK Unified Ideographs
523
+ caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
524
+ caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
525
+ caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
526
+ caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
527
+ caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
528
+ caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
529
+ caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
530
+ #######################################################
531
+
532
+ # все виды тире / all types of dash --> "-"
533
+ caption = re.sub(
534
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
535
+ "-",
536
+ caption,
537
+ )
538
+
539
+ # кавычки к одному стандарту
540
+ caption = re.sub(r"[`´«»“”¨]", '"', caption)
541
+ caption = re.sub(r"[‘’]", "'", caption)
542
+
543
+ # &quot;
544
+ caption = re.sub(r"&quot;?", "", caption)
545
+ # &amp
546
+ caption = re.sub(r"&amp", "", caption)
547
+
548
+ # ip addresses:
549
+ caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
550
+
551
+ # article ids:
552
+ caption = re.sub(r"\d:\d\d\s+$", "", caption)
553
+
554
+ # \n
555
+ caption = re.sub(r"\\n", " ", caption)
556
+
557
+ # "#123"
558
+ caption = re.sub(r"#\d{1,3}\b", "", caption)
559
+ # "#12345.."
560
+ caption = re.sub(r"#\d{5,}\b", "", caption)
561
+ # "123456.."
562
+ caption = re.sub(r"\b\d{6,}\b", "", caption)
563
+ # filenames:
564
+ caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
565
+
566
+ #
567
+ caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
568
+ caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
569
+
570
+ caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
571
+ caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
572
+
573
+ # this-is-my-cute-cat / this_is_my_cute_cat
574
+ regex2 = re.compile(r"(?:\-|\_)")
575
+ if len(re.findall(regex2, caption)) > 3:
576
+ caption = re.sub(regex2, " ", caption)
577
+
578
+ caption = ftfy.fix_text(caption)
579
+ caption = html.unescape(html.unescape(caption))
580
+
581
+ caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
582
+ caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
583
+ caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
584
+
585
+ caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
586
+ caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
587
+ caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
588
+ caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
589
+ caption = re.sub(r"\bpage\s+\d+\b", "", caption)
590
+
591
+ caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
592
+
593
+ caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
594
+
595
+ caption = re.sub(r"\b\s+\:\s+", r": ", caption)
596
+ caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
597
+ caption = re.sub(r"\s+", " ", caption)
598
+
599
+ caption.strip()
600
+
601
+ caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
602
+ caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
603
+ caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
604
+ caption = re.sub(r"^\.\S+$", "", caption)
605
+
606
+ return caption.strip()
607
+
608
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
609
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
610
+ shape = (
611
+ batch_size,
612
+ num_channels_latents,
613
+ int(height) // self.vae_scale_factor,
614
+ int(width) // self.vae_scale_factor,
615
+ )
616
+ if isinstance(generator, list) and len(generator) != batch_size:
617
+ raise ValueError(
618
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
619
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
620
+ )
621
+
622
+ if latents is None:
623
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
624
+ else:
625
+ latents = latents.to(device)
626
+
627
+ # scale the initial noise by the standard deviation required by the scheduler
628
+ latents = latents * self.scheduler.init_noise_sigma
629
+ return latents
630
+
631
+ @torch.no_grad()
632
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
633
+ def __call__(
634
+ self,
635
+ prompt: Union[str, List[str]] = None,
636
+ negative_prompt: str = "",
637
+ num_inference_steps: int = 20,
638
+ timesteps: List[int] = None,
639
+ sigmas: List[float] = None,
640
+ guidance_scale: float = 4.5,
641
+ num_images_per_prompt: Optional[int] = 1,
642
+ height: Optional[int] = None,
643
+ width: Optional[int] = None,
644
+ eta: float = 0.0,
645
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
646
+ latents: Optional[torch.Tensor] = None,
647
+ prompt_embeds: Optional[torch.Tensor] = None,
648
+ prompt_attention_mask: Optional[torch.Tensor] = None,
649
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
650
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
651
+ output_type: Optional[str] = "pil",
652
+ return_dict: bool = True,
653
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
654
+ callback_steps: int = 1,
655
+ clean_caption: bool = True,
656
+ use_resolution_binning: bool = True,
657
+ max_sequence_length: int = 300,
658
+ **kwargs,
659
+ ) -> Union[ImagePipelineOutput, Tuple]:
660
+ """
661
+ Function invoked when calling the pipeline for generation.
662
+
663
+ Args:
664
+ prompt (`str` or `List[str]`, *optional*):
665
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
666
+ instead.
667
+ negative_prompt (`str` or `List[str]`, *optional*):
668
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
669
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
670
+ less than `1`).
671
+ num_inference_steps (`int`, *optional*, defaults to 100):
672
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
673
+ expense of slower inference.
674
+ timesteps (`List[int]`, *optional*):
675
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
676
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
677
+ passed will be used. Must be in descending order.
678
+ sigmas (`List[float]`, *optional*):
679
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
680
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
681
+ will be used.
682
+ guidance_scale (`float`, *optional*, defaults to 4.5):
683
+ Guidance scale as defined in [Classifier-Free Diffusion
684
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
685
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
686
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
687
+ the text `prompt`, usually at the expense of lower image quality.
688
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
689
+ The number of images to generate per prompt.
690
+ height (`int`, *optional*, defaults to self.unet.config.sample_size):
691
+ The height in pixels of the generated image.
692
+ width (`int`, *optional*, defaults to self.unet.config.sample_size):
693
+ The width in pixels of the generated image.
694
+ eta (`float`, *optional*, defaults to 0.0):
695
+ Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
696
+ applies to [`schedulers.DDIMScheduler`], will be ignored for others.
697
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
698
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
699
+ to make generation deterministic.
700
+ latents (`torch.Tensor`, *optional*):
701
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
702
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
703
+ tensor will be generated by sampling using the supplied random `generator`.
704
+ prompt_embeds (`torch.Tensor`, *optional*):
705
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
706
+ provided, text embeddings will be generated from `prompt` input argument.
707
+ prompt_attention_mask (`torch.Tensor`, *optional*): Pre-generated attention mask for text embeddings.
708
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
709
+ Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
710
+ provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
711
+ negative_prompt_attention_mask (`torch.Tensor`, *optional*):
712
+ Pre-generated attention mask for negative text embeddings.
713
+ output_type (`str`, *optional*, defaults to `"pil"`):
714
+ The output format of the generate image. Choose between
715
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
716
+ return_dict (`bool`, *optional*, defaults to `True`):
717
+ Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
718
+ callback (`Callable`, *optional*):
719
+ A function that will be called every `callback_steps` steps during inference. The function will be
720
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
721
+ callback_steps (`int`, *optional*, defaults to 1):
722
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
723
+ called at every step.
724
+ clean_caption (`bool`, *optional*, defaults to `True`):
725
+ Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
726
+ be installed. If the dependencies are not installed, the embeddings will be created from the raw
727
+ prompt.
728
+ use_resolution_binning (`bool` defaults to `True`):
729
+ If set to `True`, the requested height and width are first mapped to the closest resolutions using
730
+ `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
731
+ the requested resolution. Useful for generating non-square images.
732
+ max_sequence_length (`int` defaults to 300): Maximum sequence length to use with the `prompt`.
733
+
734
+ Examples:
735
+
736
+ Returns:
737
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
738
+ If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
739
+ returned where the first element is a list with the generated images
740
+ """
741
+ # 1. Check inputs. Raise error if not correct
742
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor
743
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor
744
+ if use_resolution_binning:
745
+ if self.transformer.config.sample_size == 256:
746
+ aspect_ratio_bin = ASPECT_RATIO_2048_BIN
747
+ elif self.transformer.config.sample_size == 128:
748
+ aspect_ratio_bin = ASPECT_RATIO_1024_BIN
749
+ elif self.transformer.config.sample_size == 64:
750
+ aspect_ratio_bin = ASPECT_RATIO_512_BIN
751
+ elif self.transformer.config.sample_size == 32:
752
+ aspect_ratio_bin = ASPECT_RATIO_256_BIN
753
+ else:
754
+ raise ValueError("Invalid sample size")
755
+ orig_height, orig_width = height, width
756
+ height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
757
+
758
+ self.check_inputs(
759
+ prompt,
760
+ height,
761
+ width,
762
+ negative_prompt,
763
+ callback_steps,
764
+ prompt_embeds,
765
+ negative_prompt_embeds,
766
+ prompt_attention_mask,
767
+ negative_prompt_attention_mask,
768
+ )
769
+
770
+ # 2. Default height and width to transformer
771
+ if prompt is not None and isinstance(prompt, str):
772
+ batch_size = 1
773
+ elif prompt is not None and isinstance(prompt, list):
774
+ batch_size = len(prompt)
775
+ else:
776
+ batch_size = prompt_embeds.shape[0]
777
+
778
+ device = self._execution_device
779
+
780
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
781
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
782
+ # corresponds to doing no classifier free guidance.
783
+ do_classifier_free_guidance = guidance_scale > 1.0
784
+
785
+ # 3. Encode input prompt
786
+ (
787
+ prompt_embeds,
788
+ prompt_attention_mask,
789
+ negative_prompt_embeds,
790
+ negative_prompt_attention_mask,
791
+ ) = self.encode_prompt(
792
+ prompt,
793
+ do_classifier_free_guidance,
794
+ negative_prompt=negative_prompt,
795
+ num_images_per_prompt=num_images_per_prompt,
796
+ device=device,
797
+ prompt_embeds=prompt_embeds,
798
+ negative_prompt_embeds=negative_prompt_embeds,
799
+ prompt_attention_mask=prompt_attention_mask,
800
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
801
+ clean_caption=clean_caption,
802
+ max_sequence_length=max_sequence_length,
803
+ )
804
+ if do_classifier_free_guidance:
805
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
806
+ prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
807
+
808
+ # 4. Prepare timesteps
809
+ timesteps, num_inference_steps = retrieve_timesteps(
810
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
811
+ )
812
+
813
+ # 5. Prepare latents.
814
+ latent_channels = self.transformer.config.in_channels
815
+ latents = self.prepare_latents(
816
+ batch_size * num_images_per_prompt,
817
+ latent_channels,
818
+ height,
819
+ width,
820
+ prompt_embeds.dtype,
821
+ device,
822
+ generator,
823
+ latents,
824
+ )
825
+
826
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
827
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
828
+
829
+ # 6.1 Prepare micro-conditions.
830
+ added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
831
+
832
+ # 7. Denoising loop
833
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
834
+
835
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
836
+ for i, t in enumerate(timesteps):
837
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
838
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
839
+
840
+ current_timestep = t
841
+ if not torch.is_tensor(current_timestep):
842
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
843
+ # This would be a good case for the `match` statement (Python 3.10+)
844
+ is_mps = latent_model_input.device.type == "mps"
845
+ is_npu = latent_model_input.device.type == "npu"
846
+ if isinstance(current_timestep, float):
847
+ dtype = torch.float32 if (is_mps or is_npu) else torch.float64
848
+ else:
849
+ dtype = torch.int32 if (is_mps or is_npu) else torch.int64
850
+ current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
851
+ elif len(current_timestep.shape) == 0:
852
+ current_timestep = current_timestep[None].to(latent_model_input.device)
853
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
854
+ current_timestep = current_timestep.expand(latent_model_input.shape[0])
855
+
856
+ # predict noise model_output
857
+ noise_pred = self.transformer(
858
+ latent_model_input,
859
+ encoder_hidden_states=prompt_embeds,
860
+ encoder_attention_mask=prompt_attention_mask,
861
+ timestep=current_timestep,
862
+ added_cond_kwargs=added_cond_kwargs,
863
+ return_dict=False,
864
+ )[0]
865
+
866
+ # perform guidance
867
+ if do_classifier_free_guidance:
868
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
869
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
870
+
871
+ # learned sigma
872
+ if self.transformer.config.out_channels // 2 == latent_channels:
873
+ noise_pred = noise_pred.chunk(2, dim=1)[0]
874
+ else:
875
+ noise_pred = noise_pred
876
+
877
+ # compute previous image: x_t -> x_t-1
878
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
879
+
880
+ # call the callback, if provided
881
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
882
+ progress_bar.update()
883
+ if callback is not None and i % callback_steps == 0:
884
+ step_idx = i // getattr(self.scheduler, "order", 1)
885
+ callback(step_idx, t, latents)
886
+
887
+ if XLA_AVAILABLE:
888
+ xm.mark_step()
889
+
890
+ if not output_type == "latent":
891
+ image = self.vae.decode(latents.to(self.vae.dtype) / self.vae.config.scaling_factor, return_dict=False)[0]
892
+ if use_resolution_binning:
893
+ image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
894
+ else:
895
+ image = latents
896
+
897
+ if not output_type == "latent":
898
+ image = self.image_processor.postprocess(image, output_type=output_type)
899
+
900
+ # Offload all models
901
+ self.maybe_free_model_hooks()
902
+
903
+ if not return_dict:
904
+ return (image,)
905
+
906
+ return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__init__.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ DIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_flax_available,
9
+ is_k_diffusion_available,
10
+ is_k_diffusion_version,
11
+ is_onnx_available,
12
+ is_torch_available,
13
+ is_transformers_available,
14
+ is_transformers_version,
15
+ )
16
+
17
+
18
+ _dummy_objects = {}
19
+ _additional_imports = {}
20
+ _import_structure = {"pipeline_output": ["StableDiffusionPipelineOutput"]}
21
+
22
+ if is_transformers_available() and is_flax_available():
23
+ _import_structure["pipeline_output"].extend(["FlaxStableDiffusionPipelineOutput"])
24
+ try:
25
+ if not (is_transformers_available() and is_torch_available()):
26
+ raise OptionalDependencyNotAvailable()
27
+ except OptionalDependencyNotAvailable:
28
+ from ...utils import dummy_torch_and_transformers_objects # noqa F403
29
+
30
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
31
+ else:
32
+ _import_structure["clip_image_project_model"] = ["CLIPImageProjection"]
33
+ _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"]
34
+ _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"]
35
+ _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"]
36
+ _import_structure["pipeline_stable_diffusion_instruct_pix2pix"] = ["StableDiffusionInstructPix2PixPipeline"]
37
+ _import_structure["pipeline_stable_diffusion_latent_upscale"] = ["StableDiffusionLatentUpscalePipeline"]
38
+ _import_structure["pipeline_stable_diffusion_upscale"] = ["StableDiffusionUpscalePipeline"]
39
+ _import_structure["pipeline_stable_unclip"] = ["StableUnCLIPPipeline"]
40
+ _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"]
41
+ _import_structure["safety_checker"] = ["StableDiffusionSafetyChecker"]
42
+ _import_structure["stable_unclip_image_normalizer"] = ["StableUnCLIPImageNormalizer"]
43
+ try:
44
+ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
45
+ raise OptionalDependencyNotAvailable()
46
+ except OptionalDependencyNotAvailable:
47
+ from ...utils.dummy_torch_and_transformers_objects import (
48
+ StableDiffusionImageVariationPipeline,
49
+ )
50
+
51
+ _dummy_objects.update({"StableDiffusionImageVariationPipeline": StableDiffusionImageVariationPipeline})
52
+ else:
53
+ _import_structure["pipeline_stable_diffusion_image_variation"] = ["StableDiffusionImageVariationPipeline"]
54
+ try:
55
+ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
56
+ raise OptionalDependencyNotAvailable()
57
+ except OptionalDependencyNotAvailable:
58
+ from ...utils.dummy_torch_and_transformers_objects import (
59
+ StableDiffusionDepth2ImgPipeline,
60
+ )
61
+
62
+ _dummy_objects.update(
63
+ {
64
+ "StableDiffusionDepth2ImgPipeline": StableDiffusionDepth2ImgPipeline,
65
+ }
66
+ )
67
+ else:
68
+ _import_structure["pipeline_stable_diffusion_depth2img"] = ["StableDiffusionDepth2ImgPipeline"]
69
+
70
+ try:
71
+ if not (is_transformers_available() and is_onnx_available()):
72
+ raise OptionalDependencyNotAvailable()
73
+ except OptionalDependencyNotAvailable:
74
+ from ...utils import dummy_onnx_objects # noqa F403
75
+
76
+ _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
77
+ else:
78
+ _import_structure["pipeline_onnx_stable_diffusion"] = [
79
+ "OnnxStableDiffusionPipeline",
80
+ "StableDiffusionOnnxPipeline",
81
+ ]
82
+ _import_structure["pipeline_onnx_stable_diffusion_img2img"] = ["OnnxStableDiffusionImg2ImgPipeline"]
83
+ _import_structure["pipeline_onnx_stable_diffusion_inpaint"] = ["OnnxStableDiffusionInpaintPipeline"]
84
+ _import_structure["pipeline_onnx_stable_diffusion_inpaint_legacy"] = ["OnnxStableDiffusionInpaintPipelineLegacy"]
85
+ _import_structure["pipeline_onnx_stable_diffusion_upscale"] = ["OnnxStableDiffusionUpscalePipeline"]
86
+
87
+ if is_transformers_available() and is_flax_available():
88
+ from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState
89
+
90
+ _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState})
91
+ _import_structure["pipeline_flax_stable_diffusion"] = ["FlaxStableDiffusionPipeline"]
92
+ _import_structure["pipeline_flax_stable_diffusion_img2img"] = ["FlaxStableDiffusionImg2ImgPipeline"]
93
+ _import_structure["pipeline_flax_stable_diffusion_inpaint"] = ["FlaxStableDiffusionInpaintPipeline"]
94
+ _import_structure["safety_checker_flax"] = ["FlaxStableDiffusionSafetyChecker"]
95
+
96
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
97
+ try:
98
+ if not (is_transformers_available() and is_torch_available()):
99
+ raise OptionalDependencyNotAvailable()
100
+
101
+ except OptionalDependencyNotAvailable:
102
+ from ...utils.dummy_torch_and_transformers_objects import *
103
+
104
+ else:
105
+ from .clip_image_project_model import CLIPImageProjection
106
+ from .pipeline_stable_diffusion import (
107
+ StableDiffusionPipeline,
108
+ StableDiffusionPipelineOutput,
109
+ )
110
+ from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
111
+ from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
112
+ from .pipeline_stable_diffusion_instruct_pix2pix import (
113
+ StableDiffusionInstructPix2PixPipeline,
114
+ )
115
+ from .pipeline_stable_diffusion_latent_upscale import (
116
+ StableDiffusionLatentUpscalePipeline,
117
+ )
118
+ from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
119
+ from .pipeline_stable_unclip import StableUnCLIPPipeline
120
+ from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
121
+ from .safety_checker import StableDiffusionSafetyChecker
122
+ from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
123
+
124
+ try:
125
+ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):
126
+ raise OptionalDependencyNotAvailable()
127
+ except OptionalDependencyNotAvailable:
128
+ from ...utils.dummy_torch_and_transformers_objects import (
129
+ StableDiffusionImageVariationPipeline,
130
+ )
131
+ else:
132
+ from .pipeline_stable_diffusion_image_variation import (
133
+ StableDiffusionImageVariationPipeline,
134
+ )
135
+
136
+ try:
137
+ if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")):
138
+ raise OptionalDependencyNotAvailable()
139
+ except OptionalDependencyNotAvailable:
140
+ from ...utils.dummy_torch_and_transformers_objects import StableDiffusionDepth2ImgPipeline
141
+ else:
142
+ from .pipeline_stable_diffusion_depth2img import (
143
+ StableDiffusionDepth2ImgPipeline,
144
+ )
145
+
146
+ try:
147
+ if not (is_transformers_available() and is_onnx_available()):
148
+ raise OptionalDependencyNotAvailable()
149
+ except OptionalDependencyNotAvailable:
150
+ from ...utils.dummy_onnx_objects import *
151
+ else:
152
+ from .pipeline_onnx_stable_diffusion import (
153
+ OnnxStableDiffusionPipeline,
154
+ StableDiffusionOnnxPipeline,
155
+ )
156
+ from .pipeline_onnx_stable_diffusion_img2img import (
157
+ OnnxStableDiffusionImg2ImgPipeline,
158
+ )
159
+ from .pipeline_onnx_stable_diffusion_inpaint import (
160
+ OnnxStableDiffusionInpaintPipeline,
161
+ )
162
+ from .pipeline_onnx_stable_diffusion_upscale import (
163
+ OnnxStableDiffusionUpscalePipeline,
164
+ )
165
+
166
+ try:
167
+ if not (is_transformers_available() and is_flax_available()):
168
+ raise OptionalDependencyNotAvailable()
169
+ except OptionalDependencyNotAvailable:
170
+ from ...utils.dummy_flax_objects import *
171
+ else:
172
+ from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline
173
+ from .pipeline_flax_stable_diffusion_img2img import (
174
+ FlaxStableDiffusionImg2ImgPipeline,
175
+ )
176
+ from .pipeline_flax_stable_diffusion_inpaint import (
177
+ FlaxStableDiffusionInpaintPipeline,
178
+ )
179
+ from .pipeline_output import FlaxStableDiffusionPipelineOutput
180
+ from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
181
+
182
+ else:
183
+ import sys
184
+
185
+ sys.modules[__name__] = _LazyModule(
186
+ __name__,
187
+ globals()["__file__"],
188
+ _import_structure,
189
+ module_spec=__spec__,
190
+ )
191
+
192
+ for name, value in _dummy_objects.items():
193
+ setattr(sys.modules[__name__], name, value)
194
+ for name, value in _additional_imports.items():
195
+ setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/clip_image_project_model.cpython-310.pyc ADDED
Binary file (997 Bytes). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/convert_from_ckpt.cpython-310.pyc ADDED
Binary file (48 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion.cpython-310.pyc ADDED
Binary file (15.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion_img2img.cpython-310.pyc ADDED
Binary file (17.4 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/__pycache__/pipeline_flax_stable_diffusion_inpaint.cpython-310.pyc ADDED
Binary file (19.4 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import CLIPImageProcessor, CLIPTokenizer
22
+
23
+ from ...configuration_utils import FrozenDict
24
+ from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
25
+ from ...utils import deprecate, logging
26
+ from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
27
+ from ..pipeline_utils import DiffusionPipeline
28
+ from . import StableDiffusionPipelineOutput
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
def preprocess(image):
    """Normalize `image` into a single torch tensor in [-1, 1], NCHW layout.

    Accepts a torch tensor (returned unchanged), a single PIL image, or a
    list of PIL images / torch tensors. PIL inputs are resized so both
    dimensions are integer multiples of 64, scaled to [-1, 1], and stacked
    into a batch.
    """
    # A tensor is assumed to be fully preprocessed already.
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        width, height = image[0].size
        # Snap both dimensions down to an integer multiple of 64.
        width -= width % 64
        height -= height % 64

        arrays = [np.array(img.resize((width, height)))[None, :] for img in image]
        batch = np.concatenate(arrays, axis=0).astype(np.float32) / 255.0
        # HWC -> CHW, then map [0, 1] to [-1, 1].
        batch = batch.transpose(0, 3, 1, 2)
        image = torch.from_numpy(2.0 * batch - 1.0)
    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, dim=0)

    return image
54
+
55
+
56
class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):
    """
    Pipeline for text-guided image super-resolution using Stable Diffusion upscaling with ONNX Runtime models.

    All model components are `OnnxRuntimeModel` wrappers except the tokenizer, feature extractor, and the two
    schedulers. See `__call__` for the generation entry point.
    """

    # Decoder used to map denoised latents back to image space.
    vae: OnnxRuntimeModel
    # CLIP text encoder producing the prompt embeddings.
    text_encoder: OnnxRuntimeModel
    tokenizer: CLIPTokenizer
    # Conditional UNet that predicts the noise residual.
    unet: OnnxRuntimeModel
    # Scheduler used only to add noise to the low-resolution input image.
    low_res_scheduler: DDPMScheduler
    # Main denoising scheduler.
    scheduler: KarrasDiffusionSchedulers
    safety_checker: OnnxRuntimeModel
    feature_extractor: CLIPImageProcessor

    # These components may be None (see `requires_safety_checker` in __init__).
    _optional_components = ["safety_checker", "feature_extractor"]
    _is_onnx = True
68
+
69
+ def __init__(
70
+ self,
71
+ vae: OnnxRuntimeModel,
72
+ text_encoder: OnnxRuntimeModel,
73
+ tokenizer: Any,
74
+ unet: OnnxRuntimeModel,
75
+ low_res_scheduler: DDPMScheduler,
76
+ scheduler: KarrasDiffusionSchedulers,
77
+ safety_checker: Optional[OnnxRuntimeModel] = None,
78
+ feature_extractor: Optional[CLIPImageProcessor] = None,
79
+ max_noise_level: int = 350,
80
+ num_latent_channels=4,
81
+ num_unet_input_channels=7,
82
+ requires_safety_checker: bool = True,
83
+ ):
84
+ super().__init__()
85
+
86
+ if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
87
+ deprecation_message = (
88
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
89
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
90
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
91
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
92
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
93
+ " file"
94
+ )
95
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
96
+ new_config = dict(scheduler.config)
97
+ new_config["steps_offset"] = 1
98
+ scheduler._internal_dict = FrozenDict(new_config)
99
+
100
+ if scheduler is not None and getattr(scheduler.config, "clip_sample", False) is True:
101
+ deprecation_message = (
102
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
103
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
104
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
105
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
106
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
107
+ )
108
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
109
+ new_config = dict(scheduler.config)
110
+ new_config["clip_sample"] = False
111
+ scheduler._internal_dict = FrozenDict(new_config)
112
+
113
+ if safety_checker is None and requires_safety_checker:
114
+ logger.warning(
115
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
116
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
117
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
118
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
119
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
120
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
121
+ )
122
+
123
+ if safety_checker is not None and feature_extractor is None:
124
+ raise ValueError(
125
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
126
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
127
+ )
128
+
129
+ self.register_modules(
130
+ vae=vae,
131
+ text_encoder=text_encoder,
132
+ tokenizer=tokenizer,
133
+ unet=unet,
134
+ scheduler=scheduler,
135
+ low_res_scheduler=low_res_scheduler,
136
+ safety_checker=safety_checker,
137
+ feature_extractor=feature_extractor,
138
+ )
139
+ self.register_to_config(
140
+ max_noise_level=max_noise_level,
141
+ num_latent_channels=num_latent_channels,
142
+ num_unet_input_channels=num_unet_input_channels,
143
+ )
144
+
145
+ def check_inputs(
146
+ self,
147
+ prompt: Union[str, List[str]],
148
+ image,
149
+ noise_level,
150
+ callback_steps,
151
+ negative_prompt=None,
152
+ prompt_embeds=None,
153
+ negative_prompt_embeds=None,
154
+ ):
155
+ if (callback_steps is None) or (
156
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
157
+ ):
158
+ raise ValueError(
159
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
160
+ f" {type(callback_steps)}."
161
+ )
162
+
163
+ if prompt is not None and prompt_embeds is not None:
164
+ raise ValueError(
165
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
166
+ " only forward one of the two."
167
+ )
168
+ elif prompt is None and prompt_embeds is None:
169
+ raise ValueError(
170
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
171
+ )
172
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
173
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
174
+
175
+ if negative_prompt is not None and negative_prompt_embeds is not None:
176
+ raise ValueError(
177
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
178
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
179
+ )
180
+
181
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
182
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
183
+ raise ValueError(
184
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
185
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
186
+ f" {negative_prompt_embeds.shape}."
187
+ )
188
+
189
+ if (
190
+ not isinstance(image, torch.Tensor)
191
+ and not isinstance(image, PIL.Image.Image)
192
+ and not isinstance(image, np.ndarray)
193
+ and not isinstance(image, list)
194
+ ):
195
+ raise ValueError(
196
+ f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
197
+ )
198
+
199
+ # verify batch size of prompt and image are same if image is a list or tensor or numpy array
200
+ if isinstance(image, (list, np.ndarray)):
201
+ if prompt is not None and isinstance(prompt, str):
202
+ batch_size = 1
203
+ elif prompt is not None and isinstance(prompt, list):
204
+ batch_size = len(prompt)
205
+ else:
206
+ batch_size = prompt_embeds.shape[0]
207
+
208
+ if isinstance(image, list):
209
+ image_batch_size = len(image)
210
+ else:
211
+ image_batch_size = image.shape[0]
212
+ if batch_size != image_batch_size:
213
+ raise ValueError(
214
+ f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
215
+ " Please make sure that passed `prompt` matches the batch size of `image`."
216
+ )
217
+
218
+ # check noise level
219
+ if noise_level > self.config.max_noise_level:
220
+ raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
221
+
222
+ if (callback_steps is None) or (
223
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
224
+ ):
225
+ raise ValueError(
226
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
227
+ f" {type(callback_steps)}."
228
+ )
229
+
230
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
231
+ shape = (batch_size, num_channels_latents, height, width)
232
+ if latents is None:
233
+ latents = generator.randn(*shape).astype(dtype)
234
+ elif latents.shape != shape:
235
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
236
+
237
+ return latents
238
+
239
+ def decode_latents(self, latents):
240
+ latents = 1 / 0.08333 * latents
241
+ image = self.vae(latent_sample=latents)[0]
242
+ image = np.clip(image / 2 + 0.5, 0, 1)
243
+ image = image.transpose((0, 2, 3, 1))
244
+ return image
245
+
246
    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: Optional[int],
        do_classifier_free_guidance: bool,
        negative_prompt: Optional[str],
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`np.ndarray`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`np.ndarray`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.

        Returns:
            `np.ndarray`: the prompt embeddings, repeated `num_images_per_prompt` times; when classifier-free
            guidance is enabled, the unconditional embeddings are concatenated in front along the batch axis.
        """
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # get prompt text embeddings (padded/truncated to the model max length)
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

            # Warn the user about any prompt text lost to truncation.
            if not np.array_equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

        # Duplicate the embeddings once per requested image.
        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                # Empty string acts as the unconditional prompt.
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the negative prompt to the same sequence length as the positive one.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )
            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

        if do_classifier_free_guidance:
            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds
348
+
349
    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Union[np.ndarray, PIL.Image.Image, List[PIL.Image.Image]],
        num_inference_steps: int = 75,
        guidance_scale: float = 9.0,
        noise_level: int = 20,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[np.random.RandomState, List[np.random.RandomState]]] = None,
        latents: Optional[np.ndarray] = None,
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: Optional[int] = 1,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            num_inference_steps (`int`, *optional*, defaults to 75):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 9.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            noise_level (`int`, *optional*, defaults to 20):
                Determines the amount of noise to add to the initial image before performing upscaling. Must not
                exceed `self.config.max_noise_level`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`np.random.RandomState`, *optional*):
                A np.random.RandomState to make generation deterministic.
            latents (`np.ndarray`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`np.ndarray`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`np.ndarray`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """

        # 1. Check inputs
        self.check_inputs(
            prompt,
            image,
            noise_level,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if generator is None:
            # Fall back to the global NumPy RNG (module exposes randn like RandomState).
            generator = np.random

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt (uncond embeddings prepended when CFG is on)
        prompt_embeds = self._encode_prompt(
            prompt,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Preprocess image; latents share the conditioning image's spatial size.
        latents_dtype = prompt_embeds.dtype
        image = preprocess(image).cpu().numpy()
        height, width = image.shape[2:]

        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            self.config.num_latent_channels,
            height,
            width,
            latents_dtype,
            generator,
        )
        image = image.astype(latents_dtype)

        self.scheduler.set_timesteps(num_inference_steps)
        timesteps = self.scheduler.timesteps

        # Scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma

        # 5. Add noise to image
        noise_level = np.array([noise_level]).astype(np.int64)
        noise = generator.randn(*image.shape).astype(latents_dtype)

        # low_res_scheduler operates on torch tensors; round-trip through torch.
        image = self.low_res_scheduler.add_noise(
            torch.from_numpy(image), torch.from_numpy(noise), torch.from_numpy(noise_level)
        )
        image = image.numpy()

        # 6. Duplicate the conditioning image to match the (possibly doubled) latent batch.
        batch_multiplier = 2 if do_classifier_free_guidance else 1
        image = np.concatenate([image] * batch_multiplier * num_images_per_prompt)
        noise_level = np.concatenate([noise_level] * image.shape[0])

        # 7. Check that sizes of image and latents match
        num_channels_image = image.shape[1]
        if self.config.num_latent_channels + num_channels_image != self.config.num_unet_input_channels:
            raise ValueError(
                "Incorrect configuration settings! The config of `pipeline.unet` expects"
                f" {self.config.num_unet_input_channels} but received `num_channels_latents`: {self.config.num_latent_channels} +"
                f" `num_channels_image`: {num_channels_image} "
                f" = {self.config.num_latent_channels + num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        # `eta` is only forwarded to schedulers whose step() accepts it (e.g. DDIM).
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # Read the timestep dtype expected by the ONNX UNet's "timestep" input.
        timestep_dtype = next(
            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
        )
        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]

        # 9. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents

                # concat latents and the noised low-res image in the channel dimension
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                latent_model_input = np.concatenate([latent_model_input, image], axis=1)

                # timestep to tensor
                timestep = np.array([t], dtype=timestep_dtype)

                # predict the noise residual
                noise_pred = self.unet(
                    sample=latent_model_input,
                    timestep=timestep,
                    encoder_hidden_states=prompt_embeds,
                    class_labels=noise_level,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(
                    torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
                ).prev_sample
                latents = latents.numpy()

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        # 10. Post-processing
        image = self.decode_latents(latents)

        # 11. Run the safety checker image-by-image when it is enabled.
        if self.safety_checker is not None:
            safety_checker_input = self.feature_extractor(
                self.numpy_to_pil(image), return_tensors="np"
            ).pixel_values.astype(image.dtype)

            images, has_nsfw_concept = [], []
            for i in range(image.shape[0]):
                image_i, has_nsfw_concept_i = self.safety_checker(
                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
                )
                images.append(image_i)
                has_nsfw_concept.append(has_nsfw_concept_i[0])
            image = np.concatenate(images)
        else:
            has_nsfw_concept = None

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_output.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Union
3
+
4
+ import numpy as np
5
+ import PIL.Image
6
+
7
+ from ...utils import BaseOutput, is_flax_available
8
+
9
+
10
@dataclass
class StableDiffusionPipelineOutput(BaseOutput):
    """
    Output class for Stable Diffusion pipelines.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`):
            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
            num_channels)`.
        nsfw_content_detected (`List[bool]`):
            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
            `None` if safety checking could not be performed.
    """

    # PIL images or an NHWC array, depending on the pipeline's `output_type`.
    images: Union[List[PIL.Image.Image], np.ndarray]
    # One flag per image; None when the safety checker was disabled.
    nsfw_content_detected: Optional[List[bool]]
26
+
27
+
28
# Flax is an optional dependency; the Flax output class is only defined when it
# can be imported, so importing this module never requires flax.
if is_flax_available():
    import flax

    @flax.struct.dataclass
    class FlaxStableDiffusionPipelineOutput(BaseOutput):
        """
        Output class for Flax-based Stable Diffusion pipelines.

        Args:
            images (`np.ndarray`):
                Denoised images of array shape of `(batch_size, height, width, num_channels)`.
            nsfw_content_detected (`List[bool]`):
                List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content
                or `None` if safety checking could not be performed.
        """

        # NHWC array of decoded images.
        images: np.ndarray
        # One nsfw flag per image.
        nsfw_content_detected: List[bool]
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py ADDED
@@ -0,0 +1,1104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import inspect
15
+ from typing import Any, Callable, Dict, List, Optional, Union
16
+
17
+ import torch
18
+ from packaging import version
19
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
20
+
21
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
22
+ from ...configuration_utils import FrozenDict
23
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
24
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
25
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
26
+ from ...models.lora import adjust_lora_scale_text_encoder
27
+ from ...schedulers import KarrasDiffusionSchedulers
28
+ from ...utils import (
29
+ USE_PEFT_BACKEND,
30
+ deprecate,
31
+ is_torch_xla_available,
32
+ logging,
33
+ replace_example_docstring,
34
+ scale_lora_layers,
35
+ unscale_lora_layers,
36
+ )
37
+ from ...utils.torch_utils import randn_tensor
38
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
39
+ from .pipeline_output import StableDiffusionPipelineOutput
40
+ from .safety_checker import StableDiffusionSafetyChecker
41
+
42
+
43
# torch_xla is optional (TPU support); expose a flag so the denoising loop can
# call `xm.mark_step()` only when XLA is actually available.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Usage example injected into the pipeline docstring via `replace_example_docstring`.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableDiffusionPipeline

        >>> pipe = StableDiffusionPipeline.from_pretrained(
        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
        ... )
        >>> pipe = pipe.to("cuda")

        >>> prompt = "a photo of an astronaut riding a horse on mars"
        >>> image = pipe(prompt).images[0]
        ```
"""
67
+
68
+
69
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescale the guided noise prediction toward the per-sample standard deviation of the text-conditioned
    prediction, following Section 3.4 of [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891). This mitigates overexposure caused by
    classifier-free guidance.

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Blend factor: 0.0 leaves `noise_cfg` unchanged, 1.0 fully rescales it.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    """
    # Per-sample standard deviations over every non-batch dimension.
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # Fully rescaled prediction: matches the text prediction's std (fixes overexposure).
    fully_rescaled = noise_cfg * (std_text / std_cfg)
    # Blend with the original guided prediction to avoid "plain looking" images.
    return guidance_rescale * fully_rescaled + (1 - guidance_rescale) * noise_cfg
93
+
94
+
95
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure the scheduler's timestep schedule and return it.

    Exactly one of `num_inference_steps`, `timesteps` or `sigmas` selects the schedule. Custom
    `timesteps`/`sigmas` are only accepted when the scheduler's `set_timesteps` signature supports
    them. Any extra keyword arguments are forwarded to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model.
            If used, `timesteps` must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not
            moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps overriding the scheduler's spacing strategy. Mutually exclusive with
            `num_inference_steps` and `sigmas`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas overriding the scheduler's spacing strategy. Mutually exclusive with
            `num_inference_steps` and `timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from
        the scheduler and the second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    if timesteps is None and sigmas is None:
        # Default path: let the scheduler derive its schedule from the step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom-schedule path: verify the scheduler's `set_timesteps` accepts the override.
    if timesteps is not None:
        arg_name, noun = "timesteps", "timestep"
    else:
        arg_name, noun = "sigmas", "sigmas"
    if arg_name not in inspect.signature(scheduler.set_timesteps).parameters:
        raise ValueError(
            f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
            f" {noun} schedules. Please check whether you are using the correct scheduler."
        )
    scheduler.set_timesteps(device=device, **{arg_name: timesteps if timesteps is not None else sigmas}, **kwargs)
    timesteps = scheduler.timesteps
    return timesteps, len(timesteps)
152
+
153
+
154
+ class StableDiffusionPipeline(
155
+ DiffusionPipeline,
156
+ StableDiffusionMixin,
157
+ TextualInversionLoaderMixin,
158
+ StableDiffusionLoraLoaderMixin,
159
+ IPAdapterMixin,
160
+ FromSingleFileMixin,
161
+ ):
162
+ """
163
+ Pipeline for text-to-image generation using Stable Diffusion.
164
+
165
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
166
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
167
+
168
+ The pipeline also inherits the following loading methods:
169
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
170
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
171
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
172
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
173
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
174
+
175
+ Args:
176
+ vae ([`AutoencoderKL`]):
177
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
178
+ text_encoder ([`~transformers.CLIPTextModel`]):
179
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
180
+ tokenizer ([`~transformers.CLIPTokenizer`]):
181
+ A `CLIPTokenizer` to tokenize text.
182
+ unet ([`UNet2DConditionModel`]):
183
+ A `UNet2DConditionModel` to denoise the encoded image latents.
184
+ scheduler ([`SchedulerMixin`]):
185
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
186
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
187
+ safety_checker ([`StableDiffusionSafetyChecker`]):
188
+ Classification module that estimates whether generated images could be considered offensive or harmful.
189
+ Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
190
+ more details about a model's potential harms.
191
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
192
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
193
+ """
194
+
195
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
196
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
197
+ _exclude_from_cpu_offload = ["safety_checker"]
198
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
199
+
200
+ def __init__(
201
+ self,
202
+ vae: AutoencoderKL,
203
+ text_encoder: CLIPTextModel,
204
+ tokenizer: CLIPTokenizer,
205
+ unet: UNet2DConditionModel,
206
+ scheduler: KarrasDiffusionSchedulers,
207
+ safety_checker: StableDiffusionSafetyChecker,
208
+ feature_extractor: CLIPImageProcessor,
209
+ image_encoder: CLIPVisionModelWithProjection = None,
210
+ requires_safety_checker: bool = True,
211
+ ):
212
+ super().__init__()
213
+
214
+ if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
215
+ deprecation_message = (
216
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
217
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
218
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
219
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
220
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
221
+ " file"
222
+ )
223
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
224
+ new_config = dict(scheduler.config)
225
+ new_config["steps_offset"] = 1
226
+ scheduler._internal_dict = FrozenDict(new_config)
227
+
228
+ if scheduler is not None and getattr(scheduler.config, "clip_sample", False) is True:
229
+ deprecation_message = (
230
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
231
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
232
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
233
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
234
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
235
+ )
236
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
237
+ new_config = dict(scheduler.config)
238
+ new_config["clip_sample"] = False
239
+ scheduler._internal_dict = FrozenDict(new_config)
240
+
241
+ if safety_checker is None and requires_safety_checker:
242
+ logger.warning(
243
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
244
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
245
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
246
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
247
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
248
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
249
+ )
250
+
251
+ if safety_checker is not None and feature_extractor is None:
252
+ raise ValueError(
253
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
254
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
255
+ )
256
+
257
+ is_unet_version_less_0_9_0 = (
258
+ unet is not None
259
+ and hasattr(unet.config, "_diffusers_version")
260
+ and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
261
+ )
262
+ self._is_unet_config_sample_size_int = unet is not None and isinstance(unet.config.sample_size, int)
263
+ is_unet_sample_size_less_64 = (
264
+ unet is not None
265
+ and hasattr(unet.config, "sample_size")
266
+ and self._is_unet_config_sample_size_int
267
+ and unet.config.sample_size < 64
268
+ )
269
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
270
+ deprecation_message = (
271
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
272
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
273
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
274
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
275
+ " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
276
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
277
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
278
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
279
+ " the `unet/config.json` file"
280
+ )
281
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
282
+ new_config = dict(unet.config)
283
+ new_config["sample_size"] = 64
284
+ unet._internal_dict = FrozenDict(new_config)
285
+
286
+ self.register_modules(
287
+ vae=vae,
288
+ text_encoder=text_encoder,
289
+ tokenizer=tokenizer,
290
+ unet=unet,
291
+ scheduler=scheduler,
292
+ safety_checker=safety_checker,
293
+ feature_extractor=feature_extractor,
294
+ image_encoder=image_encoder,
295
+ )
296
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
297
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
298
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
299
+
300
+ def _encode_prompt(
301
+ self,
302
+ prompt,
303
+ device,
304
+ num_images_per_prompt,
305
+ do_classifier_free_guidance,
306
+ negative_prompt=None,
307
+ prompt_embeds: Optional[torch.Tensor] = None,
308
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
309
+ lora_scale: Optional[float] = None,
310
+ **kwargs,
311
+ ):
312
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
313
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
314
+
315
+ prompt_embeds_tuple = self.encode_prompt(
316
+ prompt=prompt,
317
+ device=device,
318
+ num_images_per_prompt=num_images_per_prompt,
319
+ do_classifier_free_guidance=do_classifier_free_guidance,
320
+ negative_prompt=negative_prompt,
321
+ prompt_embeds=prompt_embeds,
322
+ negative_prompt_embeds=negative_prompt_embeds,
323
+ lora_scale=lora_scale,
324
+ **kwargs,
325
+ )
326
+
327
+ # concatenate for backwards comp
328
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
329
+
330
+ return prompt_embeds
331
+
332
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.

        Returns:
            `Tuple[torch.Tensor, torch.Tensor]`: the positive prompt embeddings and the negative
            prompt embeddings (the latter only populated under classifier-free guidance).
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        # Batch size comes from `prompt` when given, otherwise from precomputed embeddings.
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            # Re-tokenize without truncation to detect (and warn about) dropped tokens.
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            # Only pass an attention mask if this text encoder variant expects one.
            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # Access the `hidden_states` first, that contains a tuple of
                # all the hidden states from the encoder layers. Then index into
                # the tuple to access the hidden states from the desired layer.
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                # We also need to apply the final LayerNorm here to not mess with the
                # representations. The `last_hidden_states` that we typically use for
                # obtaining the final prompt representations passes through the LayerNorm
                # layer.
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        # Pick a dtype for the embeddings: text encoder if present, else UNet, else keep as-is
        # (the text encoder may be None when only precomputed embeddings are supplied).
        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            # Pad the unconditional input to the same sequence length as the positive embeds.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds
513
+
514
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
515
+ dtype = next(self.image_encoder.parameters()).dtype
516
+
517
+ if not isinstance(image, torch.Tensor):
518
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
519
+
520
+ image = image.to(device=device, dtype=dtype)
521
+ if output_hidden_states:
522
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
523
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
524
+ uncond_image_enc_hidden_states = self.image_encoder(
525
+ torch.zeros_like(image), output_hidden_states=True
526
+ ).hidden_states[-2]
527
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
528
+ num_images_per_prompt, dim=0
529
+ )
530
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
531
+ else:
532
+ image_embeds = self.image_encoder(image).image_embeds
533
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
534
+ uncond_image_embeds = torch.zeros_like(image_embeds)
535
+
536
+ return image_embeds, uncond_image_embeds
537
+
538
    def prepare_ip_adapter_image_embeds(
        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
    ):
        """Build the per-adapter IP-Adapter image embeddings used as extra UNet conditioning.

        Either encodes `ip_adapter_image` (one image per loaded IP Adapter) through the UNet's
        image projection layers, or reuses precomputed `ip_adapter_image_embeds`. Each result is
        duplicated `num_images_per_prompt` times; under classifier-free guidance the negative
        embeddings are concatenated in front along dim 0 (`[negative, positive]`).
        """
        image_embeds = []
        if do_classifier_free_guidance:
            negative_image_embeds = []
        if ip_adapter_image_embeds is None:
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

            # One input image is required per loaded IP Adapter projection layer.
            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
                raise ValueError(
                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

            for single_ip_adapter_image, image_proj_layer in zip(
                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
            ):
                # `ImageProjection` consumes pooled embeds; other projection types need the
                # encoder's hidden states (see `encode_image`).
                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
                single_image_embeds, single_negative_image_embeds = self.encode_image(
                    single_ip_adapter_image, device, 1, output_hidden_state
                )

                # Add a leading axis so the later `torch.cat` duplication works uniformly.
                image_embeds.append(single_image_embeds[None, :])
                if do_classifier_free_guidance:
                    negative_image_embeds.append(single_negative_image_embeds[None, :])
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    # Precomputed embeds are stored as [negative, positive] along dim 0.
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                    negative_image_embeds.append(single_negative_image_embeds)
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
        for i, single_image_embeds in enumerate(image_embeds):
            # Duplicate per generated image, then restack [negative, positive] for CFG.
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            if do_classifier_free_guidance:
                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)

        return ip_adapter_image_embeds
582
+
583
+ def run_safety_checker(self, image, device, dtype):
584
+ if self.safety_checker is None:
585
+ has_nsfw_concept = None
586
+ else:
587
+ if torch.is_tensor(image):
588
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
589
+ else:
590
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
591
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
592
+ image, has_nsfw_concept = self.safety_checker(
593
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
594
+ )
595
+ return image, has_nsfw_concept
596
+
597
+ def decode_latents(self, latents):
598
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
599
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
600
+
601
+ latents = 1 / self.vae.config.scaling_factor * latents
602
+ image = self.vae.decode(latents, return_dict=False)[0]
603
+ image = (image / 2 + 0.5).clamp(0, 1)
604
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
605
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
606
+ return image
607
+
608
+ def prepare_extra_step_kwargs(self, generator, eta):
609
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
610
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
611
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
612
+ # and should be between [0, 1]
613
+
614
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
615
+ extra_step_kwargs = {}
616
+ if accepts_eta:
617
+ extra_step_kwargs["eta"] = eta
618
+
619
+ # check if the scheduler accepts generator
620
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
621
+ if accepts_generator:
622
+ extra_step_kwargs["generator"] = generator
623
+ return extra_step_kwargs
624
+
625
    def check_inputs(
        self,
        prompt,
        height,
        width,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        ip_adapter_image=None,
        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
    ):
        """Validate `__call__` arguments, raising `ValueError` on any inconsistent combination.

        Checks: dimensions divisible by 8 (VAE constraint), callback configuration, mutual
        exclusivity of `prompt`/`prompt_embeds` and of the IP-Adapter inputs, matching embed
        shapes, and the structure of precomputed IP-Adapter embeddings.
        """
        # Latent spatial dims are height/8 x width/8, so pixel dims must be multiples of 8.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )
        # Only tensors listed in `_callback_tensor_inputs` can be surfaced to step-end callbacks.
        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        # Exactly one of `prompt` / `prompt_embeds` must be supplied.
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        # IP-Adapter conditioning may come from images or precomputed embeds, not both.
        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
            raise ValueError(
                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
            )

        if ip_adapter_image_embeds is not None:
            if not isinstance(ip_adapter_image_embeds, list):
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )
693
+
694
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
695
+ shape = (
696
+ batch_size,
697
+ num_channels_latents,
698
+ int(height) // self.vae_scale_factor,
699
+ int(width) // self.vae_scale_factor,
700
+ )
701
+ if isinstance(generator, list) and len(generator) != batch_size:
702
+ raise ValueError(
703
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
704
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
705
+ )
706
+
707
+ if latents is None:
708
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
709
+ else:
710
+ latents = latents.to(device)
711
+
712
+ # scale the initial noise by the standard deviation required by the scheduler
713
+ latents = latents * self.scheduler.init_noise_sigma
714
+ return latents
715
+
716
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
717
+ def get_guidance_scale_embedding(
718
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
719
+ ) -> torch.Tensor:
720
+ """
721
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
722
+
723
+ Args:
724
+ w (`torch.Tensor`):
725
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
726
+ embedding_dim (`int`, *optional*, defaults to 512):
727
+ Dimension of the embeddings to generate.
728
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
729
+ Data type of the generated embeddings.
730
+
731
+ Returns:
732
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
733
+ """
734
+ assert len(w.shape) == 1
735
+ w = w * 1000.0
736
+
737
+ half_dim = embedding_dim // 2
738
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
739
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
740
+ emb = w.to(dtype)[:, None] * emb[None, :]
741
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
742
+ if embedding_dim % 2 == 1: # zero pad
743
+ emb = torch.nn.functional.pad(emb, (0, 1))
744
+ assert emb.shape == (w.shape[0], embedding_dim)
745
+ return emb
746
+
747
+ @property
748
+ def guidance_scale(self):
749
+ return self._guidance_scale
750
+
751
+ @property
752
+ def guidance_rescale(self):
753
+ return self._guidance_rescale
754
+
755
+ @property
756
+ def clip_skip(self):
757
+ return self._clip_skip
758
+
759
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
760
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
761
+ # corresponds to doing no classifier free guidance.
762
+ @property
763
+ def do_classifier_free_guidance(self):
764
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
765
+
766
+ @property
767
+ def cross_attention_kwargs(self):
768
+ return self._cross_attention_kwargs
769
+
770
+ @property
771
+ def num_timesteps(self):
772
+ return self._num_timesteps
773
+
774
+ @property
775
+ def interrupt(self):
776
+ return self._interrupt
777
+
778
+ @torch.no_grad()
779
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
780
+ def __call__(
781
+ self,
782
+ prompt: Union[str, List[str]] = None,
783
+ height: Optional[int] = None,
784
+ width: Optional[int] = None,
785
+ num_inference_steps: int = 50,
786
+ timesteps: List[int] = None,
787
+ sigmas: List[float] = None,
788
+ guidance_scale: float = 7.5,
789
+ negative_prompt: Optional[Union[str, List[str]]] = None,
790
+ num_images_per_prompt: Optional[int] = 1,
791
+ eta: float = 0.0,
792
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
793
+ latents: Optional[torch.Tensor] = None,
794
+ prompt_embeds: Optional[torch.Tensor] = None,
795
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
796
+ ip_adapter_image: Optional[PipelineImageInput] = None,
797
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
798
+ output_type: Optional[str] = "pil",
799
+ return_dict: bool = True,
800
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
801
+ guidance_rescale: float = 0.0,
802
+ clip_skip: Optional[int] = None,
803
+ callback_on_step_end: Optional[
804
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
805
+ ] = None,
806
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
807
+ **kwargs,
808
+ ):
809
+ r"""
810
+ The call function to the pipeline for generation.
811
+
812
+ Args:
813
+ prompt (`str` or `List[str]`, *optional*):
814
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
815
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
816
+ The height in pixels of the generated image.
817
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
818
+ The width in pixels of the generated image.
819
+ num_inference_steps (`int`, *optional*, defaults to 50):
820
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
821
+ expense of slower inference.
822
+ timesteps (`List[int]`, *optional*):
823
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
824
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
825
+ passed will be used. Must be in descending order.
826
+ sigmas (`List[float]`, *optional*):
827
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
828
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
829
+ will be used.
830
+ guidance_scale (`float`, *optional*, defaults to 7.5):
831
+ A higher guidance scale value encourages the model to generate images closely linked to the text
832
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
833
+ negative_prompt (`str` or `List[str]`, *optional*):
834
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
835
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
836
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
837
+ The number of images to generate per prompt.
838
+ eta (`float`, *optional*, defaults to 0.0):
839
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
840
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
841
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
842
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
843
+ generation deterministic.
844
+ latents (`torch.Tensor`, *optional*):
845
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
846
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
847
+ tensor is generated by sampling using the supplied random `generator`.
848
+ prompt_embeds (`torch.Tensor`, *optional*):
849
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
850
+ provided, text embeddings are generated from the `prompt` input argument.
851
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
852
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
853
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
854
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
855
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
856
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
857
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
858
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
859
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
860
+ output_type (`str`, *optional*, defaults to `"pil"`):
861
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
862
+ return_dict (`bool`, *optional*, defaults to `True`):
863
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
864
+ plain tuple.
865
+ cross_attention_kwargs (`dict`, *optional*):
866
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
867
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
868
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
869
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
870
+ Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
871
+ using zero terminal SNR.
872
+ clip_skip (`int`, *optional*):
873
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
874
+ the output of the pre-final layer will be used for computing the prompt embeddings.
875
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
876
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
877
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
878
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
879
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
880
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
881
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
882
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
883
+ `._callback_tensor_inputs` attribute of your pipeline class.
884
+
885
+ Examples:
886
+
887
+ Returns:
888
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
889
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
890
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
891
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
892
+ "not-safe-for-work" (nsfw) content.
893
+ """
894
+
895
+ callback = kwargs.pop("callback", None)
896
+ callback_steps = kwargs.pop("callback_steps", None)
897
+
898
+ if callback is not None:
899
+ deprecate(
900
+ "callback",
901
+ "1.0.0",
902
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
903
+ )
904
+ if callback_steps is not None:
905
+ deprecate(
906
+ "callback_steps",
907
+ "1.0.0",
908
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
909
+ )
910
+
911
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
912
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
913
+
914
+ # 0. Default height and width to unet
915
+ if not height or not width:
916
+ height = (
917
+ self.unet.config.sample_size
918
+ if self._is_unet_config_sample_size_int
919
+ else self.unet.config.sample_size[0]
920
+ )
921
+ width = (
922
+ self.unet.config.sample_size
923
+ if self._is_unet_config_sample_size_int
924
+ else self.unet.config.sample_size[1]
925
+ )
926
+ height, width = height * self.vae_scale_factor, width * self.vae_scale_factor
927
+ # to deal with lora scaling and other possible forward hooks
928
+
929
+ # 1. Check inputs. Raise error if not correct
930
+ self.check_inputs(
931
+ prompt,
932
+ height,
933
+ width,
934
+ callback_steps,
935
+ negative_prompt,
936
+ prompt_embeds,
937
+ negative_prompt_embeds,
938
+ ip_adapter_image,
939
+ ip_adapter_image_embeds,
940
+ callback_on_step_end_tensor_inputs,
941
+ )
942
+
943
+ self._guidance_scale = guidance_scale
944
+ self._guidance_rescale = guidance_rescale
945
+ self._clip_skip = clip_skip
946
+ self._cross_attention_kwargs = cross_attention_kwargs
947
+ self._interrupt = False
948
+
949
+ # 2. Define call parameters
950
+ if prompt is not None and isinstance(prompt, str):
951
+ batch_size = 1
952
+ elif prompt is not None and isinstance(prompt, list):
953
+ batch_size = len(prompt)
954
+ else:
955
+ batch_size = prompt_embeds.shape[0]
956
+
957
+ device = self._execution_device
958
+
959
+ # 3. Encode input prompt
960
+ lora_scale = (
961
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
962
+ )
963
+
964
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
965
+ prompt,
966
+ device,
967
+ num_images_per_prompt,
968
+ self.do_classifier_free_guidance,
969
+ negative_prompt,
970
+ prompt_embeds=prompt_embeds,
971
+ negative_prompt_embeds=negative_prompt_embeds,
972
+ lora_scale=lora_scale,
973
+ clip_skip=self.clip_skip,
974
+ )
975
+
976
+ # For classifier free guidance, we need to do two forward passes.
977
+ # Here we concatenate the unconditional and text embeddings into a single batch
978
+ # to avoid doing two forward passes
979
+ if self.do_classifier_free_guidance:
980
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
981
+
982
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
983
+ image_embeds = self.prepare_ip_adapter_image_embeds(
984
+ ip_adapter_image,
985
+ ip_adapter_image_embeds,
986
+ device,
987
+ batch_size * num_images_per_prompt,
988
+ self.do_classifier_free_guidance,
989
+ )
990
+
991
+ # 4. Prepare timesteps
992
+ timesteps, num_inference_steps = retrieve_timesteps(
993
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
994
+ )
995
+
996
+ # 5. Prepare latent variables
997
+ num_channels_latents = self.unet.config.in_channels
998
+ latents = self.prepare_latents(
999
+ batch_size * num_images_per_prompt,
1000
+ num_channels_latents,
1001
+ height,
1002
+ width,
1003
+ prompt_embeds.dtype,
1004
+ device,
1005
+ generator,
1006
+ latents,
1007
+ )
1008
+
1009
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1010
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1011
+
1012
+ # 6.1 Add image embeds for IP-Adapter
1013
+ added_cond_kwargs = (
1014
+ {"image_embeds": image_embeds}
1015
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
1016
+ else None
1017
+ )
1018
+
1019
+ # 6.2 Optionally get Guidance Scale Embedding
1020
+ timestep_cond = None
1021
+ if self.unet.config.time_cond_proj_dim is not None:
1022
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1023
+ timestep_cond = self.get_guidance_scale_embedding(
1024
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1025
+ ).to(device=device, dtype=latents.dtype)
1026
+
1027
+ # 7. Denoising loop
1028
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1029
+ self._num_timesteps = len(timesteps)
1030
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1031
+ for i, t in enumerate(timesteps):
1032
+ if self.interrupt:
1033
+ continue
1034
+
1035
+ # expand the latents if we are doing classifier free guidance
1036
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1037
+ if hasattr(self.scheduler, "scale_model_input"):
1038
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1039
+
1040
+ # predict the noise residual
1041
+ noise_pred = self.unet(
1042
+ latent_model_input,
1043
+ t,
1044
+ encoder_hidden_states=prompt_embeds,
1045
+ timestep_cond=timestep_cond,
1046
+ cross_attention_kwargs=self.cross_attention_kwargs,
1047
+ added_cond_kwargs=added_cond_kwargs,
1048
+ return_dict=False,
1049
+ )[0]
1050
+
1051
+ # perform guidance
1052
+ if self.do_classifier_free_guidance:
1053
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1054
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1055
+
1056
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1057
+ # Based on 3.4. in https://huggingface.co/papers/2305.08891
1058
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
1059
+
1060
+ # compute the previous noisy sample x_t -> x_t-1
1061
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1062
+
1063
+ if callback_on_step_end is not None:
1064
+ callback_kwargs = {}
1065
+ for k in callback_on_step_end_tensor_inputs:
1066
+ callback_kwargs[k] = locals()[k]
1067
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1068
+
1069
+ latents = callback_outputs.pop("latents", latents)
1070
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1071
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1072
+
1073
+ # call the callback, if provided
1074
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1075
+ progress_bar.update()
1076
+ if callback is not None and i % callback_steps == 0:
1077
+ step_idx = i // getattr(self.scheduler, "order", 1)
1078
+ callback(step_idx, t, latents)
1079
+
1080
+ if XLA_AVAILABLE:
1081
+ xm.mark_step()
1082
+
1083
+ if not output_type == "latent":
1084
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
1085
+ 0
1086
+ ]
1087
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
1088
+ else:
1089
+ image = latents
1090
+ has_nsfw_concept = None
1091
+
1092
+ if has_nsfw_concept is None:
1093
+ do_denormalize = [True] * image.shape[0]
1094
+ else:
1095
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1096
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1097
+
1098
+ # Offload all models
1099
+ self.maybe_free_model_hooks()
1100
+
1101
+ if not return_dict:
1102
+ return (image, has_nsfw_concept)
1103
+
1104
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import contextlib
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ from packaging import version
23
+ from transformers import CLIPTextModel, CLIPTokenizer, DPTForDepthEstimation, DPTImageProcessor
24
+
25
+ from ...configuration_utils import FrozenDict
26
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
27
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
28
+ from ...models import AutoencoderKL, UNet2DConditionModel
29
+ from ...models.lora import adjust_lora_scale_text_encoder
30
+ from ...schedulers import KarrasDiffusionSchedulers
31
+ from ...utils import (
32
+ PIL_INTERPOLATION,
33
+ USE_PEFT_BACKEND,
34
+ deprecate,
35
+ is_torch_xla_available,
36
+ logging,
37
+ scale_lora_layers,
38
+ unscale_lora_layers,
39
+ )
40
+ from ...utils.torch_utils import randn_tensor
41
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
42
+
43
+
44
+ if is_torch_xla_available():
45
+ import torch_xla.core.xla_model as xm
46
+
47
+ XLA_AVAILABLE = True
48
+ else:
49
+ XLA_AVAILABLE = False
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+
54
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
55
+ def retrieve_latents(
56
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
57
+ ):
58
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
59
+ return encoder_output.latent_dist.sample(generator)
60
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
61
+ return encoder_output.latent_dist.mode()
62
+ elif hasattr(encoder_output, "latents"):
63
+ return encoder_output.latents
64
+ else:
65
+ raise AttributeError("Could not access latents of provided encoder_output")
66
+
67
+
68
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
69
+ def preprocess(image):
70
+ deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
71
+ deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
72
+ if isinstance(image, torch.Tensor):
73
+ return image
74
+ elif isinstance(image, PIL.Image.Image):
75
+ image = [image]
76
+
77
+ if isinstance(image[0], PIL.Image.Image):
78
+ w, h = image[0].size
79
+ w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
80
+
81
+ image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
82
+ image = np.concatenate(image, axis=0)
83
+ image = np.array(image).astype(np.float32) / 255.0
84
+ image = image.transpose(0, 3, 1, 2)
85
+ image = 2.0 * image - 1.0
86
+ image = torch.from_numpy(image)
87
+ elif isinstance(image[0], torch.Tensor):
88
+ image = torch.cat(image, dim=0)
89
+ return image
90
+
91
+
92
+ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin):
93
+ r"""
94
+ Pipeline for text-guided depth-based image-to-image generation using Stable Diffusion.
95
+
96
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
97
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
98
+
99
+ The pipeline also inherits the following loading methods:
100
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
101
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
102
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
103
+
104
+ Args:
105
+ vae ([`AutoencoderKL`]):
106
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
107
+ text_encoder ([`~transformers.CLIPTextModel`]):
108
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
109
+ tokenizer ([`~transformers.CLIPTokenizer`]):
110
+ A `CLIPTokenizer` to tokenize text.
111
+ unet ([`UNet2DConditionModel`]):
112
+ A `UNet2DConditionModel` to denoise the encoded image latents.
113
+ scheduler ([`SchedulerMixin`]):
114
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
115
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
116
+ """
117
+
118
+ model_cpu_offload_seq = "text_encoder->unet->vae"
119
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "depth_mask"]
120
+
121
+ def __init__(
122
+ self,
123
+ vae: AutoencoderKL,
124
+ text_encoder: CLIPTextModel,
125
+ tokenizer: CLIPTokenizer,
126
+ unet: UNet2DConditionModel,
127
+ scheduler: KarrasDiffusionSchedulers,
128
+ depth_estimator: DPTForDepthEstimation,
129
+ feature_extractor: DPTImageProcessor,
130
+ ):
131
+ super().__init__()
132
+
133
+ is_unet_version_less_0_9_0 = (
134
+ unet is not None
135
+ and hasattr(unet.config, "_diffusers_version")
136
+ and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
137
+ )
138
+ is_unet_sample_size_less_64 = (
139
+ unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
140
+ )
141
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
142
+ deprecation_message = (
143
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
144
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
145
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
146
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
147
+ " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
148
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
149
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
150
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
151
+ " the `unet/config.json` file"
152
+ )
153
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
154
+ new_config = dict(unet.config)
155
+ new_config["sample_size"] = 64
156
+ unet._internal_dict = FrozenDict(new_config)
157
+
158
+ self.register_modules(
159
+ vae=vae,
160
+ text_encoder=text_encoder,
161
+ tokenizer=tokenizer,
162
+ unet=unet,
163
+ scheduler=scheduler,
164
+ depth_estimator=depth_estimator,
165
+ feature_extractor=feature_extractor,
166
+ )
167
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
168
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
169
+
170
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
171
+ def _encode_prompt(
172
+ self,
173
+ prompt,
174
+ device,
175
+ num_images_per_prompt,
176
+ do_classifier_free_guidance,
177
+ negative_prompt=None,
178
+ prompt_embeds: Optional[torch.Tensor] = None,
179
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
180
+ lora_scale: Optional[float] = None,
181
+ **kwargs,
182
+ ):
183
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
184
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
185
+
186
+ prompt_embeds_tuple = self.encode_prompt(
187
+ prompt=prompt,
188
+ device=device,
189
+ num_images_per_prompt=num_images_per_prompt,
190
+ do_classifier_free_guidance=do_classifier_free_guidance,
191
+ negative_prompt=negative_prompt,
192
+ prompt_embeds=prompt_embeds,
193
+ negative_prompt_embeds=negative_prompt_embeds,
194
+ lora_scale=lora_scale,
195
+ **kwargs,
196
+ )
197
+
198
+ # concatenate for backwards comp
199
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
200
+
201
+ return prompt_embeds
202
+
203
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
204
+ def encode_prompt(
205
+ self,
206
+ prompt,
207
+ device,
208
+ num_images_per_prompt,
209
+ do_classifier_free_guidance,
210
+ negative_prompt=None,
211
+ prompt_embeds: Optional[torch.Tensor] = None,
212
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
213
+ lora_scale: Optional[float] = None,
214
+ clip_skip: Optional[int] = None,
215
+ ):
216
+ r"""
217
+ Encodes the prompt into text encoder hidden states.
218
+
219
+ Args:
220
+ prompt (`str` or `List[str]`, *optional*):
221
+ prompt to be encoded
222
+ device: (`torch.device`):
223
+ torch device
224
+ num_images_per_prompt (`int`):
225
+ number of images that should be generated per prompt
226
+ do_classifier_free_guidance (`bool`):
227
+ whether to use classifier free guidance or not
228
+ negative_prompt (`str` or `List[str]`, *optional*):
229
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
230
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
231
+ less than `1`).
232
+ prompt_embeds (`torch.Tensor`, *optional*):
233
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
234
+ provided, text embeddings will be generated from `prompt` input argument.
235
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
236
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
237
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
238
+ argument.
239
+ lora_scale (`float`, *optional*):
240
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
241
+ clip_skip (`int`, *optional*):
242
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
243
+ the output of the pre-final layer will be used for computing the prompt embeddings.
244
+ """
245
+ # set lora scale so that monkey patched LoRA
246
+ # function of text encoder can correctly access it
247
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
248
+ self._lora_scale = lora_scale
249
+
250
+ # dynamically adjust the LoRA scale
251
+ if not USE_PEFT_BACKEND:
252
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
253
+ else:
254
+ scale_lora_layers(self.text_encoder, lora_scale)
255
+
256
+ if prompt is not None and isinstance(prompt, str):
257
+ batch_size = 1
258
+ elif prompt is not None and isinstance(prompt, list):
259
+ batch_size = len(prompt)
260
+ else:
261
+ batch_size = prompt_embeds.shape[0]
262
+
263
+ if prompt_embeds is None:
264
+ # textual inversion: process multi-vector tokens if necessary
265
+ if isinstance(self, TextualInversionLoaderMixin):
266
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
267
+
268
+ text_inputs = self.tokenizer(
269
+ prompt,
270
+ padding="max_length",
271
+ max_length=self.tokenizer.model_max_length,
272
+ truncation=True,
273
+ return_tensors="pt",
274
+ )
275
+ text_input_ids = text_inputs.input_ids
276
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
277
+
278
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
279
+ text_input_ids, untruncated_ids
280
+ ):
281
+ removed_text = self.tokenizer.batch_decode(
282
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
283
+ )
284
+ logger.warning(
285
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
286
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
287
+ )
288
+
289
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
290
+ attention_mask = text_inputs.attention_mask.to(device)
291
+ else:
292
+ attention_mask = None
293
+
294
+ if clip_skip is None:
295
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
296
+ prompt_embeds = prompt_embeds[0]
297
+ else:
298
+ prompt_embeds = self.text_encoder(
299
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
300
+ )
301
+ # Access the `hidden_states` first, that contains a tuple of
302
+ # all the hidden states from the encoder layers. Then index into
303
+ # the tuple to access the hidden states from the desired layer.
304
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
305
+ # We also need to apply the final LayerNorm here to not mess with the
306
+ # representations. The `last_hidden_states` that we typically use for
307
+ # obtaining the final prompt representations passes through the LayerNorm
308
+ # layer.
309
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
310
+
311
+ if self.text_encoder is not None:
312
+ prompt_embeds_dtype = self.text_encoder.dtype
313
+ elif self.unet is not None:
314
+ prompt_embeds_dtype = self.unet.dtype
315
+ else:
316
+ prompt_embeds_dtype = prompt_embeds.dtype
317
+
318
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
319
+
320
+ bs_embed, seq_len, _ = prompt_embeds.shape
321
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
322
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
323
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
324
+
325
+ # get unconditional embeddings for classifier free guidance
326
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
327
+ uncond_tokens: List[str]
328
+ if negative_prompt is None:
329
+ uncond_tokens = [""] * batch_size
330
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
331
+ raise TypeError(
332
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
333
+ f" {type(prompt)}."
334
+ )
335
+ elif isinstance(negative_prompt, str):
336
+ uncond_tokens = [negative_prompt]
337
+ elif batch_size != len(negative_prompt):
338
+ raise ValueError(
339
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
340
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
341
+ " the batch size of `prompt`."
342
+ )
343
+ else:
344
+ uncond_tokens = negative_prompt
345
+
346
+ # textual inversion: process multi-vector tokens if necessary
347
+ if isinstance(self, TextualInversionLoaderMixin):
348
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
349
+
350
+ max_length = prompt_embeds.shape[1]
351
+ uncond_input = self.tokenizer(
352
+ uncond_tokens,
353
+ padding="max_length",
354
+ max_length=max_length,
355
+ truncation=True,
356
+ return_tensors="pt",
357
+ )
358
+
359
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
360
+ attention_mask = uncond_input.attention_mask.to(device)
361
+ else:
362
+ attention_mask = None
363
+
364
+ negative_prompt_embeds = self.text_encoder(
365
+ uncond_input.input_ids.to(device),
366
+ attention_mask=attention_mask,
367
+ )
368
+ negative_prompt_embeds = negative_prompt_embeds[0]
369
+
370
+ if do_classifier_free_guidance:
371
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
372
+ seq_len = negative_prompt_embeds.shape[1]
373
+
374
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
375
+
376
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
377
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
378
+
379
+ if self.text_encoder is not None:
380
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
381
+ # Retrieve the original scale by scaling back the LoRA layers
382
+ unscale_lora_layers(self.text_encoder, lora_scale)
383
+
384
+ return prompt_embeds, negative_prompt_embeds
385
+
386
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        """Run the NSFW safety checker over a decoded image batch.

        Args:
            image: Decoded images, either a torch tensor or a numpy array batch.
            device: Device to move the CLIP feature-extractor inputs to.
            dtype: Dtype the safety checker's CLIP input is cast to.

        Returns:
            Tuple of (image, has_nsfw_concept). `has_nsfw_concept` is `None` when
            no safety checker is configured; otherwise it is whatever the checker
            returns alongside the (possibly filtered) images.
        """
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            # The feature extractor needs PIL images; convert from tensor or numpy first.
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept
400
+
401
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
    def decode_latents(self, latents):
        """Deprecated: decode VAE latents into a float32 numpy image batch in NHWC layout.

        Kept only for backward compatibility; emits a deprecation warning and will be
        removed in 1.0.0 in favor of `VaeImageProcessor.postprocess(...)`.
        """
        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

        # Undo the scaling applied when the image was encoded into latent space.
        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents, return_dict=False)[0]
        # Map from the VAE's [-1, 1] output range to [0, 1].
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
        return image
412
+
413
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        """Build the subset of `{eta, generator}` kwargs that this scheduler's `step()` accepts.

        Inspects `self.scheduler.step`'s signature so that schedulers which do not take
        `eta` (non-DDIM) or `generator` are not passed unexpected keyword arguments.
        """
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs
430
+
431
+ def check_inputs(
432
+ self,
433
+ prompt,
434
+ strength,
435
+ callback_steps,
436
+ negative_prompt=None,
437
+ prompt_embeds=None,
438
+ negative_prompt_embeds=None,
439
+ callback_on_step_end_tensor_inputs=None,
440
+ ):
441
+ if strength < 0 or strength > 1:
442
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
443
+
444
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
445
+ raise ValueError(
446
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
447
+ f" {type(callback_steps)}."
448
+ )
449
+
450
+ if callback_on_step_end_tensor_inputs is not None and not all(
451
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
452
+ ):
453
+ raise ValueError(
454
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
455
+ )
456
+ if prompt is not None and prompt_embeds is not None:
457
+ raise ValueError(
458
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
459
+ " only forward one of the two."
460
+ )
461
+ elif prompt is None and prompt_embeds is None:
462
+ raise ValueError(
463
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
464
+ )
465
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
466
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
467
+
468
+ if negative_prompt is not None and negative_prompt_embeds is not None:
469
+ raise ValueError(
470
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
471
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
472
+ )
473
+
474
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
475
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
476
+ raise ValueError(
477
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
478
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
479
+ f" {negative_prompt_embeds.shape}."
480
+ )
481
+
482
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        """Truncate the scheduler's timestep schedule according to img2img `strength`.

        A strength of 1.0 keeps the full schedule; smaller values skip the earliest
        (noisiest) steps so denoising starts from a partially-noised input image.

        Returns:
            Tuple of (timesteps tensor to iterate, effective number of inference steps).
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        # Skip the first `t_start` schedule entries, scaled by the scheduler order
        # (higher-order schedulers emit multiple timesteps per inference step).
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
        # Some schedulers track an internal begin index; keep it in sync with the slice.
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start
493
+
494
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents
    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
        """Encode `image` into VAE latents, duplicate to the effective batch size, and noise to `timestep`.

        `image` may already be latents (4 channels) in which case encoding is skipped.
        The returned tensor is `scheduler.add_noise(init_latents, noise, timestep)`.
        """
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        # NOTE(review): `.to(...)` assumes `image` is already a tensor here —
        # presumably preprocessing upstream converted PIL/list inputs; confirm against callers.
        image = image.to(device=device, dtype=dtype)

        batch_size = batch_size * num_images_per_prompt

        # A 4-channel input is treated as pre-encoded latents and used as-is.
        if image.shape[1] == 4:
            init_latents = image

        else:
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            elif isinstance(generator, list):
                # With a generator list, the image batch is expanded first so each
                # sample can be encoded with its own generator.
                if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
                    image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
                elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
                    raise ValueError(
                        f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
                    )

                init_latents = [
                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                    for i in range(batch_size)
                ]
                init_latents = torch.cat(init_latents, dim=0)
            else:
                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

            # Scale into the latent space the UNet was trained on.
            init_latents = self.vae.config.scaling_factor * init_latents

        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
            # expand init_latents for batch_size
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
            )
            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // init_latents.shape[0]
            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            init_latents = torch.cat([init_latents], dim=0)

        shape = init_latents.shape
        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

        # get latents: noise the clean latents forward to the starting timestep
        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
        latents = init_latents

        return latents
559
+
560
    def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype, device):
        """Produce the depth conditioning map at latent resolution, normalized to [-1, 1].

        If `depth_map` is not supplied it is predicted from `image` with
        `self.depth_estimator`. The map is resized to the latent grid, normalized
        per-sample, repeated to `batch_size`, and doubled along the batch dimension
        when classifier-free guidance is active.
        """
        if isinstance(image, PIL.Image.Image):
            image = [image]
        else:
            image = list(image)

        if isinstance(image[0], PIL.Image.Image):
            width, height = image[0].size
        elif isinstance(image[0], np.ndarray):
            # NOTE(review): for an HWC ndarray, shape[:-1] is (rows, cols) = (height, width),
            # so this unpack looks swapped relative to the PIL branch — confirm upstream intent.
            width, height = image[0].shape[:-1]
        else:
            height, width = image[0].shape[-2:]

        if depth_map is None:
            pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(device=device, dtype=dtype)
            # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16.
            # So we use `torch.autocast` here for half precision inference.
            if torch.backends.mps.is_available():
                autocast_ctx = contextlib.nullcontext()
                logger.warning(
                    "The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16, but autocast is not yet supported on MPS."
                )
            else:
                autocast_ctx = torch.autocast(device.type, dtype=dtype)

            with autocast_ctx:
                depth_map = self.depth_estimator(pixel_values).predicted_depth
        else:
            depth_map = depth_map.to(device=device, dtype=dtype)

        # Resize to the latent-space resolution so it can be concatenated with the latents.
        depth_map = torch.nn.functional.interpolate(
            depth_map.unsqueeze(1),
            size=(height // self.vae_scale_factor, width // self.vae_scale_factor),
            mode="bicubic",
            align_corners=False,
        )

        # Per-sample min/max normalization into [-1, 1].
        depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
        depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
        depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0
        depth_map = depth_map.to(dtype)

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if depth_map.shape[0] < batch_size:
            repeat_by = batch_size // depth_map.shape[0]
            depth_map = depth_map.repeat(repeat_by, 1, 1, 1)

        # Double the batch for CFG so the uncond/cond halves share the same depth conditioning.
        depth_map = torch.cat([depth_map] * 2) if do_classifier_free_guidance else depth_map
        return depth_map
610
+
611
    @property
    def guidance_scale(self):
        """Classifier-free guidance weight captured at the start of `__call__`."""
        return self._guidance_scale
614
+
615
    @property
    def clip_skip(self):
        """Number of CLIP layers to skip when encoding prompts (set in `__call__`)."""
        return self._clip_skip
618
+
619
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        """Whether classifier-free guidance is active (`guidance_scale > 1`)."""
        return self._guidance_scale > 1
625
+
626
    @property
    def cross_attention_kwargs(self):
        """Extra kwargs forwarded to the attention processors (set in `__call__`)."""
        return self._cross_attention_kwargs
629
+
630
    @property
    def num_timesteps(self):
        """Length of the timestep schedule used by the most recent `__call__`."""
        return self._num_timesteps
633
+
634
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        depth_map: Optional[torch.Tensor] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image` or tensor representing an image batch to be used as the starting point. Can accept image
                latents as `image` only if `depth_map` is not `None`.
            depth_map (`torch.Tensor`, *optional*):
                Depth prediction to be used as additional conditioning for the image generation process. If not
                defined, it automatically predicts the depth with `self.depth_estimator`.
            strength (`float`, *optional*, defaults to 0.8):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
        Examples:

        ```py
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> from diffusers import StableDiffusionDepth2ImgPipeline

        >>> pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
        ...     "stabilityai/stable-diffusion-2-depth",
        ...     torch_dtype=torch.float16,
        ... )
        >>> pipe.to("cuda")


        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> init_image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "two tigers"
        >>> n_prompt = "bad, deformed, ugly, bad anotomy"
        >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
        ```

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images.
        """

        # Legacy per-step callback API; superseded by `callback_on_step_end`.
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

        # 1. Check inputs
        self.check_inputs(
            prompt,
            strength,
            callback_steps,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        )

        # Stash per-call settings so the properties (`guidance_scale`, etc.) reflect this run.
        self._guidance_scale = guidance_scale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs

        if image is None:
            raise ValueError("`image` input cannot be undefined.")

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=self.clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Prepare depth mask
        depth_mask = self.prepare_depth_map(
            image,
            depth_map,
            batch_size * num_images_per_prompt,
            self.do_classifier_free_guidance,
            prompt_embeds.dtype,
            device,
        )

        # 5. Preprocess image
        image = self.image_processor.preprocess(image)

        # 6. Set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

        # 7. Prepare latent variables
        latents = self.prepare_latents(
            image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
        )

        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 9. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                # Depth conditioning is channel-concatenated onto the latent input.
                latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=self.cross_attention_kwargs,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if callback_on_step_end is not None:
                    # NOTE: `locals()` lookup only works for names bound in this frame;
                    # valid keys are constrained earlier by `check_inputs`.
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                    depth_mask = callback_outputs.pop("depth_mask", depth_mask)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        else:
            image = latents

        image = self.image_processor.postprocess(image, output_type=output_type)
        # Re-offload models moved to the accelerator during this call (if offloading is enabled).
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Callable, List, Optional, Union
17
+
18
+ import PIL.Image
19
+ import torch
20
+ from packaging import version
21
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
22
+
23
+ from ...configuration_utils import FrozenDict
24
+ from ...image_processor import VaeImageProcessor
25
+ from ...models import AutoencoderKL, UNet2DConditionModel
26
+ from ...schedulers import KarrasDiffusionSchedulers
27
+ from ...utils import deprecate, is_torch_xla_available, logging
28
+ from ...utils.torch_utils import randn_tensor
29
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
30
+ from . import StableDiffusionPipelineOutput
31
+ from .safety_checker import StableDiffusionSafetyChecker
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+
44
class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMixin):
    r"""
    Pipeline to generate image variations from an input image using Stable Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
            more details about a model's potential harms.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
    """

    # TODO: feature_extractor is required to encode images (if they are in PIL format),
    # we should give a descriptive message if the pipeline doesn't have one.
    _optional_components = ["safety_checker"]
    model_cpu_offload_seq = "image_encoder->unet->vae"
    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
        self,
        vae: AutoencoderKL,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        if safety_checker is None and requires_safety_checker:
            logger.warning(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
            )

        if safety_checker is not None and feature_extractor is None:
            raise ValueError(
                # Fix: this message was missing the `f` prefix, so `{self.__class__}` was emitted literally.
                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

        # Checkpoints exported before diffusers 0.9.0 may carry `sample_size=32` even for 512px models;
        # patch the config in place (with a deprecation notice) so latents are prepared at the right size.
        is_unet_version_less_0_9_0 = (
            unet is not None
            and hasattr(unet.config, "_diffusers_version")
            and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
        )
        is_unet_sample_size_less_64 = (
            unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
        )
        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
            deprecation_message = (
                "The configuration file of the unet has set the default `sample_size` to smaller than"
                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
                " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                " in the config might lead to incorrect results in future versions. If you have downloaded this"
                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                " the `unet/config.json` file"
            )
            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(unet.config)
            new_config["sample_size"] = 64
            unet._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
        """Encode the conditioning image into CLIP image embeddings of shape
        `(batch * num_images_per_prompt, 1, embed_dim)`, optionally prepending
        zero embeddings for classifier-free guidance."""
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # Unconditional branch uses all-zero embeddings rather than an empty-prompt encode.
            negative_prompt_embeds = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])

        return image_embeddings

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
            has_nsfw_concept = None
        else:
            if torch.is_tensor(image):
                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
            else:
                feature_extractor_input = self.image_processor.numpy_to_pil(image)
            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
            )
        return image, has_nsfw_concept

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
    def decode_latents(self, latents):
        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents, return_dict=False)[0]
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
        return image

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(self, image, height, width, callback_steps):
        """Validate user-facing call arguments; raises `ValueError` on any invalid input."""
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        # The VAE downsamples by a factor of 8, so output dimensions must be multiples of 8.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (
            batch_size,
            num_channels_latents,
            int(height) // self.vae_scale_factor,
            int(width) // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.

        Examples:

        ```py
        from diffusers import StableDiffusionImageVariationPipeline
        from PIL import Image
        from io import BytesIO
        import requests

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
        )
        pipe = pipe.to("cuda")

        url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"

        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")

        out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
        out["images"][0].save("result.jpg")
        ```
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width, callback_steps)

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input image
        image_embeddings = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance)

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload all models. Fix: this used to be called twice — once before `vae.decode`
        # (forcing a redundant offload/onload cycle of the VAE) and once here. Keep only the
        # final call, matching the other Stable Diffusion pipelines.
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py ADDED
@@ -0,0 +1,1161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from packaging import version
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
23
+
24
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
25
+ from ...configuration_utils import FrozenDict
26
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
27
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
28
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
29
+ from ...models.lora import adjust_lora_scale_text_encoder
30
+ from ...schedulers import KarrasDiffusionSchedulers
31
+ from ...utils import (
32
+ PIL_INTERPOLATION,
33
+ USE_PEFT_BACKEND,
34
+ deprecate,
35
+ is_torch_xla_available,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from ...utils.torch_utils import randn_tensor
42
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
43
+ from . import StableDiffusionPipelineOutput
44
+ from .safety_checker import StableDiffusionSafetyChecker
45
+
46
+
47
+ if is_torch_xla_available():
48
+ import torch_xla.core.xla_model as xm
49
+
50
+ XLA_AVAILABLE = True
51
+ else:
52
+ XLA_AVAILABLE = False
53
+
54
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
55
+
56
+
57
# Usage example injected verbatim into the pipeline's `__call__` docstring via the
# `@replace_example_docstring` decorator; keep it runnable and in sync with the API.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from diffusers import StableDiffusionImg2ImgPipeline

        >>> device = "cuda"
        >>> model_id_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
        >>> pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
        >>> pipe = pipe.to(device)

        >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

        >>> response = requests.get(url)
        >>> init_image = Image.open(BytesIO(response.content)).convert("RGB")
        >>> init_image = init_image.resize((768, 512))

        >>> prompt = "A fantasy landscape, trending on artstation"

        >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
        >>> images[0].save("fantasy_landscape.png")
        ```
"""
84
+
85
+
86
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Pull a latent tensor out of a VAE encoder output.

    Supports outputs carrying a `latent_dist` (sampled or mode-reduced depending on
    `sample_mode`) as well as outputs exposing pre-computed `latents` directly.
    Raises `AttributeError` when neither attribute is present.
    """
    if hasattr(encoder_output, "latent_dist"):
        dist = encoder_output.latent_dist
        if sample_mode == "sample":
            # Stochastic draw from the posterior; `generator` makes it reproducible.
            return dist.sample(generator)
        if sample_mode == "argmax":
            # Deterministic: use the distribution's mode instead of sampling.
            return dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
97
+
98
+
99
def preprocess(image):
    """Deprecated: convert a PIL image, list of images, or tensor into a normalized NCHW float tensor in [-1, 1].

    Kept only for backward compatibility; `VaeImageProcessor.preprocess(...)` is the replacement.
    """
    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)

    if isinstance(image, torch.Tensor):
        # Tensors are assumed to already be preprocessed; pass them through untouched.
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    first = image[0]
    if isinstance(first, PIL.Image.Image):
        # Snap dimensions down to the nearest multiple of 8 (VAE downsampling requirement),
        # using the first image's size for the whole batch.
        width, height = (side - side % 8 for side in first.size)

        resized = [
            np.array(img.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for img in image
        ]
        batch = np.concatenate(resized, axis=0)
        batch = np.array(batch).astype(np.float32) / 255.0
        # HWC -> CHW, then rescale from [0, 1] to [-1, 1].
        batch = batch.transpose(0, 3, 1, 2)
        batch = 2.0 * batch - 1.0
        image = torch.from_numpy(batch)
    elif isinstance(first, torch.Tensor):
        image = torch.cat(image, dim=0)
    return image
120
+
121
+
122
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    def _accepts(arg_name: str) -> bool:
        # A scheduler only supports a custom schedule if its `set_timesteps` signature names the argument.
        return arg_name in set(inspect.signature(scheduler.set_timesteps).parameters.keys())

    if timesteps is None and sigmas is None:
        # Standard path: let the scheduler derive its own spacing for the requested step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    if timesteps is not None:
        if not _accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if not _accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    # With a custom schedule, the effective step count is whatever the scheduler materialized.
    retrieved = scheduler.timesteps
    return retrieved, len(retrieved)
180
+
181
+
182
+ class StableDiffusionImg2ImgPipeline(
183
+ DiffusionPipeline,
184
+ StableDiffusionMixin,
185
+ TextualInversionLoaderMixin,
186
+ IPAdapterMixin,
187
+ StableDiffusionLoraLoaderMixin,
188
+ FromSingleFileMixin,
189
+ ):
190
+ r"""
191
+ Pipeline for text-guided image-to-image generation using Stable Diffusion.
192
+
193
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
194
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
195
+
196
+ The pipeline also inherits the following loading methods:
197
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
198
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
199
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
200
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
201
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
202
+
203
+ Args:
204
+ vae ([`AutoencoderKL`]):
205
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
206
+ text_encoder ([`~transformers.CLIPTextModel`]):
207
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
208
+ tokenizer ([`~transformers.CLIPTokenizer`]):
209
+ A `CLIPTokenizer` to tokenize text.
210
+ unet ([`UNet2DConditionModel`]):
211
+ A `UNet2DConditionModel` to denoise the encoded image latents.
212
+ scheduler ([`SchedulerMixin`]):
213
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
214
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
215
+ safety_checker ([`StableDiffusionSafetyChecker`]):
216
+ Classification module that estimates whether generated images could be considered offensive or harmful.
217
+ Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
218
+ more details about a model's potential harms.
219
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
220
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
221
+ """
222
+
223
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
224
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
225
+ _exclude_from_cpu_offload = ["safety_checker"]
226
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
227
+
228
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    image_encoder: CLIPVisionModelWithProjection = None,
    requires_safety_checker: bool = True,
):
    """Wire up the pipeline components, migrating outdated component configs in place.

    Legacy scheduler configs (`steps_offset != 1`, `clip_sample=True`) and pre-0.9.0
    UNet configs (`sample_size < 64`) are patched with a deprecation warning so old
    checkpoints keep producing correct results.

    Raises:
        ValueError: if `safety_checker` is provided without a `feature_extractor`.
    """
    super().__init__()

    # Legacy checkpoints shipped schedulers with `steps_offset != 1`; patch the
    # frozen config in place (with a deprecation warning) so results stay correct.
    if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["steps_offset"] = 1
        scheduler._internal_dict = FrozenDict(new_config)

    # Same migration for schedulers that still enable `clip_sample`.
    if scheduler is not None and getattr(scheduler.config, "clip_sample", False) is True:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
            " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
            " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
            " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
            " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
        )
        deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["clip_sample"] = False
        scheduler._internal_dict = FrozenDict(new_config)

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        raise ValueError(
            # FIX: this message was previously a plain (non-f) string, so users saw
            # the literal text "{self.__class__}" instead of the actual class name.
            f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    # UNets exported before diffusers 0.9.0 may declare `sample_size < 64`; patch to 64.
    is_unet_version_less_0_9_0 = (
        unet is not None
        and hasattr(unet.config, "_diffusers_version")
        and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
    )
    is_unet_sample_size_less_64 = (
        unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
    )
    if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
        deprecation_message = (
            "The configuration file of the unet has set the default `sample_size` to smaller than"
            " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
            " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
            " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
            " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
            " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
            " in the config might lead to incorrect results in future versions. If you have downloaded this"
            " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
            " the `unet/config.json` file"
        )
        deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(unet.config)
        new_config["sample_size"] = 64
        unet._internal_dict = FrozenDict(new_config)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
    )
    # 8 is the conventional SD spatial downsampling factor, used as fallback when no VAE is registered.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.register_to_config(requires_safety_checker=requires_safety_checker)
323
+
324
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
325
def _encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    lora_scale: Optional[float] = None,
    **kwargs,
):
    """Deprecated shim around `encode_prompt`.

    Preserves the legacy return format: a single tensor with the unconditional
    embeddings concatenated before the conditional ones.
    """
    deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
    deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)

    positive_embeds, negative_embeds = self.encode_prompt(
        prompt=prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=lora_scale,
        **kwargs,
    )

    # Legacy layout: unconditional first, conditional second.
    return torch.cat([negative_embeds, positive_embeds])
356
+
357
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
358
def encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        lora_scale (`float`, *optional*):
            A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        A `(prompt_embeds, negative_prompt_embeds)` tuple; the negative embeddings are only computed
        here when classifier-free guidance is enabled (otherwise the input value is passed through).
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if not USE_PEFT_BACKEND:
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
        else:
            scale_lora_layers(self.text_encoder, lora_scale)

    # Derive the batch size from whichever prompt input was provided.
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        # Warn (rather than fail) when the prompt exceeds the tokenizer's max length.
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask.to(device)
        else:
            attention_mask = None

        if clip_skip is None:
            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
            prompt_embeds = prompt_embeds[0]
        else:
            prompt_embeds = self.text_encoder(
                text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
            )
            # Access the `hidden_states` first, that contains a tuple of
            # all the hidden states from the encoder layers. Then index into
            # the tuple to access the hidden states from the desired layer.
            prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
            # We also need to apply the final LayerNorm here to not mess with the
            # representations. The `last_hidden_states` that we typically use for
            # obtaining the final prompt representations passes through the LayerNorm
            # layer.
            prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

    # Pick a target dtype even when the text encoder has been offloaded/removed.
    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

        # Pad the negative prompt to the same sequence length as the positive one.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask.to(device)
        else:
            attention_mask = None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids.to(device),
            attention_mask=attention_mask,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    if self.text_encoder is not None:
        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

    return prompt_embeds, negative_prompt_embeds
539
+
540
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
541
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """Embed an IP-Adapter reference image (and a matching negative) with the CLIP image encoder.

    When `output_hidden_states` is falsy, returns pooled projection embeddings with an
    all-zero negative; otherwise returns the penultimate hidden states, with the
    negative obtained by encoding an all-zero image. Both outputs are repeated
    `num_images_per_prompt` times along the batch dimension.
    """
    encoder_dtype = next(self.image_encoder.parameters()).dtype

    # Raw (non-tensor) inputs are converted to pixel values first.
    if not isinstance(image, torch.Tensor):
        image = self.feature_extractor(image, return_tensors="pt").pixel_values
    image = image.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        # Pooled projection embeddings; the negative branch is simply zeros.
        embeds = self.image_encoder(image).image_embeds
        embeds = embeds.repeat_interleave(num_images_per_prompt, dim=0)
        return embeds, torch.zeros_like(embeds)

    # Penultimate hidden states; the negative branch re-encodes a black image.
    hidden = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
    hidden = hidden.repeat_interleave(num_images_per_prompt, dim=0)
    uncond_hidden = self.image_encoder(torch.zeros_like(image), output_hidden_states=True).hidden_states[-2]
    uncond_hidden = uncond_hidden.repeat_interleave(num_images_per_prompt, dim=0)
    return hidden, uncond_hidden
564
+
565
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
566
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """Build per-adapter image embeddings for IP-Adapter conditioning.

    Either encodes `ip_adapter_image` (one entry per loaded IP-Adapter) or reuses
    precomputed `ip_adapter_image_embeds`. Returns one tensor per adapter, repeated
    `num_images_per_prompt` times and, under classifier-free guidance, with the
    negative embeddings concatenated before the positive ones along dim 0.
    """
    image_embeds = []
    if do_classifier_free_guidance:
        negative_image_embeds = []
    if ip_adapter_image_embeds is None:
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        # One reference image (or image list) is required per image-projection layer.
        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        for single_ip_adapter_image, image_proj_layer in zip(
            ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
        ):
            # Plain `ImageProjection` layers consume pooled embeds; other projection
            # types consume hidden states (see `encode_image`).
            output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
            single_image_embeds, single_negative_image_embeds = self.encode_image(
                single_ip_adapter_image, device, 1, output_hidden_state
            )

            image_embeds.append(single_image_embeds[None, :])
            if do_classifier_free_guidance:
                negative_image_embeds.append(single_negative_image_embeds[None, :])
    else:
        for single_image_embeds in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                # Precomputed embeds store the (negative, positive) pair stacked on dim 0.
                single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                negative_image_embeds.append(single_negative_image_embeds)
            image_embeds.append(single_image_embeds)

    ip_adapter_image_embeds = []
    for i, single_image_embeds in enumerate(image_embeds):
        # Duplicate for `num_images_per_prompt`, then prepend negatives for CFG.
        single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

        single_image_embeds = single_image_embeds.to(device=device)
        ip_adapter_image_embeds.append(single_image_embeds)

    return ip_adapter_image_embeds
610
+
611
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
612
def run_safety_checker(self, image, device, dtype):
    """Screen generated images with the optional NSFW safety checker.

    Returns `(image, has_nsfw_concept)`; `has_nsfw_concept` is `None` when no
    safety checker is configured, otherwise whatever the checker reports (the
    checker may also replace flagged images).
    """
    if self.safety_checker is None:
        return image, None

    # The checker's feature extractor needs PIL images, so convert from
    # tensor (via the image processor) or from numpy as appropriate.
    if torch.is_tensor(image):
        checker_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        checker_images = self.image_processor.numpy_to_pil(image)
    safety_checker_input = self.feature_extractor(checker_images, return_tensors="pt").to(device)
    image, has_nsfw_concept = self.safety_checker(
        images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
    )
    return image, has_nsfw_concept
625
+
626
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
627
def decode_latents(self, latents):
    """Deprecated helper: decode VAE latents into a float32 NHWC numpy batch in [0, 1].

    Use `VaeImageProcessor.postprocess(...)` instead.
    """
    deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
    deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

    # Undo the latent scaling before decoding.
    latents = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(latents, return_dict=False)[0]
    # Map from [-1, 1] into [0, 1].
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
    return decoded.cpu().permute(0, 2, 3, 1).float().numpy()
637
+
638
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
639
def prepare_extra_step_kwargs(self, generator, eta):
    """Assemble the optional kwargs accepted by `self.scheduler.step`.

    eta (η) corresponds to η in the DDIM paper (https://huggingface.co/papers/2010.02502),
    should lie in [0, 1], and is only meaningful for DDIM-style schedulers; `generator`
    is likewise forwarded only when the scheduler's `step` signature accepts it.
    """
    step_params = set(inspect.signature(self.scheduler.step).parameters.keys())

    extra_step_kwargs = {}
    if "eta" in step_params:
        extra_step_kwargs["eta"] = eta
    if "generator" in step_params:
        extra_step_kwargs["generator"] = generator
    return extra_step_kwargs
655
+
656
def check_inputs(
    self,
    prompt,
    strength,
    callback_steps,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    callback_on_step_end_tensor_inputs=None,
):
    """Validate `__call__` arguments, raising `ValueError` on any inconsistent combination.

    Checks: `strength` range, `callback_steps` positivity, callback tensor-input names,
    mutually exclusive prompt/embedding arguments, matching embedding shapes, and
    mutually exclusive IP-Adapter inputs.
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None:
        # Every requested tensor must be one the denoising loop actually exposes.
        unknown = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
        if unknown:
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {unknown}"
            )

    # Exactly one of `prompt` / `prompt_embeds` must be provided.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
        raise ValueError(
            "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
        )

    if ip_adapter_image_embeds is not None:
        if not isinstance(ip_adapter_image_embeds, list):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
            )
        if ip_adapter_image_embeds[0].ndim not in (3, 4):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
            )
723
+
724
def get_timesteps(self, num_inference_steps, strength, device):
    """Trim the scheduler's timetable for img2img: keep only the final `strength` fraction.

    Returns the trimmed timestep sequence and the number of denoising steps that will
    actually run. Also informs schedulers that track a begin index where the trimmed
    schedule starts.
    """
    # Steps skipped at the front of the schedule (0 when strength == 1).
    skipped = max(num_inference_steps - min(int(num_inference_steps * strength), num_inference_steps), 0)
    begin_index = skipped * self.scheduler.order

    timesteps = self.scheduler.timesteps[begin_index:]
    if hasattr(self.scheduler, "set_begin_index"):
        self.scheduler.set_begin_index(begin_index)

    return timesteps, num_inference_steps - skipped
734
+
735
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
    """Encode the init `image` into VAE latents and forward-diffuse them to `timestep`.

    If `image` already has 4 channels it is treated as latents and VAE encoding is
    skipped. The latent batch is duplicated (with a deprecation warning) when fewer
    images than prompts were passed.
    """
    if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
        raise ValueError(
            f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
        )

    # NOTE(review): `.to()` requires a tensor here, so the caller is expected to have
    # preprocessed PIL/list inputs into a tensor already — confirm against `__call__`.
    image = image.to(device=device, dtype=dtype)

    # Effective batch size after per-prompt duplication.
    batch_size = batch_size * num_images_per_prompt

    if image.shape[1] == 4:
        # 4 channels: treat the input as already-encoded latents.
        init_latents = image

    else:
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        elif isinstance(generator, list):
            # Per-sample generators: duplicate the image batch if needed, then encode
            # each sample with its own generator for reproducibility.
            if image.shape[0] < batch_size and batch_size % image.shape[0] == 0:
                image = torch.cat([image] * (batch_size // image.shape[0]), dim=0)
            elif image.shape[0] < batch_size and batch_size % image.shape[0] != 0:
                raise ValueError(
                    f"Cannot duplicate `image` of batch size {image.shape[0]} to effective batch_size {batch_size} "
                )

            init_latents = [
                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(batch_size)
            ]
            init_latents = torch.cat(init_latents, dim=0)
        else:
            init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

        # Map the VAE output into the scaled latent space the UNet operates in.
        init_latents = self.vae.config.scaling_factor * init_latents

    if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
        # expand init_latents for batch_size
        deprecation_message = (
            f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
            " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
            " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
            " your script to pass as many initial images as text prompts to suppress this warning."
        )
        deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
        additional_image_per_prompt = batch_size // init_latents.shape[0]
        init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
    elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
        raise ValueError(
            f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
        )
    else:
        init_latents = torch.cat([init_latents], dim=0)

    shape = init_latents.shape
    noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

    # get latents: forward-diffuse the clean latents to the img2img starting timestep
    init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
    latents = init_latents

    return latents
799
+
800
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
801
def get_guidance_scale_embedding(
    self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    """
    Build sinusoidal embeddings of the guidance scale(s), used to condition the UNet on `w`.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`torch.Tensor`):
            Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
        embedding_dim (`int`, *optional*, defaults to 512):
            Dimension of the embeddings to generate.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            Data type of the generated embeddings.

    Returns:
        `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
    """
    assert len(w.shape) == 1
    scaled_w = w * 1000.0

    # Standard transformer-style sinusoidal frequencies over half the width.
    half_dim = embedding_dim // 2
    log_scale = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    freqs = torch.exp(torch.arange(half_dim, dtype=dtype) * -log_scale)
    args = scaled_w.to(dtype)[:, None] * freqs[None, :]
    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=1)
    if embedding_dim % 2 == 1:
        # Odd target width: zero-pad one column on the right.
        embedding = torch.nn.functional.pad(embedding, (0, 1))
    assert embedding.shape == (w.shape[0], embedding_dim)
    return embedding
830
+
831
@property
def guidance_scale(self):
    # Guidance weight `w` stored per-call (set before the denoising loop).
    return self._guidance_scale

@property
def clip_skip(self):
    # Number of CLIP layers skipped when encoding prompts (see `encode_prompt`).
    return self._clip_skip

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    # CFG is disabled when the UNet has a time-conditioning projection
    # (distilled models embed the guidance scale instead).
    return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None

@property
def cross_attention_kwargs(self):
    # Extra kwargs forwarded to the attention processors.
    return self._cross_attention_kwargs

@property
def num_timesteps(self):
    # Length of the timestep schedule used in the last/ongoing call.
    return self._num_timesteps

@property
def interrupt(self):
    # Cooperative cancellation flag checked inside the denoising loop.
    return self._interrupt
857
+
858
+ @torch.no_grad()
859
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
860
+ def __call__(
861
+ self,
862
+ prompt: Union[str, List[str]] = None,
863
+ image: PipelineImageInput = None,
864
+ strength: float = 0.8,
865
+ num_inference_steps: Optional[int] = 50,
866
+ timesteps: List[int] = None,
867
+ sigmas: List[float] = None,
868
+ guidance_scale: Optional[float] = 7.5,
869
+ negative_prompt: Optional[Union[str, List[str]]] = None,
870
+ num_images_per_prompt: Optional[int] = 1,
871
+ eta: Optional[float] = 0.0,
872
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
873
+ prompt_embeds: Optional[torch.Tensor] = None,
874
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
875
+ ip_adapter_image: Optional[PipelineImageInput] = None,
876
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
877
+ output_type: Optional[str] = "pil",
878
+ return_dict: bool = True,
879
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
880
+ clip_skip: int = None,
881
+ callback_on_step_end: Optional[
882
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
883
+ ] = None,
884
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
885
+ **kwargs,
886
+ ):
887
+ r"""
888
+ The call function to the pipeline for generation.
889
+
890
+ Args:
891
+ prompt (`str` or `List[str]`, *optional*):
892
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
893
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
894
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
895
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
896
+ or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
897
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
898
+ latents as `image`, but if passing latents directly it is not encoded again.
899
+ strength (`float`, *optional*, defaults to 0.8):
900
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
901
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
902
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
903
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
904
+ essentially ignores `image`.
905
+ num_inference_steps (`int`, *optional*, defaults to 50):
906
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
907
+ expense of slower inference. This parameter is modulated by `strength`.
908
+ timesteps (`List[int]`, *optional*):
909
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
910
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
911
+ passed will be used. Must be in descending order.
912
+ sigmas (`List[float]`, *optional*):
913
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
914
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
915
+ will be used.
916
+ guidance_scale (`float`, *optional*, defaults to 7.5):
917
+ A higher guidance scale value encourages the model to generate images closely linked to the text
918
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
919
+ negative_prompt (`str` or `List[str]`, *optional*):
920
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
921
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
922
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
923
+ The number of images to generate per prompt.
924
+ eta (`float`, *optional*, defaults to 0.0):
925
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
926
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
927
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
928
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
929
+ generation deterministic.
930
+ prompt_embeds (`torch.Tensor`, *optional*):
931
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
932
+ provided, text embeddings are generated from the `prompt` input argument.
933
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
934
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
935
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
936
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
937
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
938
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
939
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
940
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
941
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
942
+ output_type (`str`, *optional*, defaults to `"pil"`):
943
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
944
+ return_dict (`bool`, *optional*, defaults to `True`):
945
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
946
+ plain tuple.
947
+ cross_attention_kwargs (`dict`, *optional*):
948
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
949
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
950
+ clip_skip (`int`, *optional*):
951
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
952
+ the output of the pre-final layer will be used for computing the prompt embeddings.
953
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
954
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
955
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
956
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
957
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
958
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
959
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
960
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
961
+ `._callback_tensor_inputs` attribute of your pipeline class.
962
+ Examples:
963
+
964
+ Returns:
965
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
966
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
967
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
968
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
969
+ "not-safe-for-work" (nsfw) content.
970
+ """
971
+
972
+ callback = kwargs.pop("callback", None)
973
+ callback_steps = kwargs.pop("callback_steps", None)
974
+
975
+ if callback is not None:
976
+ deprecate(
977
+ "callback",
978
+ "1.0.0",
979
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
980
+ )
981
+ if callback_steps is not None:
982
+ deprecate(
983
+ "callback_steps",
984
+ "1.0.0",
985
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
986
+ )
987
+
988
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
989
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
990
+
991
+ # 1. Check inputs. Raise error if not correct
992
+ self.check_inputs(
993
+ prompt,
994
+ strength,
995
+ callback_steps,
996
+ negative_prompt,
997
+ prompt_embeds,
998
+ negative_prompt_embeds,
999
+ ip_adapter_image,
1000
+ ip_adapter_image_embeds,
1001
+ callback_on_step_end_tensor_inputs,
1002
+ )
1003
+
1004
+ self._guidance_scale = guidance_scale
1005
+ self._clip_skip = clip_skip
1006
+ self._cross_attention_kwargs = cross_attention_kwargs
1007
+ self._interrupt = False
1008
+
1009
+ # 2. Define call parameters
1010
+ if prompt is not None and isinstance(prompt, str):
1011
+ batch_size = 1
1012
+ elif prompt is not None and isinstance(prompt, list):
1013
+ batch_size = len(prompt)
1014
+ else:
1015
+ batch_size = prompt_embeds.shape[0]
1016
+
1017
+ device = self._execution_device
1018
+
1019
+ # 3. Encode input prompt
1020
+ text_encoder_lora_scale = (
1021
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
1022
+ )
1023
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
1024
+ prompt,
1025
+ device,
1026
+ num_images_per_prompt,
1027
+ self.do_classifier_free_guidance,
1028
+ negative_prompt,
1029
+ prompt_embeds=prompt_embeds,
1030
+ negative_prompt_embeds=negative_prompt_embeds,
1031
+ lora_scale=text_encoder_lora_scale,
1032
+ clip_skip=self.clip_skip,
1033
+ )
1034
+ # For classifier free guidance, we need to do two forward passes.
1035
+ # Here we concatenate the unconditional and text embeddings into a single batch
1036
+ # to avoid doing two forward passes
1037
+ if self.do_classifier_free_guidance:
1038
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1039
+
1040
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1041
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1042
+ ip_adapter_image,
1043
+ ip_adapter_image_embeds,
1044
+ device,
1045
+ batch_size * num_images_per_prompt,
1046
+ self.do_classifier_free_guidance,
1047
+ )
1048
+
1049
+ # 4. Preprocess image
1050
+ image = self.image_processor.preprocess(image)
1051
+
1052
+ # 5. set timesteps
1053
+ timesteps, num_inference_steps = retrieve_timesteps(
1054
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
1055
+ )
1056
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
1057
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
1058
+
1059
+ # 6. Prepare latent variables
1060
+ latents = self.prepare_latents(
1061
+ image,
1062
+ latent_timestep,
1063
+ batch_size,
1064
+ num_images_per_prompt,
1065
+ prompt_embeds.dtype,
1066
+ device,
1067
+ generator,
1068
+ )
1069
+
1070
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1071
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1072
+
1073
+ # 7.1 Add image embeds for IP-Adapter
1074
+ added_cond_kwargs = (
1075
+ {"image_embeds": image_embeds}
1076
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None
1077
+ else None
1078
+ )
1079
+
1080
+ # 7.2 Optionally get Guidance Scale Embedding
1081
+ timestep_cond = None
1082
+ if self.unet.config.time_cond_proj_dim is not None:
1083
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1084
+ timestep_cond = self.get_guidance_scale_embedding(
1085
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1086
+ ).to(device=device, dtype=latents.dtype)
1087
+
1088
+ # 8. Denoising loop
1089
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1090
+ self._num_timesteps = len(timesteps)
1091
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1092
+ for i, t in enumerate(timesteps):
1093
+ if self.interrupt:
1094
+ continue
1095
+
1096
+ # expand the latents if we are doing classifier free guidance
1097
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1098
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1099
+
1100
+ # predict the noise residual
1101
+ noise_pred = self.unet(
1102
+ latent_model_input,
1103
+ t,
1104
+ encoder_hidden_states=prompt_embeds,
1105
+ timestep_cond=timestep_cond,
1106
+ cross_attention_kwargs=self.cross_attention_kwargs,
1107
+ added_cond_kwargs=added_cond_kwargs,
1108
+ return_dict=False,
1109
+ )[0]
1110
+
1111
+ # perform guidance
1112
+ if self.do_classifier_free_guidance:
1113
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1114
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1115
+
1116
+ # compute the previous noisy sample x_t -> x_t-1
1117
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1118
+
1119
+ if callback_on_step_end is not None:
1120
+ callback_kwargs = {}
1121
+ for k in callback_on_step_end_tensor_inputs:
1122
+ callback_kwargs[k] = locals()[k]
1123
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1124
+
1125
+ latents = callback_outputs.pop("latents", latents)
1126
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1127
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1128
+
1129
+ # call the callback, if provided
1130
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1131
+ progress_bar.update()
1132
+ if callback is not None and i % callback_steps == 0:
1133
+ step_idx = i // getattr(self.scheduler, "order", 1)
1134
+ callback(step_idx, t, latents)
1135
+
1136
+ if XLA_AVAILABLE:
1137
+ xm.mark_step()
1138
+
1139
+ if not output_type == "latent":
1140
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
1141
+ 0
1142
+ ]
1143
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
1144
+ else:
1145
+ image = latents
1146
+ has_nsfw_concept = None
1147
+
1148
+ if has_nsfw_concept is None:
1149
+ do_denormalize = [True] * image.shape[0]
1150
+ else:
1151
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1152
+
1153
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1154
+
1155
+ # Offload all models
1156
+ self.maybe_free_model_hooks()
1157
+
1158
+ if not return_dict:
1159
+ return (image, has_nsfw_concept)
1160
+
1161
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py ADDED
@@ -0,0 +1,1359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import PIL.Image
19
+ import torch
20
+ from packaging import version
21
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
22
+
23
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
24
+ from ...configuration_utils import FrozenDict
25
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
26
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
27
+ from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel
28
+ from ...models.lora import adjust_lora_scale_text_encoder
29
+ from ...schedulers import KarrasDiffusionSchedulers
30
+ from ...utils import (
31
+ USE_PEFT_BACKEND,
32
+ deprecate,
33
+ is_torch_xla_available,
34
+ logging,
35
+ scale_lora_layers,
36
+ unscale_lora_layers,
37
+ )
38
+ from ...utils.torch_utils import randn_tensor
39
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
40
+ from . import StableDiffusionPipelineOutput
41
+ from .safety_checker import StableDiffusionSafetyChecker
42
+
43
+
44
# Import the XLA runtime when available so the denoising loop can insert
# `xm.mark_step()` barriers on TPU; CUDA/CPU runs skip this entirely.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

# Module-level logger following the diffusers convention.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
52
+
53
+
54
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
55
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Pull a latent tensor out of a VAE encoder output.

    Supports outputs exposing a `latent_dist` (sampled stochastically or via its
    mode, depending on `sample_mode`) as well as outputs that carry pre-computed
    `latents` directly.

    Raises:
        AttributeError: If neither `latent_dist` nor `latents` is present.
    """
    if hasattr(encoder_output, "latent_dist"):
        dist = encoder_output.latent_dist
        if sample_mode == "sample":
            return dist.sample(generator)
        if sample_mode == "argmax":
            return dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
66
+
67
+
68
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
69
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure the scheduler's timestep schedule and return it.

    Calls `scheduler.set_timesteps` — with custom `timesteps`, custom `sigmas`, or a plain
    `num_inference_steps` — and reads back `scheduler.timesteps`. Extra kwargs are forwarded
    to `set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    def _scheduler_accepts(arg_name: str) -> bool:
        # Inspect the scheduler's signature lazily, only when a custom schedule is requested.
        return arg_name in set(inspect.signature(scheduler.set_timesteps).parameters.keys())

    if timesteps is not None:
        if not _scheduler_accepts("timesteps"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    elif sigmas is not None:
        if not _scheduler_accepts("sigmas"):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)

    schedule = scheduler.timesteps
    if timesteps is not None or sigmas is not None:
        # With a custom schedule, the effective step count is whatever the scheduler produced.
        num_inference_steps = len(schedule)
    return schedule, num_inference_steps
126
+
127
+
128
class StableDiffusionInpaintPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FromSingleFileMixin,
):
    r"""
    Pipeline for text-guided image inpainting using Stable Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files

    Args:
        vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
            more details about a model's potential harms.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`], *optional*):
            CLIP vision encoder used to compute image embeddings for IP-Adapter inputs; only needed when
            IP-Adapter conditioning is used.
    """

    # Order in which sub-models are moved between CPU and accelerator by model offloading.
    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
    # Components the pipeline can be constructed without (may be None).
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    # Components kept off the offload sequence even when CPU offloading is enabled.
    _exclude_from_cpu_offload = ["safety_checker"]
    # Tensor names that `callback_on_step_end` callbacks are allowed to request/override.
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "mask", "masked_image_latents"]
173
+
174
+ def __init__(
175
+ self,
176
+ vae: Union[AutoencoderKL, AsymmetricAutoencoderKL],
177
+ text_encoder: CLIPTextModel,
178
+ tokenizer: CLIPTokenizer,
179
+ unet: UNet2DConditionModel,
180
+ scheduler: KarrasDiffusionSchedulers,
181
+ safety_checker: StableDiffusionSafetyChecker,
182
+ feature_extractor: CLIPImageProcessor,
183
+ image_encoder: CLIPVisionModelWithProjection = None,
184
+ requires_safety_checker: bool = True,
185
+ ):
186
+ super().__init__()
187
+
188
+ if scheduler is not None and getattr(scheduler.config, "steps_offset", 1) != 1:
189
+ deprecation_message = (
190
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
191
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
192
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
193
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
194
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
195
+ " file"
196
+ )
197
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
198
+ new_config = dict(scheduler.config)
199
+ new_config["steps_offset"] = 1
200
+ scheduler._internal_dict = FrozenDict(new_config)
201
+
202
+ if scheduler is not None and getattr(scheduler.config, "skip_prk_steps", True) is False:
203
+ deprecation_message = (
204
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration"
205
+ " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
206
+ " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
207
+ " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
208
+ " Hub, it would be very nice if you could open a Pull request for the"
209
+ " `scheduler/scheduler_config.json` file"
210
+ )
211
+ deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
212
+ new_config = dict(scheduler.config)
213
+ new_config["skip_prk_steps"] = True
214
+ scheduler._internal_dict = FrozenDict(new_config)
215
+
216
+ if safety_checker is None and requires_safety_checker:
217
+ logger.warning(
218
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
219
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
220
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
221
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
222
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
223
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
224
+ )
225
+
226
+ if safety_checker is not None and feature_extractor is None:
227
+ raise ValueError(
228
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
229
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
230
+ )
231
+
232
+ is_unet_version_less_0_9_0 = (
233
+ unet is not None
234
+ and hasattr(unet.config, "_diffusers_version")
235
+ and version.parse(version.parse(unet.config._diffusers_version).base_version) < version.parse("0.9.0.dev0")
236
+ )
237
+ is_unet_sample_size_less_64 = (
238
+ unet is not None and hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
239
+ )
240
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
241
+ deprecation_message = (
242
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
243
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
244
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
245
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- stable-diffusion-v1-5/stable-diffusion-v1-5"
246
+ " \n- stable-diffusion-v1-5/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
247
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
248
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
249
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
250
+ " the `unet/config.json` file"
251
+ )
252
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
253
+ new_config = dict(unet.config)
254
+ new_config["sample_size"] = 64
255
+ unet._internal_dict = FrozenDict(new_config)
256
+
257
+ # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
258
+ if unet is not None and unet.config.in_channels != 9:
259
+ logger.info(f"You have loaded a UNet with {unet.config.in_channels} input channels which.")
260
+
261
+ self.register_modules(
262
+ vae=vae,
263
+ text_encoder=text_encoder,
264
+ tokenizer=tokenizer,
265
+ unet=unet,
266
+ scheduler=scheduler,
267
+ safety_checker=safety_checker,
268
+ feature_extractor=feature_extractor,
269
+ image_encoder=image_encoder,
270
+ )
271
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
272
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
273
+ self.mask_processor = VaeImageProcessor(
274
+ vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True
275
+ )
276
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
277
+
278
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
279
+ def _encode_prompt(
280
+ self,
281
+ prompt,
282
+ device,
283
+ num_images_per_prompt,
284
+ do_classifier_free_guidance,
285
+ negative_prompt=None,
286
+ prompt_embeds: Optional[torch.Tensor] = None,
287
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
288
+ lora_scale: Optional[float] = None,
289
+ **kwargs,
290
+ ):
291
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
292
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
293
+
294
+ prompt_embeds_tuple = self.encode_prompt(
295
+ prompt=prompt,
296
+ device=device,
297
+ num_images_per_prompt=num_images_per_prompt,
298
+ do_classifier_free_guidance=do_classifier_free_guidance,
299
+ negative_prompt=negative_prompt,
300
+ prompt_embeds=prompt_embeds,
301
+ negative_prompt_embeds=negative_prompt_embeds,
302
+ lora_scale=lora_scale,
303
+ **kwargs,
304
+ )
305
+
306
+ # concatenate for backwards comp
307
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
308
+
309
+ return prompt_embeds
310
+
311
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
def encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    do_classifier_free_guidance,
    negative_prompt=None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    lora_scale: Optional[float] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        lora_scale (`float`, *optional*):
            A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Returns:
        A `(prompt_embeds, negative_prompt_embeds)` tuple; `negative_prompt_embeds` is `None` when classifier-free
        guidance is disabled and no negative embeddings were passed in.
    """
    # set lora scale so that monkey patched LoRA
    # function of text encoder can correctly access it
    if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
        self._lora_scale = lora_scale

        # dynamically adjust the LoRA scale
        if not USE_PEFT_BACKEND:
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
        else:
            scale_lora_layers(self.text_encoder, lora_scale)

    # Infer the batch size from whichever prompt representation was supplied.
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        # Warn (don't fail) when the prompt was silently truncated to the tokenizer limit.
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
            text_input_ids, untruncated_ids
        ):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = text_inputs.attention_mask.to(device)
        else:
            attention_mask = None

        if clip_skip is None:
            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
            prompt_embeds = prompt_embeds[0]
        else:
            prompt_embeds = self.text_encoder(
                text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
            )
            # Access the `hidden_states` first, that contains a tuple of
            # all the hidden states from the encoder layers. Then index into
            # the tuple to access the hidden states from the desired layer.
            prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
            # We also need to apply the final LayerNorm here to not mess with the
            # representations. The `last_hidden_states` that we typically use for
            # obtaining the final prompt representations passes through the LayerNorm
            # layer.
            prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

    # Resolve the target dtype; the text encoder may be absent (e.g. offloaded/pruned setups),
    # in which case fall back to the UNet's dtype, then to the embeddings' own dtype.
    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif prompt is not None and type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        # textual inversion: process multi-vector tokens if necessary
        if isinstance(self, TextualInversionLoaderMixin):
            uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

        # Pad the unconditional branch to the same sequence length as the conditional one.
        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask.to(device)
        else:
            attention_mask = None

        negative_prompt_embeds = self.text_encoder(
            uncond_input.input_ids.to(device),
            attention_mask=attention_mask,
        )
        negative_prompt_embeds = negative_prompt_embeds[0]

    if do_classifier_free_guidance:
        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = negative_prompt_embeds.shape[1]

        negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    if self.text_encoder is not None:
        if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
            # Retrieve the original scale by scaling back the LoRA layers
            unscale_lora_layers(self.text_encoder, lora_scale)

    return prompt_embeds, negative_prompt_embeds
494
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """Encode an IP-Adapter image into conditional and unconditional embeddings.

    When `output_hidden_states` is truthy, the penultimate hidden state of the image
    encoder is returned (for both the image and an all-zero image); otherwise the pooled
    `image_embeds` are returned with an all-zero tensor as the unconditional branch.
    Both outputs are repeated `num_images_per_prompt` times along the batch dimension.
    """
    encoder_dtype = next(self.image_encoder.parameters()).dtype

    # PIL/ndarray inputs are first turned into pixel-value tensors.
    if not isinstance(image, torch.Tensor):
        image = self.feature_extractor(image, return_tensors="pt").pixel_values
    image = image.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        cond_embeds = self.image_encoder(image).image_embeds
        cond_embeds = cond_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        # Unconditional branch: zeros of the same shape as the pooled embeddings.
        return cond_embeds, torch.zeros_like(cond_embeds)

    # Hidden-state variant: use the second-to-last encoder layer for both branches;
    # the unconditional branch encodes an all-zero image.
    cond_hidden = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
    cond_hidden = cond_hidden.repeat_interleave(num_images_per_prompt, dim=0)
    uncond_hidden = self.image_encoder(
        torch.zeros_like(image), output_hidden_states=True
    ).hidden_states[-2]
    uncond_hidden = uncond_hidden.repeat_interleave(num_images_per_prompt, dim=0)
    return cond_hidden, uncond_hidden
519
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """Build the list of per-adapter IP-Adapter image embeddings.

    Embeddings are either computed from `ip_adapter_image` (one entry per projection
    layer in the UNet) or taken from precomputed `ip_adapter_image_embeds` (where, under
    classifier-free guidance, each tensor holds the negative half stacked before the
    positive half). Each returned tensor is tiled `num_images_per_prompt` times and,
    under CFG, has the negative embeddings concatenated in front.
    """
    cond_embeds = []
    if do_classifier_free_guidance:
        uncond_embeds = []

    if ip_adapter_image_embeds is None:
        if not isinstance(ip_adapter_image, list):
            ip_adapter_image = [ip_adapter_image]

        if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
            )

        for adapter_image, projection_layer in zip(
            ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
        ):
            # `ImageProjection` layers consume pooled embeds; every other layer type
            # consumes hidden states.
            wants_hidden_state = not isinstance(projection_layer, ImageProjection)
            embeds, neg_embeds = self.encode_image(adapter_image, device, 1, wants_hidden_state)

            cond_embeds.append(embeds[None, :])
            if do_classifier_free_guidance:
                uncond_embeds.append(neg_embeds[None, :])
    else:
        for embeds in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                neg_embeds, embeds = embeds.chunk(2)
                uncond_embeds.append(neg_embeds)
            cond_embeds.append(embeds)

    result = []
    for idx, embeds in enumerate(cond_embeds):
        tiled = torch.cat([embeds] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            neg_tiled = torch.cat([uncond_embeds[idx]] * num_images_per_prompt, dim=0)
            tiled = torch.cat([neg_tiled, tiled], dim=0)
        result.append(tiled.to(device=device))

    return result
565
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, device, dtype):
    """Run the configured safety checker over `image`.

    Returns `(image, has_nsfw_concept)`; `has_nsfw_concept` is `None` when no
    safety checker is configured (the image passes through unchanged).
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor expects PIL images, so convert tensors / numpy arrays first.
    if torch.is_tensor(image):
        checker_pil_input = self.image_processor.postprocess(image, output_type="pil")
    else:
        checker_pil_input = self.image_processor.numpy_to_pil(image)
    checker_inputs = self.feature_extractor(checker_pil_input, return_tensors="pt").to(device)
    image, has_nsfw_concept = self.safety_checker(
        images=image, clip_input=checker_inputs.pixel_values.to(dtype)
    )
    return image, has_nsfw_concept
580
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the kwargs dict for `scheduler.step`, keeping only supported arguments.

    Not all schedulers share the same `step` signature: eta (η) is only used with the
    DDIMScheduler (η in DDIM, https://huggingface.co/papers/2010.02502, expected in
    [0, 1]) and `generator` is likewise optional — both are included only if the
    scheduler's `step` accepts them.
    """
    step_params = set(inspect.signature(self.scheduler.step).parameters.keys())

    extra_step_kwargs = {}
    if "eta" in step_params:
        extra_step_kwargs["eta"] = eta
    if "generator" in step_params:
        extra_step_kwargs["generator"] = generator
    return extra_step_kwargs
598
def check_inputs(
    self,
    prompt,
    image,
    mask_image,
    height,
    width,
    strength,
    callback_steps,
    output_type,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    padding_mask_crop=None,
):
    """Validate the combination of arguments passed to `__call__`.

    Raises:
        ValueError: if `strength` is outside `[0, 1]`; `height`/`width` are not divisible
            by the VAE scale factor; `callback_steps` is not a positive int;
            `callback_on_step_end_tensor_inputs` names unknown tensors; prompt vs.
            prompt-embedding arguments are inconsistent; `padding_mask_crop` is used with
            non-PIL image/mask inputs or a non-`"pil"` output type; or IP-Adapter inputs
            are malformed.
        TypeError: never raised here directly (prompt type mismatches raise ValueError).
    """
    if strength < 0 or strength > 1:
        raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

    if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
        # Bug fix: the message previously hard-coded "divisible by 8" while the check
        # uses `self.vae_scale_factor`; report the actual divisor instead.
        raise ValueError(
            f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
        )

    if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if callback_on_step_end_tensor_inputs is not None and not all(
        k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
    ):
        raise ValueError(
            f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
        )

    # Exactly one of `prompt` / `prompt_embeds` must be provided.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    elif prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )

    # Mask cropping only works on PIL inputs and PIL output.
    if padding_mask_crop is not None:
        if not isinstance(image, PIL.Image.Image):
            raise ValueError(
                f"The image should be a PIL image when inpainting mask crop, but is of type {type(image)}."
            )
        if not isinstance(mask_image, PIL.Image.Image):
            raise ValueError(
                f"The mask image should be a PIL image when inpainting mask crop, but is of type"
                f" {type(mask_image)}."
            )
        if output_type != "pil":
            raise ValueError(f"The output type should be PIL when inpainting mask crop, but is {output_type}.")

    if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
        raise ValueError(
            "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
        )

    if ip_adapter_image_embeds is not None:
        if not isinstance(ip_adapter_image_embeds, list):
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
            )
        elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
            raise ValueError(
                f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
            )
688
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
    image=None,
    timestep=None,
    is_strength_max=True,
    return_noise=False,
    return_image_latents=False,
):
    """Create the initial latents for inpainting.

    With `is_strength_max=True` the latents are pure scaled noise; otherwise they are
    the VAE-encoded `image` noised to `timestep`. Caller-supplied `latents` are moved to
    `device` and scaled by the scheduler's `init_noise_sigma`. Returns a tuple of
    `(latents[, noise][, image_latents])` depending on the `return_*` flags.
    """
    latent_shape = (
        batch_size,
        num_channels_latents,
        int(height) // self.vae_scale_factor,
        int(width) // self.vae_scale_factor,
    )
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if (image is None or timestep is None) and not is_strength_max:
        raise ValueError(
            "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
            "However, either the image or the noise timestep has not been provided."
        )

    # Encode the image only when it is actually needed (as a mixing source or a return value).
    if return_image_latents or (latents is None and not is_strength_max):
        image = image.to(device=device, dtype=dtype)
        if image.shape[1] == 4:
            # Already latents (4 channels): skip VAE encoding.
            image_latents = image
        else:
            image_latents = self._encode_vae_image(image=image, generator=generator)
        image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)

    if latents is None:
        noise = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)
        # strength == 1: start from pure noise; otherwise mix image latents with noise at `timestep`.
        latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
        # Pure noise must be scaled by the scheduler's initial sigma.
        latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
    else:
        noise = latents.to(device)
        latents = noise * self.scheduler.init_noise_sigma

    outputs = (latents,)
    if return_noise:
        outputs += (noise,)
    if return_image_latents:
        outputs += (image_latents,)
    return outputs
751
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode `image` into VAE latents scaled by the VAE's `scaling_factor`.

    When `generator` is a list, each batch element is encoded with its own generator
    so per-sample determinism is preserved.
    """
    if isinstance(generator, list):
        per_sample = [
            retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
            for i in range(image.shape[0])
        ]
        latents = torch.cat(per_sample, dim=0)
    else:
        latents = retrieve_latents(self.vae.encode(image), generator=generator)

    return self.vae.config.scaling_factor * latents
765
def prepare_mask_latents(
    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
):
    """Downsample the mask to latent resolution and encode the masked image.

    Both outputs are tiled up to `batch_size` (which must be a multiple of the number of
    masks/images passed) and, under classifier-free guidance, duplicated once more along
    the batch dimension. Returns `(mask, masked_image_latents)` on `device` with `dtype`.
    """
    latent_height = height // self.vae_scale_factor
    latent_width = width // self.vae_scale_factor
    # Resize first, then cast: avoids breakage with cpu_offload / half precision.
    mask = torch.nn.functional.interpolate(mask, size=(latent_height, latent_width))
    mask = mask.to(device=device, dtype=dtype)

    masked_image = masked_image.to(device=device, dtype=dtype)

    # A 4-channel input is already in latent space; otherwise run it through the VAE.
    if masked_image.shape[1] == 4:
        masked_image_latents = masked_image
    else:
        masked_image_latents = self._encode_vae_image(masked_image, generator=generator)

    # Tile mask and latents up to the requested batch size (mps-friendly repeat).
    if mask.shape[0] < batch_size:
        if batch_size % mask.shape[0] != 0:
            raise ValueError(
                "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                " of masks that you pass is divisible by the total requested batch size."
            )
        mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
    if masked_image_latents.shape[0] < batch_size:
        if batch_size % masked_image_latents.shape[0] != 0:
            raise ValueError(
                "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                " Make sure the number of images that you pass is divisible by the total requested batch size."
            )
        masked_image_latents = masked_image_latents.repeat(
            batch_size // masked_image_latents.shape[0], 1, 1, 1
        )

    if do_classifier_free_guidance:
        mask = torch.cat([mask] * 2)
        masked_image_latents = torch.cat([masked_image_latents] * 2)

    # Align device again to avoid mismatches when concatenating with the latent model input.
    masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
    return mask, masked_image_latents
810
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
def get_timesteps(self, num_inference_steps, strength, device):
    """Trim the scheduler's timestep table according to `strength`.

    `strength` controls how many of the final denoising steps are actually run;
    `strength == 1` keeps the full schedule. Returns the trimmed timesteps and the
    effective number of inference steps.
    """
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)

    start_index = t_start * self.scheduler.order
    trimmed_timesteps = self.scheduler.timesteps[start_index:]
    # Schedulers that track a begin index need to be told where we are starting from.
    if hasattr(self.scheduler, "set_begin_index"):
        self.scheduler.set_begin_index(start_index)

    return trimmed_timesteps, num_inference_steps - t_start
822
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(
    self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    """
    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`torch.Tensor`):
            Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
        embedding_dim (`int`, *optional*, defaults to 512):
            Dimension of the embeddings to generate.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            Data type of the generated embeddings.

    Returns:
        `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
    """
    assert len(w.shape) == 1
    scaled_w = w * 1000.0

    # Standard sinusoidal embedding: geometric frequency ladder over half the dim,
    # then sin/cos halves concatenated.
    half_dim = embedding_dim // 2
    log_freq_step = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    frequencies = torch.exp(torch.arange(half_dim, dtype=dtype) * -log_freq_step)
    angles = scaled_w.to(dtype)[:, None] * frequencies[None, :]
    embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:
        # Odd target dim: zero-pad the final column.
        embedding = torch.nn.functional.pad(embedding, (0, 1))
    assert embedding.shape == (w.shape[0], embedding_dim)
    return embedding
853
@property
def guidance_scale(self):
    # Read-only view of the cached `_guidance_scale` value.
    return self._guidance_scale

@property
def clip_skip(self):
    # Read-only view of the cached `_clip_skip` value (CLIP layers to skip when encoding prompts).
    return self._clip_skip

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    # CFG is active only when guidance_scale > 1 AND the UNet has no timestep-conditioning
    # projection (`time_cond_proj_dim`) — presumably guidance-distilled models; confirm upstream.
    return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None

@property
def cross_attention_kwargs(self):
    # Read-only view of the cached `_cross_attention_kwargs` dict.
    return self._cross_attention_kwargs

@property
def num_timesteps(self):
    # Read-only view of the cached `_num_timesteps` value.
    return self._num_timesteps

@property
def interrupt(self):
    # Read-only view of the cached `_interrupt` flag.
    return self._interrupt
880
+ @torch.no_grad()
881
+ def __call__(
882
+ self,
883
+ prompt: Union[str, List[str]] = None,
884
+ image: PipelineImageInput = None,
885
+ mask_image: PipelineImageInput = None,
886
+ masked_image_latents: torch.Tensor = None,
887
+ height: Optional[int] = None,
888
+ width: Optional[int] = None,
889
+ padding_mask_crop: Optional[int] = None,
890
+ strength: float = 1.0,
891
+ num_inference_steps: int = 50,
892
+ timesteps: List[int] = None,
893
+ sigmas: List[float] = None,
894
+ guidance_scale: float = 7.5,
895
+ negative_prompt: Optional[Union[str, List[str]]] = None,
896
+ num_images_per_prompt: Optional[int] = 1,
897
+ eta: float = 0.0,
898
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
899
+ latents: Optional[torch.Tensor] = None,
900
+ prompt_embeds: Optional[torch.Tensor] = None,
901
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
902
+ ip_adapter_image: Optional[PipelineImageInput] = None,
903
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
904
+ output_type: Optional[str] = "pil",
905
+ return_dict: bool = True,
906
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
907
+ clip_skip: int = None,
908
+ callback_on_step_end: Optional[
909
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
910
+ ] = None,
911
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
912
+ **kwargs,
913
+ ):
914
+ r"""
915
+ The call function to the pipeline for generation.
916
+
917
+ Args:
918
+ prompt (`str` or `List[str]`, *optional*):
919
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
920
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
921
+ `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to
922
+ be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch
923
+ tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the
924
+ expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the
925
+ expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but
926
+ if passing latents directly it is not encoded again.
927
+ mask_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
928
+ `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask
929
+ are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
930
+ single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one
931
+ color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B,
932
+ H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W,
933
+ 1)`, or `(H, W)`.
934
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
935
+ The height in pixels of the generated image.
936
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
937
+ The width in pixels of the generated image.
938
+ padding_mask_crop (`int`, *optional*, defaults to `None`):
939
+ The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
940
+ image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
941
+ with the same aspect ration of the image and contains all masked area, and then expand that area based
942
+ on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
943
+ resizing to the original image size for inpainting. This is useful when the masked area is small while
944
+ the image is large and contain information irrelevant for inpainting, such as background.
945
+ strength (`float`, *optional*, defaults to 1.0):
946
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
947
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
948
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
949
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
950
+ essentially ignores `image`.
951
+ num_inference_steps (`int`, *optional*, defaults to 50):
952
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
953
+ expense of slower inference. This parameter is modulated by `strength`.
954
+ timesteps (`List[int]`, *optional*):
955
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
956
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
957
+ passed will be used. Must be in descending order.
958
+ sigmas (`List[float]`, *optional*):
959
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
960
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
961
+ will be used.
962
+ guidance_scale (`float`, *optional*, defaults to 7.5):
963
+ A higher guidance scale value encourages the model to generate images closely linked to the text
964
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
965
+ negative_prompt (`str` or `List[str]`, *optional*):
966
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
967
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
968
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
969
+ The number of images to generate per prompt.
970
+ eta (`float`, *optional*, defaults to 0.0):
971
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
972
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
973
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
974
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
975
+ generation deterministic.
976
+ latents (`torch.Tensor`, *optional*):
977
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
978
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
979
+ tensor is generated by sampling using the supplied random `generator`.
980
+ prompt_embeds (`torch.Tensor`, *optional*):
981
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
982
+ provided, text embeddings are generated from the `prompt` input argument.
983
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
984
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
985
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
986
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
987
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
988
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
989
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
990
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
991
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
992
+ output_type (`str`, *optional*, defaults to `"pil"`):
993
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
994
+ return_dict (`bool`, *optional*, defaults to `True`):
995
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
996
+ plain tuple.
997
+ cross_attention_kwargs (`dict`, *optional*):
998
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
999
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1000
+ clip_skip (`int`, *optional*):
1001
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1002
+ the output of the pre-final layer will be used for computing the prompt embeddings.
1003
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
1004
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
1005
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
1006
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
1007
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
1008
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1009
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1010
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1011
+ `._callback_tensor_inputs` attribute of your pipeline class.
1012
+ Examples:
1013
+
1014
+ ```py
1015
+ >>> import PIL
1016
+ >>> import requests
1017
+ >>> import torch
1018
+ >>> from io import BytesIO
1019
+
1020
+ >>> from diffusers import StableDiffusionInpaintPipeline
1021
+
1022
+
1023
+ >>> def download_image(url):
1024
+ ... response = requests.get(url)
1025
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
1026
+
1027
+
1028
+ >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
1029
+ >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
1030
+
1031
+ >>> init_image = download_image(img_url).resize((512, 512))
1032
+ >>> mask_image = download_image(mask_url).resize((512, 512))
1033
+
1034
+ >>> pipe = StableDiffusionInpaintPipeline.from_pretrained(
1035
+ ... "stable-diffusion-v1-5/stable-diffusion-inpainting", torch_dtype=torch.float16
1036
+ ... )
1037
+ >>> pipe = pipe.to("cuda")
1038
+
1039
+ >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
1040
+ >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
1041
+ ```
1042
+
1043
+ Returns:
1044
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1045
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
1046
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
1047
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
1048
+ "not-safe-for-work" (nsfw) content.
1049
+ """
1050
+
1051
+ callback = kwargs.pop("callback", None)
1052
+ callback_steps = kwargs.pop("callback_steps", None)
1053
+
1054
+ if callback is not None:
1055
+ deprecate(
1056
+ "callback",
1057
+ "1.0.0",
1058
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
1059
+ )
1060
+ if callback_steps is not None:
1061
+ deprecate(
1062
+ "callback_steps",
1063
+ "1.0.0",
1064
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
1065
+ )
1066
+
1067
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
1068
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
1069
+
1070
+ # 0. Default height and width to unet
1071
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
1072
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
1073
+
1074
+ # 1. Check inputs
1075
+ self.check_inputs(
1076
+ prompt,
1077
+ image,
1078
+ mask_image,
1079
+ height,
1080
+ width,
1081
+ strength,
1082
+ callback_steps,
1083
+ output_type,
1084
+ negative_prompt,
1085
+ prompt_embeds,
1086
+ negative_prompt_embeds,
1087
+ ip_adapter_image,
1088
+ ip_adapter_image_embeds,
1089
+ callback_on_step_end_tensor_inputs,
1090
+ padding_mask_crop,
1091
+ )
1092
+
1093
+ self._guidance_scale = guidance_scale
1094
+ self._clip_skip = clip_skip
1095
+ self._cross_attention_kwargs = cross_attention_kwargs
1096
+ self._interrupt = False
1097
+
1098
+ # 2. Define call parameters
1099
+ if prompt is not None and isinstance(prompt, str):
1100
+ batch_size = 1
1101
+ elif prompt is not None and isinstance(prompt, list):
1102
+ batch_size = len(prompt)
1103
+ else:
1104
+ batch_size = prompt_embeds.shape[0]
1105
+
1106
+ device = self._execution_device
1107
+
1108
+ # 3. Encode input prompt
1109
+ text_encoder_lora_scale = (
1110
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
1111
+ )
1112
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
1113
+ prompt,
1114
+ device,
1115
+ num_images_per_prompt,
1116
+ self.do_classifier_free_guidance,
1117
+ negative_prompt,
1118
+ prompt_embeds=prompt_embeds,
1119
+ negative_prompt_embeds=negative_prompt_embeds,
1120
+ lora_scale=text_encoder_lora_scale,
1121
+ clip_skip=self.clip_skip,
1122
+ )
1123
+ # For classifier free guidance, we need to do two forward passes.
1124
+ # Here we concatenate the unconditional and text embeddings into a single batch
1125
+ # to avoid doing two forward passes
1126
+ if self.do_classifier_free_guidance:
1127
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1128
+
1129
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1130
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1131
+ ip_adapter_image,
1132
+ ip_adapter_image_embeds,
1133
+ device,
1134
+ batch_size * num_images_per_prompt,
1135
+ self.do_classifier_free_guidance,
1136
+ )
1137
+
1138
+ # 4. set timesteps
1139
+ timesteps, num_inference_steps = retrieve_timesteps(
1140
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
1141
+ )
1142
+ timesteps, num_inference_steps = self.get_timesteps(
1143
+ num_inference_steps=num_inference_steps, strength=strength, device=device
1144
+ )
1145
+ # check that number of inference steps is not < 1 - as this doesn't make sense
1146
+ if num_inference_steps < 1:
1147
+ raise ValueError(
1148
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
1149
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
1150
+ )
1151
+ # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
1152
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
1153
+ # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
1154
+ is_strength_max = strength == 1.0
1155
+
1156
+ # 5. Preprocess mask and image
1157
+
1158
+ if padding_mask_crop is not None:
1159
+ crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop)
1160
+ resize_mode = "fill"
1161
+ else:
1162
+ crops_coords = None
1163
+ resize_mode = "default"
1164
+
1165
+ original_image = image
1166
+ init_image = self.image_processor.preprocess(
1167
+ image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode
1168
+ )
1169
+ init_image = init_image.to(dtype=torch.float32)
1170
+
1171
+ # 6. Prepare latent variables
1172
+ num_channels_latents = self.vae.config.latent_channels
1173
+ num_channels_unet = self.unet.config.in_channels
1174
+ return_image_latents = num_channels_unet == 4
1175
+
1176
+ latents_outputs = self.prepare_latents(
1177
+ batch_size * num_images_per_prompt,
1178
+ num_channels_latents,
1179
+ height,
1180
+ width,
1181
+ prompt_embeds.dtype,
1182
+ device,
1183
+ generator,
1184
+ latents,
1185
+ image=init_image,
1186
+ timestep=latent_timestep,
1187
+ is_strength_max=is_strength_max,
1188
+ return_noise=True,
1189
+ return_image_latents=return_image_latents,
1190
+ )
1191
+
1192
+ if return_image_latents:
1193
+ latents, noise, image_latents = latents_outputs
1194
+ else:
1195
+ latents, noise = latents_outputs
1196
+
1197
+ # 7. Prepare mask latent variables
1198
+ mask_condition = self.mask_processor.preprocess(
1199
+ mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords
1200
+ )
1201
+
1202
+ if masked_image_latents is None:
1203
+ masked_image = init_image * (mask_condition < 0.5)
1204
+ else:
1205
+ masked_image = masked_image_latents
1206
+
1207
+ mask, masked_image_latents = self.prepare_mask_latents(
1208
+ mask_condition,
1209
+ masked_image,
1210
+ batch_size * num_images_per_prompt,
1211
+ height,
1212
+ width,
1213
+ prompt_embeds.dtype,
1214
+ device,
1215
+ generator,
1216
+ self.do_classifier_free_guidance,
1217
+ )
1218
+
1219
+ # 8. Check that sizes of mask, masked image and latents match
1220
+ if num_channels_unet == 9:
1221
+ # default case for stable-diffusion-v1-5/stable-diffusion-inpainting
1222
+ num_channels_mask = mask.shape[1]
1223
+ num_channels_masked_image = masked_image_latents.shape[1]
1224
+ if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
1225
+ raise ValueError(
1226
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
1227
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
1228
+ f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
1229
+ f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
1230
+ " `pipeline.unet` or your `mask_image` or `image` input."
1231
+ )
1232
+ elif num_channels_unet != 4:
1233
+ raise ValueError(
1234
+ f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}."
1235
+ )
1236
+
1237
+ # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1238
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1239
+
1240
+ # 9.1 Add image embeds for IP-Adapter
1241
+ added_cond_kwargs = (
1242
+ {"image_embeds": image_embeds}
1243
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None
1244
+ else None
1245
+ )
1246
+
1247
+ # 9.2 Optionally get Guidance Scale Embedding
1248
+ timestep_cond = None
1249
+ if self.unet.config.time_cond_proj_dim is not None:
1250
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1251
+ timestep_cond = self.get_guidance_scale_embedding(
1252
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1253
+ ).to(device=device, dtype=latents.dtype)
1254
+
1255
+ # 10. Denoising loop
1256
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1257
+ self._num_timesteps = len(timesteps)
1258
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1259
+ for i, t in enumerate(timesteps):
1260
+ if self.interrupt:
1261
+ continue
1262
+
1263
+ # expand the latents if we are doing classifier free guidance
1264
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1265
+
1266
+ # concat latents, mask, masked_image_latents in the channel dimension
1267
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1268
+
1269
+ if num_channels_unet == 9:
1270
+ latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
1271
+
1272
+ # predict the noise residual
1273
+ noise_pred = self.unet(
1274
+ latent_model_input,
1275
+ t,
1276
+ encoder_hidden_states=prompt_embeds,
1277
+ timestep_cond=timestep_cond,
1278
+ cross_attention_kwargs=self.cross_attention_kwargs,
1279
+ added_cond_kwargs=added_cond_kwargs,
1280
+ return_dict=False,
1281
+ )[0]
1282
+
1283
+ # perform guidance
1284
+ if self.do_classifier_free_guidance:
1285
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1286
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1287
+
1288
+ # compute the previous noisy sample x_t -> x_t-1
1289
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1290
+ if num_channels_unet == 4:
1291
+ init_latents_proper = image_latents
1292
+ if self.do_classifier_free_guidance:
1293
+ init_mask, _ = mask.chunk(2)
1294
+ else:
1295
+ init_mask = mask
1296
+
1297
+ if i < len(timesteps) - 1:
1298
+ noise_timestep = timesteps[i + 1]
1299
+ init_latents_proper = self.scheduler.add_noise(
1300
+ init_latents_proper, noise, torch.tensor([noise_timestep])
1301
+ )
1302
+
1303
+ latents = (1 - init_mask) * init_latents_proper + init_mask * latents
1304
+
1305
+ if callback_on_step_end is not None:
1306
+ callback_kwargs = {}
1307
+ for k in callback_on_step_end_tensor_inputs:
1308
+ callback_kwargs[k] = locals()[k]
1309
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1310
+
1311
+ latents = callback_outputs.pop("latents", latents)
1312
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1313
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1314
+ mask = callback_outputs.pop("mask", mask)
1315
+ masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents)
1316
+
1317
+ # call the callback, if provided
1318
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1319
+ progress_bar.update()
1320
+ if callback is not None and i % callback_steps == 0:
1321
+ step_idx = i // getattr(self.scheduler, "order", 1)
1322
+ callback(step_idx, t, latents)
1323
+
1324
+ if XLA_AVAILABLE:
1325
+ xm.mark_step()
1326
+
1327
+ if not output_type == "latent":
1328
+ condition_kwargs = {}
1329
+ if isinstance(self.vae, AsymmetricAutoencoderKL):
1330
+ init_image = init_image.to(device=device, dtype=masked_image_latents.dtype)
1331
+ init_image_condition = init_image.clone()
1332
+ init_image = self._encode_vae_image(init_image, generator=generator)
1333
+ mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype)
1334
+ condition_kwargs = {"image": init_image_condition, "mask": mask_condition}
1335
+ image = self.vae.decode(
1336
+ latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs
1337
+ )[0]
1338
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
1339
+ else:
1340
+ image = latents
1341
+ has_nsfw_concept = None
1342
+
1343
+ if has_nsfw_concept is None:
1344
+ do_denormalize = [True] * image.shape[0]
1345
+ else:
1346
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1347
+
1348
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
1349
+
1350
+ if padding_mask_crop is not None:
1351
+ image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image]
1352
+
1353
+ # Offload all models
1354
+ self.maybe_free_model_hooks()
1355
+
1356
+ if not return_dict:
1357
+ return (image, has_nsfw_concept)
1358
+
1359
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py ADDED
@@ -0,0 +1,917 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The InstructPix2Pix Authors and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
22
+
23
+ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
24
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
25
+ from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
26
+ from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
27
+ from ...schedulers import KarrasDiffusionSchedulers
28
+ from ...utils import PIL_INTERPOLATION, deprecate, is_torch_xla_available, logging
29
+ from ...utils.torch_utils import randn_tensor
30
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
31
+ from . import StableDiffusionPipelineOutput
32
+ from .safety_checker import StableDiffusionSafetyChecker
33
+
34
+
35
+ if is_torch_xla_available():
36
+ import torch_xla.core.xla_model as xm
37
+
38
+ XLA_AVAILABLE = True
39
+ else:
40
+ XLA_AVAILABLE = False
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+
45
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
def preprocess(image):
    """Deprecated helper that converts an image (or list of images) into a
    normalized ``torch.Tensor`` batch in ``[-1, 1]`` with NCHW layout.

    Tensors are passed through unchanged; PIL images are resized down to a
    multiple of 8 and stacked. Use ``VaeImageProcessor.preprocess`` instead.
    """
    deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
    deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)

    # Already a tensor: nothing to do.
    if isinstance(image, torch.Tensor):
        return image
    # Normalize a single PIL image to a one-element list.
    if isinstance(image, PIL.Image.Image):
        image = [image]

    first = image[0]
    if isinstance(first, PIL.Image.Image):
        width, height = first.size
        # Round down to an integer multiple of 8 (VAE downsampling requirement).
        width -= width % 8
        height -= height % 8

        arrays = [
            np.array(img.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]))[None, :]
            for img in image
        ]
        batch = np.concatenate(arrays, axis=0).astype(np.float32) / 255.0
        batch = batch.transpose(0, 3, 1, 2)  # NHWC -> NCHW
        batch = 2.0 * batch - 1.0  # [0, 1] -> [-1, 1]
        image = torch.from_numpy(batch)
    elif isinstance(first, torch.Tensor):
        image = torch.cat(image, dim=0)
    return image
67
+
68
+
69
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from a VAE encoder output.

    With ``sample_mode="sample"`` the result of ``latent_dist.sample(generator)``
    is returned; with ``"argmax"`` the result of ``latent_dist.mode()``. Outputs
    exposing a plain ``latents`` attribute (and outputs whose ``sample_mode`` is
    unrecognized but that also carry ``latents``) return that attribute directly.

    Raises:
        AttributeError: if neither access path is available.
    """
    has_dist = hasattr(encoder_output, "latent_dist")
    if has_dist and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_dist and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
81
+
82
+
83
+ class StableDiffusionInstructPix2PixPipeline(
84
+ DiffusionPipeline,
85
+ StableDiffusionMixin,
86
+ TextualInversionLoaderMixin,
87
+ StableDiffusionLoraLoaderMixin,
88
+ IPAdapterMixin,
89
+ FromSingleFileMixin,
90
+ ):
91
+ r"""
92
+ Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).
93
+
94
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
95
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
96
+
97
+ The pipeline also inherits the following loading methods:
98
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
99
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
100
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
101
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
102
+
103
+ Args:
104
+ vae ([`AutoencoderKL`]):
105
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
106
+ text_encoder ([`~transformers.CLIPTextModel`]):
107
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
108
+ tokenizer ([`~transformers.CLIPTokenizer`]):
109
+ A `CLIPTokenizer` to tokenize text.
110
+ unet ([`UNet2DConditionModel`]):
111
+ A `UNet2DConditionModel` to denoise the encoded image latents.
112
+ scheduler ([`SchedulerMixin`]):
113
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
114
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
115
+ safety_checker ([`StableDiffusionSafetyChecker`]):
116
+ Classification module that estimates whether generated images could be considered offensive or harmful.
117
+ Please refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
118
+ more details about a model's potential harms.
119
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
120
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
121
+ """
122
+
123
    # Order in which sub-models are moved to the accelerator during model CPU offload.
    model_cpu_offload_seq = "text_encoder->unet->vae"
    # Components that may be None when instantiating the pipeline.
    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
    # Components never moved by the CPU-offload hooks.
    _exclude_from_cpu_offload = ["safety_checker"]
    # Tensor names a `callback_on_step_end` callback is allowed to read/override.
    _callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"]
127
+
128
+ def __init__(
129
+ self,
130
+ vae: AutoencoderKL,
131
+ text_encoder: CLIPTextModel,
132
+ tokenizer: CLIPTokenizer,
133
+ unet: UNet2DConditionModel,
134
+ scheduler: KarrasDiffusionSchedulers,
135
+ safety_checker: StableDiffusionSafetyChecker,
136
+ feature_extractor: CLIPImageProcessor,
137
+ image_encoder: Optional[CLIPVisionModelWithProjection] = None,
138
+ requires_safety_checker: bool = True,
139
+ ):
140
+ super().__init__()
141
+
142
+ if safety_checker is None and requires_safety_checker:
143
+ logger.warning(
144
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
145
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
146
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
147
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
148
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
149
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
150
+ )
151
+
152
+ if safety_checker is not None and feature_extractor is None:
153
+ raise ValueError(
154
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
155
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
156
+ )
157
+
158
+ self.register_modules(
159
+ vae=vae,
160
+ text_encoder=text_encoder,
161
+ tokenizer=tokenizer,
162
+ unet=unet,
163
+ scheduler=scheduler,
164
+ safety_checker=safety_checker,
165
+ feature_extractor=feature_extractor,
166
+ image_encoder=image_encoder,
167
+ )
168
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
169
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
170
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
171
+
172
+ @torch.no_grad()
173
+ def __call__(
174
+ self,
175
+ prompt: Union[str, List[str]] = None,
176
+ image: PipelineImageInput = None,
177
+ num_inference_steps: int = 100,
178
+ guidance_scale: float = 7.5,
179
+ image_guidance_scale: float = 1.5,
180
+ negative_prompt: Optional[Union[str, List[str]]] = None,
181
+ num_images_per_prompt: Optional[int] = 1,
182
+ eta: float = 0.0,
183
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
184
+ latents: Optional[torch.Tensor] = None,
185
+ prompt_embeds: Optional[torch.Tensor] = None,
186
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
187
+ ip_adapter_image: Optional[PipelineImageInput] = None,
188
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
189
+ output_type: Optional[str] = "pil",
190
+ return_dict: bool = True,
191
+ callback_on_step_end: Optional[
192
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
193
+ ] = None,
194
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
195
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
196
+ **kwargs,
197
+ ):
198
+ r"""
199
+ The call function to the pipeline for generation.
200
+
201
+ Args:
202
+ prompt (`str` or `List[str]`, *optional*):
203
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
204
+ image (`torch.Tensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
205
+ `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept
206
+ image latents as `image`, but if passing latents directly it is not encoded again.
207
+ num_inference_steps (`int`, *optional*, defaults to 100):
208
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
209
+ expense of slower inference.
210
+ guidance_scale (`float`, *optional*, defaults to 7.5):
211
+ A higher guidance scale value encourages the model to generate images closely linked to the text
212
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
213
+ image_guidance_scale (`float`, *optional*, defaults to 1.5):
214
+ Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
215
+ `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
216
+ linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
217
+ value of at least `1`.
218
+ negative_prompt (`str` or `List[str]`, *optional*):
219
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
220
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
221
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
222
+ The number of images to generate per prompt.
223
+ eta (`float`, *optional*, defaults to 0.0):
224
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
225
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
226
+ generator (`torch.Generator`, *optional*):
227
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
228
+ generation deterministic.
229
+ latents (`torch.Tensor`, *optional*):
230
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
231
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
232
+ tensor is generated by sampling using the supplied random `generator`.
233
+ prompt_embeds (`torch.Tensor`, *optional*):
234
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
235
+ provided, text embeddings are generated from the `prompt` input argument.
236
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
237
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
238
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
239
+ ip_adapter_image: (`PipelineImageInput`, *optional*):
240
+ Optional image input to work with IP Adapters.
241
+ output_type (`str`, *optional*, defaults to `"pil"`):
242
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
243
+ return_dict (`bool`, *optional*, defaults to `True`):
244
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
245
+ plain tuple.
246
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
247
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
248
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
249
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
250
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
251
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
252
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
253
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
254
+ `._callback_tensor_inputs` attribute of your pipeline class.
255
+ cross_attention_kwargs (`dict`, *optional*):
256
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
257
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
258
+
259
+ Examples:
260
+
261
+ ```py
262
+ >>> import PIL
263
+ >>> import requests
264
+ >>> import torch
265
+ >>> from io import BytesIO
266
+
267
+ >>> from diffusers import StableDiffusionInstructPix2PixPipeline
268
+
269
+
270
+ >>> def download_image(url):
271
+ ... response = requests.get(url)
272
+ ... return PIL.Image.open(BytesIO(response.content)).convert("RGB")
273
+
274
+
275
+ >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png"
276
+
277
+ >>> image = download_image(img_url).resize((512, 512))
278
+
279
+ >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
280
+ ... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
281
+ ... )
282
+ >>> pipe = pipe.to("cuda")
283
+
284
+ >>> prompt = "make the mountains snowy"
285
+ >>> image = pipe(prompt=prompt, image=image).images[0]
286
+ ```
287
+
288
+ Returns:
289
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
290
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
291
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
292
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
293
+ "not-safe-for-work" (nsfw) content.
294
+ """
295
+
296
+ callback = kwargs.pop("callback", None)
297
+ callback_steps = kwargs.pop("callback_steps", None)
298
+
299
+ if callback is not None:
300
+ deprecate(
301
+ "callback",
302
+ "1.0.0",
303
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
304
+ )
305
+ if callback_steps is not None:
306
+ deprecate(
307
+ "callback_steps",
308
+ "1.0.0",
309
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
310
+ )
311
+
312
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
313
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
314
+
315
+ # 0. Check inputs
316
+ self.check_inputs(
317
+ prompt,
318
+ callback_steps,
319
+ negative_prompt,
320
+ prompt_embeds,
321
+ negative_prompt_embeds,
322
+ ip_adapter_image,
323
+ ip_adapter_image_embeds,
324
+ callback_on_step_end_tensor_inputs,
325
+ )
326
+ self._guidance_scale = guidance_scale
327
+ self._image_guidance_scale = image_guidance_scale
328
+
329
+ device = self._execution_device
330
+
331
+ if image is None:
332
+ raise ValueError("`image` input cannot be undefined.")
333
+
334
+ # 1. Define call parameters
335
+ if prompt is not None and isinstance(prompt, str):
336
+ batch_size = 1
337
+ elif prompt is not None and isinstance(prompt, list):
338
+ batch_size = len(prompt)
339
+ else:
340
+ batch_size = prompt_embeds.shape[0]
341
+
342
+ device = self._execution_device
343
+
344
+ # 2. Encode input prompt
345
+ prompt_embeds = self._encode_prompt(
346
+ prompt,
347
+ device,
348
+ num_images_per_prompt,
349
+ self.do_classifier_free_guidance,
350
+ negative_prompt,
351
+ prompt_embeds=prompt_embeds,
352
+ negative_prompt_embeds=negative_prompt_embeds,
353
+ )
354
+
355
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
356
+ image_embeds = self.prepare_ip_adapter_image_embeds(
357
+ ip_adapter_image,
358
+ ip_adapter_image_embeds,
359
+ device,
360
+ batch_size * num_images_per_prompt,
361
+ self.do_classifier_free_guidance,
362
+ )
363
+ # 3. Preprocess image
364
+ image = self.image_processor.preprocess(image)
365
+
366
+ # 4. set timesteps
367
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
368
+ timesteps = self.scheduler.timesteps
369
+
370
+ # 5. Prepare Image latents
371
+ image_latents = self.prepare_image_latents(
372
+ image,
373
+ batch_size,
374
+ num_images_per_prompt,
375
+ prompt_embeds.dtype,
376
+ device,
377
+ self.do_classifier_free_guidance,
378
+ )
379
+
380
+ height, width = image_latents.shape[-2:]
381
+ height = height * self.vae_scale_factor
382
+ width = width * self.vae_scale_factor
383
+
384
+ # 6. Prepare latent variables
385
+ num_channels_latents = self.vae.config.latent_channels
386
+ latents = self.prepare_latents(
387
+ batch_size * num_images_per_prompt,
388
+ num_channels_latents,
389
+ height,
390
+ width,
391
+ prompt_embeds.dtype,
392
+ device,
393
+ generator,
394
+ latents,
395
+ )
396
+
397
+ # 7. Check that shapes of latents and image match the UNet channels
398
+ num_channels_image = image_latents.shape[1]
399
+ if num_channels_latents + num_channels_image != self.unet.config.in_channels:
400
+ raise ValueError(
401
+ f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
402
+ f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
403
+ f" `num_channels_image`: {num_channels_image} "
404
+ f" = {num_channels_latents + num_channels_image}. Please verify the config of"
405
+ " `pipeline.unet` or your `image` input."
406
+ )
407
+
408
+ # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
409
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
410
+
411
+ # 8.1 Add image embeds for IP-Adapter
412
+ added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
413
+
414
+ # 9. Denoising loop
415
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
416
+ self._num_timesteps = len(timesteps)
417
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
418
+ for i, t in enumerate(timesteps):
419
+ # Expand the latents if we are doing classifier free guidance.
420
+ # The latents are expanded 3 times because for pix2pix the guidance\
421
+ # is applied for both the text and the input image.
422
+ latent_model_input = torch.cat([latents] * 3) if self.do_classifier_free_guidance else latents
423
+
424
+ # concat latents, image_latents in the channel dimension
425
+ scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
426
+ scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1)
427
+
428
+ # predict the noise residual
429
+ noise_pred = self.unet(
430
+ scaled_latent_model_input,
431
+ t,
432
+ encoder_hidden_states=prompt_embeds,
433
+ added_cond_kwargs=added_cond_kwargs,
434
+ cross_attention_kwargs=cross_attention_kwargs,
435
+ return_dict=False,
436
+ )[0]
437
+
438
+ # perform guidance
439
+ if self.do_classifier_free_guidance:
440
+ noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
441
+ noise_pred = (
442
+ noise_pred_uncond
443
+ + self.guidance_scale * (noise_pred_text - noise_pred_image)
444
+ + self.image_guidance_scale * (noise_pred_image - noise_pred_uncond)
445
+ )
446
+
447
+ # compute the previous noisy sample x_t -> x_t-1
448
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
449
+
450
+ if callback_on_step_end is not None:
451
+ callback_kwargs = {}
452
+ for k in callback_on_step_end_tensor_inputs:
453
+ callback_kwargs[k] = locals()[k]
454
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
455
+
456
+ latents = callback_outputs.pop("latents", latents)
457
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
458
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
459
+ image_latents = callback_outputs.pop("image_latents", image_latents)
460
+
461
+ # call the callback, if provided
462
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
463
+ progress_bar.update()
464
+ if callback is not None and i % callback_steps == 0:
465
+ step_idx = i // getattr(self.scheduler, "order", 1)
466
+ callback(step_idx, t, latents)
467
+
468
+ if XLA_AVAILABLE:
469
+ xm.mark_step()
470
+
471
+ if not output_type == "latent":
472
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
473
+ image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
474
+ else:
475
+ image = latents
476
+ has_nsfw_concept = None
477
+
478
+ if has_nsfw_concept is None:
479
+ do_denormalize = [True] * image.shape[0]
480
+ else:
481
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
482
+
483
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
484
+
485
+ # Offload all models
486
+ self.maybe_free_model_hooks()
487
+
488
+ if not return_dict:
489
+ return (image, has_nsfw_concept)
490
+
491
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
492
+
493
+ def _encode_prompt(
494
+ self,
495
+ prompt,
496
+ device,
497
+ num_images_per_prompt,
498
+ do_classifier_free_guidance,
499
+ negative_prompt=None,
500
+ prompt_embeds: Optional[torch.Tensor] = None,
501
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
502
+ ):
503
+ r"""
504
+ Encodes the prompt into text encoder hidden states.
505
+
506
+ Args:
507
+ prompt (`str` or `List[str]`, *optional*):
508
+ prompt to be encoded
509
+ device: (`torch.device`):
510
+ torch device
511
+ num_images_per_prompt (`int`):
512
+ number of images that should be generated per prompt
513
+ do_classifier_free_guidance (`bool`):
514
+ whether to use classifier free guidance or not
515
+ negative_ prompt (`str` or `List[str]`, *optional*):
516
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
517
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
518
+ less than `1`).
519
+ prompt_embeds (`torch.Tensor`, *optional*):
520
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
521
+ provided, text embeddings will be generated from `prompt` input argument.
522
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
523
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
524
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
525
+ argument.
526
+ """
527
+ if prompt is not None and isinstance(prompt, str):
528
+ batch_size = 1
529
+ elif prompt is not None and isinstance(prompt, list):
530
+ batch_size = len(prompt)
531
+ else:
532
+ batch_size = prompt_embeds.shape[0]
533
+
534
+ if prompt_embeds is None:
535
+ # textual inversion: process multi-vector tokens if necessary
536
+ if isinstance(self, TextualInversionLoaderMixin):
537
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
538
+
539
+ text_inputs = self.tokenizer(
540
+ prompt,
541
+ padding="max_length",
542
+ max_length=self.tokenizer.model_max_length,
543
+ truncation=True,
544
+ return_tensors="pt",
545
+ )
546
+ text_input_ids = text_inputs.input_ids
547
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
548
+
549
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
550
+ text_input_ids, untruncated_ids
551
+ ):
552
+ removed_text = self.tokenizer.batch_decode(
553
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
554
+ )
555
+ logger.warning(
556
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
557
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
558
+ )
559
+
560
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
561
+ attention_mask = text_inputs.attention_mask.to(device)
562
+ else:
563
+ attention_mask = None
564
+
565
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
566
+ prompt_embeds = prompt_embeds[0]
567
+
568
+ if self.text_encoder is not None:
569
+ prompt_embeds_dtype = self.text_encoder.dtype
570
+ else:
571
+ prompt_embeds_dtype = self.unet.dtype
572
+
573
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
574
+
575
+ bs_embed, seq_len, _ = prompt_embeds.shape
576
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
577
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
578
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
579
+
580
+ # get unconditional embeddings for classifier free guidance
581
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
582
+ uncond_tokens: List[str]
583
+ if negative_prompt is None:
584
+ uncond_tokens = [""] * batch_size
585
+ elif type(prompt) is not type(negative_prompt):
586
+ raise TypeError(
587
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
588
+ f" {type(prompt)}."
589
+ )
590
+ elif isinstance(negative_prompt, str):
591
+ uncond_tokens = [negative_prompt]
592
+ elif batch_size != len(negative_prompt):
593
+ raise ValueError(
594
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
595
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
596
+ " the batch size of `prompt`."
597
+ )
598
+ else:
599
+ uncond_tokens = negative_prompt
600
+
601
+ # textual inversion: process multi-vector tokens if necessary
602
+ if isinstance(self, TextualInversionLoaderMixin):
603
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
604
+
605
+ max_length = prompt_embeds.shape[1]
606
+ uncond_input = self.tokenizer(
607
+ uncond_tokens,
608
+ padding="max_length",
609
+ max_length=max_length,
610
+ truncation=True,
611
+ return_tensors="pt",
612
+ )
613
+
614
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
615
+ attention_mask = uncond_input.attention_mask.to(device)
616
+ else:
617
+ attention_mask = None
618
+
619
+ negative_prompt_embeds = self.text_encoder(
620
+ uncond_input.input_ids.to(device),
621
+ attention_mask=attention_mask,
622
+ )
623
+ negative_prompt_embeds = negative_prompt_embeds[0]
624
+
625
+ if do_classifier_free_guidance:
626
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
627
+ seq_len = negative_prompt_embeds.shape[1]
628
+
629
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
630
+
631
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
632
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
633
+
634
+ # For classifier free guidance, we need to do two forward passes.
635
+ # Here we concatenate the unconditional and text embeddings into a single batch
636
+ # to avoid doing two forward passes
637
+ # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]
638
+ prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds])
639
+
640
+ return prompt_embeds
641
+
642
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
643
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
644
+ dtype = next(self.image_encoder.parameters()).dtype
645
+
646
+ if not isinstance(image, torch.Tensor):
647
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
648
+
649
+ image = image.to(device=device, dtype=dtype)
650
+ if output_hidden_states:
651
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
652
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
653
+ uncond_image_enc_hidden_states = self.image_encoder(
654
+ torch.zeros_like(image), output_hidden_states=True
655
+ ).hidden_states[-2]
656
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
657
+ num_images_per_prompt, dim=0
658
+ )
659
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
660
+ else:
661
+ image_embeds = self.image_encoder(image).image_embeds
662
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
663
+ uncond_image_embeds = torch.zeros_like(image_embeds)
664
+
665
+ return image_embeds, uncond_image_embeds
666
+
667
+ def prepare_ip_adapter_image_embeds(
668
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
669
+ ):
670
+ if ip_adapter_image_embeds is None:
671
+ if not isinstance(ip_adapter_image, list):
672
+ ip_adapter_image = [ip_adapter_image]
673
+
674
+ if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
675
+ raise ValueError(
676
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
677
+ )
678
+
679
+ image_embeds = []
680
+ for single_ip_adapter_image, image_proj_layer in zip(
681
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
682
+ ):
683
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
684
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
685
+ single_ip_adapter_image, device, 1, output_hidden_state
686
+ )
687
+ single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
688
+ single_negative_image_embeds = torch.stack(
689
+ [single_negative_image_embeds] * num_images_per_prompt, dim=0
690
+ )
691
+
692
+ if do_classifier_free_guidance:
693
+ single_image_embeds = torch.cat(
694
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
695
+ )
696
+ single_image_embeds = single_image_embeds.to(device)
697
+
698
+ image_embeds.append(single_image_embeds)
699
+ else:
700
+ repeat_dims = [1]
701
+ image_embeds = []
702
+ for single_image_embeds in ip_adapter_image_embeds:
703
+ if do_classifier_free_guidance:
704
+ (
705
+ single_image_embeds,
706
+ single_negative_image_embeds,
707
+ single_negative_image_embeds,
708
+ ) = single_image_embeds.chunk(3)
709
+ single_image_embeds = single_image_embeds.repeat(
710
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
711
+ )
712
+ single_negative_image_embeds = single_negative_image_embeds.repeat(
713
+ num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:]))
714
+ )
715
+ single_image_embeds = torch.cat(
716
+ [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds]
717
+ )
718
+ else:
719
+ single_image_embeds = single_image_embeds.repeat(
720
+ num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:]))
721
+ )
722
+ image_embeds.append(single_image_embeds)
723
+
724
+ return image_embeds
725
+
726
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
727
+ def run_safety_checker(self, image, device, dtype):
728
+ if self.safety_checker is None:
729
+ has_nsfw_concept = None
730
+ else:
731
+ if torch.is_tensor(image):
732
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
733
+ else:
734
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
735
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
736
+ image, has_nsfw_concept = self.safety_checker(
737
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
738
+ )
739
+ return image, has_nsfw_concept
740
+
741
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
742
+ def prepare_extra_step_kwargs(self, generator, eta):
743
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
744
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
745
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
746
+ # and should be between [0, 1]
747
+
748
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
749
+ extra_step_kwargs = {}
750
+ if accepts_eta:
751
+ extra_step_kwargs["eta"] = eta
752
+
753
+ # check if the scheduler accepts generator
754
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
755
+ if accepts_generator:
756
+ extra_step_kwargs["generator"] = generator
757
+ return extra_step_kwargs
758
+
759
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
760
+ def decode_latents(self, latents):
761
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
762
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
763
+
764
+ latents = 1 / self.vae.config.scaling_factor * latents
765
+ image = self.vae.decode(latents, return_dict=False)[0]
766
+ image = (image / 2 + 0.5).clamp(0, 1)
767
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
768
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
769
+ return image
770
+
771
+ def check_inputs(
772
+ self,
773
+ prompt,
774
+ callback_steps,
775
+ negative_prompt=None,
776
+ prompt_embeds=None,
777
+ negative_prompt_embeds=None,
778
+ ip_adapter_image=None,
779
+ ip_adapter_image_embeds=None,
780
+ callback_on_step_end_tensor_inputs=None,
781
+ ):
782
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
783
+ raise ValueError(
784
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
785
+ f" {type(callback_steps)}."
786
+ )
787
+
788
+ if callback_on_step_end_tensor_inputs is not None and not all(
789
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
790
+ ):
791
+ raise ValueError(
792
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
793
+ )
794
+
795
+ if prompt is not None and prompt_embeds is not None:
796
+ raise ValueError(
797
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
798
+ " only forward one of the two."
799
+ )
800
+ elif prompt is None and prompt_embeds is None:
801
+ raise ValueError(
802
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
803
+ )
804
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
805
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
806
+
807
+ if negative_prompt is not None and negative_prompt_embeds is not None:
808
+ raise ValueError(
809
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
810
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
811
+ )
812
+
813
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
814
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
815
+ raise ValueError(
816
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
817
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
818
+ f" {negative_prompt_embeds.shape}."
819
+ )
820
+
821
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
822
+ raise ValueError(
823
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
824
+ )
825
+
826
+ if ip_adapter_image_embeds is not None:
827
+ if not isinstance(ip_adapter_image_embeds, list):
828
+ raise ValueError(
829
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
830
+ )
831
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
832
+ raise ValueError(
833
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
834
+ )
835
+
836
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
837
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
838
+ shape = (
839
+ batch_size,
840
+ num_channels_latents,
841
+ int(height) // self.vae_scale_factor,
842
+ int(width) // self.vae_scale_factor,
843
+ )
844
+ if isinstance(generator, list) and len(generator) != batch_size:
845
+ raise ValueError(
846
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
847
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
848
+ )
849
+
850
+ if latents is None:
851
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
852
+ else:
853
+ latents = latents.to(device)
854
+
855
+ # scale the initial noise by the standard deviation required by the scheduler
856
+ latents = latents * self.scheduler.init_noise_sigma
857
+ return latents
858
+
859
+ def prepare_image_latents(
860
+ self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None
861
+ ):
862
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
863
+ raise ValueError(
864
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
865
+ )
866
+
867
+ image = image.to(device=device, dtype=dtype)
868
+
869
+ batch_size = batch_size * num_images_per_prompt
870
+
871
+ if image.shape[1] == 4:
872
+ image_latents = image
873
+ else:
874
+ image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax")
875
+
876
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
877
+ # expand image_latents for batch_size
878
+ deprecation_message = (
879
+ f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
880
+ " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
881
+ " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
882
+ " your script to pass as many initial images as text prompts to suppress this warning."
883
+ )
884
+ deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
885
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
886
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
887
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
888
+ raise ValueError(
889
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
890
+ )
891
+ else:
892
+ image_latents = torch.cat([image_latents], dim=0)
893
+
894
+ if do_classifier_free_guidance:
895
+ uncond_image_latents = torch.zeros_like(image_latents)
896
+ image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)
897
+
898
+ return image_latents
899
+
900
+ @property
901
+ def guidance_scale(self):
902
+ return self._guidance_scale
903
+
904
+ @property
905
+ def image_guidance_scale(self):
906
+ return self._image_guidance_scale
907
+
908
+ @property
909
+ def num_timesteps(self):
910
+ return self._num_timesteps
911
+
912
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
913
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
914
+ # corresponds to doing no classifier free guidance.
915
+ @property
916
+ def do_classifier_free_guidance(self):
917
+ return self.guidance_scale > 1.0 and self.image_guidance_scale >= 1.0
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import warnings
16
+ from typing import Callable, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from transformers import CLIPTextModel, CLIPTokenizer
23
+
24
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
25
+ from ...loaders import FromSingleFileMixin
26
+ from ...models import AutoencoderKL, UNet2DConditionModel
27
+ from ...schedulers import EulerDiscreteScheduler
28
+ from ...utils import deprecate, is_torch_xla_available, logging
29
+ from ...utils.torch_utils import randn_tensor
30
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
31
+
32
+
33
+ if is_torch_xla_available():
34
+ import torch_xla.core.xla_model as xm
35
+
36
+ XLA_AVAILABLE = True
37
+ else:
38
+ XLA_AVAILABLE = False
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+
43
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from a VAE encoder output.

    Supports outputs exposing a `latent_dist` (sampled, or mode when
    `sample_mode == "argmax"`) as well as outputs carrying a plain `latents`
    attribute; raises `AttributeError` otherwise.
    """
    has_dist = hasattr(encoder_output, "latent_dist")
    if has_dist and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    if has_dist and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
55
+
56
+
57
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess
def preprocess(image):
    """Deprecated image preprocessing shim; use `VaeImageProcessor.preprocess` instead.

    Tensors pass through untouched; PIL images are resized down to a multiple of
    64, stacked, scaled to `[-1, 1]`, and returned as a NCHW tensor; lists of
    tensors are concatenated along the batch dimension.
    """
    warnings.warn(
        "The preprocess method is deprecated and will be removed in a future version. Please"
        " use VaeImageProcessor.preprocess instead",
        FutureWarning,
    )
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        width, height = image[0].size
        # round both dimensions down to an integer multiple of 64
        width, height = (dim - dim % 64 for dim in (width, height))

        arrays = [np.array(img.resize((width, height)))[None, :] for img in image]
        batch = np.concatenate(arrays, axis=0).astype(np.float32) / 255.0
        batch = batch.transpose(0, 3, 1, 2)  # NHWC -> NCHW
        batch = 2.0 * batch - 1.0  # [0, 1] -> [-1, 1]
        image = torch.from_numpy(batch)
    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, dim=0)
    return image
82
+
83
+
84
+ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin):
85
+ r"""
86
+ Pipeline for upscaling Stable Diffusion output image resolution by a factor of 2.
87
+
88
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
89
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
90
+
91
+ The pipeline also inherits the following loading methods:
92
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
93
+
94
+ Args:
95
+ vae ([`AutoencoderKL`]):
96
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
97
+ text_encoder ([`~transformers.CLIPTextModel`]):
98
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
99
+ tokenizer ([`~transformers.CLIPTokenizer`]):
100
+ A `CLIPTokenizer` to tokenize text.
101
+ unet ([`UNet2DConditionModel`]):
102
+ A `UNet2DConditionModel` to denoise the encoded image latents.
103
+ scheduler ([`SchedulerMixin`]):
104
+ A [`EulerDiscreteScheduler`] to be used in combination with `unet` to denoise the encoded image latents.
105
+ """
106
+
107
+ model_cpu_offload_seq = "text_encoder->unet->vae"
108
+
109
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: EulerDiscreteScheduler,
    ):
        """Register the pipeline components and set up the image processor.

        Args:
            vae: VAE used to encode input images / decode output latents.
            text_encoder: Frozen CLIP text encoder producing prompt embeddings.
            tokenizer: CLIP tokenizer matching `text_encoder`.
            unet: Denoising UNet conditioned on text and noise-level embeddings.
            scheduler: Euler discrete scheduler driving the denoising loop.
        """
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
        )
        # Downsampling factor of the VAE is 2 per block transition; fall back to 8 if no VAE is set.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        # NOTE(review): bicubic resampling — presumably chosen to better preserve detail for upscaling.
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
128
+
129
    def _encode_prompt(
        self,
        prompt,
        device,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """Deprecated wrapper around `encode_prompt`.

        Kept for backwards compatibility: returns the negative and positive
        embeddings concatenated into single tensors instead of a tuple of four.
        """
        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)

        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompt,
            device=device,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            **kwargs,
        )

        # Legacy output layout: unconditional embeddings first, then conditional ones.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
        pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])

        return prompt_embeds, pooled_prompt_embeds
165
+
166
+ def encode_prompt(
167
+ self,
168
+ prompt,
169
+ device,
170
+ do_classifier_free_guidance,
171
+ negative_prompt=None,
172
+ prompt_embeds: Optional[torch.Tensor] = None,
173
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
174
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
175
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
176
+ ):
177
+ r"""
178
+ Encodes the prompt into text encoder hidden states.
179
+
180
+ Args:
181
+ prompt (`str` or `list(int)`):
182
+ prompt to be encoded
183
+ device: (`torch.device`):
184
+ torch device
185
+ do_classifier_free_guidance (`bool`):
186
+ whether to use classifier free guidance or not
187
+ negative_prompt (`str` or `List[str]`):
188
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
189
+ if `guidance_scale` is less than `1`).
190
+ prompt_embeds (`torch.FloatTensor`, *optional*):
191
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
192
+ provided, text embeddings will be generated from `prompt` input argument.
193
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
194
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
195
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
196
+ argument.
197
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
198
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
199
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
200
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
201
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
202
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
203
+ input argument.
204
+ """
205
+ if prompt is not None and isinstance(prompt, str):
206
+ batch_size = 1
207
+ elif prompt is not None and isinstance(prompt, list):
208
+ batch_size = len(prompt)
209
+ else:
210
+ batch_size = prompt_embeds.shape[0]
211
+
212
+ if prompt_embeds is None or pooled_prompt_embeds is None:
213
+ text_inputs = self.tokenizer(
214
+ prompt,
215
+ padding="max_length",
216
+ max_length=self.tokenizer.model_max_length,
217
+ truncation=True,
218
+ return_length=True,
219
+ return_tensors="pt",
220
+ )
221
+ text_input_ids = text_inputs.input_ids
222
+
223
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
224
+
225
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
226
+ text_input_ids, untruncated_ids
227
+ ):
228
+ removed_text = self.tokenizer.batch_decode(
229
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
230
+ )
231
+ logger.warning(
232
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
233
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
234
+ )
235
+
236
+ text_encoder_out = self.text_encoder(
237
+ text_input_ids.to(device),
238
+ output_hidden_states=True,
239
+ )
240
+ prompt_embeds = text_encoder_out.hidden_states[-1]
241
+ pooled_prompt_embeds = text_encoder_out.pooler_output
242
+
243
+ # get unconditional embeddings for classifier free guidance
244
+ if do_classifier_free_guidance:
245
+ if negative_prompt_embeds is None or negative_pooled_prompt_embeds is None:
246
+ uncond_tokens: List[str]
247
+ if negative_prompt is None:
248
+ uncond_tokens = [""] * batch_size
249
+ elif type(prompt) is not type(negative_prompt):
250
+ raise TypeError(
251
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
252
+ f" {type(prompt)}."
253
+ )
254
+ elif isinstance(negative_prompt, str):
255
+ uncond_tokens = [negative_prompt]
256
+ elif batch_size != len(negative_prompt):
257
+ raise ValueError(
258
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
259
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
260
+ " the batch size of `prompt`."
261
+ )
262
+ else:
263
+ uncond_tokens = negative_prompt
264
+
265
+ max_length = text_input_ids.shape[-1]
266
+ uncond_input = self.tokenizer(
267
+ uncond_tokens,
268
+ padding="max_length",
269
+ max_length=max_length,
270
+ truncation=True,
271
+ return_length=True,
272
+ return_tensors="pt",
273
+ )
274
+
275
+ uncond_encoder_out = self.text_encoder(
276
+ uncond_input.input_ids.to(device),
277
+ output_hidden_states=True,
278
+ )
279
+
280
+ negative_prompt_embeds = uncond_encoder_out.hidden_states[-1]
281
+ negative_pooled_prompt_embeds = uncond_encoder_out.pooler_output
282
+
283
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
284
+
285
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
286
+ def decode_latents(self, latents):
287
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
288
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
289
+
290
+ latents = 1 / self.vae.config.scaling_factor * latents
291
+ image = self.vae.decode(latents, return_dict=False)[0]
292
+ image = (image / 2 + 0.5).clamp(0, 1)
293
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
294
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
295
+ return image
296
+
297
+ def check_inputs(
298
+ self,
299
+ prompt,
300
+ image,
301
+ callback_steps,
302
+ negative_prompt=None,
303
+ prompt_embeds=None,
304
+ negative_prompt_embeds=None,
305
+ pooled_prompt_embeds=None,
306
+ negative_pooled_prompt_embeds=None,
307
+ ):
308
+ if prompt is not None and prompt_embeds is not None:
309
+ raise ValueError(
310
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
311
+ " only forward one of the two."
312
+ )
313
+ elif prompt is None and prompt_embeds is None:
314
+ raise ValueError(
315
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
316
+ )
317
+ elif prompt is not None and not isinstance(prompt, str) and not isinstance(prompt, list):
318
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
319
+
320
+ if negative_prompt is not None and negative_prompt_embeds is not None:
321
+ raise ValueError(
322
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
323
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
324
+ )
325
+
326
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
327
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
328
+ raise ValueError(
329
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
330
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
331
+ f" {negative_prompt_embeds.shape}."
332
+ )
333
+
334
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
335
+ raise ValueError(
336
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
337
+ )
338
+
339
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
340
+ raise ValueError(
341
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
342
+ )
343
+
344
+ if (
345
+ not isinstance(image, torch.Tensor)
346
+ and not isinstance(image, np.ndarray)
347
+ and not isinstance(image, PIL.Image.Image)
348
+ and not isinstance(image, list)
349
+ ):
350
+ raise ValueError(
351
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
352
+ )
353
+
354
+ # verify batch size of prompt and image are same if image is a list or tensor
355
+ if isinstance(image, (list, torch.Tensor)):
356
+ if prompt is not None:
357
+ if isinstance(prompt, str):
358
+ batch_size = 1
359
+ else:
360
+ batch_size = len(prompt)
361
+ else:
362
+ batch_size = prompt_embeds.shape[0]
363
+
364
+ if isinstance(image, list):
365
+ image_batch_size = len(image)
366
+ else:
367
+ image_batch_size = image.shape[0] if image.ndim == 4 else 1
368
+ if batch_size != image_batch_size:
369
+ raise ValueError(
370
+ f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
371
+ " Please make sure that passed `prompt` matches the batch size of `image`."
372
+ )
373
+
374
+ if (callback_steps is None) or (
375
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
376
+ ):
377
+ raise ValueError(
378
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
379
+ f" {type(callback_steps)}."
380
+ )
381
+
382
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents
383
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
384
+ shape = (batch_size, num_channels_latents, height, width)
385
+ if latents is None:
386
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
387
+ else:
388
+ if latents.shape != shape:
389
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
390
+ latents = latents.to(device)
391
+
392
+ # scale the initial noise by the standard deviation required by the scheduler
393
+ latents = latents * self.scheduler.init_noise_sigma
394
+ return latents
395
+
396
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        num_inference_steps: int = 75,
        guidance_scale: float = 9.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide image upscaling.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a
                latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered
                a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and
                encoded using this pipeline's `vae` encoder.
            num_inference_steps (`int`, *optional*, defaults to 75):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 9.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings; if given, `pooled_prompt_embeds` must be given too.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings; if given, `negative_pooled_prompt_embeds` must be given too.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings matching `prompt_embeds`.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings matching `negative_prompt_embeds`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return an [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.

        Examples:
        ```py
        >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline
        >>> import torch


        >>> pipeline = StableDiffusionPipeline.from_pretrained(
        ...     "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
        ... )
        >>> pipeline.to("cuda")

        >>> model_id = "stabilityai/sd-x2-latent-upscaler"
        >>> upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(model_id, torch_dtype=torch.float16)
        >>> upscaler.to("cuda")

        >>> prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
        >>> generator = torch.manual_seed(33)

        >>> low_res_latents = pipeline(prompt, generator=generator, output_type="latent").images

        >>> with torch.no_grad():
        ...     image = pipeline.decode_latents(low_res_latents)
        >>> image = pipeline.numpy_to_pil(image)[0]

        >>> image.save("../images/a1.png")

        >>> upscaled_image = upscaler(
        ...     prompt=prompt,
        ...     image=low_res_latents,
        ...     num_inference_steps=20,
        ...     guidance_scale=0,
        ...     generator=generator,
        ... ).images[0]

        >>> upscaled_image.save("../images/a2.png")
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """

        # 1. Check inputs
        self.check_inputs(
            prompt,
            image,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        )

        # 2. Define call parameters
        if prompt is not None:
            batch_size = 1 if isinstance(prompt, str) else len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # guidance_scale == 0 means fully unconditional: drop the text prompt entirely.
        if guidance_scale == 0:
            prompt = [""] * batch_size

        # 3. Encode input prompt
        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt,
            device,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        )

        # Stack unconditional + conditional so one UNet pass serves both CFG branches.
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds])

        # 4. Preprocess image
        image = self.image_processor.preprocess(image)
        image = image.to(dtype=prompt_embeds.dtype, device=device)
        if image.shape[1] == 3:
            # encode image if not in latent-space yet
            image = retrieve_latents(self.vae.encode(image), generator=generator) * self.vae.config.scaling_factor

        # 5. Set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        batch_multiplier = 2 if do_classifier_free_guidance else 1
        image = image[None, :] if image.ndim == 3 else image
        image = torch.cat([image] * batch_multiplier)

        # 6. Add noise to image (noise level fixed to 0):
        # (note from the original author):
        # "This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default."
        noise_level = torch.tensor([0.0], dtype=torch.float32, device=device)
        noise_level = torch.cat([noise_level] * image.shape[0])
        inv_noise_level = (noise_level**2 + 1) ** (-0.5)

        # Nearest-neighbor 2x upscale of the conditioning latents, rescaled by the inverse noise level.
        image_cond = F.interpolate(image, scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None]
        image_cond = image_cond.to(prompt_embeds.dtype)

        # 128-dim constant embedding (ones then zeros) encoding the zero noise level.
        noise_level_embed = torch.cat(
            [
                torch.ones(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
                torch.zeros(pooled_prompt_embeds.shape[0], 64, dtype=pooled_prompt_embeds.dtype, device=device),
            ],
            dim=1,
        )

        timestep_condition = torch.cat([noise_level_embed, pooled_prompt_embeds], dim=1)

        # 7. Prepare latent variables
        height, width = image.shape[2:]
        num_channels_latents = self.vae.config.latent_channels
        latents = self.prepare_latents(
            batch_size,
            num_channels_latents,
            height * 2,  # 2x upscale
            width * 2,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 8. Check that sizes of image and latents match
        num_channels_image = image.shape[1]
        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_image`: {num_channels_image} "
                f" = {num_channels_latents + num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

        # 9. Denoising loop
        num_warmup_steps = 0

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                sigma = self.scheduler.sigmas[i]
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate the image conditioning along the channel axis.
                scaled_model_input = torch.cat([scaled_model_input, image_cond], dim=1)
                # preconditioning parameter based on Karras et al. (2022) (table 1)
                timestep = torch.log(sigma) * 0.25

                noise_pred = self.unet(
                    scaled_model_input,
                    timestep,
                    encoder_hidden_states=prompt_embeds,
                    timestep_cond=timestep_condition,
                ).sample

                # in original repo, the output contains a variance channel that's not used
                noise_pred = noise_pred[:, :-1]

                # apply preconditioning, based on table 1 in Karras et al. (2022)
                inv_sigma = 1 / (sigma**2 + 1)
                noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        # 10. Post-processing: decode unless the caller asked for raw latents.
        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        else:
            image = latents

        image = self.image_processor.postprocess(image, output_type=output_type)

        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import warnings
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
+
24
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
25
+ from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
26
+ from ...models import AutoencoderKL, UNet2DConditionModel
27
+ from ...models.attention_processor import (
28
+ AttnProcessor2_0,
29
+ XFormersAttnProcessor,
30
+ )
31
+ from ...models.lora import adjust_lora_scale_text_encoder
32
+ from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
33
+ from ...utils import (
34
+ USE_PEFT_BACKEND,
35
+ deprecate,
36
+ is_torch_xla_available,
37
+ logging,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from ...utils.torch_utils import randn_tensor
42
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
43
+ from . import StableDiffusionPipelineOutput
44
+
45
+
46
+ if is_torch_xla_available():
47
+ import torch_xla.core.xla_model as xm
48
+
49
+ XLA_AVAILABLE = True
50
+ else:
51
+ XLA_AVAILABLE = False
52
+
53
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
+
55
+
56
def preprocess(image):
    """Deprecated image preprocessing shim; use `VaeImageProcessor.preprocess` instead.

    Passes tensors through unchanged, concatenates lists of tensors along the
    batch axis, and converts (lists of) PIL images — resized down to a multiple
    of 64 — into a NCHW float tensor scaled to `[-1, 1]`.
    """
    warnings.warn(
        "The preprocess method is deprecated and will be removed in a future version. Please"
        " use VaeImageProcessor.preprocess instead",
        FutureWarning,
    )
    if isinstance(image, torch.Tensor):
        return image
    if isinstance(image, PIL.Image.Image):
        image = [image]

    if isinstance(image[0], PIL.Image.Image):
        src_w, src_h = image[0].size
        # snap both dimensions down to the nearest multiple of 64
        src_w, src_h = (side - side % 64 for side in (src_w, src_h))

        frames = [np.array(img.resize((src_w, src_h)))[None, :] for img in image]
        stacked = np.concatenate(frames, axis=0).astype(np.float32) / 255.0
        stacked = stacked.transpose(0, 3, 1, 2)  # NHWC -> NCHW
        stacked = 2.0 * stacked - 1.0  # [0, 1] -> [-1, 1]
        image = torch.from_numpy(stacked)
    elif isinstance(image[0], torch.Tensor):
        image = torch.cat(image, dim=0)
    return image
80
+
81
+
82
+ class StableDiffusionUpscalePipeline(
83
+ DiffusionPipeline,
84
+ StableDiffusionMixin,
85
+ TextualInversionLoaderMixin,
86
+ StableDiffusionLoraLoaderMixin,
87
+ FromSingleFileMixin,
88
+ ):
89
+ r"""
90
+ Pipeline for text-guided image super-resolution using Stable Diffusion 2.
91
+
92
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
93
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
94
+
95
+ The pipeline also inherits the following loading methods:
96
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
97
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
98
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
99
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
100
+
101
+ Args:
102
+ vae ([`AutoencoderKL`]):
103
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
104
+ text_encoder ([`~transformers.CLIPTextModel`]):
105
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
106
+ tokenizer ([`~transformers.CLIPTokenizer`]):
107
+ A `CLIPTokenizer` to tokenize text.
108
+ unet ([`UNet2DConditionModel`]):
109
+ A `UNet2DConditionModel` to denoise the encoded image latents.
110
+ low_res_scheduler ([`SchedulerMixin`]):
111
+ A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of
112
+ [`DDPMScheduler`].
113
+ scheduler ([`SchedulerMixin`]):
114
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
115
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
116
+ """
117
+
118
+ model_cpu_offload_seq = "text_encoder->unet->vae"
119
+ _optional_components = ["watermarker", "safety_checker", "feature_extractor"]
120
+ _exclude_from_cpu_offload = ["safety_checker"]
121
+
122
+ def __init__(
123
+ self,
124
+ vae: AutoencoderKL,
125
+ text_encoder: CLIPTextModel,
126
+ tokenizer: CLIPTokenizer,
127
+ unet: UNet2DConditionModel,
128
+ low_res_scheduler: DDPMScheduler,
129
+ scheduler: KarrasDiffusionSchedulers,
130
+ safety_checker: Optional[Any] = None,
131
+ feature_extractor: Optional[CLIPImageProcessor] = None,
132
+ watermarker: Optional[Any] = None,
133
+ max_noise_level: int = 350,
134
+ ):
135
+ super().__init__()
136
+
137
+ if hasattr(
138
+ vae, "config"
139
+ ): # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate
140
+ is_vae_scaling_factor_set_to_0_08333 = (
141
+ hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333
142
+ )
143
+ if not is_vae_scaling_factor_set_to_0_08333:
144
+ deprecation_message = (
145
+ "The configuration file of the vae does not contain `scaling_factor` or it is set to"
146
+ f" {vae.config.scaling_factor}, which seems highly unlikely. If your checkpoint is a fine-tuned"
147
+ " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to"
148
+ " 0.08333 Please make sure to update the config accordingly, as not doing so might lead to"
149
+ " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging"
150
+ " Face Hub, it would be very nice if you could open a Pull Request for the `vae/config.json` file"
151
+ )
152
+ deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False)
153
+ vae.register_to_config(scaling_factor=0.08333)
154
+
155
+ self.register_modules(
156
+ vae=vae,
157
+ text_encoder=text_encoder,
158
+ tokenizer=tokenizer,
159
+ unet=unet,
160
+ low_res_scheduler=low_res_scheduler,
161
+ scheduler=scheduler,
162
+ safety_checker=safety_checker,
163
+ watermarker=watermarker,
164
+ feature_extractor=feature_extractor,
165
+ )
166
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
167
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
168
+ self.register_to_config(max_noise_level=max_noise_level)
169
+
170
+ def run_safety_checker(self, image, device, dtype):
171
+ if self.safety_checker is not None:
172
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
173
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
174
+ image, nsfw_detected, watermark_detected = self.safety_checker(
175
+ images=image,
176
+ clip_input=safety_checker_input.pixel_values.to(dtype=dtype),
177
+ )
178
+ else:
179
+ nsfw_detected = None
180
+ watermark_detected = None
181
+
182
+ if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None:
183
+ self.unet_offload_hook.offload()
184
+
185
+ return image, nsfw_detected, watermark_detected
186
+
187
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
188
+ def _encode_prompt(
189
+ self,
190
+ prompt,
191
+ device,
192
+ num_images_per_prompt,
193
+ do_classifier_free_guidance,
194
+ negative_prompt=None,
195
+ prompt_embeds: Optional[torch.Tensor] = None,
196
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
197
+ lora_scale: Optional[float] = None,
198
+ **kwargs,
199
+ ):
200
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
201
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
202
+
203
+ prompt_embeds_tuple = self.encode_prompt(
204
+ prompt=prompt,
205
+ device=device,
206
+ num_images_per_prompt=num_images_per_prompt,
207
+ do_classifier_free_guidance=do_classifier_free_guidance,
208
+ negative_prompt=negative_prompt,
209
+ prompt_embeds=prompt_embeds,
210
+ negative_prompt_embeds=negative_prompt_embeds,
211
+ lora_scale=lora_scale,
212
+ **kwargs,
213
+ )
214
+
215
+ # concatenate for backwards comp
216
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
217
+
218
+ return prompt_embeds
219
+
220
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.

        Returns:
            A `(prompt_embeds, negative_prompt_embeds)` tuple; the second element is `None`
            unless classifier-free guidance is enabled or embeddings were passed in.
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        # Derive the batch size from whichever prompt representation was supplied.
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            # Warn the user when the prompt was silently truncated to the model max length.
            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # Access the `hidden_states` first, that contains a tuple of
                # all the hidden states from the encoder layers. Then index into
                # the tuple to access the hidden states from the desired layer.
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                # We also need to apply the final LayerNorm here to not mess with the
                # representations. The `last_hidden_states` that we typically use for
                # obtaining the final prompt representations passes through the LayerNorm
                # layer.
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        # Pick a dtype for the embeddings: text encoder, then unet, then the
        # embeddings themselves as a last resort.
        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            # Pad the unconditional input to the same sequence length as the prompt.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds
402
+
403
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
404
+ def prepare_extra_step_kwargs(self, generator, eta):
405
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
406
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
407
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
408
+ # and should be between [0, 1]
409
+
410
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
411
+ extra_step_kwargs = {}
412
+ if accepts_eta:
413
+ extra_step_kwargs["eta"] = eta
414
+
415
+ # check if the scheduler accepts generator
416
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
417
+ if accepts_generator:
418
+ extra_step_kwargs["generator"] = generator
419
+ return extra_step_kwargs
420
+
421
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
422
+ def decode_latents(self, latents):
423
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
424
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
425
+
426
+ latents = 1 / self.vae.config.scaling_factor * latents
427
+ image = self.vae.decode(latents, return_dict=False)[0]
428
+ image = (image / 2 + 0.5).clamp(0, 1)
429
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
430
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
431
+ return image
432
+
433
+ def check_inputs(
434
+ self,
435
+ prompt,
436
+ image,
437
+ noise_level,
438
+ callback_steps,
439
+ negative_prompt=None,
440
+ prompt_embeds=None,
441
+ negative_prompt_embeds=None,
442
+ ):
443
+ if (callback_steps is None) or (
444
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
445
+ ):
446
+ raise ValueError(
447
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
448
+ f" {type(callback_steps)}."
449
+ )
450
+
451
+ if prompt is not None and prompt_embeds is not None:
452
+ raise ValueError(
453
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
454
+ " only forward one of the two."
455
+ )
456
+ elif prompt is None and prompt_embeds is None:
457
+ raise ValueError(
458
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
459
+ )
460
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
461
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
462
+
463
+ if negative_prompt is not None and negative_prompt_embeds is not None:
464
+ raise ValueError(
465
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
466
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
467
+ )
468
+
469
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
470
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
471
+ raise ValueError(
472
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
473
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
474
+ f" {negative_prompt_embeds.shape}."
475
+ )
476
+
477
+ if (
478
+ not isinstance(image, torch.Tensor)
479
+ and not isinstance(image, PIL.Image.Image)
480
+ and not isinstance(image, np.ndarray)
481
+ and not isinstance(image, list)
482
+ ):
483
+ raise ValueError(
484
+ f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
485
+ )
486
+
487
+ # verify batch size of prompt and image are same if image is a list or tensor or numpy array
488
+ if isinstance(image, (list, np.ndarray, torch.Tensor)):
489
+ if prompt is not None and isinstance(prompt, str):
490
+ batch_size = 1
491
+ elif prompt is not None and isinstance(prompt, list):
492
+ batch_size = len(prompt)
493
+ else:
494
+ batch_size = prompt_embeds.shape[0]
495
+
496
+ if isinstance(image, list):
497
+ image_batch_size = len(image)
498
+ else:
499
+ image_batch_size = image.shape[0]
500
+ if batch_size != image_batch_size:
501
+ raise ValueError(
502
+ f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
503
+ " Please make sure that passed `prompt` matches the batch size of `image`."
504
+ )
505
+
506
+ # check noise level
507
+ if noise_level > self.config.max_noise_level:
508
+ raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
509
+
510
+ if (callback_steps is None) or (
511
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
512
+ ):
513
+ raise ValueError(
514
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
515
+ f" {type(callback_steps)}."
516
+ )
517
+
518
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
519
+ shape = (batch_size, num_channels_latents, height, width)
520
+ if latents is None:
521
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
522
+ else:
523
+ if latents.shape != shape:
524
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
525
+ latents = latents.to(device)
526
+
527
+ # scale the initial noise by the standard deviation required by the scheduler
528
+ latents = latents * self.scheduler.init_noise_sigma
529
+ return latents
530
+
531
+ def upcast_vae(self):
532
+ dtype = self.vae.dtype
533
+ self.vae.to(dtype=torch.float32)
534
+ use_torch_2_0_or_xformers = isinstance(
535
+ self.vae.decoder.mid_block.attentions[0].processor,
536
+ (
537
+ AttnProcessor2_0,
538
+ XFormersAttnProcessor,
539
+ ),
540
+ )
541
+ # if xformers or torch_2_0 is used attention block does not need
542
+ # to be in float32 which can save lots of memory
543
+ if use_torch_2_0_or_xformers:
544
+ self.vae.post_quant_conv.to(dtype)
545
+ self.vae.decoder.conv_in.to(dtype)
546
+ self.vae.decoder.mid_block.to(dtype)
547
+
548
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        num_inference_steps: int = 75,
        guidance_scale: float = 9.0,
        noise_level: int = 20,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: int = None,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image` or tensor representing an image batch to be upscaled.
            num_inference_steps (`int`, *optional*, defaults to 75):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 9.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            noise_level (`int`, *optional*, defaults to 20):
                The amount of noise added to the low-resolution conditioning image via `low_res_scheduler`. Must not
                exceed `self.config.max_noise_level`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
        ```py
        >>> import requests
        >>> from PIL import Image
        >>> from io import BytesIO
        >>> from diffusers import StableDiffusionUpscalePipeline
        >>> import torch

        >>> # load model and scheduler
        >>> model_id = "stabilityai/stable-diffusion-x4-upscaler"
        >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained(
        ...     model_id, variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipeline = pipeline.to("cuda")

        >>> # let's download an  image
        >>> url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
        >>> response = requests.get(url)
        >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
        >>> low_res_img = low_res_img.resize((128, 128))
        >>> prompt = "a white cat"

        >>> upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
        >>> upscaled_image.save("upsampled_cat.png")
        ```

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated images and the
                second element is a list of `bool`s indicating whether the corresponding generated image contains
                "not-safe-for-work" (nsfw) content.
        """

        # 1. Check inputs
        self.check_inputs(
            prompt,
            image,
            noise_level,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
        )

        if image is None:
            raise ValueError("`image` input cannot be undefined.")

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=clip_skip,
        )
        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        # 4. Preprocess image
        image = self.image_processor.preprocess(image)
        image = image.to(dtype=prompt_embeds.dtype, device=device)

        # 5. Set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Add noise to the low-resolution conditioning image
        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
        image = self.low_res_scheduler.add_noise(image, noise, noise_level)

        # Duplicate the conditioning image (and its noise level) for CFG and
        # for each requested image per prompt.
        batch_multiplier = 2 if do_classifier_free_guidance else 1
        image = torch.cat([image] * batch_multiplier * num_images_per_prompt)
        noise_level = torch.cat([noise_level] * image.shape[0])

        # 7. Prepare latent variables
        height, width = image.shape[2:]
        num_channels_latents = self.vae.config.latent_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 8. Check that sizes of image and latents match
        num_channels_image = image.shape[1]
        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
            raise ValueError(
                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
                f" `num_channels_image`: {num_channels_image} "
                f" = {num_channels_latents + num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 10. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

                # concat latents, mask, masked_image_latents in the channel dimension
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                latent_model_input = torch.cat([latent_model_input, image], dim=1)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    class_labels=noise_level,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        # 11. Decode latents (unless the caller asked for raw latents)
        if not output_type == "latent":
            # make sure the VAE is in float32 mode, as it overflows in float16
            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

            if needs_upcasting:
                self.upcast_vae()

            # Ensure latents are always the same type as the VAE
            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

            # cast back to fp16 if needed
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)

            image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        # Only denormalize images that were not flagged by the safety checker.
        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # 12. Apply watermark
        if output_type == "pil" and self.watermarker is not None:
            image = self.watermarker.apply_watermark(image)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
20
+ from transformers.models.clip.modeling_clip import CLIPTextModelOutput
21
+
22
+ from ...image_processor import VaeImageProcessor
23
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
24
+ from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
25
+ from ...models.embeddings import get_timestep_embedding
26
+ from ...models.lora import adjust_lora_scale_text_encoder
27
+ from ...schedulers import KarrasDiffusionSchedulers
28
+ from ...utils import (
29
+ USE_PEFT_BACKEND,
30
+ deprecate,
31
+ is_torch_xla_available,
32
+ logging,
33
+ replace_example_docstring,
34
+ scale_lora_layers,
35
+ unscale_lora_layers,
36
+ )
37
+ from ...utils.torch_utils import randn_tensor
38
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
39
+ from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
40
+
41
+
42
+ if is_torch_xla_available():
43
+ import torch_xla.core.xla_model as xm
44
+
45
+ XLA_AVAILABLE = True
46
+ else:
47
+ XLA_AVAILABLE = False
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+
52
+ EXAMPLE_DOC_STRING = """
53
+ Examples:
54
+ ```py
55
+ >>> import torch
56
+ >>> from diffusers import StableUnCLIPPipeline
57
+
58
+ >>> pipe = StableUnCLIPPipeline.from_pretrained(
59
+ ... "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
60
+ ... ) # TODO update model path
61
+ >>> pipe = pipe.to("cuda")
62
+
63
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
64
+ >>> images = pipe(prompt).images
65
+ >>> images[0].save("astronaut_horse.png")
66
+ ```
67
+ """
68
+
69
+
70
+ class StableUnCLIPPipeline(
71
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
72
+ ):
73
+ """
74
+ Pipeline for text-to-image generation using stable unCLIP.
75
+
76
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
77
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
78
+
79
+ The pipeline also inherits the following loading methods:
80
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
81
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
82
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
83
+
84
+ Args:
85
+ prior_tokenizer ([`CLIPTokenizer`]):
86
+ A [`CLIPTokenizer`].
87
+ prior_text_encoder ([`CLIPTextModelWithProjection`]):
88
+ Frozen [`CLIPTextModelWithProjection`] text-encoder.
89
+ prior ([`PriorTransformer`]):
90
+ The canonical unCLIP prior to approximate the image embedding from the text embedding.
91
+ prior_scheduler ([`KarrasDiffusionSchedulers`]):
92
+ Scheduler used in the prior denoising process.
93
+ image_normalizer ([`StableUnCLIPImageNormalizer`]):
94
+ Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image
95
+ embeddings after the noise has been applied.
96
+ image_noising_scheduler ([`KarrasDiffusionSchedulers`]):
97
+ Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined
98
+ by the `noise_level`.
99
+ tokenizer ([`CLIPTokenizer`]):
100
+ A [`CLIPTokenizer`].
101
+ text_encoder ([`CLIPTextModel`]):
102
+ Frozen [`CLIPTextModel`] text-encoder.
103
+ unet ([`UNet2DConditionModel`]):
104
+ A [`UNet2DConditionModel`] to denoise the encoded image latents.
105
+ scheduler ([`KarrasDiffusionSchedulers`]):
106
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
107
+ vae ([`AutoencoderKL`]):
108
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
109
+ """
110
+
111
+ _exclude_from_cpu_offload = ["prior", "image_normalizer"]
112
+ model_cpu_offload_seq = "text_encoder->prior_text_encoder->unet->vae"
113
+
114
+ # prior components
115
+ prior_tokenizer: CLIPTokenizer
116
+ prior_text_encoder: CLIPTextModelWithProjection
117
+ prior: PriorTransformer
118
+ prior_scheduler: KarrasDiffusionSchedulers
119
+
120
+ # image noising components
121
+ image_normalizer: StableUnCLIPImageNormalizer
122
+ image_noising_scheduler: KarrasDiffusionSchedulers
123
+
124
+ # regular denoising components
125
+ tokenizer: CLIPTokenizer
126
+ text_encoder: CLIPTextModel
127
+ unet: UNet2DConditionModel
128
+ scheduler: KarrasDiffusionSchedulers
129
+
130
+ vae: AutoencoderKL
131
+
132
    def __init__(
        self,
        # prior components
        prior_tokenizer: CLIPTokenizer,
        prior_text_encoder: CLIPTextModelWithProjection,
        prior: PriorTransformer,
        prior_scheduler: KarrasDiffusionSchedulers,
        # image noising components
        image_normalizer: StableUnCLIPImageNormalizer,
        image_noising_scheduler: KarrasDiffusionSchedulers,
        # regular denoising components
        tokenizer: CLIPTokenizer,
        text_encoder: CLIPTextModel,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        # vae
        vae: AutoencoderKL,
    ):
        """Register all sub-models on the pipeline and set up the VAE image processor."""
        super().__init__()

        # register_modules exposes each component as an attribute and includes it in
        # save/load and device-placement handling provided by DiffusionPipeline.
        self.register_modules(
            prior_tokenizer=prior_tokenizer,
            prior_text_encoder=prior_text_encoder,
            prior=prior,
            prior_scheduler=prior_scheduler,
            image_normalizer=image_normalizer,
            image_noising_scheduler=image_noising_scheduler,
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            unet=unet,
            scheduler=scheduler,
            vae=vae,
        )

        # Spatial downsampling factor of the VAE (2 per down block); falls back to 8
        # when no VAE is registered.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
168
+
169
    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder
    def _encode_prior_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
    ):
        """Encode `prompt` with the prior's text encoder for the unCLIP prior stage.

        Returns a `(prompt_embeds, text_enc_hid_states, text_mask)` triple; when
        classifier-free guidance is enabled each tensor has the unconditional
        embeddings concatenated in front of the conditional ones.
        Pre-computed encoder outputs can be supplied via `text_model_output`
        (with `text_attention_mask`), in which case tokenization is skipped.
        """
        if text_model_output is None:
            batch_size = len(prompt) if isinstance(prompt, list) else 1
            # get prompt text embeddings
            text_inputs = self.prior_tokenizer(
                prompt,
                padding="max_length",
                max_length=self.prior_tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            text_mask = text_inputs.attention_mask.bool().to(device)

            # Re-tokenize without truncation only to detect and warn about dropped tokens.
            untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.prior_tokenizer.batch_decode(
                    untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}"
                )
                text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length]

            prior_text_encoder_output = self.prior_text_encoder(text_input_ids.to(device))

            # Pooled/projected embedding feeds the prior; per-token hidden states condition it.
            prompt_embeds = prior_text_encoder_output.text_embeds
            text_enc_hid_states = prior_text_encoder_output.last_hidden_state

        else:
            batch_size = text_model_output[0].shape[0]
            prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
            text_mask = text_attention_mask

        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)

        if do_classifier_free_guidance:
            # The unconditional branch always uses the empty-string prompt.
            uncond_tokens = [""] * batch_size

            uncond_input = self.prior_tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.prior_tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
            negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(
                uncond_input.input_ids.to(device)
            )

            negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds
            uncond_text_enc_hid_states = negative_prompt_embeds_prior_text_encoder_output.last_hidden_state

            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method

            seq_len = negative_prompt_embeds.shape[1]
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)

            seq_len = uncond_text_enc_hid_states.shape[1]
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
                batch_size * num_images_per_prompt, seq_len, -1
            )
            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)

            # done duplicates

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])

            text_mask = torch.cat([uncond_text_mask, text_mask])

        return prompt_embeds, text_enc_hid_states, text_mask
262
+
263
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
264
+ def _encode_prompt(
265
+ self,
266
+ prompt,
267
+ device,
268
+ num_images_per_prompt,
269
+ do_classifier_free_guidance,
270
+ negative_prompt=None,
271
+ prompt_embeds: Optional[torch.Tensor] = None,
272
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
273
+ lora_scale: Optional[float] = None,
274
+ **kwargs,
275
+ ):
276
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
277
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
278
+
279
+ prompt_embeds_tuple = self.encode_prompt(
280
+ prompt=prompt,
281
+ device=device,
282
+ num_images_per_prompt=num_images_per_prompt,
283
+ do_classifier_free_guidance=do_classifier_free_guidance,
284
+ negative_prompt=negative_prompt,
285
+ prompt_embeds=prompt_embeds,
286
+ negative_prompt_embeds=negative_prompt_embeds,
287
+ lora_scale=lora_scale,
288
+ **kwargs,
289
+ )
290
+
291
+ # concatenate for backwards comp
292
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
293
+
294
+ return prompt_embeds
295
+
296
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.

        Returns:
            A `(prompt_embeds, negative_prompt_embeds)` tuple; `negative_prompt_embeds` is only
            populated when classifier-free guidance is enabled.
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            # Tokenize again without truncation only to warn about tokens that were dropped.
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # Access the `hidden_states` first, that contains a tuple of
                # all the hidden states from the encoder layers. Then index into
                # the tuple to access the hidden states from the desired layer.
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                # We also need to apply the final LayerNorm here to not mess with the
                # representations. The `last_hidden_states` that we typically use for
                # obtaining the final prompt representations passes through the LayerNorm
                # layer.
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds
478
+
479
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
480
+ def decode_latents(self, latents):
481
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
482
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
483
+
484
+ latents = 1 / self.vae.config.scaling_factor * latents
485
+ image = self.vae.decode(latents, return_dict=False)[0]
486
+ image = (image / 2 + 0.5).clamp(0, 1)
487
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
488
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
489
+ return image
490
+
491
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler
492
+ def prepare_prior_extra_step_kwargs(self, generator, eta):
493
+ # prepare extra kwargs for the prior_scheduler step, since not all prior_schedulers have the same signature
494
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other prior_schedulers.
495
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
496
+ # and should be between [0, 1]
497
+
498
+ accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
499
+ extra_step_kwargs = {}
500
+ if accepts_eta:
501
+ extra_step_kwargs["eta"] = eta
502
+
503
+ # check if the prior_scheduler accepts generator
504
+ accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
505
+ if accepts_generator:
506
+ extra_step_kwargs["generator"] = generator
507
+ return extra_step_kwargs
508
+
509
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
510
+ def prepare_extra_step_kwargs(self, generator, eta):
511
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
512
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
513
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
514
+ # and should be between [0, 1]
515
+
516
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
517
+ extra_step_kwargs = {}
518
+ if accepts_eta:
519
+ extra_step_kwargs["eta"] = eta
520
+
521
+ # check if the scheduler accepts generator
522
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
523
+ if accepts_generator:
524
+ extra_step_kwargs["generator"] = generator
525
+ return extra_step_kwargs
526
+
527
+ def check_inputs(
528
+ self,
529
+ prompt,
530
+ height,
531
+ width,
532
+ callback_steps,
533
+ noise_level,
534
+ negative_prompt=None,
535
+ prompt_embeds=None,
536
+ negative_prompt_embeds=None,
537
+ ):
538
+ if height % 8 != 0 or width % 8 != 0:
539
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
540
+
541
+ if (callback_steps is None) or (
542
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
543
+ ):
544
+ raise ValueError(
545
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
546
+ f" {type(callback_steps)}."
547
+ )
548
+
549
+ if prompt is not None and prompt_embeds is not None:
550
+ raise ValueError(
551
+ "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two."
552
+ )
553
+
554
+ if prompt is None and prompt_embeds is None:
555
+ raise ValueError(
556
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
557
+ )
558
+
559
+ if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
560
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
561
+
562
+ if negative_prompt is not None and negative_prompt_embeds is not None:
563
+ raise ValueError(
564
+ "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined."
565
+ )
566
+
567
+ if prompt is not None and negative_prompt is not None:
568
+ if type(prompt) is not type(negative_prompt):
569
+ raise TypeError(
570
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
571
+ f" {type(prompt)}."
572
+ )
573
+
574
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
575
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
576
+ raise ValueError(
577
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
578
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
579
+ f" {negative_prompt_embeds.shape}."
580
+ )
581
+
582
+ if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
583
+ raise ValueError(
584
+ f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
585
+ )
586
+
587
+ # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
588
+ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
589
+ if latents is None:
590
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
591
+ else:
592
+ if latents.shape != shape:
593
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
594
+ latents = latents.to(device)
595
+
596
+ latents = latents * scheduler.init_noise_sigma
597
+ return latents
598
+
599
    def noise_image_embeddings(
        self,
        image_embeds: torch.Tensor,
        noise_level: int,
        noise: Optional[torch.Tensor] = None,
        generator: Optional[torch.Generator] = None,
    ):
        """
        Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
        `noise_level` increases the variance in the final un-noised images.

        The noise is applied in two ways:
        1. A noise schedule is applied directly to the embeddings.
        2. A vector of sinusoidal time embeddings are appended to the output.

        In both cases, the amount of noise is controlled by the same `noise_level`.

        The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
        """
        if noise is None:
            noise = randn_tensor(
                image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype
            )

        # Broadcast the scalar noise_level into a per-sample timestep tensor.
        noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device)

        # Normalize, add scheduled noise at timestep `noise_level`, then un-normalize.
        self.image_normalizer.to(image_embeds.device)
        image_embeds = self.image_normalizer.scale(image_embeds)

        image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)

        image_embeds = self.image_normalizer.unscale(image_embeds)

        # Sinusoidal embedding of the noise level, sized to match the embedding dim.
        noise_level = get_timestep_embedding(
            timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0
        )

        # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors,
        # but we might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        noise_level = noise_level.to(image_embeds.dtype)

        # Output doubles the feature dimension: [noised embeds | noise-level embedding].
        image_embeds = torch.cat((image_embeds, noise_level), 1)

        return image_embeds
644
+
645
+ @torch.no_grad()
646
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
647
+ def __call__(
648
+ self,
649
+ # regular denoising process args
650
+ prompt: Optional[Union[str, List[str]]] = None,
651
+ height: Optional[int] = None,
652
+ width: Optional[int] = None,
653
+ num_inference_steps: int = 20,
654
+ guidance_scale: float = 10.0,
655
+ negative_prompt: Optional[Union[str, List[str]]] = None,
656
+ num_images_per_prompt: Optional[int] = 1,
657
+ eta: float = 0.0,
658
+ generator: Optional[torch.Generator] = None,
659
+ latents: Optional[torch.Tensor] = None,
660
+ prompt_embeds: Optional[torch.Tensor] = None,
661
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
662
+ output_type: Optional[str] = "pil",
663
+ return_dict: bool = True,
664
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
665
+ callback_steps: int = 1,
666
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
667
+ noise_level: int = 0,
668
+ # prior args
669
+ prior_num_inference_steps: int = 25,
670
+ prior_guidance_scale: float = 4.0,
671
+ prior_latents: Optional[torch.Tensor] = None,
672
+ clip_skip: Optional[int] = None,
673
+ ):
674
+ """
675
+ The call function to the pipeline for generation.
676
+
677
+ Args:
678
+ prompt (`str` or `List[str]`, *optional*):
679
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
680
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
681
+ The height in pixels of the generated image.
682
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
683
+ The width in pixels of the generated image.
684
+ num_inference_steps (`int`, *optional*, defaults to 20):
685
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
686
+ expense of slower inference.
687
+ guidance_scale (`float`, *optional*, defaults to 10.0):
688
+ A higher guidance scale value encourages the model to generate images closely linked to the text
689
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
690
+ negative_prompt (`str` or `List[str]`, *optional*):
691
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
692
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
693
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
694
+ The number of images to generate per prompt.
695
+ eta (`float`, *optional*, defaults to 0.0):
696
+ Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
697
+ applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
698
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
699
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
700
+ generation deterministic.
701
+ latents (`torch.Tensor`, *optional*):
702
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
703
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
704
+ tensor is generated by sampling using the supplied random `generator`.
705
+ prompt_embeds (`torch.Tensor`, *optional*):
706
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
707
+ provided, text embeddings are generated from the `prompt` input argument.
708
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
709
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
710
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
711
+ output_type (`str`, *optional*, defaults to `"pil"`):
712
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
713
+ return_dict (`bool`, *optional*, defaults to `True`):
714
+ Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
715
+ callback (`Callable`, *optional*):
716
+ A function that calls every `callback_steps` steps during inference. The function is called with the
717
+ following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
718
+ callback_steps (`int`, *optional*, defaults to 1):
719
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
720
+ every step.
721
+ cross_attention_kwargs (`dict`, *optional*):
722
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
723
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
724
+ noise_level (`int`, *optional*, defaults to `0`):
725
+ The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in
726
+ the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details.
727
+ prior_num_inference_steps (`int`, *optional*, defaults to 25):
728
+ The number of denoising steps in the prior denoising process. More denoising steps usually lead to a
729
+ higher quality image at the expense of slower inference.
730
+ prior_guidance_scale (`float`, *optional*, defaults to 4.0):
731
+ A higher guidance scale value encourages the model to generate images closely linked to the text
732
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
733
+ prior_latents (`torch.Tensor`, *optional*):
734
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
735
+ embedding generation in the prior denoising process. Can be used to tweak the same generation with
736
+ different prompts. If not provided, a latents tensor is generated by sampling using the supplied random
737
+ `generator`.
738
+ clip_skip (`int`, *optional*):
739
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
740
+ the output of the pre-final layer will be used for computing the prompt embeddings.
741
+ Examples:
742
+
743
+ Returns:
744
+ [`~pipelines.ImagePipelineOutput`] or `tuple`:
745
+ [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning
746
+ a tuple, the first element is a list with the generated images.
747
+ """
748
+ # 0. Default height and width to unet
749
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
750
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
751
+
752
+ # 1. Check inputs. Raise error if not correct
753
+ self.check_inputs(
754
+ prompt=prompt,
755
+ height=height,
756
+ width=width,
757
+ callback_steps=callback_steps,
758
+ noise_level=noise_level,
759
+ negative_prompt=negative_prompt,
760
+ prompt_embeds=prompt_embeds,
761
+ negative_prompt_embeds=negative_prompt_embeds,
762
+ )
763
+
764
+ # 2. Define call parameters
765
+ if prompt is not None and isinstance(prompt, str):
766
+ batch_size = 1
767
+ elif prompt is not None and isinstance(prompt, list):
768
+ batch_size = len(prompt)
769
+ else:
770
+ batch_size = prompt_embeds.shape[0]
771
+
772
+ batch_size = batch_size * num_images_per_prompt
773
+
774
+ device = self._execution_device
775
+
776
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
777
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
778
+ # corresponds to doing no classifier free guidance.
779
+ prior_do_classifier_free_guidance = prior_guidance_scale > 1.0
780
+
781
+ # 3. Encode input prompt
782
+ prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask = self._encode_prior_prompt(
783
+ prompt=prompt,
784
+ device=device,
785
+ num_images_per_prompt=num_images_per_prompt,
786
+ do_classifier_free_guidance=prior_do_classifier_free_guidance,
787
+ )
788
+
789
+ # 4. Prepare prior timesteps
790
+ self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
791
+ prior_timesteps_tensor = self.prior_scheduler.timesteps
792
+
793
+ # 5. Prepare prior latent variables
794
+ embedding_dim = self.prior.config.embedding_dim
795
+ prior_latents = self.prepare_latents(
796
+ (batch_size, embedding_dim),
797
+ prior_prompt_embeds.dtype,
798
+ device,
799
+ generator,
800
+ prior_latents,
801
+ self.prior_scheduler,
802
+ )
803
+
804
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
805
+ prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta)
806
+
807
+ # 7. Prior denoising loop
808
+ for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
809
+ # expand the latents if we are doing classifier free guidance
810
+ latent_model_input = torch.cat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents
811
+ latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t)
812
+
813
+ predicted_image_embedding = self.prior(
814
+ latent_model_input,
815
+ timestep=t,
816
+ proj_embedding=prior_prompt_embeds,
817
+ encoder_hidden_states=prior_text_encoder_hidden_states,
818
+ attention_mask=prior_text_mask,
819
+ ).predicted_image_embedding
820
+
821
+ if prior_do_classifier_free_guidance:
822
+ predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
823
+ predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
824
+ predicted_image_embedding_text - predicted_image_embedding_uncond
825
+ )
826
+
827
+ prior_latents = self.prior_scheduler.step(
828
+ predicted_image_embedding,
829
+ timestep=t,
830
+ sample=prior_latents,
831
+ **prior_extra_step_kwargs,
832
+ return_dict=False,
833
+ )[0]
834
+
835
+ if callback is not None and i % callback_steps == 0:
836
+ callback(i, t, prior_latents)
837
+
838
+ prior_latents = self.prior.post_process_latents(prior_latents)
839
+
840
+ image_embeds = prior_latents
841
+
842
+ # done prior
843
+
844
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
845
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
846
+ # corresponds to doing no classifier free guidance.
847
+ do_classifier_free_guidance = guidance_scale > 1.0
848
+
849
+ # 8. Encode input prompt
850
+ text_encoder_lora_scale = (
851
+ cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
852
+ )
853
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
854
+ prompt=prompt,
855
+ device=device,
856
+ num_images_per_prompt=num_images_per_prompt,
857
+ do_classifier_free_guidance=do_classifier_free_guidance,
858
+ negative_prompt=negative_prompt,
859
+ prompt_embeds=prompt_embeds,
860
+ negative_prompt_embeds=negative_prompt_embeds,
861
+ lora_scale=text_encoder_lora_scale,
862
+ clip_skip=clip_skip,
863
+ )
864
+ # For classifier free guidance, we need to do two forward passes.
865
+ # Here we concatenate the unconditional and text embeddings into a single batch
866
+ # to avoid doing two forward passes
867
+ if do_classifier_free_guidance:
868
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
869
+
870
+ # 9. Prepare image embeddings
871
+ image_embeds = self.noise_image_embeddings(
872
+ image_embeds=image_embeds,
873
+ noise_level=noise_level,
874
+ generator=generator,
875
+ )
876
+
877
+ if do_classifier_free_guidance:
878
+ negative_prompt_embeds = torch.zeros_like(image_embeds)
879
+
880
+ # For classifier free guidance, we need to do two forward passes.
881
+ # Here we concatenate the unconditional and text embeddings into a single batch
882
+ # to avoid doing two forward passes
883
+ image_embeds = torch.cat([negative_prompt_embeds, image_embeds])
884
+
885
+ # 10. Prepare timesteps
886
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
887
+ timesteps = self.scheduler.timesteps
888
+
889
+ # 11. Prepare latent variables
890
+ num_channels_latents = self.unet.config.in_channels
891
+ shape = (
892
+ batch_size,
893
+ num_channels_latents,
894
+ int(height) // self.vae_scale_factor,
895
+ int(width) // self.vae_scale_factor,
896
+ )
897
+ latents = self.prepare_latents(
898
+ shape=shape,
899
+ dtype=prompt_embeds.dtype,
900
+ device=device,
901
+ generator=generator,
902
+ latents=latents,
903
+ scheduler=self.scheduler,
904
+ )
905
+
906
+ # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
907
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
908
+
909
+ # 13. Denoising loop
910
+ for i, t in enumerate(self.progress_bar(timesteps)):
911
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
912
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
913
+
914
+ # predict the noise residual
915
+ noise_pred = self.unet(
916
+ latent_model_input,
917
+ t,
918
+ encoder_hidden_states=prompt_embeds,
919
+ class_labels=image_embeds,
920
+ cross_attention_kwargs=cross_attention_kwargs,
921
+ return_dict=False,
922
+ )[0]
923
+
924
+ # perform guidance
925
+ if do_classifier_free_guidance:
926
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
927
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
928
+
929
+ # compute the previous noisy sample x_t -> x_t-1
930
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
931
+
932
+ if callback is not None and i % callback_steps == 0:
933
+ step_idx = i // getattr(self.scheduler, "order", 1)
934
+ callback(step_idx, t, latents)
935
+
936
+ if XLA_AVAILABLE:
937
+ xm.mark_step()
938
+
939
+ if not output_type == "latent":
940
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
941
+ else:
942
+ image = latents
943
+
944
+ image = self.image_processor.postprocess(image, output_type=output_type)
945
+
946
+ # Offload all models
947
+ self.maybe_free_model_hooks()
948
+
949
+ if not return_dict:
950
+ return (image,)
951
+
952
+ return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import PIL.Image
19
+ import torch
20
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
21
+
22
+ from ...image_processor import VaeImageProcessor
23
+ from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
24
+ from ...models import AutoencoderKL, UNet2DConditionModel
25
+ from ...models.embeddings import get_timestep_embedding
26
+ from ...models.lora import adjust_lora_scale_text_encoder
27
+ from ...schedulers import KarrasDiffusionSchedulers
28
+ from ...utils import (
29
+ USE_PEFT_BACKEND,
30
+ deprecate,
31
+ is_torch_xla_available,
32
+ logging,
33
+ replace_example_docstring,
34
+ scale_lora_layers,
35
+ unscale_lora_layers,
36
+ )
37
+ from ...utils.torch_utils import randn_tensor
38
+ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
39
+ from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
40
+
41
+
42
+ if is_torch_xla_available():
43
+ import torch_xla.core.xla_model as xm
44
+
45
+ XLA_AVAILABLE = True
46
+ else:
47
+ XLA_AVAILABLE = False
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+
52
+ EXAMPLE_DOC_STRING = """
53
+ Examples:
54
+ ```py
55
+ >>> import requests
56
+ >>> import torch
57
+ >>> from PIL import Image
58
+ >>> from io import BytesIO
59
+
60
+ >>> from diffusers import StableUnCLIPImg2ImgPipeline
61
+
62
+ >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
63
+ ... "stabilityai/stable-diffusion-2-1-unclip-small", torch_dtype=torch.float16
64
+ ... )
65
+ >>> pipe = pipe.to("cuda")
66
+
67
+ >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
68
+
69
+ >>> response = requests.get(url)
70
+ >>> init_image = Image.open(BytesIO(response.content)).convert("RGB")
71
+ >>> init_image = init_image.resize((768, 512))
72
+
73
+ >>> prompt = "A fantasy landscape, trending on artstation"
74
+
75
+ >>> images = pipe(init_image, prompt).images
76
+ >>> images[0].save("fantasy_landscape.png")
77
+ ```
78
+ """
79
+
80
+
81
+ class StableUnCLIPImg2ImgPipeline(
82
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
83
+ ):
84
+ """
85
+ Pipeline for text-guided image-to-image generation using stable unCLIP.
86
+
87
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
88
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
89
+
90
+ The pipeline also inherits the following loading methods:
91
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
92
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
93
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
94
+
95
+ Args:
96
+ feature_extractor ([`CLIPImageProcessor`]):
97
+ Feature extractor for image pre-processing before being encoded.
98
+ image_encoder ([`CLIPVisionModelWithProjection`]):
99
+ CLIP vision model for encoding images.
100
+ image_normalizer ([`StableUnCLIPImageNormalizer`]):
101
+ Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image
102
+ embeddings after the noise has been applied.
103
+ image_noising_scheduler ([`KarrasDiffusionSchedulers`]):
104
+ Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined
105
+ by the `noise_level`.
106
+ tokenizer (`~transformers.CLIPTokenizer`):
107
+ A [`~transformers.CLIPTokenizer`)].
108
+ text_encoder ([`~transformers.CLIPTextModel`]):
109
+ Frozen [`~transformers.CLIPTextModel`] text-encoder.
110
+ unet ([`UNet2DConditionModel`]):
111
+ A [`UNet2DConditionModel`] to denoise the encoded image latents.
112
+ scheduler ([`KarrasDiffusionSchedulers`]):
113
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
114
+ vae ([`AutoencoderKL`]):
115
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
116
+ """
117
+
118
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
119
+ _exclude_from_cpu_offload = ["image_normalizer"]
120
+
121
+ # image encoding components
122
+ feature_extractor: CLIPImageProcessor
123
+ image_encoder: CLIPVisionModelWithProjection
124
+
125
+ # image noising components
126
+ image_normalizer: StableUnCLIPImageNormalizer
127
+ image_noising_scheduler: KarrasDiffusionSchedulers
128
+
129
+ # regular denoising components
130
+ tokenizer: CLIPTokenizer
131
+ text_encoder: CLIPTextModel
132
+ unet: UNet2DConditionModel
133
+ scheduler: KarrasDiffusionSchedulers
134
+
135
+ vae: AutoencoderKL
136
+
137
+ def __init__(
138
+ self,
139
+ # image encoding components
140
+ feature_extractor: CLIPImageProcessor,
141
+ image_encoder: CLIPVisionModelWithProjection,
142
+ # image noising components
143
+ image_normalizer: StableUnCLIPImageNormalizer,
144
+ image_noising_scheduler: KarrasDiffusionSchedulers,
145
+ # regular denoising components
146
+ tokenizer: CLIPTokenizer,
147
+ text_encoder: CLIPTextModel,
148
+ unet: UNet2DConditionModel,
149
+ scheduler: KarrasDiffusionSchedulers,
150
+ # vae
151
+ vae: AutoencoderKL,
152
+ ):
153
+ super().__init__()
154
+
155
+ self.register_modules(
156
+ feature_extractor=feature_extractor,
157
+ image_encoder=image_encoder,
158
+ image_normalizer=image_normalizer,
159
+ image_noising_scheduler=image_noising_scheduler,
160
+ tokenizer=tokenizer,
161
+ text_encoder=text_encoder,
162
+ unet=unet,
163
+ scheduler=scheduler,
164
+ vae=vae,
165
+ )
166
+
167
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
168
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
169
+
170
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
171
+ def _encode_prompt(
172
+ self,
173
+ prompt,
174
+ device,
175
+ num_images_per_prompt,
176
+ do_classifier_free_guidance,
177
+ negative_prompt=None,
178
+ prompt_embeds: Optional[torch.Tensor] = None,
179
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
180
+ lora_scale: Optional[float] = None,
181
+ **kwargs,
182
+ ):
183
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
184
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
185
+
186
+ prompt_embeds_tuple = self.encode_prompt(
187
+ prompt=prompt,
188
+ device=device,
189
+ num_images_per_prompt=num_images_per_prompt,
190
+ do_classifier_free_guidance=do_classifier_free_guidance,
191
+ negative_prompt=negative_prompt,
192
+ prompt_embeds=prompt_embeds,
193
+ negative_prompt_embeds=negative_prompt_embeds,
194
+ lora_scale=lora_scale,
195
+ **kwargs,
196
+ )
197
+
198
+ # concatenate for backwards comp
199
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
200
+
201
+ return prompt_embeds
202
+
203
+ def _encode_image(
204
+ self,
205
+ image,
206
+ device,
207
+ batch_size,
208
+ num_images_per_prompt,
209
+ do_classifier_free_guidance,
210
+ noise_level,
211
+ generator,
212
+ image_embeds,
213
+ ):
214
+ dtype = next(self.image_encoder.parameters()).dtype
215
+
216
+ if isinstance(image, PIL.Image.Image):
217
+ # the image embedding should repeated so it matches the total batch size of the prompt
218
+ repeat_by = batch_size
219
+ else:
220
+ # assume the image input is already properly batched and just needs to be repeated so
221
+ # it matches the num_images_per_prompt.
222
+ #
223
+ # NOTE(will) this is probably missing a few number of side cases. I.e. batched/non-batched
224
+ # `image_embeds`. If those happen to be common use cases, let's think harder about
225
+ # what the expected dimensions of inputs should be and how we handle the encoding.
226
+ repeat_by = num_images_per_prompt
227
+
228
+ if image_embeds is None:
229
+ if not isinstance(image, torch.Tensor):
230
+ image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
231
+
232
+ image = image.to(device=device, dtype=dtype)
233
+ image_embeds = self.image_encoder(image).image_embeds
234
+
235
+ image_embeds = self.noise_image_embeddings(
236
+ image_embeds=image_embeds,
237
+ noise_level=noise_level,
238
+ generator=generator,
239
+ )
240
+
241
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
242
+ image_embeds = image_embeds.unsqueeze(1)
243
+ bs_embed, seq_len, _ = image_embeds.shape
244
+ image_embeds = image_embeds.repeat(1, repeat_by, 1)
245
+ image_embeds = image_embeds.view(bs_embed * repeat_by, seq_len, -1)
246
+ image_embeds = image_embeds.squeeze(1)
247
+
248
+ if do_classifier_free_guidance:
249
+ negative_prompt_embeds = torch.zeros_like(image_embeds)
250
+
251
+ # For classifier free guidance, we need to do two forward passes.
252
+ # Here we concatenate the unconditional and text embeddings into a single batch
253
+ # to avoid doing two forward passes
254
+ image_embeds = torch.cat([negative_prompt_embeds, image_embeds])
255
+
256
+ return image_embeds
257
+
258
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
259
+ def encode_prompt(
260
+ self,
261
+ prompt,
262
+ device,
263
+ num_images_per_prompt,
264
+ do_classifier_free_guidance,
265
+ negative_prompt=None,
266
+ prompt_embeds: Optional[torch.Tensor] = None,
267
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
268
+ lora_scale: Optional[float] = None,
269
+ clip_skip: Optional[int] = None,
270
+ ):
271
+ r"""
272
+ Encodes the prompt into text encoder hidden states.
273
+
274
+ Args:
275
+ prompt (`str` or `List[str]`, *optional*):
276
+ prompt to be encoded
277
+ device: (`torch.device`):
278
+ torch device
279
+ num_images_per_prompt (`int`):
280
+ number of images that should be generated per prompt
281
+ do_classifier_free_guidance (`bool`):
282
+ whether to use classifier free guidance or not
283
+ negative_prompt (`str` or `List[str]`, *optional*):
284
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
285
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
286
+ less than `1`).
287
+ prompt_embeds (`torch.Tensor`, *optional*):
288
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
289
+ provided, text embeddings will be generated from `prompt` input argument.
290
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
291
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
292
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
293
+ argument.
294
+ lora_scale (`float`, *optional*):
295
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
296
+ clip_skip (`int`, *optional*):
297
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
298
+ the output of the pre-final layer will be used for computing the prompt embeddings.
299
+ """
300
+ # set lora scale so that monkey patched LoRA
301
+ # function of text encoder can correctly access it
302
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
303
+ self._lora_scale = lora_scale
304
+
305
+ # dynamically adjust the LoRA scale
306
+ if not USE_PEFT_BACKEND:
307
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
308
+ else:
309
+ scale_lora_layers(self.text_encoder, lora_scale)
310
+
311
+ if prompt is not None and isinstance(prompt, str):
312
+ batch_size = 1
313
+ elif prompt is not None and isinstance(prompt, list):
314
+ batch_size = len(prompt)
315
+ else:
316
+ batch_size = prompt_embeds.shape[0]
317
+
318
+ if prompt_embeds is None:
319
+ # textual inversion: process multi-vector tokens if necessary
320
+ if isinstance(self, TextualInversionLoaderMixin):
321
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
322
+
323
+ text_inputs = self.tokenizer(
324
+ prompt,
325
+ padding="max_length",
326
+ max_length=self.tokenizer.model_max_length,
327
+ truncation=True,
328
+ return_tensors="pt",
329
+ )
330
+ text_input_ids = text_inputs.input_ids
331
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
332
+
333
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
334
+ text_input_ids, untruncated_ids
335
+ ):
336
+ removed_text = self.tokenizer.batch_decode(
337
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
338
+ )
339
+ logger.warning(
340
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
341
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
342
+ )
343
+
344
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
345
+ attention_mask = text_inputs.attention_mask.to(device)
346
+ else:
347
+ attention_mask = None
348
+
349
+ if clip_skip is None:
350
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
351
+ prompt_embeds = prompt_embeds[0]
352
+ else:
353
+ prompt_embeds = self.text_encoder(
354
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
355
+ )
356
+ # Access the `hidden_states` first, that contains a tuple of
357
+ # all the hidden states from the encoder layers. Then index into
358
+ # the tuple to access the hidden states from the desired layer.
359
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
360
+ # We also need to apply the final LayerNorm here to not mess with the
361
+ # representations. The `last_hidden_states` that we typically use for
362
+ # obtaining the final prompt representations passes through the LayerNorm
363
+ # layer.
364
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
365
+
366
+ if self.text_encoder is not None:
367
+ prompt_embeds_dtype = self.text_encoder.dtype
368
+ elif self.unet is not None:
369
+ prompt_embeds_dtype = self.unet.dtype
370
+ else:
371
+ prompt_embeds_dtype = prompt_embeds.dtype
372
+
373
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
374
+
375
+ bs_embed, seq_len, _ = prompt_embeds.shape
376
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
377
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
378
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
379
+
380
+ # get unconditional embeddings for classifier free guidance
381
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
382
+ uncond_tokens: List[str]
383
+ if negative_prompt is None:
384
+ uncond_tokens = [""] * batch_size
385
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
386
+ raise TypeError(
387
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
388
+ f" {type(prompt)}."
389
+ )
390
+ elif isinstance(negative_prompt, str):
391
+ uncond_tokens = [negative_prompt]
392
+ elif batch_size != len(negative_prompt):
393
+ raise ValueError(
394
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
395
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
396
+ " the batch size of `prompt`."
397
+ )
398
+ else:
399
+ uncond_tokens = negative_prompt
400
+
401
+ # textual inversion: process multi-vector tokens if necessary
402
+ if isinstance(self, TextualInversionLoaderMixin):
403
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
404
+
405
+ max_length = prompt_embeds.shape[1]
406
+ uncond_input = self.tokenizer(
407
+ uncond_tokens,
408
+ padding="max_length",
409
+ max_length=max_length,
410
+ truncation=True,
411
+ return_tensors="pt",
412
+ )
413
+
414
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
415
+ attention_mask = uncond_input.attention_mask.to(device)
416
+ else:
417
+ attention_mask = None
418
+
419
+ negative_prompt_embeds = self.text_encoder(
420
+ uncond_input.input_ids.to(device),
421
+ attention_mask=attention_mask,
422
+ )
423
+ negative_prompt_embeds = negative_prompt_embeds[0]
424
+
425
+ if do_classifier_free_guidance:
426
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
427
+ seq_len = negative_prompt_embeds.shape[1]
428
+
429
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
430
+
431
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
432
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
433
+
434
+ if self.text_encoder is not None:
435
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
436
+ # Retrieve the original scale by scaling back the LoRA layers
437
+ unscale_lora_layers(self.text_encoder, lora_scale)
438
+
439
+ return prompt_embeds, negative_prompt_embeds
440
+
441
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
def decode_latents(self, latents):
    """Decode VAE latents into a float32 NHWC numpy image batch (deprecated path)."""
    msg = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
    deprecate("decode_latents", "1.0.0", msg, standard_warn=False)

    # Undo the scaling applied when the latents were produced by the VAE encoder.
    scaled = 1 / self.vae.config.scaling_factor * latents
    decoded = self.vae.decode(scaled, return_dict=False)[0]
    # Map from [-1, 1] to [0, 1].
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # Always cast to float32: negligible overhead and compatible with bfloat16.
    return decoded.cpu().permute(0, 2, 3, 1).float().numpy()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """Build the extra kwargs dict for `scheduler.step`.

    Not every scheduler shares the same `step` signature, so each optional
    argument is forwarded only if the scheduler accepts it:
    - `eta` (η in the DDIM paper, https://huggingface.co/papers/2010.02502)
      is only meaningful for DDIMScheduler and should lie in [0, 1].
    - `generator` is forwarded for schedulers that support seeded sampling.
    """
    step_params = inspect.signature(self.scheduler.step).parameters

    extra = {}
    if "eta" in step_params:
        extra["eta"] = eta
    if "generator" in step_params:
        extra["generator"] = generator
    return extra
def check_inputs(
    self,
    prompt,
    image,
    height,
    width,
    callback_steps,
    noise_level,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    image_embeds=None,
):
    """Validate the combination of `__call__` arguments.

    Raises `ValueError` (or `TypeError` for mismatched prompt/negative_prompt
    types) on misuse. Checks, in order: height/width divisibility,
    `callback_steps` positivity, prompt vs. prompt_embeds exclusivity, prompt
    typing, negative prompt consistency, embedding shape agreement,
    `noise_level` range, and image vs. image_embeds exclusivity/typing.
    """
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # None, a non-int, and a non-positive int are all rejected here.
    if not isinstance(callback_steps, int) or callback_steps <= 0:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two."
        )

    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )

    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined."
        )

    if prompt is not None and negative_prompt is not None and type(prompt) is not type(negative_prompt):
        raise TypeError(
            f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
            f" {type(prompt)}."
        )

    if (
        prompt_embeds is not None
        and negative_prompt_embeds is not None
        and prompt_embeds.shape != negative_prompt_embeds.shape
    ):
        raise ValueError(
            "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
            f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
            f" {negative_prompt_embeds.shape}."
        )

    if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
        raise ValueError(
            f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive."
        )

    if image is not None and image_embeds is not None:
        raise ValueError(
            "Provide either `image` or `image_embeds`. Please make sure to define only one of the two."
        )

    if image is None and image_embeds is None:
        raise ValueError(
            "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined."
        )

    # Keep the short-circuiting `or` chain: for tensor inputs, `PIL` is never touched.
    if image is not None and not (
        isinstance(image, torch.Tensor) or isinstance(image, PIL.Image.Image) or isinstance(image, list)
    ):
        raise ValueError(
            "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
            f" {type(image)}"
        )
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """Create (or move) initial latents and scale by the scheduler's init noise sigma.

    `height`/`width` are pixel sizes; the latent grid is downscaled by
    `self.vae_scale_factor`. When `latents` is given it is only moved to
    `device`; otherwise fresh Gaussian noise is sampled with `generator`.
    """
    latent_height = int(height) // self.vae_scale_factor
    latent_width = int(width) // self.vae_scale_factor
    shape = (batch_size, num_channels_latents, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        latents = latents.to(device)
    else:
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings
def noise_image_embeddings(
    self,
    image_embeds: torch.Tensor,
    noise_level: int,
    noise: Optional[torch.Tensor] = None,
    generator: Optional[torch.Generator] = None,
):
    """
    Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
    `noise_level` increases the variance in the final un-noised images.

    The noise is applied in two ways:
    1. A noise schedule is applied directly to the embeddings.
    2. A vector of sinusoidal time embeddings are appended to the output.

    In both cases, the amount of noise is controlled by the same `noise_level`.

    The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
    """
    if noise is None:
        noise = randn_tensor(
            image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype
        )

    # One timestep per batch element, all set to the requested noise level.
    timesteps = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device)

    # Normalize, noise in the normalized space, then un-normalize.
    self.image_normalizer.to(image_embeds.device)
    image_embeds = self.image_normalizer.scale(image_embeds)
    image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=timesteps, noise=noise)
    image_embeds = self.image_normalizer.unscale(image_embeds)

    # Sinusoidal embedding of the noise level, appended to the output.
    time_embedding = get_timestep_embedding(
        timesteps=timesteps, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0
    )

    # `get_timestep_embedding` does not contain any weights and will always return f32 tensors,
    # but we might actually be running in fp16, so we need to cast here.
    time_embedding = time_embedding.to(image_embeds.dtype)

    return torch.cat((image_embeds, time_embedding), 1)
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    image: Union[torch.Tensor, PIL.Image.Image] = None,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 20,
    guidance_scale: float = 10,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[torch.Generator] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    noise_level: int = 0,
    image_embeds: Optional[torch.Tensor] = None,
    clip_skip: Optional[int] = None,
):
    r"""
    The call function to the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be
            used or prompt is initialized to `""`.
        image (`torch.Tensor` or `PIL.Image.Image`):
            `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the
            `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the
            denoising process like it is in the standard Stable Diffusion text-guided image variation process.
        height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 20):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        guidance_scale (`float`, *optional*, defaults to 10.0):
            A higher guidance scale value encourages the model to generate images closely linked to the text
            `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide what to not include in image generation. If not defined, you need to
            pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
            applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
            generation deterministic.
        latents (`torch.Tensor`, *optional*):
            Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor is generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
            provided, text embeddings are generated from the `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
            not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generated image. Choose between `PIL.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        callback (`Callable`, *optional*):
            A function that calls every `callback_steps` steps during inference. The function is called with the
            following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function is called. If not specified, the callback is called at
            every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
            [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        noise_level (`int`, *optional*, defaults to `0`):
            The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in
            the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details.
        image_embeds (`torch.Tensor`, *optional*):
            Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising
            process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.

    Examples:

    Returns:
        [`~pipelines.ImagePipelineOutput`] or `tuple`:
            [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning
            a tuple, the first element is a list with the generated images.
    """
    # NOTE(review): `clip_skip` is accepted but not forwarded to `encode_prompt`
    # below — confirm against the upstream pipeline whether it should be passed on.

    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # An empty prompt per image is a valid input for this image-variation pipeline.
    if prompt is None and prompt_embeds is None:
        prompt = len(image) * [""] if isinstance(image, list) else ""

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt=prompt,
        image=image,
        height=height,
        width=width,
        callback_steps=callback_steps,
        noise_level=noise_level,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        image_embeds=image_embeds,
    )

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # Effective batch size: one latent per (prompt, image-per-prompt) pair.
    batch_size = batch_size * num_images_per_prompt

    device = self._execution_device

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_encoder_lora_scale = (
        cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
    )
    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
        prompt=prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        lora_scale=text_encoder_lora_scale,
    )
    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    if do_classifier_free_guidance:
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

    # 4. Encoder input image
    noise_level = torch.tensor([noise_level], device=device)
    image_embeds = self._encode_image(
        image=image,
        device=device,
        batch_size=batch_size,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=do_classifier_free_guidance,
        noise_level=noise_level,
        generator=generator,
        image_embeds=image_embeds,
    )

    # 5. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 6. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    if latents is None:
        latents = self.prepare_latents(
            batch_size=batch_size,
            num_channels_latents=num_channels_latents,
            height=height,
            width=width,
            dtype=prompt_embeds.dtype,
            device=device,
            generator=generator,
            latents=latents,
        )

    # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 8. Denoising loop
    for i, t in enumerate(self.progress_bar(timesteps)):
        # Duplicate latents so the uncond/cond branches run in one forward pass.
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

        # predict the noise residual (image embeddings are passed as class labels)
        noise_pred = self.unet(
            latent_model_input,
            t,
            encoder_hidden_states=prompt_embeds,
            class_labels=image_embeds,
            cross_attention_kwargs=cross_attention_kwargs,
            return_dict=False,
        )[0]

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

        if callback is not None and i % callback_steps == 0:
            step_idx = i // getattr(self.scheduler, "order", 1)
            callback(step_idx, t, latents)

        if XLA_AVAILABLE:
            xm.mark_step()

    # 9. Post-processing
    if not output_type == "latent":
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
    else:
        image = latents

    image = self.image_processor.postprocess(image, output_type=output_type)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return ImagePipelineOutput(images=image)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/safety_checker.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
19
+
20
+ from ...utils import logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
def cosine_distance(image_embeds, text_embeds):
    """Return the pairwise cosine-similarity matrix between two embedding batches."""
    img = nn.functional.normalize(image_embeds)
    txt = nn.functional.normalize(text_embeds)
    return torch.mm(img, txt.t())
class StableDiffusionSafetyChecker(PreTrainedModel):
    """CLIP-based checker that flags and blacks out images matching NSFW concepts.

    CLIP image embeddings are compared (via cosine similarity) against fixed
    per-concept embeddings with per-concept thresholds; the actual embedding and
    threshold values are loaded from the checkpoint, not trained here.
    """

    config_class = CLIPConfig
    main_input_name = "clip_input"

    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        self.vision_model = CLIPVisionModel(config.vision_config)
        self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)

        # Frozen embeddings for 17 NSFW concepts and 3 "special care" concepts,
        # plus per-concept score thresholds (values come from the checkpoint).
        self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False)
        self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False)

        self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
        self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)

    @torch.no_grad()
    def forward(self, clip_input, images):
        """Black out images whose embedding exceeds any concept threshold.

        Returns `(images, has_nsfw_concepts)` where flagged images are replaced
        with all-zero ("black") images of the same shape.
        """
        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
        image_embeds = self.visual_projection(pooled_output)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
        cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()

        result = []
        batch_size = image_embeds.shape[0]
        for i in range(batch_size):
            result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}

            # increase this value to create a stronger `nsfw` filter
            # at the cost of increasing the possibility of filtering benign images
            adjustment = 0.0

            for concept_idx in range(len(special_cos_dist[0])):
                concept_cos = special_cos_dist[i][concept_idx]
                concept_threshold = self.special_care_embeds_weights[concept_idx].item()
                result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
                if result_img["special_scores"][concept_idx] > 0:
                    # Record (index, score) as a tuple — the previous set literal
                    # `{idx, score}` collapsed the pair when idx == score and had
                    # no defined ordering.
                    result_img["special_care"].append((concept_idx, result_img["special_scores"][concept_idx]))
                    # Any "special care" hit lowers the effective threshold for
                    # all subsequent concept checks on this image.
                    adjustment = 0.01

            for concept_idx in range(len(cos_dist[0])):
                concept_cos = cos_dist[i][concept_idx]
                concept_threshold = self.concept_embeds_weights[concept_idx].item()
                result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
                if result_img["concept_scores"][concept_idx] > 0:
                    result_img["bad_concepts"].append(concept_idx)

            result.append(result_img)

        has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]

        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
            if has_nsfw_concept:
                if torch.is_tensor(images) or torch.is_tensor(images[0]):
                    images[idx] = torch.zeros_like(images[idx])  # black image
                else:
                    images[idx] = np.zeros(images[idx].shape)  # black image

        if any(has_nsfw_concepts):
            logger.warning(
                "Potential NSFW content was detected in one or more images. A black image will be returned instead."
                " Try again with a different prompt and/or seed."
            )

        return images, has_nsfw_concepts

    @torch.no_grad()
    def forward_onnx(self, clip_input: torch.Tensor, images: torch.Tensor):
        """Vectorized, branch-free variant of `forward` suitable for ONNX export."""
        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
        image_embeds = self.visual_projection(pooled_output)

        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
        cos_dist = cosine_distance(image_embeds, self.concept_embeds)

        # increase this value to create a stronger `nsfw` filter
        # at the cost of increasing the possibility of filtering benign images
        adjustment = 0.0

        special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
        # special_scores = special_scores.round(decimals=3)
        special_care = torch.any(special_scores > 0, dim=1)
        # "special care" images use a lowered effective threshold for every concept
        special_adjustment = special_care * 0.01
        special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1])

        concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
        # concept_scores = concept_scores.round(decimals=3)
        has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)

        images[has_nsfw_concepts] = 0.0  # black image

        return images, has_nsfw_concepts
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/safety_checker_flax.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Tuple
16
+
17
+ import jax
18
+ import jax.numpy as jnp
19
+ from flax import linen as nn
20
+ from flax.core.frozen_dict import FrozenDict
21
+ from transformers import CLIPConfig, FlaxPreTrainedModel
22
+ from transformers.models.clip.modeling_flax_clip import FlaxCLIPVisionModule
23
+
24
+
25
def jax_cosine_distance(emb_1, emb_2, eps=1e-12):
    """Return the pairwise cosine-similarity matrix between two embedding batches.

    Each row is L2-normalized, with norms clamped to at least `eps` to avoid
    division by zero.
    """
    # Pass the lower bound positionally: the `a_min` keyword of `jnp.clip` was
    # deprecated in jax 0.4.27 in favor of `min` and later removed; the
    # positional form works across versions.
    norm_emb_1 = jnp.divide(emb_1.T, jnp.clip(jnp.linalg.norm(emb_1, axis=1), eps)).T
    norm_emb_2 = jnp.divide(emb_2.T, jnp.clip(jnp.linalg.norm(emb_2, axis=1), eps)).T
    return jnp.matmul(norm_emb_1, norm_emb_2.T)
30
+
31
class FlaxStableDiffusionSafetyCheckerModule(nn.Module):
    """Flax module that flags CLIP image embeddings matching NSFW concept embeddings."""

    config: CLIPConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.vision_model = FlaxCLIPVisionModule(self.config.vision_config)
        self.visual_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype)

        # Parameter names must match the checkpoint keys; values are loaded, not trained.
        self.concept_embeds = self.param("concept_embeds", jax.nn.initializers.ones, (17, self.config.projection_dim))
        self.special_care_embeds = self.param(
            "special_care_embeds", jax.nn.initializers.ones, (3, self.config.projection_dim)
        )

        self.concept_embeds_weights = self.param("concept_embeds_weights", jax.nn.initializers.ones, (17,))
        self.special_care_embeds_weights = self.param("special_care_embeds_weights", jax.nn.initializers.ones, (3,))

    def __call__(self, clip_input):
        pooled = self.vision_model(clip_input)[1]
        embeds = self.visual_projection(pooled)

        special_sim = jax_cosine_distance(embeds, self.special_care_embeds)
        concept_sim = jax_cosine_distance(embeds, self.concept_embeds)

        # increase this value to create a stronger `nsfw` filter
        # at the cost of increasing the possibility of filtering benign image inputs
        adjustment = 0.0

        special_scores = jnp.round(special_sim - self.special_care_embeds_weights[None, :] + adjustment, 3)
        is_special_care = jnp.any(special_scores > 0, axis=1, keepdims=True)
        # Use a lower threshold if an image has any special care concept
        special_adjustment = is_special_care * 0.01

        concept_scores = jnp.round(concept_sim - self.concept_embeds_weights[None, :] + special_adjustment, 3)
        return jnp.any(concept_scores > 0, axis=1)
+
70
+
71
class FlaxStableDiffusionSafetyChecker(FlaxPreTrainedModel):
    """Flax pretrained-model wrapper around `FlaxStableDiffusionSafetyCheckerModule`."""

    config_class = CLIPConfig
    main_input_name = "clip_input"
    module_class = FlaxStableDiffusionSafetyCheckerModule

    def __init__(
        self,
        config: CLIPConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        # Default to a single 224x224 RGB image in NHWC layout.
        if input_shape is None:
            input_shape = (1, 224, 224, 3)
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.Array, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # Initialize the module with a random input tensor of the requested shape.
        sample = jax.random.normal(rng, input_shape)

        params_key, dropout_key = jax.random.split(rng)
        rngs = {"params": params_key, "dropout": dropout_key}

        return self.module.init(rngs, sample)["params"]

    def __call__(
        self,
        clip_input,
        params: dict = None,
    ):
        # The module expects NHWC; callers pass NCHW.
        clip_input = jnp.transpose(clip_input, (0, 2, 3, 1))

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(clip_input, dtype=jnp.float32),
            rngs={},
        )
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional, Union
16
+
17
+ import torch
18
+ from torch import nn
19
+
20
+ from ...configuration_utils import ConfigMixin, register_to_config
21
+ from ...models.modeling_utils import ModelMixin
22
+
23
+
24
class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin):
    """
    This class is used to hold the mean and standard deviation of the CLIP embedder used in stable unCLIP.

    It is used to normalize the image embeddings before the noise is applied and un-normalize the noised image
    embeddings.
    """

    @register_to_config
    def __init__(self, embedding_dim: int = 768):
        super().__init__()

        # Statistics are loaded from the checkpoint; initialized to identity here.
        self.mean = nn.Parameter(torch.zeros(1, embedding_dim))
        self.std = nn.Parameter(torch.ones(1, embedding_dim))

    def to(
        self,
        torch_device: Optional[Union[str, torch.device]] = None,
        torch_dtype: Optional[torch.dtype] = None,
    ):
        # Re-wrap as Parameters so the moved/cast tensors stay registered on the module.
        self.mean = nn.Parameter(self.mean.to(torch_device).to(torch_dtype))
        self.std = nn.Parameter(self.std.to(torch_device).to(torch_dtype))
        return self

    def scale(self, embeds):
        """Normalize embeddings to zero mean / unit standard deviation."""
        return (embeds - self.mean) * 1.0 / self.std

    def unscale(self, embeds):
        """Invert `scale`, mapping normalized embeddings back to the original space."""
        return (embeds * self.std) + self.mean
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_diffusion_xl/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__init__.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    BaseOutput,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


_dummy_objects = {}
_import_structure = {}

# Register either the real pipeline entries or the dummy placeholders, depending
# on whether both torch and transformers are installed.
try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["pipeline_stable_video_diffusion"] = [
        "StableVideoDiffusionPipeline",
        "StableVideoDiffusionPipelineOutput",
    ]


if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager path: used by static type checkers and when slow imports are requested.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_stable_video_diffusion import (
            StableVideoDiffusionPipeline,
            StableVideoDiffusionPipelineOutput,
        )

else:
    # Lazy path: replace this module with a proxy that imports submodules on attribute access.
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.17 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_diffusion.cpython-310.pyc ADDED
Binary file (22.9 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ from typing import Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
23
+
24
+ from ...image_processor import PipelineImageInput
25
+ from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
26
+ from ...schedulers import EulerDiscreteScheduler
27
+ from ...utils import BaseOutput, is_torch_xla_available, logging, replace_example_docstring
28
+ from ...utils.torch_utils import is_compiled_module, randn_tensor
29
+ from ...video_processor import VideoProcessor
30
+ from ..pipeline_utils import DiffusionPipeline
31
+
32
+
33
# Import the XLA runtime only when it is installed; `XLA_AVAILABLE` gates the
# per-step `xm.mark_step()` calls in the denoising loop of `__call__` below.
if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Usage example spliced into `__call__`'s docstring via `replace_example_docstring`.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> from diffusers import StableVideoDiffusionPipeline
        >>> from diffusers.utils import load_image, export_to_video

        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
        ...     "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
        ... )
        >>> pipe.to("cuda")

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
        ... )
        >>> image = image.resize((1024, 576))

        >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
        >>> export_to_video(frames, "generated.mp4", fps=7)
        ```
"""
63
+
64
+
65
+ def _append_dims(x, target_dims):
66
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
67
+ dims_to_append = target_dims - x.ndim
68
+ if dims_to_append < 0:
69
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
70
+ return x[(...,) + (None,) * dims_to_append]
71
+
72
+
73
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Call the scheduler's `set_timesteps` and return the resulting timestep schedule.

    Exactly one of `num_inference_steps`, `timesteps`, or `sigmas` selects the schedule; `timesteps` and
    `sigmas` are custom overrides that the scheduler must explicitly support. Extra `kwargs` are forwarded
    to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`): The scheduler to get timesteps from.
        num_inference_steps (`int`): Number of diffusion steps; mutually exclusive with `timesteps`/`sigmas`.
        device (`str` or `torch.device`, *optional*): Device the timesteps should be moved to.
        timesteps (`List[int]`, *optional*): Custom timestep schedule.
        sigmas (`List[float]`, *optional*): Custom sigma schedule.

    Returns:
        `Tuple[torch.Tensor, int]`: the timestep schedule and the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # A custom schedule is only usable if `set_timesteps` accepts the corresponding keyword.
    accepted_params = set(inspect.signature(scheduler.set_timesteps).parameters.keys())
    uses_custom_schedule = timesteps is not None or sigmas is not None

    if timesteps is not None:
        if "timesteps" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    elif sigmas is not None:
        if "sigmas" not in accepted_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)

    timesteps = scheduler.timesteps
    if uses_custom_schedule:
        # The scheduler decides the final length when a custom schedule is supplied.
        num_inference_steps = len(timesteps)
    return timesteps, num_inference_steps
131
+
132
+
133
@dataclass
class StableVideoDiffusionPipelineOutput(BaseOutput):
    r"""
    Output class for the Stable Video Diffusion pipeline.

    Args:
        frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.Tensor`]):
            The generated video frames: a list (length `batch_size`) of per-video PIL image lists, or a numpy
            array / torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`.
    """

    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
145
+
146
+
147
class StableVideoDiffusionPipeline(DiffusionPipeline):
    r"""
    Pipeline to generate video from an input image using Stable Video Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKLTemporalDecoder`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder
            ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
        unet ([`UNetSpatioTemporalConditionModel`]):
            A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
        scheduler ([`EulerDiscreteScheduler`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            A `CLIPImageProcessor` to extract features from generated images.
    """

    # Order in which submodules are moved between CPU and accelerator during model CPU offload.
    model_cpu_offload_seq = "image_encoder->unet->vae"
    # Tensor names that `callback_on_step_end` callbacks may read and replace (see `__call__`).
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )
        # Spatial downsampling factor of the VAE; falls back to 8 when no VAE is registered.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

    def _encode_image(
        self,
        image: PipelineImageInput,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ) -> torch.Tensor:
        """Encode `image` with the CLIP vision encoder; when CFG is on, prepend a zero negative embedding."""
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.video_processor.pil_to_numpy(image)
            image = self.video_processor.numpy_to_pt(image)

            # We normalize the image before resizing to match with the original implementation.
            # Then we unnormalize it after resizing.
            image = image * 2.0 - 1.0
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

        # Normalize the image with for CLIP input
        image = self.feature_extractor(
            images=image,
            do_normalize=True,
            do_center_crop=False,
            do_resize=False,
            do_rescale=False,
            return_tensors="pt",
        ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])

        return image_embeddings

    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        """Encode the conditioning image into VAE latents (distribution mode); zeros are prepended for CFG."""
        image = image.to(device=device)
        image_latents = self.vae.encode(image).latent_dist.mode()

        # duplicate image_latents for each generation per prompt, using mps friendly method
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)

        if do_classifier_free_guidance:
            negative_image_latents = torch.zeros_like(image_latents)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_latents = torch.cat([negative_image_latents, image_latents])

        return image_latents

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
        do_classifier_free_guidance: bool,
    ):
        """Build the (fps, motion_bucket_id, noise_aug_strength) micro-conditioning tensor for the UNet."""
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]

        # Sanity-check that the UNet's added-embedding layer matches the number of conditioning values.
        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features

        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
            )

        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)

        if do_classifier_free_guidance:
            add_time_ids = torch.cat([add_time_ids, add_time_ids])

        return add_time_ids

    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
        """Decode video latents to pixel frames, `decode_chunk_size` frames at a time, returning float32."""
        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
        latents = latents.flatten(0, 1)

        latents = 1 / self.vae.config.scaling_factor * latents

        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())

        # decode decode_chunk_size frames at a time to avoid OOM
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            decode_kwargs = {}
            if accepts_num_frames:
                # we only pass num_frames_in if it's expected
                decode_kwargs["num_frames"] = num_frames_in

            frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
            frames.append(frame)
        frames = torch.cat(frames, dim=0)

        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        frames = frames.float()
        return frames

    def check_inputs(self, image, height, width):
        """Validate the image input type and that `height`/`width` are divisible by 8."""
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        num_channels_latents: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
    ):
        """Sample (or reuse) initial noise latents and scale them by the scheduler's initial noise sigma."""
        # Half of the UNet's input channels come from the concatenated image latents (see `__call__`,
        # which concatenates over dim=2), so the noise latents only use `num_channels_latents // 2`.
        shape = (
            batch_size,
            num_frames,
            num_channels_latents // 2,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @property
    def guidance_scale(self):
        """Guidance scale for the current call; in `__call__` this becomes a per-frame tensor."""
        return self._guidance_scale

    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        if isinstance(self.guidance_scale, (int, float)):
            return self.guidance_scale > 1
        # `guidance_scale` may be a per-frame tensor; CFG is on if any frame's scale exceeds 1.
        return self.guidance_scale.max() > 1

    @property
    def num_timesteps(self):
        """Number of scheduler timesteps used by the last (or current) call."""
        return self._num_timesteps

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        num_inference_steps: int = 25,
        sigmas: Optional[List[float]] = None,
        min_guidance_scale: float = 1.0,
        max_guidance_scale: float = 3.0,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        return_dict: bool = True,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
                1]`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_frames (`int`, *optional*):
                The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
                `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
            num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                expense of slower inference. This parameter is modulated by `strength`.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            min_guidance_scale (`float`, *optional*, defaults to 1.0):
                The minimum guidance scale. Used for the classifier free guidance with first frame.
            max_guidance_scale (`float`, *optional*, defaults to 3.0):
                The maximum guidance scale. Used for the classifier free guidance with last frame.
            fps (`int`, *optional*, defaults to 7):
                Frames per second. The rate at which the generated images shall be exported to a video after
                generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
            motion_bucket_id (`int`, *optional*, defaults to 127):
                Used for conditioning the amount of motion for the generation. The higher the number the more motion
                will be in the video.
            noise_aug_strength (`float`, *optional*, defaults to 0.02):
                The amount of noise added to the init image, the higher it is the less the video will look like the
                init image. Increase it for more motion.
            decode_chunk_size (`int`, *optional*):
                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
                expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
                For lower memory usage, reduce `decode_chunk_size`.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `pil`, `np` or `pt`.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments:
                `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
                `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
                returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.Tensor`) is
                returned.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(image, height, width)

        # 2. Define call parameters
        if isinstance(image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, list):
            batch_size = len(image)
        else:
            batch_size = image.shape[0]
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        self._guidance_scale = max_guidance_scale

        # 3. Encode input image
        image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)

        # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
        # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
        fps = fps - 1

        # 4. Encode input image using VAE
        image = self.video_processor.preprocess(image, height=height, width=width).to(device)
        noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
        image = image + noise_aug_strength * noise

        # fp16 VAEs that require it are temporarily upcast to fp32 for encoding.
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
        if needs_upcasting:
            self.vae.to(dtype=torch.float32)

        image_latents = self._encode_vae_image(
            image,
            device=device,
            num_videos_per_prompt=num_videos_per_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
        )
        image_latents = image_latents.to(image_embeddings.dtype)

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)

        # Repeat the image latents for each frame so we can concatenate them with the noise
        # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # 6. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)

        # 7. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_frames,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )

        # 8. Prepare guidance scale
        # The guidance scale ramps linearly from `min_guidance_scale` (first frame) to
        # `max_guidance_scale` (last frame), broadcast to the latents' rank.
        guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
        guidance_scale = guidance_scale.to(device, latents.dtype)
        guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)

        self._guidance_scale = guidance_scale

        # 9. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Concatenate image_latents over channels dimension
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=image_embeddings,
                    added_time_ids=added_time_ids,
                    return_dict=False,
                )[0]

                # perform guidance
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents).prev_sample

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        if not output_type == "latent":
            # cast back to fp16 if needed
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)
            frames = self.decode_latents(latents, num_frames, decode_chunk_size)
            frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return StableVideoDiffusionPipelineOutput(frames=frames)
629
+
630
+
631
# resizing utils
# TODO: clean up later
def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
    """Gaussian-blur `input` with a sigma derived from the resize factor, then interpolate to `size`."""
    h, w = input.shape[-2:]
    factors = (h / size[0], w / size[1])

    # Sigma per axis, following skimage's anti-aliasing rule:
    # https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
    sigmas = tuple(max((factor - 1.0) / 2.0, 0.001) for factor in factors)

    # Kernel size: 2 sigmas per side (Pillow uses 1 sigma but in 2 passes, which gives better
    # results — https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206),
    # at least 3 taps, forced to be odd.
    kernel_sizes = []
    for sigma in sigmas:
        taps = int(max(2.0 * 2 * sigma, 3))
        if taps % 2 == 0:
            taps += 1
        kernel_sizes.append(taps)

    blurred = _gaussian_blur2d(input, tuple(kernel_sizes), sigmas)

    return torch.nn.functional.interpolate(blurred, size=size, mode=interpolation, align_corners=align_corners)
660
+
661
+
662
+ def _compute_padding(kernel_size):
663
+ """Compute padding tuple."""
664
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
665
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
666
+ if len(kernel_size) < 2:
667
+ raise AssertionError(kernel_size)
668
+ computed = [k - 1 for k in kernel_size]
669
+
670
+ # for even kernels we need to do asymmetric padding :(
671
+ out_padding = 2 * len(kernel_size) * [0]
672
+
673
+ for i in range(len(kernel_size)):
674
+ computed_tmp = computed[-(i + 1)]
675
+
676
+ pad_front = computed_tmp // 2
677
+ pad_rear = computed_tmp - pad_front
678
+
679
+ out_padding[2 * i + 0] = pad_front
680
+ out_padding[2 * i + 1] = pad_rear
681
+
682
+ return out_padding
683
+
684
+
685
def _filter2d(input, kernel):
    """Convolve `input` (B, C, H, W) channel-wise with `kernel`, using reflect padding to keep H, W."""
    b, c, h, w = input.shape
    prepared = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
    prepared = prepared.expand(-1, c, -1, -1)

    kernel_h, kernel_w = prepared.shape[-2:]

    # Reflect-pad so the valid convolution below preserves the spatial size.
    padded = torch.nn.functional.pad(input, _compute_padding([kernel_h, kernel_w]), mode="reflect")

    # Flatten kernel and input into grouped form so every channel is filtered independently.
    prepared = prepared.reshape(-1, 1, kernel_h, kernel_w)
    padded = padded.view(-1, prepared.size(0), padded.size(-2), padded.size(-1))

    filtered = torch.nn.functional.conv2d(padded, prepared, groups=prepared.size(0), padding=0, stride=1)

    return filtered.view(b, c, h, w)
706
+
707
+
708
+ def _gaussian(window_size: int, sigma):
709
+ if isinstance(sigma, float):
710
+ sigma = torch.tensor([[sigma]])
711
+
712
+ batch_size = sigma.shape[0]
713
+
714
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
715
+
716
+ if window_size % 2 == 0:
717
+ x = x + 0.5
718
+
719
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
720
+
721
+ return gauss / gauss.sum(-1, keepdim=True)
722
+
723
+
724
def _gaussian_blur2d(input, kernel_size, sigma):
    """Separable gaussian blur: one 1-D pass along x, then one along y."""
    if isinstance(sigma, tuple):
        sigma = torch.tensor([sigma], dtype=input.dtype)
    else:
        sigma = sigma.to(dtype=input.dtype)

    ky, kx = int(kernel_size[0]), int(kernel_size[1])
    bs = sigma.shape[0]

    # Build one 1-D kernel per axis; sigma columns are ordered (y, x).
    kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
    kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))

    # Apply the two separable passes as row/column 2-D filters.
    blurred_x = _filter2d(input, kernel_x[..., None, :])
    return _filter2d(blurred_x, kernel_y[..., None])
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ DIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_torch_available,
9
+ is_transformers_available,
10
+ )
11
+
12
+
13
+ _dummy_objects = {}
14
+ _import_structure = {}
15
+
16
+ try:
17
+ if not (is_transformers_available() and is_torch_available()):
18
+ raise OptionalDependencyNotAvailable()
19
+ except OptionalDependencyNotAvailable:
20
+ from ...utils import dummy_torch_and_transformers_objects # noqa F403
21
+
22
+ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
23
+ else:
24
+ _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"]
25
+ _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"]
26
+
27
+
28
+ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
29
+ try:
30
+ if not (is_transformers_available() and is_torch_available()):
31
+ raise OptionalDependencyNotAvailable()
32
+ except OptionalDependencyNotAvailable:
33
+ from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
34
+ else:
35
+ from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline
36
+ from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline
37
+ else:
38
+ import sys
39
+
40
+ sys.modules[__name__] = _LazyModule(
41
+ __name__,
42
+ globals()["__file__"],
43
+ _import_structure,
44
+ module_spec=__spec__,
45
+ )
46
+ for name, value in _dummy_objects.items():
47
+ setattr(sys.modules[__name__], name, value)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__pycache__/pipeline_stable_diffusion_adapter.cpython-310.pyc ADDED
Binary file (32.5 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/__pycache__/pipeline_stable_diffusion_xl_adapter.cpython-310.pyc ADDED
Binary file (46.3 kB). View file
 
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py ADDED
@@ -0,0 +1,956 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 TencentARC and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from dataclasses import dataclass
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import PIL.Image
21
+ import torch
22
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
23
+
24
+ from ...image_processor import VaeImageProcessor
25
+ from ...loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
26
+ from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
27
+ from ...models.lora import adjust_lora_scale_text_encoder
28
+ from ...schedulers import KarrasDiffusionSchedulers
29
+ from ...utils import (
30
+ PIL_INTERPOLATION,
31
+ USE_PEFT_BACKEND,
32
+ BaseOutput,
33
+ deprecate,
34
+ is_torch_xla_available,
35
+ logging,
36
+ replace_example_docstring,
37
+ scale_lora_layers,
38
+ unscale_lora_layers,
39
+ )
40
+ from ...utils.torch_utils import randn_tensor
41
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
42
+ from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
43
+
44
+
45
+ if is_torch_xla_available():
46
+ import torch_xla.core.xla_model as xm
47
+
48
+ XLA_AVAILABLE = True
49
+ else:
50
+ XLA_AVAILABLE = False
51
+
52
+
53
+ @dataclass
54
+ class StableDiffusionAdapterPipelineOutput(BaseOutput):
55
+ """
56
+ Args:
57
+ images (`List[PIL.Image.Image]` or `np.ndarray`)
58
+ List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
59
+ num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
60
+ nsfw_content_detected (`List[bool]`)
61
+ List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
62
+ (nsfw) content, or `None` if safety checking could not be performed.
63
+ """
64
+
65
+ images: Union[List[PIL.Image.Image], np.ndarray]
66
+ nsfw_content_detected: Optional[List[bool]]
67
+
68
+
69
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
70
+
71
+
72
+ EXAMPLE_DOC_STRING = """
73
+ Examples:
74
+ ```py
75
+ >>> from PIL import Image
76
+ >>> from diffusers.utils import load_image
77
+ >>> import torch
78
+ >>> from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
79
+
80
+ >>> image = load_image(
81
+ ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png"
82
+ ... )
83
+
84
+ >>> color_palette = image.resize((8, 8))
85
+ >>> color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST)
86
+
87
+ >>> adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
88
+ >>> pipe = StableDiffusionAdapterPipeline.from_pretrained(
89
+ ... "CompVis/stable-diffusion-v1-4",
90
+ ... adapter=adapter,
91
+ ... torch_dtype=torch.float16,
92
+ ... )
93
+
94
+ >>> pipe.to("cuda")
95
+
96
+ >>> out_image = pipe(
97
+ ... "At night, glowing cubes in front of the beach",
98
+ ... image=color_palette,
99
+ ... ).images[0]
100
+ ```
101
+ """
102
+
103
+
104
+ def _preprocess_adapter_image(image, height, width):
105
+ if isinstance(image, torch.Tensor):
106
+ return image
107
+ elif isinstance(image, PIL.Image.Image):
108
+ image = [image]
109
+
110
+ if isinstance(image[0], PIL.Image.Image):
111
+ image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
112
+ image = [
113
+ i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
114
+ ] # expand [h, w] or [h, w, c] to [b, h, w, c]
115
+ image = np.concatenate(image, axis=0)
116
+ image = np.array(image).astype(np.float32) / 255.0
117
+ image = image.transpose(0, 3, 1, 2)
118
+ image = torch.from_numpy(image)
119
+ elif isinstance(image[0], torch.Tensor):
120
+ if image[0].ndim == 3:
121
+ image = torch.stack(image, dim=0)
122
+ elif image[0].ndim == 4:
123
+ image = torch.cat(image, dim=0)
124
+ else:
125
+ raise ValueError(
126
+ f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
127
+ )
128
+ return image
129
+
130
+
131
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
132
+ def retrieve_timesteps(
133
+ scheduler,
134
+ num_inference_steps: Optional[int] = None,
135
+ device: Optional[Union[str, torch.device]] = None,
136
+ timesteps: Optional[List[int]] = None,
137
+ sigmas: Optional[List[float]] = None,
138
+ **kwargs,
139
+ ):
140
+ r"""
141
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
142
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
143
+
144
+ Args:
145
+ scheduler (`SchedulerMixin`):
146
+ The scheduler to get timesteps from.
147
+ num_inference_steps (`int`):
148
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
149
+ must be `None`.
150
+ device (`str` or `torch.device`, *optional*):
151
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
152
+ timesteps (`List[int]`, *optional*):
153
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
154
+ `num_inference_steps` and `sigmas` must be `None`.
155
+ sigmas (`List[float]`, *optional*):
156
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
157
+ `num_inference_steps` and `timesteps` must be `None`.
158
+
159
+ Returns:
160
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
161
+ second element is the number of inference steps.
162
+ """
163
+ if timesteps is not None and sigmas is not None:
164
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
165
+ if timesteps is not None:
166
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
167
+ if not accepts_timesteps:
168
+ raise ValueError(
169
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
170
+ f" timestep schedules. Please check whether you are using the correct scheduler."
171
+ )
172
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
173
+ timesteps = scheduler.timesteps
174
+ num_inference_steps = len(timesteps)
175
+ elif sigmas is not None:
176
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
177
+ if not accept_sigmas:
178
+ raise ValueError(
179
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
180
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
181
+ )
182
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
183
+ timesteps = scheduler.timesteps
184
+ num_inference_steps = len(timesteps)
185
+ else:
186
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
187
+ timesteps = scheduler.timesteps
188
+ return timesteps, num_inference_steps
189
+
190
+
191
+ class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin):
192
+ r"""
193
+ Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
194
+ https://huggingface.co/papers/2302.08453
195
+
196
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
197
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
198
+
199
+ Args:
200
+ adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
201
+ Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
202
+ list, the outputs from each Adapter are added together to create one combined additional conditioning.
203
+ adapter_weights (`List[float]`, *optional*, defaults to None):
204
+ List of floats representing the weight which will be multiply to each adapter's output before adding them
205
+ together.
206
+ vae ([`AutoencoderKL`]):
207
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
208
+ text_encoder ([`CLIPTextModel`]):
209
+ Frozen text-encoder. Stable Diffusion uses the text portion of
210
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
211
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
212
+ tokenizer (`CLIPTokenizer`):
213
+ Tokenizer of class
214
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
215
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
216
+ scheduler ([`SchedulerMixin`]):
217
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
218
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
219
+ safety_checker ([`StableDiffusionSafetyChecker`]):
220
+ Classification module that estimates whether generated images could be considered offensive or harmful.
221
+ Please, refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
222
+ details.
223
+ feature_extractor ([`CLIPImageProcessor`]):
224
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
225
+ """
226
+
227
+ model_cpu_offload_seq = "text_encoder->adapter->unet->vae"
228
+ _optional_components = ["safety_checker", "feature_extractor"]
229
+
230
+ def __init__(
231
+ self,
232
+ vae: AutoencoderKL,
233
+ text_encoder: CLIPTextModel,
234
+ tokenizer: CLIPTokenizer,
235
+ unet: UNet2DConditionModel,
236
+ adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
237
+ scheduler: KarrasDiffusionSchedulers,
238
+ safety_checker: StableDiffusionSafetyChecker,
239
+ feature_extractor: CLIPImageProcessor,
240
+ requires_safety_checker: bool = True,
241
+ ):
242
+ super().__init__()
243
+
244
+ if safety_checker is None and requires_safety_checker:
245
+ logger.warning(
246
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
247
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
248
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
249
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
250
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
251
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
252
+ )
253
+
254
+ if safety_checker is not None and feature_extractor is None:
255
+ raise ValueError(
256
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
257
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
258
+ )
259
+
260
+ if isinstance(adapter, (list, tuple)):
261
+ adapter = MultiAdapter(adapter)
262
+
263
+ self.register_modules(
264
+ vae=vae,
265
+ text_encoder=text_encoder,
266
+ tokenizer=tokenizer,
267
+ unet=unet,
268
+ adapter=adapter,
269
+ scheduler=scheduler,
270
+ safety_checker=safety_checker,
271
+ feature_extractor=feature_extractor,
272
+ )
273
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
274
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
275
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
276
+
277
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
278
+ def _encode_prompt(
279
+ self,
280
+ prompt,
281
+ device,
282
+ num_images_per_prompt,
283
+ do_classifier_free_guidance,
284
+ negative_prompt=None,
285
+ prompt_embeds: Optional[torch.Tensor] = None,
286
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
287
+ lora_scale: Optional[float] = None,
288
+ **kwargs,
289
+ ):
290
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
291
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
292
+
293
+ prompt_embeds_tuple = self.encode_prompt(
294
+ prompt=prompt,
295
+ device=device,
296
+ num_images_per_prompt=num_images_per_prompt,
297
+ do_classifier_free_guidance=do_classifier_free_guidance,
298
+ negative_prompt=negative_prompt,
299
+ prompt_embeds=prompt_embeds,
300
+ negative_prompt_embeds=negative_prompt_embeds,
301
+ lora_scale=lora_scale,
302
+ **kwargs,
303
+ )
304
+
305
+ # concatenate for backwards comp
306
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
307
+
308
+ return prompt_embeds
309
+
310
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
311
+ def encode_prompt(
312
+ self,
313
+ prompt,
314
+ device,
315
+ num_images_per_prompt,
316
+ do_classifier_free_guidance,
317
+ negative_prompt=None,
318
+ prompt_embeds: Optional[torch.Tensor] = None,
319
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
320
+ lora_scale: Optional[float] = None,
321
+ clip_skip: Optional[int] = None,
322
+ ):
323
+ r"""
324
+ Encodes the prompt into text encoder hidden states.
325
+
326
+ Args:
327
+ prompt (`str` or `List[str]`, *optional*):
328
+ prompt to be encoded
329
+ device: (`torch.device`):
330
+ torch device
331
+ num_images_per_prompt (`int`):
332
+ number of images that should be generated per prompt
333
+ do_classifier_free_guidance (`bool`):
334
+ whether to use classifier free guidance or not
335
+ negative_prompt (`str` or `List[str]`, *optional*):
336
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
337
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
338
+ less than `1`).
339
+ prompt_embeds (`torch.Tensor`, *optional*):
340
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
341
+ provided, text embeddings will be generated from `prompt` input argument.
342
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
343
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
344
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
345
+ argument.
346
+ lora_scale (`float`, *optional*):
347
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
348
+ clip_skip (`int`, *optional*):
349
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
350
+ the output of the pre-final layer will be used for computing the prompt embeddings.
351
+ """
352
+ # set lora scale so that monkey patched LoRA
353
+ # function of text encoder can correctly access it
354
+ if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
355
+ self._lora_scale = lora_scale
356
+
357
+ # dynamically adjust the LoRA scale
358
+ if not USE_PEFT_BACKEND:
359
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
360
+ else:
361
+ scale_lora_layers(self.text_encoder, lora_scale)
362
+
363
+ if prompt is not None and isinstance(prompt, str):
364
+ batch_size = 1
365
+ elif prompt is not None and isinstance(prompt, list):
366
+ batch_size = len(prompt)
367
+ else:
368
+ batch_size = prompt_embeds.shape[0]
369
+
370
+ if prompt_embeds is None:
371
+ # textual inversion: process multi-vector tokens if necessary
372
+ if isinstance(self, TextualInversionLoaderMixin):
373
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
374
+
375
+ text_inputs = self.tokenizer(
376
+ prompt,
377
+ padding="max_length",
378
+ max_length=self.tokenizer.model_max_length,
379
+ truncation=True,
380
+ return_tensors="pt",
381
+ )
382
+ text_input_ids = text_inputs.input_ids
383
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
384
+
385
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
386
+ text_input_ids, untruncated_ids
387
+ ):
388
+ removed_text = self.tokenizer.batch_decode(
389
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
390
+ )
391
+ logger.warning(
392
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
393
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
394
+ )
395
+
396
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
397
+ attention_mask = text_inputs.attention_mask.to(device)
398
+ else:
399
+ attention_mask = None
400
+
401
+ if clip_skip is None:
402
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
403
+ prompt_embeds = prompt_embeds[0]
404
+ else:
405
+ prompt_embeds = self.text_encoder(
406
+ text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
407
+ )
408
+ # Access the `hidden_states` first, that contains a tuple of
409
+ # all the hidden states from the encoder layers. Then index into
410
+ # the tuple to access the hidden states from the desired layer.
411
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
412
+ # We also need to apply the final LayerNorm here to not mess with the
413
+ # representations. The `last_hidden_states` that we typically use for
414
+ # obtaining the final prompt representations passes through the LayerNorm
415
+ # layer.
416
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
417
+
418
+ if self.text_encoder is not None:
419
+ prompt_embeds_dtype = self.text_encoder.dtype
420
+ elif self.unet is not None:
421
+ prompt_embeds_dtype = self.unet.dtype
422
+ else:
423
+ prompt_embeds_dtype = prompt_embeds.dtype
424
+
425
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
426
+
427
+ bs_embed, seq_len, _ = prompt_embeds.shape
428
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
429
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
430
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
431
+
432
+ # get unconditional embeddings for classifier free guidance
433
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
434
+ uncond_tokens: List[str]
435
+ if negative_prompt is None:
436
+ uncond_tokens = [""] * batch_size
437
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
438
+ raise TypeError(
439
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
440
+ f" {type(prompt)}."
441
+ )
442
+ elif isinstance(negative_prompt, str):
443
+ uncond_tokens = [negative_prompt]
444
+ elif batch_size != len(negative_prompt):
445
+ raise ValueError(
446
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
447
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
448
+ " the batch size of `prompt`."
449
+ )
450
+ else:
451
+ uncond_tokens = negative_prompt
452
+
453
+ # textual inversion: process multi-vector tokens if necessary
454
+ if isinstance(self, TextualInversionLoaderMixin):
455
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
456
+
457
+ max_length = prompt_embeds.shape[1]
458
+ uncond_input = self.tokenizer(
459
+ uncond_tokens,
460
+ padding="max_length",
461
+ max_length=max_length,
462
+ truncation=True,
463
+ return_tensors="pt",
464
+ )
465
+
466
+ if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
467
+ attention_mask = uncond_input.attention_mask.to(device)
468
+ else:
469
+ attention_mask = None
470
+
471
+ negative_prompt_embeds = self.text_encoder(
472
+ uncond_input.input_ids.to(device),
473
+ attention_mask=attention_mask,
474
+ )
475
+ negative_prompt_embeds = negative_prompt_embeds[0]
476
+
477
+ if do_classifier_free_guidance:
478
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
479
+ seq_len = negative_prompt_embeds.shape[1]
480
+
481
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
482
+
483
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
484
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
485
+
486
+ if self.text_encoder is not None:
487
+ if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
488
+ # Retrieve the original scale by scaling back the LoRA layers
489
+ unscale_lora_layers(self.text_encoder, lora_scale)
490
+
491
+ return prompt_embeds, negative_prompt_embeds
492
+
493
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
494
+ def run_safety_checker(self, image, device, dtype):
495
+ if self.safety_checker is None:
496
+ has_nsfw_concept = None
497
+ else:
498
+ if torch.is_tensor(image):
499
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
500
+ else:
501
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
502
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
503
+ image, has_nsfw_concept = self.safety_checker(
504
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
505
+ )
506
+ return image, has_nsfw_concept
507
+
508
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
509
+ def decode_latents(self, latents):
510
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
511
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
512
+
513
+ latents = 1 / self.vae.config.scaling_factor * latents
514
+ image = self.vae.decode(latents, return_dict=False)[0]
515
+ image = (image / 2 + 0.5).clamp(0, 1)
516
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
517
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
518
+ return image
519
+
520
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
521
+ def prepare_extra_step_kwargs(self, generator, eta):
522
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
523
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
524
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
525
+ # and should be between [0, 1]
526
+
527
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
528
+ extra_step_kwargs = {}
529
+ if accepts_eta:
530
+ extra_step_kwargs["eta"] = eta
531
+
532
+ # check if the scheduler accepts generator
533
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
534
+ if accepts_generator:
535
+ extra_step_kwargs["generator"] = generator
536
+ return extra_step_kwargs
537
+
538
+ def check_inputs(
539
+ self,
540
+ prompt,
541
+ height,
542
+ width,
543
+ callback_steps,
544
+ image,
545
+ negative_prompt=None,
546
+ prompt_embeds=None,
547
+ negative_prompt_embeds=None,
548
+ ):
549
+ if height % 8 != 0 or width % 8 != 0:
550
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
551
+
552
+ if (callback_steps is None) or (
553
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
554
+ ):
555
+ raise ValueError(
556
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
557
+ f" {type(callback_steps)}."
558
+ )
559
+
560
+ if prompt is not None and prompt_embeds is not None:
561
+ raise ValueError(
562
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
563
+ " only forward one of the two."
564
+ )
565
+ elif prompt is None and prompt_embeds is None:
566
+ raise ValueError(
567
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
568
+ )
569
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
570
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
571
+
572
+ if negative_prompt is not None and negative_prompt_embeds is not None:
573
+ raise ValueError(
574
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
575
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
576
+ )
577
+
578
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
579
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
580
+ raise ValueError(
581
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
582
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
583
+ f" {negative_prompt_embeds.shape}."
584
+ )
585
+
586
+ if isinstance(self.adapter, MultiAdapter):
587
+ if not isinstance(image, list):
588
+ raise ValueError(
589
+ "MultiAdapter is enabled, but `image` is not a list. Please pass a list of images to `image`."
590
+ )
591
+
592
+ if len(image) != len(self.adapter.adapters):
593
+ raise ValueError(
594
+ f"MultiAdapter requires passing the same number of images as adapters. Given {len(image)} images and {len(self.adapter.adapters)} adapters."
595
+ )
596
+
597
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    """Create (or relocate) the initial latent tensor and scale it for the scheduler.

    When `latents` is not supplied, Gaussian noise of the latent-space shape is
    sampled with `generator`; otherwise the provided tensor is moved to `device`.
    The result is multiplied by the scheduler's `init_noise_sigma`.
    """
    latent_height = int(height) // self.vae_scale_factor
    latent_width = int(width) // self.vae_scale_factor
    shape = (batch_size, num_channels_latents, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is None:
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
    else:
        latents = latents.to(device)

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * self.scheduler.init_noise_sigma
619
+
620
+ def _default_height_width(self, height, width, image):
621
+ # NOTE: It is possible that a list of images have different
622
+ # dimensions for each image, so just checking the first image
623
+ # is not _exactly_ correct, but it is simple.
624
+ while isinstance(image, list):
625
+ image = image[0]
626
+
627
+ if height is None:
628
+ if isinstance(image, PIL.Image.Image):
629
+ height = image.height
630
+ elif isinstance(image, torch.Tensor):
631
+ height = image.shape[-2]
632
+
633
+ # round down to nearest multiple of `self.adapter.downscale_factor`
634
+ height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
635
+
636
+ if width is None:
637
+ if isinstance(image, PIL.Image.Image):
638
+ width = image.width
639
+ elif isinstance(image, torch.Tensor):
640
+ width = image.shape[-1]
641
+
642
+ # round down to nearest multiple of `self.adapter.downscale_factor`
643
+ width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
644
+
645
+ return height, width
646
+
647
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(
    self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
    """
    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`torch.Tensor`):
            Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
        embedding_dim (`int`, *optional*, defaults to 512):
            Dimension of the embeddings to generate.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
            Data type of the generated embeddings.

    Returns:
        `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
    """
    assert len(w.shape) == 1
    scaled = w * 1000.0

    half_dim = embedding_dim // 2
    log_base = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    frequencies = torch.exp(torch.arange(half_dim, dtype=dtype) * -log_base)
    angles = scaled.to(dtype)[:, None] * frequencies[None, :]
    embedding = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad for odd target dimensions
        embedding = torch.nn.functional.pad(embedding, (0, 1))
    assert embedding.shape == (w.shape[0], embedding_dim)
    return embedding
677
+
678
@property
def guidance_scale(self):
    """Classifier-free guidance weight `w` stored for the current `__call__`."""
    return self._guidance_scale
681
+
682
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    """True when CFG applies: scale above 1 and no guidance-embedding (distilled) UNet."""
    uses_guidance_embedding = self.unet.config.time_cond_proj_dim is not None
    return self._guidance_scale > 1 and not uses_guidance_embedding
688
+
689
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    image: Union[torch.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    timesteps: List[int] = None,
    sigmas: List[float] = None,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
    callback_steps: int = 1,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    adapter_conditioning_scale: Union[float, List[float]] = 1.0,
    clip_skip: Optional[int] = None,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
            The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
            type is specified as `torch.Tensor`, it is passed to Adapter as is. `PIL.Image.Image` can also be
            accepted as an image. The control image is automatically resized to fit the output image.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
            in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
            passed will be used. Must be in descending order.
        sigmas (`List[float]`, *optional*):
            Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
            their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
            will be used.
        guidance_scale (`float`, *optional*, defaults to 7.5):
            Guidance scale as defined in [Classifier-Free Diffusion
            Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
            of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
            `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
            the text `prompt`, usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
            is less than `1`).
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
            applies to [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.Tensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.Tensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generate image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] instead
            of a plain tuple.
        callback (`Callable`, *optional*):
            A function that will be called every `callback_steps` steps during inference. The function will be
            called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
        callback_steps (`int`, *optional*, defaults to 1):
            The frequency at which the `callback` function will be called. If not specified, the callback will be
            called at every step.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
        adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
            The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
            residual in the original unet. If multiple adapters are specified in init, you can set the
            corresponding scale as a list.
        clip_skip (`int`, *optional*):
            Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
            the output of the pre-final layer will be used for computing the prompt embeddings.
    Examples:

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple. When returning a tuple, the first element is a list with the generated images, and the second
        element is a list of `bool`s denoting whether the corresponding generated image likely represents
        "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
    """
    # 0. Default height and width to unet
    height, width = self._default_height_width(height, width, image)
    device = self._execution_device

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt, height, width, callback_steps, image, negative_prompt, prompt_embeds, negative_prompt_embeds
    )

    self._guidance_scale = guidance_scale

    # Preprocess the conditioning image(s) and move them to the adapter's device/dtype.
    # MultiAdapter expects one image per sub-adapter (validated in check_inputs).
    if isinstance(self.adapter, MultiAdapter):
        adapter_input = []

        for one_image in image:
            one_image = _preprocess_adapter_image(one_image, height, width)
            one_image = one_image.to(device=device, dtype=self.adapter.dtype)
            adapter_input.append(one_image)
    else:
        adapter_input = _preprocess_adapter_image(image, height, width)
        adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype)

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        # check_inputs guarantees prompt_embeds is set when prompt is None
        batch_size = prompt_embeds.shape[0]

    # 3. Encode input prompt
    prompt_embeds, negative_prompt_embeds = self.encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        self.do_classifier_free_guidance,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        clip_skip=clip_skip,
    )
    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    if self.do_classifier_free_guidance:
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

    # 4. Prepare timesteps
    timesteps, num_inference_steps = retrieve_timesteps(
        self.scheduler, num_inference_steps, device, timesteps, sigmas
    )

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 6.5 Optionally get Guidance Scale Embedding
    # (only for guidance-distilled UNets that take the scale as a timestep condition)
    timestep_cond = None
    if self.unet.config.time_cond_proj_dim is not None:
        guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
        timestep_cond = self.get_guidance_scale_embedding(
            guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
        ).to(device=device, dtype=latents.dtype)

    # 7. Denoising loop
    if isinstance(self.adapter, MultiAdapter):
        # NOTE(review): the scale is passed into the MultiAdapter call here, so the
        # loop below only rebinds each feature map — presumably scaling happens
        # inside MultiAdapter.forward; confirm against its implementation.
        adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
        for k, v in enumerate(adapter_state):
            adapter_state[k] = v
    else:
        adapter_state = self.adapter(adapter_input)
        for k, v in enumerate(adapter_state):
            adapter_state[k] = v * adapter_conditioning_scale
    if num_images_per_prompt > 1:
        # replicate adapter features for every image generated per prompt
        for k, v in enumerate(adapter_state):
            adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
    if self.do_classifier_free_guidance:
        # duplicate adapter features for the uncond/text halves of the CFG batch
        for k, v in enumerate(adapter_state):
            adapter_state[k] = torch.cat([v] * 2, dim=0)

    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual; adapter features are cloned because the
            # UNet consumes/modifies the residual list it receives
            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=timestep_cond,
                cross_attention_kwargs=cross_attention_kwargs,
                down_intrablock_additional_residuals=[state.clone() for state in adapter_state],
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

    if output_type == "latent":
        image = latents
        has_nsfw_concept = None
    elif output_type == "pil":
        # 8. Post-processing
        image = self.decode_latents(latents)

        # 9. Run safety checker
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        # 10. Convert to PIL
        image = self.numpy_to_pil(image)
    else:
        # 8. Post-processing
        image = self.decode_latents(latents)

        # 9. Run safety checker
        image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
pythonProject/.venv/Lib/site-packages/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py ADDED
@@ -0,0 +1,1311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 TencentARC and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import PIL.Image
20
+ import torch
21
+ from transformers import (
22
+ CLIPImageProcessor,
23
+ CLIPTextModel,
24
+ CLIPTextModelWithProjection,
25
+ CLIPTokenizer,
26
+ CLIPVisionModelWithProjection,
27
+ )
28
+
29
+ from ...image_processor import PipelineImageInput, VaeImageProcessor
30
+ from ...loaders import (
31
+ FromSingleFileMixin,
32
+ IPAdapterMixin,
33
+ StableDiffusionXLLoraLoaderMixin,
34
+ TextualInversionLoaderMixin,
35
+ )
36
+ from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel
37
+ from ...models.attention_processor import (
38
+ AttnProcessor2_0,
39
+ XFormersAttnProcessor,
40
+ )
41
+ from ...models.lora import adjust_lora_scale_text_encoder
42
+ from ...schedulers import KarrasDiffusionSchedulers
43
+ from ...utils import (
44
+ PIL_INTERPOLATION,
45
+ USE_PEFT_BACKEND,
46
+ is_torch_xla_available,
47
+ logging,
48
+ replace_example_docstring,
49
+ scale_lora_layers,
50
+ unscale_lora_layers,
51
+ )
52
+ from ...utils.torch_utils import randn_tensor
53
+ from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
54
+ from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
55
+
56
+
57
+ if is_torch_xla_available():
58
+ import torch_xla.core.xla_model as xm
59
+
60
+ XLA_AVAILABLE = True
61
+ else:
62
+ XLA_AVAILABLE = False
63
+
64
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
65
+
66
+
67
# Usage example spliced into `__call__`'s docstring via @replace_example_docstring.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, DDPMScheduler
        >>> from diffusers.utils import load_image

        >>> sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L")

        >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"

        >>> adapter = T2IAdapter.from_pretrained(
        ...     "Adapter/t2iadapter",
        ...     subfolder="sketch_sdxl_1.0",
        ...     torch_dtype=torch.float16,
        ...     adapter_type="full_adapter_xl",
        ... )
        >>> scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

        >>> pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
        ...     model_id, adapter=adapter, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler
        ... ).to("cuda")

        >>> generator = torch.manual_seed(42)
        >>> sketch_image_out = pipe(
        ...     prompt="a photo of a dog in real world, high quality",
        ...     negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
        ...     image=sketch_image,
        ...     generator=generator,
        ...     guidance_scale=7.5,
        ... ).images[0]
        ```
"""
100
+
101
+
102
+ def _preprocess_adapter_image(image, height, width):
103
+ if isinstance(image, torch.Tensor):
104
+ return image
105
+ elif isinstance(image, PIL.Image.Image):
106
+ image = [image]
107
+
108
+ if isinstance(image[0], PIL.Image.Image):
109
+ image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
110
+ image = [
111
+ i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
112
+ ] # expand [h, w] or [h, w, c] to [b, h, w, c]
113
+ image = np.concatenate(image, axis=0)
114
+ image = np.array(image).astype(np.float32) / 255.0
115
+ image = image.transpose(0, 3, 1, 2)
116
+ image = torch.from_numpy(image)
117
+ elif isinstance(image[0], torch.Tensor):
118
+ if image[0].ndim == 3:
119
+ image = torch.stack(image, dim=0)
120
+ elif image[0].ndim == 4:
121
+ image = torch.cat(image, dim=0)
122
+ else:
123
+ raise ValueError(
124
+ f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
125
+ )
126
+ return image
127
+
128
+
129
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    """
    reduce_dims = list(range(1, noise_pred_text.ndim))
    std_text = noise_pred_text.std(dim=reduce_dims, keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # rescale guidance output so its per-sample std matches the text branch (fixes overexposure)
    rescaled = noise_cfg * (std_text / std_cfg)
    # blend back toward the unrescaled prediction to avoid "plain looking" images
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg
154
+
155
+
156
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    if timesteps is None and sigmas is None:
        # Default path: let the scheduler derive its own spacing from the step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    if timesteps is not None:
        # Signature is only inspected on the custom paths, matching the upstream helper.
        if "timesteps" not in set(inspect.signature(scheduler.set_timesteps).parameters.keys()):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in set(inspect.signature(scheduler.set_timesteps).parameters.keys()):
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    schedule = scheduler.timesteps
    return schedule, len(schedule)
214
+
215
+
216
+ class StableDiffusionXLAdapterPipeline(
217
+ DiffusionPipeline,
218
+ StableDiffusionMixin,
219
+ TextualInversionLoaderMixin,
220
+ StableDiffusionXLLoraLoaderMixin,
221
+ IPAdapterMixin,
222
+ FromSingleFileMixin,
223
+ ):
224
+ r"""
225
+ Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
226
+ https://huggingface.co/papers/2302.08453
227
+
228
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
229
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
230
+
231
+ The pipeline also inherits the following loading methods:
232
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
233
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
234
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
235
+ - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
236
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
237
+
238
+ Args:
239
+ adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`):
240
+ Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a
241
+ list, the outputs from each Adapter are added together to create one combined additional conditioning.
242
+ adapter_weights (`List[float]`, *optional*, defaults to None):
243
+ List of floats representing the weight which will be multiply to each adapter's output before adding them
244
+ together.
245
+ vae ([`AutoencoderKL`]):
246
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
247
+ text_encoder ([`CLIPTextModel`]):
248
+ Frozen text-encoder. Stable Diffusion uses the text portion of
249
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
250
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
251
+ tokenizer (`CLIPTokenizer`):
252
+ Tokenizer of class
253
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
254
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
255
+ scheduler ([`SchedulerMixin`]):
256
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
257
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
258
+ safety_checker ([`StableDiffusionSafetyChecker`]):
259
+ Classification module that estimates whether generated images could be considered offensive or harmful.
260
+ Please, refer to the [model card](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) for
261
+ details.
262
+ feature_extractor ([`CLIPImageProcessor`]):
263
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
264
+ """
265
+
266
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae"
267
+ _optional_components = [
268
+ "tokenizer",
269
+ "tokenizer_2",
270
+ "text_encoder",
271
+ "text_encoder_2",
272
+ "feature_extractor",
273
+ "image_encoder",
274
+ ]
275
+
276
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    unet: UNet2DConditionModel,
    adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]],
    scheduler: KarrasDiffusionSchedulers,
    force_zeros_for_empty_prompt: bool = True,
    feature_extractor: CLIPImageProcessor = None,
    image_encoder: CLIPVisionModelWithProjection = None,
):
    """Register all sub-models and derive the VAE scale factor and default sample size."""
    super().__init__()

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        unet=unet,
        adapter=adapter,
        scheduler=scheduler,
        feature_extractor=feature_extractor,
        image_encoder=image_encoder,
    )
    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    # Spatial downscaling of the VAE: 2 per down block; falls back to 8 when no VAE is registered.
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    # Default latent sample size when the UNet config does not specify one (128 latents ~ 1024 px for SDXL).
    self.default_sample_size = (
        self.unet.config.sample_size
        if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
        else 128
    )
312
+
313
    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt: str,
        prompt_2: Optional[str] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[str] = None,
        negative_prompt_2: Optional[str] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.

        Returns:
            A 4-tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
            negative_pooled_prompt_embeds)`; the negative entries may be `None` when
            classifier-free guidance is disabled.
        """
        device = device or self._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if self.text_encoder is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder, lora_scale)

            if self.text_encoder_2 is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
                else:
                    scale_lora_layers(self.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt is not None:
            batch_size = len(prompt)
        else:
            # When prompt is None, prompt_embeds must have been supplied (enforced
            # by check_inputs); its leading dim defines the batch size.
            batch_size = prompt_embeds.shape[0]

        # Define tokenizers and text encoders
        # SDXL uses two text encoders; tokenizer/text_encoder (OpenAI CLIP) may be
        # absent, in which case only the second (OpenCLIP) pair is used.
        tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
        text_encoders = (
            [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
        )

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # textual inversion: process multi-vector tokens if necessary
            prompt_embeds_list = []
            prompts = [prompt, prompt_2]
            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    prompt = self.maybe_convert_prompt(prompt, tokenizer)

                text_inputs = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                text_input_ids = text_inputs.input_ids
                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

                # Warn (not raise) when the prompt exceeds CLIP's context window.
                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                    text_input_ids, untruncated_ids
                ):
                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                    logger.warning(
                        "The following part of your input was truncated because CLIP can only handle sequences up to"
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )

                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]

                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
                    # "2" because SDXL always indexes from the penultimate layer.
                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]

                prompt_embeds_list.append(prompt_embeds)

            # Concatenate the two encoders' sequence embeddings along the feature dim.
            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

        # get unconditional embeddings for classifier free guidance
        zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
        if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
            # Configured to use all-zero unconditional embeddings for empty negatives.
            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
        elif do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt_2 = negative_prompt_2 or negative_prompt

            # normalize str to list
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
            negative_prompt_2 = (
                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
            )

            uncond_tokens: List[str]
            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = [negative_prompt, negative_prompt_2]

            negative_prompt_embeds_list = []
            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
                if isinstance(self, TextualInversionLoaderMixin):
                    negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)

                # Pad negatives to the positive sequence length so both can be batched.
                max_length = prompt_embeds.shape[1]
                uncond_input = tokenizer(
                    negative_prompt,
                    padding="max_length",
                    max_length=max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                negative_prompt_embeds = text_encoder(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )

                # We are only ALWAYS interested in the pooled output of the final text encoder
                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                negative_prompt_embeds_list.append(negative_prompt_embeds)

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

        if self.text_encoder_2 is not None:
            prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
        else:
            prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            if self.text_encoder_2 is not None:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
            else:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
        if do_classifier_free_guidance:
            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
                bs_embed * num_images_per_prompt, -1
            )

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder_2, lora_scale)

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
551
+
552
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
553
+ def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
554
+ dtype = next(self.image_encoder.parameters()).dtype
555
+
556
+ if not isinstance(image, torch.Tensor):
557
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
558
+
559
+ image = image.to(device=device, dtype=dtype)
560
+ if output_hidden_states:
561
+ image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
562
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
563
+ uncond_image_enc_hidden_states = self.image_encoder(
564
+ torch.zeros_like(image), output_hidden_states=True
565
+ ).hidden_states[-2]
566
+ uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
567
+ num_images_per_prompt, dim=0
568
+ )
569
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
570
+ else:
571
+ image_embeds = self.image_encoder(image).image_embeds
572
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
573
+ uncond_image_embeds = torch.zeros_like(image_embeds)
574
+
575
+ return image_embeds, uncond_image_embeds
576
+
577
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
    def prepare_ip_adapter_image_embeds(
        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
    ):
        """Build per-adapter image embeddings for IP-Adapter conditioning.

        Either encodes `ip_adapter_image` (one entry per loaded IP-Adapter) or
        reuses precomputed `ip_adapter_image_embeds`. Returns a list with one
        tensor per adapter; when classifier-free guidance is active each tensor
        is `cat([negative, positive])` along the batch dimension.
        """
        image_embeds = []
        if do_classifier_free_guidance:
            negative_image_embeds = []
        if ip_adapter_image_embeds is None:
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

            # One conditioning image is required per loaded IP-Adapter projection layer.
            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
                raise ValueError(
                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

            for single_ip_adapter_image, image_proj_layer in zip(
                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
            ):
                # Plain ImageProjection layers consume pooled embeds; all other
                # projection types consume penultimate hidden states.
                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
                single_image_embeds, single_negative_image_embeds = self.encode_image(
                    single_ip_adapter_image, device, 1, output_hidden_state
                )

                image_embeds.append(single_image_embeds[None, :])
                if do_classifier_free_guidance:
                    negative_image_embeds.append(single_negative_image_embeds[None, :])
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    # Precomputed embeds are stored as cat([negative, positive]); split them.
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                    negative_image_embeds.append(single_negative_image_embeds)
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
        for i, single_image_embeds in enumerate(image_embeds):
            # Duplicate for each generated image, then re-stack negative before positive.
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            if do_classifier_free_guidance:
                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)

        return ip_adapter_image_embeds
622
+
623
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
624
+ def prepare_extra_step_kwargs(self, generator, eta):
625
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
626
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
627
+ # eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
628
+ # and should be between [0, 1]
629
+
630
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
631
+ extra_step_kwargs = {}
632
+ if accepts_eta:
633
+ extra_step_kwargs["eta"] = eta
634
+
635
+ # check if the scheduler accepts generator
636
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
637
+ if accepts_generator:
638
+ extra_step_kwargs["generator"] = generator
639
+ return extra_step_kwargs
640
+
641
    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs
    def check_inputs(
        self,
        prompt,
        prompt_2,
        height,
        width,
        callback_steps,
        negative_prompt=None,
        negative_prompt_2=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        pooled_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
        ip_adapter_image=None,
        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
    ):
        """Validate `__call__` arguments before running the pipeline.

        Raises `ValueError` (or `TypeError` for type mismatches downstream) when
        mutually exclusive arguments are both provided, required companions are
        missing, or shapes/dimensions are invalid. Returns nothing.
        """
        # Latent-space models require pixel dimensions divisible by the VAE factor.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        # `prompt`/`prompt_2` and `prompt_embeds` are mutually exclusive inputs.
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt_2 is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

        # Same exclusivity for the negative prompts vs. negative embeddings.
        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )
        elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        # SDXL needs pooled embeddings alongside the sequence embeddings.
        if prompt_embeds is not None and pooled_prompt_embeds is None:
            raise ValueError(
                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
            )

        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
            raise ValueError(
                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
            raise ValueError(
                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
            )

        if ip_adapter_image_embeds is not None:
            if not isinstance(ip_adapter_image_embeds, list):
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
                )
737
+
738
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
739
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
740
+ shape = (
741
+ batch_size,
742
+ num_channels_latents,
743
+ int(height) // self.vae_scale_factor,
744
+ int(width) // self.vae_scale_factor,
745
+ )
746
+ if isinstance(generator, list) and len(generator) != batch_size:
747
+ raise ValueError(
748
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
749
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
750
+ )
751
+
752
+ if latents is None:
753
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
754
+ else:
755
+ latents = latents.to(device)
756
+
757
+ # scale the initial noise by the standard deviation required by the scheduler
758
+ latents = latents * self.scheduler.init_noise_sigma
759
+ return latents
760
+
761
+ # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids
762
+ def _get_add_time_ids(
763
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
764
+ ):
765
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
766
+
767
+ passed_add_embed_dim = (
768
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
769
+ )
770
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
771
+
772
+ if expected_add_embed_dim != passed_add_embed_dim:
773
+ raise ValueError(
774
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
775
+ )
776
+
777
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
778
+ return add_time_ids
779
+
780
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
781
+ def upcast_vae(self):
782
+ dtype = self.vae.dtype
783
+ self.vae.to(dtype=torch.float32)
784
+ use_torch_2_0_or_xformers = isinstance(
785
+ self.vae.decoder.mid_block.attentions[0].processor,
786
+ (
787
+ AttnProcessor2_0,
788
+ XFormersAttnProcessor,
789
+ ),
790
+ )
791
+ # if xformers or torch_2_0 is used attention block does not need
792
+ # to be in float32 which can save lots of memory
793
+ if use_torch_2_0_or_xformers:
794
+ self.vae.post_quant_conv.to(dtype)
795
+ self.vae.decoder.conv_in.to(dtype)
796
+ self.vae.decoder.mid_block.to(dtype)
797
+
798
+ # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width
799
+ def _default_height_width(self, height, width, image):
800
+ # NOTE: It is possible that a list of images have different
801
+ # dimensions for each image, so just checking the first image
802
+ # is not _exactly_ correct, but it is simple.
803
+ while isinstance(image, list):
804
+ image = image[0]
805
+
806
+ if height is None:
807
+ if isinstance(image, PIL.Image.Image):
808
+ height = image.height
809
+ elif isinstance(image, torch.Tensor):
810
+ height = image.shape[-2]
811
+
812
+ # round down to nearest multiple of `self.adapter.downscale_factor`
813
+ height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor
814
+
815
+ if width is None:
816
+ if isinstance(image, PIL.Image.Image):
817
+ width = image.width
818
+ elif isinstance(image, torch.Tensor):
819
+ width = image.shape[-1]
820
+
821
+ # round down to nearest multiple of `self.adapter.downscale_factor`
822
+ width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor
823
+
824
+ return height, width
825
+
826
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
827
+ def get_guidance_scale_embedding(
828
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
829
+ ) -> torch.Tensor:
830
+ """
831
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
832
+
833
+ Args:
834
+ w (`torch.Tensor`):
835
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
836
+ embedding_dim (`int`, *optional*, defaults to 512):
837
+ Dimension of the embeddings to generate.
838
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
839
+ Data type of the generated embeddings.
840
+
841
+ Returns:
842
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
843
+ """
844
+ assert len(w.shape) == 1
845
+ w = w * 1000.0
846
+
847
+ half_dim = embedding_dim // 2
848
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
849
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
850
+ emb = w.to(dtype)[:, None] * emb[None, :]
851
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
852
+ if embedding_dim % 2 == 1: # zero pad
853
+ emb = torch.nn.functional.pad(emb, (0, 1))
854
+ assert emb.shape == (w.shape[0], embedding_dim)
855
+ return emb
856
+
857
    @property
    def guidance_scale(self):
        # Classifier-free guidance scale set for the current `__call__` run.
        return self._guidance_scale
860
+
861
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    @property
    def do_classifier_free_guidance(self):
        # CFG is disabled when the UNet has a time-conditioning projection
        # (guidance-distilled models embed the scale instead of doubling the batch).
        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
867
+
868
+ @torch.no_grad()
869
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
870
+ def __call__(
871
+ self,
872
+ prompt: Union[str, List[str]] = None,
873
+ prompt_2: Optional[Union[str, List[str]]] = None,
874
+ image: PipelineImageInput = None,
875
+ height: Optional[int] = None,
876
+ width: Optional[int] = None,
877
+ num_inference_steps: int = 50,
878
+ timesteps: List[int] = None,
879
+ sigmas: List[float] = None,
880
+ denoising_end: Optional[float] = None,
881
+ guidance_scale: float = 5.0,
882
+ negative_prompt: Optional[Union[str, List[str]]] = None,
883
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
884
+ num_images_per_prompt: Optional[int] = 1,
885
+ eta: float = 0.0,
886
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
887
+ latents: Optional[torch.Tensor] = None,
888
+ prompt_embeds: Optional[torch.Tensor] = None,
889
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
890
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
891
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
892
+ ip_adapter_image: Optional[PipelineImageInput] = None,
893
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
894
+ output_type: Optional[str] = "pil",
895
+ return_dict: bool = True,
896
+ callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
897
+ callback_steps: int = 1,
898
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
899
+ guidance_rescale: float = 0.0,
900
+ original_size: Optional[Tuple[int, int]] = None,
901
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
902
+ target_size: Optional[Tuple[int, int]] = None,
903
+ negative_original_size: Optional[Tuple[int, int]] = None,
904
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
905
+ negative_target_size: Optional[Tuple[int, int]] = None,
906
+ adapter_conditioning_scale: Union[float, List[float]] = 1.0,
907
+ adapter_conditioning_factor: float = 1.0,
908
+ clip_skip: Optional[int] = None,
909
+ ):
910
+ r"""
911
+ Function invoked when calling the pipeline for generation.
912
+
913
+ Args:
914
+ prompt (`str` or `List[str]`, *optional*):
915
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
916
+ instead.
917
+ prompt_2 (`str` or `List[str]`, *optional*):
918
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
919
+ used in both text-encoders
920
+ image (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`):
921
+ The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the
922
+ type is specified as `torch.Tensor`, it is passed to Adapter as is. PIL.Image.Image` can also be
923
+ accepted as an image. The control image is automatically resized to fit the output image.
924
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
925
+ The height in pixels of the generated image. Anything below 512 pixels won't work well for
926
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
927
+ and checkpoints that are not specifically fine-tuned on low resolutions.
928
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
929
+ The width in pixels of the generated image. Anything below 512 pixels won't work well for
930
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
931
+ and checkpoints that are not specifically fine-tuned on low resolutions.
932
+ num_inference_steps (`int`, *optional*, defaults to 50):
933
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
934
+ expense of slower inference.
935
+ timesteps (`List[int]`, *optional*):
936
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
937
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
938
+ passed will be used. Must be in descending order.
939
+ sigmas (`List[float]`, *optional*):
940
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
941
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
942
+ will be used.
943
+ denoising_end (`float`, *optional*):
944
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
945
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
946
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
947
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
948
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
949
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
950
+ guidance_scale (`float`, *optional*, defaults to 5.0):
951
+ Guidance scale as defined in [Classifier-Free Diffusion
952
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
953
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
954
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
955
+ the text `prompt`, usually at the expense of lower image quality.
956
+ negative_prompt (`str` or `List[str]`, *optional*):
957
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
958
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
959
+ less than `1`).
960
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
961
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
962
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
963
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
964
+ The number of images to generate per prompt.
965
+ eta (`float`, *optional*, defaults to 0.0):
966
+ Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
967
+ applies to [`schedulers.DDIMScheduler`], will be ignored for others.
968
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
969
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
970
+ to make generation deterministic.
971
+ latents (`torch.Tensor`, *optional*):
972
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
973
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
974
+ tensor will be generated by sampling using the supplied random `generator`.
975
+ prompt_embeds (`torch.Tensor`, *optional*):
976
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
977
+ provided, text embeddings will be generated from `prompt` input argument.
978
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
979
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
980
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
981
+ argument.
982
+ pooled_prompt_embeds (`torch.Tensor`, *optional*):
983
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
984
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
985
+ negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
986
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
987
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
988
+ input argument.
989
+ ip_adapter_image (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
990
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
991
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
992
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
993
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
994
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
995
+ output_type (`str`, *optional*, defaults to `"pil"`):
996
+ The output format of the generate image. Choose between
997
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
998
+ return_dict (`bool`, *optional*, defaults to `True`):
999
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`]
1000
+ instead of a plain tuple.
1001
+ callback (`Callable`, *optional*):
1002
+ A function that will be called every `callback_steps` steps during inference. The function will be
1003
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
1004
+ callback_steps (`int`, *optional*, defaults to 1):
1005
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1006
+ called at every step.
1007
+ cross_attention_kwargs (`dict`, *optional*):
1008
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1009
+ `self.processor` in
1010
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1011
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
1012
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
1013
+ Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
1014
+ [Common Diffusion Noise Schedules and Sample Steps are
1015
+ Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
1016
+ using zero terminal SNR.
1017
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1018
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
1019
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
1020
+ explained in section 2.2 of
1021
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1022
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1023
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
1024
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
1025
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
1026
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1027
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1028
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
1029
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
1030
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1031
1032
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1033
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
1034
+ micro-conditioning as explained in section 2.2 of
1035
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1036
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1037
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1038
+ To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
1039
+ micro-conditioning as explained in section 2.2 of
1040
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1041
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1042
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1043
+ To negatively condition the generation process based on a target image resolution. It should be as same
1044
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
1045
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1046
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1047
+ adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
1048
+ The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
1049
+ residual in the original unet. If multiple adapters are specified in init, you can set the
1050
+ corresponding scale as a list.
1051
+ adapter_conditioning_factor (`float`, *optional*, defaults to 1.0):
1052
+ The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
1053
+ `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
1054
+ all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
1055
+ clip_skip (`int`, *optional*):
1056
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1057
+ the output of the pre-final layer will be used for computing the prompt embeddings.
1058
+
1059
+ Examples:
1060
+
1061
+ Returns:
1062
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
1063
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
1064
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
1065
+ """
1066
+ # 0. Default height and width to unet
1067
+
1068
+ height, width = self._default_height_width(height, width, image)
1069
+ device = self._execution_device
1070
+
1071
+ if isinstance(self.adapter, MultiAdapter):
1072
+ adapter_input = []
1073
+
1074
+ for one_image in image:
1075
+ one_image = _preprocess_adapter_image(one_image, height, width)
1076
+ one_image = one_image.to(device=device, dtype=self.adapter.dtype)
1077
+ adapter_input.append(one_image)
1078
+ else:
1079
+ adapter_input = _preprocess_adapter_image(image, height, width)
1080
+ adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype)
1081
+ original_size = original_size or (height, width)
1082
+ target_size = target_size or (height, width)
1083
+
1084
+ # 1. Check inputs. Raise error if not correct
1085
+ self.check_inputs(
1086
+ prompt,
1087
+ prompt_2,
1088
+ height,
1089
+ width,
1090
+ callback_steps,
1091
+ negative_prompt,
1092
+ negative_prompt_2,
1093
+ prompt_embeds,
1094
+ negative_prompt_embeds,
1095
+ pooled_prompt_embeds,
1096
+ negative_pooled_prompt_embeds,
1097
+ ip_adapter_image,
1098
+ ip_adapter_image_embeds,
1099
+ )
1100
+
1101
+ self._guidance_scale = guidance_scale
1102
+
1103
+ # 2. Define call parameters
1104
+ if prompt is not None and isinstance(prompt, str):
1105
+ batch_size = 1
1106
+ elif prompt is not None and isinstance(prompt, list):
1107
+ batch_size = len(prompt)
1108
+ else:
1109
+ batch_size = prompt_embeds.shape[0]
1110
+
1111
+ device = self._execution_device
1112
+
1113
+ # 3.1 Encode input prompt
1114
+ (
1115
+ prompt_embeds,
1116
+ negative_prompt_embeds,
1117
+ pooled_prompt_embeds,
1118
+ negative_pooled_prompt_embeds,
1119
+ ) = self.encode_prompt(
1120
+ prompt=prompt,
1121
+ prompt_2=prompt_2,
1122
+ device=device,
1123
+ num_images_per_prompt=num_images_per_prompt,
1124
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1125
+ negative_prompt=negative_prompt,
1126
+ negative_prompt_2=negative_prompt_2,
1127
+ prompt_embeds=prompt_embeds,
1128
+ negative_prompt_embeds=negative_prompt_embeds,
1129
+ pooled_prompt_embeds=pooled_prompt_embeds,
1130
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1131
+ clip_skip=clip_skip,
1132
+ )
1133
+
1134
+ # 3.2 Encode ip_adapter_image
1135
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1136
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1137
+ ip_adapter_image,
1138
+ ip_adapter_image_embeds,
1139
+ device,
1140
+ batch_size * num_images_per_prompt,
1141
+ self.do_classifier_free_guidance,
1142
+ )
1143
+
1144
+ # 4. Prepare timesteps
1145
+ timesteps, num_inference_steps = retrieve_timesteps(
1146
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
1147
+ )
1148
+
1149
+ # 5. Prepare latent variables
1150
+ num_channels_latents = self.unet.config.in_channels
1151
+ latents = self.prepare_latents(
1152
+ batch_size * num_images_per_prompt,
1153
+ num_channels_latents,
1154
+ height,
1155
+ width,
1156
+ prompt_embeds.dtype,
1157
+ device,
1158
+ generator,
1159
+ latents,
1160
+ )
1161
+
1162
+ # 6.1 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1163
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1164
+
1165
+ # 6.2 Optionally get Guidance Scale Embedding
1166
+ timestep_cond = None
1167
+ if self.unet.config.time_cond_proj_dim is not None:
1168
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
1169
+ timestep_cond = self.get_guidance_scale_embedding(
1170
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1171
+ ).to(device=device, dtype=latents.dtype)
1172
+
1173
+ # 7. Prepare added time ids & embeddings & adapter features
1174
+ if isinstance(self.adapter, MultiAdapter):
1175
+ adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
1176
+ for k, v in enumerate(adapter_state):
1177
+ adapter_state[k] = v
1178
+ else:
1179
+ adapter_state = self.adapter(adapter_input)
1180
+ for k, v in enumerate(adapter_state):
1181
+ adapter_state[k] = v * adapter_conditioning_scale
1182
+ if num_images_per_prompt > 1:
1183
+ for k, v in enumerate(adapter_state):
1184
+ adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
1185
+ if self.do_classifier_free_guidance:
1186
+ for k, v in enumerate(adapter_state):
1187
+ adapter_state[k] = torch.cat([v] * 2, dim=0)
1188
+
1189
+ add_text_embeds = pooled_prompt_embeds
1190
+ if self.text_encoder_2 is None:
1191
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
1192
+ else:
1193
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
1194
+
1195
+ add_time_ids = self._get_add_time_ids(
1196
+ original_size,
1197
+ crops_coords_top_left,
1198
+ target_size,
1199
+ dtype=prompt_embeds.dtype,
1200
+ text_encoder_projection_dim=text_encoder_projection_dim,
1201
+ )
1202
+ if negative_original_size is not None and negative_target_size is not None:
1203
+ negative_add_time_ids = self._get_add_time_ids(
1204
+ negative_original_size,
1205
+ negative_crops_coords_top_left,
1206
+ negative_target_size,
1207
+ dtype=prompt_embeds.dtype,
1208
+ text_encoder_projection_dim=text_encoder_projection_dim,
1209
+ )
1210
+ else:
1211
+ negative_add_time_ids = add_time_ids
1212
+
1213
+ if self.do_classifier_free_guidance:
1214
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1215
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
1216
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
1217
+
1218
+ prompt_embeds = prompt_embeds.to(device)
1219
+ add_text_embeds = add_text_embeds.to(device)
1220
+ add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
1221
+
1222
+ # 8. Denoising loop
1223
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1224
+ # Apply denoising_end
1225
+ if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
1226
+ discrete_timestep_cutoff = int(
1227
+ round(
1228
+ self.scheduler.config.num_train_timesteps
1229
+ - (denoising_end * self.scheduler.config.num_train_timesteps)
1230
+ )
1231
+ )
1232
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
1233
+ timesteps = timesteps[:num_inference_steps]
1234
+
1235
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1236
+ for i, t in enumerate(timesteps):
1237
+ # expand the latents if we are doing classifier free guidance
1238
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1239
+
1240
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1241
+
1242
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
1243
+
1244
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1245
+ added_cond_kwargs["image_embeds"] = image_embeds
1246
+
1247
+ # predict the noise residual
1248
+ if i < int(num_inference_steps * adapter_conditioning_factor):
1249
+ down_intrablock_additional_residuals = [state.clone() for state in adapter_state]
1250
+ else:
1251
+ down_intrablock_additional_residuals = None
1252
+
1253
+ noise_pred = self.unet(
1254
+ latent_model_input,
1255
+ t,
1256
+ encoder_hidden_states=prompt_embeds,
1257
+ timestep_cond=timestep_cond,
1258
+ cross_attention_kwargs=cross_attention_kwargs,
1259
+ down_intrablock_additional_residuals=down_intrablock_additional_residuals,
1260
+ added_cond_kwargs=added_cond_kwargs,
1261
+ return_dict=False,
1262
+ )[0]
1263
+
1264
+ # perform guidance
1265
+ if self.do_classifier_free_guidance:
1266
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1267
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1268
+
1269
+ if self.do_classifier_free_guidance and guidance_rescale > 0.0:
1270
+ # Based on 3.4. in https://huggingface.co/papers/2305.08891
1271
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
1272
+
1273
+ # compute the previous noisy sample x_t -> x_t-1
1274
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1275
+
1276
+ # call the callback, if provided
1277
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1278
+ progress_bar.update()
1279
+ if callback is not None and i % callback_steps == 0:
1280
+ step_idx = i // getattr(self.scheduler, "order", 1)
1281
+ callback(step_idx, t, latents)
1282
+
1283
+ if XLA_AVAILABLE:
1284
+ xm.mark_step()
1285
+
1286
+ if not output_type == "latent":
1287
+ # make sure the VAE is in float32 mode, as it overflows in float16
1288
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
1289
+
1290
+ if needs_upcasting:
1291
+ self.upcast_vae()
1292
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1293
+
1294
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1295
+
1296
+ # cast back to fp16 if needed
1297
+ if needs_upcasting:
1298
+ self.vae.to(dtype=torch.float16)
1299
+ else:
1300
+ image = latents
1301
+ return StableDiffusionXLPipelineOutput(images=image)
1302
+
1303
+ image = self.image_processor.postprocess(image, output_type=output_type)
1304
+
1305
+ # Offload all models
1306
+ self.maybe_free_model_hooks()
1307
+
1308
+ if not return_dict:
1309
+ return (image,)
1310
+
1311
+ return StableDiffusionXLPipelineOutput(images=image)